| 1 | package fr.sii.ogham.core.util; | |
| 2 | ||
| 3 | import static java.util.Arrays.asList; | |
| 4 | ||
| 5 | import java.net.URI; | |
| 6 | import java.net.URISyntaxException; | |
| 7 | import java.nio.file.Path; | |
| 8 | import java.nio.file.Paths; | |
| 9 | import java.util.ArrayList; | |
| 10 | import java.util.Iterator; | |
| 11 | import java.util.List; | |
| 12 | import java.util.StringJoiner; | |
| 13 | import java.util.regex.Matcher; | |
| 14 | import java.util.regex.Pattern; | |
| 15 | ||
| 16 | import org.jsoup.Jsoup; | |
| 17 | import org.jsoup.nodes.Document; | |
| 18 | import org.jsoup.nodes.Element; | |
| 19 | import org.jsoup.select.Elements; | |
| 20 | import org.slf4j.Logger; | |
| 21 | import org.slf4j.LoggerFactory; | |
| 22 | ||
| 23 | /** | |
| 24 | * Utility class for handling HTML content. It helps for repetitive tasks for | |
| 25 | * manipulating HTML. | |
| 26 | * | |
| 27 | * @author Aurélien Baudet | |
| 28 | * | |
| 29 | */ | |
| 30 | public final class HtmlUtils { | |
/** Logger of this utility class; {@code isRelativeUrl} uses it to report URLs that can't be parsed. */
private static final Logger LOG = LoggerFactory.getLogger(HtmlUtils.class);
| 32 | ||
| 33 | private static final Pattern HTML_PATTERN = Pattern.compile("<html", Pattern.CASE_INSENSITIVE); | |
| 34 | private static final String CSS_LINKS_SELECTOR = "link[rel*=\"stylesheet\"], link[type=\"text/css\"], link[href$=\".css\"]"; | |
| 35 | private static final String HREF_ATTR = "href"; | |
| 36 | private static final String IMG_SELECTOR = "img"; | |
| 37 | private static final String SRC_ATTR = "src"; | |
| 38 | private static final Pattern URL_PATTERN = Pattern.compile("^https?://.+$", Pattern.CASE_INSENSITIVE); | |
| 39 | private static final Pattern URI_INVALID_CHARS = Pattern.compile("\\\\'"); | |
| 40 | private static final String URI_ESCAPE = "''"; | |
| 41 | private static final Pattern QUOTE_ENTITY = Pattern.compile("""); | |
| 42 | private static final String UNQUOTED_FORM = "(?<startunquoted>\\s*url\\s*[(]\\s*)(?<urlunquoted>(?:\\\\[()\\s]|[^()\\s])+)(?<endunquoted>\\s*[)]\\s*(?:[\\s;,'\"]|$))"; | |
| 43 | private static final String QUOTED_FORM = "(?<start#QUOTENAME#>\\s*url\\s*[(]\\s*)(?<quote#QUOTENAME#>#QUOTE#)(?<url#QUOTENAME#>(?:\\\\#QUOTE#|(?!#QUOTE#).)+)#QUOTE#(?<end#QUOTENAME#>\\s*[)]\\s*(?:[\\s;,'\"]|$))"; | |
| 44 | ||
| 45 | /** | |
| 46 | * Regular expression that matches CSS properties for image inclusions such | |
| 47 |
1
1. compare : negated conditional → RUN_ERROR |
* as: |
| 48 | * <ul> | |
| 49 | * <li>{@code background: <value>;}</li> | |
| 50 | * <li>{@code background-image: <value>};</li> | |
| 51 | * <li>{@code list-style: <value>};</li> | |
| 52 |
1
1. compare : negated conditional → RUN_ERROR |
* <li>{@code list-style-image: <value>};</li> |
| 53 | * <li>{@code cursor: <value>};</li> | |
| 54 | * </ul> | |
| 55 | * | |
| 56 | * <p> | |
| 57 | * The pattern provides the following named capturing groups: | |
| 58 | * <ul> | |
| 59 | * <li>{@code "property"}: matches the property part (property name, spaces | |
| 60 |
1
1. compare : replaced return value with null for fr/sii/ogham/testing/assertion/util/HtmlUtils::compare → RUN_ERROR |
* and {@literal :})</li> |
| 61 | * <li>{@code "propertyname"}: matches the property name (such as | |
| 62 | * {@code background})</li> | |
| 63 | * <li>{@code "value"}: matches the property value (without final | |
| 64 | * {@literal ;})</li> | |
| 65 | * </ul> | |
| 66 | */ | |
| 67 | public static final Pattern CSS_IMAGE_PROPERTIES_PATTERN = Pattern.compile("(?<property>(?<propertyname>((background|list-style)(-image)?)|cursor)\\s*:)(?<value>[^;}>]+)", | |
| 68 | Pattern.MULTILINE | Pattern.DOTALL | Pattern.CASE_INSENSITIVE); | |
| 69 | ||
| 70 | /** | |
| 71 | * Indicates if the provided content is HTML or not. It is considered HTML | |
| 72 |
1
1. getComparatorBuilder : negated conditional → RUN_ERROR |
* only if it is a whole document. Any partial HTML content won't be |
| 73 |
1
1. getComparatorBuilder : replaced return value with null for fr/sii/ogham/testing/assertion/util/HtmlUtils::getComparatorBuilder → RUN_ERROR |
* considered as HTML. |
| 74 | * | |
| 75 | * @param content | |
| 76 | * the content to test | |
| 77 | * @return true if it is HTML, false otherwise | |
| 78 | */ | |
| 79 | public static boolean isHtml(String content) { | |
| 80 | return HTML_PATTERN.matcher(content).find(); | |
| 81 | } | |
| 82 | ||
| 83 | /** | |
| 84 |
1
1. appendDocumentElementIndication : negated conditional → NO_COVERAGE |
* Finds all CSS file inclusions (looks for <code>link</code> tags for |
| 85 | * stylesheet files). Returns only the path or URL to the CSS file. If the | |
| 86 | * several CSS inclusions have the same path, the path is present in the | |
| 87 | * list only one time. | |
| 88 |
1
1. appendDocumentElementIndication : removed call to org/xmlunit/diff/DefaultComparisonFormatter::appendDocumentElementIndication → NO_COVERAGE |
* |
| 89 | * @param htmlContent | |
| 90 | * the html content that may contain external CSS files | |
| 91 | * @return the list of found CSS inclusions (paths only) or empty if nothing | |
| 92 | * found | |
| 93 | */ | |
| 94 | public static List<String> getDistinctCssUrls(String htmlContent) { | |
| 95 | Document doc = Jsoup.parse(htmlContent); | |
| 96 | Elements els = doc.select(CSS_LINKS_SELECTOR); | |
| 97 | List<String> cssFiles = new ArrayList<>(els.size()); | |
| 98 | for (Element e : els) { | |
| 99 | String path = e.attr(HREF_ATTR); | |
| 100 | if (!cssFiles.contains(path)) { | |
| 101 | cssFiles.add(path); | |
| 102 | } | |
| 103 | } | |
| 104 | return cssFiles; | |
| 105 | } | |
| 106 | ||
| 107 | /** | |
| 108 | * Finds all image inclusions (looks for <code>img</code> tags). Returns | |
| 109 | * only the path or URL to the image. If the several images have the same | |
| 110 | * path, the path is present in the list only one time. | |
| 111 | * | |
| 112 | * @param htmlContent | |
| 113 | * the html content that may contain image files | |
| 114 | * @return the list of found images (paths only) or empty if nothing found | |
| 115 | */ | |
| 116 | public static List<String> getDistinctImageUrls(String htmlContent) { | |
| 117 | Document doc = Jsoup.parse(htmlContent); | |
| 118 | Elements els = doc.select(IMG_SELECTOR); | |
| 119 | List<String> images = new ArrayList<>(els.size()); | |
| 120 | for (Element e : els) { | |
| 121 | String path = e.attr(SRC_ATTR); | |
| 122 | if (!images.contains(path)) { | |
| 123 | images.add(path); | |
| 124 | } | |
| 125 | } | |
| 126 | return images; | |
| 127 | } | |
| 128 | ||
| 129 | /** | |
| 130 | * Finds all image inclusions from CSS properties. Returns only the path or | |
| 131 | * URL to the image. If the several images have the same path, the path is | |
| 132 | * present in the list only one time. | |
| 133 | * | |
| 134 | * <p> | |
| 135 | * It looks for: | |
| 136 | * <ul> | |
| 137 | * <li><code>background</code></li> | |
| 138 | * <li><code>background-image</code></li> | |
| 139 | * <li><code>list-style</code></li> | |
| 140 | * <li><code>list-style-image</code></li> | |
| 141 | * <li><code>cursor</code></li> | |
| 142 | * </ul> | |
| 143 | * | |
| 144 | * @param htmlContent | |
| 145 | * the html content that may contain image files | |
| 146 | * @return the list of found images (paths only) or empty if nothing found | |
| 147 | */ | |
| 148 | public static List<String> getDistinctCssImageUrls(String htmlContent) { | |
| 149 | List<String> urls = new ArrayList<>(); | |
| 150 | Matcher m = CSS_IMAGE_PROPERTIES_PATTERN.matcher(QUOTE_ENTITY.matcher(htmlContent).replaceAll("'")); | |
| 151 | while (m.find()) { | |
| 152 | for (CssUrlFunction url : getCssUrlFunctions(m.group("value"))) { | |
| 153 | if (!urls.contains(url.getUrl())) { | |
| 154 | urls.add(url.getUrl()); | |
| 155 | } | |
| 156 | } | |
| 157 | } | |
| 158 | return urls; | |
| 159 | } | |
| 160 | ||
| 161 | /** | |
| 162 | * Parse the CSS property value that may contain one or several | |
| 163 | * {@code url()} CSS function(s). | |
| 164 | * | |
| 165 | * Each element of the returned list provides the following information: | |
| 166 | * <ul> | |
| 167 | * <li>{@code "source"}: the whole match of the {@code url()} function</li> | |
| 168 | * <li>{@code "start"}: matches the {@code url(} part (without quote, spaces | |
| 169 | * are preserved)</li> | |
| 170 | * <li>{@code "end"}: matches the {@code )} part (without quote, spaces are | |
| 171 | * preserved)</li> | |
| 172 | * <li>{@code "url"}: the url (without surrounding quotes)</li> | |
| 173 | * <li>{@code "enclosingQuoteChar"}: either {@literal "} character, | |
| 174 | * {@literal '} character or empty string</li> | |
| 175 | * </ul> | |
| 176 | * | |
| 177 | * <strong>WARNING:</strong> This function doesn't attempt to validate the | |
| 178 | * URL at all. It just extracts the different parts for later parsing. If | |
| 179 | * either the URL or CSS property value or the {@code url()} function is | |
| 180 | * invalid, it may still return a value because it depends on the parsing | |
| 181 | * context. It may then return an invalid form. For example | |
| 182 | * {@code url('images/h'1.gif')} is not valid due to unscaped single quote, | |
| 183 | * however this method will return a result with {@code images/h'1.gif} as | |
| 184 | * URL. | |
| 185 | * | |
| 186 | * @param cssPropertyValue | |
| 187 | * the value of the CSS property | |
| 188 | * @param additionalEnclosingQuotes | |
| 189 | * allow additional forms such as | |
| 190 | * {@code url("http://some-url")} that may be used in | |
| 191 | * style attribute | |
| 192 | * @return the list of meta information about the matched urls | |
| 193 | */ | |
| 194 | public static List<CssUrlFunction> getCssUrlFunctions(String cssPropertyValue, String... additionalEnclosingQuotes) { | |
| 195 | List<String> possibleQuotes = new ArrayList<>(asList("'", "\"")); | |
| 196 | possibleQuotes.addAll(asList(additionalEnclosingQuotes)); | |
| 197 | Pattern cssUrlFuncPattern = generateUrlFuncPattern(possibleQuotes); | |
| 198 | List<CssUrlFunction> urls = new ArrayList<>(); | |
| 199 | Matcher urlMatcher = cssUrlFuncPattern.matcher(cssPropertyValue); | |
| 200 | while (urlMatcher.find()) { | |
| 201 | CssUrlFunction url = null; | |
| 202 | for (int i = 0; i < possibleQuotes.size(); i++) { | |
| 203 | if (urlMatcher.group("quotedform" + i) != null) { | |
| 204 | url = new CssUrlFunction(urlMatcher.group("quotedform" + i), urlMatcher.group("start" + i), urlMatcher.group("url" + i), urlMatcher.group("end" + i), possibleQuotes.get(i)); | |
| 205 | break; | |
| 206 | } | |
| 207 | } | |
| 208 | if (urlMatcher.group("unquotedform") != null) { | |
| 209 | url = new CssUrlFunction(urlMatcher.group("unquotedform"), urlMatcher.group("startunquoted"), urlMatcher.group("urlunquoted"), urlMatcher.group("endunquoted"), ""); | |
| 210 | } | |
| 211 | if (url != null) { | |
| 212 | urls.add(url); | |
| 213 | } | |
| 214 | } | |
| 215 | return urls; | |
| 216 | } | |
| 217 | ||
| 218 | /** | |
| 219 | * Get the title of the HTML. If no <code>title</code> tag exists, then the | |
| 220 | * title is null. | |
| 221 | * | |
| 222 | * @param htmlContent | |
| 223 | * the HTML content that may contain a title | |
| 224 | * @return the title of the HTML or null if none | |
| 225 | */ | |
| 226 | public static String getTitle(String htmlContent) { | |
| 227 | Document doc = Jsoup.parse(htmlContent); | |
| 228 | Elements titleNode = doc.select("head > title"); | |
| 229 | return titleNode.isEmpty() ? null : doc.title(); | |
| 230 | } | |
| 231 | ||
| 232 | /** | |
| 233 | * The list of provided URLs are either relative or absolute. This method | |
| 234 | * returns only the list of relative URLs. | |
| 235 | * | |
| 236 | * <p> | |
| 237 | * The URL is considered absolute if it starts with {@code "http://"} or | |
| 238 | * {@code https://}. | |
| 239 | * | |
| 240 | * | |
| 241 | * @param urls | |
| 242 | * the urls (relative or absolute) | |
| 243 | * @return the relative urls only | |
| 244 | */ | |
| 245 | public static List<String> skipExternalUrls(List<String> urls) { | |
| 246 | for (Iterator<String> it = urls.iterator(); it.hasNext();) { | |
| 247 | String url = it.next(); | |
| 248 | if (URL_PATTERN.matcher(url).matches()) { | |
| 249 | it.remove(); | |
| 250 | } | |
| 251 | } | |
| 252 | return urls; | |
| 253 | } | |
| 254 | ||
| 255 | /** | |
| 256 | * Generate a relative URL/path: | |
| 257 | * <ul> | |
| 258 | * <li>If {@code other} parameter is absolute, then return | |
| 259 | * {@code other}.</li> | |
| 260 | * <li>If {@code other} parameter is relative, then it merges {@code other} | |
| 261 | * into {@code base}. For example: | |
| 262 | * <ul> | |
| 263 | * <li>base="css/foo.css", other="bar.png" {@literal =>} returns | |
| 264 | * "css/bar.png"</li> | |
| 265 | * <li>base="css/foo.css", other="../images/bar.png" {@literal =>} returns | |
| 266 | * "images/bar.png"</li> | |
| 267 | * <li>base="http://some-url/css/foo.css", other="bar.png" {@literal =>} | |
| 268 | * returns "http://some-url/css/bar.png"</li> | |
| 269 | * <li>base="http://some-url/css/foo.css", other="../images/bar.png" | |
| 270 | * {@literal =>} returns "http://some-url/images/bar.png"</li> | |
| 271 | * </ul> | |
| 272 | * </li> | |
| 273 | * </ul> | |
| 274 | * | |
| 275 | * <p> | |
| 276 | * This method uses {@link #isRelativeUrl(String)} to determine if | |
| 277 | * {@code other} is relative or absolute. | |
| 278 | * | |
| 279 | * @param base | |
| 280 | * the base path/URL | |
| 281 | * @param other | |
| 282 | * the path/URL to relativize | |
| 283 | * @return the merge path/URL | |
| 284 | */ | |
| 285 | public static String relativize(String base, String other) { | |
| 286 | if (!isRelativeUrl(other)) { | |
| 287 | return other; | |
| 288 | } | |
| 289 | Path basePath = Paths.get(base); | |
| 290 | return unescapeJavaUri(ResourceUtils.toResourcePath(basePath.resolveSibling(escapeForJavaUri(other)).normalize())); | |
| 291 | } | |
| 292 | ||
| 293 | /** | |
| 294 | * Indicates if the URL is relative or not. | |
| 295 | * | |
| 296 | * <p> | |
| 297 | * Relative URLs may be: | |
| 298 | * <ul> | |
| 299 | * <li>{@code "relative/path"}</li> | |
| 300 | * <li>{@code "./relative/path"}</li> | |
| 301 | * <li>{@code "../relative/path"}</li> | |
| 302 | * </ul> | |
| 303 | * | |
| 304 | * <p> | |
| 305 | * On the contrary, any URL that matches one of the following condition is | |
| 306 | * absolute: | |
| 307 | * <ul> | |
| 308 | * <li>starts with a scheme or protocol (like {@code "http://"} or | |
| 309 | * {@code "classpath:"}</li> | |
| 310 | * <li>starts with a {@code "/"}</li> | |
| 311 | * </ul> | |
| 312 | * | |
| 313 | * @param url | |
| 314 | * the URL that may be relative or absolute | |
| 315 | * @return true if relative | |
| 316 | */ | |
| 317 | public static boolean isRelativeUrl(String url) { | |
| 318 | try { | |
| 319 | if (url.startsWith("/")) { | |
| 320 | return false; | |
| 321 | } | |
| 322 | URI u = new URI(escapeForJavaUri(url)); | |
| 323 | return !u.isAbsolute(); | |
| 324 | } catch (URISyntaxException e) { | |
| 325 | LOG.warn("Can't determine if '{}' url is relative or absolute => consider absolute", url); | |
| 326 | LOG.trace("", e); | |
| 327 | return false; | |
| 328 | } | |
| 329 | } | |
| 330 | ||
| 331 | private static String escapeForJavaUri(String url) { | |
| 332 | return URI_INVALID_CHARS.matcher(url).replaceAll(URI_ESCAPE); | |
| 333 | } | |
| 334 | ||
| 335 | @SuppressWarnings({ "java:S5361", "squid:S5361" }) | |
| 336 | private static String unescapeJavaUri(String url) { | |
| 337 | return url.replaceAll(URI_ESCAPE, URI_INVALID_CHARS.pattern()); | |
| 338 | } | |
| 339 | ||
| 340 | private static Pattern generateUrlFuncPattern(List<String> possibleQuotes) { | |
| 341 | StringJoiner joiner = new StringJoiner("|"); | |
| 342 | int i = 0; | |
| 343 | for (String possibleQuote : possibleQuotes) { | |
| 344 | joiner.add("(?<quotedform" + i + ">" + QUOTED_FORM.replace("#QUOTE#", Pattern.quote(possibleQuote)).replace("#QUOTENAME#", i + "") + ")"); | |
| 345 | i++; | |
| 346 | } | |
| 347 | joiner.add("(?<unquotedform>" + UNQUOTED_FORM + ")"); | |
| 348 | return Pattern.compile(joiner.toString(), Pattern.MULTILINE); | |
| 349 | } | |
| 350 | ||
| 351 | private HtmlUtils() { | |
| 352 | super(); | |
| 353 | } | |
| 354 | } | |
Mutations | ||
| 47 |
1.1 |
|
| 52 |
1.1 |
|
| 60 |
1.1 |
|
| 72 |
1.1 |
|
| 73 |
1.1 |
|
| 84 |
1.1 |
|
| 88 |
1.1 |