HtmlUtils.java

1
package fr.sii.ogham.core.util;
2
3
import static java.util.Arrays.asList;

import java.net.URI;
import java.net.URISyntaxException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.StringJoiner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
22
23
/**
 * Utility class for handling HTML content. It helps with repetitive HTML
 * manipulation tasks.
 * 
 * @author Aurélien Baudet
 *
 */
30
public final class HtmlUtils {
31
	private static final Logger LOG = LoggerFactory.getLogger(HtmlUtils.class);
32
33
	private static final Pattern HTML_PATTERN = Pattern.compile("<html", Pattern.CASE_INSENSITIVE);
34
	private static final String CSS_LINKS_SELECTOR = "link[rel*=\"stylesheet\"], link[type=\"text/css\"], link[href$=\".css\"]";
35
	private static final String HREF_ATTR = "href";
36
	private static final String IMG_SELECTOR = "img";
37
	private static final String SRC_ATTR = "src";
38
	private static final Pattern URL_PATTERN = Pattern.compile("^https?://.+$", Pattern.CASE_INSENSITIVE);
39
	private static final Pattern URI_INVALID_CHARS = Pattern.compile("\\\\'");
40
	private static final String URI_ESCAPE = "''";
41
	private static final Pattern QUOTE_ENTITY = Pattern.compile("&quot;");
42
	private static final String UNQUOTED_FORM = "(?<startunquoted>\\s*url\\s*[(]\\s*)(?<urlunquoted>(?:\\\\[()\\s]|[^()\\s])+)(?<endunquoted>\\s*[)]\\s*(?:[\\s;,'\"]|$))";
43
	private static final String QUOTED_FORM = "(?<start#QUOTENAME#>\\s*url\\s*[(]\\s*)(?<quote#QUOTENAME#>#QUOTE#)(?<url#QUOTENAME#>(?:\\\\#QUOTE#|(?!#QUOTE#).)+)#QUOTE#(?<end#QUOTENAME#>\\s*[)]\\s*(?:[\\s;,'\"]|$))";
44
45
	/**
46
	 * Regular expression that matches CSS properties for image inclusions such
47 1 1. compare : negated conditional → RUN_ERROR
	 * as:
48
	 * <ul>
49
	 * <li>{@code background: <value>;}</li>
50
	 * <li>{@code background-image: <value>};</li>
51
	 * <li>{@code list-style: <value>};</li>
52 1 1. compare : negated conditional → RUN_ERROR
	 * <li>{@code list-style-image: <value>};</li>
53
	 * <li>{@code cursor: <value>};</li>
54
	 * </ul>
55
	 * 
56
	 * <p>
57
	 * The pattern provides the following named capturing groups:
58
	 * <ul>
59
	 * <li>{@code "property"}: matches the property part (property name, spaces
60 1 1. compare : replaced return value with null for fr/sii/ogham/testing/assertion/util/HtmlUtils::compare → RUN_ERROR
	 * and {@literal :})</li>
61
	 * <li>{@code "propertyname"}: matches the property name (such as
62
	 * {@code background})</li>
63
	 * <li>{@code "value"}: matches the property value (without final
64
	 * {@literal ;})</li>
65
	 * </ul>
66
	 */
67
	public static final Pattern CSS_IMAGE_PROPERTIES_PATTERN = Pattern.compile("(?<property>(?<propertyname>((background|list-style)(-image)?)|cursor)\\s*:)(?<value>[^;}>]+)",
68
			Pattern.MULTILINE | Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
69
70
	/**
71
	 * Indicates if the provided content is HTML or not. It is considered HTML
72 1 1. getComparatorBuilder : negated conditional → RUN_ERROR
	 * only if it is a whole document. Any partial HTML content won't be
73 1 1. getComparatorBuilder : replaced return value with null for fr/sii/ogham/testing/assertion/util/HtmlUtils::getComparatorBuilder → RUN_ERROR
	 * considered as HTML.
74
	 * 
75
	 * @param content
76
	 *            the content to test
77
	 * @return true if it is HTML, false otherwise
78
	 */
79
	public static boolean isHtml(String content) {
80
		return HTML_PATTERN.matcher(content).find();
81
	}
82
83
	/**
84 1 1. appendDocumentElementIndication : negated conditional → NO_COVERAGE
	 * Finds all CSS file inclusions (looks for <code>link</code> tags for
85
	 * stylesheet files). Returns only the path or URL to the CSS file. If the
86
	 * several CSS inclusions have the same path, the path is present in the
87
	 * list only one time.
88 1 1. appendDocumentElementIndication : removed call to org/xmlunit/diff/DefaultComparisonFormatter::appendDocumentElementIndication → NO_COVERAGE
	 * 
89
	 * @param htmlContent
90
	 *            the html content that may contain external CSS files
91
	 * @return the list of found CSS inclusions (paths only) or empty if nothing
92
	 *         found
93
	 */
94
	public static List<String> getDistinctCssUrls(String htmlContent) {
95
		Document doc = Jsoup.parse(htmlContent);
96
		Elements els = doc.select(CSS_LINKS_SELECTOR);
97
		List<String> cssFiles = new ArrayList<>(els.size());
98
		for (Element e : els) {
99
			String path = e.attr(HREF_ATTR);
100
			if (!cssFiles.contains(path)) {
101
				cssFiles.add(path);
102
			}
103
		}
104
		return cssFiles;
105
	}
106
107
	/**
108
	 * Finds all image inclusions (looks for <code>img</code> tags). Returns
109
	 * only the path or URL to the image. If the several images have the same
110
	 * path, the path is present in the list only one time.
111
	 * 
112
	 * @param htmlContent
113
	 *            the html content that may contain image files
114
	 * @return the list of found images (paths only) or empty if nothing found
115
	 */
116
	public static List<String> getDistinctImageUrls(String htmlContent) {
117
		Document doc = Jsoup.parse(htmlContent);
118
		Elements els = doc.select(IMG_SELECTOR);
119
		List<String> images = new ArrayList<>(els.size());
120
		for (Element e : els) {
121
			String path = e.attr(SRC_ATTR);
122
			if (!images.contains(path)) {
123
				images.add(path);
124
			}
125
		}
126
		return images;
127
	}
128
129
	/**
130
	 * Finds all image inclusions from CSS properties. Returns only the path or
131
	 * URL to the image. If the several images have the same path, the path is
132
	 * present in the list only one time.
133
	 * 
134
	 * <p>
135
	 * It looks for:
136
	 * <ul>
137
	 * <li><code>background</code></li>
138
	 * <li><code>background-image</code></li>
139
	 * <li><code>list-style</code></li>
140
	 * <li><code>list-style-image</code></li>
141
	 * <li><code>cursor</code></li>
142
	 * </ul>
143
	 * 
144
	 * @param htmlContent
145
	 *            the html content that may contain image files
146
	 * @return the list of found images (paths only) or empty if nothing found
147
	 */
148
	public static List<String> getDistinctCssImageUrls(String htmlContent) {
149
		List<String> urls = new ArrayList<>();
150
		Matcher m = CSS_IMAGE_PROPERTIES_PATTERN.matcher(QUOTE_ENTITY.matcher(htmlContent).replaceAll("'"));
151
		while (m.find()) {
152
			for (CssUrlFunction url : getCssUrlFunctions(m.group("value"))) {
153
				if (!urls.contains(url.getUrl())) {
154
					urls.add(url.getUrl());
155
				}
156
			}
157
		}
158
		return urls;
159
	}
160
161
	/**
162
	 * Parse the CSS property value that may contain one or several
163
	 * {@code url()} CSS function(s).
164
	 * 
165
	 * Each element of the returned list provides the following information:
166
	 * <ul>
167
	 * <li>{@code "source"}: the whole match of the {@code url()} function</li>
168
	 * <li>{@code "start"}: matches the {@code url(} part (without quote, spaces
169
	 * are preserved)</li>
170
	 * <li>{@code "end"}: matches the {@code )} part (without quote, spaces are
171
	 * preserved)</li>
172
	 * <li>{@code "url"}: the url (without surrounding quotes)</li>
173
	 * <li>{@code "enclosingQuoteChar"}: either {@literal "} character,
174
	 * {@literal '} character or empty string</li>
175
	 * </ul>
176
	 * 
177
	 * <strong>WARNING:</strong> This function doesn't attempt to validate the
178
	 * URL at all. It just extracts the different parts for later parsing. If
179
	 * either the URL or CSS property value or the {@code url()} function is
180
	 * invalid, it may still return a value because it depends on the parsing
181
	 * context. It may then return an invalid form. For example
182
	 * {@code url('images/h'1.gif')} is not valid due to unscaped single quote,
183
	 * however this method will return a result with {@code images/h'1.gif} as
184
	 * URL.
185
	 * 
186
	 * @param cssPropertyValue
187
	 *            the value of the CSS property
188
	 * @param additionalEnclosingQuotes
189
	 *            allow additional forms such as
190
	 *            {@code url(&quot;http://some-url&quot;)} that may be used in
191
	 *            style attribute
192
	 * @return the list of meta information about the matched urls
193
	 */
194
	public static List<CssUrlFunction> getCssUrlFunctions(String cssPropertyValue, String... additionalEnclosingQuotes) {
195
		List<String> possibleQuotes = new ArrayList<>(asList("'", "\""));
196
		possibleQuotes.addAll(asList(additionalEnclosingQuotes));
197
		Pattern cssUrlFuncPattern = generateUrlFuncPattern(possibleQuotes);
198
		List<CssUrlFunction> urls = new ArrayList<>();
199
		Matcher urlMatcher = cssUrlFuncPattern.matcher(cssPropertyValue);
200
		while (urlMatcher.find()) {
201
			CssUrlFunction url = null;
202
			for (int i = 0; i < possibleQuotes.size(); i++) {
203
				if (urlMatcher.group("quotedform" + i) != null) {
204
					url = new CssUrlFunction(urlMatcher.group("quotedform" + i), urlMatcher.group("start" + i), urlMatcher.group("url" + i), urlMatcher.group("end" + i), possibleQuotes.get(i));
205
					break;
206
				}
207
			}
208
			if (urlMatcher.group("unquotedform") != null) {
209
				url = new CssUrlFunction(urlMatcher.group("unquotedform"), urlMatcher.group("startunquoted"), urlMatcher.group("urlunquoted"), urlMatcher.group("endunquoted"), "");
210
			}
211
			if (url != null) {
212
				urls.add(url);
213
			}
214
		}
215
		return urls;
216
	}
217
218
	/**
219
	 * Get the title of the HTML. If no <code>title</code> tag exists, then the
220
	 * title is null.
221
	 * 
222
	 * @param htmlContent
223
	 *            the HTML content that may contain a title
224
	 * @return the title of the HTML or null if none
225
	 */
226
	public static String getTitle(String htmlContent) {
227
		Document doc = Jsoup.parse(htmlContent);
228
		Elements titleNode = doc.select("head > title");
229
		return titleNode.isEmpty() ? null : doc.title();
230
	}
231
232
	/**
233
	 * The list of provided URLs are either relative or absolute. This method
234
	 * returns only the list of relative URLs.
235
	 * 
236
	 * <p>
237
	 * The URL is considered absolute if it starts with {@code "http://"} or
238
	 * {@code https://}.
239
	 * 
240
	 * 
241
	 * @param urls
242
	 *            the urls (relative or absolute)
243
	 * @return the relative urls only
244
	 */
245
	public static List<String> skipExternalUrls(List<String> urls) {
246
		for (Iterator<String> it = urls.iterator(); it.hasNext();) {
247
			String url = it.next();
248
			if (URL_PATTERN.matcher(url).matches()) {
249
				it.remove();
250
			}
251
		}
252
		return urls;
253
	}
254
255
	/**
256
	 * Generate a relative URL/path:
257
	 * <ul>
258
	 * <li>If {@code other} parameter is absolute, then return
259
	 * {@code other}.</li>
260
	 * <li>If {@code other} parameter is relative, then it merges {@code other}
261
	 * into {@code base}. For example:
262
	 * <ul>
263
	 * <li>base="css/foo.css", other="bar.png" {@literal =>} returns
264
	 * "css/bar.png"</li>
265
	 * <li>base="css/foo.css", other="../images/bar.png" {@literal =>} returns
266
	 * "images/bar.png"</li>
267
	 * <li>base="http://some-url/css/foo.css", other="bar.png" {@literal =>}
268
	 * returns "http://some-url/css/bar.png"</li>
269
	 * <li>base="http://some-url/css/foo.css", other="../images/bar.png"
270
	 * {@literal =>} returns "http://some-url/images/bar.png"</li>
271
	 * </ul>
272
	 * </li>
273
	 * </ul>
274
	 * 
275
	 * <p>
276
	 * This method uses {@link #isRelativeUrl(String)} to determine if
277
	 * {@code other} is relative or absolute.
278
	 * 
279
	 * @param base
280
	 *            the base path/URL
281
	 * @param other
282
	 *            the path/URL to relativize
283
	 * @return the merge path/URL
284
	 */
285
	public static String relativize(String base, String other) {
286
		if (!isRelativeUrl(other)) {
287
			return other;
288
		}
289
		Path basePath = Paths.get(base);
290
		return unescapeJavaUri(ResourceUtils.toResourcePath(basePath.resolveSibling(escapeForJavaUri(other)).normalize()));
291
	}
292
293
	/**
294
	 * Indicates if the URL is relative or not.
295
	 * 
296
	 * <p>
297
	 * Relative URLs may be:
298
	 * <ul>
299
	 * <li>{@code "relative/path"}</li>
300
	 * <li>{@code "./relative/path"}</li>
301
	 * <li>{@code "../relative/path"}</li>
302
	 * </ul>
303
	 * 
304
	 * <p>
305
	 * On the contrary, any URL that matches one of the following condition is
306
	 * absolute:
307
	 * <ul>
308
	 * <li>starts with a scheme or protocol (like {@code "http://"} or
309
	 * {@code "classpath:"}</li>
310
	 * <li>starts with a {@code "/"}</li>
311
	 * </ul>
312
	 * 
313
	 * @param url
314
	 *            the URL that may be relative or absolute
315
	 * @return true if relative
316
	 */
317
	public static boolean isRelativeUrl(String url) {
318
		try {
319
			if (url.startsWith("/")) {
320
				return false;
321
			}
322
			URI u = new URI(escapeForJavaUri(url));
323
			return !u.isAbsolute();
324
		} catch (URISyntaxException e) {
325
			LOG.warn("Can't determine if '{}' url is relative or absolute => consider absolute", url);
326
			LOG.trace("", e);
327
			return false;
328
		}
329
	}
330
331
	private static String escapeForJavaUri(String url) {
332
		return URI_INVALID_CHARS.matcher(url).replaceAll(URI_ESCAPE);
333
	}
334
335
	@SuppressWarnings({ "java:S5361", "squid:S5361" })
336
	private static String unescapeJavaUri(String url) {
337
		return url.replaceAll(URI_ESCAPE, URI_INVALID_CHARS.pattern());
338
	}
339
340
	private static Pattern generateUrlFuncPattern(List<String> possibleQuotes) {
341
		StringJoiner joiner = new StringJoiner("|");
342
		int i = 0;
343
		for (String possibleQuote : possibleQuotes) {
344
			joiner.add("(?<quotedform" + i + ">" + QUOTED_FORM.replace("#QUOTE#", Pattern.quote(possibleQuote)).replace("#QUOTENAME#", i + "") + ")");
345
			i++;
346
		}
347
		joiner.add("(?<unquotedform>" + UNQUOTED_FORM + ")");
348
		return Pattern.compile(joiner.toString(), Pattern.MULTILINE);
349
	}
350
351
	private HtmlUtils() {
352
		super();
353
	}
354
}

Mutations

47

1.1
Location : compare
Killed by :
negated conditional → RUN_ERROR

52

1.1
Location : compare
Killed by :
negated conditional → RUN_ERROR

60

1.1
Location : compare
Killed by :
replaced return value with null for fr/sii/ogham/testing/assertion/util/HtmlUtils::compare → RUN_ERROR

72

1.1
Location : getComparatorBuilder
Killed by :
negated conditional → RUN_ERROR

73

1.1
Location : getComparatorBuilder
Killed by :
replaced return value with null for fr/sii/ogham/testing/assertion/util/HtmlUtils::getComparatorBuilder → RUN_ERROR

84

1.1
Location : appendDocumentElementIndication
Killed by :
negated conditional → NO_COVERAGE

88

1.1
Location : appendDocumentElementIndication
Killed by :
removed call to org/xmlunit/diff/DefaultComparisonFormatter::appendDocumentElementIndication → NO_COVERAGE

Active mutators

Tests examined


Report generated by PIT 1.13.1