-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathHtmlParser.java
More file actions
69 lines (60 loc) · 2.51 KB
/
HtmlParser.java
File metadata and controls
69 lines (60 loc) · 2.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import java.util.*;
import java.util.regex.Pattern;
/**
 * Finds the text line that sits at the greatest tag-nesting depth of an HTML document that is
 * supplied one line at a time.
 *
 * <p>Every line is classified, after trimming surrounding whitespace, as one of:
 * <ul>
 *   <li>an opening tag, which increases the depth;</li>
 *   <li>a closing tag, which decreases the depth (never below zero);</li>
 *   <li>other markup (self-closing tags, comments, doctype, lines made only of inline markup),
 *       which is ignored;</li>
 *   <li>text, which is a candidate for the result unless it lies inside a
 *       {@code <script>} or {@code <style>} element.</li>
 * </ul>
 *
 * <p>Tag names are not checked for balance: a closing tag simply reduces the depth, whatever
 * its name.
 */
public class HtmlParser {

    /** A line consisting solely of one opening tag, e.g. {@code <div class="x">}. */
    private static final Pattern OPENING_TAG = Pattern.compile("^<[^/!][^>]*>$");

    /** A line consisting solely of one closing tag, e.g. {@code </div>}. */
    private static final Pattern CLOSING_TAG = Pattern.compile("^</[^>]*>$");

    /** A line that starts with {@code <} and ends with {@code >}; such lines are never text. */
    private static final Pattern MARKUP_ONLY = Pattern.compile("^<[!].*>$|^<.*>$");

    /** Value returned when the input contains no usable text line. */
    private static final String NO_TEXT_FOUND = "malformed HTML";

    /**
     * Parses the HTML and returns the deepest text found.
     *
     * <p>When several text lines share the greatest depth, the first one encountered wins.
     * Blank lines and {@code null} entries are skipped.
     *
     * @param htmlLines list of HTML lines; must not be {@code null}
     * @return the deepest text, as produced by {@code HtmlCleaner.stripHtmlTags}, or
     *         {@code "malformed HTML"} if no text line was found
     * @throws NullPointerException if {@code htmlLines} is {@code null}
     */
    public String findDeepestText(List<String> htmlLines) {
        Objects.requireNonNull(htmlLines, "htmlLines");

        int depth = 0;
        int maxDepth = 0;
        String deepestText = null;
        boolean insideScriptOrStyle = false;

        for (String rawLine : htmlLines) {
            if (rawLine == null) {
                continue;
            }
            // Classify on the trimmed line so indented markup is still recognised as markup.
            String line = rawLine.trim();
            if (line.isEmpty()) {
                // A blank line carries no text and must not win as the "deepest text".
                continue;
            }

            if (isOpeningTag(line)) {
                if (isScriptOrStyle(extractTagName(line))) {
                    insideScriptOrStyle = true;
                }
                depth++;
            } else if (isClosingTag(line)) {
                if (isScriptOrStyle(extractTagName(line))) {
                    insideScriptOrStyle = false;
                }
                // An unmatched closing tag must not drive the depth negative.
                if (depth > 0) {
                    depth--;
                }
            } else if (!insideScriptOrStyle && isValidText(line)) {
                // Strictly deeper replaces the current candidate; equal depth keeps the first.
                if (deepestText == null || depth > maxDepth) {
                    maxDepth = depth;
                    // The raw line is handed over, so the helper sees exactly what it always did.
                    // Per the original comment it removes HTML tags and keeps the textual content.
                    deepestText = HtmlCleaner.stripHtmlTags(rawLine);
                }
            }
        }
        return deepestText != null ? deepestText : NO_TEXT_FOUND;
    }

    /**
     * Tells whether the (trimmed) line is a single opening tag. Self-closing tags such as
     * {@code <br/>} are excluded: they are never closed, so counting them would leave the
     * depth permanently inflated.
     */
    private static boolean isOpeningTag(String line) {
        return OPENING_TAG.matcher(line).matches() && !line.endsWith("/>");
    }

    /** Tells whether the (trimmed) line is a single closing tag. */
    private static boolean isClosingTag(String line) {
        return CLOSING_TAG.matcher(line).matches();
    }

    /** Tells whether the (trimmed) line is text, i.e. not wrapped in {@code <...>}. */
    private static boolean isValidText(String line) {
        return !MARKUP_ONLY.matcher(line).matches();
    }

    /** Tells whether the tag name is {@code script} or {@code style}, ignoring case. */
    private static boolean isScriptOrStyle(String tagName) {
        return "script".equalsIgnoreCase(tagName) || "style".equalsIgnoreCase(tagName);
    }

    /**
     * Extracts the element name from an opening or closing tag.
     *
     * <p>Leading {@code <}, {@code /} and whitespace are skipped; the name then runs up to the
     * next whitespace, {@code /} or {@code >}. Scanning by hand avoids indexing into the result
     * of {@code String.split}, which is empty for a tag that holds only whitespace.
     *
     * @param tag the tag text, e.g. {@code <div class="x">} or {@code </div>}
     * @return the element name, or an empty string if the tag contains none
     */
    private static String extractTagName(String tag) {
        int length = tag.length();
        int start = 0;
        while (start < length) {
            char c = tag.charAt(start);
            if (c != '<' && c != '/' && !Character.isWhitespace(c)) {
                break;
            }
            start++;
        }
        int end = start;
        while (end < length) {
            char c = tag.charAt(end);
            if (c == '>' || c == '/' || Character.isWhitespace(c)) {
                break;
            }
            end++;
        }
        return tag.substring(start, end);
    }
}