1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html;
19
20 import org.apache.any23.extractor.ExtractionException;
21 import org.junit.After;
22 import org.junit.Assert;
23 import org.junit.Test;
24 import org.eclipse.rdf4j.repository.RepositoryException;
25 import org.w3c.dom.Document;
26 import org.w3c.dom.NamedNodeMap;
27 import org.w3c.dom.Node;
28 import org.w3c.dom.NodeList;
29
30 import java.io.BufferedInputStream;
31 import java.io.ByteArrayInputStream;
32 import java.io.ByteArrayOutputStream;
33 import java.io.IOException;
34 import java.io.InputStream;
35 import java.io.PrintStream;
36 import java.nio.charset.StandardCharsets;
37
38
39
40
41
42
43
44
45 public class TagSoupParserTest {
46
47 private static final String page = "http://semanticweb.org/wiki/Knud_M%C3%B6ller";
48
49 private TagSoupParser tagSoupParser;
50
51 @After
52 public void tearDown() throws RepositoryException {
53 this.tagSoupParser = null;
54
55 }
56
57 @Test
58 public void testParseSimpleHTML() throws IOException {
59 String html = "<html><head><title>Test</title></head><body><h1>Hello!</h1></body></html>";
60 InputStream input = new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8));
61 Node document = new TagSoupParser(input, "http://example.com/").getDOM();
62 Assert.assertEquals("Test", new HTMLDocument(document).find("//TITLE"));
63 Assert.assertEquals("Hello!", new HTMLDocument(document).find("//H1"));
64 }
65
66 @Test
67 public void testExplicitEncodingBehavior() throws IOException, ExtractionException, RepositoryException {
68 this.tagSoupParser = new TagSoupParser(
69 new BufferedInputStream(this.getClass().getResourceAsStream("/html/encoding-test.html")), page,
70 "UTF-8");
71
72 Assert.assertEquals(this.tagSoupParser.getDOM().getElementsByTagName("title").item(0).getTextContent(),
73 "Knud M\u00F6ller - semanticweb.org");
74 }
75
76
77
78
79
80
81
82
83
84
85
86
87 @Test
88 public void testImplicitEncodingBehavior() throws IOException, ExtractionException, RepositoryException {
89 this.tagSoupParser = new TagSoupParser(
90 new BufferedInputStream(this.getClass().getResourceAsStream("/html/encoding-test.html")), page);
91 Assert.assertNotSame(this.tagSoupParser.getDOM().getElementsByTagName("title").item(0).getTextContent(),
92 "Knud M\u00F6ller - semanticweb.org");
93 }
94
95
96
97
98
99
100
101 @Test
102 public void testEmptySpanElements() throws IOException {
103 final String page = "http://example.com/test-page";
104 InputStream brokenEmptySpanHtml = new BufferedInputStream(
105 this.getClass().getResourceAsStream("/html/empty-span-broken.html"));
106 InputStream worksEmptySpanHtml = new BufferedInputStream(
107 this.getClass().getResourceAsStream("/html/empty-span-works.html"));
108 this.tagSoupParser = new TagSoupParser(brokenEmptySpanHtml, page);
109 Document brokenElementDom = this.tagSoupParser.getDOM();
110 this.tagSoupParser = null;
111
112 this.tagSoupParser = new TagSoupParser(worksEmptySpanHtml, page);
113 Document worksElementDom = this.tagSoupParser.getDOM();
114
115 NodeList brokenNodeList = brokenElementDom.getElementsByTagName("span");
116 Assert.assertEquals(3, brokenNodeList.getLength());
117
118 NodeList worksNodeList = worksElementDom.getElementsByTagName("span");
119 Assert.assertEquals(3, worksNodeList.getLength());
120
121 final ByteArrayOutputStream out1 = new ByteArrayOutputStream();
122 PrintStream psOut1 = new PrintStream(out1, true, StandardCharsets.UTF_8);
123 for (int i = 0; i < worksNodeList.getLength(); i++) {
124 printNode(worksNodeList.item(i), psOut1);
125 }
126 psOut1.close();
127
128 final ByteArrayOutputStream out2 = new ByteArrayOutputStream();
129 PrintStream psOut2 = new PrintStream(out2, true, StandardCharsets.UTF_8);
130 for (int i = 0; i < brokenNodeList.getLength(); i++) {
131 printNode(brokenNodeList.item(i), psOut2);
132 }
133 psOut2.close();
134
135 Assert.assertEquals(out1.toString(StandardCharsets.UTF_8), out2.toString(StandardCharsets.UTF_8));
136 }
137
138 private void printNode(Node node, PrintStream printStream) {
139 printStream.println("node name:" + node.getNodeName());
140 printStream.println("node value:" + node.getNodeValue());
141 printStream.println("node has child:" + node.hasChildNodes());
142 printStream.println("node # child:" + node.getChildNodes().getLength());
143
144 printStream.println("node child:");
145 NodeList childNodes = node.getChildNodes();
146 for (int j = 0; j < childNodes.getLength(); j++) {
147 Node brokenChild = childNodes.item(j);
148 printStream.println(" node name:" + brokenChild.getNodeName());
149 printStream.println(" node type:" + brokenChild.getNodeType());
150 printStream.println(" node value:" + trimValue(brokenChild.getNodeValue()));
151 }
152
153 printStream.println("node attributes:");
154 NamedNodeMap namedNodeMap = node.getAttributes();
155 for (int j = 0; j < namedNodeMap.getLength(); j++) {
156 Node attribute = namedNodeMap.item(j);
157 printStream.println(" attribute name:" + attribute.getNodeName());
158 printStream.println(" attribute value:" + trimValue(attribute.getNodeValue()));
159 }
160 printStream.println();
161 }
162
163 private String trimValue(String in) {
164 return in == null ? "" : in.trim();
165 }
166
167 }