1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.any23.extractor.html;
18
19 import org.apache.any23.extractor.ExtractorFactory;
20 import org.apache.any23.extractor.IssueReport;
21 import org.apache.any23.rdf.RDFUtils;
22 import org.apache.any23.vocab.FOAF;
23 import org.junit.Test;
24
25
26
27
28
29 public class EmbeddedJSONLDExtractorTest extends AbstractExtractorTestCase {
30
31 @Test
32 public void testEmbeddedJSONLDInHead() throws Exception {
33 assertExtract("/html/html-embedded-jsonld-extractor.html");
34 assertModelNotEmpty();
35 assertStatementsSize(null, null, null, 3);
36 }
37
38 @Test
39 public void testSeveralEmbeddedJSONLDInHead() throws Exception {
40 assertExtract("/html/html-embedded-jsonld-extractor-multiple.html");
41 assertModelNotEmpty();
42 assertStatementsSize(null, null, null, 7);
43 }
44
45 @Test
46 public void testEmbeddedJSONLDInBody() throws Exception {
47 assertExtract("/html/html-body-embedded-jsonld-extractor.html");
48 assertModelNotEmpty();
49 assertStatementsSize(null, null, null, 3);
50 }
51
52 @Test
53 public void testEmbeddedJSONLDInHeadAndBody() throws Exception {
54 assertExtract("/html/html-head-and-body-embedded-jsonld-extractor.html");
55 assertModelNotEmpty();
56 assertStatementsSize(null, null, null, 7);
57 }
58
59 @Test
60 public void testJSONLDCommentStripping() throws Exception {
61 assertExtract("/html/html-jsonld-strip-comments.html");
62 assertModelNotEmpty();
63 assertStatementsSize(null, null, null, 3);
64 assertContains(RDFUtils.iri(FOAF.NS, "name"), "Robert\\\" Millar\\\\\"\"\\\\");
65 }
66
67 @Test
68 public void testJSONLDCommaNormalization() {
69 assertExtract("/html/html-jsonld-commas.html");
70 assertModelNotEmpty();
71 assertStatementsSize(null, null, null, 30);
72 }
73
74 @Test
75 public void testJSONLDUnescapedCharacters() {
76 assertExtract("/html/html-jsonld-unescaped-characters.html");
77 assertModelNotEmpty();
78 assertStatementsSize(null, null, null, 375);
79 assertContains(RDFUtils.iri("http://schema.org/name"), "Weezer & Pixies\u0008");
80 assertContains(RDFUtils.iri("http://schema.org/description"),
81 "#1 MAGIC SHOW IN L.A.\nThe current WINNER of the CW’s Penn & Teller’s FOOL US, Illusionist "
82 + "extraordinaire Ivan Amodei is on a national tour with his show INTIMATE ILLUSIONS."
83 + "\n\nCurrently, on an ei...");
84 }
85
86 @Test
87 public void testJSONLDFatalError() {
88 assertExtract("/html/html-jsonld-fatal-error.html", false);
89 assertIssue(IssueReport.IssueLevel.FATAL,
90 ".*Unexpected character .* was expecting comma to separate Object entries.*");
91 assertStatementsSize(null, null, null, 4);
92 }
93
94 @Test
95 public void testJSONLDBadCharacter() throws Exception {
96 assertExtract("/html/html-jsonld-bad-character.html");
97 assertStatementsSize(null, null, null, 12);
98 }
99
100 @Override
101 protected ExtractorFactory<?> getExtractorFactory() {
102 return new EmbeddedJSONLDExtractorFactory();
103 }
104
105 }