1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor;
19
20 import org.apache.any23.AbstractAny23TestBase;
21 import org.apache.any23.configuration.DefaultConfiguration;
22 import org.apache.any23.configuration.ModifiableConfiguration;
23 import org.apache.any23.extractor.html.HTMLFixture;
24 import org.apache.any23.mime.TikaMIMETypeDetector;
25 import org.apache.any23.mime.purifier.WhiteSpacesPurifier;
26 import org.apache.any23.vocab.ICAL;
27 import org.apache.any23.vocab.Review;
28 import org.apache.any23.vocab.SINDICE;
29 import org.apache.any23.vocab.VCard;
30 import org.apache.any23.writer.CompositeTripleHandler;
31 import org.apache.any23.writer.RDFXMLWriter;
32 import org.apache.any23.writer.RepositoryWriter;
33 import org.apache.any23.writer.TripleHandlerException;
34 import org.junit.After;
35 import org.junit.Assert;
36 import org.junit.Before;
37 import org.junit.Test;
38 import org.eclipse.rdf4j.model.Resource;
39 import org.eclipse.rdf4j.model.Statement;
40 import org.eclipse.rdf4j.model.IRI;
41 import org.eclipse.rdf4j.model.Value;
42 import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
43 import org.eclipse.rdf4j.repository.RepositoryConnection;
44 import org.eclipse.rdf4j.repository.RepositoryException;
45 import org.eclipse.rdf4j.repository.RepositoryResult;
46 import org.eclipse.rdf4j.repository.sail.SailRepository;
47 import org.eclipse.rdf4j.sail.Sail;
48 import org.eclipse.rdf4j.sail.SailException;
49 import org.eclipse.rdf4j.sail.memory.MemoryStore;
50 import org.slf4j.Logger;
51 import org.slf4j.LoggerFactory;
52
53 import java.io.ByteArrayOutputStream;
54 import java.io.FileNotFoundException;
55 import java.io.IOException;
56 import java.nio.charset.StandardCharsets;
57 import java.util.Locale;
58
59
60
61
62
63
64
65
66 public class SingleDocumentExtractionTest extends AbstractAny23TestBase {
67
68 private static final SINDICE vSINDICE = SINDICE.getInstance();
69 private static final ICAL vICAL = ICAL.getInstance();
70 private static final Review vREVIEW = Review.getInstance();
71 private static final VCard vVCARD = VCard.getInstance();
72
73 private static final Logger logger = LoggerFactory.getLogger(SingleDocumentExtractionTest.class);
74
75 private SingleDocumentExtraction singleDocumentExtraction;
76
77 private ExtractorGroup extractorGroup;
78
79 private Sail store;
80
81 private RepositoryConnection conn;
82
83 RepositoryWriter repositoryWriter;
84
85 ByteArrayOutputStream baos;
86
87 RDFXMLWriter rdfxmlWriter;
88
89 @Before
90 public void setUp() throws Exception {
91 super.setUp();
92 extractorGroup = ExtractorRegistryImpl.getInstance().getExtractorGroup();
93 store = new MemoryStore();
94 store.init();
95 conn = new SailRepository(store).getConnection();
96 }
97
98 @After
99 public void tearDown() throws SailException, RepositoryException, TripleHandlerException {
100 rdfxmlWriter.close();
101 repositoryWriter.close();
102 logger.debug(baos.toString(StandardCharsets.UTF_8));
103
104 singleDocumentExtraction = null;
105 extractorGroup = null;
106 conn.close();
107 conn = null;
108 store.shutDown();
109 store = null;
110 }
111
112
113
114
115
116
117
118
119
120
121
122 @Test
123 public void testMicroformatDomains() throws IOException, ExtractionException, RepositoryException {
124 singleDocumentExtraction = getInstance("/microformats/microformat-domains.html");
125 singleDocumentExtraction.run();
126 logStorageContent();
127 assertTripleCount(vSINDICE.getProperty(SINDICE.DOMAIN), "nested.test.com", 1);
128 }
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144 @Test
145 public void testNestedMicroformats() throws IOException, ExtractionException, RepositoryException {
146 singleDocumentExtraction = getInstance("/microformats/nested-microformats-a1.html");
147 singleDocumentExtraction.run();
148
149 logStorageContent();
150
151 assertTripleCount(vSINDICE.getProperty(SINDICE.DOMAIN), "nested.test.com", 2);
152 assertTriple(vSINDICE.getProperty(SINDICE.NESTING), (Value) null);
153 assertTriple(vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), vICAL.summary);
154 assertTriple(vSINDICE.getProperty(SINDICE.NESTING_STRUCTURED), (Value) null);
155 }
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171 @Test
172 public void testNestedVCardAdr() throws IOException, ExtractionException, RepositoryException {
173 singleDocumentExtraction = getInstance("/microformats/nested-microformats-a3.html");
174 singleDocumentExtraction.run();
175
176 logStorageContent();
177
178 assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), (Value) null, 0);
179 assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING_STRUCTURED), (Value) null, 0);
180 }
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200 @Test
201 public void testNestedMicroformatsInduced() throws IOException, ExtractionException, RepositoryException {
202 singleDocumentExtraction = getInstance("/microformats/nested-microformats-a2.html");
203 singleDocumentExtraction.run();
204
205 logStorageContent();
206
207 assertTripleCount(vSINDICE.getProperty(SINDICE.DOMAIN), "nested.test.com", 2);
208 assertTriple(vSINDICE.getProperty(SINDICE.NESTING), (Value) null);
209 assertTriple(vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), vICAL.summary);
210 assertTriple(vSINDICE.getProperty(SINDICE.NESTING_STRUCTURED), (Value) null);
211 }
212
213
214
215
216
217
218
219
220
221
222
223
224
225 @Test
226
227
228
229
230
231
232 public void testNestedMicroformatsManaged() throws IOException, ExtractionException, RepositoryException {
233 singleDocumentExtraction = getInstance("/microformats/nested-microformats-managed.html");
234 singleDocumentExtraction.run();
235
236 logStorageContent();
237
238 assertTripleCount(vSINDICE.getProperty(SINDICE.DOMAIN), "nested.test.com", 3);
239 assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING), (Value) null, 1);
240 assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), vREVIEW.hasReview, 1);
241
242 assertTripleCount(vVCARD.url, (Value) null, 1);
243 Value object = getTripleObject(null, vREVIEW.hasReview);
244 assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING_STRUCTURED), object, 1);
245 assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), vREVIEW.hasReview, 1);
246 }
247
248 private SingleDocumentExtraction getInstance(String file) throws FileNotFoundException, IOException {
249 baos = new ByteArrayOutputStream();
250 rdfxmlWriter = new RDFXMLWriter(baos);
251 repositoryWriter = new RepositoryWriter(conn);
252
253 final CompositeTripleHandler cth = new CompositeTripleHandler();
254 cth.addChild(rdfxmlWriter);
255 cth.addChild(repositoryWriter);
256
257 final ModifiableConfiguration configuration = DefaultConfiguration.copy();
258 configuration.setProperty("any23.extraction.metadata.domain.per.entity", "on");
259 SingleDocumentExtraction instance = new SingleDocumentExtraction(configuration,
260 new HTMLFixture(copyResourceToTempFile(file)).getOpener("http://nested.test.com"), extractorGroup, cth);
261 instance.setMIMETypeDetector(new TikaMIMETypeDetector(new WhiteSpacesPurifier()));
262 return instance;
263 }
264
265
266
267
268
269
270
271 private void logStorageContent() throws RepositoryException {
272 RepositoryResult<Statement> result = conn.getStatements(null, null, null, false);
273 while (result.hasNext()) {
274 Statement statement = result.next();
275 logger.debug(statement.toString());
276 }
277 }
278
279
280
281
282
283
284
285
286
287
288 private void assertTripleCount(IRI predicate, Value value, int occurrences) throws RepositoryException {
289 RepositoryResult<Statement> statements = conn.getStatements(null, predicate, value, false);
290 int count = 0;
291 while (statements.hasNext()) {
292 statements.next();
293 count++;
294 }
295 Assert.assertEquals(
296 String.format(Locale.ROOT, "Cannot find triple (* %s %s) %d times", predicate, value, occurrences),
297 occurrences, count);
298 }
299
300
301
302
303
304
305
306
307
308
309 private void assertTripleCount(IRI predicate, String value, int occurrences) throws RepositoryException {
310 assertTripleCount(predicate, SimpleValueFactory.getInstance().createLiteral(value), occurrences);
311 }
312
313
314
315
316
317
318
319
320
321 private void assertTriple(IRI predicate, Value value) throws RepositoryException {
322 assertTripleCount(predicate, value, 1);
323 }
324
325
326
327
328
329
330
331
332
333 @SuppressWarnings("unused")
334 private void assertTriple(IRI predicate, String value) throws RepositoryException {
335 assertTriple(predicate, SimpleValueFactory.getInstance().createLiteral(value));
336 }
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351 private Value getTripleObject(Resource sub, IRI prop) throws RepositoryException {
352 RepositoryResult<Statement> statements = conn.getStatements(sub, prop, null, false);
353 Assert.assertTrue(statements.hasNext());
354 Statement statement = statements.next();
355 Value value = statement.getObject();
356 Assert.assertFalse("Expected just one result.", statements.hasNext());
357 statements.close();
358 return value;
359 }
360
361 }