1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html;
19
20 import org.apache.any23.extractor.ExtractorFactory;
21 import org.apache.any23.rdf.RDFUtils;
22 import org.apache.any23.vocab.DCTerms;
23 import org.apache.any23.vocab.SINDICE;
24 import org.junit.Test;
25 import org.eclipse.rdf4j.model.Literal;
26 import org.eclipse.rdf4j.repository.RepositoryException;
27
28
29
30
31
32 public class TitleExtractorTest extends AbstractExtractorTestCase {
33
34 private static final DCTerms vDCTERMS = DCTerms.getInstance();
35 private static final SINDICE vSINDICE = SINDICE.getInstance();
36
37 private Literal helloLiteral = RDFUtils.literal("Hello World!");
38
39 @Override
40 protected ExtractorFactory<?> getExtractorFactory() {
41 return new TitleExtractorFactory();
42 }
43
44 @Test
45 public void testExtractPageTitle() throws RepositoryException {
46 assertExtract("/microformats/xfn/simple-me.html");
47 assertContains(baseIRI, vDCTERMS.title, helloLiteral);
48 }
49
50 @Test
51 public void testStripSpacesFromTitle() throws RepositoryException {
52 assertExtract("/microformats/xfn/strip-spaces.html");
53 assertContains(baseIRI, vDCTERMS.title, helloLiteral);
54 }
55
56 @Test
57 public void testNoPageTitle() throws RepositoryException {
58 assertExtract("/microformats/xfn/tagsoup.html");
59 assertModelEmpty();
60 }
61
62 @Test
63 public void testMixedCaseTitleTag() throws RepositoryException {
64 assertExtract("/microformats/xfn/mixed-case.html");
65 assertContains(baseIRI, vDCTERMS.title, helloLiteral);
66 }
67
68
69
70
71
72
73
74 @Test
75 public void testTitleWithDefaultLanguage() throws RepositoryException {
76 assertExtract("/html/default-language.html");
77 assertContains(baseIRI, vDCTERMS.title, RDFUtils.literal("Welcome to mydomain.net", "en"));
78 assertNotContains(baseIRI, vDCTERMS.title, RDFUtils.literal("Welcome to mydomain.net", (String) null));
79 }
80
81 }