Coverage Report

Coverage Report - org.apache.any23.plugin.htmlscraper.HTMLScraperExtractor

Classes in this File

Line Coverage

Branch Coverage

Complexity

HTMLScraperExtractor

0/32

0/4

2.25

HTMLScraperExtractor$ExtractionRule

0/11

0/6

2.25

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *  http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 
 package org.apache.any23.plugin.htmlscraper;
 
 import de.l3s.boilerpipe.BoilerpipeExtractor;
 import de.l3s.boilerpipe.BoilerpipeProcessingException;
 import de.l3s.boilerpipe.extractors.ArticleExtractor;
 import de.l3s.boilerpipe.extractors.CanolaExtractor;
 import de.l3s.boilerpipe.extractors.DefaultExtractor;
 import de.l3s.boilerpipe.extractors.LargestContentExtractor;
 import org.apache.any23.extractor.ExtractionContext;
 import org.apache.any23.extractor.ExtractionException;
 import org.apache.any23.extractor.ExtractionParameters;
 import org.apache.any23.extractor.ExtractionResult;
 import org.apache.any23.extractor.Extractor;
 import org.apache.any23.extractor.ExtractorFactory;
 import org.apache.any23.extractor.SimpleExtractorFactory;
 import org.apache.any23.vocab.SINDICE;
 import org.openrdf.model.URI;
 import org.openrdf.model.impl.ValueFactoryImpl;
 
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 
 /**
  * Implementation of content extractor for performing <i>HTML</i> scraping.
  *
  * @see HTMLScraperPlugin
  * @author Michele Mostarda (mostarda@fbk.eu)
  */
 public class HTMLScraperExtractor implements Extractor.ContentExtractor {
 
     public final static String NAME = "html-scraper";
 
     public final static URI PAGE_CONTENT_DE_PROPERTY  =
             ValueFactoryImpl.getInstance().createURI(SINDICE.NS + "pagecontent/de");
     public final static URI PAGE_CONTENT_AE_PROPERTY  =
             ValueFactoryImpl.getInstance().createURI(SINDICE.NS + "pagecontent/ae");
     public final static URI PAGE_CONTENT_LCE_PROPERTY =
             ValueFactoryImpl.getInstance().createURI(SINDICE.NS + "pagecontent/lce");
     public final static URI PAGE_CONTENT_CE_PROPERTY  =
             ValueFactoryImpl.getInstance().createURI(SINDICE.NS + "pagecontent/ce");
 
     protected final static ExtractorFactory<HTMLScraperExtractor> factory =
             SimpleExtractorFactory.create(
                     NAME,
                     null,
                     Arrays.asList("text/html;q=0.02", "application/xhtml+xml;q=0.02"),
                     null,
                     HTMLScraperExtractor.class
             );
 
     private final List<ExtractionRule> extractionRules = new ArrayList<ExtractionRule>();
 
     public HTMLScraperExtractor() {
         loadDefaultRules();
     }
 
     public void addTextExtractor(String name, URI property, BoilerpipeExtractor extractor) {
         extractionRules.add( new ExtractionRule(name, property, extractor) );
     }
 
     public String[] getTextExtractors() {
         final List<String> extractors = new ArrayList<String>();
         for(ExtractionRule er : extractionRules) {
             extractors.add(er.name);
         }
         return extractors.toArray( new String[extractors.size()] );
     }
 
     public void run(
             ExtractionParameters extractionParameters,
             ExtractionContext extractionContext,
             InputStream inputStream,
             ExtractionResult extractionResult
     ) throws IOException, ExtractionException {
         try {
             final URI documentURI = extractionContext.getDocumentURI();
             for (ExtractionRule extractionRule : extractionRules) {
                 final String content = extractionRule.boilerpipeExtractor.getText(new InputStreamReader(inputStream));
                 extractionResult.writeTriple(
                         documentURI,
                         extractionRule.property,
                         ValueFactoryImpl.getInstance().createLiteral(content)
                 );
             }
         } catch (BoilerpipeProcessingException bpe) {
             throw new ExtractionException("Error while applying text processor " + ArticleExtractor.class, bpe);
         }
     }
 
     public ExtractorFactory getDescription() {
         return factory;
     }
 
     public void setStopAtFirstError(boolean b) {
         // Ignored.
     }
 
     private void loadDefaultRules() {
         addTextExtractor("default-extractor"      , PAGE_CONTENT_DE_PROPERTY , DefaultExtractor.getInstance());
         addTextExtractor("article-extractor"      , PAGE_CONTENT_AE_PROPERTY , ArticleExtractor.getInstance());
         addTextExtractor("large-content-extractor", PAGE_CONTENT_LCE_PROPERTY, LargestContentExtractor.getInstance());
         addTextExtractor("canola-extractor"       , PAGE_CONTENT_CE_PROPERTY , CanolaExtractor.getInstance());
     }
 
     /**
      * This class associates a <i>BoilerPipe</i> extractor with the property going to host the extracted content.
      */
     class ExtractionRule {
 
         public final String name;
         public final URI property;
         public final BoilerpipeExtractor boilerpipeExtractor;
 
         ExtractionRule(String name, URI property, BoilerpipeExtractor boilerpipeExtractor) {
             if(name == null) {
                 throw new NullPointerException("name cannot be null.");
             }
             if(property == null) {
                 throw new NullPointerException("property cannot be null.");
             }
             if(boilerpipeExtractor == null) {
                 throw new NullPointerException("extractor cannot be null.");
             }
             this.name = name;
             this.property = property;
             this.boilerpipeExtractor = boilerpipeExtractor;
         }
 
     }
 }

1		/*
2		* Licensed to the Apache Software Foundation (ASF) under one or more
3		* contributor license agreements. See the NOTICE file distributed with
4		* this work for additional information regarding copyright ownership.
5		* The ASF licenses this file to You under the Apache License, Version 2.0
6		* (the "License"); you may not use this file except in compliance with
7		* the License. You may obtain a copy of the License at
8		*
9		* http://www.apache.org/licenses/LICENSE-2.0
10		*
11		* Unless required by applicable law or agreed to in writing, software
12		* distributed under the License is distributed on an "AS IS" BASIS,
13		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14		* See the License for the specific language governing permissions and
15		* limitations under the License.
16		*/
17
18		package org.apache.any23.plugin.htmlscraper;
19
20		import de.l3s.boilerpipe.BoilerpipeExtractor;
21		import de.l3s.boilerpipe.BoilerpipeProcessingException;
22		import de.l3s.boilerpipe.extractors.ArticleExtractor;
23		import de.l3s.boilerpipe.extractors.CanolaExtractor;
24		import de.l3s.boilerpipe.extractors.DefaultExtractor;
25		import de.l3s.boilerpipe.extractors.LargestContentExtractor;
26		import org.apache.any23.extractor.ExtractionContext;
27		import org.apache.any23.extractor.ExtractionException;
28		import org.apache.any23.extractor.ExtractionParameters;
29		import org.apache.any23.extractor.ExtractionResult;
30		import org.apache.any23.extractor.Extractor;
31		import org.apache.any23.extractor.ExtractorFactory;
32		import org.apache.any23.extractor.SimpleExtractorFactory;
33		import org.apache.any23.vocab.SINDICE;
34		import org.openrdf.model.URI;
35		import org.openrdf.model.impl.ValueFactoryImpl;
36
37		import java.io.IOException;
38		import java.io.InputStream;
39		import java.io.InputStreamReader;
40		import java.util.ArrayList;
41		import java.util.Arrays;
42		import java.util.List;
43
44		/**
45		* Implementation of content extractor for performing <i>HTML<i/> scraping.
46		*
47		* @see HTMLScraperPlugin
48		* @author Michele Mostarda (mostarda@fbk.eu)
49		*/
50	0	public class HTMLScraperExtractor implements Extractor.ContentExtractor {
51
52		public final static String NAME = "html-scraper";
53
54	0	public final static URI PAGE_CONTENT_DE_PROPERTY =
55		ValueFactoryImpl.getInstance().createURI(SINDICE.NS + "pagecontent/de");
56	0	public final static URI PAGE_CONTENT_AE_PROPERTY =
57		ValueFactoryImpl.getInstance().createURI(SINDICE.NS + "pagecontent/ae");
58	0	public final static URI PAGE_CONTENT_LCE_PROPERTY =
59		ValueFactoryImpl.getInstance().createURI(SINDICE.NS + "pagecontent/lce");
60	0	public final static URI PAGE_CONTENT_CE_PROPERTY =
61		ValueFactoryImpl.getInstance().createURI(SINDICE.NS + "pagecontent/ce");
62
63	0	protected final static ExtractorFactory<HTMLScraperExtractor> factory =
64		SimpleExtractorFactory.create(
65		NAME,
66		null,
67		Arrays.asList("text/html;q=0.02", "application/xhtml+xml;q=0.02"),
68		null,
69		HTMLScraperExtractor.class
70		);
71
72	0	private final List<ExtractionRule> extractionRules = new ArrayList<ExtractionRule>();
73
74	0	public HTMLScraperExtractor() {
75	0	loadDefaultRules();
76	0	}
77
78		public void addTextExtractor(String name, URI property, BoilerpipeExtractor extractor) {
79	0	extractionRules.add( new ExtractionRule(name, property, extractor) );
80	0	}
81
82		public String[] getTextExtractors() {
83	0	final List<String> extractors = new ArrayList<String>();
84	0	for(ExtractionRule er : extractionRules) {
85	0	extractors.add(er.name);
86		}
87	0	return extractors.toArray( new String[extractors.size()] );
88		}
89
90		public void run(
91		ExtractionParameters extractionParameters,
92		ExtractionContext extractionContext,
93		InputStream inputStream,
94		ExtractionResult extractionResult
95		) throws IOException, ExtractionException {
96		try {
97	0	final URI documentURI = extractionContext.getDocumentURI();
98	0	for (ExtractionRule extractionRule : extractionRules) {
99	0	final String content = extractionRule.boilerpipeExtractor.getText(new InputStreamReader(inputStream));
100	0	extractionResult.writeTriple(
101		documentURI,
102		extractionRule.property,
103		ValueFactoryImpl.getInstance().createLiteral(content)
104		);
105	0	}
106	0	} catch (BoilerpipeProcessingException bpe) {
107	0	throw new ExtractionException("Error while applying text processor " + ArticleExtractor.class, bpe);
108	0	}
109	0	}
110
111		public ExtractorFactory getDescription() {
112	0	return factory;
113		}
114
115		public void setStopAtFirstError(boolean b) {
116		// Ignored.
117	0	}
118
119		private void loadDefaultRules() {
120	0	addTextExtractor("default-extractor" , PAGE_CONTENT_DE_PROPERTY , DefaultExtractor.getInstance());
121	0	addTextExtractor("article-extractor" , PAGE_CONTENT_AE_PROPERTY , ArticleExtractor.getInstance());
122	0	addTextExtractor("large-content-extractor", PAGE_CONTENT_LCE_PROPERTY, LargestContentExtractor.getInstance());
123	0	addTextExtractor("canola-extractor" , PAGE_CONTENT_CE_PROPERTY , CanolaExtractor.getInstance());
124	0	}
125
126		/**
127		* This class associates a <i>BoilerPipe</i> extractor with the property going to host the extracted content.
128		*/
129		class ExtractionRule {
130
131		public final String name;
132		public final URI property;
133		public final BoilerpipeExtractor boilerpipeExtractor;
134
135	0	ExtractionRule(String name, URI property, BoilerpipeExtractor boilerpipeExtractor) {
136	0	if(name == null) {
137	0	throw new NullPointerException("name cannot be null.");
138		}
139	0	if(property == null) {
140	0	throw new NullPointerException("property cannot be null.");
141		}
142	0	if(boilerpipeExtractor == null) {
143	0	throw new NullPointerException("extractor cannot be null.");
144		}
145	0	this.name = name;
146	0	this.property = property;
147	0	this.boilerpipeExtractor = boilerpipeExtractor;
148	0	}
149
150		}
151		}