Coverage Report - org.apache.any23.plugin.htmlscraper.HTMLScraperExtractor
 
Classes in this File Line Coverage Branch Coverage Complexity
HTMLScraperExtractor
0%
0/32
0%
0/4
2.25
HTMLScraperExtractor$ExtractionRule
0%
0/11
0%
0/6
2.25
 
 1  
 /*
 2  
  * Licensed to the Apache Software Foundation (ASF) under one or more
 3  
  * contributor license agreements.  See the NOTICE file distributed with
 4  
  * this work for additional information regarding copyright ownership.
 5  
  * The ASF licenses this file to You under the Apache License, Version 2.0
 6  
  * (the "License"); you may not use this file except in compliance with
 7  
  * the License.  You may obtain a copy of the License at
 8  
  *
 9  
  *  http://www.apache.org/licenses/LICENSE-2.0
 10  
  *
 11  
  * Unless required by applicable law or agreed to in writing, software
 12  
  * distributed under the License is distributed on an "AS IS" BASIS,
 13  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14  
  * See the License for the specific language governing permissions and
 15  
  * limitations under the License.
 16  
  */
 17  
 
 18  
 package org.apache.any23.plugin.htmlscraper;
 19  
 
 20  
 import de.l3s.boilerpipe.BoilerpipeExtractor;
 21  
 import de.l3s.boilerpipe.BoilerpipeProcessingException;
 22  
 import de.l3s.boilerpipe.extractors.ArticleExtractor;
 23  
 import de.l3s.boilerpipe.extractors.CanolaExtractor;
 24  
 import de.l3s.boilerpipe.extractors.DefaultExtractor;
 25  
 import de.l3s.boilerpipe.extractors.LargestContentExtractor;
 26  
 import org.apache.any23.extractor.ExtractionContext;
 27  
 import org.apache.any23.extractor.ExtractionException;
 28  
 import org.apache.any23.extractor.ExtractionParameters;
 29  
 import org.apache.any23.extractor.ExtractionResult;
 30  
 import org.apache.any23.extractor.Extractor;
 31  
 import org.apache.any23.extractor.ExtractorFactory;
 32  
 import org.apache.any23.extractor.SimpleExtractorFactory;
 33  
 import org.apache.any23.vocab.SINDICE;
 34  
 import org.openrdf.model.URI;
 35  
 import org.openrdf.model.impl.ValueFactoryImpl;
 36  
 
 37  
 import java.io.IOException;
 38  
 import java.io.InputStream;
 39  
 import java.io.InputStreamReader;
 40  
 import java.util.ArrayList;
 41  
 import java.util.Arrays;
 42  
 import java.util.List;
 43  
 
 44  
 /**
 45  
  * Implementation of content extractor for performing <i>HTML</i> scraping.
 46  
  *
 47  
  * @see HTMLScraperPlugin
 48  
  * @author Michele Mostarda (mostarda@fbk.eu)
 49  
  */
 50  0
 public class HTMLScraperExtractor implements Extractor.ContentExtractor {
 51  
 
 52  
     public final static String NAME = "html-scraper";
 53  
 
 54  0
     public final static URI PAGE_CONTENT_DE_PROPERTY  =
 55  
             ValueFactoryImpl.getInstance().createURI(SINDICE.NS + "pagecontent/de");
 56  0
     public final static URI PAGE_CONTENT_AE_PROPERTY  =
 57  
             ValueFactoryImpl.getInstance().createURI(SINDICE.NS + "pagecontent/ae");
 58  0
     public final static URI PAGE_CONTENT_LCE_PROPERTY =
 59  
             ValueFactoryImpl.getInstance().createURI(SINDICE.NS + "pagecontent/lce");
 60  0
     public final static URI PAGE_CONTENT_CE_PROPERTY  =
 61  
             ValueFactoryImpl.getInstance().createURI(SINDICE.NS + "pagecontent/ce");
 62  
 
 63  0
     protected final static ExtractorFactory<HTMLScraperExtractor> factory =
 64  
             SimpleExtractorFactory.create(
 65  
                     NAME,
 66  
                     null,
 67  
                     Arrays.asList("text/html;q=0.02", "application/xhtml+xml;q=0.02"),
 68  
                     null,
 69  
                     HTMLScraperExtractor.class
 70  
             );
 71  
 
 72  0
     private final List<ExtractionRule> extractionRules = new ArrayList<ExtractionRule>();
 73  
 
 74  0
     public HTMLScraperExtractor() {
 75  0
         loadDefaultRules();
 76  0
     }
 77  
 
 78  
     public void addTextExtractor(String name, URI property, BoilerpipeExtractor extractor) {
 79  0
         extractionRules.add( new ExtractionRule(name, property, extractor) );
 80  0
     }
 81  
 
 82  
     public String[] getTextExtractors() {
 83  0
         final List<String> extractors = new ArrayList<String>();
 84  0
         for(ExtractionRule er : extractionRules) {
 85  0
             extractors.add(er.name);
 86  
         }
 87  0
         return extractors.toArray( new String[extractors.size()] );
 88  
     }
 89  
 
 90  
     public void run(
 91  
             ExtractionParameters extractionParameters,
 92  
             ExtractionContext extractionContext,
 93  
             InputStream inputStream,
 94  
             ExtractionResult extractionResult
 95  
     ) throws IOException, ExtractionException {
 96  
         try {
 97  0
             final URI documentURI = extractionContext.getDocumentURI();
 98  0
             for (ExtractionRule extractionRule : extractionRules) {
 99  0
                 final String content = extractionRule.boilerpipeExtractor.getText(new InputStreamReader(inputStream));
 100  0
                 extractionResult.writeTriple(
 101  
                         documentURI,
 102  
                         extractionRule.property,
 103  
                         ValueFactoryImpl.getInstance().createLiteral(content)
 104  
                 );
 105  0
             }
 106  0
         } catch (BoilerpipeProcessingException bpe) {
 107  0
             throw new ExtractionException("Error while applying text processor " + ArticleExtractor.class, bpe);
 108  0
         }
 109  0
     }
 110  
 
 111  
     public ExtractorFactory getDescription() {
 112  0
         return factory;
 113  
     }
 114  
 
 115  
     public void setStopAtFirstError(boolean b) {
 116  
         // Ignored.
 117  0
     }
 118  
 
 119  
     private void loadDefaultRules() {
 120  0
         addTextExtractor("default-extractor"      , PAGE_CONTENT_DE_PROPERTY , DefaultExtractor.getInstance());
 121  0
         addTextExtractor("article-extractor"      , PAGE_CONTENT_AE_PROPERTY , ArticleExtractor.getInstance());
 122  0
         addTextExtractor("large-content-extractor", PAGE_CONTENT_LCE_PROPERTY, LargestContentExtractor.getInstance());
 123  0
         addTextExtractor("canola-extractor"       , PAGE_CONTENT_CE_PROPERTY , CanolaExtractor.getInstance());
 124  0
     }
 125  
 
 126  
     /**
 127  
      * This class associates a <i>BoilerPipe</i> extractor with the property going to host the extracted content.
 128  
      */
 129  
     class ExtractionRule {
 130  
 
 131  
         public final String name;
 132  
         public final URI property;
 133  
         public final BoilerpipeExtractor boilerpipeExtractor;
 134  
 
 135  0
         ExtractionRule(String name, URI property, BoilerpipeExtractor boilerpipeExtractor) {
 136  0
             if(name == null) {
 137  0
                 throw new NullPointerException("name cannot be null.");
 138  
             }
 139  0
             if(property == null) {
 140  0
                 throw new NullPointerException("property cannot be null.");
 141  
             }
 142  0
             if(boilerpipeExtractor == null) {
 143  0
                 throw new NullPointerException("extractor cannot be null.");
 144  
             }
 145  0
             this.name = name;
 146  0
             this.property = property;
 147  0
             this.boilerpipeExtractor = boilerpipeExtractor;
 148  0
         }
 149  
 
 150  
     }
 151  
 }