Coverage Report - org.apache.any23.cli.Crawler
 
Classes in this File | Line Coverage | Branch Coverage | Complexity
Crawler              | 0% (0/59)     | 0% (0/26)       | 4.111
Crawler$1            | 0% (0/10)     | N/A             | 4.111
Crawler$2            | 0% (0/6)      | N/A             | 4.111
 
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 
package org.apache.any23.cli;

import edu.uci.ics.crawler4j.crawler.Page;
import org.apache.any23.plugin.crawler.CrawlerListener;
import org.apache.any23.plugin.crawler.SiteCrawler;
import org.apache.any23.source.StringDocumentSource;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;

import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.UUID;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
 
/**
 * Implementation of a <b>CLI crawler</b> based on {@link Rover}.
 *
 * @author Michele Mostarda (mostarda@fbk.eu)
 */
@ToolRunner.Description("Any23 Crawler Command Line Tool.")
public class Crawler extends Rover {

    /** Serializes extraction of crawled pages, which arrive from concurrent crawler threads. */
    private final Object roverLock = new Object();
 
    public static void main(String[] args) {
        try {
            System.exit( new Crawler().run(args) );
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
 
    @Override
    public int run(String[] args) {
        try {
            final String[] seeds = super.configure(args);
            if (seeds.length != 1) throw new IllegalArgumentException("Expected just one seed.");
            final URL seed = new URL(seeds[0]);

            final CommandLine commandLine = super.getCommandLine();

            final SiteCrawler siteCrawler = new SiteCrawler( getStorageFolder(commandLine) );

            // Falls back to the crawler's default page filter when none is given on the command line.
            final Pattern specifiedPageFilter = getPageFilter(commandLine);
            final Pattern pageFilter = specifiedPageFilter == null ? siteCrawler.defaultFilters : specifiedPageFilter;

            if (commandLine.hasOption("numcrawlers")) {
                siteCrawler.setNumOfCrawlers( parseInt(commandLine, "numcrawlers") );
            }
            if (commandLine.hasOption("maxpages")) {
                siteCrawler.setMaxPages( parseInt(commandLine, "maxpages") );
            }
            if (commandLine.hasOption("maxdepth")) {
                siteCrawler.setMaxDepth( parseInt(commandLine, "maxdepth") );
            }
            if (commandLine.hasOption("politenessdelay")) {
                final int politenessDelay = parseInt(commandLine, "politenessdelay");
                if (politenessDelay >= 0) siteCrawler.setPolitenessDelay(politenessDelay);
            }

            // Each visited page is handed to the Rover extraction pipeline;
            // roverLock serializes extractions across crawler threads.
            siteCrawler.addListener(new CrawlerListener() {
                @Override
                public void visitedPage(Page page) {
                    final String pageURL = page.getWebURL().getURL();
                    System.err.println( String.format("Processing page: [%s]", pageURL) );
                    try {
                        synchronized (roverLock) {
                            Crawler.super.performExtraction(
                                    new StringDocumentSource(
                                            page.getHTML(),
                                            pageURL
                                    )
                            );
                        }
                    } catch (Exception e) {
                        System.err.println(
                                String.format("Error while processing page [%s], error: %s .", pageURL, e.getMessage())
                        );
                    }
                }
            });

            // Prints the extraction reports when the JVM shuts down.
            Runtime.getRuntime().addShutdownHook( new Thread() {
                @Override
                public void run() {
                    try {
                        System.err.println( Crawler.super.printReports() );
                        // siteCrawler.stop(); // TODO: causes shutdown hanging.
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                }
            });
            siteCrawler.start(seed, pageFilter, true);
            return 0;
        } catch (Exception e) {
            if (super.isVerbose()) e.printStackTrace();
            if (e instanceof ExitCodeException) {
                return ((ExitCodeException) e).getExitCode();
            }
            return 1;
        }
    }
 
    @Override
    protected Options createOptions() {
        final Options roverOptions = super.createOptions();
        addCrawlerOptions(roverOptions);
        return roverOptions;
    }
 
    private void addCrawlerOptions(Options options) {
        options.addOption(
                new Option("pagefilter"     , true, "Regex used to filter out page URLs during crawling. Default: '" + SiteCrawler.DEFAULT_PAGE_FILTER_RE + "'")
        );
        options.addOption(
                new Option("storagefolder"  , true, "Folder used to store crawler temporary data. Default: [" + System.getProperty("java.io.tmpdir") + "]")
        );
        options.addOption(
                new Option("numcrawlers"    , true, "Sets the number of crawlers. Default: " + SiteCrawler.DEFAULT_NUM_OF_CRAWLERS)
        );
        options.addOption(
                new Option("maxpages"       , true, "Max number of pages before interrupting crawl. Default: no limit.")
        );
        options.addOption(
                new Option("maxdepth"       , true, "Max allowed crawler depth. Default: no limit.")
        );
        options.addOption(
                new Option("politenessdelay", true, "Politeness delay in milliseconds. Default: no limit.")
        );
    }
 
    private Pattern getPageFilter(CommandLine commandLine) {
        if (commandLine.hasOption("pagefilter")) {
            try {
                return Pattern.compile( commandLine.getOptionValue("pagefilter") );
            } catch (PatternSyntaxException pse) {
                throw new ExitCodeException("Invalid page filter, must be a regular expression.", 6);
            }
        }
        return null;
    }
 
    private File getStorageFolder(CommandLine commandLine) throws IOException {
        if (commandLine.hasOption("storagefolder")) {
            final File candidate = new File( commandLine.getOptionValue("storagefolder") );
            if (candidate.exists() && candidate.isFile())
                throw new IllegalArgumentException("The storage folder must be a directory.");
            return candidate;
        } else {
            // Reserves a unique path under the system temp directory: the file is
            // created only to claim the name, then deleted so the crawler can use
            // the location for its own storage.
            final File tmpDir = File.createTempFile("crawler-metadata-" + UUID.randomUUID().toString(), "db");
            tmpDir.delete();
            return tmpDir;
        }
    }
 
    private int parseInt(CommandLine cl, String option) {
        final String value = cl.getOptionValue(option);
        try {
            return Integer.parseInt(value);
        } catch (NumberFormatException nfe) {
            throw new IllegalArgumentException(String.format("Expected integer for %s found '%s' .", option, value));
        }
    }

}
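
For illustration, below is a minimal sketch of how this tool might be driven programmatically, mirroring what main(String[]) does. All values in it (seed URL, option values, storage path, class name CrawlerUsageSketch) are assumptions for the example, not part of the measured source, and Rover's configure(String[]) may require additional output options not shown here.

// Hypothetical usage sketch, not part of the covered source: invokes run()
// with the options registered in addCrawlerOptions(). Every value below is
// an illustrative assumption.
public class CrawlerUsageSketch {
    public static void main(String[] args) {
        final int exitCode = new org.apache.any23.cli.Crawler().run(new String[] {
                "-numcrawlers", "4",                    // concurrent crawler threads
                "-maxpages", "100",                     // interrupt the crawl after 100 pages
                "-maxdepth", "3",                       // follow links at most 3 levels deep
                "-politenessdelay", "500",              // wait 500 ms between requests
                "-storagefolder", "/tmp/crawler-data",  // must be a directory, not a file
                "http://example.org/"                   // run() expects exactly one seed URL
        });
        System.exit(exitCode);
    }
}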