Coverage Report - org.apache.maven.wagon.shared.http.HtmlFileListParser
 
Classes in this File Line Coverage Branch Coverage Complexity
HtmlFileListParser
81 %
36/44
88 %
16/18
7,333
 
 1  
 package org.apache.maven.wagon.shared.http;
 2  
 
 3  
 /*
 4  
  * Licensed to the Apache Software Foundation (ASF) under one
 5  
  * or more contributor license agreements.  See the NOTICE file
 6  
  * distributed with this work for additional information
 7  
  * regarding copyright ownership.  The ASF licenses this file
 8  
  * to you under the Apache License, Version 2.0 (the
 9  
  * "License"); you may not use this file except in compliance
 10  
  * with the License.  You may obtain a copy of the License at
 11  
  *
 12  
  *   http://www.apache.org/licenses/LICENSE-2.0
 13  
  *
 14  
  * Unless required by applicable law or agreed to in writing,
 15  
  * software distributed under the License is distributed on an
 16  
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 17  
  * KIND, either express or implied.  See the License for the
 18  
  * specific language governing permissions and limitations
 19  
  * under the License.
 20  
  */
 21  
 
 22  
 import org.apache.commons.io.IOUtils;
 23  
 import org.apache.maven.wagon.TransferFailedException;
 24  
 import org.codehaus.plexus.util.StringUtils;
 25  
 import org.jsoup.Jsoup;
 26  
 import org.jsoup.nodes.Document;
 27  
 import org.jsoup.nodes.Element;
 28  
 import org.jsoup.select.Elements;
 29  
 
 30  
 import java.io.IOException;
 31  
 import java.io.InputStream;
 32  
 import java.io.UnsupportedEncodingException;
 33  
 import java.net.URI;
 34  
 import java.net.URISyntaxException;
 35  
 import java.net.URLDecoder;
 36  
 import java.util.ArrayList;
 37  
 import java.util.HashSet;
 38  
 import java.util.List;
 39  
 import java.util.Set;
 40  
 import java.util.regex.Pattern;
 41  
 
 42  
 /**
 43  
  * Parses an HTML directory listing and extracts the file names that its links point to.
 44  
  */
 45  0
 public class HtmlFileListParser
 46  
 {
 47  
     // Apache fancy-index sort header links: a '?' followed by one of C, D, M, N or S and '=' (e.g. "?C=N;O=D").
 48  1
     private static final Pattern APACHE_INDEX_SKIP = Pattern.compile( "\\?[CDMNS]=.*" );
 49  
 
 50  
     // URLs with excessive paths: any link containing a '/', zero or more non-'/' characters, and another '/'.
 51  1
     private static final Pattern URLS_WITH_PATHS = Pattern.compile( "/[^/]*/" );
 52  
 
 53  
     // URLs that point to a parent directory (contain "../").
 54  1
     private static final Pattern URLS_TO_PARENT = Pattern.compile( "\\.\\./" );
 55  
 
 56  
     // mailto: links
 57  1
     private static final Pattern MAILTO_URLS = Pattern.compile( "mailto:.*" );
 58  
 
     // Every pattern that disqualifies a link; a link is rejected as soon as one of them is found in it.
 59  1
     private static final Pattern[] SKIPS =
 60  
         new Pattern[]{ APACHE_INDEX_SKIP, URLS_WITH_PATHS, URLS_TO_PARENT, MAILTO_URLS };
 61  
 
 62  
     /**
 63  
      * Reads raw HTML from the provided InputStream, parses it, and returns the file list.
      * <p>
      * The stream is read fully as UTF-8. The <code>href</code> of every <code>a</code> element is passed
      * through <code>cleanLink</code>, and only links accepted by <code>isAcceptableLink</code> are kept.
      * Results are collected in a HashSet, so duplicates are removed and the order of the returned list is
      * unspecified. This method does not itself call close() on the stream.
 64  
      *
 65  
      * @param baseurl the URL of the listing; used as the Jsoup base URI and as the base for relativizing links.
      * @param stream the input stream.
 66  
      * @return the file list.
 67  
      * @throws TransferFailedException if baseurl is not a valid URI, or if there was a problem reading the raw html.
 68  
      */
 69  
     public static List<String> parseFileList( String baseurl, InputStream stream )
 70  
         throws TransferFailedException
 71  
     {
 72  
         try
 73  
         {
 74  10
             URI baseURI = new URI( baseurl );
 75  
             // to make debugging easier, start with a string. This is assuming UTF-8, which might not be a safe
 76  
             // assumption.
 77  10
             String content = IOUtils.toString( stream, "utf-8" );
 78  10
             Document doc = Jsoup.parse( content, baseurl );
 79  10
             Elements links = doc.getElementsByTag( "a" );
             // A set, so that a link appearing several times in the page is reported once.
 80  10
             Set<String> results = new HashSet<String>();
 81  524
             for ( int lx = 0; lx < links.size(); lx++ )
 82  
             {
 83  514
                 Element link = links.get( lx );
 84  
                 /*
 85  
                  * The abs:href loses directories, so we deal with absolute paths ourselves below in cleanLink
 86  
                  */
 87  514
                 String target = link.attr( "href" );
                 // NOTE(review): Jsoup's attr() presumably returns "" rather than null for a missing attribute,
                 // in which case this check is always true and empty links are filtered later - confirm.
 88  514
                 if ( target != null )
 89  
                 {
 90  514
                     String clean = cleanLink( baseURI, target );
 91  514
                     if ( isAcceptableLink( clean ) )
 92  
                     {
 93  462
                         results.add( clean );
 94  
                     }
 95  
                 }
 96  
 
 97  
             }
 98  
 
             // Copy into a List for the caller; the iteration order of the HashSet carries over.
 99  10
             return new ArrayList<String>( results );
 100  
         }
 101  0
         catch ( URISyntaxException e )
 102  
         {
 103  0
             throw new TransferFailedException( "Unable to parse as base URI: " + baseurl, e );
 104  
         }
 105  0
         catch ( IOException e )
 106  
         {
 107  0
             throw new TransferFailedException( "I/O error reading HTML listing of artifacts: " + e.getMessage(), e );
 108  
         }
 109  
     }
 110  
 
 111  
     /**
      * Converts a raw <code>href</code> value into a URL-decoded path relative to the base URI.
      * <p>
      * An empty or null link yields an empty string. Otherwise the link is parsed as a URI, resolved against
      * the base when it starts with "/", relativized against the base, normalized, and decoded as UTF-8.
      * If the link is not a valid URI it is returned unchanged.
      *
      * @param baseURI the URI of the listing the link was found in.
      * @param link the raw href value.
      * @return the cleaned link, the empty string for an empty link, or the original link if it cannot be parsed.
      */
     private static String cleanLink( URI baseURI, String link )
 112  
     {
 113  514
         if ( StringUtils.isEmpty( link ) )
 114  
         {
 115  17
             return "";
 116  
         }
 117  
 
         // Starts out as the raw link, so that is what is returned if parsing below fails.
 118  497
         String ret = link;
 119  
 
 120  
         try
 121  
         {
 122  497
             URI linkuri = new URI( ret );
             // Link starts with '/': resolve it against the base URI before relativizing.
 123  497
             if ( link.startsWith( "/" ) )
 124  
             {
 125  118
                 linkuri = baseURI.resolve( linkuri );
 126  
             }
 127  497
             URI relativeURI = baseURI.relativize( linkuri ).normalize();
 128  497
             ret = relativeURI.toASCIIString();
             // Strip the base path if the result still starts with it.
             // NOTE(review): URI.getPath() is documented to return null for opaque URIs, which would make
             // startsWith() throw a NullPointerException - confirm baseURI is always hierarchical.
 129  497
             if ( ret.startsWith( baseURI.getPath() ) )
 130  
             {
 131  0
                 ret = ret.substring( baseURI.getPath().length() );
 132  
             }
 133  
 
             // NOTE(review): URLDecoder performs form decoding, which also turns '+' into a space - confirm
             // that this is intended for file names in a path.
 134  497
             ret = URLDecoder.decode( ret, "UTF-8" );
 135  
         }
 136  0
         catch ( URISyntaxException e )
 137  
         {
             // Deliberately ignored: only new URI( ret ) can throw this, so ret is still the raw link here.
 138  
         }
 139  0
         catch ( UnsupportedEncodingException e )
 140  
         {
             // Ignored: ret keeps the relativized but undecoded form.
 141  497
         }
 142  
 
 143  497
         return ret;
 144  
     }
 145  
 
 146  
     /**
      * Decides whether a cleaned link should be included in the file list.
      *
      * @param link the cleaned link.
      * @return false if the link is empty or null, or if any pattern in SKIPS is found in it; true otherwise.
      */
     private static boolean isAcceptableLink( String link )
 147  
     {
 148  514
         if ( StringUtils.isEmpty( link ) )
 149  
         {
 150  17
             return false;
 151  
         }
 152  
 
 153  2360
         for ( int i = 0; i < SKIPS.length; i++ )
 154  
         {
             // find() rather than matches(): a pattern occurring anywhere in the link rejects it.
 155  1898
             if ( SKIPS[i].matcher( link ).find() )
 156  
             {
 157  35
                 return false;
 158  
             }
 159  
         }
 160  
 
 161  462
         return true;
 162  
     }
 163  
 
 164  
 }