| package org.apache.maven.wagon.shared.http; |
| |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| import org.apache.commons.io.IOUtils; |
| import org.apache.maven.wagon.TransferFailedException; |
| import org.jsoup.Jsoup; |
| import org.jsoup.nodes.Document; |
| import org.jsoup.nodes.Element; |
| import org.jsoup.select.Elements; |
| |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.UnsupportedEncodingException; |
| import java.net.URI; |
| import java.net.URISyntaxException; |
| import java.net.URLDecoder; |
| import java.util.ArrayList; |
| import java.util.HashSet; |
| import java.util.List; |
| import java.util.Set; |
| import java.util.regex.Pattern; |
| |
| /** |
| * Html File List Parser. |
| */ |
| public class HtmlFileListParser |
| { |
| // Apache Fancy Index Sort Headers |
| private static final Pattern APACHE_INDEX_SKIP = Pattern.compile( "\\?[CDMNS]=.*" ); |
| |
| // URLs with excessive paths. |
| private static final Pattern URLS_WITH_PATHS = Pattern.compile( "/[^/]*/" ); |
| |
| // URLs that to a parent directory. |
| private static final Pattern URLS_TO_PARENT = Pattern.compile( "\\.\\./" ); |
| |
| // mailto urls |
| private static final Pattern MAILTO_URLS = Pattern.compile( "mailto:.*" ); |
| |
| private static final Pattern[] SKIPS = |
| new Pattern[]{ APACHE_INDEX_SKIP, URLS_WITH_PATHS, URLS_TO_PARENT, MAILTO_URLS }; |
| |
| /** |
| * Fetches a raw HTML from a provided InputStream, parses it, and returns the file list. |
| * |
| * @param stream the input stream. |
| * @return the file list. |
| * @throws TransferFailedException if there was a problem fetching the raw html. |
| */ |
| public static List<String> parseFileList( String baseurl, InputStream stream ) |
| throws TransferFailedException |
| { |
| try |
| { |
| URI baseURI = new URI( baseurl ); |
| // to make debugging easier, start with a string. This is assuming UTF-8, which might not be a safe |
| // assumption. |
| String content = IOUtils.toString( stream, "utf-8" ); |
| Document doc = Jsoup.parse( content, baseurl ); |
| Elements links = doc.select( "a[href]" ); |
| Set<String> results = new HashSet<String>(); |
| for ( Element link : links ) |
| { |
| /* |
| * The abs:href loses directories, so we deal with absolute paths ourselves below in cleanLink |
| */ |
| String target = link.attr( "href" ); |
| if ( target != null ) |
| { |
| String clean = cleanLink( baseURI, target ); |
| if ( isAcceptableLink( clean ) ) |
| { |
| results.add( clean ); |
| } |
| } |
| |
| } |
| |
| return new ArrayList<String>( results ); |
| } |
| catch ( URISyntaxException e ) |
| { |
| throw new TransferFailedException( "Unable to parse as base URI: " + baseurl, e ); |
| } |
| catch ( IOException e ) |
| { |
| throw new TransferFailedException( "I/O error reading HTML listing of artifacts: " + e.getMessage(), e ); |
| } |
| } |
| |
| private static String cleanLink( URI baseURI, String link ) |
| { |
| if ( link == null || link.length() == 0 ) |
| { |
| return ""; |
| } |
| |
| String ret = link; |
| |
| try |
| { |
| URI linkuri = new URI( ret ); |
| if ( link.startsWith( "/" ) ) |
| { |
| linkuri = baseURI.resolve( linkuri ); |
| } |
| URI relativeURI = baseURI.relativize( linkuri ).normalize(); |
| ret = relativeURI.toASCIIString(); |
| if ( ret.startsWith( baseURI.getPath() ) ) |
| { |
| ret = ret.substring( baseURI.getPath().length() ); |
| } |
| |
| ret = URLDecoder.decode( ret, "UTF-8" ); |
| } |
| catch ( URISyntaxException e ) |
| { |
| // ignore |
| } |
| catch ( UnsupportedEncodingException e ) |
| { |
| // ignore |
| } |
| |
| return ret; |
| } |
| |
| private static boolean isAcceptableLink( String link ) |
| { |
| if ( link == null || link.length() == 0 ) |
| { |
| return false; |
| } |
| |
| for ( Pattern pattern : SKIPS ) |
| { |
| if ( pattern.matcher( link ).find() ) |
| { |
| return false; |
| } |
| } |
| |
| return true; |
| } |
| |
| } |