wagon-providers/wagon-http-shared/src/main/java/org/apache/maven/wagon/shared/http/HtmlFileListParser.java - maven-wagon - Git at Google

 package org.apache.maven.wagon.shared.http;

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 import org.apache.commons.io.IOUtils;
 import org.apache.maven.wagon.TransferFailedException;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.select.Elements;

 import java.io.IOException;
 import java.io.InputStream;
 import java.io.UnsupportedEncodingException;
 import java.net.URI;
 import java.net.URISyntaxException;
 import java.net.URLDecoder;
 import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
 import java.util.regex.Pattern;

 /**
  * Html File List Parser.
  */
 public class HtmlFileListParser
 {
     // Apache Fancy Index Sort Headers
     private static final Pattern APACHE_INDEX_SKIP = Pattern.compile( "\\?[CDMNS]=.*" );

     // URLs with excessive paths.
     private static final Pattern URLS_WITH_PATHS = Pattern.compile( "/[^/]*/" );

     // URLs that point to a parent directory.
     private static final Pattern URLS_TO_PARENT = Pattern.compile( "\\.\\./" );

     // mailto urls
     private static final Pattern MAILTO_URLS = Pattern.compile( "mailto:.*" );

     // A cleaned link is rejected as soon as any one of these patterns is found anywhere in it.
     private static final Pattern[] SKIPS =
         new Pattern[]{ APACHE_INDEX_SKIP, URLS_WITH_PATHS, URLS_TO_PARENT, MAILTO_URLS };

     /**
      * Reads raw HTML from the provided InputStream, parses it, and returns the file list.
      * <p>
      * Every <code>a[href]</code> element is cleaned (made relative to the base URL and decoded) and kept
      * only if it matches none of the skip patterns. Duplicates are removed; the order of the returned
      * list is unspecified because the names are collected in a {@link HashSet}.
      *
      * @param baseurl the URL the listing was fetched from; links are made relative to it.
      * @param stream the input stream, read as UTF-8. It is not closed by this method.
      * @return the file list.
      * @throws TransferFailedException if the base URL is not a valid URI or the stream cannot be read.
      */
     public static List<String> parseFileList( String baseurl, InputStream stream )
         throws TransferFailedException
     {
         try
         {
             URI baseURI = new URI( baseurl );
             // to make debugging easier, start with a string. This is assuming UTF-8, which might not be a safe
             // assumption.
             String content = IOUtils.toString( stream, "utf-8" );
             Document doc = Jsoup.parse( content, baseurl );
             Elements links = doc.select( "a[href]" );
             Set<String> results = new HashSet<String>();
             for ( Element link : links )
             {
                 /*
                  * The abs:href loses directories, so we deal with absolute paths ourselves below in cleanLink
                  */
                 // cleanLink maps a null or empty href to "", which isAcceptableLink rejects.
                 String clean = cleanLink( baseURI, link.attr( "href" ) );
                 if ( isAcceptableLink( clean ) )
                 {
                     results.add( clean );
                 }
             }

             return new ArrayList<String>( results );
         }
         catch ( URISyntaxException e )
         {
             throw new TransferFailedException( "Unable to parse as base URI: " + baseurl, e );
         }
         catch ( IOException e )
         {
             throw new TransferFailedException( "I/O error reading HTML listing of artifacts: " + e.getMessage(), e );
         }
     }

     /**
      * Turns a raw href into a decoded name relative to the base URI.
      * <p>
      * This is best effort: when the link cannot be parsed as a URI it is returned unchanged so that
      * the skip patterns still get a chance to judge it.
      *
      * @param baseURI the URI of the listing page.
      * @param link the raw href value, may be null.
      * @return the cleaned link, or an empty string for a null or empty link.
      */
     private static String cleanLink( URI baseURI, String link )
     {
         if ( link == null || link.length() == 0 )
         {
             return "";
         }

         String ret = link;

         try
         {
             URI linkuri = new URI( ret );
             if ( link.startsWith( "/" ) )
             {
                 linkuri = baseURI.resolve( linkuri );
             }
             URI relativeURI = baseURI.relativize( linkuri ).normalize();
             ret = relativeURI.toASCIIString();

             // getPath() is null for an opaque base URI; guard so that case does not end in an NPE.
             String basePath = baseURI.getPath();
             if ( basePath != null && ret.startsWith( basePath ) )
             {
                 ret = ret.substring( basePath.length() );
             }

             // URLDecoder implements form decoding and would turn a literal '+' into a space.
             // Escape it first so names such as "libstdc++.jar" survive unchanged.
             ret = URLDecoder.decode( ret.replace( "+", "%2B" ), "UTF-8" );
         }
         catch ( URISyntaxException e )
         {
             // deliberately ignored: fall back to the raw link
         }
         catch ( UnsupportedEncodingException e )
         {
             // deliberately ignored: fall back to the value computed so far
         }

         return ret;
     }

     /**
      * Tells whether a cleaned link should be part of the file list.
      *
      * @param link the cleaned link, may be null.
      * @return false for a null or empty link or one in which any skip pattern is found, true otherwise.
      */
     private static boolean isAcceptableLink( String link )
     {
         if ( link == null || link.length() == 0 )
         {
             return false;
         }

         for ( Pattern pattern : SKIPS )
         {
             if ( pattern.matcher( link ).find() )
             {
                 return false;
             }
         }

         return true;
     }

 }
	package org.apache.maven.wagon.shared.http;

	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	import org.apache.commons.io.IOUtils;
	import org.apache.maven.wagon.TransferFailedException;
	import org.jsoup.Jsoup;
	import org.jsoup.nodes.Document;
	import org.jsoup.nodes.Element;
	import org.jsoup.select.Elements;

	import java.io.IOException;
	import java.io.InputStream;
	import java.io.UnsupportedEncodingException;
	import java.net.URI;
	import java.net.URISyntaxException;
	import java.net.URLDecoder;
	import java.util.ArrayList;
	import java.util.HashSet;
	import java.util.List;
	import java.util.Set;
	import java.util.regex.Pattern;

	/**
	* Html File List Parser.
	*/
	public class HtmlFileListParser
	{
	// Apache Fancy Index Sort Headers
	private static final Pattern APACHE_INDEX_SKIP = Pattern.compile( "\\?[CDMNS]=.*" );

	// URLs with excessive paths.
	private static final Pattern URLS_WITH_PATHS = Pattern.compile( "/[^/]*/" );

	// URLs that to a parent directory.
	private static final Pattern URLS_TO_PARENT = Pattern.compile( "\\.\\./" );

	// mailto urls
	private static final Pattern MAILTO_URLS = Pattern.compile( "mailto:.*" );

	private static final Pattern[] SKIPS =
	new Pattern[]{ APACHE_INDEX_SKIP, URLS_WITH_PATHS, URLS_TO_PARENT, MAILTO_URLS };

	/**
	* Fetches a raw HTML from a provided InputStream, parses it, and returns the file list.
	*
	* @param stream the input stream.
	* @return the file list.
	* @throws TransferFailedException if there was a problem fetching the raw html.
	*/
	public static List<String> parseFileList( String baseurl, InputStream stream )
	throws TransferFailedException
	{
	try
	{
	URI baseURI = new URI( baseurl );
	// to make debugging easier, start with a string. This is assuming UTF-8, which might not be a safe
	// assumption.
	String content = IOUtils.toString( stream, "utf-8" );
	Document doc = Jsoup.parse( content, baseurl );
	Elements links = doc.select( "a[href]" );
	Set<String> results = new HashSet<String>();
	for ( Element link : links )
	{
	/*
	* The abs:href loses directories, so we deal with absolute paths ourselves below in cleanLink
	*/
	String target = link.attr( "href" );
	if ( target != null )
	{
	String clean = cleanLink( baseURI, target );
	if ( isAcceptableLink( clean ) )
	{
	results.add( clean );
	}
	}

	}

	return new ArrayList<String>( results );
	}
	catch ( URISyntaxException e )
	{
	throw new TransferFailedException( "Unable to parse as base URI: " + baseurl, e );
	}
	catch ( IOException e )
	{
	throw new TransferFailedException( "I/O error reading HTML listing of artifacts: " + e.getMessage(), e );
	}
	}

	private static String cleanLink( URI baseURI, String link )
	{
	if ( link == null \|\| link.length() == 0 )
	{
	return "";
	}

	String ret = link;

	try
	{
	URI linkuri = new URI( ret );
	if ( link.startsWith( "/" ) )
	{
	linkuri = baseURI.resolve( linkuri );
	}
	URI relativeURI = baseURI.relativize( linkuri ).normalize();
	ret = relativeURI.toASCIIString();
	if ( ret.startsWith( baseURI.getPath() ) )
	{
	ret = ret.substring( baseURI.getPath().length() );
	}

	ret = URLDecoder.decode( ret, "UTF-8" );
	}
	catch ( URISyntaxException e )
	{
	// ignore
	}
	catch ( UnsupportedEncodingException e )
	{
	// ignore
	}

	return ret;
	}

	private static boolean isAcceptableLink( String link )
	{
	if ( link == null \|\| link.length() == 0 )
	{
	return false;
	}

	for ( Pattern pattern : SKIPS )
	{
	if ( pattern.matcher( link ).find() )
	{
	return false;
	}
	}

	return true;
	}

	}