| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * https://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| * |
| */ |
| package org.apache.ivy.util.url; |
| |
| import java.io.BufferedReader; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.InputStreamReader; |
| import java.net.URI; |
| import java.net.URISyntaxException; |
| import java.net.URL; |
| import java.util.ArrayList; |
| import java.util.List; |
| import java.util.regex.Matcher; |
| import java.util.regex.Pattern; |
| |
| import org.apache.ivy.util.FileUtil; |
| import org.apache.ivy.util.Message; |
| import org.apache.ivy.util.url.URLHandler.URLInfo; |
| |
| /** |
| * Utility class which helps to list urls under a given url. This has been tested with Apache 1.3.33 |
| * server listing, as the one used at ibiblio, and with Apache 2.0.53 server listing, as the one on |
| * mirrors.sunsite.dk. |
| */ |
| public class ApacheURLLister { |
| // ~ Static variables/initializers ------------------------------------------ |
| |
| private static final Pattern PATTERN = Pattern.compile( |
| "<a[^>]*href=\"([^\"]*)\"[^>]*>(?:<[^>]+>)*?([^<>]+?)(?:<[^>]+>)*?</a>", |
| Pattern.CASE_INSENSITIVE); |
| |
| // ~ Methods ---------------------------------------------------------------- |
| |
| /** |
| * Returns a list of sub urls of the given url. The returned list is a list of URL. |
| * |
| * @param url |
| * The base URL from which to retrieve the listing. |
| * @return a list of sub urls of the given url. |
| * @throws IOException |
| * If an error occurs retrieving the HTML. |
| */ |
| public List<URL> listAll(URL url) throws IOException { |
| return retrieveListing(url, true, true); |
| } |
| |
| /** |
| * Returns a list of sub 'directories' of the given url. The returned list is a list of URL. |
| * |
| * @param url |
| * The base URL from which to retrieve the listing. |
| * @return a list of sub 'directories' of the given url. |
| * @throws IOException |
| * If an error occurs retrieving the HTML. |
| */ |
| public List<URL> listDirectories(URL url) throws IOException { |
| return retrieveListing(url, false, true); |
| } |
| |
| /** |
| * Returns a list of sub 'files' (in opposition to directories) of the given url. The returned |
| * list is a list of URL. |
| * |
| * @param url |
| * The base URL from which to retrieve the listing. |
| * @return a list of sub 'files' of the given url. |
| * @throws IOException |
| * If an error occurs retrieving the HTML. |
| */ |
| public List<URL> listFiles(URL url) throws IOException { |
| return retrieveListing(url, true, false); |
| } |
| |
| /** |
| * Retrieves a {@link List} of {@link URL}s corresponding to the files and/or directories found |
| * at the supplied base URL. |
| * |
| * @param url |
| * The base URL from which to retrieve the listing. |
| * @param includeFiles |
| * If true include files in the returned list. |
| * @param includeDirectories |
| * If true include directories in the returned list. |
| * @return A {@link List} of {@link URL}s. |
| * @throws IOException |
| * If an error occurs retrieving the HTML. |
| */ |
| @SuppressWarnings("deprecation") |
| public List<URL> retrieveListing(URL url, boolean includeFiles, boolean includeDirectories) |
| throws IOException { |
| List<URL> urlList = new ArrayList<>(); |
| |
| // add trailing slash for relative urls |
| if (!url.getPath().endsWith("/") && !url.getPath().endsWith(".html")) { |
| url = new URL(url.getProtocol(), url.getHost(), url.getPort(), url.getPath() + "/"); |
| } |
| |
| URLHandler urlHandler = URLHandlerRegistry.getDefault(); |
| URLInfo urlInfo = urlHandler.getURLInfo(url); |
| if (urlInfo == URLHandler.UNAVAILABLE) { |
| return urlList; // not found => return empty list |
| } |
| // here, urlInfo is valid |
| String charset = urlInfo.getBodyCharset(); |
| |
| InputStream contentStream = urlHandler.openStream(url); |
| BufferedReader r = null; |
| if (charset == null) { |
| r = new BufferedReader(new InputStreamReader(contentStream)); |
| } else { |
| r = new BufferedReader(new InputStreamReader(contentStream, charset)); |
| } |
| |
| String htmlText = FileUtil.readEntirely(r); |
| |
| Matcher matcher = PATTERN.matcher(htmlText); |
| |
| while (matcher.find()) { |
| // get the href text and the displayed text |
| String href = matcher.group(1); |
| String text = matcher.group(2); |
| |
| if (href == null || text == null) { |
| // the groups were not found (shouldn't happen, really) |
| continue; |
| } |
| |
| text = text.trim(); |
| |
| try { |
| // URI methods decode the URL |
| URI uri = new URI(href); |
| href = uri.getPath(); |
| // handle complete URL listings |
| if (uri.getScheme() != null) { |
| if (!href.startsWith(url.getPath())) { |
| // ignore URLs which aren't children of the base URL |
| continue; |
| } |
| href = href.substring(url.getPath().length()); |
| } |
| } catch (URISyntaxException e) { |
| // incorrect URL, ignore |
| continue; |
| } |
| |
| if (href.startsWith("../")) { |
| // we are only interested in sub-URLs, not parent URLs, so skip this one |
| continue; |
| } |
| |
| // absolute href: convert to relative one |
| if (href.startsWith("/")) { |
| int slashIndex = href.substring(0, href.length() - 1).lastIndexOf('/'); |
| href = href.substring(slashIndex + 1); |
| } |
| |
| // relative to current href: convert to simple relative one |
| if (href.startsWith("./")) { |
| href = href.substring("./".length()); |
| } |
| |
| // exclude those where they do not match |
| // href will never be truncated, text may be truncated by apache |
| if (text.endsWith("..>")) { |
| // text is probably truncated, we can only check if the href starts with text |
| if (!href.startsWith(text.substring(0, text.length() - 3))) { |
| continue; |
| } |
| } else if (text.endsWith("..>")) { |
| // text is probably truncated, we can only check if the href starts with text |
| if (!href.startsWith(text.substring(0, text.length() - 6))) { |
| continue; |
| } |
| } else { |
| // text is not truncated, so it must match the url after stripping optional |
| // trailing slashes |
| String strippedHref = href.endsWith("/") ? href.substring(0, href.length() - 1) |
| : href; |
| String strippedText = text.endsWith("/") ? text.substring(0, text.length() - 1) |
| : text; |
| if (!strippedHref.equalsIgnoreCase(strippedText)) { |
| continue; |
| } |
| } |
| |
| boolean directory = href.endsWith("/"); |
| |
| if ((directory && includeDirectories) || (!directory && includeFiles)) { |
| URL child = new URL(url, href); |
| urlList.add(child); |
| Message.debug("ApacheURLLister found URL=[" + child + "]."); |
| } |
| } |
| |
| return urlList; |
| } |
| } |