blob: 040a82f39343f1793c9a23c4562e5b771eda65b0 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package org.apache.ivy.util.url;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.ivy.util.FileUtil;
import org.apache.ivy.util.Message;
import org.apache.ivy.util.url.URLHandler.URLInfo;
/**
* Utility class which helps to list urls under a given url. This has been tested with Apache 1.3.33
* server listing, as the one used at ibiblio, and with Apache 2.0.53 server listing, as the one on
* mirrors.sunsite.dk.
*/
public class ApacheURLLister {
    // ~ Static variables/initializers ------------------------------------------

    /**
     * Matches an HTML anchor element. Group 1 captures the raw value of the {@code href}
     * attribute, group 2 captures the displayed text of the link (tags nested around the text are
     * skipped by the non-capturing groups).
     */
    private static final Pattern PATTERN = Pattern.compile(
        "<a[^>]*href=\"([^\"]*)\"[^>]*>(?:<[^>]+>)*?([^<>]+?)(?:<[^>]+>)*?</a>",
        Pattern.CASE_INSENSITIVE);

    // ~ Methods ----------------------------------------------------------------

    /**
     * Returns a list of sub urls of the given url. The returned list is a list of URL.
     *
     * @param url
     *            The base URL from which to retrieve the listing.
     * @return a list of sub urls (files and directories) of the given url.
     * @throws IOException
     *             If an error occurs retrieving the HTML.
     */
    public List<URL> listAll(URL url) throws IOException {
        return retrieveListing(url, true, true);
    }

    /**
     * Returns a list of sub 'directories' of the given url. The returned list is a list of URL.
     *
     * @param url
     *            The base URL from which to retrieve the listing.
     * @return a list of sub 'directories' of the given url.
     * @throws IOException
     *             If an error occurs retrieving the HTML.
     */
    public List<URL> listDirectories(URL url) throws IOException {
        return retrieveListing(url, false, true);
    }

    /**
     * Returns a list of sub 'files' (in opposition to directories) of the given url. The returned
     * list is a list of URL.
     *
     * @param url
     *            The base URL from which to retrieve the listing.
     * @return a list of sub 'files' of the given url.
     * @throws IOException
     *             If an error occurs retrieving the HTML.
     */
    public List<URL> listFiles(URL url) throws IOException {
        return retrieveListing(url, true, false);
    }

    /**
     * Retrieves a {@link List} of {@link URL}s corresponding to the files and/or directories found
     * at the supplied base URL.
     * <p>
     * The HTML served at the base URL is scanned for anchor elements. An anchor is kept only if
     * its href designates a child of the base URL and its displayed text matches that href (or is
     * a truncated prefix of it). An href ending with a slash is considered a directory, any other
     * href a file. Anchors whose href cannot be parsed as a URI, or whose href has no path (for
     * instance <code>mailto:</code> links), are ignored.
     * </p>
     *
     * @param url
     *            The base URL from which to retrieve the listing.
     * @param includeFiles
     *            If true include files in the returned list.
     * @param includeDirectories
     *            If true include directories in the returned list.
     * @return A {@link List} of {@link URL}s, empty if the URL handler reports the base URL as
     *         unavailable.
     * @throws IOException
     *             If an error occurs retrieving the HTML.
     */
    @SuppressWarnings("deprecation")
    public List<URL> retrieveListing(URL url, boolean includeFiles, boolean includeDirectories)
            throws IOException {
        List<URL> urlList = new ArrayList<>();

        // add trailing slash for relative urls
        if (!url.getPath().endsWith("/") && !url.getPath().endsWith(".html")) {
            url = new URL(url.getProtocol(), url.getHost(), url.getPort(), url.getPath() + "/");
        }

        URLHandler urlHandler = URLHandlerRegistry.getDefault();
        URLInfo urlInfo = urlHandler.getURLInfo(url);
        if (urlInfo == URLHandler.UNAVAILABLE) {
            return urlList; // not found => return empty list
        }
        // here, urlInfo is valid
        String charset = urlInfo.getBodyCharset();

        String htmlText;
        // try-with-resources guarantees the stream is released even when the reader cannot be
        // created (e.g. unsupported charset name) or reading fails half-way
        try (InputStream contentStream = urlHandler.openStream(url)) {
            BufferedReader r;
            if (charset == null) {
                // no charset advertised: fall back to the platform default, as before
                r = new BufferedReader(new InputStreamReader(contentStream));
            } else {
                r = new BufferedReader(new InputStreamReader(contentStream, charset));
            }
            htmlText = FileUtil.readEntirely(r);
        }

        String basePath = url.getPath();
        Matcher matcher = PATTERN.matcher(htmlText);
        while (matcher.find()) {
            // get the href text and the displayed text
            String rawHref = matcher.group(1);
            String text = matcher.group(2);
            if (rawHref == null || text == null) {
                // the groups were not found (shouldn't happen, really)
                continue;
            }

            String href = toChildHref(rawHref, basePath);
            if (href == null || !matchesDisplayedText(href, text.trim())) {
                continue;
            }

            boolean directory = href.endsWith("/");
            if ((directory && includeDirectories) || (!directory && includeFiles)) {
                URL child = new URL(url, href);
                urlList.add(child);
                Message.debug("ApacheURLLister found URL=[" + child + "].");
            }
        }
        return urlList;
    }

    /**
     * Converts the raw href of an anchor into a simple href relative to the listed directory.
     *
     * @param rawHref
     *            the href attribute value as found in the HTML
     * @param basePath
     *            the path of the base URL being listed
     * @return the decoded relative href, or <code>null</code> if the anchor must be ignored
     *         (invalid URI, URI without a path, URL outside of the base URL, or parent link)
     */
    private static String toChildHref(String rawHref, String basePath) {
        String href;
        try {
            // URI methods decode the URL
            URI uri = new URI(rawHref);
            href = uri.getPath();
            if (href == null) {
                // opaque URI (mailto:, javascript:, ...) has no path and can't be a child entry;
                // without this check the calls below would throw a NullPointerException
                return null;
            }
            // handle complete URL listings
            if (uri.getScheme() != null) {
                if (!href.startsWith(basePath)) {
                    // ignore URLs which aren't children of the base URL
                    return null;
                }
                href = href.substring(basePath.length());
            }
        } catch (URISyntaxException e) {
            // incorrect URL, ignore
            return null;
        }

        if (href.startsWith("../")) {
            // we are only interested in sub-URLs, not parent URLs, so skip this one
            return null;
        }

        // absolute href: convert to relative one by keeping only the last path segment
        // (the optional trailing slash is left out of the search so that it is preserved)
        if (href.startsWith("/")) {
            int slashIndex = href.substring(0, href.length() - 1).lastIndexOf('/');
            href = href.substring(slashIndex + 1);
        }

        // relative to current href: convert to simple relative one
        if (href.startsWith("./")) {
            href = href.substring("./".length());
        }
        return href;
    }

    /**
     * Tells whether the displayed text of an anchor is consistent with its href. The href is
     * never truncated, but the text may be truncated by apache, in which case it ends with
     * <code>..&gt;</code> (raw or HTML-escaped).
     *
     * @param href
     *            the relative href, as returned by {@link #toChildHref(String, String)}
     * @param text
     *            the trimmed displayed text of the anchor
     * @return <code>true</code> if the text matches the href
     */
    private static boolean matchesDisplayedText(String href, String text) {
        if (text.endsWith("..>")) {
            // text is probably truncated, we can only check if the href starts with text
            return href.startsWith(text.substring(0, text.length() - "..>".length()));
        }
        if (text.endsWith("..&gt;")) {
            // same as above, with the closing '>' escaped as an HTML entity
            return href.startsWith(text.substring(0, text.length() - "..&gt;".length()));
        }
        // text is not truncated, so it must match the url after stripping optional
        // trailing slashes
        String strippedHref = href.endsWith("/") ? href.substring(0, href.length() - 1) : href;
        String strippedText = text.endsWith("/") ? text.substring(0, text.length() - 1) : text;
        return strippedHref.equalsIgnoreCase(strippedText);
    }
}