blob: a0065988923e4ae345bbaaf7387f27abd857ce3e [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.oodt.cas.protocol.http.util;
//JDK imports
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
//APACHE imports
import org.apache.commons.lang.Validate;
//OODT imports
import org.apache.oodt.cas.metadata.util.MimeTypeUtils;
import org.apache.oodt.cas.protocol.http.HttpFile;
/**
* Utility methods for HTTP Protocol related tasks.
*
* @author bfoster
*/
public class HttpUtils {

  /** MIME type detector used by {@link #isDirectory(URL, String)}. */
  static final MimeTypeUtils MIME_TYPES = new MimeTypeUtils();

  // Pattern looking for <a href="(group-2)">(group-3)</a> . . . group-1 is for either " or '
  static final Pattern XHTML_LINK_PATTERN = Pattern.compile("<\\s*a\\s+href\\s*=\\s*(['\"])(.+?)\\1\\s*>(.+?)<\\s*/\\s*a\\s*>");

  // Pattern looking for self-closed anchors <a href="(group-2)"/> . . . group-1 is for either " or '
  static final Pattern LAZY_LINK_PATTERN = Pattern.compile("<\\s*a\\s+href\\s*=\\s*(['\"])(.+?)\\1\\s*/\\s*>");

  /** Static utility class; not meant to be instantiated. */
  private HttpUtils() {}

  /**
   * Resolves a path against given {@link URI} and creates the resolved {@link URI}.
   * (i.e. base = "http://localhost" ; path = "/path/to/file" ; resolved = "http://localhost/path/to/file")
   * <ul>
   * <li>A path that is already an absolute {@code http://} or {@code https://} URL
   * (scheme compared case-insensitively) is returned as-is.</li>
   * <li>A network-path reference ({@code //host/path}) inherits only the scheme of the base.</li>
   * <li>An absolute path ({@code /path}) replaces the path of the base while keeping its
   * scheme and full authority (user-info, host and port).</li>
   * <li>Any other path is appended to the base, which is treated as a directory
   * (a single {@code /} separator is inserted when the base does not end with one).</li>
   * </ul>
   *
   * @param base The base {@link URI} which the given path will be resolved against.
   * @param path The path to be resolved against the given {@link URI}
   * @return resolved {@link URI}.
   * @throws URISyntaxException If the combined string is not a valid {@link URI}.
   */
  public static URI resolveUri(URI base, String path) throws URISyntaxException {
    Validate.notNull(base, "base URI must not be NULL");
    Validate.notNull(path, "resolve path must not be NULL");

    if (hasHttpScheme(path)) {
      return new URI(path);
    }
    if (path.startsWith("//")) {
      // Network-path reference: carries its own authority, only the scheme is inherited.
      return new URI(base.getScheme() + ":" + path);
    }
    if (path.startsWith("/")) {
      // Use the whole authority so that a non-default port (or user-info) on the
      // base is not lost; fall back to the host if no authority is available.
      String authority = base.getRawAuthority();
      if (authority == null) {
        authority = base.getHost();
      }
      return new URI(base.getScheme() + "://" + authority + path);
    }

    String baseString = base.toString();
    String separator = baseString.endsWith("/") ? "" : "/";
    return new URI(baseString + separator + path);
  }

  /** Returns true if the given link starts with an http:// or https:// scheme (any letter case). */
  private static boolean hasHttpScheme(String link) {
    return link.regionMatches(true, 0, "http://", 0, "http://".length())
        || link.regionMatches(true, 0, "https://", 0, "https://".length());
  }

  /**
   * Opens an {@link HttpURLConnection} to the given {@link URL}, connects it and
   * requests the response message before handing the connection back.
   * NOTE(review): the response message is presumably requested so that the exchange
   * (including any redirect) has completed before the caller inspects the
   * connection -- confirm against callers.
   *
   * @param url The {@link URL} to connect to; expected to be an HTTP(S) URL since the
   *          connection is cast to {@link HttpURLConnection}.
   * @return the connected {@link HttpURLConnection}.
   * @throws IOException If opening or using the connection fails.
   */
  public static HttpURLConnection connect(URL url) throws IOException {
    HttpURLConnection conn = (HttpURLConnection) url.openConnection();
    conn.connect();
    conn.getResponseMessage();
    return conn;
  }

  /**
   * Tells whether two URLs differ, comparing their string forms.
   *
   * @param beforeConnUrl The {@link URL} that was requested.
   * @param afterConnUrl The {@link URL} reported after connecting.
   * @return true if the string forms of the two URLs are not equal.
   */
  public static boolean checkForRedirection(URL beforeConnUrl, URL afterConnUrl) {
    return !beforeConnUrl.toString().equals(afterConnUrl.toString());
  }

  /**
   * Reads the body of the given connection into a single {@link String}. Lines are
   * concatenated without their line terminators. The connection's input stream is
   * closed before this method returns.
   *
   * @param conn The connection whose input stream is read.
   * @return the content of the stream with line terminators removed.
   * @throws IOException If the input stream cannot be obtained or reading it fails.
   */
  public static String readUrl(HttpURLConnection conn) throws IOException {
    Scanner scanner = new Scanner(conn.getInputStream());
    try {
      StringBuilder content = new StringBuilder();
      while (scanner.hasNext()) {
        content.append(scanner.nextLine());
      }
      // Scanner swallows read errors and simply reports end of input; surface them
      // so callers never get a silently truncated page.
      IOException readFailure = scanner.ioException();
      if (readFailure != null) {
        throw readFailure;
      }
      return content.toString();
    } finally {
      // Closing the scanner also closes the underlying input stream.
      scanner.close();
    }
  }

  /**
   * Downloads the page behind the given {@link HttpFile} and extracts every anchor
   * matched by {@link #XHTML_LINK_PATTERN} followed by every anchor matched by
   * {@link #LAZY_LINK_PATTERN}. The page is downloaded once and both patterns are
   * applied to the same content.
   *
   * @param file The {@link HttpFile} whose link is downloaded and scanned; it becomes
   *          the parent of every returned {@link HttpFile}.
   * @return the links found, in match order (regular anchors first, then self-closed ones).
   * @throws IOException If the page, or any linked URL probed by
   *           {@link #isDirectory(URL, String)}, cannot be read.
   * @throws URISyntaxException If a link cannot be resolved into a valid {@link URI}.
   */
  public static List<HttpFile> findLinks(HttpFile file) throws IOException, URISyntaxException {
    String page = readUrl(connect(file.getLink()));
    List<HttpFile> httpFiles = new ArrayList<HttpFile>();

    Matcher matcher = XHTML_LINK_PATTERN.matcher(page);
    while (matcher.find()) {
      String link = matcher.group(2).trim();
      String virtualPath = matcher.group(3).trim();
      URL url = resolveUri(file.getLink().toURI(), link).toURL();
      httpFiles.add(new HttpFile(file, link, isDirectory(url, virtualPath), url));
    }

    matcher = LAZY_LINK_PATTERN.matcher(page);
    while (matcher.find()) {
      String link = matcher.group(2).trim();
      URL url = resolveUri(file.getLink().toURI(), link).toURL();
      // Self-closed anchors have no text, so the link itself serves as the virtual path.
      httpFiles.add(new HttpFile(file, link, isDirectory(url, link), url));
    }
    return httpFiles;
  }

  /**
   * Decides whether the given {@link URL} is a directory listing: its detected MIME
   * type must be {@code text/html} and the virtual path must not end with
   * {@code .html}. The stream opened to detect the MIME type is closed before
   * this method returns.
   *
   * @param url The {@link URL} to probe.
   * @param virtualPath The name under which the URL was linked.
   * @return true if the URL is considered a directory; false otherwise, including
   *         when no MIME type could be determined.
   * @throws IOException If the URL cannot be opened or probing it fails for any reason.
   */
  public static boolean isDirectory(URL url, String virtualPath) throws IOException {
    try {
      InputStream in = url.openStream();
      try {
        String mime = MIME_TYPES.autoResolveContentType(url.toString(),
            MimeTypeUtils.readMagicHeader(in));
        // Constant-first equals: a null MIME type means "not a directory" instead of
        // an NPE that would be reported as a misleading "URL does not exist".
        return "text/html".equals(mime) && !virtualPath.endsWith(".html");
      } finally {
        try {
          in.close();
        } catch (IOException ignored) {
          // The result is already determined; a failing close must not mask it.
        }
      }
    } catch (Exception e) {
      throw new IOException("URL does not exist '" + url + "'", e);
    }
  }
}