/* | |
* Licensed to the Apache Software Foundation (ASF) under one or more | |
* contributor license agreements. See the NOTICE file distributed with | |
* this work for additional information regarding copyright ownership. | |
* The ASF licenses this file to You under the Apache License, Version 2.0 | |
* (the "License"); you may not use this file except in compliance with | |
* the License. You may obtain a copy of the License at | |
* | |
* http://www.apache.org/licenses/LICENSE-2.0 | |
* | |
* Unless required by applicable law or agreed to in writing, software | |
* distributed under the License is distributed on an "AS IS" BASIS, | |
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
* See the License for the specific language governing permissions and | |
* limitations under the License. | |
*/ | |
package opennlp.tools.similarity.apps.utils; | |
import java.io.BufferedReader; | |
import java.io.File; | |
import java.io.IOException; | |
import java.io.InputStreamReader; | |
import java.net.MalformedURLException; | |
import java.net.URL; | |
import java.net.URLConnection; | |
import java.util.logging.Logger; | |
import org.apache.tika.Tika; | |
import org.apache.tika.exception.TikaException; | |
import org.apache.tika.metadata.Metadata; | |
import org.apache.tika.parser.AutoDetectParser; | |
import org.apache.tika.parser.ParseContext; | |
import org.apache.tika.parser.Parser; | |
import org.apache.tika.sax.BodyContentHandler; | |
public class PageFetcher { | |
private static final Logger log = Logger | |
.getLogger("opennlp.tools.similarity.apps.utils.PageFetcher"); | |
Tika tika = new Tika(); | |
private static int DEFAULT_TIMEOUT = 1500; | |
private void setTimeout(int to){ | |
DEFAULT_TIMEOUT = to; | |
} | |
public String fetchPage(final String url) { | |
return fetchPage(url, DEFAULT_TIMEOUT); | |
} | |
public String fetchPageAutoDetectParser(final String url ){ | |
String fetchURL = addHttp(url); | |
String pageContent = null; | |
URLConnection connection; | |
try { | |
log.info("fetch url auto detect parser " + url); | |
connection = new URL(fetchURL).openConnection(); | |
connection.setReadTimeout(DEFAULT_TIMEOUT); | |
//parse method parameters | |
Parser parser = new AutoDetectParser(); | |
BodyContentHandler handler = new BodyContentHandler(); | |
Metadata metadata = new Metadata(); | |
ParseContext context = new ParseContext(); | |
//parsing the file | |
parser.parse(connection.getInputStream(), handler, metadata, context); | |
pageContent = handler.toString(); | |
} catch (Exception e) { | |
log.info(e.getMessage() + "\n" + e); | |
} | |
return pageContent; | |
} | |
public String fetchPage(final String url, final int timeout) { | |
String fetchURL = addHttp(url); | |
log.info("fetch url " + fetchURL); | |
String pageContent = null; | |
URLConnection connection; | |
try { | |
connection = new URL(fetchURL).openConnection(); | |
connection.setReadTimeout(DEFAULT_TIMEOUT); | |
pageContent = tika.parseToString(connection.getInputStream()) | |
.replace('\n', ' ').replace('\t', ' '); | |
} catch (MalformedURLException e) { | |
log.severe(e.getMessage() + "\n" + e); | |
} catch (IOException e) { | |
log.severe(e.getMessage() + "\n" + e); | |
} catch (TikaException e) { | |
log.severe(e.getMessage() + "\n" + e); | |
} | |
return pageContent; | |
} | |
private String addHttp(final String url) { | |
if (!url.startsWith("http://") && !url.startsWith("https://")) { | |
return "http://" + url; | |
} | |
return url; | |
} | |
public String fetchOrigHTML(String url, int timeout) { | |
setTimeout(timeout); | |
return fetchOrigHTML(url); | |
} | |
public String fetchOrigHTML(String url) { | |
log.info("fetch url " + url); | |
StringBuffer buf = new StringBuffer(); | |
try { | |
URLConnection connection = new URL(url).openConnection(); | |
connection.setReadTimeout(DEFAULT_TIMEOUT); | |
connection | |
.setRequestProperty( | |
"User-Agent", | |
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3"); | |
String line; | |
BufferedReader reader = null; | |
try { | |
reader = new BufferedReader(new InputStreamReader( | |
connection.getInputStream())); | |
} catch (Exception e) { | |
// we dont always need to log trial web pages if access fails | |
log.severe(e.toString()); | |
} | |
while ((line = reader.readLine()) != null) { | |
buf.append(line); | |
} | |
} | |
// normal case when a hypothetical page does not exist | |
catch (Exception e) { | |
// LOG.error(e.getMessage(), e); | |
// System.err.println("error fetching url " + url); | |
} | |
/* try { | |
Thread.sleep(50); // do nothing 4 sec | |
} catch (InterruptedException e) { | |
e.printStackTrace(); | |
} */ | |
return buf.toString(); | |
} | |
public static void main(String[] args){ | |
PageFetcher fetcher = new PageFetcher(); | |
String content = fetcher.fetchPageAutoDetectParser("http://www.elastica.net/"); | |
System.out.println(content); | |
content = fetcher. | |
fetchPageAutoDetectParser("http://www.cnn.com"); | |
System.out.println(content); | |
content = new PageFetcher().fetchPage("https://github.com"); | |
System.out.println(content); | |
content = new PageFetcher().fetchOrigHTML("http://www.cnn.com"); | |
System.out.println(content); | |
} | |
} |