/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.any23.cli;

import com.beust.jcommander.IStringConverter;
import com.beust.jcommander.Parameter;
import com.beust.jcommander.ParameterException;
import com.beust.jcommander.Parameters;
import com.beust.jcommander.converters.FileConverter;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.parser.ParseData;
import org.apache.any23.plugin.crawler.CrawlerListener;
import org.apache.any23.plugin.crawler.SiteCrawler;
import org.apache.any23.source.StringDocumentSource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.net.URL;
import java.util.Locale;
import java.util.UUID;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import static java.lang.String.format;

/**
* Implementation of a <b>CLI crawler</b> based on
* {@link Rover}.
*
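* <p>
* Example invocation (illustrative; option values and the seed URL are placeholders):
* <pre>
* any23 crawler -nc 4 -mp 1000 -md 2 -pd 500 http://www.example.org/
* </pre>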
* @author Michele Mostarda (mostarda@fbk.eu)
*/
@Parameters(commandNames = "crawler", commandDescription = "Any23 Crawler Command Line Tool.")
public class Crawler extends Rover {

    private static final Logger LOG = LoggerFactory.getLogger(Crawler.class);

    /* Serializes access to Rover#performExtraction(), which is invoked from multiple crawler threads. */
    private final Object roverLock = new Object();

    @Parameter(
            names = { "-pf", "--pagefilter" },
            description = "Regex used to filter out page URLs during crawling.",
            converter = PatternConverter.class
    )
    private Pattern pageFilter = Pattern.compile(SiteCrawler.DEFAULT_PAGE_FILTER_RE);
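
    /* The default storage folder is unique per run (random UUID), so concurrent crawls do not collide. */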
    @Parameter(
            names = { "-sf", "--storagefolder" },
            description = "Folder used to store crawler temporary data.",
            converter = FileConverter.class
    )
    private File storageFolder = new File(System.getProperty("java.io.tmpdir"),
            "crawler-metadata-" + UUID.randomUUID());

    @Parameter(names = { "-nc", "--numcrawlers" }, description = "Sets the number of crawlers.")
    private int numCrawlers = SiteCrawler.DEFAULT_NUM_OF_CRAWLERS;

    @Parameter(names = { "-mp", "--maxpages" }, description = "Max number of pages before interrupting crawl.")
    private int maxPages = Integer.MAX_VALUE;

    @Parameter(names = { "-md", "--maxdepth" }, description = "Max allowed crawler depth.")
    private int maxDepth = Integer.MAX_VALUE;

    @Parameter(names = { "-pd", "--politenessdelay" }, description = "Politeness delay in milliseconds.")
    private int politenessDelay = Integer.MAX_VALUE;
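
    /**
     * Validates the command line arguments (a single seed IRI and a usable storage folder),
     * configures the {@link SiteCrawler}, and starts the crawl, feeding every fetched
     * HTML page to the {@link Rover} extraction pipeline.
     */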
    @Override
    public void run() throws Exception {
        super.configure();

        if (inputIRIs.size() != 1) {
            throw new IllegalArgumentException("Expected exactly one seed URL.");
        }
        final URL seed = new URL(inputIRIs.get(0));

        if (storageFolder.isFile()) {
            throw new IllegalStateException(format(Locale.ROOT,
                    "Storage folder %s cannot be a file, it must be a directory", storageFolder));
        }
        if (!storageFolder.exists() && !storageFolder.mkdirs()) {
            throw new IllegalStateException(format(Locale.ROOT,
                    "Storage folder %s cannot be created, please verify you have sufficient permissions",
                    storageFolder));
        }

        final SiteCrawler siteCrawler = new SiteCrawler(storageFolder);
        siteCrawler.setNumOfCrawlers(numCrawlers);
        siteCrawler.setMaxPages(maxPages);
        siteCrawler.setMaxDepth(maxDepth);
        siteCrawler.setPolitenessDelay(politenessDelay);
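
        // Forward every successfully parsed HTML page to the Rover extraction pipeline.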
        siteCrawler.addListener(new CrawlerListener() {
            @Override
            public void visitedPage(Page page) {
                final String pageURL = page.getWebURL().getURL();
                LOG.info(format(Locale.ROOT, "Processing page: [%s]", pageURL));

                final ParseData parseData = page.getParseData();
                if (parseData instanceof HtmlParseData) {
                    final HtmlParseData htmlParseData = (HtmlParseData) parseData;
                    try {
                        // Extraction is serialized across crawler threads via roverLock.
                        synchronized (roverLock) {
                            Crawler.super.performExtraction(
                                    new StringDocumentSource(htmlParseData.getHtml(), pageURL)
                            );
                        }
                    } catch (Exception e) {
                        // Log the full stack trace rather than just the message, then keep crawling.
                        LOG.error(format(Locale.ROOT, "Error while processing page [%s].", pageURL), e);
                    }
                }
            }
        });
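
        // On JVM shutdown (e.g. when a long-running crawl is interrupted with Ctrl-C),
        // dump the extraction reports accumulated so far.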
        Runtime.getRuntime().addShutdownHook(new Thread() {
            @Override
            public void run() {
                try {
                    LOG.error(Crawler.super.printReports());
                } catch (Exception e) {
                    LOG.error("Error while printing the final reports.", e);
                }
            }
        });
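
        // Start the crawl from the seed URL, applying the configured page filter.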
        siteCrawler.start(seed, pageFilter, true);
    }
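
    /**
     * {@link IStringConverter} turning the {@code --pagefilter} argument into a compiled
     * {@link Pattern}, reporting invalid regular expressions as a {@link ParameterException}.
     */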
    public static final class PatternConverter implements IStringConverter<Pattern> {
        @Override
        public Pattern convert(String value) {
            try {
                return Pattern.compile(value);
            } catch (PatternSyntaxException pse) {
                throw new ParameterException(
                        format(Locale.ROOT, "Invalid page filter, '%s' must be a regular expression.", value), pse);
            }
        }
    }

}