blob: ccb23fbde59c840ad4818c8ffa6e900af5b86f94 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.any23.plugin.crawler;
import edu.uci.ics.crawler4j.crawler.Page;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
/**
 * This class hosts the crawl configuration (seed, page filter pattern and
 * listeners) accessible to all the {@link DefaultWebCrawler} instances
 * run by the {@link SiteCrawler}.
 *
 * <p>The configuration is held in a static singleton: it is installed with
 * {@link #setCrawlData(String, Pattern, List)} and retrieved with
 * {@link #getInstance()}.</p>
 *
 * @author Michele Mostarda (mostarda@fbk.eu)
 */
public class SharedData {

    /**
     * Singleton instance; {@code null} until
     * {@link #setCrawlData(String, Pattern, List)} has been invoked.
     */
    // NOTE(review): plain (non-volatile) static field. Presumably it is set
    // before any crawler thread reads it -- confirm against the caller.
    private static SharedData instance;

    /**
     * Crawl seed.
     */
    private final String seed;

    /**
     * Crawl page filter pattern. Stored as received, without validation.
     */
    private final Pattern pattern;

    /**
     * List of crawler listeners. Never {@code null}.
     */
    private final List<CrawlerListener> listeners;

    /**
     * @return the singleton instance.
     * @throws IllegalStateException if {@link #setCrawlData(String, Pattern, List)}
     *         has not been invoked yet.
     */
    protected static SharedData getInstance() {
        if (instance == null) {
            throw new IllegalStateException("The configuration has not yet initialized.");
        }
        return instance;
    }

    /**
     * Initializes the crawler data, replacing any previously installed instance.
     *
     * @param seed crawler seed; must be neither {@code null} nor blank.
     * @param regex page filter regex.
     * @param listeners the listeners to be notified of the crawler activity;
     *        {@code null} is treated as "no listeners".
     * @throws IllegalArgumentException if <code>seed</code> is {@code null} or blank.
     */
    protected static void setCrawlData(String seed, Pattern regex, List<CrawlerListener> listeners) {
        instance = new SharedData(seed, regex, listeners);
    }

    /**
     * Internal constructor.
     *
     * @param seed crawl seed; rejected when {@code null} or when it contains
     *        only whitespace.
     * @param pattern page filter pattern, stored as received.
     * @param listeners listeners to notify. The list is kept by reference (not
     *        copied), so later changes made to it by the caller are observed
     *        by {@link #notifyPage(Page)}. {@code null} is replaced with an
     *        empty list.
     * @throws IllegalArgumentException if <code>seed</code> is {@code null} or blank.
     */
    private SharedData(String seed, Pattern pattern, List<CrawlerListener> listeners) {
        if (seed == null || seed.trim().isEmpty()) {
            throw new IllegalArgumentException(
                    String.format(Locale.ROOT, "Invalid seed '%s'", seed)
            );
        }
        this.seed = seed;
        this.pattern = pattern;
        // Normalise a missing list to an empty one so that notifyPage(Page)
        // never fails with a NullPointerException in the middle of a crawl.
        this.listeners = listeners == null
                ? Collections.<CrawlerListener>emptyList()
                : listeners;
    }

    /**
     * @return crawl seed.
     */
    protected String getSeed() {
        return seed;
    }

    /**
     * @return page filter pattern.
     */
    protected Pattern getPattern() {
        return pattern;
    }

    /**
     * Notifies all listeners that a page has been discovered, in the
     * iteration order of the listener list. An exception thrown by a
     * listener is not caught here: it propagates to the caller and the
     * remaining listeners are not notified.
     *
     * @param page the discovered page.
     */
    protected void notifyPage(Page page) {
        for (CrawlerListener listener : listeners) {
            listener.visitedPage(page);
        }
    }
}