blob: b82ffd627018ed42ea69655a813fac19ea1bc46a [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.collection;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.nutch.net.URLFilter;
import org.apache.xerces.util.DOMUtil;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
/**
 * A Subcollection represents a named subset of an index. Membership is
 * decided by plain substring patterns: a URL belongs to the subcollection when
 * it contains at least one whitelist pattern and no blacklist pattern.
 *
 * <p>
 * When the configuration property {@code subcollection.case.insensitive} is
 * {@code true}, patterns parsed from text are lowercased and URLs are
 * lowercased before matching.
 * </p>
 */
public class Subcollection extends Configured implements URLFilter {

  public static final String TAG_COLLECTIONS = "subcollections";
  public static final String TAG_COLLECTION = "subcollection";
  public static final String TAG_WHITELIST = "whitelist";
  public static final String TAG_BLACKLIST = "blacklist";
  public static final String TAG_NAME = "name";
  public static final String TAG_KEY = "key";
  public static final String TAG_ID = "id";

  /** Configuration property that switches on case-insensitive matching. */
  private static final String CASE_INSENSITIVE_PROPERTY = "subcollection.case.insensitive";

  List<String> blackList = new ArrayList<String>();
  List<String> whiteList = new ArrayList<String>();

  /**
   * SubCollection identifier
   */
  String id;

  /**
   * SubCollection key
   */
  String key;

  /**
   * SubCollection name
   */
  String name;

  /**
   * SubCollection whitelist as String
   */
  String wlString;

  /**
   * SubCollection blacklist as String
   */
  String blString;

  /**
   * Whether the white and black lists are matched case insensitively
   */
  boolean caseInsensitive = false;

  /**
   * public Constructor
   *
   * @param id
   *          Id of SubCollection
   * @param name
   *          Name of SubCollection
   * @param conf
   *          A populated {@link Configuration}
   */
  public Subcollection(String id, String name, Configuration conf) {
    this(id, name, null, conf);
  }

  /**
   * public Constructor
   *
   * @param id
   *          Id of SubCollection
   * @param name
   *          Name of SubCollection
   * @param key
   *          SubCollection key
   * @param conf
   *          A populated {@link Configuration}
   */
  public Subcollection(String id, String name, String key, Configuration conf) {
    // this(conf) already reads the case-sensitivity flag from the configuration
    this(conf);
    this.id = id;
    this.key = key;
    this.name = name;
  }

  /**
   * Creates an empty SubCollection whose id, name, key and pattern lists are
   * filled in later, e.g. via {@link #initialize(Element)}.
   *
   * @param conf
   *          A populated {@link Configuration}
   */
  public Subcollection(Configuration conf) {
    super(conf);
    caseInsensitive = conf.getBoolean(CASE_INSENSITIVE_PROPERTY, false);
  }

  /**
   * @return Returns the name
   */
  public String getName() {
    return name;
  }

  /**
   * @return Returns the key, or {@code null} if none has been set
   */
  public String getKey() {
    return key;
  }

  /**
   * @return Returns the id
   */
  public String getId() {
    return id;
  }

  /**
   * Returns whitelist
   *
   * @return Whitelist entries (the live internal list, not a copy)
   */
  public List<String> getWhiteList() {
    return whiteList;
  }

  /**
   * Returns whitelist String
   *
   * @return Whitelist String
   */
  public String getWhiteListString() {
    return wlString;
  }

  /**
   * Returns blacklist String
   *
   * @return Blacklist String
   */
  public String getBlackListString() {
    return blString;
  }

  /**
   * Replaces the whitelist with the given list. The list is stored as is: its
   * entries are not trimmed or lowercased, and the whitelist String is left
   * unchanged.
   *
   * @param whiteList
   *          The whiteList to set; {@code null} is treated as an empty list.
   */
  public void setWhiteList(ArrayList<String> whiteList) {
    // Never keep a null list around: filter() and parseList() dereference it.
    this.whiteList = (whiteList != null) ? whiteList : new ArrayList<String>();
  }

  /**
   * Simple substring ("contains") filter for matching patterns.
   *
   * <pre>
   * rules for evaluation are as follows:
   * 1. if pattern matches in blacklist then url is rejected
   * 2. if pattern matches in whitelist then url is allowed
   * 3. url is rejected
   * </pre>
   *
   * When case-insensitive matching is enabled the URL is lowercased for the
   * comparison only; an accepted URL is returned unmodified.
   *
   * @param urlString
   *          the URL to test
   * @return {@code urlString} if it is part of this SubCollection, otherwise
   *         {@code null} (also for a {@code null} URL)
   * @see org.apache.nutch.net.URLFilter#filter(java.lang.String)
   */
  @Override
  public String filter(String urlString) {
    if (urlString == null) {
      return null;
    }

    // parseList() lowercases patterns when caseInsensitive is set, so the URL
    // has to be folded the same way or mixed-case URLs could never match.
    String candidate = caseInsensitive ? urlString.toLowerCase(Locale.ROOT)
        : urlString;

    // first the blacklist
    if (containsAny(candidate, blackList)) {
      return null;
    }

    // then whitelist
    return containsAny(candidate, whiteList) ? urlString : null;
  }

  /**
   * Tells whether the candidate contains at least one of the patterns as a
   * substring.
   */
  private static boolean containsAny(String candidate, List<String> patterns) {
    for (String pattern : patterns) {
      if (candidate.contains(pattern)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Initialize Subcollection from dom element. The {@code id}, {@code name}
   * and {@code whitelist} elements are required; {@code blacklist} is
   * optional, and {@code key} is only read when exactly one such element is
   * present.
   *
   * @param collection
   *          A DOM {@link org.w3c.dom.Element} for use in creating the
   *          {@link Subcollection}
   * @throws NullPointerException
   *           if a required child element is missing
   */
  public void initialize(Element collection) {
    this.id = requiredChildText(collection, TAG_ID);
    this.name = requiredChildText(collection, TAG_NAME);
    this.wlString = requiredChildText(collection, TAG_WHITELIST);
    parseList(this.whiteList, wlString);

    // Check if there's a blacklist we need to parse
    NodeList nodeList = collection.getElementsByTagName(TAG_BLACKLIST);
    if (nodeList.getLength() > 0) {
      this.blString = DOMUtil.getChildText(nodeList.item(0)).trim();
      parseList(this.blackList, blString);
    }

    // Read the key only if it is unambiguous; otherwise the key is left as is
    nodeList = collection.getElementsByTagName(TAG_KEY);
    if (nodeList.getLength() == 1) {
      this.key = DOMUtil.getChildText(nodeList.item(0)).trim();
    }
  }

  /**
   * Returns the trimmed text of the first element with the given tag name
   * below the parent, failing with a descriptive message when it is absent.
   */
  private static String requiredChildText(Element parent, String tag) {
    String text = DOMUtil.getChildText(
        parent.getElementsByTagName(tag).item(0));
    if (text == null) {
      // Same exception type as the former unguarded trim(), but it now names
      // the element that is missing from the definition.
      throw new NullPointerException(
          "Missing <" + tag + "> element in subcollection definition");
    }
    return text.trim();
  }

  /**
   * Create a list of patterns from a chunk of text, patterns are separated
   * with a newline. Each line is trimmed, blank lines are skipped, and lines
   * are lowercased when case-insensitive matching is enabled. Any previous
   * content of the list is discarded.
   *
   * @param list
   *          An initialized {@link List} to insert String patterns.
   * @param text
   *          A chunk of text (hopefully) containing patterns; {@code null} is
   *          treated as containing no patterns.
   */
  protected void parseList(List<String> list, String text) {
    list.clear();
    if (text == null) {
      return;
    }

    StringTokenizer st = new StringTokenizer(text, "\n\r");
    while (st.hasMoreTokens()) {
      String line = st.nextToken().trim();
      if (line.isEmpty()) {
        continue;
      }
      list.add(caseInsensitive ? line.toLowerCase(Locale.ROOT) : line);
    }
  }

  /**
   * Set contents of blacklist from String
   *
   * @param list
   *          the blacklist contents, one pattern per line
   */
  public void setBlackList(String list) {
    this.blString = list;
    parseList(blackList, list);
  }

  /**
   * Set contents of whitelist from String
   *
   * @param list
   *          the whitelist contents, one pattern per line
   */
  public void setWhiteList(String list) {
    this.wlString = list;
    parseList(whiteList, list);
  }
}