blob: 503310a9381629a1a168d42e222f2656d03886ff [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.indexer.replace;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.indexer.NutchField;
import org.apache.nutch.parse.Parse;
/**
* Do pattern replacements on selected field contents prior to indexing.
*
* To use this plugin, add <code>index-replace</code> to your
* <code>plugin.includes</code>. Example:
*
* <pre>
* &lt;property&gt;
* &lt;name&gt;plugin.includes&lt;/name&gt;
* &lt;value&gt;protocol-(http)|urlfilter-regex|parse-(html|tika|metatags)|index-(basic|anchor|metadata|replace)|urlnormalizer-(pass|regex|basic)|indexer-solr&lt;/value&gt;
* &lt;/property&gt;
* </pre>
*
* And then add the <code>index.replace.regexp</code> property to
* <code>conf/nutch-site.xml</code>. This contains a list of replacement
* instructions per field name, one per line, e.g.:
*
* <pre>
* fieldname=/regexp/replacement/[flags]
* </pre>
*
* <pre>
* &lt;property&gt;
* &lt;name&gt;index.replace.regexp&lt;/name&gt;
* &lt;value&gt;
* hostmatch=.*\\.com
* title=/search/replace/2
* &lt;/value&gt;
* &lt;/property&gt;
* </pre>
*
* <code>hostmatch=</code> and <code>urlmatch=</code> lines indicate the match
* pattern for a host or url. The field replacements that follow this line will
* apply only to pages from the matching host or url. Replacements run in the
* order specified. Field names may appear multiple times if multiple
* replacements are needed.
*
* The property format is defined in greater detail in
* <code>conf/nutch-default.xml</code>.
*
* @author Peter Ciuffetti
* @see <a
* href="https://issues.apache.org/jira/browse/NUTCH-2058">NUTCH-2058</a>
*/
public class ReplaceIndexer implements IndexingFilter {
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
/** Special field name signifying the start of a host-specific match set */
private static final String HOSTMATCH = "hostmatch";
/** Special field name signifying the start of a url-specific match set */
private static final String URLMATCH = "urlmatch";
private static Map<Pattern, List<FieldReplacer>> FIELDREPLACERS_BY_HOST = new LinkedHashMap<Pattern, List<FieldReplacer>>();
private static Map<Pattern, List<FieldReplacer>> FIELDREPLACERS_BY_URL = new LinkedHashMap<Pattern, List<FieldReplacer>>();
private static Pattern LINE_SPLIT = Pattern.compile("(^.+$)+",
Pattern.MULTILINE);
private static Pattern NAME_VALUE_SPLIT = Pattern.compile("(.*?)=(.*)");
private Configuration conf;
/**
* {@inheritDoc}
*/
public void setConf(Configuration conf) {
this.conf = conf;
FIELDREPLACERS_BY_HOST.clear();
FIELDREPLACERS_BY_URL.clear();
String value = conf.get("index.replace.regexp", null);
if (value != null) {
LOG.debug("Parsing index.replace.regexp property");
this.parseConf(value);
}
}
/**
* {@inheritDoc}
*/
public Configuration getConf() {
return this.conf;
}
/**
* Parse the property value into a set of maps that store a list of
* replacements by field for each host and url configured into the property.
*
* @param propertyValue
*/
private void parseConf(String propertyValue) {
if (propertyValue == null || propertyValue.trim().length() == 0) {
return;
}
// At the start, all replacements apply globally to every host.
Pattern hostPattern = Pattern.compile(".*");
Pattern urlPattern = null;
// Split the property into lines
Matcher lineMatcher = LINE_SPLIT.matcher(propertyValue);
while (lineMatcher.find()) {
String line = lineMatcher.group();
if (line != null && line.length() > 0) {
// Split the line into field and value
Matcher nameValueMatcher = NAME_VALUE_SPLIT.matcher(line.trim());
if (nameValueMatcher.find()) {
String fieldName = nameValueMatcher.group(1).trim();
String value = nameValueMatcher.group(2);
if (fieldName != null && value != null) {
// Check if the field name is one of our special cases.
if (HOSTMATCH.equals(fieldName)) {
urlPattern = null;
try {
hostPattern = Pattern.compile(value);
} catch (PatternSyntaxException pse) {
LOG.error("hostmatch pattern " + value + " does not compile: "
+ pse.getMessage());
// Deactivate this invalid match set by making it match no host.
hostPattern = Pattern.compile("willnotmatchanyhost");
}
} else if (URLMATCH.equals(fieldName)) {
try {
urlPattern = Pattern.compile(value);
} catch (PatternSyntaxException pse) {
LOG.error("urlmatch pattern " + value + " does not compile: "
+ pse.getMessage());
// Deactivate this invalid match set by making it match no url.
urlPattern = Pattern.compile("willnotmatchanyurl");
}
} else if (value.length() > 3) {
String toFieldName = fieldName;
// If the fieldname has a colon, this indicates a different target
// field.
if (fieldName.indexOf(':') > 0) {
toFieldName = fieldName.substring(fieldName.indexOf(':') + 1);
fieldName = fieldName.substring(0, fieldName.indexOf(':'));
}
String sep = value.substring(0, 1);
// Divide the value into pattern / replacement / flags.
value = value.substring(1);
if (!value.contains(sep)) {
LOG.error("Pattern '" + line
+ "', not parseable. Missing separator " + sep);
continue;
}
String pattern = value.substring(0, value.indexOf(sep));
value = value.substring(pattern.length() + 1);
String replacement = value;
if (value.contains(sep)) {
replacement = value.substring(0, value.indexOf(sep));
}
int flags = 0;
if (value.length() > replacement.length() + 1) {
value = value.substring(replacement.length() + 1).trim();
try {
flags = Integer.parseInt(value);
} catch (NumberFormatException e) {
LOG.error("Pattern " + line + ", has invalid flags component");
continue;
}
}
Integer iFlags = (flags > 0) ? Integer.valueOf(flags) : null;
// Make a FieldReplacer out of these params.
FieldReplacer fr = new FieldReplacer(fieldName, toFieldName,
pattern, replacement, iFlags);
// Add this field replacer to the list for this host or URL.
if (urlPattern != null) {
List<FieldReplacer> lfp = FIELDREPLACERS_BY_URL.get(urlPattern);
if (lfp == null) {
lfp = new ArrayList<FieldReplacer>();
}
lfp.add(fr);
FIELDREPLACERS_BY_URL.put(urlPattern, lfp);
} else {
List<FieldReplacer> lfp = FIELDREPLACERS_BY_HOST
.get(hostPattern);
if (lfp == null) {
lfp = new ArrayList<FieldReplacer>();
}
lfp.add(fr);
FIELDREPLACERS_BY_HOST.put(hostPattern, lfp);
}
}
}
}
}
}
}
/**
* {@inheritDoc}
*/
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
CrawlDatum datum, Inlinks inlinks) throws IndexingException {
if (doc != null) {
if (FIELDREPLACERS_BY_HOST.size() > 0) {
this.doReplace(doc, "host", FIELDREPLACERS_BY_HOST);
}
if (FIELDREPLACERS_BY_URL.size() > 0) {
this.doReplace(doc, "url", FIELDREPLACERS_BY_URL);
}
}
return doc;
}
/**
* Iterates through the replacement map provided, to update the fields in the
* Nutch Document.
*
* @param doc
* the document we are modifying
* @param keyName
* either "host" or "url" -- the field that determines the
* replacement set used
* @param replaceMap
* the list of FieldReplacers that applies to this keyName.
*/
private void doReplace(NutchDocument doc, String keyName,
Map<Pattern, List<FieldReplacer>> replaceMap) {
if (doc == null || replaceMap.size() == 0) {
return;
}
Collection<String> docFieldNames = doc.getFieldNames();
NutchField keyField = doc.getField(keyName);
if (keyField == null) {
// This document doesn't have the key field; no work to do.
return;
}
List<Object> keyFieldValues = keyField.getValues();
if (keyFieldValues.size() == 0) {
// This document doesn't have any values for the key field; no work to do.
return;
}
// For every value of the keyField (one expected)
for (Object oKeyFieldValue : keyFieldValues) {
if (oKeyFieldValue != null && oKeyFieldValue instanceof java.lang.String) {
String keyFieldValue = (String) oKeyFieldValue;
// For each pattern that we have a replacement list for...
for (Map.Entry<Pattern, List<FieldReplacer>> entries : replaceMap
.entrySet()) {
// If this key is a match for a replacement set...
if (entries.getKey().matcher(keyFieldValue).find()) {
// For each field we will replace for this key...
for (FieldReplacer fp : entries.getValue()) {
String fieldName = fp.getFieldName();
// Does this document contain the FieldReplacer's field?
if (docFieldNames.contains(fieldName)) {
NutchField docField = doc.getField(fieldName);
List<Object> fieldValues = docField.getValues();
ArrayList<String> newFieldValues = new ArrayList<String>();
// For each value of the field, match against our
// replacer...
for (Object oFieldValue : fieldValues) {
if (oFieldValue != null
&& oFieldValue instanceof java.lang.String) {
String fieldValue = (String) oFieldValue;
String newValue = fp.replace(fieldValue);
newFieldValues.add(newValue);
}
}
// Remove the target field and add our replaced values.
String targetFieldName = fp.getToFieldName();
doc.removeField(targetFieldName);
for (String newFieldValue : newFieldValues) {
doc.add(targetFieldName, newFieldValue);
}
}
}
}
}
}
}
}
}