blob: 6e86fc66c5898e7d009b186c7d14c999d8118776 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.parsefilter.regex;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.lang.invoke.MethodHandles;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.plugin.Extension;
import org.apache.nutch.plugin.PluginRepository;
import org.apache.nutch.protocol.Content;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.DocumentFragment;
/**
* RegexParseFilter. If a regular expression matches either HTML or
* extracted text, a configurable field is set to true.
*/
public class RegexParseFilter implements HtmlParseFilter {
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
private static String attributeFile = null;
private Configuration conf;
private static final Map<String,RegexRule> rules = new HashMap<>();
public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
Parse parse = parseResult.get(content.getUrl());
String html = new String(content.getContent());
String text = parse.getText();
for (Map.Entry<String, RegexRule> entry : rules.entrySet()) {
String field = entry.getKey();
RegexRule regexRule = entry.getValue();
String source = null;
if (regexRule.source.equalsIgnoreCase("html")) {
source = html;
}
if (regexRule.source.equalsIgnoreCase("text")) {
source = text;
}
if (source == null) {
LOG.error("source for regex rule: " + field + " misconfigured");
}
if (matches(source, regexRule.regex)) {
parse.getData().getParseMeta().set(field, "true");
} else {
parse.getData().getParseMeta().set(field, "false");
}
}
return parseResult;
}
public void setConf(Configuration conf) {
this.conf = conf;
// get the extensions for domain urlfilter
String pluginName = "parsefilter-regex";
Extension[] extensions = PluginRepository.get(conf).getExtensionPoint(
HtmlParseFilter.class.getName()).getExtensions();
for (int i = 0; i < extensions.length; i++) {
Extension extension = extensions[i];
if (extension.getDescriptor().getPluginId().equals(pluginName)) {
attributeFile = extension.getAttribute("file");
break;
}
}
// handle blank non empty input
if (attributeFile != null && attributeFile.trim().equals("")) {
attributeFile = null;
}
if (attributeFile != null) {
if (LOG.isInfoEnabled()) {
LOG.info("Attribute \"file\" is defined for plugin " + pluginName
+ " as " + attributeFile);
}
}
else {
if (LOG.isWarnEnabled()) {
LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin "
+ pluginName);
}
}
String file = conf.get("parsefilter.regex.file", attributeFile);
String stringRules = conf.get("parsefilter.regex.rules");
Reader reader = null;
if (stringRules != null) { // takes precedence over files
reader = new StringReader(stringRules);
} else {
reader = conf.getConfResourceAsReader(file);
}
try {
if (reader == null) {
reader = new FileReader(file);
}
readConfiguration(reader);
}
catch (IOException e) {
LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
}
}
public Configuration getConf() {
return this.conf;
}
private boolean matches(String value, Pattern pattern) {
if (value != null) {
Matcher matcher = pattern.matcher(value);
return matcher.find();
}
return false;
}
private synchronized void readConfiguration(Reader configReader) throws IOException {
if (rules.size() > 0) {
return;
}
String line;
BufferedReader reader = new BufferedReader(configReader);
while ((line = reader.readLine()) != null) {
if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
line = line.trim();
String[] parts = line.split("\\s");
if (parts.length == 3) {
String field = parts[0].trim();
String source = parts[1].trim();
String regex = parts[2].trim();
rules.put(field, new RegexRule(source, regex));
} else {
LOG.info("RegexParseFilter rule is invalid. " + line);
}
}
}
}
private static class RegexRule {
public RegexRule(String source, String regex) {
this.source = source;
this.regex = Pattern.compile(regex);
}
String source;
Pattern regex;
}
}