blob: 557b8eb665d809e5d83d3f6a37b3ba03f594006c [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.indexer.urlmeta;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.parse.Parse;
/**
* This is part of the URL Meta plugin. It is designed to enhance the NUTCH-655
* patch, by doing two things: 1. Meta Tags that are supplied with your Crawl
* URLs, during injection, will be propagated throughout the outlinks of those
* Crawl URLs. 2. When you index your URLs, the meta tags that you specified
* with your URLs will be indexed alongside those URLs--and can be directly
* queried, assuming you have done everything else correctly.
*
* The flat-file of URLs you are injecting should, per NUTCH-655, be
* tab-delimited in the form of:
*
* [www.url.com]\t[key1]=[value1]\t[key2]=[value2]...[keyN]=[valueN]
*
* Be aware that if you collide with keywords that are already in use (such as
* nutch.score/nutch.fetchInterval) then you are in for some unpredictable
* behavior.
*
* Furthermore, in your nutch-site.xml config, you must specify that this plugin
* is to be used (1), as well as what (2) Meta Tags it should actively look for.
* This does not mean that you must use these tags for every URL, but it does
* mean that you must list _all_ of the meta tags that you have specified, if
* you want them to be propagated and indexed.
*
* 1. As of Nutch 1.2, the property "plugin.includes" looks as follows:
* <value>protocol-http|urlfilter-regex|parse-(text|html|js|tika|rss)|index
* -(basic|anchor)|query-(basic|site|url)|response-(json|xml)|summary-basic
* |scoring-opic|urlnormalizer-(pass|regex|basic)</value> You must change
* "index-(basic|anchor)" to "index-(basic|anchor|urlmeta)", in order to call
* this plugin.
*
* 2. You must also specify the property "urlmeta.tags", whose values are
* comma-delimited: <value>key1, key2, key3</value>
*
* TODO: It may be ideal to offer two separate properties, to specify what gets
* indexed versus merely propagated.
*
*/
public class URLMetaIndexingFilter implements IndexingFilter {

  /** Name of the configuration property that lists the meta tags to index. */
  private static final String CONF_PROPERTY = "urlmeta.tags";

  /** Tag list used whenever no configuration (or no tags) is available. */
  private static final String[] NO_TAGS = new String[0];

  /**
   * Tags read from the "urlmeta.tags" property for this filter instance. Held
   * per instance rather than in a static field, so that filters created from
   * different configurations cannot overwrite each other's tag list. Never
   * null.
   */
  private String[] urlMetaTags = NO_TAGS;

  /** Configuration most recently passed to {@link #setConf}; may be null. */
  private Configuration conf;

  /**
   * Copies the configured URL meta tags from the CrawlDatum into the document.
   * For every tag listed in the "urlmeta.tags" property, the tag is looked up
   * in the metadata of <code>datum</code>; when a value is present it is added
   * to <code>doc</code> as a field named after the tag.
   *
   * @param doc
   *          the document being indexed; returned unchanged when null
   * @param parse
   *          not used by this filter
   * @param url
   *          not used by this filter
   * @param datum
   *          the CrawlDatum whose metadata is searched for the configured
   *          tags; when null, nothing is added to the document
   * @param inlinks
   *          not used by this filter
   * @return the same <code>doc</code> instance that was passed in
   * @throws IndexingException
   *           declared by the interface; not thrown directly by this method
   * @see IndexingFilter#filter
   */
  @Override
  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
    // Without a document to fill or a datum to read there is nothing to do.
    if (doc == null || datum == null) {
      return doc;
    }

    for (String metatag : urlMetaTags) {
      // Skip blank entries, e.g. those produced by a stray comma.
      if (metatag == null || metatag.isEmpty()) {
        continue;
      }
      Text metadata = (Text) datum.getMetaData().get(new Text(metatag));
      if (metadata != null) {
        doc.add(metatag, metadata.toString());
      }
    }
    return doc;
  }

  /**
   * Returns the configuration last supplied through {@link #setConf}.
   *
   * @return the current configuration, or null if none has been set
   */
  @Override
  public Configuration getConf() {
    return conf;
  }

  /**
   * Stores the configuration and reads the tag list from the "urlmeta.tags"
   * property. Entries are read with surrounding whitespace trimmed, so a
   * value written as <code>key1, key2, key3</code> yields the tags
   * <code>key1</code>, <code>key2</code> and <code>key3</code>.
   *
   * @param conf
   *          the configuration to use; when null the tag list is cleared and
   *          {@link #filter} leaves documents unchanged
   */
  @Override
  public void setConf(Configuration conf) {
    this.conf = conf;
    if (conf == null) {
      urlMetaTags = NO_TAGS;
      return;
    }
    String[] configured = conf.getTrimmedStrings(CONF_PROPERTY);
    urlMetaTags = (configured != null) ? configured : NO_TAGS;
  }
}