blob: fcd8cb5c842bb61162cf1518eaaf803ef905c720 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.indexer.replace;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.indexer.basic.BasicIndexingFilter;
import org.apache.nutch.indexer.metadata.MetadataIndexer;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.util.NutchConfiguration;
import org.junit.Assert;
import org.junit.Test;
/**
* JUnit tests for the <code>index-replace</code> plugin.
*
* In these tests, the sample file has some meta tags added to the Nutch
* document by the <code>index-metadata</code> plugin. The
* <code>index-replace</code> plugin is then used to either change (or not
* change) the fields depending on the various values of
* <code>index.replace.regexp</code> property being provided to Nutch.
*
*
* @author Peter Ciuffetti
*
*/
public class TestIndexReplace {

  /** Name of the property holding the replacement rules under test. */
  private static final String INDEX_REPLACE_PROPERTY = "index.replace.regexp";

  /**
   * Value of <code>plugin.includes</code> used by every test that runs the
   * sample file through the parser and the index filters.
   */
  private static final String PLUGIN_INCLUDES = "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)";

  /** Sample HTML document that every filter test parses. */
  private static final String SAMPLE_FILE = "testIndexReplace.html";

  // Field values expected when the replace filter leaves a field alone.
  private static final String UNCHANGED_DESCRIPTION = "With this plugin, I control the description! Bwuhuhuhaha!";
  private static final String UNCHANGED_KEYWORDS = "Breathtaking, Riveting, Two Thumbs Up!";
  private static final String UNCHANGED_AUTHOR = "Peter Ciuffetti";

  // Field values expected once the replacement rules used by these tests
  // have been applied.
  private static final String REPLACED_DESCRIPTION = "With this awesome plugin, I control the description! Bwuhuhuhaha!";
  private static final String REPLACED_KEYWORDS = "Breathtaking! Riveting! Two Thumbs Up!";
  private static final String REPLACED_AUTHOR = "Peter D. Ciuffetti";

  private final String fileSeparator = System.getProperty("file.separator");

  /** Directory of the sample file; falls back to the working directory. */
  private final String sampleDir = System.getProperty("test.data", ".");

  /**
   * Build the configuration shared by the tests that parse the sample file.
   *
   * @param indexReplaceProperty
   *          value to store under <code>index.replace.regexp</code>
   * @return a new Nutch configuration with the plugin list, the metatag
   *         settings and the given replacement rules applied
   */
  private static Configuration createConf(String indexReplaceProperty) {
    Configuration conf = NutchConfiguration.create();
    conf.set("plugin.includes", PLUGIN_INCLUDES);
    conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty);
    conf.set("metatags.names", "author,description,keywords");
    conf.set("index.parse.md",
        "metatag.author,metatag.description,metatag.keywords");
    // Not necessary but helpful when debugging the filter.
    conf.set("http.timeout", "99999999999");
    return conf;
  }

  /**
   * Assert the values of the three metatag fields of a filtered document.
   *
   * @param doc
   *          the document returned by {@link #parseAndFilterFile}
   * @param expectedDescription
   *          expected value of <code>metatag.description</code>
   * @param expectedKeywords
   *          expected value of <code>metatag.keywords</code>
   * @param expectedAuthor
   *          expected value of <code>metatag.author</code>
   */
  private static void assertMetatags(NutchDocument doc,
      String expectedDescription, String expectedKeywords,
      String expectedAuthor) {
    Assert.assertEquals(expectedDescription,
        doc.getFieldValue("metatag.description"));
    Assert.assertEquals(expectedKeywords,
        doc.getFieldValue("metatag.keywords"));
    Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author"));
  }

  /**
   * Run a test file through the Nutch parser and index filters.
   *
   * The file is fetched through the protocol plugin resolved for its
   * <code>file:</code> URL, parsed, and then passed through the basic,
   * metadata and replace indexing filters, in that order. Any exception
   * raised on the way fails the calling test.
   *
   * @param fileName
   *          name of the file, relative to the sample directory
   * @param conf
   *          configuration handed to the protocol, the parser and every
   *          indexing filter
   * @return the Nutch document with the replace indexer applied
   */
  public NutchDocument parseAndFilterFile(String fileName, Configuration conf) {
    NutchDocument doc = new NutchDocument();

    BasicIndexingFilter basicIndexer = new BasicIndexingFilter();
    basicIndexer.setConf(conf);
    Assert.assertNotNull(basicIndexer);

    MetadataIndexer metaIndexer = new MetadataIndexer();
    metaIndexer.setConf(conf);
    Assert.assertNotNull(metaIndexer);

    ReplaceIndexer replaceIndexer = new ReplaceIndexer();
    replaceIndexer.setConf(conf);
    Assert.assertNotNull(replaceIndexer);

    String urlString = "file:" + sampleDir + fileSeparator + fileName;
    try {
      Text text = new Text(urlString);
      CrawlDatum crawlDatum = new CrawlDatum();
      Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
      Content content = protocol.getProtocolOutput(text, crawlDatum)
          .getContent();
      Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
      crawlDatum.setFetchTime(100L);
      Inlinks inlinks = new Inlinks();

      doc = basicIndexer.filter(doc, parse, text, crawlDatum, inlinks);
      doc = metaIndexer.filter(doc, parse, text, crawlDatum, inlinks);
      doc = replaceIndexer.filter(doc, parse, text, crawlDatum, inlinks);
    } catch (Exception e) {
      // Fail the test but keep the cause, so the full stack trace shows up
      // in the test report instead of only on stderr.
      throw new AssertionError("Failed to parse and filter " + urlString
          + ": " + e, e);
    }
    return doc;
  }

  /**
   * Test property parsing.
   *
   * The filter does not expose details of the parse. So all we are checking is
   * that the parse does not throw a runtime exception and that the value
   * provided is the value returned.
   */
  @Test
  public void testPropertyParse() {
    Configuration conf = NutchConfiguration.create();
    String indexReplaceProperty = " metatag.description=/this(.*)plugin/this awesome plugin/2\n"
        + " metatag.keywords=/\\,/\\!/\n"
        + " hostmatch=.*.com\n"
        + " metatag.keywords=/\\,/\\?/\n"
        + " metatag.author:dc_author=/\\s+/ David /\n"
        + " urlmatch=.*.html\n"
        + " metatag.keywords=/\\,/\\./\n" + " metatag.author=/\\s+/ D. /\n";
    conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty);

    ReplaceIndexer rp = new ReplaceIndexer();
    try {
      rp.setConf(conf);
    } catch (RuntimeException ohno) {
      // Same failure message as before, with the cause attached.
      throw new AssertionError(
          "Unable to parse a valid index.replace.regexp property! "
              + ohno.getMessage(), ohno);
    }

    Configuration parsedConf = rp.getConf();
    // Does the getter equal the setter? Too easy!
    Assert.assertEquals(indexReplaceProperty,
        parsedConf.get(INDEX_REPLACE_PROPERTY));
  }

  /**
   * Test metatag value replacement using global replacement settings.
   *
   * The index.replace.regexp property does not use hostmatch or urlmatch, so
   * all patterns are global.
   */
  @Test
  public void testGlobalReplacement() {
    String indexReplaceProperty = " metatag.description=/this(.*)plugin/this awesome plugin/\n"
        + " metatag.keywords=/\\,/\\!/\n" + " metatag.author=/\\s+/ D. /\n";

    // Run the document through the parser and index filters.
    NutchDocument doc = parseAndFilterFile(SAMPLE_FILE,
        createConf(indexReplaceProperty));

    assertMetatags(doc, REPLACED_DESCRIPTION, REPLACED_KEYWORDS,
        REPLACED_AUTHOR);
  }

  /**
   * Test that invalid property settings are handled and ignored.
   *
   * This test provides an invalid property setting that will fail property
   * parsing and Pattern.compile. The expected outcome is that the patterns will
   * not cause failure and the targeted fields will not be modified by the
   * filter.
   */
  @Test
  public void testInvalidPatterns() {
    // Contains: invalid pattern, invalid flags, incomplete property
    String indexReplaceProperty = " metatag.description=/this\\s+**plugin/this awesome plugin/\n"
        + " metatag.keywords=/\\,/\\!/what\n" + " metatag.author=#notcomplete";

    // Run the document through the parser and index filters.
    NutchDocument doc = parseAndFilterFile(SAMPLE_FILE,
        createConf(indexReplaceProperty));

    // Assert that our metatags have not changed.
    assertMetatags(doc, UNCHANGED_DESCRIPTION, UNCHANGED_KEYWORDS,
        UNCHANGED_AUTHOR);
  }

  /**
   * Test URL pattern matching.
   *
   * The urlmatch pattern matches the sample file, so all three fields are
   * expected to change.
   */
  @Test
  public void testUrlMatchesPattern() {
    String indexReplaceProperty = " urlmatch=.*.html\n"
        + " metatag.description=/this(.*)plugin/this awesome plugin/\n"
        + " metatag.keywords=/\\,/\\!/\n" + " metatag.author=/\\s+/ D. /\n";

    // Run the document through the parser and index filters.
    NutchDocument doc = parseAndFilterFile(SAMPLE_FILE,
        createConf(indexReplaceProperty));

    // Assert that our metatags have changed.
    assertMetatags(doc, REPLACED_DESCRIPTION, REPLACED_KEYWORDS,
        REPLACED_AUTHOR);
  }

  /**
   * Test URL pattern not matching.
   *
   * Expected result is that the filter does not change the fields.
   */
  @Test
  public void testUrlNotMatchesPattern() {
    String indexReplaceProperty = " urlmatch=.*.xml\n"
        + " metatag.description=/this(.*)plugin/this awesome plugin/\n"
        + " metatag.keywords=/\\,/\\!/\n" + " metatag.author=/\\s+/ D. /\n";

    // Run the document through the parser and index filters.
    NutchDocument doc = parseAndFilterFile(SAMPLE_FILE,
        createConf(indexReplaceProperty));

    // Assert that our metatags have not changed.
    assertMetatags(doc, UNCHANGED_DESCRIPTION, UNCHANGED_KEYWORDS,
        UNCHANGED_AUTHOR);
  }

  /**
   * Test a global pattern match for description and URL pattern match for
   * keywords and author.
   *
   * All three should be triggered. It also tests replacement groups.
   */
  @Test
  public void testGlobalAndUrlMatchesPattern() {
    String indexReplaceProperty = " metatag.description=/this(.*)plugin/this$1awesome$1plugin/\n"
        + " urlmatch=.*.html\n"
        + " metatag.keywords=/\\,/\\!/\n"
        + " metatag.author=/\\s+/ D. /\n";

    // Run the document through the parser and index filters.
    NutchDocument doc = parseAndFilterFile(SAMPLE_FILE,
        createConf(indexReplaceProperty));

    // Assert that our metatags have changed.
    assertMetatags(doc, REPLACED_DESCRIPTION, REPLACED_KEYWORDS,
        REPLACED_AUTHOR);
  }

  /**
   * Test a global pattern match for description and URL pattern match for
   * keywords and author.
   *
   * Only the global match should be triggered.
   */
  @Test
  public void testGlobalAndUrlNotMatchesPattern() {
    String indexReplaceProperty = " metatag.description=/this(.*)plugin/this$1awesome$1plugin/\n"
        + " urlmatch=.*.xml\n"
        + " metatag.keywords=/\\,/\\!/\n"
        + " metatag.author=/\\s+/ D. /\n";

    // Run the document through the parser and index filters.
    NutchDocument doc = parseAndFilterFile(SAMPLE_FILE,
        createConf(indexReplaceProperty));

    // Assert that description has changed and the others have not changed.
    assertMetatags(doc, REPLACED_DESCRIPTION, UNCHANGED_KEYWORDS,
        UNCHANGED_AUTHOR);
  }

  /**
   * Test order-specific replacement settings.
   *
   * This makes multiple replacements on the same field and will produce the
   * expected value only if the replacements are run in the order specified.
   */
  @Test
  public void testReplacementsRunInSpecifedOrder() {
    String indexReplaceProperty = " metatag.description=/this plugin/this amazing plugin/\n"
        + " metatag.description=/this amazing plugin/this valuable plugin/\n"
        + " metatag.description=/this valuable plugin/this cool plugin/\n"
        + " metatag.description=/this cool plugin/this wicked plugin/\n"
        + " metatag.description=/this wicked plugin/this awesome plugin/\n";

    // Run the document through the parser and index filters.
    NutchDocument doc = parseAndFilterFile(SAMPLE_FILE,
        createConf(indexReplaceProperty));

    // Check that the value produced by the last replacement has worked.
    Assert.assertEquals(REPLACED_DESCRIPTION,
        doc.getFieldValue("metatag.description"));
  }

  /**
   * Test a replacement pattern that uses the flags feature.
   *
   * A 2 is Pattern.CASE_INSENSITIVE. We look for upper case and expect to match
   * any case.
   */
  @Test
  public void testReplacementsWithFlags() {
    String indexReplaceProperty = " metatag.description=/THIS PLUGIN/this awesome plugin/2";

    // Run the document through the parser and index filters.
    NutchDocument doc = parseAndFilterFile(SAMPLE_FILE,
        createConf(indexReplaceProperty));

    // Check that the value produced by the case-insensitive replacement has
    // worked.
    Assert.assertEquals(REPLACED_DESCRIPTION,
        doc.getFieldValue("metatag.description"));
  }

  /**
   * Test a replacement pattern that uses the target field feature.
   *
   * Check that the input field is not modified and that the target field is
   * added.
   */
  @Test
  public void testReplacementsDifferentTarget() {
    String indexReplaceProperty = " metatag.description:new=/this plugin/this awesome plugin/";

    // Run the document through the parser and index filters.
    NutchDocument doc = parseAndFilterFile(SAMPLE_FILE,
        createConf(indexReplaceProperty));

    // Check that the input field has not been modified.
    Assert.assertEquals(UNCHANGED_DESCRIPTION,
        doc.getFieldValue("metatag.description"));
    // Check that the output field has been created.
    Assert.assertEquals(REPLACED_DESCRIPTION, doc.getFieldValue("new"));
  }
}