| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.nutch.indexer.replace; |
| |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.hadoop.io.Text; |
| import org.apache.nutch.crawl.CrawlDatum; |
| import org.apache.nutch.crawl.Inlinks; |
| import org.apache.nutch.indexer.NutchDocument; |
| import org.apache.nutch.indexer.basic.BasicIndexingFilter; |
| import org.apache.nutch.indexer.metadata.MetadataIndexer; |
| import org.apache.nutch.parse.Parse; |
| import org.apache.nutch.parse.ParseUtil; |
| import org.apache.nutch.protocol.Content; |
| import org.apache.nutch.protocol.Protocol; |
| import org.apache.nutch.protocol.ProtocolFactory; |
| import org.apache.nutch.util.NutchConfiguration; |
| import org.junit.Assert; |
| import org.junit.Test; |
| |
| /** |
| * JUnit tests for the <code>index-replace</code> plugin. |
| * |
| * In these tests, the sample file has some meta tags added to the Nutch |
| * document by the <code>index-metadata</code> plugin. The |
| * <code>index-replace</code> plugin is then used to either change (or not |
| * change) the fields depending on the various values of |
| * <code>index.replace.regexp</code> property being provided to Nutch. |
| * |
| * |
| * @author Peter Ciuffetti |
| * |
| */ |
| public class TestIndexReplace { |
| |
| private static final String INDEX_REPLACE_PROPERTY = "index.replace.regexp"; |
| |
| private String fileSeparator = System.getProperty("file.separator"); |
| private String sampleDir = System.getProperty("test.data", "."); |
| private String sampleFile = "testIndexReplace.html"; |
| |
| /** |
| * Run a test file through the Nutch parser and index filters. |
| * |
| * @param fileName |
| * @param conf |
| * @return the Nutch document with the replace indexer applied |
| */ |
| public NutchDocument parseAndFilterFile(String fileName, Configuration conf) { |
| NutchDocument doc = new NutchDocument(); |
| |
| BasicIndexingFilter basicIndexer = new BasicIndexingFilter(); |
| basicIndexer.setConf(conf); |
| Assert.assertNotNull(basicIndexer); |
| |
| MetadataIndexer metaIndexer = new MetadataIndexer(); |
| metaIndexer.setConf(conf); |
| Assert.assertNotNull(basicIndexer); |
| |
| ReplaceIndexer replaceIndexer = new ReplaceIndexer(); |
| replaceIndexer.setConf(conf); |
| Assert.assertNotNull(replaceIndexer); |
| |
| try { |
| String urlString = "file:" + sampleDir + fileSeparator + fileName; |
| Text text = new Text(urlString); |
| CrawlDatum crawlDatum = new CrawlDatum(); |
| Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString); |
| Content content = protocol.getProtocolOutput(text, crawlDatum) |
| .getContent(); |
| Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl()); |
| crawlDatum.setFetchTime(100L); |
| |
| Inlinks inlinks = new Inlinks(); |
| doc = basicIndexer.filter(doc, parse, text, crawlDatum, inlinks); |
| doc = metaIndexer.filter(doc, parse, text, crawlDatum, inlinks); |
| doc = replaceIndexer.filter(doc, parse, text, crawlDatum, inlinks); |
| } catch (Exception e) { |
| e.printStackTrace(); |
| Assert.fail(e.toString()); |
| } |
| |
| return doc; |
| } |
| |
| /** |
| * Test property parsing. |
| * |
| * The filter does not expose details of the parse. So all we are checking is |
| * that the parse does not throw a runtime exception and that the value |
| * provided is the value returned. |
| */ |
| @Test |
| public void testPropertyParse() { |
| Configuration conf = NutchConfiguration.create(); |
| String indexReplaceProperty = " metatag.description=/this(.*)plugin/this awesome plugin/2\n" |
| + " metatag.keywords=/\\,/\\!/\n" |
| + " hostmatch=.*.com\n" |
| + " metatag.keywords=/\\,/\\?/\n" |
| + " metatag.author:dc_author=/\\s+/ David /\n" |
| + " urlmatch=.*.html\n" |
| + " metatag.keywords=/\\,/\\./\n" + " metatag.author=/\\s+/ D. /\n"; |
| |
| conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty); |
| |
| ReplaceIndexer rp = new ReplaceIndexer(); |
| try { |
| rp.setConf(conf); |
| } catch (RuntimeException ohno) { |
| Assert.fail("Unable to parse a valid index.replace.regexp property! " |
| + ohno.getMessage()); |
| } |
| |
| Configuration parsedConf = rp.getConf(); |
| |
| // Does the getter equal the setter? Too easy! |
| Assert.assertEquals(indexReplaceProperty, |
| parsedConf.get(INDEX_REPLACE_PROPERTY)); |
| } |
| |
| /** |
| * Test metatag value replacement using global replacement settings. |
| * |
| * The index.replace.regexp property does not use hostmatch or urlmatch, so |
| * all patterns are global. |
| */ |
| @Test |
| public void testGlobalReplacement() { |
| String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!"; |
| String expectedKeywords = "Breathtaking! Riveting! Two Thumbs Up!"; |
| String expectedAuthor = "Peter D. Ciuffetti"; |
| String indexReplaceProperty = " metatag.description=/this(.*)plugin/this awesome plugin/\n" |
| + " metatag.keywords=/\\,/\\!/\n" + " metatag.author=/\\s+/ D. /\n"; |
| |
| Configuration conf = NutchConfiguration.create(); |
| conf.set( |
| "plugin.includes", |
| "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)"); |
| conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty); |
| conf.set("metatags.names", "author,description,keywords"); |
| conf.set("index.parse.md", |
| "metatag.author,metatag.description,metatag.keywords"); |
| // Not necessary but helpful when debugging the filter. |
| conf.set("http.timeout", "99999999999"); |
| |
| // Run the document through the parser and index filters. |
| NutchDocument doc = parseAndFilterFile(sampleFile, conf); |
| |
| Assert.assertEquals(expectedDescription, |
| doc.getFieldValue("metatag.description")); |
| Assert |
| .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords")); |
| Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author")); |
| } |
| |
| /** |
| * Test that invalid property settings are handled and ignored. |
| * |
| * This test provides an invalid property setting that will fail property |
| * parsing and Pattern.compile. The expected outcome is that the patterns will |
| * not cause failure and the targeted fields will not be modified by the |
| * filter. |
| */ |
| @Test |
| public void testInvalidPatterns() { |
| String expectedDescription = "With this plugin, I control the description! Bwuhuhuhaha!"; |
| String expectedKeywords = "Breathtaking, Riveting, Two Thumbs Up!"; |
| String expectedAuthor = "Peter Ciuffetti"; |
| // Contains: invalid pattern, invalid flags, incomplete property |
| String indexReplaceProperty = " metatag.description=/this\\s+**plugin/this awesome plugin/\n" |
| + " metatag.keywords=/\\,/\\!/what\n" + " metatag.author=#notcomplete"; |
| |
| Configuration conf = NutchConfiguration.create(); |
| conf.set( |
| "plugin.includes", |
| "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)"); |
| conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty); |
| conf.set("metatags.names", "author,description,keywords"); |
| conf.set("index.parse.md", |
| "metatag.author,metatag.description,metatag.keywords"); |
| // Not necessary but helpful when debugging the filter. |
| conf.set("http.timeout", "99999999999"); |
| |
| // Run the document through the parser and index filters. |
| NutchDocument doc = parseAndFilterFile(sampleFile, conf); |
| |
| // Assert that our metatags have not changed. |
| Assert.assertEquals(expectedDescription, |
| doc.getFieldValue("metatag.description")); |
| Assert |
| .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords")); |
| Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author")); |
| |
| } |
| |
| /** |
| * Test URL pattern matching |
| */ |
| @Test |
| public void testUrlMatchesPattern() { |
| String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!"; |
| String expectedKeywords = "Breathtaking! Riveting! Two Thumbs Up!"; |
| String expectedAuthor = "Peter D. Ciuffetti"; |
| String indexReplaceProperty = " urlmatch=.*.html\n" |
| + " metatag.description=/this(.*)plugin/this awesome plugin/\n" |
| + " metatag.keywords=/\\,/\\!/\n" + " metatag.author=/\\s+/ D. /\n"; |
| |
| Configuration conf = NutchConfiguration.create(); |
| conf.set( |
| "plugin.includes", |
| "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)"); |
| conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty); |
| conf.set("metatags.names", "author,description,keywords"); |
| conf.set("index.parse.md", |
| "metatag.author,metatag.description,metatag.keywords"); |
| // Not necessary but helpful when debugging the filter. |
| conf.set("http.timeout", "99999999999"); |
| |
| // Run the document through the parser and index filters. |
| NutchDocument doc = parseAndFilterFile(sampleFile, conf); |
| |
| // Assert that our metatags have changed. |
| Assert.assertEquals(expectedDescription, |
| doc.getFieldValue("metatag.description")); |
| Assert |
| .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords")); |
| Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author")); |
| |
| } |
| |
| /** |
| * Test URL pattern not matching. |
| * |
| * Expected result is that the filter does not change the fields. |
| */ |
| @Test |
| public void testUrlNotMatchesPattern() { |
| String expectedDescription = "With this plugin, I control the description! Bwuhuhuhaha!"; |
| String expectedKeywords = "Breathtaking, Riveting, Two Thumbs Up!"; |
| String expectedAuthor = "Peter Ciuffetti"; |
| String indexReplaceProperty = " urlmatch=.*.xml\n" |
| + " metatag.description=/this(.*)plugin/this awesome plugin/\n" |
| + " metatag.keywords=/\\,/\\!/\n" + " metatag.author=/\\s+/ D. /\n"; |
| |
| Configuration conf = NutchConfiguration.create(); |
| conf.set( |
| "plugin.includes", |
| "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)"); |
| conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty); |
| conf.set("metatags.names", "author,description,keywords"); |
| conf.set("index.parse.md", |
| "metatag.author,metatag.description,metatag.keywords"); |
| // Not necessary but helpful when debugging the filter. |
| conf.set("http.timeout", "99999999999"); |
| |
| // Run the document through the parser and index filters. |
| NutchDocument doc = parseAndFilterFile(sampleFile, conf); |
| |
| // Assert that our metatags have not changed. |
| Assert.assertEquals(expectedDescription, |
| doc.getFieldValue("metatag.description")); |
| Assert |
| .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords")); |
| Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author")); |
| |
| } |
| |
| /** |
| * Test a global pattern match for description and URL pattern match for |
| * keywords and author. |
| * |
| * All three should be triggered. It also tests replacement groups. |
| */ |
| @Test |
| public void testGlobalAndUrlMatchesPattern() { |
| String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!"; |
| String expectedKeywords = "Breathtaking! Riveting! Two Thumbs Up!"; |
| String expectedAuthor = "Peter D. Ciuffetti"; |
| String indexReplaceProperty = " metatag.description=/this(.*)plugin/this$1awesome$1plugin/\n" |
| + " urlmatch=.*.html\n" |
| + " metatag.keywords=/\\,/\\!/\n" |
| + " metatag.author=/\\s+/ D. /\n"; |
| |
| Configuration conf = NutchConfiguration.create(); |
| conf.set( |
| "plugin.includes", |
| "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)"); |
| conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty); |
| conf.set("metatags.names", "author,description,keywords"); |
| conf.set("index.parse.md", |
| "metatag.author,metatag.description,metatag.keywords"); |
| // Not necessary but helpful when debugging the filter. |
| conf.set("http.timeout", "99999999999"); |
| |
| // Run the document through the parser and index filters. |
| NutchDocument doc = parseAndFilterFile(sampleFile, conf); |
| |
| // Assert that our metatags have changed. |
| Assert.assertEquals(expectedDescription, |
| doc.getFieldValue("metatag.description")); |
| Assert |
| .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords")); |
| Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author")); |
| |
| } |
| |
| /** |
| * Test a global pattern match for description and URL pattern match for |
| * keywords and author. |
| * |
| * Only the global match should be triggered. |
| */ |
| @Test |
| public void testGlobalAndUrlNotMatchesPattern() { |
| String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!"; |
| String expectedKeywords = "Breathtaking, Riveting, Two Thumbs Up!"; |
| String expectedAuthor = "Peter Ciuffetti"; |
| String indexReplaceProperty = " metatag.description=/this(.*)plugin/this$1awesome$1plugin/\n" |
| + " urlmatch=.*.xml\n" |
| + " metatag.keywords=/\\,/\\!/\n" |
| + " metatag.author=/\\s+/ D. /\n"; |
| |
| Configuration conf = NutchConfiguration.create(); |
| conf.set( |
| "plugin.includes", |
| "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)"); |
| conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty); |
| conf.set("metatags.names", "author,description,keywords"); |
| conf.set("index.parse.md", |
| "metatag.author,metatag.description,metatag.keywords"); |
| // Not necessary but helpful when debugging the filter. |
| conf.set("http.timeout", "99999999999"); |
| |
| // Run the document through the parser and index filters. |
| NutchDocument doc = parseAndFilterFile(sampleFile, conf); |
| |
| // Assert that description has changed and the others have not changed. |
| Assert.assertEquals(expectedDescription, |
| doc.getFieldValue("metatag.description")); |
| Assert |
| .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords")); |
| Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author")); |
| } |
| |
| /** |
| * Test order-specific replacement settings. |
| * |
| * This makes multiple replacements on the same field and will produce the |
| * expected value only if the replacements are run in the order specified. |
| */ |
| @Test |
| public void testReplacementsRunInSpecifedOrder() { |
| String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!"; |
| String indexReplaceProperty = " metatag.description=/this plugin/this amazing plugin/\n" |
| + " metatag.description=/this amazing plugin/this valuable plugin/\n" |
| + " metatag.description=/this valuable plugin/this cool plugin/\n" |
| + " metatag.description=/this cool plugin/this wicked plugin/\n" |
| + " metatag.description=/this wicked plugin/this awesome plugin/\n"; |
| |
| Configuration conf = NutchConfiguration.create(); |
| conf.set( |
| "plugin.includes", |
| "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)"); |
| conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty); |
| conf.set("metatags.names", "author,description,keywords"); |
| conf.set("index.parse.md", |
| "metatag.author,metatag.description,metatag.keywords"); |
| // Not necessary but helpful when debugging the filter. |
| conf.set("http.timeout", "99999999999"); |
| |
| // Run the document through the parser and index filters. |
| NutchDocument doc = parseAndFilterFile(sampleFile, conf); |
| |
| // Check that the value produced by the last replacement has worked. |
| Assert.assertEquals(expectedDescription, |
| doc.getFieldValue("metatag.description")); |
| } |
| |
| /** |
| * Test a replacement pattern that uses the flags feature. |
| * |
| * A 2 is Pattern.CASE_INSENSITIVE. We look for upper case and expect to match |
| * any case. |
| */ |
| @Test |
| public void testReplacementsWithFlags() { |
| String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!"; |
| String indexReplaceProperty = " metatag.description=/THIS PLUGIN/this awesome plugin/2"; |
| |
| Configuration conf = NutchConfiguration.create(); |
| conf.set( |
| "plugin.includes", |
| "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)"); |
| conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty); |
| conf.set("metatags.names", "author,description,keywords"); |
| conf.set("index.parse.md", |
| "metatag.author,metatag.description,metatag.keywords"); |
| // Not necessary but helpful when debugging the filter. |
| conf.set("http.timeout", "99999999999"); |
| |
| // Run the document through the parser and index filters. |
| NutchDocument doc = parseAndFilterFile(sampleFile, conf); |
| |
| // Check that the value produced by the case-insensitive replacement has |
| // worked. |
| Assert.assertEquals(expectedDescription, |
| doc.getFieldValue("metatag.description")); |
| } |
| |
| /** |
| * Test a replacement pattern that uses the target field feature. |
| * Check that the input is not modifid and that the taret field is added. |
| */ |
| @Test |
| public void testReplacementsDifferentTarget() { |
| String expectedDescription = "With this plugin, I control the description! Bwuhuhuhaha!"; |
| String expectedTargetDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!"; |
| String indexReplaceProperty = " metatag.description:new=/this plugin/this awesome plugin/"; |
| |
| Configuration conf = NutchConfiguration.create(); |
| conf.set( |
| "plugin.includes", |
| "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)"); |
| conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty); |
| conf.set("metatags.names", "author,description,keywords"); |
| conf.set("index.parse.md", |
| "metatag.author,metatag.description,metatag.keywords"); |
| // Not necessary but helpful when debugging the filter. |
| conf.set("http.timeout", "99999999999"); |
| |
| // Run the document through the parser and index filters. |
| NutchDocument doc = parseAndFilterFile(sampleFile, conf); |
| |
| // Check that the input field has not been modified |
| Assert.assertEquals(expectedDescription, |
| doc.getFieldValue("metatag.description")); |
| // Check that the output field has created |
| Assert.assertEquals(expectedTargetDescription, |
| doc.getFieldValue("new")); |
| } |
| } |