blob: 04ca46b3ff7a8d18feac3cb2df81d89d9d2322da [file] [log] [blame]
package org.apache.nutch.scoring.metadata;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.parse.*;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.util.NutchConfiguration;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import java.util.HashMap;
public class TestMetadataScoringFilter {
@Before
public void setUp() throws Exception {
}
@Test
public void distributeScoreToOutlinks() throws ScoringFilterException {
Configuration conf = NutchConfiguration.create();
conf.set(MetadataScoringFilter.METADATA_PARSED,"parent,depth");
MetadataScoringFilter metadataScoringFilter = new MetadataScoringFilter();
metadataScoringFilter.setConf(conf);
CrawlDatum crawlDatum = new CrawlDatum();
Text from = new Text("https://nutch.apache.org/");
ParseData parseData = new ParseData();
String PARENT = "parent";
String DEPTH = "depth";
String parentMD = "https://nutch.apache.org/";
String depthMD = "1";
parseData.getParseMeta().add("parent",parentMD);
parseData.getParseMeta().add("depth",depthMD);
HashMap<Text,CrawlDatum> targets = new HashMap();
targets.put(new Text("https://nutch.apache.org/downloads.html"),new CrawlDatum());
targets.put(new Text("https://wiki.apache.org/nutch"),new CrawlDatum());
metadataScoringFilter.distributeScoreToOutlinks(from,parseData,targets.entrySet(),crawlDatum,2);
for (CrawlDatum outlink : targets.values()){
Text parent = (Text) outlink.getMetaData().get(new Text(PARENT));
Text depth = (Text) outlink.getMetaData().get(new Text(DEPTH));
Assert.assertEquals(parentMD,parent.toString());
Assert.assertEquals(depthMD,depth.toString());
}
}
@Test
public void passScoreBeforeParsing() {
Configuration conf = NutchConfiguration.create();
conf.set(MetadataScoringFilter.METADATA_DATUM,"parent,depth");
MetadataScoringFilter metadataScoringFilter = new MetadataScoringFilter();
metadataScoringFilter.setConf(conf);
CrawlDatum crawlDatum = new CrawlDatum();
Text from = new Text("https://nutch.apache.org/");
String PARENT = "parent";
String DEPTH = "depth";
String parentMD = "https://nutch.apache.org/";
String depthMD = "1";
crawlDatum.getMetaData().put(new Text(PARENT), new Text(parentMD));
crawlDatum.getMetaData().put(new Text(DEPTH), new Text(depthMD));
Content content = new Content();
metadataScoringFilter.passScoreBeforeParsing(from,crawlDatum,content);
Assert.assertEquals(parentMD,content.getMetadata().get(PARENT));
Assert.assertEquals(depthMD,content.getMetadata().get(DEPTH));
}
@Test
public void passScoreAfterParsing() {
Configuration conf = NutchConfiguration.create();
conf.set(MetadataScoringFilter.METADATA_DATUM,"parent,depth");
conf.set(MetadataScoringFilter.METADATA_CONTENT,"parent,depth");
MetadataScoringFilter metadataScoringFilter = new MetadataScoringFilter();
metadataScoringFilter.setConf(conf);
CrawlDatum crawlDatum = new CrawlDatum();
Text from = new Text("https://nutch.apache.org/");
String PARENT = "parent";
String DEPTH = "depth";
String parentMD = "https://nutch.apache.org/";
String depthMD = "1";
crawlDatum.getMetaData().put(new Text(PARENT), new Text(parentMD));
crawlDatum.getMetaData().put(new Text(DEPTH), new Text(depthMD));
Content content = new Content();
metadataScoringFilter.passScoreBeforeParsing(from,crawlDatum,content);
ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, null, null, content.getMetadata());
Parse parse = new ParseImpl(from.toString(),parseData);
metadataScoringFilter.passScoreAfterParsing(from,content,parse);
Assert.assertEquals(parentMD,parse.getData().getMeta(PARENT));
Assert.assertEquals(depthMD,parse.getData().getMeta(DEPTH));
}
}