/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.scoring.depth;
import java.lang.invoke.MethodHandles;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.scoring.ScoringFilter;
import org.apache.nutch.scoring.ScoringFilterException;

/**
 * Scoring filter which limits the number of hops a crawl may take from the
 * initial seed URLs. Once a URL's hop count reaches its depth limit (either
 * the default configured via <code>scoring.depth.max</code>, or a per-URL
 * value injected with the seed metadata key <code>_maxdepth_</code>), all
 * outlinks of that URL are discarded, effectively stopping further crawling
 * along this path.
 */
public class DepthScoringFilter extends Configured implements ScoringFilter {
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
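  /** Metadata key holding the current depth (number of hops from a seed). */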
public static final String DEPTH_KEY = "_depth_";
public static final Text DEPTH_KEY_W = new Text(DEPTH_KEY);
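  /** Metadata key holding an optional per-URL depth limit. */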
public static final String MAX_DEPTH_KEY = "_maxdepth_";
public static final Text MAX_DEPTH_KEY_W = new Text(MAX_DEPTH_KEY);
  // a maximum value we are unlikely to reach in practice, since the Web
  // graph is that deep only within spam cliques
public static final int DEFAULT_MAX_DEPTH = 1000;
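  /** Depth limit applied to URLs which carry no custom _maxdepth_ value. */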
private int defaultMaxDepth;
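
  /**
   * Reads the default depth limit from the property
   * <code>scoring.depth.max</code>, falling back to
   * {@link #DEFAULT_MAX_DEPTH} when the property is absent or not positive.
   */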
@Override
public void setConf(Configuration conf) {
super.setConf(conf);
if (conf == null)
return;
defaultMaxDepth = conf.getInt("scoring.depth.max", DEFAULT_MAX_DEPTH);
if (defaultMaxDepth <= 0) {
defaultMaxDepth = DEFAULT_MAX_DEPTH;
}
}
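
  /**
   * Propagates depth to outlinks: every target is assigned the parent's
   * depth plus one and inherits the parent's custom depth limit, if any.
   * When the parent has already reached its depth limit, or carries no depth
   * record at all, all outlinks are dropped so the path is not followed any
   * further.
   */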
@Override
public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
CrawlDatum adjust, int allCount) throws ScoringFilterException {
if (targets.isEmpty()) {
return adjust;
}
String depthString = parseData.getMeta(DEPTH_KEY);
if (depthString == null) {
      LOG.warn("Missing depth, removing all outlinks from url {}", fromUrl);
targets.clear();
return adjust;
}
int curDepth = Integer.parseInt(depthString);
int curMaxDepth = defaultMaxDepth;
IntWritable customMaxDepth = null;
// allow overrides from injector
String maxDepthString = parseData.getMeta(MAX_DEPTH_KEY);
if (maxDepthString != null) {
curMaxDepth = Integer.parseInt(maxDepthString);
customMaxDepth = new IntWritable(curMaxDepth);
}
if (curDepth >= curMaxDepth) {
// depth exceeded - throw away
      LOG.info("Depth limit ({}) reached, ignoring outlinks for {}",
          curMaxDepth, fromUrl);
targets.clear();
return adjust;
}
Iterator<Entry<Text, CrawlDatum>> it = targets.iterator();
while (it.hasNext()) {
Entry<Text, CrawlDatum> e = it.next();
// record increased depth
e.getValue().getMetaData()
.put(DEPTH_KEY_W, new IntWritable(curDepth + 1));
// record maxDepth if any
if (customMaxDepth != null) {
e.getValue().getMetaData().put(MAX_DEPTH_KEY_W, customMaxDepth);
}
}
return adjust;
}

  /**
   * Prioritizes URLs with smaller depth: the generator sort value is
   * multiplied by <code>1 + (maxDepth - depth)</code>, so pages close to the
   * seeds are boosted, while pages at the depth limit, or without a recorded
   * depth, keep the unmodified sort value.
   */
@Override
public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
throws ScoringFilterException {
    // boost by the remaining depth budget (curMaxDepth - curDepth)
int curDepth, curMaxDepth;
IntWritable maxDepth = (IntWritable) datum.getMetaData().get(
MAX_DEPTH_KEY_W);
if (maxDepth != null) {
curMaxDepth = maxDepth.get();
} else {
curMaxDepth = defaultMaxDepth;
}
IntWritable depth = (IntWritable) datum.getMetaData().get(DEPTH_KEY_W);
if (depth == null) {
      // no recorded depth - penalize by treating the page as at the limit
curDepth = curMaxDepth;
} else {
curDepth = depth.get();
}
int mul = curMaxDepth - curDepth;
return initSort * (1 + mul);
}
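
  /** This filter does not modify the indexing score. */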
@Override
public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
throws ScoringFilterException {
return initScore;
}
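
  /**
   * Sets the initial depth metadata for a URL newly added to the CrawlDb:
   * depth limit from the configured default and depth 1. Values already
   * present (e.g. carried over from a redirect) are left untouched.
   */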
@Override
public void initialScore(Text url, CrawlDatum datum)
throws ScoringFilterException {
// the datum might already have some values set
// e.g. obtained from redirection
// in which case we don't want to override them
if (datum.getMetaData().get(MAX_DEPTH_KEY_W) == null)
datum.getMetaData()
.put(MAX_DEPTH_KEY_W, new IntWritable(defaultMaxDepth));
// initial depth is 1
if (datum.getMetaData().get(DEPTH_KEY_W) == null)
datum.getMetaData().put(DEPTH_KEY_W, new IntWritable(1));
}
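
  /**
   * Sets the depth metadata for an injected seed URL: the depth starts at 1,
   * and the depth limit is taken from the <code>_maxdepth_</code> seed
   * metadata if present (written in the seed file as a tab-separated
   * <code>_maxdepth_=N</code> entry after the URL), otherwise from the
   * configured default.
   */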
@Override
public void injectedScore(Text url, CrawlDatum datum)
throws ScoringFilterException {
    // check for the presence of the depth limit key
    if (datum.getMetaData().get(MAX_DEPTH_KEY_W) != null) {
      // convert the injected value from Text to IntWritable
      String depthString = datum.getMetaData().get(MAX_DEPTH_KEY_W).toString();
      int depth;
      try {
        depth = Integer.parseInt(depthString.trim());
      } catch (NumberFormatException e) {
        // malformed seed metadata - fall back to the configured default
        LOG.warn("Invalid {} value '{}' for {}, using default {}",
            MAX_DEPTH_KEY, depthString, url, defaultMaxDepth);
        depth = defaultMaxDepth;
      }
      datum.getMetaData().put(MAX_DEPTH_KEY_W, new IntWritable(depth));
} else { // put the default
datum.getMetaData()
.put(MAX_DEPTH_KEY_W, new IntWritable(defaultMaxDepth));
}
// initial depth is 1
datum.getMetaData().put(DEPTH_KEY_W, new IntWritable(1));
}
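
  /**
   * Copies the depth and depth limit from the content metadata into the
   * parse metadata, where {@link #distributeScoreToOutlinks} picks them up.
   */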
@Override
public void passScoreAfterParsing(Text url, Content content, Parse parse)
throws ScoringFilterException {
String depth = content.getMetadata().get(DEPTH_KEY);
if (depth != null) {
parse.getData().getParseMeta().set(DEPTH_KEY, depth);
}
String maxdepth = content.getMetadata().get(MAX_DEPTH_KEY);
if (maxdepth != null) {
parse.getData().getParseMeta().set(MAX_DEPTH_KEY, maxdepth);
}
}
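
  /**
   * Copies the depth and depth limit from the CrawlDatum metadata into the
   * content metadata, so that they are available while parsing.
   */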
@Override
public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content)
throws ScoringFilterException {
IntWritable depth = (IntWritable) datum.getMetaData().get(DEPTH_KEY_W);
if (depth != null) {
content.getMetadata().set(DEPTH_KEY, depth.toString());
}
IntWritable maxdepth = (IntWritable) datum.getMetaData().get(
MAX_DEPTH_KEY_W);
if (maxdepth != null) {
content.getMetadata().set(MAX_DEPTH_KEY, maxdepth.toString());
}
}
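
  /**
   * On CrawlDb update, stores the minimum depth seen for this URL: the
   * smaller of the previously recorded depth and the depths carried by the
   * inlinks. A page reachable over several paths is thus scored by its
   * shortest path from the seeds.
   */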
@Override
public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
List<CrawlDatum> inlinked) throws ScoringFilterException {
// find a minimum of all depths
int newDepth = DEFAULT_MAX_DEPTH;
if (old != null) {
IntWritable oldDepth = (IntWritable) old.getMetaData().get(DEPTH_KEY_W);
if (oldDepth != null) {
newDepth = oldDepth.get();
} else {
        // depth not recorded yet - initialize the old datum's metadata
initialScore(url, old);
}
}
for (CrawlDatum lnk : inlinked) {
IntWritable depth = (IntWritable) lnk.getMetaData().get(DEPTH_KEY_W);
if (depth != null && depth.get() < newDepth) {
newDepth = depth.get();
}
}
datum.getMetaData().put(DEPTH_KEY_W, new IntWritable(newDepth));
}
}