blob: 54e2fe5beb61c854c085b61dfe7338ab533e2ee4 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.scoring.opic;
import java.lang.invoke.MethodHandles;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Collection;
import java.util.List;
import java.util.Map.Entry;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.scoring.ScoringFilter;
import org.apache.nutch.scoring.ScoringFilterException;
/**
* This plugin implements a variant of an Online Page Importance Computation
* (OPIC) score, described in this <a href="https://www2003.org/cdrom/papers/refereed/p007/p7-abiteboul.html">paper</a>:
* Abiteboul, Serge and Preda, Mihai and Cobena, Gregory (2003), Adaptive
* On-Line Page Importance Computation.
*
* @author Andrzej Bialecki
*/
public class OPICScoringFilter implements ScoringFilter {

  private static final Logger LOG = LoggerFactory
      .getLogger(MethodHandles.lookup().lookupClass());

  private Configuration conf;
  /** Score given to injected pages ({@code db.score.injected}). */
  private float scoreInjected;
  /** Exponent used to dampen the score at indexing time ({@code indexer.score.power}). */
  private float scorePower;
  /** Multiplier for score contributions to same-host outlinks ({@code db.score.link.internal}). */
  private float internalScoreFactor;
  /** Multiplier for score contributions to cross-host outlinks ({@code db.score.link.external}). */
  private float externalScoreFactor;
  /** Whether filtered outlinks still count in the divisor ({@code db.score.count.filtered}). */
  private boolean countFiltered;

  @Override
  public Configuration getConf() {
    return conf;
  }

  @Override
  public void setConf(Configuration conf) {
    this.conf = conf;
    // FIX: scoreInjected was declared and used as the fallback score in
    // distributeScoreToOutlinks but never assigned, so it silently stayed
    // 0.0f and the db.score.injected property was ignored.
    scoreInjected = conf.getFloat("db.score.injected", 1.0f);
    scorePower = conf.getFloat("indexer.score.power", 0.5f);
    internalScoreFactor = conf.getFloat("db.score.link.internal", 1.0f);
    externalScoreFactor = conf.getFloat("db.score.link.external", 1.0f);
    countFiltered = conf.getBoolean("db.score.count.filtered", false);
  }

  /** No-op: injected pages keep the score the injector assigned. */
  @Override
  public void injectedScore(Text url, CrawlDatum datum)
      throws ScoringFilterException {
  }

  /**
   * Set to 0.0f (unknown value) - inlink contributions will bring it to a
   * correct level. Newly discovered pages have at least one inlink.
   */
  @Override
  public void initialScore(Text url, CrawlDatum datum)
      throws ScoringFilterException {
    datum.setScore(0.0f);
  }

  /** Use {@link CrawlDatum#getScore()}, scaled by the generator's initSort. */
  @Override
  public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
      throws ScoringFilterException {
    return datum.getScore() * initSort;
  }

  /**
   * Increase the score by a sum of inlinked scores.
   *
   * @param old
   *          previous db entry, or null for newly discovered pages (the new
   *          datum's own score is then used as the base)
   */
  @Override
  public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
      List<CrawlDatum> inlinked) throws ScoringFilterException {
    float adjust = 0.0f;
    for (CrawlDatum linked : inlinked) {
      adjust += linked.getScore();
    }
    if (old == null)
      old = datum;
    datum.setScore(old.getScore() + adjust);
  }

  /** Store a float value of CrawlDatum.getScore() under Fetcher.SCORE_KEY. */
  @Override
  public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) {
    content.getMetadata().set(Nutch.SCORE_KEY, "" + datum.getScore());
  }

  /** Copy the value from Content metadata under Fetcher.SCORE_KEY to parseData. */
  @Override
  public void passScoreAfterParsing(Text url, Content content, Parse parse) {
    parse.getData().getContentMeta()
        .set(Nutch.SCORE_KEY, content.getMetadata().get(Nutch.SCORE_KEY));
  }

  /**
   * Get a float value from Fetcher.SCORE_KEY, divide it by the number of
   * outlinks and apply. Same-host targets receive the score scaled by
   * {@code internalScoreFactor}, cross-host targets by
   * {@code externalScoreFactor}.
   *
   * @return the (unmodified) adjust datum
   */
  @Override
  public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
      ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
      CrawlDatum adjust, int allCount) throws ScoringFilterException {
    float score = scoreInjected;
    String scoreString = parseData.getContentMeta().get(Nutch.SCORE_KEY);
    if (scoreString != null) {
      try {
        score = Float.parseFloat(scoreString);
      } catch (NumberFormatException e) {
        // Malformed metadata value: keep the injected-score fallback.
        LOG.error("Error: ", e);
      }
    }
    int validCount = targets.size();
    if (countFiltered) {
      // Divide by the total outlink count, including filtered-out links.
      score /= allCount;
    } else {
      if (validCount == 0) {
        // no outlinks to distribute score, so just return adjust
        return adjust;
      }
      score /= validCount;
    }
    // internal and external score factor
    float internalScore = score * internalScoreFactor;
    float externalScore = score * externalScoreFactor;
    // The source host is loop-invariant: resolve it once, not per target.
    String fromHost;
    try {
      fromHost = new URL(fromUrl.toString()).getHost();
    } catch (MalformedURLException e) {
      LOG.error("Error: ", e);
      fromHost = null;
    }
    for (Entry<Text, CrawlDatum> target : targets) {
      // Default to the external factor; unparseable URLs are treated as
      // external, matching the original behavior.
      float targetScore = externalScore;
      if (fromHost != null) {
        try {
          String toHost = new URL(target.getKey().toString()).getHost();
          if (toHost.equalsIgnoreCase(fromHost)) {
            targetScore = internalScore;
          }
        } catch (MalformedURLException e) {
          LOG.error("Error: ", e);
        }
      }
      target.getValue().setScore(targetScore);
    }
    // XXX (ab) no adjustment? I think this is contrary to the algorithm descr.
    // XXX in the paper, where page "loses" its score if it's distributed to
    // XXX linked pages...
    return adjust;
  }

  /**
   * Dampen the boost value by scorePower: initScore * score^scorePower.
   *
   * @return initScore unchanged when no db entry is available
   */
  @Override
  public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
      CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
      throws ScoringFilterException {
    if (dbDatum == null) {
      return initScore;
    }
    return (float) Math.pow(dbDatum.getScore(), scorePower) * initScore;
  }
}