src/java/org/apache/nutch/scoring/ScoringFilter.java - nutch - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.nutch.scoring;

 import java.util.Collection;
 import java.util.List;
 import java.util.Map.Entry;

 import org.apache.hadoop.conf.Configurable;
 import org.apache.hadoop.io.Text;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.Inlinks;
 import org.apache.nutch.indexer.NutchDocument;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseData;
 import org.apache.nutch.plugin.Pluggable;
 import org.apache.nutch.protocol.Content;

 /**
  * A contract defining behavior of scoring plugins.
  *
  * A scoring filter will manipulate scoring variables in CrawlDatum and in
  * resulting search indexes. Filters can be chained in a specific order, to
  * provide multi-stage scoring adjustments.
  *
  * @author Andrzej Bialecki
  */
 public interface ScoringFilter extends Configurable, Pluggable {
   /** The name of the extension point. */
   String X_POINT_ID = ScoringFilter.class.getName();

   /**
    * Sets an initial score for newly injected pages. Newly injected pages may
    * have no inlinks, so filter implementations may wish to set this score to a
    * non-zero value, to give newly injected pages some initial credit.
    *
    * @param url
    *          url of the page
    * @param datum
    *          new datum. Filters will modify it in-place.
    * @throws ScoringFilterException
    *           if the filter fails to set the injected score
    */
   void injectedScore(Text url, CrawlDatum datum)
       throws ScoringFilterException;

   /**
    * Sets an initial score for newly discovered pages. Newly discovered pages
    * have at least one inlink with its score contribution, so filter
    * implementations may choose to set the initial score to zero (unknown
    * value), and then the inlink score contribution will set the "real" value
    * of the new page.
    *
    * @param url
    *          url of the page
    * @param datum
    *          new datum. Filters will modify it in-place.
    * @throws ScoringFilterException
    *           if the filter fails to set the initial score
    */
   void initialScore(Text url, CrawlDatum datum)
       throws ScoringFilterException;

   /**
    * Prepares a sort value for the purpose of sorting and selecting the top N
    * scoring pages during fetchlist generation.
    *
    * @param url
    *          url of the page
    * @param datum
    *          page's datum, should not be modified
    * @param initSort
    *          initial sort value, or a value from previous filters in chain
    * @return the sort value for the page; when filters are chained, this may in
    *         turn be supplied as {@code initSort} to a subsequent filter
    * @throws ScoringFilterException
    *           if the filter fails to compute the sort value
    */
   float generatorSortValue(Text url, CrawlDatum datum, float initSort)
       throws ScoringFilterException;

   /**
    * Takes all relevant score information from the current datum (coming from
    * a generated fetchlist) and stores it into
    * {@link org.apache.nutch.protocol.Content} metadata. This is needed in order
    * to pass the value(s) to the mechanism that distributes them to outlinked
    * pages.
    *
    * @param url
    *          url of the page
    * @param datum
    *          source datum. NOTE: modifications to this value are not persisted.
    * @param content
    *          instance of content. Implementations may modify this in-place,
    *          primarily by setting some metadata properties.
    * @throws ScoringFilterException
    *           if the filter fails to pass the score information to the content
    */
   void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content)
       throws ScoringFilterException;

   /**
    * Copies score information from the original content to the parse result.
    * Currently a part of score distribution is performed using only data coming
    * from the parsing process; this method is needed to ensure the presence of
    * score data in those steps.
    *
    * @param url
    *          page url
    * @param content
    *          original content. NOTE: modifications to this value are not
    *          persisted.
    * @param parse
    *          target instance to copy the score information to. Implementations
    *          may modify this in-place, primarily by setting some metadata
    *          properties.
    * @throws ScoringFilterException
    *           if the filter fails to pass the score information to the parse
    */
   void passScoreAfterParsing(Text url, Content content, Parse parse)
       throws ScoringFilterException;

   /**
    * Distributes the score value from the current page to all its outlinked
    * pages.
    *
    * @param fromUrl
    *          url of the source page
    * @param parseData
    *          ParseData instance, which stores relevant score value(s) in its
    *          metadata. NOTE: filters may modify this in-place, all changes will
    *          be persisted.
    * @param targets
    *          {@code <url, CrawlDatum>} pairs. NOTE: filters can modify this
    *          in-place, all changes will be persisted.
    * @param adjust
    *          a CrawlDatum instance, initially null, which implementations may
    *          use to pass adjustment values to the original CrawlDatum. When
    *          creating this instance, set its status to
    *          {@link CrawlDatum#STATUS_LINKED}.
    * @param allCount
    *          number of all collected outlinks from the source page
    * @return if needed, implementations may return an instance of CrawlDatum,
    *         with status {@link CrawlDatum#STATUS_LINKED}, which contains
    *         adjustments to be applied to the original CrawlDatum score(s) and
    *         metadata. This can be null if not needed.
    * @throws ScoringFilterException
    *           if the filter fails to distribute the score
    */
   CrawlDatum distributeScoreToOutlinks(Text fromUrl, ParseData parseData,
       Collection<Entry<Text, CrawlDatum>> targets, CrawlDatum adjust,
       int allCount) throws ScoringFilterException;

   /**
    * Calculates a new score of CrawlDatum during CrawlDb update, based on the
    * initial value of the original CrawlDatum, and also score values
    * contributed by inlinked pages.
    *
    * @param url
    *          url of the page
    * @param old
    *          original datum, with original score. May be null if this is a
    *          newly discovered page. If not null, filters should use score
    *          values from this parameter as the starting values - the
    *          <code>datum</code> parameter may contain values that are no longer
    *          valid, if other updates occurred between generation and this
    *          update.
    * @param datum
    *          the new datum, with the original score saved at the time when
    *          fetchlist was generated. Filters should update this in-place, and
    *          it will be saved in the crawldb.
    * @param inlinked
    *          (partial) list of CrawlDatum-s (with their scores) from links
    *          pointing to this page, found in the current update batch.
    * @throws ScoringFilterException
    *           if the filter fails to calculate the updated score
    */
   void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
       List<CrawlDatum> inlinked) throws ScoringFilterException;

   /**
    * May change the score or status of CrawlDatum during CrawlDb update, when
    * the URL is neither fetched nor has any inlinks. The default implementation
    * does nothing.
    *
    * @param url
    *          URL of the page
    * @param datum
    *          CrawlDatum for page
    * @throws ScoringFilterException
    *           if an overriding filter fails to adjust the orphaned page
    */
   default void orphanedScore(Text url, CrawlDatum datum)
       throws ScoringFilterException {
     // Intentionally empty: filters that do not handle orphaned pages need not
     // override this method.
   }

   /**
    * Calculates an indexed document score/boost.
    *
    * @param url
    *          url of the page
    * @param doc
    *          indexed document. NOTE: this already contains all information
    *          collected by indexing filters. Implementations may modify this
    *          instance, in order to store/remove some information.
    * @param dbDatum
    *          current page from CrawlDb. NOTE:
    *          <ul>
    *          <li>changes made to this instance are not persisted</li>
    *          <li>may be null if indexing is done without CrawlDb or if the
    *          segment is generated not from the CrawlDb (via
    *          FreeGenerator).</li>
    *          </ul>
    * @param fetchDatum
    *          datum from FetcherOutput (containing among others the fetching
    *          status)
    * @param parse
    *          parsing result. NOTE: changes made to this instance are not
    *          persisted.
    * @param inlinks
    *          current inlinks from LinkDb. NOTE: changes made to this instance
    *          are not persisted.
    * @param initScore
    *          initial boost value for the indexed document.
    * @return boost value for the indexed document. This value is passed as an
    *         argument to the next scoring filter in chain. NOTE: implementations
    *         may also express other scoring strategies by modifying the indexed
    *         document directly.
    * @throws ScoringFilterException
    *           if the filter fails to calculate the document boost
    */
   float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
       CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
       throws ScoringFilterException;
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.nutch.scoring;

	import java.util.Collection;
	import java.util.List;
	import java.util.Map.Entry;

	import org.apache.hadoop.conf.Configurable;
	import org.apache.hadoop.io.Text;
	import org.apache.nutch.crawl.CrawlDatum;
	import org.apache.nutch.crawl.Inlinks;
	import org.apache.nutch.indexer.NutchDocument;
	import org.apache.nutch.parse.Parse;
	import org.apache.nutch.parse.ParseData;
	import org.apache.nutch.plugin.Pluggable;
	import org.apache.nutch.protocol.Content;

	/**
	* A contract defining behavior of scoring plugins.
	*
	* A scoring filter will manipulate scoring variables in CrawlDatum and in
	* resulting search indexes. Filters can be chained in a specific order, to
	* provide multi-stage scoring adjustments.
	*
	* @author Andrzej Bialecki
	*/
	public interface ScoringFilter extends Configurable, Pluggable {
	  /** The name of the extension point. */
	  String X_POINT_ID = ScoringFilter.class.getName();

	  /**
	   * Sets an initial score for newly injected pages. Newly injected pages may
	   * have no inlinks, so filter implementations may wish to set this score to a
	   * non-zero value, to give newly injected pages some initial credit.
	   *
	   * @param url
	   *          url of the page
	   * @param datum
	   *          new datum. Filters will modify it in-place.
	   * @throws ScoringFilterException
	   *           if the filter fails to set the injected score
	   */
	  void injectedScore(Text url, CrawlDatum datum)
	      throws ScoringFilterException;

	  /**
	   * Sets an initial score for newly discovered pages. Newly discovered pages
	   * have at least one inlink with its score contribution, so filter
	   * implementations may choose to set the initial score to zero (unknown
	   * value), and then the inlink score contribution will set the "real" value
	   * of the new page.
	   *
	   * @param url
	   *          url of the page
	   * @param datum
	   *          new datum. Filters will modify it in-place.
	   * @throws ScoringFilterException
	   *           if the filter fails to set the initial score
	   */
	  void initialScore(Text url, CrawlDatum datum)
	      throws ScoringFilterException;

	  /**
	   * Prepares a sort value for the purpose of sorting and selecting the top N
	   * scoring pages during fetchlist generation.
	   *
	   * @param url
	   *          url of the page
	   * @param datum
	   *          page's datum, should not be modified
	   * @param initSort
	   *          initial sort value, or a value from previous filters in chain
	   * @return the sort value for the page; when filters are chained, this may in
	   *         turn be supplied as {@code initSort} to a subsequent filter
	   * @throws ScoringFilterException
	   *           if the filter fails to compute the sort value
	   */
	  float generatorSortValue(Text url, CrawlDatum datum, float initSort)
	      throws ScoringFilterException;

	  /**
	   * Takes all relevant score information from the current datum (coming from
	   * a generated fetchlist) and stores it into
	   * {@link org.apache.nutch.protocol.Content} metadata. This is needed in order
	   * to pass the value(s) to the mechanism that distributes them to outlinked
	   * pages.
	   *
	   * @param url
	   *          url of the page
	   * @param datum
	   *          source datum. NOTE: modifications to this value are not persisted.
	   * @param content
	   *          instance of content. Implementations may modify this in-place,
	   *          primarily by setting some metadata properties.
	   * @throws ScoringFilterException
	   *           if the filter fails to pass the score information to the content
	   */
	  void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content)
	      throws ScoringFilterException;

	  /**
	   * Copies score information from the original content to the parse result.
	   * Currently a part of score distribution is performed using only data coming
	   * from the parsing process; this method is needed to ensure the presence of
	   * score data in those steps.
	   *
	   * @param url
	   *          page url
	   * @param content
	   *          original content. NOTE: modifications to this value are not
	   *          persisted.
	   * @param parse
	   *          target instance to copy the score information to. Implementations
	   *          may modify this in-place, primarily by setting some metadata
	   *          properties.
	   * @throws ScoringFilterException
	   *           if the filter fails to pass the score information to the parse
	   */
	  void passScoreAfterParsing(Text url, Content content, Parse parse)
	      throws ScoringFilterException;

	  /**
	   * Distributes the score value from the current page to all its outlinked
	   * pages.
	   *
	   * @param fromUrl
	   *          url of the source page
	   * @param parseData
	   *          ParseData instance, which stores relevant score value(s) in its
	   *          metadata. NOTE: filters may modify this in-place, all changes will
	   *          be persisted.
	   * @param targets
	   *          {@code <url, CrawlDatum>} pairs. NOTE: filters can modify this
	   *          in-place, all changes will be persisted.
	   * @param adjust
	   *          a CrawlDatum instance, initially null, which implementations may
	   *          use to pass adjustment values to the original CrawlDatum. When
	   *          creating this instance, set its status to
	   *          {@link CrawlDatum#STATUS_LINKED}.
	   * @param allCount
	   *          number of all collected outlinks from the source page
	   * @return if needed, implementations may return an instance of CrawlDatum,
	   *         with status {@link CrawlDatum#STATUS_LINKED}, which contains
	   *         adjustments to be applied to the original CrawlDatum score(s) and
	   *         metadata. This can be null if not needed.
	   * @throws ScoringFilterException
	   *           if the filter fails to distribute the score
	   */
	  CrawlDatum distributeScoreToOutlinks(Text fromUrl, ParseData parseData,
	      Collection<Entry<Text, CrawlDatum>> targets, CrawlDatum adjust,
	      int allCount) throws ScoringFilterException;

	  /**
	   * Calculates a new score of CrawlDatum during CrawlDb update, based on the
	   * initial value of the original CrawlDatum, and also score values
	   * contributed by inlinked pages.
	   *
	   * @param url
	   *          url of the page
	   * @param old
	   *          original datum, with original score. May be null if this is a
	   *          newly discovered page. If not null, filters should use score
	   *          values from this parameter as the starting values - the
	   *          <code>datum</code> parameter may contain values that are no longer
	   *          valid, if other updates occurred between generation and this
	   *          update.
	   * @param datum
	   *          the new datum, with the original score saved at the time when
	   *          fetchlist was generated. Filters should update this in-place, and
	   *          it will be saved in the crawldb.
	   * @param inlinked
	   *          (partial) list of CrawlDatum-s (with their scores) from links
	   *          pointing to this page, found in the current update batch.
	   * @throws ScoringFilterException
	   *           if the filter fails to calculate the updated score
	   */
	  void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
	      List<CrawlDatum> inlinked) throws ScoringFilterException;

	  /**
	   * May change the score or status of CrawlDatum during CrawlDb update, when
	   * the URL is neither fetched nor has any inlinks. The default implementation
	   * does nothing.
	   *
	   * @param url
	   *          URL of the page
	   * @param datum
	   *          CrawlDatum for page
	   * @throws ScoringFilterException
	   *           if an overriding filter fails to adjust the orphaned page
	   */
	  default void orphanedScore(Text url, CrawlDatum datum)
	      throws ScoringFilterException {
	    // Intentionally empty: filters that do not handle orphaned pages need not
	    // override this method.
	  }

	  /**
	   * Calculates an indexed document score/boost.
	   *
	   * @param url
	   *          url of the page
	   * @param doc
	   *          indexed document. NOTE: this already contains all information
	   *          collected by indexing filters. Implementations may modify this
	   *          instance, in order to store/remove some information.
	   * @param dbDatum
	   *          current page from CrawlDb. NOTE:
	   *          <ul>
	   *          <li>changes made to this instance are not persisted</li>
	   *          <li>may be null if indexing is done without CrawlDb or if the
	   *          segment is generated not from the CrawlDb (via
	   *          FreeGenerator).</li>
	   *          </ul>
	   * @param fetchDatum
	   *          datum from FetcherOutput (containing among others the fetching
	   *          status)
	   * @param parse
	   *          parsing result. NOTE: changes made to this instance are not
	   *          persisted.
	   * @param inlinks
	   *          current inlinks from LinkDb. NOTE: changes made to this instance
	   *          are not persisted.
	   * @param initScore
	   *          initial boost value for the indexed document.
	   * @return boost value for the indexed document. This value is passed as an
	   *         argument to the next scoring filter in chain. NOTE: implementations
	   *         may also express other scoring strategies by modifying the indexed
	   *         document directly.
	   * @throws ScoringFilterException
	   *           if the filter fails to calculate the document boost
	   */
	  float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
	      CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
	      throws ScoringFilterException;
	}