src/java/org/apache/nutch/crawl/TextProfileSignature.java - nutch - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.nutch.crawl;

 import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.InputStreamReader;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashMap;
 import java.util.Iterator;

 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.MD5Hash;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseImpl;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.util.StringUtil;
 import org.apache.nutch.util.NutchConfiguration;

 /**
  * <p>
  * An implementation of a page signature. It calculates an MD5 hash of a plain
  * text "profile" of a page. In case there is no text, it calculates a hash
  * using the {@link MD5Signature}.
  * </p>
  * <p>
  * The algorithm to calculate a page "profile" takes the plain text version of a
  * page and performs the following steps:
  * <ul>
  * <li>remove all characters except letters and digits, and bring all characters
  * to lower case,</li>
  * <li>split the text into tokens (all consecutive non-whitespace characters),</li>
  * <li>discard tokens equal or shorter than MIN_TOKEN_LEN (default 2
  * characters),</li>
  * <li>sort the list of tokens by decreasing frequency,</li>
  * <li>round down the counts of tokens to the nearest multiple of QUANT (
  * <code>QUANT = QUANT_RATE * maxFreq</code>, where <code>QUANT_RATE</code> is
  * 0.01f by default, and <code>maxFreq</code> is the maximum token frequency).
  * If <code>maxFreq</code> is higher than 1, then QUANT is always higher than 2
  * (which means that tokens with frequency 1 are always discarded).</li>
  * <li>tokens, which frequency after quantization falls below QUANT, are
  * discarded.</li>
  * <li>create a list of tokens and their quantized frequency, separated by
  * spaces, in the order of decreasing frequency.</li>
  * </ul>
  * This list is then submitted to an MD5 hash calculation.
  *
  * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
  */
 public class TextProfileSignature extends Signature {

   Signature fallback = new MD5Signature();

   int MIN_TOKEN_LEN = 2;
   float QUANT_RATE = 0.01f;
   boolean secondaryLexicographicSorting = true;

   @Override
   public void setConf(Configuration conf) {
     super.setConf(conf);
     MIN_TOKEN_LEN = conf.getInt(
         "db.signature.text_profile.min_token_len", 2);
     QUANT_RATE = conf.getFloat(
         "db.signature.text_profile.quant_rate", 0.01f);
     secondaryLexicographicSorting = conf.getBoolean(
         "db.signature.text_profile.sec_sort_lex", true);
   }

   public byte[] calculate(Content content, Parse parse) {
     HashMap<String, Token> tokens = new HashMap<>();
     String text = null;
     if (parse != null)
       text = parse.getText();
     if (text == null || text.length() == 0)
       return fallback.calculate(content, parse);
     StringBuffer curToken = new StringBuffer();
     int maxFreq = 0;
     for (int i = 0; i < text.length(); i++) {
       char c = text.charAt(i);
       if (Character.isLetterOrDigit(c)) {
         curToken.append(Character.toLowerCase(c));
       } else {
         if (curToken.length() > 0) {
           if (curToken.length() > MIN_TOKEN_LEN) {
             // add it
             String s = curToken.toString();
             Token tok = tokens.get(s);
             if (tok == null) {
               tok = new Token(0, s);
               tokens.put(s, tok);
             }
             tok.cnt++;
             if (tok.cnt > maxFreq)
               maxFreq = tok.cnt;
           }
           curToken.setLength(0);
         }
       }
     }
     // check the last token
     if (curToken.length() > MIN_TOKEN_LEN) {
       // add it
       String s = curToken.toString();
       Token tok = tokens.get(s);
       if (tok == null) {
         tok = new Token(0, s);
         tokens.put(s, tok);
       }
       tok.cnt++;
       if (tok.cnt > maxFreq)
         maxFreq = tok.cnt;
     }
     Iterator<Token> it = tokens.values().iterator();
     ArrayList<Token> profile = new ArrayList<>();
     // calculate the QUANT value
     int QUANT = Math.round(maxFreq * QUANT_RATE);
     if (QUANT < 2) {
       if (maxFreq > 1)
         QUANT = 2;
       else
         QUANT = 1;
     }
     while (it.hasNext()) {
       Token t = it.next();
       // round down to the nearest QUANT
       t.cnt = (t.cnt / QUANT) * QUANT;
       // discard the frequencies below the QUANT
       if (t.cnt < QUANT) {
         continue;
       }
       profile.add(t);
     }
     Collections.sort(profile, new TokenComparator());
     StringBuffer newText = new StringBuffer();
     it = profile.iterator();
     while (it.hasNext()) {
       Token t = it.next();
       if (newText.length() > 0)
         newText.append("\n");
       newText.append(t.toString());
     }
     return MD5Hash.digest(newText.toString()).getDigest();
   }

   private static class Token {
     public int cnt;
     public String val;

     public Token(int cnt, String val) {
       this.cnt = cnt;
       this.val = val;
     }

     public String toString() {
       return val + " " + cnt;
     }
   }

   private class TokenComparator implements Comparator<Token> {
     /**
      * Sort tokens first by decreasing frequency and second in lexicographic
      * (Unicode) order
      */
     public int compare(Token t1, Token t2) {
       int diffCnt = t2.cnt - t1.cnt;
       if (diffCnt == 0 && secondaryLexicographicSorting) {
         return t1.val.compareTo(t2.val);
       }
       return diffCnt;
     }
   }

   public static void main(String[] args) throws Exception {
     TextProfileSignature sig = new TextProfileSignature();
     sig.setConf(NutchConfiguration.create());
     HashMap<String, byte[]> res = new HashMap<>();
     File[] files = new File(args[0]).listFiles();
     for (int i = 0; i < files.length; i++) {
       FileInputStream fis = new FileInputStream(files[i]);
       BufferedReader br = new BufferedReader(
           new InputStreamReader(fis, "UTF-8"));
       StringBuffer text = new StringBuffer();
       String line = null;
       while ((line = br.readLine()) != null) {
         if (text.length() > 0)
           text.append("\n");
         text.append(line);
       }
       br.close();
       byte[] signature = sig.calculate(null, new ParseImpl(text.toString(),
           null));
       res.put(files[i].toString(), signature);
     }
     Iterator<String> it = res.keySet().iterator();
     while (it.hasNext()) {
       String name = it.next();
       byte[] signature = res.get(name);
       System.out.println(name + "\t" + StringUtil.toHexString(signature));
     }
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.nutch.crawl;

	import java.io.BufferedReader;
	import java.io.File;
	import java.io.FileInputStream;
	import java.io.InputStreamReader;
	import java.util.ArrayList;
	import java.util.Collections;
	import java.util.Comparator;
	import java.util.HashMap;
	import java.util.Iterator;

	import org.apache.hadoop.conf.Configuration;
	import org.apache.hadoop.io.MD5Hash;
	import org.apache.nutch.parse.Parse;
	import org.apache.nutch.parse.ParseImpl;
	import org.apache.nutch.protocol.Content;
	import org.apache.nutch.util.StringUtil;
	import org.apache.nutch.util.NutchConfiguration;

	/**
	* <p>
	* An implementation of a page signature. It calculates an MD5 hash of a plain
	* text "profile" of a page. In case there is no text, it calculates a hash
	* using the {@link MD5Signature}.
	* </p>
	* <p>
	* The algorithm to calculate a page "profile" takes the plain text version of a
	* page and performs the following steps:
	* <ul>
	* <li>remove all characters except letters and digits, and bring all characters
	* to lower case,</li>
	* <li>split the text into tokens (all consecutive non-whitespace characters),</li>
	* <li>discard tokens equal or shorter than MIN_TOKEN_LEN (default 2
	* characters),</li>
	* <li>sort the list of tokens by decreasing frequency,</li>
	* <li>round down the counts of tokens to the nearest multiple of QUANT (
	* <code>QUANT = QUANT_RATE * maxFreq</code>, where <code>QUANT_RATE</code> is
	* 0.01f by default, and <code>maxFreq</code> is the maximum token frequency).
	* If <code>maxFreq</code> is higher than 1, then QUANT is always higher than 2
	* (which means that tokens with frequency 1 are always discarded).</li>
	* <li>tokens, which frequency after quantization falls below QUANT, are
	* discarded.</li>
	* <li>create a list of tokens and their quantized frequency, separated by
	* spaces, in the order of decreasing frequency.</li>
	* </ul>
	* This list is then submitted to an MD5 hash calculation.
	*
	* @author Andrzej Bialecki <ab@getopt.org>
	*/
	public class TextProfileSignature extends Signature {

	Signature fallback = new MD5Signature();

	int MIN_TOKEN_LEN = 2;
	float QUANT_RATE = 0.01f;
	boolean secondaryLexicographicSorting = true;

	@Override
	public void setConf(Configuration conf) {
	super.setConf(conf);
	MIN_TOKEN_LEN = conf.getInt(
	"db.signature.text_profile.min_token_len", 2);
	QUANT_RATE = conf.getFloat(
	"db.signature.text_profile.quant_rate", 0.01f);
	secondaryLexicographicSorting = conf.getBoolean(
	"db.signature.text_profile.sec_sort_lex", true);
	}

	public byte[] calculate(Content content, Parse parse) {
	HashMap<String, Token> tokens = new HashMap<>();
	String text = null;
	if (parse != null)
	text = parse.getText();
	if (text == null \|\| text.length() == 0)
	return fallback.calculate(content, parse);
	StringBuffer curToken = new StringBuffer();
	int maxFreq = 0;
	for (int i = 0; i < text.length(); i++) {
	char c = text.charAt(i);
	if (Character.isLetterOrDigit(c)) {
	curToken.append(Character.toLowerCase(c));
	} else {
	if (curToken.length() > 0) {
	if (curToken.length() > MIN_TOKEN_LEN) {
	// add it
	String s = curToken.toString();
	Token tok = tokens.get(s);
	if (tok == null) {
	tok = new Token(0, s);
	tokens.put(s, tok);
	}
	tok.cnt++;
	if (tok.cnt > maxFreq)
	maxFreq = tok.cnt;
	}
	curToken.setLength(0);
	}
	}
	}
	// check the last token
	if (curToken.length() > MIN_TOKEN_LEN) {
	// add it
	String s = curToken.toString();
	Token tok = tokens.get(s);
	if (tok == null) {
	tok = new Token(0, s);
	tokens.put(s, tok);
	}
	tok.cnt++;
	if (tok.cnt > maxFreq)
	maxFreq = tok.cnt;
	}
	Iterator<Token> it = tokens.values().iterator();
	ArrayList<Token> profile = new ArrayList<>();
	// calculate the QUANT value
	int QUANT = Math.round(maxFreq * QUANT_RATE);
	if (QUANT < 2) {
	if (maxFreq > 1)
	QUANT = 2;
	else
	QUANT = 1;
	}
	while (it.hasNext()) {
	Token t = it.next();
	// round down to the nearest QUANT
	t.cnt = (t.cnt / QUANT) * QUANT;
	// discard the frequencies below the QUANT
	if (t.cnt < QUANT) {
	continue;
	}
	profile.add(t);
	}
	Collections.sort(profile, new TokenComparator());
	StringBuffer newText = new StringBuffer();
	it = profile.iterator();
	while (it.hasNext()) {
	Token t = it.next();
	if (newText.length() > 0)
	newText.append("\n");
	newText.append(t.toString());
	}
	return MD5Hash.digest(newText.toString()).getDigest();
	}

	private static class Token {
	public int cnt;
	public String val;

	public Token(int cnt, String val) {
	this.cnt = cnt;
	this.val = val;
	}

	public String toString() {
	return val + " " + cnt;
	}
	}

	private class TokenComparator implements Comparator<Token> {
	/**
	* Sort tokens first by decreasing frequency and second in lexicographic
	* (Unicode) order
	*/
	public int compare(Token t1, Token t2) {
	int diffCnt = t2.cnt - t1.cnt;
	if (diffCnt == 0 && secondaryLexicographicSorting) {
	return t1.val.compareTo(t2.val);
	}
	return diffCnt;
	}
	}

	public static void main(String[] args) throws Exception {
	TextProfileSignature sig = new TextProfileSignature();
	sig.setConf(NutchConfiguration.create());
	HashMap<String, byte[]> res = new HashMap<>();
	File[] files = new File(args[0]).listFiles();
	for (int i = 0; i < files.length; i++) {
	FileInputStream fis = new FileInputStream(files[i]);
	BufferedReader br = new BufferedReader(
	new InputStreamReader(fis, "UTF-8"));
	StringBuffer text = new StringBuffer();
	String line = null;
	while ((line = br.readLine()) != null) {
	if (text.length() > 0)
	text.append("\n");
	text.append(line);
	}
	br.close();
	byte[] signature = sig.calculate(null, new ParseImpl(text.toString(),
	null));
	res.put(files[i].toString(), signature);
	}
	Iterator<String> it = res.keySet().iterator();
	while (it.hasNext()) {
	String name = it.next();
	byte[] signature = res.get(name);
	System.out.println(name + "\t" + StringUtil.toHexString(signature));
	}
	}
	}