NUTCH-2381 In some situations the class TextProfileSignature
gives different signatures for the same text "profile" page.
- implement secondary sorting (similar to the patch provided by
Rodrigo Joni Sestari)
- allow restoring the previous behavior by setting the property
`db.signature.text_profile.sec_sort_lex = false`
diff --git a/CHANGES.txt b/CHANGES.txt
index ff564d3..5721439 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -14,7 +14,15 @@
further information.
- HostDB entries have been moved from Integer to Long in order to accomodate very large
- hosts. Remove your existing HostDB and recreate it with bin/nutch updatehostdb.
+ hosts. Remove your existing HostDB and recreate it with bin/nutch updatehostdb, see
+ NUTCH-2694 for additional information.
+
+ - The signature class TextProfileSignature has been improved to be stable over
+ consecutive runs by sorting tokens by frequency first and secondarily in lexicographic
+ order. If an existing CrawlDb contains signatures generated by TextProfileSignature,
+ these are likely to change when upgrading to Nutch 1.16. The previous behavior, which
+ relied on a semi-stable pseudo-random hash sorting, can be restored by setting the
+ property `db.signature.text_profile.sec_sort_lex` to `false`. See also NUTCH-2381.
Nutch 1.15 Release (25/07/2018)
diff --git a/src/java/org/apache/nutch/crawl/TextProfileSignature.java b/src/java/org/apache/nutch/crawl/TextProfileSignature.java
index c831be5..049206a 100644
--- a/src/java/org/apache/nutch/crawl/TextProfileSignature.java
+++ b/src/java/org/apache/nutch/crawl/TextProfileSignature.java
@@ -26,6 +26,7 @@
import java.util.HashMap;
import java.util.Iterator;
+import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.MD5Hash;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseImpl;
@@ -67,11 +68,22 @@
Signature fallback = new MD5Signature();
- public byte[] calculate(Content content, Parse parse) {
- int MIN_TOKEN_LEN = getConf().getInt(
+ int MIN_TOKEN_LEN = 2;
+ float QUANT_RATE = 0.01f;
+ boolean secondaryLexicographicSorting = true;
+
+ @Override
+ public void setConf(Configuration conf) {
+ super.setConf(conf);
+ MIN_TOKEN_LEN = conf.getInt(
"db.signature.text_profile.min_token_len", 2);
- float QUANT_RATE = getConf().getFloat(
+ QUANT_RATE = conf.getFloat(
"db.signature.text_profile.quant_rate", 0.01f);
+ secondaryLexicographicSorting = conf.getBoolean(
+ "db.signature.text_profile.sec_sort_lex", true);
+ }
+
+ public byte[] calculate(Content content, Parse parse) {
HashMap<String, Token> tokens = new HashMap<>();
String text = null;
if (parse != null)
@@ -161,9 +173,17 @@
}
}
- private static class TokenComparator implements Comparator<Token> {
+ private class TokenComparator implements Comparator<Token> {
+ /**
+ * Sort tokens first by decreasing frequency and second in lexicographic
+ * (Unicode) order
+ */
public int compare(Token t1, Token t2) {
- return t2.cnt - t1.cnt;
+ int diffCnt = t2.cnt - t1.cnt;
+ if (diffCnt == 0 && secondaryLexicographicSorting) {
+ return t1.val.compareTo(t2.val);
+ }
+ return diffCnt;
}
}
diff --git a/src/test/org/apache/nutch/crawl/TestTextProfileSignature.java b/src/test/org/apache/nutch/crawl/TestTextProfileSignature.java
new file mode 100644
index 0000000..adf4b5e
--- /dev/null
+++ b/src/test/org/apache/nutch/crawl/TestTextProfileSignature.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.crawl;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.StringUtil;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class TestTextProfileSignature { // unit test for TextProfileSignature
+ // Checks that the signature stays the same when the words of a text are reordered.
+ @Test
+ public void testGetSignature() {
+ Configuration conf = NutchConfiguration.create();
+ Signature textProf = new TextProfileSignature();
+ textProf.setConf(conf);
+ String text = "Hello World The Quick Brown Fox Jumped Over the Lazy Fox";
+ ParseData pd = new ParseData(ParseStatus.STATUS_SUCCESS, "Hello World",
+ new Outlink[0], new Metadata());
+ byte[] signature1 = textProf.calculate(new Content(), // signature of the original word order
+ new ParseImpl(text, pd));
+ Assert.assertNotNull(signature1);
+ List<String> words = Arrays.asList(text.split("\\s"));
+ Collections.shuffle(words); // unseeded shuffle: the word order is random on every run
+ String text2 = String.join(" ", words);
+ byte[] signature2 = textProf.calculate(new Content(), // signature of the shuffled text
+ new ParseImpl(text2, pd));
+ Assert.assertNotNull(signature2);
+ Assert.assertEquals(StringUtil.toHexString(signature1), // both signatures are compared as hex strings
+ StringUtil.toHexString(signature2));
+ }
+}