PIG-5300: hashCode for Bag needs to be order independent (knoguchi)
git-svn-id: https://svn.apache.org/repos/asf/pig/trunk@1818985 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/CHANGES.txt b/CHANGES.txt
index bdd211b..694a637 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -59,6 +59,7 @@
OPTIMIZATIONS
BUG FIXES
+PIG-5300: hashCode for Bag needs to be order independent (knoguchi)
PIG-5318: Unit test failures on Pig on Spark with Spark 2.2 (nkollar via szita)
diff --git a/src/org/apache/pig/data/DataBag.java b/src/org/apache/pig/data/DataBag.java
index dcbf6a7..ae4641b 100644
--- a/src/org/apache/pig/data/DataBag.java
+++ b/src/org/apache/pig/data/DataBag.java
@@ -126,6 +126,15 @@
void clear();
/**
+ * Returns the hash code value for the bag. The hash code of a bag is
+ * defined to be the sum of the hash codes of each tuple.
+ * This ensures that b1.equals(b2) implies b1.hashCode() == b2.hashCode()
+ *
+ * @return the hash code value for this bag
+ */
+ int hashCode();
+
+ /**
* This is used by FuncEvalSpec.FakeDataBag.
* @param stale Set stale state.
*/
diff --git a/src/org/apache/pig/data/DefaultAbstractBag.java b/src/org/apache/pig/data/DefaultAbstractBag.java
index 4c2d4f7..08cbcc9 100644
--- a/src/org/apache/pig/data/DefaultAbstractBag.java
+++ b/src/org/apache/pig/data/DefaultAbstractBag.java
@@ -309,6 +309,9 @@
@Override
public boolean equals(Object other) {
+ if( other == null ) {
+ return false;
+ }
return compareTo(other) == 0;
}
@@ -359,11 +362,10 @@
@Override
public int hashCode() {
- int hash = 1;
+ int hash = 0;
Iterator<Tuple> i = iterator();
while (i.hasNext()) {
- // Use 37 because we want a prime, and tuple uses 31.
- hash = 37 * hash + i.next().hashCode();
+ hash += i.next().hashCode();
}
return hash;
}
diff --git a/src/org/apache/pig/data/NonSpillableDataBag.java b/src/org/apache/pig/data/NonSpillableDataBag.java
index 421b418..385af82 100644
--- a/src/org/apache/pig/data/NonSpillableDataBag.java
+++ b/src/org/apache/pig/data/NonSpillableDataBag.java
@@ -198,11 +198,18 @@
*/
@Override
public boolean equals(Object obj) {
+ if( obj == null) {
+ return false;
+ }
return compareTo(obj) == 0;
}
public int hashCode() {
- return mContents.hashCode();
+ int hash = 0;
+ for( Tuple t : mContents ) {
+ hash += t.hashCode();
+ }
+ return hash;
}
@SuppressWarnings("unchecked")
diff --git a/src/org/apache/pig/data/SingleTupleBag.java b/src/org/apache/pig/data/SingleTupleBag.java
index e9e6b7f..cfe2b4c 100644
--- a/src/org/apache/pig/data/SingleTupleBag.java
+++ b/src/org/apache/pig/data/SingleTupleBag.java
@@ -156,22 +156,37 @@
}
}
- /* (non-Javadoc)
- * @see java.lang.Comparable#compareTo(java.lang.Object)
- */
@Override
- public int compareTo(Object o) {
- // TODO Auto-generated method stub
- return 0;
+ public int compareTo(Object other) {
+ if (this == other)
+ return 0;
+ if (other instanceof DataBag) {
+ DataBag bOther = (DataBag) other;
+ if (bOther.size() != 1) {
+ if ( 1 > bOther.size()) return 1;
+ else return -1;
+ }
+ Iterator<Tuple> iter = bOther.iterator();
+ return item.compareTo(iter.next());
+ } else {
+ return DataType.compare(this, other);
+ }
}
- public boolean equals(Object o){
- // TODO: match to compareTo if it is updated
- return true;
+ @Override
+ public boolean equals(Object other) {
+ if( other == null ) {
+ return false;
+ }
+ return compareTo(other) == 0;
}
+ @Override
public int hashCode() {
- return 42;
+ if( item != null ) {
+ return item.hashCode();
+ }
+ return 0;
}
class TBIterator implements Iterator<Tuple> {
diff --git a/test/org/apache/pig/test/TestDataModel.java b/test/org/apache/pig/test/TestDataModel.java
index 4568c82..3b8a4a6 100644
--- a/test/org/apache/pig/test/TestDataModel.java
+++ b/test/org/apache/pig/test/TestDataModel.java
@@ -17,6 +17,7 @@
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
@@ -42,7 +43,15 @@
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
+import org.apache.pig.data.DefaultDataBag;
+import org.apache.pig.data.DistinctDataBag;
+import org.apache.pig.data.InternalCachedBag;
+import org.apache.pig.data.InternalDistinctBag;
import org.apache.pig.data.InternalMap;
+import org.apache.pig.data.InternalSortedBag;
+import org.apache.pig.data.NonSpillableDataBag;
+import org.apache.pig.data.SingleTupleBag;
+import org.apache.pig.data.SortedDataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.junit.Test;
@@ -587,6 +596,77 @@
}
}
+ @Test
+ public void testBagHashCodeAndEquals() throws Exception {
+ DataBag [] ascBags = {new SortedDataBag(null), new DefaultDataBag(), new DistinctDataBag(), new InternalCachedBag(), new InternalDistinctBag(), new InternalSortedBag(), new NonSpillableDataBag()};
+ DataBag [] dscBags = {new SortedDataBag(null), new DefaultDataBag(), new DistinctDataBag(), new InternalCachedBag(), new InternalDistinctBag(), new InternalSortedBag(), new NonSpillableDataBag()};
+ Tuple input = giveMeOneOfEach();
+ TupleFactory tf = TupleFactory.getInstance();
+ for (int i = 0; i < ascBags.length; i++ ) {
+ for( int j = 0; j < input.size(); j++ ) {
+ ascBags[i].add(tf.newTuple(input.get(j)));
+ dscBags[i].add(tf.newTuple(input.get(input.size() - 1 - j)));
+ }
+ }
+ // making sure equals(null) does not throw NPE
+ for (DataBag bag : ascBags ) {
+ assertFalse(bag.equals(null));
+ }
+ // Comparing against each other including itself
+ for (DataBag bag1 : ascBags ) {
+ for (DataBag bag2 : ascBags ) {
+ assertEquals("Comparing hashcode for " + bag1.getClass() + " and " + bag2.getClass() + " failed", bag1.hashCode(), bag2.hashCode());
+ assertEquals("Comparing content of Bag for " + bag1.getClass() + " and " + bag2.getClass() + " failed", bag1, bag2);
+ }
+ }
+ // Comparing against each other again but now with different insertion
+ // order. (including from the same type of Bag class)
+ for (DataBag ascBag : ascBags) {
+ for (DataBag dscBag : dscBags) {
+ assertEquals("Comparing hashcode for " + ascBag.getClass() + " and " + dscBag.getClass() + " failed", ascBag.hashCode(), dscBag.hashCode());
+ assertEquals("Comparing content of Bag for " + ascBag.getClass() + " and " + dscBag.getClass() + " failed", ascBag, dscBag);
+ }
+ }
+ }
+
+ @Test
+ public void testBagHashCodeAndEqualsForSingleTuple() throws Exception {
+ TupleFactory tf = TupleFactory.getInstance();
+
+ // For each datatype, making sure bag hashcode and content match
+ for( Object input: giveMeOneOfEach() ) {
+ DataBag [] singleItemBags = {new SortedDataBag(null), new DefaultDataBag(), new DistinctDataBag(), new InternalCachedBag(), new InternalDistinctBag(), new InternalSortedBag(), new NonSpillableDataBag(), new SingleTupleBag(null)};
+ DataBag [] twoItemBags = {new SortedDataBag(null), new DefaultDataBag(), new DistinctDataBag(), new InternalCachedBag(), new InternalDistinctBag(), new InternalSortedBag(), new NonSpillableDataBag()};
+
+ for( DataBag singleItemBag : singleItemBags ) {
+ singleItemBag.add(tf.newTuple(input));
+ }
+
+ for( DataBag singleItemBag1 : singleItemBags ) {
+ for( DataBag singleItemBag2 : singleItemBags ) {
+ assertEquals("Comparing hashcode for " + singleItemBag1.getClass()
+ + " and " + singleItemBag2.getClass() + " failed",
+ singleItemBag1.hashCode(), singleItemBag2.hashCode());
+ assertEquals("Comparing content of Bag for " + singleItemBag1.getClass()
+ + " and " + singleItemBag2.getClass() + " failed",
+ singleItemBag1, singleItemBag2);
+ }
+ }
+
+ // making sure no two-items-bag would match with single item bag
+ for( DataBag twoItemBag : twoItemBags ) {
+ twoItemBag.add(tf.newTuple(input));
+ twoItemBag.add(tf.newTuple(new String("Unique string not to overlap with a tuple above")));
+ }
+
+ for( DataBag twoItemBag : twoItemBags ) {
+ for( DataBag singleItemBag : singleItemBags ) {
+ assertNotEquals(twoItemBag, singleItemBag);
+ }
+ }
+ }
+ }
+
private Tuple giveMeOneOfEach() throws Exception {
TupleFactory tf = TupleFactory.getInstance();