PIG-5300: hashCode for Bag needs to be order independent (knoguchi)


git-svn-id: https://svn.apache.org/repos/asf/pig/trunk@1818985 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/CHANGES.txt b/CHANGES.txt
index bdd211b..694a637 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -59,6 +59,7 @@
 OPTIMIZATIONS
  
 BUG FIXES
+PIG-5300: hashCode for Bag needs to be order independent (knoguchi)
 
 PIG-5318: Unit test failures on Pig on Spark with Spark 2.2 (nkollar via szita)
 
diff --git a/src/org/apache/pig/data/DataBag.java b/src/org/apache/pig/data/DataBag.java
index dcbf6a7..ae4641b 100644
--- a/src/org/apache/pig/data/DataBag.java
+++ b/src/org/apache/pig/data/DataBag.java
@@ -126,6 +126,15 @@
     void clear();
 
     /**
+     * Returns the hash code value for the bag.  The hash code of a bag is
+     * defined to be the sum of the hash codes of each tuple.
+     * This ensures that b1.equals(b2) implies b1.hashCode() == b2.hashCode()
+     *
+     * @return the hash code value for this bag
+     */
+    int hashCode();
+
+    /**
      * This is used by FuncEvalSpec.FakeDataBag.
      * @param stale Set stale state.
      */
diff --git a/src/org/apache/pig/data/DefaultAbstractBag.java b/src/org/apache/pig/data/DefaultAbstractBag.java
index 4c2d4f7..08cbcc9 100644
--- a/src/org/apache/pig/data/DefaultAbstractBag.java
+++ b/src/org/apache/pig/data/DefaultAbstractBag.java
@@ -309,6 +309,9 @@
 
     @Override
     public boolean equals(Object other) {
+        if( other == null ) {
+            return false;
+        }
         return compareTo(other) == 0;
     }
 
@@ -359,11 +362,10 @@
 
     @Override
     public int hashCode() {
-        int hash = 1;
+        int hash = 0;
         Iterator<Tuple> i = iterator();
         while (i.hasNext()) {
-            // Use 37 because we want a prime, and tuple uses 31.
-            hash = 37 * hash + i.next().hashCode();
+            hash += i.next().hashCode();
         }
         return hash;
     }
diff --git a/src/org/apache/pig/data/NonSpillableDataBag.java b/src/org/apache/pig/data/NonSpillableDataBag.java
index 421b418..385af82 100644
--- a/src/org/apache/pig/data/NonSpillableDataBag.java
+++ b/src/org/apache/pig/data/NonSpillableDataBag.java
@@ -198,11 +198,18 @@
      */
     @Override
     public boolean equals(Object obj) {
+        if( obj == null) {
+            return false;
+        }
         return compareTo(obj) == 0;
     }
 
     public int hashCode() {
-        return mContents.hashCode();
+        int hash = 0;
+        for( Tuple t : mContents ) {
+            hash += t.hashCode();
+        }
+        return hash;
     }
 
     @SuppressWarnings("unchecked")
diff --git a/src/org/apache/pig/data/SingleTupleBag.java b/src/org/apache/pig/data/SingleTupleBag.java
index e9e6b7f..cfe2b4c 100644
--- a/src/org/apache/pig/data/SingleTupleBag.java
+++ b/src/org/apache/pig/data/SingleTupleBag.java
@@ -156,22 +156,37 @@
         }    
     }
 
-    /* (non-Javadoc)
-     * @see java.lang.Comparable#compareTo(java.lang.Object)
-     */
     @Override
-    public int compareTo(Object o) {
-        // TODO Auto-generated method stub
-        return 0;
+    public int compareTo(Object other) {
+        if (this == other)
+            return 0;
+        if (other instanceof DataBag) {
+            DataBag bOther = (DataBag) other;
+            if (bOther.size() != 1) {
+                if ( 1 > bOther.size()) return 1;
+                else return -1;
+            }
+            Iterator<Tuple> iter = bOther.iterator();
+            return item.compareTo(iter.next());
+        } else {
+            return DataType.compare(this, other);
+        }
     }
 
-    public boolean equals(Object o){
-        // TODO: match to compareTo if it is updated
-        return true;
+    @Override
+    public boolean equals(Object other) {
+        if( other == null ) {
+            return false;
+        }
+        return compareTo(other) == 0;
     }
 
+    @Override
     public int hashCode() {
-        return 42; 
+        if( item != null ) {
+            return item.hashCode();
+        }
+        return 0;
     }
 
     class TBIterator implements Iterator<Tuple> {
diff --git a/test/org/apache/pig/test/TestDataModel.java b/test/org/apache/pig/test/TestDataModel.java
index 4568c82..3b8a4a6 100644
--- a/test/org/apache/pig/test/TestDataModel.java
+++ b/test/org/apache/pig/test/TestDataModel.java
@@ -17,6 +17,7 @@
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotEquals;
 import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
@@ -42,7 +43,15 @@
 import org.apache.pig.data.DataBag;
 import org.apache.pig.data.DataByteArray;
 import org.apache.pig.data.DataType;
+import org.apache.pig.data.DefaultDataBag;
+import org.apache.pig.data.DistinctDataBag;
+import org.apache.pig.data.InternalCachedBag;
+import org.apache.pig.data.InternalDistinctBag;
 import org.apache.pig.data.InternalMap;
+import org.apache.pig.data.InternalSortedBag;
+import org.apache.pig.data.NonSpillableDataBag;
+import org.apache.pig.data.SingleTupleBag;
+import org.apache.pig.data.SortedDataBag;
 import org.apache.pig.data.Tuple;
 import org.apache.pig.data.TupleFactory;
 import org.junit.Test;
@@ -587,6 +596,77 @@
         }
     }
 
+    @Test
+    public void testBagHashCodeAndEquals() throws Exception {
+        DataBag [] ascBags = {new SortedDataBag(null), new DefaultDataBag(), new DistinctDataBag(), new InternalCachedBag(), new InternalDistinctBag(), new InternalSortedBag(), new NonSpillableDataBag()};
+        DataBag [] dscBags = {new SortedDataBag(null), new DefaultDataBag(), new DistinctDataBag(), new InternalCachedBag(), new InternalDistinctBag(), new InternalSortedBag(), new NonSpillableDataBag()};
+        Tuple input = giveMeOneOfEach();
+        TupleFactory tf = TupleFactory.getInstance();
+        for (int i = 0; i < ascBags.length; i++ ) {
+            for( int j = 0; j < input.size(); j++ ) {
+                ascBags[i].add(tf.newTuple(input.get(j)));
+                dscBags[i].add(tf.newTuple(input.get(input.size() - 1 - j)));
+            }
+        }
+        // making sure equals(null) does not throw NPE
+        for (DataBag bag : ascBags ) {
+            assertFalse(bag.equals(null));
+        }
+        // Comparing against each other including itself
+        for (DataBag bag1 : ascBags ) {
+            for (DataBag bag2 : ascBags ) {
+                assertEquals("Comparing hashcode for " + bag1.getClass() + " and " + bag2.getClass() + " failed", bag1.hashCode(), bag2.hashCode());
+                assertEquals("Comparing content of Bag for " + bag1.getClass() + " and " + bag2.getClass() + " failed", bag1, bag2);
+            }
+        }
+        // Comparing against each other again but now with different insertion
+        // order.  (including from the same type of Bag class)
+        for (DataBag ascBag : ascBags) {
+            for (DataBag dscBag : dscBags) {
+                assertEquals("Comparing hashcode for " + ascBag.getClass() + " and " + dscBag.getClass() + " failed", ascBag.hashCode(), dscBag.hashCode());
+                assertEquals("Comparing content of Bag for " + ascBag.getClass() + " and " + dscBag.getClass() + " failed", ascBag, dscBag);
+            }
+        }
+    }
+
+    @Test
+    public void testBagHashCodeAndEqualsForSingleTuple() throws Exception {
+        TupleFactory tf = TupleFactory.getInstance();
+
+        // For each datatype, making sure bag hashcode and content match
+        for( Object input: giveMeOneOfEach() ) {
+            DataBag [] singleItemBags = {new SortedDataBag(null), new DefaultDataBag(), new DistinctDataBag(), new InternalCachedBag(), new InternalDistinctBag(), new InternalSortedBag(), new NonSpillableDataBag(), new SingleTupleBag(null)};
+            DataBag [] twoItemBags = {new SortedDataBag(null), new DefaultDataBag(), new DistinctDataBag(), new InternalCachedBag(), new InternalDistinctBag(), new InternalSortedBag(), new NonSpillableDataBag()};
+
+            for( DataBag singleItemBag : singleItemBags ) {
+                singleItemBag.add(tf.newTuple(input));
+            }
+
+            for( DataBag singleItemBag1 : singleItemBags ) {
+                for( DataBag singleItemBag2 : singleItemBags ) {
+                    assertEquals("Comparing hashcode for " + singleItemBag1.getClass()
+                                 + " and " + singleItemBag2.getClass() + " failed",
+                                 singleItemBag1.hashCode(), singleItemBag2.hashCode());
+                    assertEquals("Comparing content of Bag for " + singleItemBag1.getClass()
+                                 + " and " + singleItemBag2.getClass() + " failed",
+                                 singleItemBag1, singleItemBag2);
+                }
+            }
+
+            // making sure no two-items-bag would match with single item bag
+            for( DataBag twoItemBag : twoItemBags ) {
+                twoItemBag.add(tf.newTuple(input));
+                twoItemBag.add(tf.newTuple(new String("Unique string not to overlap with a tuple above")));
+            }
+
+            for( DataBag twoItemBag : twoItemBags ) {
+                for( DataBag singleItemBag : singleItemBags ) {
+                    assertNotEquals(twoItemBag, singleItemBag);
+                }
+            }
+        }
+    }
+
     private Tuple giveMeOneOfEach() throws Exception {
         TupleFactory tf = TupleFactory.getInstance();