PARQUET-2052: Integer overflow when writing huge binary using dictionary encoding (#910)

diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/DictionaryValuesWriter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/DictionaryValuesWriter.java
index 2999f3c..c4a9852 100644
--- a/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/DictionaryValuesWriter.java
+++ b/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/DictionaryValuesWriter.java
@@ -81,7 +81,7 @@
   protected boolean dictionaryTooBig;
 
   /* current size in bytes the dictionary will take once serialized */
-  protected int dictionaryByteSize;
+  protected long dictionaryByteSize;
 
   /* size in bytes of the dictionary at the end of last dictionary encoded page (in case the current page falls back to PLAIN) */
   protected int lastUsedDictionaryByteSize;
@@ -173,7 +173,7 @@
       BytesInput bytes = concat(BytesInput.from(bytesHeader), rleEncodedBytes);
       // remember size of dictionary when we last wrote a page
       lastUsedDictionarySize = getDictionarySize();
-      lastUsedDictionaryByteSize = dictionaryByteSize;
+      lastUsedDictionaryByteSize = Math.toIntExact(dictionaryByteSize);
       return bytes;
     } catch (IOException e) {
       throw new ParquetEncodingException("could not encode the values", e);
@@ -249,7 +249,7 @@
         id = binaryDictionaryContent.size();
         binaryDictionaryContent.put(v.copy(), id);
         // length as int (4 bytes) + actual bytes
-        dictionaryByteSize += 4 + v.length();
+        dictionaryByteSize += 4L + v.length();
       }
       encodedValues.add(id);
     }
diff --git a/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java b/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java
index 2783b69..174fad8 100644
--- a/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java
+++ b/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java
@@ -53,6 +53,7 @@
 import org.apache.parquet.column.values.plain.PlainValuesWriter;
 import org.apache.parquet.io.api.Binary;
 import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
+import org.mockito.Mockito;
 
 public class TestDictionary {
 
@@ -172,6 +173,20 @@
   }
 
   @Test
+  public void testBinaryDictionaryIntegerOverflow() {
+    Binary mock = Mockito.mock(Binary.class);
+    Mockito.when(mock.length()).thenReturn(Integer.MAX_VALUE - 1); // 4 + length() exceeds Integer.MAX_VALUE
+    // the writer stores v.copy() as the dictionary key, so copy() must return a real Binary
+    Mockito.when(mock.copy()).thenReturn(Binary.fromString(" world"));
+
+    final ValuesWriter cw = newPlainBinaryDictionaryValuesWriter(100, 100);
+    cw.writeBytes(Binary.fromString("hello"));
+    cw.writeBytes(mock);
+
+    assertEquals(PLAIN, cw.getEncoding());
+  }
+
+  @Test
   public void testBinaryDictionaryChangedValues() throws IOException {
     int COUNT = 100;
     ValuesWriter cw = newPlainBinaryDictionaryValuesWriter(200, 10000);
diff --git a/pom.xml b/pom.xml
index 24f1229..870c111 100644
--- a/pom.xml
+++ b/pom.xml
@@ -475,6 +475,10 @@
             </excludeModules>
             <excludes>
               <exclude>${shade.prefix}</exclude>
+              <!-- In PARQUET-2052 this field is changed from int to long which is a minor API
+                change to fix an integer overflow issue.
+                TODO: remove this after Parquet 1.13 release -->
+              <exclude>org.apache.parquet.column.values.dictionary.DictionaryValuesWriter#dictionaryByteSize</exclude>
             </excludes>
           </parameter>
         </configuration>