PARQUET-2158: Upgrade Hadoop dependency to version 3.2.0 (#976)
* PARQUET-2158: Upgrade Hadoop dependency to version 3.2.0
This updates Parquet's Hadoop dependency to 3.2.0.
This version adds compatibility with Java 11, as well
as many other features and bug fixes.
* PARQUET-2158. PathGlobPattern to compile/link with hadoop 3.2.0
The deprecated parquet-thrift class PathGlobPattern doesn't
compile against hadoop 3.x because in HADOOP-12436 the
nominally private class org.apache.hadoop.fs.GlobPattern
implementation switched from using java.util.regex.Pattern
to com.google.re2j.Pattern.
The fact nobody has ever reported this problem implies that it
is never used on any hadoop 3 release, ever.
This commit fixes the build by moving to the google classes.
The alternative strategy would actually be to fork the hadoop
class. This will work unless/until the hadoop project changes
the class again.
It may be time to consider removing the class entirely. Clearly
nobody is actually using it.
* PARQUET-2158. build auditing to cope with switch to google re2j.
Excludes PathGlobPattern from the API compatibility check and adds
re2j as a 'provided' dependency so that the relevant auditing checks
do not fail.
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestInputFormat.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestInputFormat.java
index 02c80fc..697e8e2 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestInputFormat.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestInputFormat.java
@@ -42,7 +42,6 @@
import java.util.List;
import java.util.Map;
-import org.apache.commons.lang.SystemUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
diff --git a/parquet-thrift/pom.xml b/parquet-thrift/pom.xml
index 7f08ca6..e30b5d4 100644
--- a/parquet-thrift/pom.xml
+++ b/parquet-thrift/pom.xml
@@ -174,6 +174,14 @@
<type>test-jar</type>
<scope>test</scope>
</dependency>
+ <dependency>
+ <!-- Needed to compile PathGlobPattern on Hadoop 3.
+ If that deprecated class is removed, so can this dependency -->
+ <groupId>com.google.re2j</groupId>
+ <artifactId>re2j</artifactId>
+ <version>1.1</version>
+ <scope>provided</scope>
+ </dependency>
</dependencies>
<dependencyManagement>
diff --git a/parquet-thrift/src/main/java/org/apache/parquet/thrift/projection/deprecated/PathGlobPattern.java b/parquet-thrift/src/main/java/org/apache/parquet/thrift/projection/deprecated/PathGlobPattern.java
index ba5646d..4d4136f 100644
--- a/parquet-thrift/src/main/java/org/apache/parquet/thrift/projection/deprecated/PathGlobPattern.java
+++ b/parquet-thrift/src/main/java/org/apache/parquet/thrift/projection/deprecated/PathGlobPattern.java
@@ -1,4 +1,4 @@
-/*
+/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
@@ -6,9 +6,9 @@
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
- *
+ *
* http://www.apache.org/licenses/LICENSE-2.0
- *
+ *
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -20,8 +20,8 @@
import org.apache.hadoop.fs.GlobPattern;
-import java.util.regex.Pattern;
-import java.util.regex.PatternSyntaxException;
+import com.google.re2j.Pattern;
+import com.google.re2j.PatternSyntaxException;
/**
* Enhanced version of GlobPattern class that is defined in hadoop with extra capability of matching
@@ -56,7 +56,7 @@
}
private static void error(String message, String pattern, int pos) {
- throw new PatternSyntaxException(message, pattern, pos);
+ throw new PatternSyntaxException(String.format("%1s at %2d", message, pos), pattern);
}
/**
diff --git a/pom.xml b/pom.xml
index d37afb6..a9f23a5 100644
--- a/pom.xml
+++ b/pom.xml
@@ -76,7 +76,7 @@
<jackson-databind.version>2.13.2.2</jackson-databind.version>
<japicmp.version>0.14.2</japicmp.version>
<shade.prefix>shaded.parquet</shade.prefix>
- <hadoop.version>2.10.1</hadoop.version>
+ <hadoop.version>3.2.0</hadoop.version>
<parquet.format.version>2.9.0</parquet.format.version>
<previous.version>1.12.0</previous.version>
<thrift.executable>thrift</thrift.executable>
@@ -519,6 +519,8 @@
change to fix a integer overflow issue.
TODO: remove this after Parquet 1.13 release -->
<exclude>org.apache.parquet.column.values.dictionary.DictionaryValuesWriter#dictionaryByteSize</exclude>
+ <!-- In PARQUET-2158 the return type of PathGlobPattern was changed to be compatible with Hadoop 3 -->
+ <exclude>org.apache.parquet.thrift.projection.deprecated.PathGlobPattern</exclude>
</excludes>
</parameter>
</configuration>