ARROW-7518: [Python] Use PYARROW_WITH_HDFS when building wheels, conda packages
Closes #6203 from kszucs/ARROW-7518 and squashes the following commits:
1e3f09195 <Krisztián Szűcs> test dataset import
6dd82a6d3 <Krisztián Szűcs> fix shebang
be3974826 <Krisztián Szűcs> fix flags
7d3668bb7 <Krisztián Szűcs> enable hdfs in the conda recipes and the wheels
Authored-by: Krisztián Szűcs <szucs.krisztian@gmail.com>
Signed-off-by: Neal Richardson <neal.p.richardson@gmail.com>
diff --git a/dev/tasks/conda-recipes/arrow-cpp/bld.bat b/dev/tasks/conda-recipes/arrow-cpp/bld.bat
index fd87e0c..0588961 100644
--- a/dev/tasks/conda-recipes/arrow-cpp/bld.bat
+++ b/dev/tasks/conda-recipes/arrow-cpp/bld.bat
@@ -22,6 +22,7 @@
-DARROW_MIMALLOC:BOOL=ON ^
-DARROW_DATASET:BOOL=ON ^
-DARROW_FLIGHT:BOOL=ON ^
+ -DARROW_HDFS:BOOL=ON ^
-DARROW_PARQUET:BOOL=ON ^
-DARROW_GANDIVA:BOOL=ON ^
-DARROW_ORC:BOOL=ON ^
diff --git a/dev/tasks/conda-recipes/arrow-cpp/build.sh b/dev/tasks/conda-recipes/arrow-cpp/build.sh
index 70f4894..251187f 100644
--- a/dev/tasks/conda-recipes/arrow-cpp/build.sh
+++ b/dev/tasks/conda-recipes/arrow-cpp/build.sh
@@ -40,6 +40,7 @@
-DARROW_PYTHON=ON \
-DARROW_PARQUET=ON \
-DARROW_GANDIVA=ON \
+ -DARROW_HDFS=ON \
-DARROW_ORC=ON \
-DARROW_S3=ON \
-DCMAKE_AR=${AR} \
diff --git a/dev/tasks/conda-recipes/pyarrow/bld.bat b/dev/tasks/conda-recipes/pyarrow/bld.bat
index f06eb21..dbea193 100644
--- a/dev/tasks/conda-recipes/pyarrow/bld.bat
+++ b/dev/tasks/conda-recipes/pyarrow/bld.bat
@@ -16,6 +16,7 @@
SET SETUPTOOLS_SCM_PRETEND_VERSION=%PKG_VERSION%
SET PYARROW_BUILD_TYPE=release
SET PYARROW_WITH_S3=1
+SET PYARROW_WITH_HDFS=1
SET PYARROW_WITH_DATASET=1
SET PYARROW_WITH_FLIGHT=1
SET PYARROW_WITH_GANDIVA=1
diff --git a/dev/tasks/conda-recipes/pyarrow/build.sh b/dev/tasks/conda-recipes/pyarrow/build.sh
index 1242f4b..a095dc2 100644
--- a/dev/tasks/conda-recipes/pyarrow/build.sh
+++ b/dev/tasks/conda-recipes/pyarrow/build.sh
@@ -11,6 +11,7 @@
export PYARROW_WITH_DATASET=1
export PYARROW_WITH_FLIGHT=1
export PYARROW_WITH_GANDIVA=1
+export PYARROW_WITH_HDFS=1
export PYARROW_WITH_ORC=1
export PYARROW_WITH_PARQUET=1
export PYARROW_WITH_PLASMA=1
diff --git a/dev/tasks/conda-recipes/pyarrow/meta.yaml b/dev/tasks/conda-recipes/pyarrow/meta.yaml
index 6ea3d64..abc1720 100644
--- a/dev/tasks/conda-recipes/pyarrow/meta.yaml
+++ b/dev/tasks/conda-recipes/pyarrow/meta.yaml
@@ -44,13 +44,15 @@
test:
imports:
- pyarrow
- - pyarrow.fs
- pyarrow.dataset # [not py==27]
- pyarrow.flight # [not py==27]
- pyarrow.gandiva # [not py==27]
- pyarrow.orc # [unix]
- pyarrow.parquet
- pyarrow.plasma # [unix]
+ - pyarrow.fs
+ - pyarrow._s3fs
+ - pyarrow._hdfs
requires:
- pytest
diff --git a/dev/tasks/python-wheels/azure.linux.yml b/dev/tasks/python-wheels/azure.linux.yml
index 23ebfd5..87510b8 100644
--- a/dev/tasks/python-wheels/azure.linux.yml
+++ b/dev/tasks/python-wheels/azure.linux.yml
@@ -44,7 +44,7 @@
# TODO(kou): Uncomment this when we resolve "ADD never use cache" problem.
# docker-compose build $BUILD_IMAGE
docker-compose run \
- -e SETUPTOOLS_SCM_PRETEND_VERSION={{ arrow.no_rc_version }} \
+ -e SETUPTOOLS_SCM_PRETEND_VERSION="{{ arrow.no_rc_version }}" \
-e PYTHON_VERSION="{{ python_version }}" \
-e UNICODE_WIDTH="{{ unicode_width }}" \
$BUILD_IMAGE
diff --git a/dev/tasks/python-wheels/manylinux-test.sh b/dev/tasks/python-wheels/manylinux-test.sh
index 6656801..4142791 100755
--- a/dev/tasks/python-wheels/manylinux-test.sh
+++ b/dev/tasks/python-wheels/manylinux-test.sh
@@ -41,6 +41,8 @@
import pyarrow
import pyarrow.parquet
import pyarrow.plasma
+import pyarrow.fs
+import pyarrow._hdfs
if sys.version_info.major > 2:
import pyarrow.dataset
diff --git a/dev/tasks/python-wheels/osx-build.sh b/dev/tasks/python-wheels/osx-build.sh
index c896ef5..ffaf076 100755
--- a/dev/tasks/python-wheels/osx-build.sh
+++ b/dev/tasks/python-wheels/osx-build.sh
@@ -124,6 +124,7 @@
-DARROW_BUILD_TESTS=OFF \
-DARROW_DATASET=ON \
-DARROW_DEPENDENCY_SOURCE=BUNDLED \
+ -DARROW_HDFS=ON \
-DARROW_FLIGHT=ON \
-DARROW_GANDIVA=${BUILD_ARROW_GANDIVA} \
-DARROW_JEMALLOC=ON \
@@ -166,6 +167,7 @@
export PYARROW_WITH_DATASET=1
export PYARROW_WITH_FLIGHT=1
+ export PYARROW_WITH_HDFS=1
export PYARROW_WITH_PLASMA=1
export PYARROW_WITH_PARQUET=1
export PYARROW_WITH_ORC=0
@@ -220,6 +222,8 @@
import pyarrow
import pyarrow.parquet
import pyarrow.plasma
+import pyarrow.fs
+import pyarrow._hdfs
if sys.version_info.major > 2:
import pyarrow.dataset
diff --git a/python/manylinux1/build_arrow.sh b/python/manylinux1/build_arrow.sh
index bdc303f..52825fd 100755
--- a/python/manylinux1/build_arrow.sh
+++ b/python/manylinux1/build_arrow.sh
@@ -45,7 +45,7 @@
# ARROW-6860: Disabling ORC in wheels until Protobuf static linking issues
# across projects is resolved
export PYARROW_WITH_ORC=0
-
+export PYARROW_WITH_HDFS=1
export PYARROW_WITH_PARQUET=1
export PYARROW_WITH_PLASMA=1
export PYARROW_BUNDLE_ARROW_CPP=1
@@ -97,36 +97,38 @@
ARROW_BUILD_DIR=/tmp/build-PY${PYTHON_VERSION}-${UNICODE_WIDTH}
mkdir -p "${ARROW_BUILD_DIR}"
pushd "${ARROW_BUILD_DIR}"
-cmake -DCMAKE_BUILD_TYPE=Release \
- -DARROW_DEPENDENCY_SOURCE="SYSTEM" \
- -DCMAKE_INSTALL_PREFIX=/arrow-dist \
- -DCMAKE_INSTALL_LIBDIR=lib \
- -DARROW_BUILD_TESTS=OFF \
- -DARROW_BUILD_SHARED=ON \
+cmake \
+ -DCMAKE_BUILD_TYPE=Release \
-DARROW_BOOST_USE_SHARED=ON \
+ -DARROW_BUILD_SHARED=ON \
+ -DARROW_BUILD_TESTS=OFF \
+ -DARROW_DATASET=${BUILD_ARROW_DATASET} \
+ -DARROW_DEPENDENCY_SOURCE="SYSTEM" \
+ -DARROW_FLIGHT=${BUILD_ARROW_FLIGHT} \
+ -DARROW_GANDIVA_JAVA=OFF \
-DARROW_GANDIVA_PC_CXX_FLAGS="-isystem;/opt/rh/devtoolset-2/root/usr/include/c++/4.8.2;-isystem;/opt/rh/devtoolset-2/root/usr/include/c++/4.8.2/x86_64-CentOS-linux/" \
+ -DARROW_GANDIVA=${BUILD_ARROW_GANDIVA} \
+ -DARROW_HDFS=ON \
-DARROW_JEMALLOC=ON \
- -DARROW_RPATH_ORIGIN=ON \
- -DARROW_PYTHON=ON \
- -DARROW_PARQUET=ON \
- -DPythonInterp_FIND_VERSION=${PYTHON_VERSION} \
- -DARROW_PLASMA=ON \
- -DARROW_TENSORFLOW=ON \
-DARROW_ORC=OFF \
- -DORC_SOURCE=BUNDLED \
+ -DARROW_PARQUET=ON \
+ -DARROW_PLASMA=ON \
+ -DARROW_PYTHON=ON \
+ -DARROW_RPATH_ORIGIN=ON \
+ -DARROW_TENSORFLOW=ON \
+ -DARROW_WITH_BROTLI=ON \
-DARROW_WITH_BZ2=ON \
- -DARROW_WITH_ZLIB=ON \
- -DARROW_WITH_ZSTD=ON \
-DARROW_WITH_LZ4=ON \
-DARROW_WITH_SNAPPY=ON \
- -DARROW_WITH_BROTLI=ON \
- -DARROW_DATASET=${BUILD_ARROW_DATASET} \
- -DARROW_FLIGHT=${BUILD_ARROW_FLIGHT} \
- -DARROW_GANDIVA=${BUILD_ARROW_GANDIVA} \
- -DARROW_GANDIVA_JAVA=OFF \
+ -DARROW_WITH_ZLIB=ON \
+ -DARROW_WITH_ZSTD=ON \
-DBoost_NAMESPACE=arrow_boost \
-DBOOST_ROOT=/arrow_boost_dist \
+ -DCMAKE_INSTALL_LIBDIR=lib \
+ -DCMAKE_INSTALL_PREFIX=/arrow-dist \
-DOPENSSL_USE_STATIC_LIBS=ON \
+ -DORC_SOURCE=BUNDLED \
+ -DPythonInterp_FIND_VERSION=${PYTHON_VERSION} \
-GNinja /arrow/cpp
ninja
ninja install
@@ -164,6 +166,8 @@
import pyarrow
import pyarrow.parquet
import pyarrow.plasma
+import pyarrow.fs
+import pyarrow._hdfs
if sys.version_info.major > 2:
import pyarrow.dataset
diff --git a/python/manylinux201x/build_arrow.sh b/python/manylinux201x/build_arrow.sh
index 661dac1..7ac9e98 100755
--- a/python/manylinux201x/build_arrow.sh
+++ b/python/manylinux201x/build_arrow.sh
@@ -46,7 +46,7 @@
# ARROW-6860: Disabling ORC in wheels until Protobuf static linking issues
# across projects is resolved
export PYARROW_WITH_ORC=0
-
+export PYARROW_WITH_HDFS=1
export PYARROW_WITH_PARQUET=1
export PYARROW_WITH_PLASMA=1
export PYARROW_BUNDLE_ARROW_CPP=1
@@ -98,38 +98,40 @@
ARROW_BUILD_DIR=/tmp/build-PY${PYTHON_VERSION}-${UNICODE_WIDTH}
mkdir -p "${ARROW_BUILD_DIR}"
pushd "${ARROW_BUILD_DIR}"
-PATH="${CPYTHON_PATH}/bin:${PATH}" cmake -DCMAKE_BUILD_TYPE=Release \
- -DARROW_DEPENDENCY_SOURCE="SYSTEM" \
- -DZLIB_ROOT=/usr/local \
- -DCMAKE_INSTALL_PREFIX=/arrow-dist \
- -DCMAKE_INSTALL_LIBDIR=lib \
- -DARROW_BUILD_TESTS=OFF \
+PATH="${CPYTHON_PATH}/bin:${PATH}" cmake \
+ -DARROW_BOOST_USE_SHARED=ON \
-DARROW_BUILD_SHARED=ON \
-DARROW_BUILD_STATIC=OFF \
- -DARROW_BOOST_USE_SHARED=ON \
+ -DARROW_BUILD_TESTS=OFF \
+ -DARROW_DATASET=${BUILD_ARROW_DATASET} \
+ -DARROW_DEPENDENCY_SOURCE="SYSTEM" \
+ -DARROW_FLIGHT=${BUILD_ARROW_FLIGHT} \
+ -DARROW_GANDIVA_JAVA=OFF \
-DARROW_GANDIVA_PC_CXX_FLAGS="-isystem;/opt/rh/devtoolset-8/root/usr/include/c++/8/;-isystem;/opt/rh/devtoolset-8/root/usr/include/c++/8/x86_64-redhat-linux/" \
+ -DARROW_GANDIVA=${BUILD_ARROW_GANDIVA} \
+ -DARROW_HDFS=ON \
-DARROW_JEMALLOC=ON \
- -DARROW_RPATH_ORIGIN=ON \
- -DARROW_PYTHON=ON \
- -DARROW_PARQUET=ON \
- -DPythonInterp_FIND_VERSION=${PYTHON_VERSION} \
- -DARROW_PLASMA=ON \
- -DARROW_TENSORFLOW=ON \
-DARROW_ORC=OFF \
- -DORC_SOURCE=BUNDLED \
+ -DARROW_PARQUET=ON \
+ -DARROW_PLASMA=ON \
+ -DARROW_PYTHON=ON \
+ -DARROW_RPATH_ORIGIN=ON \
+ -DARROW_TENSORFLOW=ON \
+ -DARROW_WITH_BROTLI=ON \
-DARROW_WITH_BZ2=ON \
- -DARROW_WITH_ZLIB=ON \
- -DARROW_WITH_ZSTD=ON \
-DARROW_WITH_LZ4=ON \
-DARROW_WITH_SNAPPY=ON \
- -DARROW_WITH_BROTLI=ON \
- -DARROW_DATASET=${BUILD_ARROW_DATASET} \
- -DARROW_FLIGHT=${BUILD_ARROW_FLIGHT} \
- -DARROW_GANDIVA=${BUILD_ARROW_GANDIVA} \
- -DARROW_GANDIVA_JAVA=OFF \
+ -DARROW_WITH_ZLIB=ON \
+ -DARROW_WITH_ZSTD=ON \
-DBoost_NAMESPACE=arrow_boost \
-DBOOST_ROOT=/arrow_boost_dist \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DCMAKE_INSTALL_LIBDIR=lib \
+ -DCMAKE_INSTALL_PREFIX=/arrow-dist \
-DOPENSSL_USE_STATIC_LIBS=ON \
+ -DORC_SOURCE=BUNDLED \
+ -DPythonInterp_FIND_VERSION=${PYTHON_VERSION} \
+ -DZLIB_ROOT=/usr/local \
-GNinja /arrow/cpp
ninja install
popd
@@ -162,6 +164,8 @@
import pyarrow
import pyarrow.parquet
import pyarrow.plasma
+import pyarrow.fs
+import pyarrow._hdfs
if sys.version_info.major > 2:
import pyarrow.dataset