#!/usr/bin/env python3
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import importlib.util
import glob
import os
import sys
import ctypes
from setuptools import setup
from setuptools.command.install import install
from shutil import copyfile, copytree, rmtree
from pathlib import Path
if (
    # When we package, the current working directory is the 'classic' dir
    # (e.g. when we pip install -e python/packaging/classic)
os.getcwd() == str(Path(__file__).parent.absolute())
and str(Path(__file__).parent.name) == "classic"
):
    # This branch is executed for:
    # - pip install -e python/packaging/classic
    #   (pip moves the current working directory to 'classic')
    # - cd python/packaging/classic; python setup.py sdist
    #
    # It is not executed for:
    # - python packaging/classic/setup.py sdist
    #
    # Move to spark/python
os.chdir(Path(__file__).parent.parent.parent.absolute())
try:
exec(open("pyspark/version.py").read())
except IOError:
print(
"Failed to load PySpark version file for packaging. You must be in Spark's python dir.",
file=sys.stderr,
)
sys.exit(-1)
try:
spec = importlib.util.spec_from_file_location("install", "pyspark/install.py")
install_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(install_module)
except IOError:
print(
"Failed to load the installing module (pyspark/install.py) which had to be "
"packaged together.",
file=sys.stderr,
)
sys.exit(-1)
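# `__version__` is defined by exec-ing pyspark/version.py above.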
VERSION = __version__ # noqa
# A temporary path so we can reach above the Python project root and fetch the scripts and jars
# we need.
TEMP_PATH = "deps"
SPARK_HOME = os.path.abspath("../")
# Provide guidance about how to use setup.py
incorrect_invocation_message = """
If you are installing pyspark from spark source, you must first build Spark and
run sdist.
To build Spark with maven you can run:
./build/mvn -DskipTests clean package
Building the source dist is done in the Python directory:
cd python
python packaging/classic/setup.py sdist
pip install dist/*.tar.gz"""
# Figure out where the jars we need to package with PySpark are located.
JARS_PATH = glob.glob(os.path.join(SPARK_HOME, "assembly/target/scala-*/jars/"))
if len(JARS_PATH) == 1:
JARS_PATH = JARS_PATH[0]
elif os.path.isfile("../RELEASE") and len(glob.glob("../jars/spark*core*.jar")) == 1:
# Release mode puts the jars in a jars directory
JARS_PATH = os.path.join(SPARK_HOME, "jars")
elif len(JARS_PATH) > 1:
print(
"Assembly jars exist for multiple scalas ({0}), please cleanup assembly/target".format(
JARS_PATH
),
file=sys.stderr,
)
sys.exit(-1)
elif len(JARS_PATH) == 0 and not os.path.exists(TEMP_PATH):
print(incorrect_invocation_message, file=sys.stderr)
sys.exit(-1)
EXAMPLES_PATH = os.path.join(SPARK_HOME, "examples/src/main/python")
SCRIPTS_PATH = os.path.join(SPARK_HOME, "bin")
USER_SCRIPTS_PATH = os.path.join(SPARK_HOME, "sbin")
DATA_PATH = os.path.join(SPARK_HOME, "data")
LICENSES_PATH = os.path.join(SPARK_HOME, "licenses")
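# Targets under the temporary 'deps' directory, symlinked or copied from SPARK_HOME below.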
SCRIPTS_TARGET = os.path.join(TEMP_PATH, "bin")
USER_SCRIPTS_TARGET = os.path.join(TEMP_PATH, "sbin")
JARS_TARGET = os.path.join(TEMP_PATH, "jars")
EXAMPLES_TARGET = os.path.join(TEMP_PATH, "examples")
DATA_TARGET = os.path.join(TEMP_PATH, "data")
LICENSES_TARGET = os.path.join(TEMP_PATH, "licenses")
# Check and see if we are under the Spark path, in which case we need to build the symlink farm.
# This is important because we only want to build the symlink farm while under Spark; otherwise we
# want to use the existing symlink farm. If the symlink farm already exists while we are under
# Spark (e.g. a partially built sdist), we should error and have the user sort it out.
in_spark = os.path.isfile("../core/src/main/scala/org/apache/spark/SparkContext.scala") or (
os.path.isfile("../RELEASE") and len(glob.glob("../jars/spark*core*.jar")) == 1
)
def _supports_symlinks():
"""Check if the system supports symlinks (e.g. *nix) or not."""
return hasattr(os, "symlink") and (
(not hasattr(ctypes, "windll")) # Non-Windows
or (
            # On some Windows systems, `os.symlink` works but only for administrators.
hasattr(ctypes.windll, "shell32")
and hasattr(ctypes.windll.shell32, "IsUserAnAdmin")
and bool(ctypes.windll.shell32.IsUserAnAdmin())
)
)
if in_spark:
# Construct links for setup
try:
os.mkdir(TEMP_PATH)
except BaseException:
print(
"Temp path for symlink to parent already exists {0}".format(TEMP_PATH), file=sys.stderr
)
sys.exit(-1)
# If you are changing the versions here, please also change ./python/pyspark/sql/pandas/utils.py
# For Arrow, you should also check ./pom.xml and ensure there are no breaking changes in the
# binary format protocol with the Java version, see ARROW_HOME/format/* for specifications.
# Also don't forget to update python/docs/source/getting_started/install.rst,
# python/packaging/client/setup.py, and python/packaging/connect/setup.py
_minimum_pandas_version = "2.2.0"
_minimum_numpy_version = "1.21"
_minimum_pyarrow_version = "15.0.0"
_minimum_grpc_version = "1.67.0"
_minimum_googleapis_common_protos_version = "1.65.0"
_minimum_pyyaml_version = "3.11"
class InstallCommand(install):
# TODO(SPARK-32837) leverage pip's custom options
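    # Runs the standard install and then, if PYSPARK_HADOOP_VERSION or PYSPARK_HIVE_VERSION is
    # set, downloads a matching Spark distribution into pyspark/spark-distribution.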
def run(self):
install.run(self)
# Make sure the destination is always clean.
spark_dist = os.path.join(self.install_lib, "pyspark", "spark-distribution")
rmtree(spark_dist, ignore_errors=True)
if ("PYSPARK_HADOOP_VERSION" in os.environ) or ("PYSPARK_HIVE_VERSION" in os.environ):
            # Note that the PYSPARK_VERSION environment variable is only for testing purposes.
            # The PYSPARK_HIVE_VERSION environment variable is also internal for now, in case
            # we support another version of Hive in the future.
spark_version, hadoop_version, hive_version = install_module.checked_versions(
os.environ.get("PYSPARK_VERSION", VERSION).lower(),
os.environ.get("PYSPARK_HADOOP_VERSION", install_module.DEFAULT_HADOOP).lower(),
os.environ.get("PYSPARK_HIVE_VERSION", install_module.DEFAULT_HIVE).lower(),
)
if "PYSPARK_VERSION" not in os.environ and (
(install_module.DEFAULT_HADOOP, install_module.DEFAULT_HIVE)
== (hadoop_version, hive_version)
):
                # Do not download and install if they are the same as the defaults.
return
install_module.install_spark(
dest=spark_dist,
spark_version=spark_version,
hadoop_version=hadoop_version,
hive_version=hive_version,
)
try:
    # We copy the shell script to be under pyspark/python/pyspark so that the launcher scripts
    # find it where expected. The rest of the files aren't copied because they are accessed
    # via Python imports instead, which are resolved correctly.
try:
os.makedirs("pyspark/python/pyspark")
except OSError:
# Don't worry if the directory already exists.
pass
copyfile("pyspark/shell.py", "pyspark/python/pyspark/shell.py")
if in_spark:
        # !!HACK ALERT!!
        # `setup.py` has to be located in the same directory as the package.
        # Therefore, we copy the current file and place it in the `spark/python` directory,
        # then remove it at the end.
copyfile("packaging/classic/setup.py", "setup.py")
copyfile("packaging/classic/setup.cfg", "setup.cfg")
        # Construct the symlink farm - this is necessary since we can't refer to
        # the path above the package root, and we need to copy the jars and scripts
        # which are above the Python root.
if _supports_symlinks():
os.symlink(JARS_PATH, JARS_TARGET)
os.symlink(SCRIPTS_PATH, SCRIPTS_TARGET)
os.symlink(USER_SCRIPTS_PATH, USER_SCRIPTS_TARGET)
os.symlink(EXAMPLES_PATH, EXAMPLES_TARGET)
os.symlink(DATA_PATH, DATA_TARGET)
os.symlink(LICENSES_PATH, LICENSES_TARGET)
else:
            # For Windows, fall back to the slower copytree
copytree(JARS_PATH, JARS_TARGET)
copytree(SCRIPTS_PATH, SCRIPTS_TARGET)
copytree(USER_SCRIPTS_PATH, USER_SCRIPTS_TARGET)
copytree(EXAMPLES_PATH, EXAMPLES_TARGET)
copytree(DATA_PATH, DATA_TARGET)
copytree(LICENSES_PATH, LICENSES_TARGET)
else:
        # If we are not inside SPARK_HOME, verify that we have the required symlink farm
if not os.path.exists(JARS_TARGET):
print(
"To build packaging must be in the python directory under the SPARK_HOME.",
file=sys.stderr,
)
if not os.path.isdir(SCRIPTS_TARGET):
print(incorrect_invocation_message, file=sys.stderr)
sys.exit(-1)
    # The scripts directive requires a list of script paths and does not accept wildcards.
script_names = os.listdir(SCRIPTS_TARGET)
scripts = list(map(lambda script: os.path.join(SCRIPTS_TARGET, script), script_names))
    # We add find_spark_home.py to the bin directory we install so that a pip-installed PySpark
    # can locate SPARK_HOME from Python.
scripts.append("pyspark/find_spark_home.py")
with open("README.md") as f:
long_description = f.read()
setup(
name="pyspark",
version=VERSION,
description="Apache Spark Python API",
long_description=long_description,
long_description_content_type="text/markdown",
author="Spark Developers",
author_email="dev@spark.apache.org",
url="https://github.com/apache/spark/tree/master/python",
packages=[
"pyspark",
"pyspark.core",
"pyspark.cloudpickle",
"pyspark.mllib",
"pyspark.mllib.linalg",
"pyspark.mllib.stat",
"pyspark.ml",
"pyspark.ml.connect",
"pyspark.ml.linalg",
"pyspark.ml.param",
"pyspark.ml.torch",
"pyspark.ml.deepspeed",
"pyspark.sql",
"pyspark.sql.avro",
"pyspark.sql.classic",
"pyspark.sql.connect",
"pyspark.sql.connect.avro",
"pyspark.sql.connect.client",
"pyspark.sql.connect.functions",
"pyspark.sql.connect.proto",
"pyspark.sql.connect.protobuf",
"pyspark.sql.connect.resource",
"pyspark.sql.connect.shell",
"pyspark.sql.connect.streaming",
"pyspark.sql.connect.streaming.worker",
"pyspark.sql.functions",
"pyspark.sql.pandas",
"pyspark.sql.plot",
"pyspark.sql.protobuf",
"pyspark.sql.streaming",
"pyspark.sql.streaming.proto",
"pyspark.sql.worker",
"pyspark.streaming",
"pyspark.bin",
"pyspark.sbin",
"pyspark.jars",
"pyspark.pandas",
"pyspark.pandas.data_type_ops",
"pyspark.pandas.indexes",
"pyspark.pandas.missing",
"pyspark.pandas.plot",
"pyspark.pandas.spark",
"pyspark.pandas.typedef",
"pyspark.pandas.usage_logging",
"pyspark.pipelines",
"pyspark.python.pyspark",
"pyspark.python.lib",
"pyspark.testing",
"pyspark.data",
"pyspark.licenses",
"pyspark.resource",
"pyspark.errors",
"pyspark.errors.exceptions",
"pyspark.examples.src.main.python",
"pyspark.logger",
],
include_package_data=True,
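        # Map the bundled jars, scripts, examples, data, and licenses onto the symlink farm
        # under 'deps'.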
package_dir={
"pyspark.jars": "deps/jars",
"pyspark.bin": "deps/bin",
"pyspark.sbin": "deps/sbin",
"pyspark.python.lib": "lib",
"pyspark.data": "deps/data",
"pyspark.licenses": "deps/licenses",
"pyspark.examples.src.main.python": "deps/examples",
},
package_data={
"pyspark.jars": ["*.jar"],
"pyspark.bin": ["*"],
"pyspark.sbin": [
"spark-config.sh",
"spark-daemon.sh",
"start-history-server.sh",
"stop-history-server.sh",
],
"pyspark.python.lib": ["*.zip"],
"pyspark.data": ["*.txt", "*.data"],
"pyspark.licenses": ["*.txt"],
"pyspark.examples.src.main.python": ["*.py", "*/*.py"],
},
scripts=scripts,
license="Apache-2.0",
# Don't forget to update python/docs/source/getting_started/install.rst
# if you're updating the versions or dependencies.
install_requires=["py4j==0.10.9.9"],
extras_require={
"ml": ["numpy>=%s" % _minimum_numpy_version],
"mllib": ["numpy>=%s" % _minimum_numpy_version],
"sql": [
"pandas>=%s" % _minimum_pandas_version,
"pyarrow>=%s" % _minimum_pyarrow_version,
"numpy>=%s" % _minimum_numpy_version,
],
"pandas_on_spark": [
"pandas>=%s" % _minimum_pandas_version,
"pyarrow>=%s" % _minimum_pyarrow_version,
"numpy>=%s" % _minimum_numpy_version,
],
"connect": [
"pandas>=%s" % _minimum_pandas_version,
"pyarrow>=%s" % _minimum_pyarrow_version,
"grpcio>=%s" % _minimum_grpc_version,
"grpcio-status>=%s" % _minimum_grpc_version,
"googleapis-common-protos>=%s" % _minimum_googleapis_common_protos_version,
"numpy>=%s" % _minimum_numpy_version,
],
"pipelines": [
"pandas>=%s" % _minimum_pandas_version,
"pyarrow>=%s" % _minimum_pyarrow_version,
"numpy>=%s" % _minimum_numpy_version,
"grpcio>=%s" % _minimum_grpc_version,
"grpcio-status>=%s" % _minimum_grpc_version,
"googleapis-common-protos>=%s" % _minimum_googleapis_common_protos_version,
"pyyaml>=%s" % _minimum_pyyaml_version,
],
},
python_requires=">=3.10",
classifiers=[
"Development Status :: 5 - Production/Stable",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
"Typing :: Typed",
],
cmdclass={
"install": InstallCommand,
},
)
finally:
    # We only clean up the symlink farm if we were in Spark; otherwise we are installing rather
    # than packaging.
if in_spark:
os.remove("setup.py")
os.remove("setup.cfg")
        # Clean up either the symlink farm or the copied version, depending on what was created.
if _supports_symlinks():
os.remove(os.path.join(TEMP_PATH, "jars"))
os.remove(os.path.join(TEMP_PATH, "bin"))
os.remove(os.path.join(TEMP_PATH, "sbin"))
os.remove(os.path.join(TEMP_PATH, "examples"))
os.remove(os.path.join(TEMP_PATH, "data"))
os.remove(os.path.join(TEMP_PATH, "licenses"))
else:
rmtree(os.path.join(TEMP_PATH, "jars"))
rmtree(os.path.join(TEMP_PATH, "bin"))
rmtree(os.path.join(TEMP_PATH, "sbin"))
rmtree(os.path.join(TEMP_PATH, "examples"))
rmtree(os.path.join(TEMP_PATH, "data"))
rmtree(os.path.join(TEMP_PATH, "licenses"))
os.rmdir(TEMP_PATH)