| #!/usr/bin/env python3 | 
 |  | 
 | # | 
 | # Licensed to the Apache Software Foundation (ASF) under one or more | 
 | # contributor license agreements.  See the NOTICE file distributed with | 
 | # this work for additional information regarding copyright ownership. | 
 | # The ASF licenses this file to You under the Apache License, Version 2.0 | 
 | # (the "License"); you may not use this file except in compliance with | 
 | # the License.  You may obtain a copy of the License at | 
 | # | 
 | #    http://www.apache.org/licenses/LICENSE-2.0 | 
 | # | 
 | # Unless required by applicable law or agreed to in writing, software | 
 | # distributed under the License is distributed on an "AS IS" BASIS, | 
 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
 | # See the License for the specific language governing permissions and | 
 | # limitations under the License. | 
 |  | 
 | import importlib.util | 
 | import glob | 
 | import os | 
 | import sys | 
 | import ctypes | 
 | from setuptools import setup | 
 | from setuptools.command.install import install | 
 | from shutil import copyfile, copytree, rmtree | 
 | from pathlib import Path | 
 |  | 
 | if ( | 
    # When we package, the current working directory is this file's parent 'classic' directory
    # (e.g., when we pip install -e python/packaging/classic).
 |     os.getcwd() == str(Path(__file__).parent.absolute()) | 
 |     and str(Path(__file__).parent.name) == "classic" | 
 | ): | 
    # This branch executes for:
    # - pip install -e python/packaging/classic
    #     (pip moves the current working directory to 'classic')
    # - cd python/packaging/classic; python setup.py sdist
    #
    # It does not execute for:
    # - python packaging/classic/setup.py sdist
    #
    # Move to spark/python.
 |     os.chdir(Path(__file__).parent.parent.parent.absolute()) | 
 |  | 
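# From this point on, the current working directory is assumed to be spark/python; all relative
# paths below (e.g. pyspark/version.py, ../, deps) rely on that.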
 | try: | 
    with open("pyspark/version.py") as f:
        exec(f.read())
 | except IOError: | 
 |     print( | 
 |         "Failed to load PySpark version file for packaging. You must be in Spark's python dir.", | 
 |         file=sys.stderr, | 
 |     ) | 
 |     sys.exit(-1) | 
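# Load pyspark/install.py dynamically; it is shipped inside the package and provides the
# checked_versions() and install_spark() helpers used by the custom install command below.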
 | try: | 
 |     spec = importlib.util.spec_from_file_location("install", "pyspark/install.py") | 
 |     install_module = importlib.util.module_from_spec(spec) | 
 |     spec.loader.exec_module(install_module) | 
 | except IOError: | 
    print(
        "Failed to load the install module (pyspark/install.py), which must be "
        "packaged together with PySpark.",
        file=sys.stderr,
    )
 |     sys.exit(-1) | 
 | VERSION = __version__  # noqa | 
# A temporary path so we can reach above the Python project root and fetch the scripts and jars
# we need.
 | TEMP_PATH = "deps" | 
 | SPARK_HOME = os.path.abspath("../") | 
 |  | 
 | # Provide guidance about how to use setup.py | 
 | incorrect_invocation_message = """ | 
If you are installing PySpark from the Spark source, you must first build Spark and
 | run sdist. | 
 |  | 
 |     To build Spark with maven you can run: | 
 |       ./build/mvn -DskipTests clean package | 
 |     Building the source dist is done in the Python directory: | 
 |       cd python | 
 |       python packaging/classic/setup.py sdist | 
 |       pip install dist/*.tar.gz""" | 
 |  | 
# Figure out where the jars we need to package with PySpark are located.
 | JARS_PATH = glob.glob(os.path.join(SPARK_HOME, "assembly/target/scala-*/jars/")) | 
 |  | 
 | if len(JARS_PATH) == 1: | 
 |     JARS_PATH = JARS_PATH[0] | 
 | elif os.path.isfile("../RELEASE") and len(glob.glob("../jars/spark*core*.jar")) == 1: | 
 |     # Release mode puts the jars in a jars directory | 
 |     JARS_PATH = os.path.join(SPARK_HOME, "jars") | 
 | elif len(JARS_PATH) > 1: | 
    print(
        "Assembly jars exist for multiple Scala versions ({0}); "
        "please clean up assembly/target".format(JARS_PATH),
        file=sys.stderr,
    )
 |     sys.exit(-1) | 
 | elif len(JARS_PATH) == 0 and not os.path.exists(TEMP_PATH): | 
 |     print(incorrect_invocation_message, file=sys.stderr) | 
 |     sys.exit(-1) | 
 |  | 
 | EXAMPLES_PATH = os.path.join(SPARK_HOME, "examples/src/main/python") | 
 | SCRIPTS_PATH = os.path.join(SPARK_HOME, "bin") | 
 | USER_SCRIPTS_PATH = os.path.join(SPARK_HOME, "sbin") | 
 | DATA_PATH = os.path.join(SPARK_HOME, "data") | 
 | LICENSES_PATH = os.path.join(SPARK_HOME, "licenses") | 
 |  | 
 | SCRIPTS_TARGET = os.path.join(TEMP_PATH, "bin") | 
 | USER_SCRIPTS_TARGET = os.path.join(TEMP_PATH, "sbin") | 
 | JARS_TARGET = os.path.join(TEMP_PATH, "jars") | 
 | EXAMPLES_TARGET = os.path.join(TEMP_PATH, "examples") | 
 | DATA_TARGET = os.path.join(TEMP_PATH, "data") | 
 | LICENSES_TARGET = os.path.join(TEMP_PATH, "licenses") | 
 |  | 
# Check whether we are under the Spark source tree, in which case we need to build the symlink
# farm. This matters because we only want to build the symlink farm while under Spark; otherwise
# we want to use an existing one. And if the symlink farm already exists while we are under Spark
# (e.g. a partially built sdist), we should error and have the user sort it out.
 | in_spark = os.path.isfile("../core/src/main/scala/org/apache/spark/SparkContext.scala") or ( | 
 |     os.path.isfile("../RELEASE") and len(glob.glob("../jars/spark*core*.jar")) == 1 | 
 | ) | 
 |  | 
 |  | 
 | def _supports_symlinks(): | 
 |     """Check if the system supports symlinks (e.g. *nix) or not.""" | 
 |     return hasattr(os, "symlink") and ( | 
 |         (not hasattr(ctypes, "windll"))  # Non-Windows | 
 |         or ( | 
            # On some Windows versions, `os.symlink` works but only for administrators.
 |             hasattr(ctypes.windll, "shell32") | 
 |             and hasattr(ctypes.windll.shell32, "IsUserAnAdmin") | 
 |             and bool(ctypes.windll.shell32.IsUserAnAdmin()) | 
 |         ) | 
 |     ) | 
 |  | 
 |  | 
 | if in_spark: | 
 |     # Construct links for setup | 
 |     try: | 
 |         os.mkdir(TEMP_PATH) | 
 |     except BaseException: | 
        print(
            "Temp path for symlinks to parent already exists: {0}".format(TEMP_PATH),
            file=sys.stderr,
        )
 |         sys.exit(-1) | 
 |  | 
 | # If you are changing the versions here, please also change ./python/pyspark/sql/pandas/utils.py | 
 | # For Arrow, you should also check ./pom.xml and ensure there are no breaking changes in the | 
 | # binary format protocol with the Java version, see ARROW_HOME/format/* for specifications. | 
 | # Also don't forget to update python/docs/source/getting_started/install.rst, and | 
 | # python/packaging/connect/setup.py | 
 | _minimum_pandas_version = "2.0.0" | 
 | _minimum_numpy_version = "1.21" | 
 | _minimum_pyarrow_version = "10.0.0" | 
 | _minimum_grpc_version = "1.62.0" | 
 | _minimum_googleapis_common_protos_version = "1.56.4" | 
 |  | 
 |  | 
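# Custom `install` command: after the standard installation, optionally download a Spark
# distribution into pyspark/spark-distribution when PYSPARK_HADOOP_VERSION and/or
# PYSPARK_HIVE_VERSION are set. A hedged usage sketch (valid values are defined in
# pyspark/install.py):
#
#     PYSPARK_HADOOP_VERSION=3 pip install pyspark-<version>.tar.gz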
 | class InstallCommand(install): | 
 |     # TODO(SPARK-32837) leverage pip's custom options | 
 |  | 
 |     def run(self): | 
 |         install.run(self) | 
 |  | 
 |         # Make sure the destination is always clean. | 
 |         spark_dist = os.path.join(self.install_lib, "pyspark", "spark-distribution") | 
 |         rmtree(spark_dist, ignore_errors=True) | 
 |  | 
 |         if ("PYSPARK_HADOOP_VERSION" in os.environ) or ("PYSPARK_HIVE_VERSION" in os.environ): | 
            # Note that the PYSPARK_VERSION environment variable is for testing purposes only.
            # The PYSPARK_HIVE_VERSION environment variable is also internal for now, in case
            # we support another version of Hive in the future.
 |             spark_version, hadoop_version, hive_version = install_module.checked_versions( | 
 |                 os.environ.get("PYSPARK_VERSION", VERSION).lower(), | 
 |                 os.environ.get("PYSPARK_HADOOP_VERSION", install_module.DEFAULT_HADOOP).lower(), | 
 |                 os.environ.get("PYSPARK_HIVE_VERSION", install_module.DEFAULT_HIVE).lower(), | 
 |             ) | 
 |  | 
 |             if "PYSPARK_VERSION" not in os.environ and ( | 
 |                 (install_module.DEFAULT_HADOOP, install_module.DEFAULT_HIVE) | 
 |                 == (hadoop_version, hive_version) | 
 |             ): | 
                # Do not download and install if they are the same as the defaults.
 |                 return | 
 |  | 
 |             install_module.install_spark( | 
 |                 dest=spark_dist, | 
 |                 spark_version=spark_version, | 
 |                 hadoop_version=hadoop_version, | 
 |                 hive_version=hive_version, | 
 |             ) | 
 |  | 
 |  | 
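# Packaging flow: build the symlink farm (or copy the trees on systems without symlink support),
# run setup(), and then clean everything up in the `finally` block at the end.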
 | try: | 
    # We copy the shell script to be under pyspark/python/pyspark so that the launcher scripts
    # find it where expected. The rest of the files aren't copied because they are accessed
    # via Python imports instead, which are resolved correctly.
 |     try: | 
 |         os.makedirs("pyspark/python/pyspark") | 
 |     except OSError: | 
 |         # Don't worry if the directory already exists. | 
 |         pass | 
 |     copyfile("pyspark/shell.py", "pyspark/python/pyspark/shell.py") | 
 |  | 
 |     if in_spark: | 
        # !!HACK ALERT!!
        # `setup.py` has to be located in the same directory as the package.
        # Therefore, we copy this file into the `spark/python` directory,
        # and remove it at the end.
 |         copyfile("packaging/classic/setup.py", "setup.py") | 
 |         copyfile("packaging/classic/setup.cfg", "setup.cfg") | 
 |  | 
        # Construct the symlink farm - this is necessary since we can't refer to
        # the path above the package root, and we need the jars and scripts which
        # live above the python root.
 |         if _supports_symlinks(): | 
 |             os.symlink(JARS_PATH, JARS_TARGET) | 
 |             os.symlink(SCRIPTS_PATH, SCRIPTS_TARGET) | 
 |             os.symlink(USER_SCRIPTS_PATH, USER_SCRIPTS_TARGET) | 
 |             os.symlink(EXAMPLES_PATH, EXAMPLES_TARGET) | 
 |             os.symlink(DATA_PATH, DATA_TARGET) | 
 |             os.symlink(LICENSES_PATH, LICENSES_TARGET) | 
 |         else: | 
            # For Windows, fall back to the slower copytree
 |             copytree(JARS_PATH, JARS_TARGET) | 
 |             copytree(SCRIPTS_PATH, SCRIPTS_TARGET) | 
 |             copytree(USER_SCRIPTS_PATH, USER_SCRIPTS_TARGET) | 
 |             copytree(EXAMPLES_PATH, EXAMPLES_TARGET) | 
 |             copytree(DATA_PATH, DATA_TARGET) | 
 |             copytree(LICENSES_PATH, LICENSES_TARGET) | 
 |     else: | 
        # If we are not inside SPARK_HOME, verify we have the required symlink farm.
 |         if not os.path.exists(JARS_TARGET): | 
            print(
                "To build the packaging, you must be in the python directory under SPARK_HOME.",
                file=sys.stderr,
            )
 |  | 
 |     if not os.path.isdir(SCRIPTS_TARGET): | 
 |         print(incorrect_invocation_message, file=sys.stderr) | 
 |         sys.exit(-1) | 
 |  | 
    # The scripts directive requires a list of each script path and does not take wildcards.
 |     script_names = os.listdir(SCRIPTS_TARGET) | 
 |     scripts = list(map(lambda script: os.path.join(SCRIPTS_TARGET, script), script_names)) | 
    # We add find_spark_home.py to the bin directory we install so that pip-installed PySpark
    # can locate SPARK_HOME from Python.
 |     scripts.append("pyspark/find_spark_home.py") | 
 |  | 
 |     with open("README.md") as f: | 
 |         long_description = f.read() | 
 |  | 
 |     setup( | 
 |         name="pyspark", | 
 |         version=VERSION, | 
 |         description="Apache Spark Python API", | 
 |         long_description=long_description, | 
 |         long_description_content_type="text/markdown", | 
 |         author="Spark Developers", | 
 |         author_email="dev@spark.apache.org", | 
 |         url="https://github.com/apache/spark/tree/master/python", | 
 |         packages=[ | 
 |             "pyspark", | 
 |             "pyspark.core", | 
 |             "pyspark.cloudpickle", | 
 |             "pyspark.mllib", | 
 |             "pyspark.mllib.linalg", | 
 |             "pyspark.mllib.stat", | 
 |             "pyspark.ml", | 
 |             "pyspark.ml.connect", | 
 |             "pyspark.ml.linalg", | 
 |             "pyspark.ml.param", | 
 |             "pyspark.ml.torch", | 
 |             "pyspark.ml.deepspeed", | 
 |             "pyspark.sql", | 
 |             "pyspark.sql.avro", | 
 |             "pyspark.sql.classic", | 
 |             "pyspark.sql.connect", | 
 |             "pyspark.sql.connect.avro", | 
 |             "pyspark.sql.connect.client", | 
 |             "pyspark.sql.connect.functions", | 
 |             "pyspark.sql.connect.proto", | 
 |             "pyspark.sql.connect.protobuf", | 
 |             "pyspark.sql.connect.resource", | 
 |             "pyspark.sql.connect.shell", | 
 |             "pyspark.sql.connect.streaming", | 
 |             "pyspark.sql.connect.streaming.worker", | 
 |             "pyspark.sql.functions", | 
 |             "pyspark.sql.pandas", | 
 |             "pyspark.sql.plot", | 
 |             "pyspark.sql.protobuf", | 
 |             "pyspark.sql.streaming", | 
 |             "pyspark.sql.worker", | 
 |             "pyspark.streaming", | 
 |             "pyspark.bin", | 
 |             "pyspark.sbin", | 
 |             "pyspark.jars", | 
 |             "pyspark.pandas", | 
 |             "pyspark.pandas.data_type_ops", | 
 |             "pyspark.pandas.indexes", | 
 |             "pyspark.pandas.missing", | 
 |             "pyspark.pandas.plot", | 
 |             "pyspark.pandas.spark", | 
 |             "pyspark.pandas.typedef", | 
 |             "pyspark.pandas.usage_logging", | 
 |             "pyspark.python.pyspark", | 
 |             "pyspark.python.lib", | 
 |             "pyspark.testing", | 
 |             "pyspark.data", | 
 |             "pyspark.licenses", | 
 |             "pyspark.resource", | 
 |             "pyspark.errors", | 
 |             "pyspark.errors.exceptions", | 
 |             "pyspark.examples.src.main.python", | 
 |             "pyspark.logger", | 
 |         ], | 
 |         include_package_data=True, | 
 |         package_dir={ | 
 |             "pyspark.jars": "deps/jars", | 
 |             "pyspark.bin": "deps/bin", | 
 |             "pyspark.sbin": "deps/sbin", | 
 |             "pyspark.python.lib": "lib", | 
 |             "pyspark.data": "deps/data", | 
 |             "pyspark.licenses": "deps/licenses", | 
 |             "pyspark.examples.src.main.python": "deps/examples", | 
 |         }, | 
 |         package_data={ | 
 |             "pyspark.jars": ["*.jar"], | 
 |             "pyspark.bin": ["*"], | 
 |             "pyspark.sbin": [ | 
 |                 "spark-config.sh", | 
 |                 "spark-daemon.sh", | 
 |                 "start-history-server.sh", | 
 |                 "stop-history-server.sh", | 
 |             ], | 
 |             "pyspark.python.lib": ["*.zip"], | 
 |             "pyspark.data": ["*.txt", "*.data"], | 
 |             "pyspark.licenses": ["*.txt"], | 
 |             "pyspark.examples.src.main.python": ["*.py", "*/*.py"], | 
 |         }, | 
 |         scripts=scripts, | 
 |         license="http://www.apache.org/licenses/LICENSE-2.0", | 
 |         # Don't forget to update python/docs/source/getting_started/install.rst | 
 |         # if you're updating the versions or dependencies. | 
 |         install_requires=["py4j==0.10.9.7"], | 
 |         extras_require={ | 
 |             "ml": ["numpy>=%s" % _minimum_numpy_version], | 
 |             "mllib": ["numpy>=%s" % _minimum_numpy_version], | 
 |             "sql": [ | 
 |                 "pandas>=%s" % _minimum_pandas_version, | 
 |                 "pyarrow>=%s" % _minimum_pyarrow_version, | 
 |                 "numpy>=%s" % _minimum_numpy_version, | 
 |             ], | 
 |             "pandas_on_spark": [ | 
 |                 "pandas>=%s" % _minimum_pandas_version, | 
 |                 "pyarrow>=%s" % _minimum_pyarrow_version, | 
 |                 "numpy>=%s" % _minimum_numpy_version, | 
 |             ], | 
 |             "connect": [ | 
 |                 "pandas>=%s" % _minimum_pandas_version, | 
 |                 "pyarrow>=%s" % _minimum_pyarrow_version, | 
 |                 "grpcio>=%s" % _minimum_grpc_version, | 
 |                 "grpcio-status>=%s" % _minimum_grpc_version, | 
 |                 "googleapis-common-protos>=%s" % _minimum_googleapis_common_protos_version, | 
 |                 "numpy>=%s" % _minimum_numpy_version, | 
 |             ], | 
 |         }, | 
 |         python_requires=">=3.9", | 
 |         classifiers=[ | 
 |             "Development Status :: 5 - Production/Stable", | 
 |             "License :: OSI Approved :: Apache Software License", | 
 |             "Programming Language :: Python :: 3.9", | 
 |             "Programming Language :: Python :: 3.10", | 
 |             "Programming Language :: Python :: 3.11", | 
 |             "Programming Language :: Python :: 3.12", | 
 |             "Programming Language :: Python :: Implementation :: CPython", | 
 |             "Programming Language :: Python :: Implementation :: PyPy", | 
 |             "Typing :: Typed", | 
 |         ], | 
 |         cmdclass={ | 
 |             "install": InstallCommand, | 
 |         }, | 
 |     ) | 
 | finally: | 
    # We only clean up the symlink farm if we were in Spark; otherwise we are installing rather
    # than packaging.
 |     if in_spark: | 
 |         os.remove("setup.py") | 
 |         os.remove("setup.cfg") | 
        # Clean up either the symlink farm or the copied trees, depending on how they were created.
 |         if _supports_symlinks(): | 
 |             os.remove(os.path.join(TEMP_PATH, "jars")) | 
 |             os.remove(os.path.join(TEMP_PATH, "bin")) | 
 |             os.remove(os.path.join(TEMP_PATH, "sbin")) | 
 |             os.remove(os.path.join(TEMP_PATH, "examples")) | 
 |             os.remove(os.path.join(TEMP_PATH, "data")) | 
 |             os.remove(os.path.join(TEMP_PATH, "licenses")) | 
 |         else: | 
 |             rmtree(os.path.join(TEMP_PATH, "jars")) | 
 |             rmtree(os.path.join(TEMP_PATH, "bin")) | 
 |             rmtree(os.path.join(TEMP_PATH, "sbin")) | 
 |             rmtree(os.path.join(TEMP_PATH, "examples")) | 
 |             rmtree(os.path.join(TEMP_PATH, "data")) | 
 |             rmtree(os.path.join(TEMP_PATH, "licenses")) | 
 |         os.rmdir(TEMP_PATH) |