#!/usr/bin/env python3

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# cd python
# python packaging/client/setup.py sdist

# cd python/packaging/client
# python setup.py sdist

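# Or, for an editable install during development:
# pip install -e python/packaging/client
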
import sys
from setuptools import setup
import os
from shutil import copyfile, move
import glob
from pathlib import Path

if (
    # True when the current working directory is already this file's parent
    # 'client' directory (as for `pip install -e python/packaging/client`).
    os.getcwd() == str(Path(__file__).parent.absolute())
    and str(Path(__file__).parent.name) == "client"
):
    # This branch is executed for:
    # - pip install -e python/packaging/client
    #   (pip changes the current working directory to 'client')
    # - cd python/packaging/client; python setup.py sdist
    #
    # It is not executed for:
    # - python packaging/client/setup.py sdist
    #
    # Move to spark/python so the relative paths used below resolve.
    os.chdir(Path(__file__).parent.parent.parent.absolute())

try:
    with open("pyspark/version.py") as f:
        exec(f.read())
except IOError:
    print(
        "Failed to load PySpark version file for packaging. You must be in Spark's python dir.",
        file=sys.stderr,
    )
    sys.exit(-1)
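# The `exec` above defines `__version__` in this namespace; the `# noqa` below keeps
# linters from flagging it as an undefined name.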
VERSION = __version__  # noqa

# Check and see if we are under the Spark path, in which case we need to build the symlink farm.
# This is important because we only want to build the symlink farm while under Spark; otherwise we
# want to use the existing symlink farm. And if the symlink farm already exists while under Spark
# (e.g. a partially built sdist), we should error out and have the user sort it out.
in_spark = os.path.isfile("../core/src/main/scala/org/apache/spark/SparkContext.scala") or (
    os.path.isfile("../RELEASE") and len(glob.glob("../jars/spark*core*.jar")) == 1
)

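# `in_spark` is True for a Spark source checkout (SparkContext.scala present) or an unpacked
# binary release (a RELEASE file plus exactly one spark*core* jar under ../jars).
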
test_packages = []
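# Bundle the test packages only when SPARK_TESTING is set in the environment, so regular
# installs of the client do not ship test modules.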
if "SPARK_TESTING" in os.environ:
    test_packages = [
        "pyspark.errors.tests.connect",
        "pyspark.tests",  # for Memory profiler parity tests
        "pyspark.resource.tests",
        "pyspark.sql.tests",
        "pyspark.sql.tests.arrow",
        "pyspark.sql.tests.connect",
        "pyspark.sql.tests.connect.arrow",
        "pyspark.sql.tests.connect.streaming",
        "pyspark.sql.tests.connect.client",
        "pyspark.sql.tests.connect.pandas",
        "pyspark.sql.tests.connect.shell",
        "pyspark.sql.tests.pandas",
        "pyspark.sql.tests.pandas.helper",
        "pyspark.sql.tests.plot",
        "pyspark.sql.tests.streaming",
        "pyspark.ml.tests",
        "pyspark.ml.tests.connect",
        "pyspark.pandas.tests",
        "pyspark.pandas.tests.computation",
        "pyspark.pandas.tests.data_type_ops",
        "pyspark.pandas.tests.diff_frames_ops",
        "pyspark.pandas.tests.frame",
        "pyspark.pandas.tests.groupby",
        "pyspark.pandas.tests.indexes",
        "pyspark.pandas.tests.io",
        "pyspark.pandas.tests.plot",
        "pyspark.pandas.tests.resample",
        "pyspark.pandas.tests.reshape",
        "pyspark.pandas.tests.series",
        "pyspark.pandas.tests.window",
        "pyspark.pandas.tests.connect",
        "pyspark.pandas.tests.connect.computation",
        "pyspark.pandas.tests.connect.data_type_ops",
        "pyspark.pandas.tests.connect.diff_frames_ops",
        "pyspark.pandas.tests.connect.frame",
        "pyspark.pandas.tests.connect.groupby",
        "pyspark.pandas.tests.connect.indexes",
        "pyspark.pandas.tests.connect.io",
        "pyspark.pandas.tests.connect.plot",
        "pyspark.pandas.tests.connect.resample",
        "pyspark.pandas.tests.connect.reshape",
        "pyspark.pandas.tests.connect.series",
        "pyspark.pandas.tests.connect.window",
        "pyspark.pipelines.tests",
        "pyspark.logger.tests",
        "pyspark.logger.tests.connect",
    ]

try:
    if in_spark:
        # !!HACK ALERT!!
        # 1. `setup.py` has to be located in the same directory as the package.
        #    Therefore, we copy this file into the `spark/python` directory and
        #    remove the copy at the end (see the `finally` block below).
        # 2. `lib` is renamed to `lib.back` here so MANIFEST.in does not pick `py4j` up.
        #    It is renamed back at the end.
        move("lib", "lib.back")
        copyfile("packaging/client/setup.py", "setup.py")
        copyfile("packaging/client/setup.cfg", "setup.cfg")

    # If you are changing the versions here, please also change ./python/pyspark/sql/pandas/utils.py
    # For Arrow, you should also check ./pom.xml and ensure there are no breaking changes in the
    # binary format protocol with the Java version, see ARROW_HOME/format/* for specifications.
    # Also don't forget to update python/docs/source/getting_started/install.rst,
    # python/packaging/classic/setup.py, and python/packaging/connect/setup.py
    _minimum_pandas_version = "2.2.0"
    _minimum_numpy_version = "1.21"
    _minimum_pyarrow_version = "15.0.0"
    _minimum_grpc_version = "1.67.0"
    _minimum_googleapis_common_protos_version = "1.65.0"
    _minimum_pyyaml_version = "3.11"

    with open("README.md") as f:
        long_description = f.read()

    connect_packages = [
        "pyspark",
        "pyspark.cloudpickle",
        "pyspark.mllib",
        "pyspark.mllib.linalg",
        "pyspark.mllib.stat",
        "pyspark.ml",
        "pyspark.ml.connect",
        "pyspark.ml.linalg",
        "pyspark.ml.param",
        "pyspark.ml.torch",
        "pyspark.ml.deepspeed",
        "pyspark.sql",
        "pyspark.sql.avro",
        "pyspark.sql.connect",
        "pyspark.sql.connect.avro",
        "pyspark.sql.connect.client",
        "pyspark.sql.connect.functions",
        "pyspark.sql.connect.proto",
        "pyspark.sql.connect.protobuf",
        "pyspark.sql.connect.resource",
        "pyspark.sql.connect.shell",
        "pyspark.sql.connect.streaming",
        "pyspark.sql.connect.streaming.worker",
        "pyspark.sql.functions",
        "pyspark.sql.pandas",
        "pyspark.sql.plot",
        "pyspark.sql.protobuf",
        "pyspark.sql.streaming",
        "pyspark.sql.streaming.proto",
        "pyspark.sql.worker",
        "pyspark.streaming",
        "pyspark.pandas",
        "pyspark.pandas.data_type_ops",
        "pyspark.pandas.indexes",
        "pyspark.pandas.missing",
        "pyspark.pandas.plot",
        "pyspark.pandas.spark",
        "pyspark.pandas.typedef",
        "pyspark.pandas.usage_logging",
        "pyspark.pipelines",
        "pyspark.testing",
        "pyspark.resource",
        "pyspark.errors",
        "pyspark.errors.exceptions",
        "pyspark.logger",
    ]

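    # Unlike the classic PySpark package, the client package does not bundle the py4j
    # sources under `lib` (hence the `lib` -> `lib.back` rename above).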
    setup(
        name="pyspark-client",
        version=VERSION,
        description="Python Spark Connect client for Apache Spark",
        long_description=long_description,
        long_description_content_type="text/markdown",
        author="Spark Developers",
        author_email="dev@spark.apache.org",
        url="https://github.com/apache/spark/tree/master/python",
        packages=connect_packages + test_packages,
        include_package_data=True,
        license="Apache-2.0",
        # Don't forget to update python/docs/source/getting_started/install.rst
        # if you're updating the versions or dependencies.
        install_requires=[
            "pandas>=%s" % _minimum_pandas_version,
            "pyarrow>=%s" % _minimum_pyarrow_version,
            "grpcio>=%s" % _minimum_grpc_version,
            "grpcio-status>=%s" % _minimum_grpc_version,
            "googleapis-common-protos>=%s" % _minimum_googleapis_common_protos_version,
            "numpy>=%s" % _minimum_numpy_version,
            "pyyaml>=%s" % _minimum_pyyaml_version,
        ],
        python_requires=">=3.10",
        classifiers=[
            "Development Status :: 5 - Production/Stable",
            "Programming Language :: Python :: 3.10",
            "Programming Language :: Python :: 3.11",
            "Programming Language :: Python :: 3.12",
            "Programming Language :: Python :: 3.13",
            "Programming Language :: Python :: Implementation :: CPython",
            "Programming Language :: Python :: Implementation :: PyPy",
            "Typing :: Typed",
        ],
    )
finally:
    if in_spark:
        move("lib.back", "lib")
        os.remove("setup.py")
        os.remove("setup.cfg")