| #!/usr/bin/env python |
| |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| |
| from __future__ import print_function |
| import glob |
| import os |
| import sys |
| from setuptools import setup, find_packages |
| from shutil import copyfile, copytree, rmtree |
| |
# Refuse to run on interpreters older than Python 2.7.
if sys.version_info < (2, 7):
    print("Python versions prior to 2.7 are not supported for pip installed PySpark.",
          file=sys.stderr)
    sys.exit(-1)

# Execute pyspark/version.py in this module's namespace so that __version__ becomes available.
# The file is opened relative to the current working directory, so a failure to read it is
# reported as running from the wrong directory.
try:
    # Use a context manager so the file handle is closed deterministically instead of being
    # left for the garbage collector.
    with open('pyspark/version.py') as version_file:
        exec(version_file.read())
except IOError:
    print("Failed to load PySpark version file for packaging. You must be in Spark's python dir.",
          file=sys.stderr)
    sys.exit(-1)
VERSION = __version__  # noqa
# A temporary path so we can access above the Python project root and fetch scripts and jars we need
TEMP_PATH = "deps"
# The parent of the current working directory is treated as the Spark home.
SPARK_HOME = os.path.abspath("../")
| |
# Help text printed when setup.py is invoked without the artifacts it needs.
incorrect_invocation_message = """
If you are installing pyspark from spark source, you must first build Spark and
run sdist.

To build Spark with maven you can run:
./build/mvn -DskipTests clean package
Building the source dist is done in the Python directory:
cd python
python setup.py sdist
pip install dist/*.tar.gz"""

# Locate the directory holding the jars that get packaged with PySpark. The glob result is a
# list; it is replaced by a single path once exactly one candidate has been identified.
JARS_PATH = glob.glob(os.path.join(SPARK_HOME, "assembly/target/scala-*/jars/"))

if len(JARS_PATH) == 1:
    # Exactly one assembly output directory: use it.
    JARS_PATH = JARS_PATH[0]
elif os.path.isfile("../RELEASE") and len(glob.glob("../jars/spark*core*.jar")) == 1:
    # Release mode puts the jars in a jars directory
    JARS_PATH = os.path.join(SPARK_HOME, "jars")
elif JARS_PATH:
    # Non-empty and not of length one, i.e. several assembly directories were found.
    print("Assembly jars exist for multiple scalas ({0}), please cleanup assembly/target".format(
          JARS_PATH), file=sys.stderr)
    sys.exit(-1)
elif not os.path.exists(TEMP_PATH):
    # No jars were found and there is no previously staged TEMP_PATH to fall back on.
    print(incorrect_invocation_message, file=sys.stderr)
    sys.exit(-1)
| |
# Each *_PATH is a location under SPARK_HOME; the matching *_TARGET is where that content is
# staged underneath TEMP_PATH so that it sits inside the Python project root.
JARS_TARGET = os.path.join(TEMP_PATH, "jars")

SCRIPTS_PATH = os.path.join(SPARK_HOME, "bin")
SCRIPTS_TARGET = os.path.join(TEMP_PATH, "bin")

EXAMPLES_PATH = os.path.join(SPARK_HOME, "examples/src/main/python")
EXAMPLES_TARGET = os.path.join(TEMP_PATH, "examples")

DATA_PATH = os.path.join(SPARK_HOME, "data")
DATA_TARGET = os.path.join(TEMP_PATH, "data")

LICENSES_PATH = os.path.join(SPARK_HOME, "licenses")
LICENSES_TARGET = os.path.join(TEMP_PATH, "licenses")

# Check whether we are under the Spark path, in which case we need to build the symlink farm.
# This matters because we only want to build the symlink farm while under Spark; otherwise we
# want to use an existing one. If the symlink farm already exists while we are under Spark
# (e.g. a partially built sdist) we should error and have the user sort it out.
if os.path.isfile("../core/src/main/scala/org/apache/spark/SparkContext.scala"):
    # A Spark source checkout.
    in_spark = True
else:
    # Otherwise accept a release layout: a RELEASE file plus exactly one spark core jar.
    in_spark = (os.path.isfile("../RELEASE") and
                len(glob.glob("../jars/spark*core*.jar")) == 1)
| |
| |
def _supports_symlinks():
    """Report whether the ``os`` module offers a ``symlink`` function.

    Returns True when ``os.symlink`` exists and is not None (e.g. on *nix), False otherwise.
    """
    symlink = getattr(os, "symlink", None)
    if symlink is None:
        return False
    return True
| |
| |
if in_spark:
    # Create the staging directory that will hold the links (or copies) used by setup.
    try:
        os.mkdir(TEMP_PATH)
    except OSError:
        # Only filesystem errors are handled here; a bare except would also intercept
        # KeyboardInterrupt and SystemExit and misreport them as an existing directory.
        print("Temp path for symlink to parent already exists {0}".format(TEMP_PATH),
              file=sys.stderr)
        sys.exit(-1)

# If you are changing the versions here, please also change ./python/pyspark/sql/utils.py and
# ./python/run-tests.py. In case of Arrow, you should also check ./pom.xml.
_minimum_pandas_version = "0.19.2"
_minimum_pyarrow_version = "0.8.0"
| |
# Stage the external dependencies under TEMP_PATH, run setuptools, and always clean the staging
# area up afterwards when it was created by this invocation.
try:
    # We copy the shell script to be under pyspark/python/pyspark so that the launcher scripts
    # find it where expected. The rest of the files aren't copied because they are accessed
    # using Python imports instead which will be resolved correctly.
    try:
        os.makedirs("pyspark/python/pyspark")
    except OSError:
        # Don't worry if the directory already exists.
        pass
    copyfile("pyspark/shell.py", "pyspark/python/pyspark/shell.py")

    if in_spark:
        # Construct the symlink farm - this is necessary since we can't refer to the path above the
        # package root and we need to copy the jars and scripts which are up above the python root.
        # When symlinks are unavailable (e.g. Windows) fall back to the slower copytree.
        _stage = os.symlink if _supports_symlinks() else copytree
        for _source, _target in ((JARS_PATH, JARS_TARGET),
                                 (SCRIPTS_PATH, SCRIPTS_TARGET),
                                 (EXAMPLES_PATH, EXAMPLES_TARGET),
                                 (DATA_PATH, DATA_TARGET),
                                 (LICENSES_PATH, LICENSES_TARGET)):
            _stage(_source, _target)
    else:
        # If we are not inside of SPARK_HOME verify we have the required symlink farm
        # NOTE(review): this only prints a warning; the scripts-directory check below is the
        # one that aborts. Confirm whether a missing jars directory should be fatal too.
        if not os.path.exists(JARS_TARGET):
            print("To build packaging must be in the python directory under the SPARK_HOME.",
                  file=sys.stderr)

    if not os.path.isdir(SCRIPTS_TARGET):
        print(incorrect_invocation_message, file=sys.stderr)
        sys.exit(-1)

    # Scripts directive requires a list of each script path and does not take wild cards.
    scripts = [os.path.join(SCRIPTS_TARGET, script) for script in os.listdir(SCRIPTS_TARGET)]
    # We add find_spark_home.py to the bin directory we install so that pip installed PySpark
    # will search for SPARK_HOME with Python.
    scripts.append("pyspark/find_spark_home.py")

    # Parse the README markdown file into rst for PyPI. The placeholder text is kept when the
    # conversion is not possible so that the problem is visible in the package metadata.
    long_description = "!!!!! missing pandoc do not upload to PyPI !!!!"
    try:
        import pypandoc
        long_description = pypandoc.convert('README.md', 'rst')
    except ImportError:
        print("Could not import pypandoc - required to package PySpark", file=sys.stderr)
    except OSError:
        print("Could not convert - pandoc is not installed", file=sys.stderr)

    setup(
        name='pyspark',
        version=VERSION,
        description='Apache Spark Python API',
        long_description=long_description,
        author='Spark Developers',
        author_email='dev@spark.apache.org',
        url='https://github.com/apache/spark/tree/master/python',
        packages=['pyspark',
                  'pyspark.mllib',
                  'pyspark.mllib.linalg',
                  'pyspark.mllib.stat',
                  'pyspark.ml',
                  'pyspark.ml.linalg',
                  'pyspark.ml.param',
                  'pyspark.sql',
                  'pyspark.streaming',
                  'pyspark.bin',
                  'pyspark.jars',
                  'pyspark.python.pyspark',
                  'pyspark.python.lib',
                  'pyspark.data',
                  'pyspark.licenses',
                  'pyspark.examples.src.main.python'],
        include_package_data=True,
        # Map the synthetic packages onto the staged directories under deps/.
        package_dir={
            'pyspark.jars': 'deps/jars',
            'pyspark.bin': 'deps/bin',
            'pyspark.python.lib': 'lib',
            'pyspark.data': 'deps/data',
            'pyspark.licenses': 'deps/licenses',
            'pyspark.examples.src.main.python': 'deps/examples',
        },
        package_data={
            'pyspark.jars': ['*.jar'],
            'pyspark.bin': ['*'],
            'pyspark.python.lib': ['*.zip'],
            'pyspark.data': ['*.txt', '*.data'],
            'pyspark.licenses': ['*.txt'],
            'pyspark.examples.src.main.python': ['*.py', '*/*.py']},
        scripts=scripts,
        license='http://www.apache.org/licenses/LICENSE-2.0',
        install_requires=['py4j==0.10.7'],
        setup_requires=['pypandoc'],
        extras_require={
            'ml': ['numpy>=1.7'],
            'mllib': ['numpy>=1.7'],
            'sql': [
                'pandas>=%s' % _minimum_pandas_version,
                'pyarrow>=%s' % _minimum_pyarrow_version,
            ]
        },
        classifiers=[
            'Development Status :: 5 - Production/Stable',
            'License :: OSI Approved :: Apache Software License',
            'Programming Language :: Python :: 2.7',
            'Programming Language :: Python :: 3',
            'Programming Language :: Python :: 3.4',
            'Programming Language :: Python :: 3.5',
            'Programming Language :: Python :: 3.6',
            'Programming Language :: Python :: 3.7',
            'Programming Language :: Python :: Implementation :: CPython',
            'Programming Language :: Python :: Implementation :: PyPy']
    )
finally:
    # We only cleanup the symlink farm if we were in Spark, otherwise we are installing rather than
    # packaging.
    if in_spark:
        # Remove only what was actually staged. If an earlier step failed before every entry
        # was created, an unconditional removal would raise here, hide the original error and
        # leave TEMP_PATH behind, which makes the next run refuse to start.
        for _staged in (JARS_TARGET, SCRIPTS_TARGET, EXAMPLES_TARGET, DATA_TARGET,
                        LICENSES_TARGET):
            if os.path.islink(_staged):
                # Symlink farm entry (also covers links whose target is missing).
                os.remove(_staged)
            elif os.path.isdir(_staged):
                # Copied tree from the copytree fallback.
                rmtree(_staged)
        os.rmdir(TEMP_PATH)