IMPALA-1071: Distributable python package for impala-shell

The patch adds a set of scripts for converting the impala-shell
into a true distributable python package. The package can be
installed using familiar python commands, e.g.:

  $ python setup.py (install|develop)

or

  $ pip install -e /path/to/dist/dir

The entry point script, make_python_package.sh, will run as a
part of the standard sequence of steps that results from calling
buildall.sh, and will produce a gzipped tarball inside of
Impala/shell/dist as an artifact. Thereafter, make_python_package.sh
can be run manually any time.

The expectation is that an official maintainer would need to manually
upload official releases to the Python Package Index as appropriate.

Change-Id: Ib8c745bddddf6a16f0c039430152745a2f00e044
Reviewed-on: http://gerrit.cloudera.org:8080/14181
Reviewed-by: David Knupp <dknupp@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6d72430..c355218 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -421,6 +421,10 @@
   COMMAND "${CMAKE_SOURCE_DIR}/shell/make_shell_tarball.sh"
 )
 
+# Builds the PyPI sdist; 'cmake -E env' exports DIST_DIR to the packaging script.
+add_custom_target(shell_pypi_package DEPENDS shell_tarball COMMAND ${CMAKE_COMMAND} -E env
+  "DIST_DIR=${CMAKE_SOURCE_DIR}/shell/dist" "${CMAKE_SOURCE_DIR}/shell/packaging/make_python_package.sh" VERBATIM)
+
 add_custom_target(cscope ALL DEPENDS gen-deps
   COMMAND "${CMAKE_SOURCE_DIR}/bin/gen-cscope.sh"
 )
diff --git a/bin/rat_exclude_files.txt b/bin/rat_exclude_files.txt
index 9e15b0a..aefaeef 100644
--- a/bin/rat_exclude_files.txt
+++ b/bin/rat_exclude_files.txt
@@ -24,6 +24,8 @@
 www/index.html
 lib/python/impala_py_lib/__init__.py
 lib/python/impala_py_lib/jenkins/__init__.py
+shell/packaging/MANIFEST.in
+shell/packaging/requirements.txt
 
 # See $IMPALA_HOME/LICENSE.txt
 be/src/gutil/*
@@ -91,6 +93,7 @@
 be/src/thirdparty/pcg-cpp-0.98/README.md
 lib/python/README.md
 lib/python/impala_py_lib/gdb/README.md
+shell/packaging/README.md
 
 # http://www.apache.org/legal/src-headers.html: "Test data for which the addition of a
 # source header would cause the tests to fail."
diff --git a/shell/impala_client.py b/shell/impala_client.py
index 6761e8e..4dd22a8 100755
--- a/shell/impala_client.py
+++ b/shell/impala_client.py
@@ -939,6 +939,8 @@
       if t.type == TApplicationException.UNKNOWN_METHOD:
         raise MissingThriftMethodException(t.message)
       raise
+    except TTransportException as e:
+      raise DisconnectedException("Error communicating with impalad: %s" % e)
     return (resp.version, resp.webserver_address)
 
   def _create_query_req(self, query_str, set_query_options):
@@ -1094,4 +1096,39 @@
       if t.type == TApplicationException.UNKNOWN_METHOD:
         raise MissingThriftMethodException(t.message)
       raise RPCException("Application Exception : %s" % t)
+    except Exception as e:
+      # This final except clause should ONLY be exercised in the case of Impala
+      # shell being installed as a standalone python package from public PyPI,
+      # rather than being included as part of a typical Impala deployment.
+      #
+      # Essentially, it's a hack that is required due to issues stemming from
+      # IMPALA-6808. Because of the way the Impala python environment has been
+      # somewhat haphazardly constructed, we end up polluting the top level Impala
+      # python environment with modules that should really be sub-modules. One of
+      # the principal places this occurs is with the various modules required by
+      # the Impala shell. This isn't a concern when the shell is invoked via a
+      # specially installed version of python that belongs to Impala, but it does
+      # become an issue when the shell is being run using the system python.
+      #
+      # When we install the shell as a standalone package, we need to construct
+      # it in such a way that all of the internal modules are contained within
+      # a top-level impala_shell namespace. However, this then breaks various
+      # imports and, in this case, exception handling in the original code.
+      # As far as I can tell, there's no clean way to address this without fully
+      # resolving IMPALA-6808.
+      #
+      # Without taking some additional measure here to recognize certain common
+      # exceptions, especially Beeswax exceptions raised by RPC calls, when
+      # errors occur during a standalone shell session, we wind up falling
+      # entirely through this block and returning nothing to the caller (which
+      # happens to be the primary command loop in impala_shell.py). This in turn
+      # has the result of disconnecting the shell in the case of, say, even simple
+      # typos in database or table names.
+      if suppress_error_on_cancel and self.is_query_cancelled:
+        raise QueryCancelledByShellException()
+      if "BeeswaxException" in str(e):
+        raise RPCException("ERROR: %s" % e.message)
+      if "QueryNotFoundException" in str(e):
+        raise QueryStateException('Error: Stale query handle')
+      raise  # unrecognized errors must propagate instead of being swallowed
 
diff --git a/shell/packaging/MANIFEST.in b/shell/packaging/MANIFEST.in
new file mode 100644
index 0000000..ec0d80f
--- /dev/null
+++ b/shell/packaging/MANIFEST.in
@@ -0,0 +1,3 @@
+include *.txt *.md *.py
+recursive-include impala_shell *.py
+recursive-exclude impala_shell *.pyc
diff --git a/shell/packaging/README.md b/shell/packaging/README.md
new file mode 100644
index 0000000..cd40b12
--- /dev/null
+++ b/shell/packaging/README.md
@@ -0,0 +1,73 @@
+# Impala Interactive Shell
+
+You can use the Impala shell tool (impala-shell) to connect to an Impala
+service. The shell allows you to set up databases and tables, insert data,
+and issue queries. For ad hoc queries and exploration, you can submit SQL
+statements in an interactive session. The impala-shell interpreter accepts
+all the same SQL statements listed in
+[Impala SQL Statements](http://impala.apache.org/docs/build/html/topics/impala_langref_sql.html),
+plus some shell-only commands that you can use for tuning performance and
+diagnosing problems.
+
+To automate your work, you can specify command-line options to process a single
+statement or a script file. (Other avenues for Impala automation via python
+are provided by Impyla or ODBC.)
+
+## Installing
+
+```
+$ pip install impala-shell
+```
+
+## Online documentation
+
+* [Impala Shell Documentation](http://impala.apache.org/docs/build/html/topics/impala_impala_shell.html)
+* [Apache Impala Documentation](http://impala.apache.org/impala-docs.html)
+
+## Quickstart
+
+### Non-interactive mode
+
+Processing a single query, e.g., ```show tables```:
+
+```
+$ impala-shell -i impalad-host.domain.com -d some_database -q 'show tables'
+```
+
+Processing a text file with a series of queries:
+
+```
+$ impala-shell -i impalad-host.domain.com -d some_database -f /path/to/queries.sql
+```
+
+### Launching the interactive shell
+
+To connect to an impalad host at the default service port (21000):
+
+```
+$ impala-shell -i impalad-host.domain.com
+Starting Impala Shell without Kerberos authentication
+Connected to impalad-host.domain.com:21000
+Server version: impalad version 2.11.0-SNAPSHOT RELEASE (build d4596f9ca3ea32a8008cdc809a7ac9a3dea47962)
+***********************************************************************************
+Welcome to the Impala shell.
+(Impala Shell v3.0.0-SNAPSHOT (73e90d2) built on Thu Mar  8 00:59:00 PST 2018)
+
+The '-B' command line flag turns off pretty-printing for query results. Use this
+flag to remove formatting from results you want to save for later, or to benchmark
+Impala.
+***********************************************************************************
+[impalad-host.domain.com:21000] >
+```
+
+### Launching the interactive shell (secure mode)
+
+To connect to a secure host using Kerberos and SSL:
+
+```
+$ impala-shell -k --ssl -i impalad-secure-host.domain.com
+```
+
+### Disconnecting
+
+To exit the shell when running interactively, press ```Ctrl-D``` at the shell prompt.
diff --git a/shell/packaging/__init__.py b/shell/packaging/__init__.py
new file mode 100644
index 0000000..43e0baa
--- /dev/null
+++ b/shell/packaging/__init__.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+from os.path import dirname, abspath
+import sys
+
+# When installing the python shell as a standalone package, this __init__ is
+# used to workaround the issues stemming from IMPALA-6808. Because of the way
+# the Impala python environment has been somewhat haphazardly constructed in
+# a deployed cluster, it ends up being "polluted" with top-level modules that
+# should really be sub-modules. One of the principal places this occurs is with
+# the various modules required by the Impala shell. This isn't a concern when
+# the shell is invoked via a specially installed version of python that belongs
+# to Impala, but it does become an issue when the shell is being run using the
+# system python.
+#
+# If we want to install the shell as a standalone package, we need to construct
+# it in such a way that all of the internal modules are contained within a
+# top-level impala_shell namespace. However, this then breaks various imports
+# throughout the Impala shell code. The way this file corrects that is to append
+# this package's own directory to sys.path when the package is imported. As
+# far as I can tell, there's no cleaner way to address this without fully
+# resolving IMPALA-6808.
+impala_shell_dir = dirname(abspath(__file__))
+sys.path.append(impala_shell_dir)
diff --git a/shell/packaging/make_python_package.sh b/shell/packaging/make_python_package.sh
new file mode 100755
index 0000000..ba95148
--- /dev/null
+++ b/shell/packaging/make_python_package.sh
@@ -0,0 +1,87 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# ----------------------------------------------------------------------
+# This script is invoked during the Impala build process, and creates
+# a distributable python package of the Impala shell. The resulting
+# archive will be saved to (DIST_DIR defaults to dist/ beside this script):
+#
+#   ${DIST_DIR}/impala_shell-<version>.tar.gz
+#
+# Until the thrift-generated python files in ${IMPALA_HOME}/shell/gen-py
+# have been created by the build process, this script will not work.
+# It also relies upon the impala_build_version.py file created by the
+# parent packaging script, ${IMPALA_HOME}/shell/make_shell_tarball.sh,
+# which needs to be run before this script will work.
+#
+# After those files exist, however, this script can be run again at will.
+
+set -euo pipefail
+# Paths used below; DIST_DIR and NO_CLEAN_DIST may be preset in the environment.
+WORKING_DIR="$(cd "$(dirname "$0")"; pwd -P)"
+SHELL_HOME="${IMPALA_HOME}/shell"
+STAGING_DIR="${WORKING_DIR}/staging"
+PACKAGE_DIR="${STAGING_DIR}/impala_shell_package"
+MODULE_LIB_DIR="${PACKAGE_DIR}/impala_shell"
+DIST_DIR="${DIST_DIR:-${WORKING_DIR}/dist}"
+NO_CLEAN_DIST="${NO_CLEAN_DIST:-}"
+
+assemble_package_files() {
+  # Start from an empty staging tree so a version.txt left behind by an earlier
+  # run (failed build or NO_CLEAN_DIST=true) is never reused by setup.py.
+  rm -rf "${STAGING_DIR}"
+  mkdir -p "${MODULE_LIB_DIR}"
+
+  cp -r "${SHELL_HOME}/gen-py"/* "${MODULE_LIB_DIR}"
+  cp -r "${THRIFT_HOME}/python/lib/python2.7/site-packages/thrift" "${MODULE_LIB_DIR}"
+  cp "${WORKING_DIR}/__init__.py" "${MODULE_LIB_DIR}"
+  cp "${SHELL_HOME}/impala_shell.py" "${MODULE_LIB_DIR}"
+  cp "${SHELL_HOME}/impala_client.py" "${MODULE_LIB_DIR}"
+  cp "${SHELL_HOME}/option_parser.py" "${MODULE_LIB_DIR}"
+  cp "${SHELL_HOME}/shell_output.py" "${MODULE_LIB_DIR}"
+  cp "${SHELL_HOME}/impala_shell_config_defaults.py" "${MODULE_LIB_DIR}"
+  cp "${SHELL_HOME}/TSSLSocketWithWildcardSAN.py" "${MODULE_LIB_DIR}"
+  cp "${SHELL_HOME}/packaging/README.md" "${PACKAGE_DIR}"
+  cp "${SHELL_HOME}/packaging/MANIFEST.in" "${PACKAGE_DIR}"
+  cp "${SHELL_HOME}/packaging/requirements.txt" "${PACKAGE_DIR}"
+  cp "${SHELL_HOME}/packaging/setup.py" "${PACKAGE_DIR}"
+  cp "${IMPALA_HOME}/LICENSE.txt" "${PACKAGE_DIR}"
+}
+
+create_distributable_python_package() {
+  # Build an sdist tarball from ${PACKAGE_DIR} into ${DIST_DIR}.
+  if [[ "${NO_CLEAN_DIST}" != "true" ]]; then
+    rm -rf "${DIST_DIR}"
+  fi
+  mkdir -p "${DIST_DIR}"
+
+  pushd "${PACKAGE_DIR}"
+  echo "Building package..."
+  # setup.py reads RELEASE_TYPE; PACKAGE_TYPE is still honored as a fallback.
+  RELEASE_TYPE="${RELEASE_TYPE:-${PACKAGE_TYPE:-}}" OFFICIAL="${OFFICIAL:-}" \
+    python setup.py sdist --dist-dir "${DIST_DIR}"
+  popd
+
+  if [[ "${NO_CLEAN_DIST}" != "true" ]]; then
+    rm -rf "${STAGING_DIR}"
+  fi
+}
+# Stage the package files, then build the sdist from the staged tree.
+assemble_package_files
+create_distributable_python_package
diff --git a/shell/packaging/requirements.txt b/shell/packaging/requirements.txt
new file mode 100644
index 0000000..32aef56
--- /dev/null
+++ b/shell/packaging/requirements.txt
@@ -0,0 +1,8 @@
+bitarray==1.0.1
+prettytable==0.7.1
+sasl==0.2.1
+setuptools>=36.8.0
+six==1.11.0
+sqlparse==0.1.19
+thrift==0.9.3
+thrift_sasl==0.2.1
diff --git a/shell/packaging/setup.py b/shell/packaging/setup.py
new file mode 100644
index 0000000..173e0d8
--- /dev/null
+++ b/shell/packaging/setup.py
@@ -0,0 +1,169 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+"""Set up the Impala shell python package."""
+
+import datetime
+import os
+import re
+import sys
+import time
+
+from impala_shell import impala_build_version
+from setuptools import find_packages, setup
+from textwrap import dedent
+
+CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
+
+
+def parse_requirements(requirements_file='requirements.txt'):
+  """Parse a requirements file, dropping '#' comments and blank lines.
+
+  Args:
+    requirements_file: path to a requirements file
+
+  Returns:
+    a list of requirement strings stripped of surrounding whitespace
+  """
+  requirements = []
+  with open(requirements_file) as reqs:
+    for raw_line in reqs:
+      # Strip so the trailing newline never reaches install_requires.
+      requirement = raw_line.split('#')[0].strip()
+      if requirement:
+        requirements.append(requirement)
+  return requirements
+
+
+def get_version():
+  """Generate package version string when calling 'setup.py'.
+
+  When setup.py is being used to CREATE a distribution, e.g., via setup.py sdist
+  or setup.py bdist, then use the output from impala_build_version.get_version(),
+  and append modifiers as specified by the RELEASE_TYPE and OFFICIAL environment
+  variables. By default, the package created will be a dev release, designated
+  by timestamp. For example, if get_version() returns the string 3.0.0-SNAPSHOT,
+  the package version may be something like 3.0.0.dev20180322154653.
+
+  It's also possible to set an environment variable for BUILD_VERSION to override
+  the default build value returned from impala_build_version.get_version().
+
+  E.g., to specify an official 3.4 beta 2 release (3.4b2), one would call:
+
+    BUILD_VERSION=3.4 RELEASE_TYPE=b2 OFFICIAL=true python setup.py sdist
+
+  The generated version string will be written to a version.txt file next to
+  setup.py, to be referenced when the distribution is installed.
+
+  When setup.py is invoked during installation, e.g., via pip install or
+  setup.py install, read the package version from the version.txt file, which
+  is presumed to contain a single line containing a valid PEP-440 version string.
+  The file should have been generated when the distribution being installed was
+  created. (Although a version.txt file can also be created manually.)
+
+  See https://www.python.org/dev/peps/pep-0440/ for more info on python
+  version strings.
+
+  Returns:
+    A package version string compliant with PEP-440
+  """
+  version_file = os.path.join(CURRENT_DIR, 'version.txt')
+
+  if not os.path.isfile(version_file):
+    # If setup.py is being executed to create a distribution, e.g., via setup.py
+    # sdist or setup.py bdist, then derive the version and WRITE the version.txt
+    # file (next to setup.py) that will later be used for installations.
+    if os.getenv('BUILD_VERSION') is not None:
+      package_version = os.getenv('BUILD_VERSION')
+    else:
+      version_match = re.search(r'\d+\.\d+\.\d+', impala_build_version.get_version())
+      if version_match is None:
+        sys.exit('Unable to acquire Impala version.')
+      package_version = version_match.group(0)
+
+    # packages can be marked as alpha, beta, or rc RELEASE_TYPE
+    release_type = os.getenv('RELEASE_TYPE')
+    if release_type:
+      if not re.match(r'(a|b|rc)\d+$', release_type):  # whole value must match
+        msg = """\
+            RELEASE_TYPE \'{0}\' does not conform to any PEP-440 release format:
+
+              aN (for alpha releases)
+              bN (for beta releases)
+              rcN (for release candidates)
+
+            where N is the number of the release"""
+        sys.exit(dedent(msg).format(release_type))
+      package_version += release_type
+
+    # packages that are not marked OFFICIAL have ".dev" + a timestamp appended
+    if os.getenv('OFFICIAL') != 'true':
+      epoch_t = time.time()
+      ts_fmt = '%Y%m%d%H%M%S'
+      timestamp = datetime.datetime.fromtimestamp(epoch_t).strftime(ts_fmt)
+      package_version = '{0}.dev{1}'.format(package_version, timestamp)
+
+    with open(version_file, 'w') as version_fh:
+      version_fh.write(package_version)
+  else:
+    # If setup.py is being invoked during installation, e.g., via pip install
+    # or setup.py install, we expect a version.txt file from which to READ the
+    # version string; strip it so no trailing newline ends up in the version.
+    with open(version_file) as version_fh:
+      package_version = version_fh.readline().strip()
+
+  return package_version
+
+
+# Distribution metadata for the standalone shell package. The console_scripts
+# entry point installs an 'impala-shell' command.
+setup(
+  name='impala_shell',
+  version=get_version(),
+  description='Impala Shell',
+  long_description=open('README.md').read(),
+  long_description_content_type='text/markdown',
+  url='https://impala.apache.org/',
+  author='Impala Dev',
+  author_email='dev@impala.apache.org',
+  license='Apache Software License',
+  python_requires='>2.6, <3.0.0',
+  packages=find_packages(),
+  install_requires=parse_requirements(),
+  include_package_data=True,
+  entry_points={
+    'console_scripts': ['impala-shell = impala_shell.impala_shell:impala_shell_main']
+  },
+  classifiers=[
+    'Development Status :: 5 - Production/Stable',
+    'Environment :: Console',
+    'Intended Audience :: Developers',
+    'Intended Audience :: End Users/Desktop',
+    'Intended Audience :: Science/Research',
+    'License :: OSI Approved :: Apache Software License',
+    'Operating System :: MacOS :: MacOS X',
+    'Operating System :: POSIX :: Linux',
+    'Programming Language :: Python :: 2 :: Only',
+    'Programming Language :: Python :: 2.6',
+    'Programming Language :: Python :: 2.7',
+    'Topic :: Database :: Front-Ends'
+  ]
+)