| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # |
| # WARNING: THIS DOCKERFILE IS NOT INTENDED FOR PRODUCTION USE OR DEPLOYMENT. |
| # |
| # Default base image; declared before FROM so it can parameterize the FROM line. |
| ARG PYTHON_BASE_IMAGE="python:3.6-slim-stretch" |
| FROM ${PYTHON_BASE_IMAGE} as main |
| |
| # Fail fast in every RUN step: pipefail + errexit + nounset, xtrace for CI logs. |
| SHELL ["/bin/bash", "-o", "pipefail", "-e", "-u", "-x", "-c"] |
| |
| # An ARG declared before FROM goes out of scope after it, so re-declare it here |
| # and export it as ENV so the value is visible at build AND run time. |
| ARG PYTHON_BASE_IMAGE="python:3.6-slim-stretch" |
| ENV PYTHON_BASE_IMAGE=${PYTHON_BASE_IMAGE} |
| |
| ARG AIRFLOW_VERSION="2.0.0.dev0" |
| ENV AIRFLOW_VERSION=$AIRFLOW_VERSION |
| |
| # Print versions (informational only; one RUN layer instead of two to avoid |
| # creating an extra image layer per echo) |
| RUN echo "Base image: ${PYTHON_BASE_IMAGE}" \ |
| && echo "Airflow version: ${AIRFLOW_VERSION}" |
| |
| # Make sure noninteractive debian install is used and language variables set |
| ENV DEBIAN_FRONTEND=noninteractive LANGUAGE=C.UTF-8 LANG=C.UTF-8 LC_ALL=C.UTF-8 \ |
| LC_CTYPE=C.UTF-8 LC_MESSAGES=C.UTF-8 |
| |
| # By increasing this number we can do force build of all dependencies |
| ARG DEPENDENCIES_EPOCH_NUMBER="2" |
| # Increase the value below to force reinstalling of all dependencies |
| ENV DEPENDENCIES_EPOCH_NUMBER=${DEPENDENCIES_EPOCH_NUMBER} |
| |
| # Install curl and gnupg2 - needed to download nodejs in the next step. |
| # apt lists are removed in the same layer to keep the image small. |
| RUN apt-get update \ |
| && apt-get install -y --no-install-recommends \ |
| curl \ |
| gnupg2 \ |
| && apt-get autoremove -yqq --purge \ |
| && apt-get clean \ |
| && rm -rf /var/lib/apt/lists/* |
| |
| # Install basic apt dependencies. |
| # The nodesource setup script and the yarn apt repo are registered first so |
| # nodejs and yarn can be installed in the same apt transaction as the |
| # build/runtime libraries below. |
| RUN curl -L https://deb.nodesource.com/setup_10.x | bash - \ |
| && curl https://dl.yarnpkg.com/debian/pubkey.gpg | apt-key add - > /dev/null \ |
| && echo "deb https://dl.yarnpkg.com/debian/ stable main" > /etc/apt/sources.list.d/yarn.list \ |
| && apt-get update \ |
| && apt-get install -y --no-install-recommends \ |
| apt-utils \ |
| build-essential \ |
| dirmngr \ |
| freetds-bin \ |
| freetds-dev \ |
| git \ |
| gosu \ |
| libffi-dev \ |
| libkrb5-dev \ |
| libpq-dev \ |
| libsasl2-2 \ |
| libsasl2-dev \ |
| libsasl2-modules \ |
| libssl-dev \ |
| locales \ |
| netcat \ |
| nodejs \ |
| rsync \ |
| sasl2-bin \ |
| sudo \ |
| unixodbc \ |
| unixodbc-dev \ |
| yarn \ |
| && apt-get autoremove -yqq --purge \ |
| && apt-get clean \ |
| && rm -rf /var/lib/apt/lists/* |
| |
| # Install graphviz - needed to build docs with diagrams. |
| # Kept as a separate layer so changes here do not invalidate the large |
| # apt layer above. |
| RUN apt-get update \ |
| && apt-get install -y --no-install-recommends \ |
| graphviz \ |
| && apt-get autoremove -yqq --purge \ |
| && apt-get clean \ |
| && rm -rf /var/lib/apt/lists/* |
| |
| # Install MySQL client from Oracle repositories (Debian installs mariadb). |
| # The signing key is fetched from a shuffled list of keyservers because any |
| # single keyserver is frequently unavailable. |
| # BUG FIX: the original chained "gpgconf --kill all \" directly onto |
| # "rm -rf ${GNUPGHOME};" without "&&", so "rm" and the path were passed as |
| # ARGUMENTS to gpgconf and the temporary GNUPGHOME was never removed. |
| RUN KEY="A4A9406876FCBD3C456770C88C718D3B5072E1F5" \ |
| && GNUPGHOME="$(mktemp -d)" \ |
| && export GNUPGHOME \ |
| && for KEYSERVER in $(shuf -e \ |
| ha.pool.sks-keyservers.net \ |
| hkp://p80.pool.sks-keyservers.net:80 \ |
| keyserver.ubuntu.com \ |
| hkp://keyserver.ubuntu.com:80 \ |
| pgp.mit.edu) ; do \ |
| gpg --keyserver "${KEYSERVER}" --recv-keys "${KEY}" && break || true ; \ |
| done \ |
| && gpg --export "${KEY}" | apt-key add - \ |
| && gpgconf --kill all \ |
| && rm -rf "${GNUPGHOME}" \ |
| && apt-key list > /dev/null \ |
| && echo "deb http://repo.mysql.com/apt/debian/ stretch mysql-5.6" | tee -a /etc/apt/sources.list.d/mysql.list \ |
| && apt-get update \ |
| && apt-get install --no-install-recommends -y \ |
| libmysqlclient-dev \ |
| mysql-client \ |
| && apt-get autoremove -yqq --purge \ |
| && apt-get clean && rm -rf /var/lib/apt/lists/* |
| |
| # Create the airflow user with passwordless sudo (dev/CI convenience only - |
| # see the NOT-FOR-PRODUCTION warning at the top of this file). |
| RUN adduser airflow \ |
| && echo "airflow ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/airflow \ |
| && chmod 0440 /etc/sudoers.d/airflow |
| |
| # Location where openjdk-8-jdk (installed below) lands on amd64 Debian. |
| ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/ |
| |
| # Note missing man directories on debian-stretch |
| # https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=863199 |
| # (the man dirs must exist or the openjdk package fails to install) |
| RUN mkdir -pv /usr/share/man/man1 \ |
| && mkdir -pv /usr/share/man/man7 \ |
| && apt-get update \ |
| && apt-get install --no-install-recommends -y \ |
| gnupg \ |
| apt-transport-https \ |
| bash-completion \ |
| ca-certificates \ |
| software-properties-common \ |
| krb5-user \ |
| ldap-utils \ |
| less \ |
| lsb-release \ |
| net-tools \ |
| openjdk-8-jdk \ |
| openssh-client \ |
| openssh-server \ |
| postgresql-client \ |
| python-selinux \ |
| sqlite3 \ |
| tmux \ |
| unzip \ |
| vim \ |
| && apt-get autoremove -yqq --purge \ |
| && apt-get clean \ |
| && rm -rf /var/lib/apt/lists/* |
| |
| |
| # Install Hadoop and Hive |
| # It is done in one step to share variables. |
| # HADOOP_HOME/HIVE_HOME are ENV (not RUN-local) because later layers and the |
| # runtime need them. |
| ENV HADOOP_HOME="/opt/hadoop-cdh" HIVE_HOME="/opt/hive" |
| |
| # Download the CDH distributions of Hadoop and Hive and unpack them into |
| # HADOOP_HOME / HIVE_HOME. Version variables are RUN-local: they are only |
| # needed to build the download URLs. |
| # FIX: the original assigned HIVE_VERSION="1.1.0" twice; the redundant second |
| # assignment is removed. |
| RUN HADOOP_DISTRO="cdh" \ |
| && HADOOP_MAJOR="5" \ |
| && HADOOP_DISTRO_VERSION="5.11.0" \ |
| && HADOOP_VERSION="2.6.0" \ |
| && HADOOP_URL="https://archive.cloudera.com/${HADOOP_DISTRO}${HADOOP_MAJOR}/${HADOOP_DISTRO}/${HADOOP_MAJOR}/" \ |
| && HADOOP_DOWNLOAD_URL="${HADOOP_URL}hadoop-${HADOOP_VERSION}-${HADOOP_DISTRO}${HADOOP_DISTRO_VERSION}.tar.gz" \ |
| && HADOOP_TMP_FILE="/tmp/hadoop.tar.gz" \ |
| && mkdir -pv "${HADOOP_HOME}" \ |
| && curl -L "${HADOOP_DOWNLOAD_URL}" -o "${HADOOP_TMP_FILE}" \ |
| && tar xzf "${HADOOP_TMP_FILE}" --absolute-names --strip-components 1 -C "${HADOOP_HOME}" \ |
| && rm "${HADOOP_TMP_FILE}" \ |
| && echo "Installing Hive" \ |
| && HIVE_VERSION="1.1.0" \ |
| && HIVE_URL="${HADOOP_URL}hive-${HIVE_VERSION}-${HADOOP_DISTRO}${HADOOP_DISTRO_VERSION}.tar.gz" \ |
| && HIVE_TMP_FILE="/tmp/hive.tar.gz" \ |
| && mkdir -pv "${HIVE_HOME}" \ |
| && mkdir -pv "/user/hive/warehouse" \ |
| && chmod -R 777 "${HIVE_HOME}" \ |
| && chmod -R 777 "/user/" \ |
| && curl -L "${HIVE_URL}" -o "${HIVE_TMP_FILE}" \ |
| && tar xzf "${HIVE_TMP_FILE}" --strip-components 1 -C "${HIVE_HOME}" \ |
| && rm "${HIVE_TMP_FILE}" |
| |
| # Make the hive CLI available on PATH. |
| ENV PATH "${PATH}:/opt/hive/bin" |
| |
| # Install Minicluster |
| ENV MINICLUSTER_HOME="/opt/minicluster" |
| |
| # NOTE(review): the zip is extracted to /opt, not ${MINICLUSTER_HOME} - |
| # presumably the archive contains a top-level minicluster/ directory; confirm. |
| RUN MINICLUSTER_BASE="https://github.com/bolkedebruin/minicluster/releases/download/" \ |
| && MINICLUSTER_VER="1.1" \ |
| && MINICLUSTER_URL="${MINICLUSTER_BASE}${MINICLUSTER_VER}/minicluster-${MINICLUSTER_VER}-SNAPSHOT-bin.zip" \ |
| && MINICLUSTER_TMP_FILE="/tmp/minicluster.zip" \ |
| && mkdir -pv "${MINICLUSTER_HOME}" \ |
| && curl -L "${MINICLUSTER_URL}" -o "${MINICLUSTER_TMP_FILE}" \ |
| && unzip "${MINICLUSTER_TMP_FILE}" -d "/opt" \ |
| && rm "${MINICLUSTER_TMP_FILE}" |
| |
| # Install Docker - used for docker-in-docker image builds (see the Dockerfile |
| # COPY near the end of this file). |
| RUN curl -L https://download.docker.com/linux/debian/gpg | apt-key add - \ |
| && add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/debian stretch stable" \ |
| && apt-get update \ |
| && apt-get -y install --no-install-recommends docker-ce \ |
| && apt-get autoremove -yqq --purge \ |
| && apt-get clean && rm -rf /var/lib/apt/lists/* |
| |
| # Install kubectl (version overridable at build time via --build-arg) |
| ARG KUBECTL_VERSION="v1.15.3" |
| |
| RUN KUBECTL_URL="https://storage.googleapis.com/kubernetes-release/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl" \ |
| && curl -L "${KUBECTL_URL}" -o "/usr/local/bin/kubectl" \ |
| && chmod +x /usr/local/bin/kubectl |
| |
| # Install Kind (Kubernetes-in-Docker, used for k8s tests) |
| ARG KIND_VERSION="v0.6.1" |
| |
| RUN KIND_URL="https://github.com/kubernetes-sigs/kind/releases/download/${KIND_VERSION}/kind-linux-amd64" \ |
| && curl -L "${KIND_URL}" -o "/usr/local/bin/kind" \ |
| && chmod +x /usr/local/bin/kind |
| |
| # Install Apache RAT (release audit tool for license checks). |
| # The jar is sanity-checked with "jar -tf" and verified against the published |
| # MD5 checksum downloaded alongside it. |
| ARG RAT_VERSION="0.13" |
| |
| RUN RAT_URL="https://repo1.maven.org/maven2/org/apache/rat/apache-rat/${RAT_VERSION}/apache-rat-${RAT_VERSION}.jar" \ |
| && RAT_JAR="/opt/apache-rat.jar" \ |
| && RAT_JAR_MD5="${RAT_JAR}.md5" \ |
| && RAT_URL_MD5="${RAT_URL}.md5" \ |
| && echo "Downloading RAT from ${RAT_URL} to ${RAT_JAR}" \ |
| && curl -L "${RAT_URL}" -o "${RAT_JAR}" \ |
| && curl -L "${RAT_URL_MD5}" -o "${RAT_JAR_MD5}" \ |
| && jar -tf "${RAT_JAR}" > /dev/null \ |
| && md5sum -c <<<"$(cat "${RAT_JAR_MD5}") ${RAT_JAR}" |
| |
| # Setup PIP |
| # By default PIP install run without cache to make image smaller |
| # (pip honors the PIP_NO_CACHE_DIR environment variable automatically) |
| ARG PIP_NO_CACHE_DIR="true" |
| ENV PIP_NO_CACHE_DIR=${PIP_NO_CACHE_DIR} |
| RUN echo "Pip no cache dir: ${PIP_NO_CACHE_DIR}" |
| |
| # PIP version used to install dependencies |
| ARG PIP_VERSION="19.0.2" |
| ENV PIP_VERSION=${PIP_VERSION} |
| RUN echo "Pip version: ${PIP_VERSION}" |
| |
| RUN pip install --upgrade pip==${PIP_VERSION} |
| |
| # Install Google Cloud SDK from the versioned archive. |
| # FIXES vs original: the URL variable was misspelled GCOUD_URL; the mkdir |
| # hard-coded /opt/gcloud instead of using ${GCLOUD_HOME}; curl lacked -L |
| # (every other download in this file follows redirects). |
| ENV GCLOUD_HOME="/opt/gcloud" |
| |
| RUN GCLOUD_VERSION="274.0.1" \ |
| && GCLOUD_URL="https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-${GCLOUD_VERSION}-linux-x86_64.tar.gz" \ |
| && GCLOUD_TMP_FILE="/tmp/gcloud.tar.gz" \ |
| && export CLOUDSDK_CORE_DISABLE_PROMPTS=1 \ |
| && mkdir -p "${GCLOUD_HOME}" \ |
| && curl -L "${GCLOUD_URL}" -o "${GCLOUD_TMP_FILE}" \ |
| && tar xzf "${GCLOUD_TMP_FILE}" --strip-components 1 -C "${GCLOUD_HOME}" \ |
| && rm -rf "${GCLOUD_TMP_FILE}" \ |
| && echo '. /opt/gcloud/completion.bash.inc' >> /etc/bash.bashrc |
| |
| ENV PATH="$PATH:${GCLOUD_HOME}/bin" |
| |
| # Install AWS CLI |
| # Unfortunately, AWS does not provide a versioned bundle |
| RUN AWS_TMP_DIR="/tmp/awscli/" \ |
| && AWS_TMP_BUNDLE="${AWS_TMP_DIR}/awscli-bundle.zip" \ |
| && AWS_URL="https://s3.amazonaws.com/aws-cli/awscli-bundle.zip" \ |
| && mkdir -pv "${AWS_TMP_DIR}" \ |
| && curl "${AWS_URL}" -o "${AWS_TMP_BUNDLE}" \ |
| && unzip "${AWS_TMP_BUNDLE}" -d "${AWS_TMP_DIR}" \ |
| && "${AWS_TMP_DIR}/awscli-bundle/install" -i "${AWS_HOME}" -b /usr/local/bin/aws \ |
| && echo "complete -C '${AWS_HOME}/bin/aws_completer' aws" >> /etc/bash.bashrc \ |
| && rm -rf "${AWS_TMP_DIR}" |
| |
| # Home of the build/runtime user - completions below are appended to ~/.bashrc. |
| ARG HOME=/root |
| ENV HOME=${HOME} |
| |
| ARG AIRFLOW_HOME=/root/airflow |
| ENV AIRFLOW_HOME=${AIRFLOW_HOME} |
| |
| # Location the Airflow source tree is copied to inside the image. |
| ARG AIRFLOW_SOURCES=/opt/airflow |
| ENV AIRFLOW_SOURCES=${AIRFLOW_SOURCES} |
| |
| WORKDIR ${AIRFLOW_SOURCES} |
| |
| # Create AIRFLOW_HOME plus the dags/ and logs/ sub-directories. |
| # BUG FIX: the original omitted "&&" between the mkdir commands, so a SINGLE |
| # mkdir invocation received "mkdir" and "-pv" as extra operands and created a |
| # stray literal "mkdir" directory in the workdir. |
| RUN mkdir -pv "${AIRFLOW_HOME}" \ |
| && mkdir -pv "${AIRFLOW_HOME}/dags" \ |
| && mkdir -pv "${AIRFLOW_HOME}/logs" |
| |
| # Increase the value here to force reinstalling Apache Airflow pip dependencies |
| ARG PIP_DEPENDENCIES_EPOCH_NUMBER="2" |
| ENV PIP_DEPENDENCIES_EPOCH_NUMBER=${PIP_DEPENDENCIES_EPOCH_NUMBER} |
| |
| # Optimizing installation of Cassandra driver |
| # Speeds up building the image - cassandra driver without CYTHON saves around 10 minutes |
| ARG CASS_DRIVER_NO_CYTHON="1" |
| # Build cassandra driver on multiple CPUs |
| ARG CASS_DRIVER_BUILD_CONCURRENCY="8" |
| |
| ENV CASS_DRIVER_BUILD_CONCURRENCY=${CASS_DRIVER_BUILD_CONCURRENCY} |
| ENV CASS_DRIVER_NO_CYTHON=${CASS_DRIVER_NO_CYTHON} |
| |
| # Repository and branch the CI-optimised pre-install (below) downloads from. |
| ARG AIRFLOW_REPO=apache/airflow |
| ENV AIRFLOW_REPO=${AIRFLOW_REPO} |
| |
| ARG AIRFLOW_BRANCH=master |
| ENV AIRFLOW_BRANCH=${AIRFLOW_BRANCH} |
| |
| # Airflow Extras installed |
| ARG AIRFLOW_EXTRAS="all" |
| ENV AIRFLOW_EXTRAS=${AIRFLOW_EXTRAS} |
| |
| RUN echo "Installing with extras: ${AIRFLOW_EXTRAS}." |
| |
| ARG AIRFLOW_CONTAINER_CI_OPTIMISED_BUILD="false" |
| ENV AIRFLOW_CONTAINER_CI_OPTIMISED_BUILD=${AIRFLOW_CONTAINER_CI_OPTIMISED_BUILD} |
| |
| # By changing the CI build epoch we can force reinstalling Airflow from the current master |
| # It can also be overwritten manually by setting the AIRFLOW_CI_BUILD_EPOCH environment variable. |
| ARG AIRFLOW_CI_BUILD_EPOCH="1" |
| ENV AIRFLOW_CI_BUILD_EPOCH=${AIRFLOW_CI_BUILD_EPOCH} |
| |
| # In case of CI-optimised builds we want to pre-install master version of airflow dependencies so that |
| # We do not have to always reinstall it from the scratch. |
| # This can be reinstalled from latest master by increasing PIP_DEPENDENCIES_EPOCH_NUMBER. |
| # And is automatically reinstalled from the scratch every month |
| # The uninstall keeps airflow's transitive DEPENDENCIES cached in the layer |
| # while removing airflow itself, which is installed from local sources later. |
| RUN \ |
| if [[ "${AIRFLOW_CONTAINER_CI_OPTIMISED_BUILD}" == "true" ]]; then \ |
| pip install \ |
| "https://github.com/apache/airflow/archive/${AIRFLOW_BRANCH}.tar.gz#egg=apache-airflow[${AIRFLOW_EXTRAS}]" \ |
| && pip uninstall --yes apache-airflow; \ |
| fi |
| |
| # Install NPM dependencies here. The NPM dependencies don't change that often and we already have pip |
| # installed dependencies in case of CI optimised build, so it is ok to install NPM deps here |
| # Rather than after setup.py is added. |
| # Only the lockfile and package.json are copied so this layer is cached until |
| # the JS dependency set actually changes. |
| COPY airflow/www/yarn.lock airflow/www/package.json ${AIRFLOW_SOURCES}/airflow/www/ |
| |
| WORKDIR ${AIRFLOW_SOURCES}/airflow/www |
| |
| RUN yarn install --frozen-lockfile |
| |
| WORKDIR ${AIRFLOW_SOURCES} |
| |
| # Note! We are copying everything with airflow:airflow user:group even if we use root to run the scripts |
| # This is fine as root user will be able to use those dirs anyway. |
| |
| # Airflow sources change frequently but dependency configuration won't change that often |
| # We copy setup.py and other files needed to perform setup of dependencies |
| # So in case setup.py changes we can install latest dependencies required. |
| COPY setup.py ${AIRFLOW_SOURCES}/setup.py |
| COPY setup.cfg ${AIRFLOW_SOURCES}/setup.cfg |
| |
| # Minimal set of files setup.py needs to compute the version at install time. |
| COPY airflow/version.py ${AIRFLOW_SOURCES}/airflow/version.py |
| COPY airflow/__init__.py ${AIRFLOW_SOURCES}/airflow/__init__.py |
| COPY airflow/bin/airflow ${AIRFLOW_SOURCES}/airflow/bin/airflow |
| |
| # The goal of this line is to install the dependencies from the most current setup.py from sources |
| # This will be usually incremental small set of packages in CI optimized build, so it will be very fast |
| # In non-CI optimized build this will install all dependencies before installing sources. |
| RUN pip install -e ".[${AIRFLOW_EXTRAS}]" |
| |
| WORKDIR ${AIRFLOW_SOURCES}/airflow/www |
| |
| # Copy all www files here so that we can run yarn building for production |
| COPY airflow/www/ ${AIRFLOW_SOURCES}/airflow/www/ |
| |
| # Package NPM for production |
| RUN yarn run prod |
| |
| COPY scripts/docker/entrypoint.sh /entrypoint.sh |
| |
| # Copy selected subdirectories only |
| COPY .github/ ${AIRFLOW_SOURCES}/.github/ |
| COPY dags/ ${AIRFLOW_SOURCES}/dags/ |
| COPY common/ ${AIRFLOW_SOURCES}/common/ |
| COPY licenses/ ${AIRFLOW_SOURCES}/licenses/ |
| COPY scripts/ci/ ${AIRFLOW_SOURCES}/scripts/ci/ |
| COPY docs/ ${AIRFLOW_SOURCES}/docs/ |
| COPY tests/ ${AIRFLOW_SOURCES}/tests/ |
| COPY airflow/ ${AIRFLOW_SOURCES}/airflow/ |
| # NOTE(review): .github is copied above AND listed below; in a multi-source |
| # COPY a directory's *contents* are copied into the target, so the second copy |
| # dumps .github's files into the source root - confirm this is intentional. |
| COPY .coveragerc .rat-excludes .flake8 pylintrc LICENSE MANIFEST.in NOTICE CHANGELOG.txt \ |
| .github pytest.ini \ |
| setup.cfg setup.py \ |
| ${AIRFLOW_SOURCES}/ |
| |
| # Needed for building images via docker-in-docker inside the docker |
| COPY Dockerfile ${AIRFLOW_SOURCES}/Dockerfile |
| |
| # Install autocomplete for airflow |
| RUN register-python-argcomplete airflow >> ~/.bashrc |
| |
| # Install autocomplete for kubectl |
| RUN echo "source /etc/bash_completion" >> ~/.bashrc \ |
| && kubectl completion bash >> ~/.bashrc |
| |
| WORKDIR ${AIRFLOW_SOURCES} |
| |
| # Additional python deps to install (space-separated pip requirement specs, |
| # empty by default so the step is a no-op) |
| ARG ADDITIONAL_PYTHON_DEPS="" |
| |
| RUN if [[ -n "${ADDITIONAL_PYTHON_DEPS}" ]]; then \ |
| pip install ${ADDITIONAL_PYTHON_DEPS}; \ |
| fi |
| |
| WORKDIR ${AIRFLOW_SOURCES} |
| |
| ENV PATH="${HOME}:${PATH}" |
| |
| # Webserver port |
| EXPOSE 8080 |
| |
| # NOTE(review): /usr/local/bin/dumb-init is not installed anywhere in the |
| # visible part of this file - presumably it arrives via a pip dependency; |
| # verify it exists in the final image or the container will fail to start. |
| ENTRYPOINT ["/usr/local/bin/dumb-init", "--", "/entrypoint.sh"] |
| |
| CMD ["--help"] |