| #!/usr/bin/env bash |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
# The sub-command to hand over to Airflow (or to bash/python) later.
# Might be empty - `${1:-}` defaults to "" so this is safe under `set -u`.
AIRFLOW_COMMAND="${1:-}"

set -euo pipefail

# This one is to workaround https://github.com/apache/airflow/issues/17546
# issue with /usr/lib/<MACHINE>-linux-gnu/libstdc++.so.6: cannot allocate memory in static TLS block
# We do not yet have a more "correct" solution to the problem but in order to avoid raising new issues
# by users of the prod image, we implement the workaround now.
# The side effect of this is slightly (in the range of 100s of milliseconds) slower load for any
# binary started and a little memory used for Heap allocated by initialization of libstdc++
# This overhead is not happening for binaries that already link dynamically libstdc++
LD_PRELOAD="/usr/lib/$(uname -m)-linux-gnu/libstdc++.so.6"
export LD_PRELOAD
| |
function run_check_with_retries {
    # Evaluates the given command repeatedly until it succeeds or the retry
    # budget is exhausted.
    #
    # Arguments: $1 - the command line to evaluate (via `eval`)
    # Globals:   CONNECTION_CHECK_MAX_COUNT  (read) - number of attempts
    #            CONNECTION_CHECK_SLEEP_TIME (read) - seconds between attempts
    # Outputs:   a "." per failed attempt; on final failure, the last command
    #            output is echoed for diagnosis
    # Exits:     with status 1 when all attempts failed
    local check_command
    check_command="${1}"
    local attempts_left
    attempts_left="${CONNECTION_CHECK_MAX_COUNT}"
    local exit_code
    local check_output

    while true
    do
        # Temporarily disable errexit so a failing check does not abort the script.
        set +e
        check_output=$(eval "${check_command} 2>&1")
        exit_code=$?
        set -e
        if [[ ${exit_code} == 0 ]]; then
            echo
            break
        fi
        # Progress indicator - one dot per failed attempt.
        echo -n "."
        attempts_left=$((attempts_left-1))
        if [[ ${attempts_left} == 0 ]]; then
            echo
            echo "ERROR! Maximum number of retries (${CONNECTION_CHECK_MAX_COUNT}) reached."
            echo
            echo "Last check result:"
            echo "$ ${check_command}"
            echo "${check_output}"
            echo
            exit 1
        fi
        sleep "${CONNECTION_CHECK_SLEEP_TIME}"
    done
}
| |
function run_nc() {
    # Checks if it is possible to connect to the host/port using netcat.
    #
    # We want to avoid misleading messages and perform only forward lookup of the service IP address.
    # Netcat when run without -n performs both forward and reverse lookup and fails if the reverse
    # lookup name does not match the original name even if the host is reachable via IP. This happens
    # randomly with docker-compose in GitHub Actions.
    # Since we are not using reverse lookup elsewhere, we can perform forward lookup in python
    # And use the IP in NC and add '-n' switch to disable any DNS use.
    # Even if this message might be harmless, it might hide the real reason for the problem
    # Which is the long time needed to start some services, seeing this message might be totally misleading
    # when you try to analyse the problem, that's why it's best to avoid it.
    #
    # Arguments: $1 - hostname to resolve, $2 - TCP port to probe
    local host="${1}"
    local port="${2}"
    local ip
    # Pass the host via argv (consistent with wait_for_connection) instead of
    # interpolating it into the Python source: interpolation breaks on quotes
    # in the hostname and allows arbitrary code execution through the
    # connection URL.
    ip=$(python -c "import socket; import sys; print(socket.gethostbyname(sys.argv[1]))" "${host}")
    nc -zvvn "${ip}" "${port}"
}
| |
| |
function wait_for_connection {
    # Waits for Connection to the backend specified via URL passed as first parameter
    # Detects backend type depending on the URL schema and assigns
    # default port numbers if not specified in the URL.
    # Then it loops until connection to the host/port specified can be established
    # It tries `CONNECTION_CHECK_MAX_COUNT` times and sleeps `CONNECTION_CHECK_SLEEP_TIME` between checks
    #
    # Side effects: sets BACKEND, DB_HOST and DB_PORT globals and marks them
    # readonly. Each may be pre-set in the environment to override detection.
    local connection_url
    connection_url="${1}"
    # Parse the URL with Python's urlsplit; the URL is passed via sys.argv so
    # special characters in it cannot break (or inject into) the Python code.
    local detected_backend
    detected_backend=$(python -c "from urllib.parse import urlsplit; import sys; print(urlsplit(sys.argv[1]).scheme)" "${connection_url}")
    local detected_host
    detected_host=$(python -c "from urllib.parse import urlsplit; import sys; print(urlsplit(sys.argv[1]).hostname or '')" "${connection_url}")
    local detected_port
    detected_port=$(python -c "from urllib.parse import urlsplit; import sys; print(urlsplit(sys.argv[1]).port or '')" "${connection_url}")

    # Echoed so the resolved value is visible in the container logs.
    # `${BACKEND:=...}` keeps a caller-provided BACKEND, otherwise assigns the detected one.
    echo BACKEND="${BACKEND:=${detected_backend}}"
    readonly BACKEND

    # Fall back to the backend's well-known port when the URL did not carry one.
    # The "=" inside ${detected_port=} guards against "unbound variable" under `set -u`.
    if [[ -z "${detected_port=}" ]]; then
        if [[ ${BACKEND} == "postgres"* ]]; then
            detected_port=5432
        elif [[ ${BACKEND} == "mysql"* ]]; then
            detected_port=3306
        elif [[ ${BACKEND} == "mssql"* ]]; then
            detected_port=1433
        elif [[ ${BACKEND} == "redis"* ]]; then
            detected_port=6379
        elif [[ ${BACKEND} == "amqp"* ]]; then
            detected_port=5672
        fi
    fi

    detected_host=${detected_host:="localhost"}

    # Allow the DB parameters to be overridden by environment variable
    echo DB_HOST="${DB_HOST:=${detected_host}}"
    readonly DB_HOST

    echo DB_PORT="${DB_PORT:=${detected_port}}"
    readonly DB_PORT
    if [[ -n "${DB_HOST=}" ]] && [[ -n "${DB_PORT=}" ]]; then
        # ${var@Q} expands to a shell-quoted form (bash 4.4+), so the values
        # survive the `eval` inside run_check_with_retries unmangled.
        run_check_with_retries "run_nc ${DB_HOST@Q} ${DB_PORT@Q}"
    else
        >&2 echo "The connection details to the broker could not be determined. Connectivity checks were skipped."
    fi
}
| |
function create_www_user() {
    # Creates the Airflow web UI admin account from _AIRFLOW_WWW_USER_* variables.
    # Warning: command environment variables (*_CMD) have priority over usual
    # configuration variables for configuration parameters that require sensitive
    # information. This is the case for the SQL database and the broker backend
    # in this entrypoint script.
    local admin_password=""
    if [[ -n "${_AIRFLOW_WWW_USER_PASSWORD_CMD=}" ]]; then
        # The *_CMD variant is executed and its stdout becomes the password.
        admin_password=$(eval "${_AIRFLOW_WWW_USER_PASSWORD_CMD}")
        unset _AIRFLOW_WWW_USER_PASSWORD_CMD
    elif [[ -n "${_AIRFLOW_WWW_USER_PASSWORD=}" ]]; then
        admin_password="${_AIRFLOW_WWW_USER_PASSWORD}"
        unset _AIRFLOW_WWW_USER_PASSWORD
    fi
    if [[ -z "${admin_password}" ]]; then
        echo
        echo "ERROR! Airflow Admin password not set via _AIRFLOW_WWW_USER_PASSWORD or _AIRFLOW_WWW_USER_PASSWORD_CMD variables!"
        echo
        exit 1
    fi

    # The `${VAR="default"}` expansions assign the default only when the
    # variable is unset; `|| true` keeps restarts from failing when the
    # user already exists.
    airflow users create \
        --username "${_AIRFLOW_WWW_USER_USERNAME="admin"}" \
        --firstname "${_AIRFLOW_WWW_USER_FIRSTNAME="Airflow"}" \
        --lastname "${_AIRFLOW_WWW_USER_LASTNAME="Admin"}" \
        --email "${_AIRFLOW_WWW_USER_EMAIL="airflowadmin@example.com"}" \
        --role "${_AIRFLOW_WWW_USER_ROLE="Admin"}" \
        --password "${admin_password}" || true
}
| |
function create_system_user_if_missing() {
    # This is needed in case of OpenShift-compatible container execution. In case
    # of OpenShift a random user id is used when starting the image, however group 0
    # is kept as the user group. Our production image is OpenShift compatible, so
    # all permissions on all folders are set so that the 0 group can exercise the
    # same privileges as the default "airflow" user. This code checks if the
    # current user is already present in /etc/passwd and, when it is not,
    # registers it dynamically with HOME set to ${AIRFLOW_USER_HOME_DIR} so that
    # (for example) the ${HOME}/.local folder where airflow is installed can be
    # automatically added to PYTHONPATH.
    if whoami &> /dev/null; then
        # User is resolvable via /etc/passwd - nothing to do.
        return
    fi
    if [[ -w /etc/passwd ]]; then
        echo "${USER_NAME:-default}:x:$(id -u):0:${USER_NAME:-default} user:${AIRFLOW_USER_HOME_DIR}:/sbin/nologin" \
            >> /etc/passwd
    fi
    export HOME="${AIRFLOW_USER_HOME_DIR}"
}
| |
function set_pythonpath_for_root_user() {
    # Airflow is installed as a local user application, which means that when the
    # container runs as root the application is not visible, because Python then
    # only loads system-wide packages. Prepend the "airflow" user's local
    # site-packages to PYTHONPATH so those packages load for root too.
    # No-op for any non-root UID.
    if [[ $UID != "0" ]]; then
        return
    fi
    local python_version_short
    # e.g. "Python 3.8.12" -> "3.8"
    python_version_short="$(python --version | cut -d " " -f 2 | cut -d "." -f 1-2)"
    export PYTHONPATH="${AIRFLOW_USER_HOME_DIR}/.local/lib/python${python_version_short}/site-packages:${PYTHONPATH:-}"
    >&2 echo "The container is run as root user. For security, consider using a regular user account."
}
| |
function wait_for_airflow_db() {
    # Waits for the metadata database by retrying `airflow db check` (which
    # validates a real database connection) with the script-wide
    # CONNECTION_CHECK_MAX_COUNT / CONNECTION_CHECK_SLEEP_TIME settings.
    run_check_with_retries "airflow db check"
}
| |
function upgrade_db() {
    # Runs airflow db upgrade (metadata database schema migration).
    # NOTE(review): errors are swallowed by `|| true` - presumably so that
    # container startup continues even when the upgrade cannot run; confirm
    # this best-effort behavior is intended before changing it.
    airflow db upgrade || true
}
| |
function wait_for_celery_broker() {
    # Verifies connectivity to the Celery broker, but only when the configured
    # core executor is CeleryExecutor; for any other executor this is a no-op.
    local configured_executor
    configured_executor="$(airflow config get-value core executor)"
    if [[ "${configured_executor}" != "CeleryExecutor" ]]; then
        return
    fi
    local broker_url
    broker_url="$(airflow config get-value celery broker_url)"
    wait_for_connection "${broker_url}"
}
| |
function exec_to_bash_or_python_command_if_specified() {
    # When the image was started with the 'bash' or 'python' command, replace
    # the entrypoint process with that command (forwarding the remaining
    # arguments). For any other command this function returns and the
    # entrypoint continues.
    case "${AIRFLOW_COMMAND}" in
        bash)
            shift
            exec "/bin/bash" "${@}"
            ;;
        python)
            shift
            exec "python" "${@}"
            ;;
    esac
}
| |
function check_uid_gid() {
    # Validates the UID/GID combination the container was started with.
    # GID 0 is always accepted (the OpenShift-compatible arbitrary-UID setup).
    # The default 'airflow' user (UID 50000) with a non-zero GID gets a warning
    # only; any other UID with a non-zero GID is a hard error (exit 1).
    local current_uid current_gid
    current_uid="$(id -u)"
    current_gid="$(id -g)"

    if [[ ${current_gid} == "0" ]]; then
        return
    fi

    if [[ ${current_uid} == "50000" ]]; then
        >&2 echo
        >&2 echo "WARNING! You should run the image with GID (Group ID) set to 0"
        >&2 echo " even if you use 'airflow' user (UID=50000)"
        >&2 echo
        >&2 echo " You started the image with UID=${current_uid} and GID=${current_gid}"
        >&2 echo
        >&2 echo " This is to make sure you can run the image with an arbitrary UID in the future."
        >&2 echo
        >&2 echo " See more about it in the Airflow's docker image documentation"
        >&2 echo " http://airflow.apache.org/docs/docker-stack/entrypoint"
        >&2 echo
        # We still allow the image to run with the `airflow` user.
        return
    fi

    >&2 echo
    >&2 echo "ERROR! You should run the image with GID=0"
    >&2 echo
    >&2 echo " You started the image with UID=${current_uid} and GID=${current_gid}"
    >&2 echo
    >&2 echo "The image should always be run with GID (Group ID) set to 0 regardless of the UID used."
    >&2 echo " This is to make sure you can run the image with an arbitrary UID."
    >&2 echo
    >&2 echo " See more about it in the Airflow's docker image documentation"
    >&2 echo " http://airflow.apache.org/docs/docker-stack/entrypoint"
    # This will not work so we fail hard
    exit 1
}
| |
# In Airflow image we are setting PIP_USER variable to true, in order to install all the packages
# by default with the ``--user`` flag. However this is a problem if a virtualenv is created later
# which happens in PythonVirtualenvOperator. We are unsetting this variable here, so that it is
# not set when PIP is run by Airflow later on
unset PIP_USER

check_uid_gid

# Set umask to 0002 to make all the directories created by the current user group-writeable
# This allows the same directories to be writeable for any arbitrary user the image will be
# run with, when the directory is created on a mounted volume and when that volume is later
# reused with a different UID (but with GID=0)
umask 0002

# Retry budget and pause used by run_check_with_retries; both may be set by
# the caller's environment before this point. Setting the count to 0 skips
# the connectivity checks below entirely.
CONNECTION_CHECK_MAX_COUNT=${CONNECTION_CHECK_MAX_COUNT:=20}
readonly CONNECTION_CHECK_MAX_COUNT

CONNECTION_CHECK_SLEEP_TIME=${CONNECTION_CHECK_SLEEP_TIME:=3}
readonly CONNECTION_CHECK_SLEEP_TIME

create_system_user_if_missing
set_pythonpath_for_root_user
if [[ "${CONNECTION_CHECK_MAX_COUNT}" -gt "0" ]]; then
    wait_for_airflow_db
fi
| |
# Optionally upgrade the metadata DB schema - triggered by _AIRFLOW_DB_UPGRADE.
if [[ -n "${_AIRFLOW_DB_UPGRADE=}" ]] ; then
    upgrade_db
fi

# Optionally create the web UI admin user - triggered by _AIRFLOW_WWW_USER_CREATE.
if [[ -n "${_AIRFLOW_WWW_USER_CREATE=}" ]] ; then
    create_www_user
fi

# Optionally install additional pip requirements at container start.
# This is a development/test convenience only - see the warning below.
if [[ -n "${_PIP_ADDITIONAL_REQUIREMENTS=}" ]] ; then
    >&2 echo
    >&2 echo "!!!!! Installing additional requirements: '${_PIP_ADDITIONAL_REQUIREMENTS}' !!!!!!!!!!!!"
    >&2 echo
    >&2 echo "WARNING: This is a development/test feature only. NEVER use it in production!"
    >&2 echo " Instead, build a custom image as described in"
    >&2 echo
    >&2 echo " https://airflow.apache.org/docs/docker-stack/build.html"
    >&2 echo
    >&2 echo " Adding requirements at container startup is fragile and is done every time"
    >&2 echo " the container starts, so it is only useful for testing and trying out"
    >&2 echo " of adding dependencies."
    >&2 echo
    # Word splitting is intentional here: the variable may hold several
    # space-separated requirement specifiers.
    # shellcheck disable=SC2086
    pip install --root-user-action ignore --no-cache-dir ${_PIP_ADDITIONAL_REQUIREMENTS}
fi
| |
| |
| # The `bash` and `python` commands should also verify the basic connections |
| # So they are run after the DB check |
| exec_to_bash_or_python_command_if_specified "${@}" |
| |
| # Remove "airflow" if it is specified as airflow command |
| # This way both command types work the same way: |
| # |
| # docker run IMAGE airflow webserver |
| # docker run IMAGE webserver |
| # |
| if [[ ${AIRFLOW_COMMAND} == "airflow" ]]; then |
| AIRFLOW_COMMAND="${2:-}" |
| shift |
| fi |
| |
| # Note: the broker backend configuration concerns only a subset of Airflow components |
| if [[ ${AIRFLOW_COMMAND} =~ ^(scheduler|celery)$ ]] \ |
| && [[ "${CONNECTION_CHECK_MAX_COUNT}" -gt "0" ]]; then |
| wait_for_celery_broker |
| fi |
| |
| exec "airflow" "${@}" |