Refactor Docker image to allow faster rebuilds (#23) * Convert Docker image to multi-stage build * Replace empty "_FILE" build args with a default of _NOT_SET and check for that default when deciding whether or not to download the tarball from Apache servers. A "COPY" command with an empty build arg ends up copying the entire build context into the image, which is not what we want. * Change glob patterns for extracted archive copy so that it does not depend on the "*_VERSION" variables. Now one can change the included file without having to update the corresponding VERSION variable. * Fix a potential issue where if the "_FILE" arg is specified but names a non-existent file, the download script would silently try to download whatever version was specified in the "_VERSION" variable. This could lead to an unintended version getting included in the final image. Instead, the download script now fails if a "_FILE" build arg is set, but the corresponding file does not exist.

commit: 7cf9ca9573f45b7844d48df99d3195a6dc09d2f7 [log] [tgz]
author: Brian Loss <brianloss@gmail.com> Mon Sep 19 18:39:29 2022 -0400
committer: GitHub <noreply@github.com> Mon Sep 19 18:39:29 2022 -0400
tree: 072471a43d5b00cb6c31269830dbe8278ee45cbb
parent: 3ef2e2396b97cd2068c5b2d185063a80f834b5fa [diff]
diff --git a/Dockerfile b/Dockerfile
index 5e1984e..6aa631e 100644
--- a/Dockerfile
+++ b/Dockerfile

@@ -13,72 +13,110 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-FROM rockylinux:9
 
-ARG ACCUMULO_VERSION=2.1.0
-ARG HADOOP_VERSION=3.3.3
-ARG ZOOKEEPER_VERSION=3.8.0
-ARG HADOOP_USER_NAME=accumulo
-ARG ACCUMULO_FILE=
-ARG HADOOP_FILE=
-ARG ZOOKEEPER_FILE=
+##
+## Base image. Rocky Linux 9 with updates, JRE 11 headless, and updated CA certs.
+##
+FROM rockylinux:9 as base
 
-ENV JAVA_HOME=/usr/lib/jvm/java-11-openjdk HADOOP_USER_NAME=$HADOOP_USER_NAME
-
-ENV APACHE_DIST_URLS \
-  https://www.apache.org/dyn/closer.cgi?action=download&filename= \
-# if the version is outdated (or we're grabbing the .asc file), we might have to pull from the dist/archive :/
-  https://www-us.apache.org/dist/ \
-  https://www.apache.org/dist/ \
-  https://archive.apache.org/dist/
-
-COPY README.md $ACCUMULO_FILE $HADOOP_FILE $ZOOKEEPER_FILE /tmp/
-
-RUN yum install -y ca-certificates java-11-openjdk-devel make gcc-c++ wget && \
+RUN set -eux; \
+  yum install -y ca-certificates java-11-openjdk-headless && \
   update-ca-trust extract && \
-  set -eux; \
-  download() { \
-    local f="$1"; shift; \
-    local distFile="$1"; shift; \
-    local success=; \
-    local distUrl=; \
-    for distUrl in $APACHE_DIST_URLS; do \
-      if wget -nv -O "$f" "$distUrl$distFile"; then \
-        success=1; \
-        break; \
-      fi; \
-    done; \
-    [ -n "$success" ]; \
-  }; \
-  \
-  if [ -z "$HADOOP_FILE" ]; then \
-    download "hadoop.tar.gz" "hadoop/core/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz"; \
-  else \
-    mv "/tmp/$HADOOP_FILE" "hadoop.tar.gz"; \
-  fi; \
-  if [ -z "$ZOOKEEPER_FILE" ]; then \
-    download "zookeeper.tar.gz" "zookeeper/zookeeper-$ZOOKEEPER_VERSION/apache-zookeeper-$ZOOKEEPER_VERSION-bin.tar.gz"; \
-  else \
-    mv "/tmp/$ZOOKEEPER_FILE" "zookeeper.tar.gz"; \
-  fi; \
-  if [ -z "$ACCUMULO_FILE" ]; then \
-    download "accumulo.tar.gz" "accumulo/$ACCUMULO_VERSION/accumulo-$ACCUMULO_VERSION-bin.tar.gz"; \
-  else \
-    mv "/tmp/$ACCUMULO_FILE" "accumulo.tar.gz"; \
-  fi && \
-  tar xzf accumulo.tar.gz -C /tmp/ && \
-  tar xzf hadoop.tar.gz -C /tmp/ && \
-  tar xzf zookeeper.tar.gz -C /tmp/ && \
-  mv /tmp/hadoop-$HADOOP_VERSION /opt/hadoop && \
-  mv /tmp/apache-zookeeper-$ZOOKEEPER_VERSION-bin /opt/zookeeper && \
-  mv /tmp/accumulo-$ACCUMULO_VERSION* /opt/accumulo && \
-  rm -f accumulo.tar.gz hadoop.tar.gz zookeeper.tar.gz && \
-  rm -rf /opt/hadoop/share/doc/hadoop && \
+  yum clean all && \
+  rm -rf /var/cache/yum
+
+##
+## Base image for building. Adds the JDK, make and gcc-c++ (for building Accumulo native libs), and wget (used by download.sh).
+##
+FROM base AS buildbase
+
+RUN set -eux; \
+  yum install -y gcc-c++ java-11-openjdk-devel make wget && \
+  update-ca-trust extract
+
+COPY download.sh /usr/local/bin/
+
+##
+## Hadoop image. Download (or copy in) and extract the Hadoop installation.
+##
+FROM buildbase AS hadoop
+
+ARG HADOOP_VERSION=3.3.3 \
+  HADOOP_FILE=_NOT_SET
+
+# download.sh is always present in the build context, so this COPY succeeds
+# even when the optional ${HADOOP_FILE}* glob matches nothing (no tarball was
+# supplied). The _NOT_SET default matters: with an empty value the source
+# would collapse to a bare '*' and copy the entire build context, which is
+# not what we want.
+COPY download.sh ${HADOOP_FILE}* /tmp/
+
+RUN set -eux; \
+  download.sh "${HADOOP_FILE}" "hadoop.tar.gz" "hadoop/core/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz"; \
+  tar xzf hadoop.tar.gz -C /tmp/; \
+  mv /tmp/hadoop-*/ /opt/hadoop; \
+  rm -rf /opt/hadoop/share/doc/hadoop
+
+##
+## ZooKeeper image. Download (or copy in) and extract the ZooKeeper installation.
+##
+FROM buildbase AS zookeeper
+
+ARG ZOOKEEPER_VERSION=3.8.0 \
+  ZOOKEEPER_FILE=_NOT_SET
+
+# download.sh is always present in the build context, so this COPY succeeds
+# even when the optional ${ZOOKEEPER_FILE}* glob matches nothing (no tarball
+# was supplied). The _NOT_SET default matters: with an empty value the source
+# would collapse to a bare '*' and copy the entire build context.
+COPY download.sh ${ZOOKEEPER_FILE}* /tmp/
+
+RUN set -eux; \
+  download.sh "${ZOOKEEPER_FILE}" "zookeeper.tar.gz" "zookeeper/zookeeper-${ZOOKEEPER_VERSION}/apache-zookeeper-${ZOOKEEPER_VERSION}-bin.tar.gz"; \
+  tar xzf zookeeper.tar.gz -C /tmp/; \
+  mv /tmp/apache-zookeeper-*/ /opt/zookeeper
+
+##
+## Accumulo image. Download (or copy in) and extract the Accumulo installation, build native libs, and copy in properties.
+##
+FROM buildbase AS accumulo
+
+ENV JAVA_HOME=/usr/lib/jvm/java-11-openjdk
+
+ARG ACCUMULO_VERSION=2.1.0 \
+  ACCUMULO_FILE=_NOT_SET
+
+# download.sh is always present in the build context, so this COPY succeeds
+# even when the optional ${ACCUMULO_FILE}* glob matches nothing (no tarball
+# was supplied). The _NOT_SET default matters: with an empty value the source
+# would collapse to a bare '*' and copy the entire build context.
+COPY download.sh ${ACCUMULO_FILE}* /tmp/
+
+RUN set -eux; \
+  download.sh "${ACCUMULO_FILE}" "accumulo.tar.gz" "accumulo/${ACCUMULO_VERSION}/accumulo-${ACCUMULO_VERSION}-bin.tar.gz"; \
+  tar xzf accumulo.tar.gz -C /tmp/; \
+  mv /tmp/accumulo-*/ /opt/accumulo; \
   /opt/accumulo/bin/accumulo-util build-native
 
-ADD properties/ /opt/accumulo/conf/
+COPY properties/ /opt/accumulo/conf/
 
-ENV HADOOP_HOME=/opt/hadoop ZOOKEEPER_HOME=/opt/zookeeper ACCUMULO_HOME=/opt/accumulo PATH="$PATH:/opt/accumulo/bin"
+##
+## Runtime image: the slim base plus the hadoop, zookeeper, and accumulo installations prepared by the build stages.
+## NOTE(review): base installs only java-11-openjdk-headless -- confirm the JAVA_HOME path below exists without the -devel package.
+##
+FROM base
+
+ARG HADOOP_USER_NAME=accumulo
+ENV ACCUMULO_HOME=/opt/accumulo \
+  HADOOP_HOME=/opt/hadoop \
+  HADOOP_USER_NAME=${HADOOP_USER_NAME} \
+  JAVA_HOME=/usr/lib/jvm/java-11-openjdk \
+  ZOOKEEPER_HOME=/opt/zookeeper \
+  PATH="${PATH}:/opt/accumulo/bin"
+
+COPY --from=hadoop /opt/hadoop /opt/hadoop
+COPY --from=zookeeper /opt/zookeeper /opt/zookeeper
+COPY --from=accumulo /opt/accumulo /opt/accumulo
 
 ENTRYPOINT ["accumulo"]
 CMD ["help"]

diff --git a/README.md b/README.md
index e94185b..4b5af52 100644
--- a/README.md
+++ b/README.md

@@ -18,7 +18,7 @@
 |-------------|---------------|
 | [Accumulo]  | 2.1.0         |
 | [Hadoop]    | 3.3.3         |
-| [ZooKeeper] | 3.7.1         |
+| [ZooKeeper] | 3.8.0         |
 
 If these versions do not match what is running on your cluster, you should consider building
 your own image with matching versions. However, Accumulo must be 2.0.0+. Below are instructions for
@@ -39,7 +39,7 @@
 
    Or build with an Accumulo tarball (located in same directory as DockerFile) using the command below:
 
-        docker build --build-arg ACCUMULO_VERSION=2.0.0-SNAPSHOT --build-arg ACCUMULO_FILE=accumulo-2.0.0-SNAPSHOT-bin.tar.gz -t accumulo .
+        docker build --build-arg ACCUMULO_FILE=accumulo-2.0.0-SNAPSHOT-bin.tar.gz -t accumulo .
 
 ## Image basics
 

diff --git a/download.sh b/download.sh
new file mode 100755
index 0000000..985caa3
--- /dev/null
+++ b/download.sh

@@ -0,0 +1,53 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+APACHE_DIST_URLS=(
+  "https://www.apache.org/dyn/closer.cgi?action=download&filename="
+  # if the version is outdated (or we're grabbing the .asc file), we might have to pull from the dist/archive :/
+  "https://www-us.apache.org/dist/"
+  "https://www.apache.org/dist/"
+  "https://archive.apache.org/dist/"
+)
+
+# download FILE DIST_PATH: try each entry of APACHE_DIST_URLS in order and save
+# DIST_PATH as FILE from the first one that succeeds; non-zero if all fail.
+download() {
+  local f="$1" distFile="$2" distUrl
+  for distUrl in "${APACHE_DIST_URLS[@]}"; do
+    echo "Attempting to fetch $f from $distUrl$distFile"
+    if wget -nv -O "$f" "$distUrl$distFile"; then
+      return 0
+    fi
+    rm -f "$f"  # wget -O creates/truncates the target even when the fetch fails
+  done
+  echo "Unable to fetch $distFile from any Apache mirror" >&2
+  return 1
+}
+
+# Args: existing_file (a file under /tmp, or _NOT_SET to download), download_file (output name), dist_file (path on the mirrors).
+[[ $# -ge 3 ]] || { echo "Usage: $0 <existing_file|_NOT_SET> <download_file> <dist_file>" >&2; exit 2; }
+existing_file=$1 download_file=$2 dist_file=$3
+
+if [[ "$existing_file" == "_NOT_SET" ]]; then
+  download "$download_file" "$dist_file"
+else
+  [ -f "/tmp/$existing_file" ] || { echo "Existing file $existing_file does not exist" >&2; exit 1; }
+  echo "Skipping download of $existing_file"
+  mv "/tmp/$existing_file" "$download_file"
+fi
commit	7cf9ca9573f45b7844d48df99d3195a6dc09d2f7	[log] [tgz]
author	Brian Loss <brianloss@gmail.com>	Mon Sep 19 18:39:29 2022 -0400
committer	GitHub <noreply@github.com>	Mon Sep 19 18:39:29 2022 -0400
tree	072471a43d5b00cb6c31269830dbe8278ee45cbb
parent	3ef2e2396b97cd2068c5b2d185063a80f834b5fa [diff]