ACCUMULO-4706 Initial implementation of Accumulo docker image (#1)

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..335ec95
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+*.tar.gz
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..59e331d
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,79 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+FROM centos:7
+
+# JDK to run Accumulo; make and gcc-c++ for 'accumulo-util build-native'; wget for downloads.
+# The yum cache is cleaned in the same layer so that it is not stored in the image.
+RUN yum install -y java-1.8.0-openjdk-devel make gcc-c++ wget && yum clean all
+ENV JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk
+
+# Optional build arguments; the ENV instructions below supply the defaults.
+ARG HADOOP_VERSION
+ARG HADOOP_USER_NAME
+ARG ZOOKEEPER_VERSION
+
+ENV HADOOP_VERSION=${HADOOP_VERSION:-2.7.4}
+ENV HADOOP_USER_NAME=${HADOOP_USER_NAME:-accumulo}
+ENV ZOOKEEPER_VERSION=${ZOOKEEPER_VERSION:-3.4.9}
+ENV ACCUMULO_VERSION=2.0.0-SNAPSHOT
+
+# Mirrors tried in order by download(); outdated versions may only be found in dist/archive.
+ENV APACHE_DIST_URLS="\
+https://www.apache.org/dyn/closer.cgi?action=download&filename= \
+https://www-us.apache.org/dist/ \
+https://www.apache.org/dist/ \
+https://archive.apache.org/dist/"
+
+# Fetch, unpack and install Hadoop and ZooKeeper in a single layer and delete the tarballs
+# there too, so they do not persist in the image. Comments inside the RUN start in column 0
+# so that Docker strips them. NOTE(review): downloads are not checksum/signature verified.
+RUN set -eux; \
+  download() { \
+    local f="$1"; shift; \
+    local distFile="$1"; shift; \
+    local success=; \
+    local distUrl=; \
+    for distUrl in $APACHE_DIST_URLS; do \
+      if wget -nv -O "$f" "$distUrl$distFile"; then \
+        success=1; \
+        break; \
+      fi; \
+    done; \
+    [ -n "$success" ]; \
+  }; \
+  \
+# Once 2.0.0 is released: uncomment the next line, extract it like the others, drop the ADD.
+# download "/tmp/accumulo.tar.gz" "accumulo/$ACCUMULO_VERSION/accumulo-$ACCUMULO_VERSION-bin.tar.gz"; \
+  download "/tmp/hadoop.tar.gz" "hadoop/core/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz"; \
+  download "/tmp/zookeeper.tar.gz" "zookeeper/zookeeper-$ZOOKEEPER_VERSION/zookeeper-$ZOOKEEPER_VERSION.tar.gz"; \
+  tar xzf /tmp/hadoop.tar.gz -C /tmp/; \
+  tar xzf /tmp/zookeeper.tar.gz -C /tmp/; \
+  mv "/tmp/hadoop-$HADOOP_VERSION" /opt/hadoop; \
+  mv "/tmp/zookeeper-$ZOOKEEPER_VERSION" /opt/zookeeper; \
+  rm -f /tmp/hadoop.tar.gz /tmp/zookeeper.tar.gz
+
+# Until 2.0.0 is released a locally built tarball is used; ADD auto-extracts it into /tmp.
+ADD ./accumulo-$ACCUMULO_VERSION-bin.tar.gz /tmp/
+RUN mv "/tmp/accumulo-$ACCUMULO_VERSION" /opt/accumulo \
+  && /opt/accumulo/bin/accumulo-util build-native
+
+COPY accumulo-site.xml log4j-service.properties log4j-monitor.properties /opt/accumulo/conf/
+
+ENV HADOOP_PREFIX=/opt/hadoop ZOOKEEPER_HOME=/opt/zookeeper ACCUMULO_HOME=/opt/accumulo
+ENV PATH="$PATH:$ACCUMULO_HOME/bin"
+
+ENTRYPOINT ["accumulo"]
+CMD ["help"]
diff --git a/README.md b/README.md
index 0547beb..83f23de 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,145 @@
-# Accumulo Docker Image
+# Apache Accumulo Docker Image
+
+**This is currently a work in progress that depends on unreleased features of Accumulo and will not be ready
+for use until after Accumulo 2.0.0 is released.**  Sometime after Accumulo 2.0.0 is released this project
+will make its first release. Eventually, this project will create an `apache/accumulo` image at DockerHub.
+Until then, you will need to build your own image.
+
+## Obtain the Docker image
+
+To obtain the docker image created by this project, you can either pull it from DockerHub at
+`apache/accumulo` or build it yourself. To pull the image from DockerHub, run the command below:
+
+    docker pull apache/accumulo
+
+While it is easier to pull from DockerHub, the image will default to the software versions below:
+
+| Software    | Version        |
+|-------------|----------------|
+| [Accumulo]  | 2.0.0-SNAPSHOT |
+| [Hadoop]    | 2.7.4          |
+| [Zookeeper] | 3.4.9          |
+
+If these versions do not match what is running on your cluster, you should consider building
+your own image with matching versions. However, Accumulo must be 2.0.0+. Below are instructions for
+building an image:
+
+1. Clone the Accumulo docker repo
+
+        git clone git@github.com:apache/accumulo-docker.git
+
+2. Until Accumulo 2.0.0 is released, build an Accumulo tarball distribution and copy it to the root
+   directory of the repo.
+
+        git clone git@github.com:apache/accumulo.git
+        cd accumulo/
+        mvn clean package
+        cp assemble/target/accumulo-2.0.0-SNAPSHOT-bin.tar.gz /path/to/accumulo-docker/
+
+3. Build the default Accumulo docker image using the command below.
+
+        cd /path/to/accumulo-docker
+        docker build -t accumulo .
+
+   Or build the Accumulo docker image with specific versions of Hadoop, Zookeeper, etc using the command below:
+
+        docker build --build-arg ZOOKEEPER_VERSION=3.4.8 --build-arg HADOOP_VERSION=2.7.0 -t accumulo .
+
+## Image basics
+
+The entrypoint for the Accumulo docker image is the `accumulo` script. While the primary use
+case for this image is to start Accumulo processes (e.g. tserver, master, etc.), you can run other
+commands in the `accumulo` script to test out the image:
+
+```bash
+# No arguments prints Accumulo command usage
+docker run accumulo
+# Print Accumulo version
+docker run accumulo version
+# Print Accumulo classpath
+docker run accumulo classpath
+```
+
+# Run Accumulo using Docker
+
+Before you can run Accumulo services in Docker, you will need to install Accumulo, configure `accumulo-site.xml`,
+and initialize your instance with `--upload-accumulo-site`. This will upload configuration to Zookeeper and limit
+how much configuration needs to be set on the command line.
+
+```bash
+$ accumulo init --upload-accumulo-site
+...
+Uploading properties in accumulo-site.xml to Zookeeper. Properties that cannot be set in Zookeeper will be skipped:
+Skipped - instance.secret = <hidden>
+Skipped - instance.volumes = hdfs://localhost:8020/accumulo
+Skipped - instance.zookeeper.host = localhost:2181
+Uploaded - table.durability = flush
+Uploaded - tserver.memory.maps.native.enabled = false
+Uploaded - tserver.readahead.concurrent.max = 64
+Uploaded - tserver.server.threads.minimum = 64
+Uploaded - tserver.walog.max.size = 512M
+```
+
+Any configuration that is skipped above will need to be passed in as a command line option to Accumulo services running
+in Docker containers. These options can be set in an environment variable which is used in later commands.
+
+```
+export ACCUMULO_CL_OPTS="-o instance.secret=mysecret -o instance.volumes=hdfs://localhost:8020/accumulo -o instance.zookeeper.host=localhost:2181"
+```
+
+The Accumulo docker image expects that the HDFS path set by `instance.volumes` is owned by the `accumulo` user. This
+can be accomplished by running the command below (replace the HDFS path with yours):
+
+```bash
+hdfs dfs -chown -R accumulo hdfs://localhost:8020/accumulo
+```
+
+## Docker engine
+
+Use the `docker` command to start local docker containers. The commands below will start a local Accumulo cluster
+with two tablet servers.
+
+```
+docker run -d --network="host" accumulo monitor $ACCUMULO_CL_OPTS
+docker run -d --network="host" accumulo tserver $ACCUMULO_CL_OPTS
+docker run -d --network="host" accumulo tserver $ACCUMULO_CL_OPTS
+docker run -d --network="host" accumulo master $ACCUMULO_CL_OPTS
+docker run -d --network="host" accumulo gc $ACCUMULO_CL_OPTS
+```
+
+If you would like to set Java heap size inside the Docker container, start the local docker container using the
+command below:
+
+```
+docker run -e ACCUMULO_JAVA_OPTS='-Xmx1g' -d --network="host" accumulo tserver $ACCUMULO_CL_OPTS
+```
+
+## Marathon
+
+Using the Marathon UI, you can start Accumulo services using the following
+JSON configuration template.  The template is configured to start an Accumulo
+monitor but it can be modified to start other Accumulo services such as
+`master`, `tserver` and `gc`. For tablet servers, set `instances` to the number
+of tablet servers that you want to run.
+
+```
+{
+  "id": "accumulo-monitor",
+  "cmd": "accumulo monitor -o instance.secret=mysecret -o instance.volumes=hdfs://localhost:8020/accumulo -o instance.zookeeper.host=localhost:2181",
+  "cpus": 1,
+  "mem": 512,
+  "disk": 0,
+  "instances": 1,
+  "container": {
+    "docker": {
+      "image": "apache/accumulo",
+      "network": "HOST"
+    },
+    "type": "DOCKER"
+  }
+}
+```
+
+[Accumulo]: https://accumulo.apache.org/
+[Hadoop]: https://hadoop.apache.org/
+[Zookeeper]: https://zookeeper.apache.org/
diff --git a/accumulo-site.xml b/accumulo-site.xml
new file mode 100644
index 0000000..09c7394
--- /dev/null
+++ b/accumulo-site.xml
@@ -0,0 +1,55 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!-- This is the main configuration file for Apache Accumulo. Available configuration properties can be
+     found in the Accumulo documentation on the Accumulo project website (https://accumulo.apache.org/) -->
+<configuration>
+  <!-- Set location in HDFS where Accumulo will store data -->
+  <property>
+    <name>instance.volumes</name>
+    <value>hdfs://localhost:8020/accumulo</value>
+  </property>
+  <!-- Set location of Zookeepers -->
+  <property>
+    <name>instance.zookeeper.host</name>
+    <value>localhost:2181</value>
+  </property>
+  <!-- Change secret before initialization. All servers must have same secret -->
+  <property>
+    <name>instance.secret</name>
+    <value>DEFAULT</value>
+  </property>
+  <!-- Set to false if 'accumulo-util build-native' fails -->
+  <property>
+    <name>tserver.memory.maps.native.enabled</name>
+    <value>true</value>
+  </property>
+  <!-- Set a correct user/password below -->
+  <property>
+    <name>trace.user</name>
+    <value>root</value>
+  </property>
+  <property>
+    <name>trace.password</name>
+    <value>secret</value>
+  </property>
+  <property>
+    <name>tserver.port.search</name>
+    <value>true</value>
+  </property>
+</configuration>
diff --git a/log4j-monitor.properties b/log4j-monitor.properties
new file mode 100644
index 0000000..2057ede
--- /dev/null
+++ b/log4j-monitor.properties
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+## Log4j 1.2 file that configures logging for Accumulo Monitor
+## The system properties referenced below are configured by accumulo-env.sh
+
+## Define a console appender
+log4j.appender.console=org.apache.log4j.ConsoleAppender
+log4j.appender.console.Target=System.out
+log4j.appender.console.Threshold=ALL
+log4j.appender.console.layout.ConversionPattern=%d{ISO8601} [%-8c{2}] %-5p: %m%n
+log4j.appender.console.layout=org.apache.log4j.PatternLayout
+
+## Define an appender for the Accumulo Monitor to log to its own web GUI
+log4j.appender.gui=org.apache.accumulo.server.monitor.LogService
+log4j.appender.gui.Threshold=WARN
+
+## Append monitor logs to its own web GUI
+log4j.logger.org.apache.accumulo=INHERITED, gui
+
+## Append most logs to console
+log4j.rootLogger=INFO, console
diff --git a/log4j-service.properties b/log4j-service.properties
new file mode 100644
index 0000000..4588041
--- /dev/null
+++ b/log4j-service.properties
@@ -0,0 +1,49 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+## Log4j 1.2 file that configures logging for all Accumulo services (Master, TabletServer, GC, and Tracer) except Monitor
+## The system properties referenced below are configured by accumulo-env.sh
+
+## Define a console appender
+log4j.appender.console=org.apache.log4j.ConsoleAppender
+log4j.appender.console.Target=System.out
+log4j.appender.console.Threshold=ALL
+log4j.appender.console.layout.ConversionPattern=%d{ISO8601} [%-8c{2}] %-5p: %m%n
+log4j.appender.console.layout=org.apache.log4j.PatternLayout
+
+## Define an appender to send important logs to the primary Accumulo Monitor
+## The primary monitor is the one currently holding a shared lock in ZooKeeper,
+## and is typically the one that started first.
+log4j.appender.monitor=org.apache.accumulo.monitor.util.AccumuloMonitorAppender
+log4j.appender.monitor.Threshold=WARN
+
+## Change this log level from OFF to one of the following to enable audit logging:
+##   INFO
+##     enables audit logging (inherit appenders from root logger)
+##   INFO, audit
+##     enables audit logging using the audit log appender
+##     (requires audit log file appender above to be uncommented)
+log4j.logger.org.apache.accumulo.audit=OFF
+
+## Append logs to the primary Accumulo Monitor
+log4j.logger.org.apache.accumulo=INHERITED, monitor
+
+## Constrain some particularly spammy loggers
+log4j.logger.org.apache.accumulo.core.file.rfile.bcfile=INFO
+log4j.logger.org.mortbay.log=WARN
+log4j.logger.org.apache.zookeeper=ERROR
+
+## Append most logs to console
+log4j.rootLogger=INFO, console