blob: 6a4f30737cbf4a557fc0e4f9db057cd33ec70d51 [file] [log] [blame]
###############################################################################
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
###############################################################################
# This image contains a Python SDK build and dependencies.
# By default it runs wordcount against a locally accessible HDFS service.
# See hdfs_integration_test.sh for example usage.
# NOTE(review): python:2 is EOL; tag kept because the Beam SDK built below is
# the Python 2 SDK — migrating the base image would break this build.
FROM python:2
WORKDIR /app
# hdfscli (installed as a Beam dependency) reads HDFS connection settings
# from this config file at runtime. key=value form per current ENV syntax.
ENV HDFSCLI_CONFIG=/app/sdks/python/apache_beam/io/hdfs_integration_test/hdfscli.cfg
# holdup: waits at container start until the HDFS endpoints accept connections.
# gsutil: used in the next step to fetch the wordcount input text.
RUN pip install --no-cache-dir holdup gsutil
# Fetch the input corpus into /app at build time so the container needs no
# GCS access when it runs.
RUN gsutil cp gs://dataflow-samples/shakespeare/kinglear.txt .
# Install Beam and dependencies. COPY (not ADD) for plain local directories
# (hadolint DL3020).
COPY sdks/python /app/sdks/python
COPY model /app/model
# Build an sdist from the copied-in SDK source and install it with the GCP
# extras. WORKDIR replaces `RUN cd …` (hadolint DL3003); `tail -n1` picks the
# newest tarball if several are present.
WORKDIR /app/sdks/python
RUN python setup.py sdist && \
    pip install --no-cache-dir $(ls dist/apache-beam-*.tar.gz | tail -n1)[gcp]
# Restore the working directory the CMD below depends on (kinglear.txt is
# in /app).
WORKDIR /app
# Run wordcount, and write results to HDFS. Shell form is intentional here:
# the command needs `&&` chaining, glob/variable handling, and a shell wait
# pattern; the fixed 45s sleep gives the namenode time to leave safe mode.
CMD holdup -t 45 http://namenode:50070 http://datanode:50075 && \
    echo "Waiting for safe mode to end." && \
    sleep 45 && \
    hdfscli -v -v -v upload -f kinglear.txt / && \
    python -m apache_beam.examples.wordcount \
        --input hdfs://kinglear* \
        --output hdfs://py-wordcount-integration \
        --hdfs_host namenode --hdfs_port 50070 --hdfs_user root