# blob: 61d9379b7ce98dbf08f31d08a71d8dcfbfd50cb0 [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Base image: Ubuntu 14.04, pinned by tag.
FROM ubuntu:14.04
# MAINTAINER is deprecated; the same author string is recorded as a label instead.
# NOTE(review): the address between <> is empty in this file — confirm the intended contact.
LABEL maintainer="Nutch Developers <>"
# Relative paths in later instructions (downloads, checkout) resolve against /root/.
WORKDIR /root/
# Refresh the package index and install software-properties-common,
# the package that ships the add-apt-repository command.
RUN apt-get update \
 && apt-get install -y software-properties-common
# Enable the multiverse and restricted Ubuntu components and the
# webupd8team Java PPA, then refresh the package index and upgrade
# the packages that are already installed.
# NOTE(review): confirm that ppa:webupd8team/java is still being served.
RUN add-apt-repository -y multiverse \
 && add-apt-repository -y restricted \
 && add-apt-repository -y ppa:webupd8team/java \
 && apt-get update \
 && apt-get upgrade -y
# Install Oracle Java 7 from the PPA and make it the default JVM.
# The debconf pre-seed accepts the Oracle license so the installer does not prompt.
# apt-get update runs in this same layer so the install never relies on a
# package index cached by an earlier (possibly stale) layer.
RUN echo oracle-java7-installer shared/accepted-oracle-license-v1-1 select true | /usr/bin/debconf-set-selections && \
    apt-get update && \
    apt-get install -y oracle-java7-installer oracle-java7-set-default
# Install build and runtime dependencies.
# The index is refreshed in the same layer as the install to avoid a stale
# cached index; packages are listed one per line, alphabetically, so that
# additions and removals produce small diffs.
RUN apt-get update && apt-get install -y \
        ant \
        build-essential \
        curl \
        maven \
        openssh-server \
        rsync \
        subversion \
        telnet \
        vim \
        zookeeperd
# Download the release archives and check out the Nutch sources into /root/.
# Later steps unpack /root/hadoop-2.5.1.tar.gz, /root/protobuf-2.5.0.tar.gz and
# /root/hbase-0.98.8-hadoop2-bin.tar.gz, and move /root/nutch-sources to /opt.
# NOTE(review): the three wget URLs are empty strings and the svn checkout has
# no repository URL in this file — restore them before building. Which wget
# fetches which archive cannot be determined from here.
RUN wget -q ''
RUN wget -q ''
RUN wget -q ''
RUN svn checkout nutch-sources
# Create the hadoop group and the hduser account that own and run Hadoop.
# --disabled-password and --gecos "" stop adduser from prompting for a password
# and user details, which cannot be answered during a non-interactive build.
RUN addgroup hadoop && \
    adduser --disabled-password --gecos "" --ingroup hadoop hduser
# Add the zookeeper account to the hadoop group as a supplementary group.
# NOTE(review): the zookeeper account is presumably created by the zookeeperd package — verify.
RUN usermod -a -G hadoop zookeeper
# Setup SSH keys for passwordless access:
# generate an RSA key pair with an empty passphrase for hduser, then append
# its public key to hduser's authorized_keys.
# ssh-keygen writes the public half next to the private key as id_rsa.pub;
# reading the bare .ssh/ directory instead would leave authorized_keys empty.
RUN su -l -c 'ssh-keygen -t rsa -f /home/hduser/.ssh/id_rsa -P ""' hduser && \
    cat /home/hduser/.ssh/id_rsa.pub | su -l -c 'tee -a /home/hduser/.ssh/authorized_keys' hduser
# Install the SSH client configuration for hduser.
# COPY is used because this is a plain local file; none of ADD's extra
# behaviours (archive extraction, URL fetch) are wanted here.
COPY config/ssh-config /home/hduser/.ssh/config
# Restrict the file to its owner and hand it to hduser in a single layer.
RUN chmod 600 /home/hduser/.ssh/config && \
    chown hduser /home/hduser/.ssh/config
# Fix the SSH daemon problem with docker (originally noted for Ubuntu 13.10):
# relax the pam_loginuid session module from "required" to "optional" in the
# sshd PAM configuration.
# NOTE(review): the sed expression was truncated in this file (unterminated
# s command); it is restored here to the usual pam_loginuid workaround —
# confirm against the project's upstream copy.
RUN sed -ri 's/session[[:blank:]]+required[[:blank:]]+pam_loginuid\.so/session optional pam_loginuid.so/g' /etc/pam.d/sshd
# Unpack Hadoop 2.5.1 under /opt and link it as /opt/hadoop.
# The tree is owned by root; the logs directory is created separately and
# owned by hduser:hadoop.
RUN tar -xvzf /root/hadoop-2.5.1.tar.gz -C /opt \
 && ln -s /opt/hadoop-2.5.1 /opt/hadoop \
 && chown -R root:root /opt/hadoop-2.5.1 \
 && mkdir /opt/hadoop-2.5.1/logs \
 && chown -R hduser:hadoop /opt/hadoop-2.5.1/logs
# Unpack the Google protobuf 2.5.0 sources under /opt, owned by root.
RUN tar -xvzf /root/protobuf-2.5.0.tar.gz -C /opt \
 && chown -R root:root /opt/protobuf-2.5.0
# Configure, build, run the test suite and install from the unpacked tree.
RUN cd /opt/protobuf-2.5.0 \
 && ./configure \
 && make \
 && make check \
 && make install
# Unpack HBase 0.98.8 (hadoop2 build) under /opt, owned by root.
RUN tar -xvzf /root/hbase-0.98.8-hadoop2-bin.tar.gz -C /opt \
 && chown -R root:root /opt/hbase-0.98.8-hadoop2
# Link the install as /opt/hbase and create a logs directory owned by hduser:hadoop.
RUN ln -s /opt/hbase-0.98.8-hadoop2 /opt/hbase \
 && mkdir /opt/hbase/logs \
 && chown -R hduser:hadoop /opt/hbase/logs
# Move the checked-out Nutch sources to /opt/apache-nutch-2.x, owned by root,
# and create a logs directory owned by hduser:hadoop.
RUN mv /root/nutch-sources /opt/apache-nutch-2.x \
 && chown -R root:root /opt/apache-nutch-2.x \
 && mkdir /opt/apache-nutch-2.x/logs \
 && chown -R hduser:hadoop /opt/apache-nutch-2.x/logs
# Setup hduser environment: install the shell startup file from the build context.
# COPY is used because this is a plain local file.
COPY config/bashrc /home/hduser/.bashrc
# Add Hadoop, HBase and Nutch configs.
# The files are staged under /tmp and then moved into the installed trees.
# COPY is used because these are plain local files.
COPY config/core-site.xml /tmp/hadoop-etc/core-site.xml
COPY config/mapred-site.xml /tmp/hadoop-etc/mapred-site.xml
COPY config/hdfs-site.xml /tmp/hadoop-etc/hdfs-site.xml
COPY config/yarn-site.xml /tmp/hadoop-etc/yarn-site.xml
COPY config/hbase-site.xml /tmp/hbase-etc/hbase-site.xml
COPY config/nutch-site.xml /tmp/nutch-etc/nutch-site.xml
# Move the staged files into place in one layer.
RUN mv /tmp/hadoop-etc/* /opt/hadoop/etc/hadoop && \
    mv /tmp/hbase-etc/* /opt/hbase/conf/ && \
    mv /tmp/nutch-etc/* /opt/apache-nutch-2.x/conf/
# Root of the Nutch source tree; referenced as $NUTCH_ROOT by later build steps.
ENV NUTCH_ROOT /opt/apache-nutch-2.x
# NOTE(review): as written this appends an empty line to a path ending in
# "conf/", which is a directory — the echoed text and the target file name
# appear to be missing from this file; restore them before building.
RUN echo '' >> /opt/apache-nutch-2.x/conf/
# Edit ivy.xml with vim: for every line containing name="gora-hbase", delete
# the line after it (first invocation) and then the line before it (second
# invocation); 'x' writes the file and exits.
# NOTE(review): presumably this strips the comment markers around the
# gora-hbase dependency — verify against ivy.xml.
RUN vim -c 'g/name="gora-hbase"/+1d' -c 'x' "$NUTCH_ROOT/ivy/ivy.xml" && \
    vim -c 'g/name="gora-hbase"/-1d' -c 'x' "$NUTCH_ROOT/ivy/ivy.xml"
# Run the ant "runtime" target from the Nutch source tree.
RUN cd "$NUTCH_ROOT" && ant runtime
# Link the local runtime as /opt/nutch.
RUN ln -s /opt/apache-nutch-2.x/runtime/local /opt/nutch
# key=value is the current ENV syntax; the space-separated form is legacy.
ENV NUTCH_HOME=/opt/nutch \
    HADOOP_HOME=/opt/hadoop
# Expose ports: probably not all of them, and probably some old
# (pre Hadoop 2.0) ports as well. EXPOSE only records metadata; it does not
# publish anything by itself.
# SSHD (openssh-server is installed in this image; 22 is its default port)
EXPOSE 22
# QuorumPeerMain (Zookeeper)
EXPOSE 2181 39534
# NameNode (HDFS)
EXPOSE 8020 50070 9000
# DataNode (HDFS)
EXPOSE 50010 50020 50075
# SecondaryNameNode (HDFS)
EXPOSE 50090
# Trackers
EXPOSE 50030 50060
# Presumably the HBase master / region server ports. 60000 replaces 6000,
# which looked like a typo in the 60000/60010/60020/60030 series.
# NOTE(review): confirm against config/hbase-site.xml.
EXPOSE 60000 60010 60020 60030
# Thrift
EXPOSE 9090 9095
# Create start script
# NOTE(review): the script's file name appears to be missing from the three
# instructions below. As written, ADD copies the contents of the whole config/
# directory into /root/, chmod targets the /root/ directory itself, and CMD
# points at a directory rather than an executable — restore the script name
# before building.
ADD config/ /root/
RUN chmod +x /root/
CMD ["/root/"]