# blob: 775867057d170e38d8b19c49ff41c0c6fd27efeb
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Base image: meabed/debian-jdk
# Build with: docker build -t apache/nutch:2.x .
#
# NOTE(review): the base image is untagged, so it resolves to :latest; pin a
# specific tag or digest to make builds reproducible.
FROM meabed/debian-jdk
# MAINTAINER is deprecated; the same contact is recorded as a label instead.
LABEL maintainer="Nutch Developers <dev@nutch.apache.org>"
USER root
# Build-time only: keeps apt-get from prompting during the build without
# leaving the setting in the environment of running containers.
ARG DEBIAN_FRONTEND=noninteractive
# Build tooling: ant and subversion.
# The apt package index is removed in the same layer so it is not stored in
# the image.
RUN apt-get update \
    && apt-get install -y --fix-missing ant subversion \
    && rm -rf /var/lib/apt/lists/*
# Download nutch: check out the 2.x branch under /opt/downloads and expose it
# at /opt/apache-nutch-2.x through a symlink.
# The former standalone `RUN cd /opt` step was dropped: the working directory
# does not persist between RUN instructions, so it had no effect. Absolute
# paths are used throughout instead.
RUN mkdir -p /opt/downloads \
    && svn co http://svn.apache.org/repos/asf/nutch/branches/2.x /opt/downloads/apache-nutch-2.x \
    && ln -s /opt/downloads/apache-nutch-2.x /opt/apache-nutch-2.x
# NUTCH_ROOT points at the source checkout; HOME is set explicitly to /root.
ENV NUTCH_ROOT=/opt/apache-nutch-2.x \
    HOME=/root
#Nutch-default
# RUN sed -i '/^ <name>http.agent.name<\/name>$/{$!{N;s/^ <name>http.agent.name<\/name>\n <value><\/value>$/ <name>http.agent.name<\/name>\n <value>Nutch 2.X Cassandra Docker<\/value>/;ty;P;D;:y}}' $NUTCH_ROOT/ivy/ivy.xml
# In ivy.xml, delete the line right after and then the line right before every
# line containing name="gora-cassandra".
# NOTE(review): presumably those neighbours are the <!-- and --> markers that
# keep the dependency disabled — confirm against ivy.xml.
# sed is used rather than an interactive editor so the step does not depend on
# vim being present in the base image.
#   1st program: on a match, print it, load the next line and delete that line.
#   2nd program: slide a two-line window over the file and skip printing the
#                first line whenever the second one matches.
RUN sed -i '/name="gora-cassandra"/{n;d;}' $NUTCH_ROOT/ivy/ivy.xml \
    && sed -i '$!N;/\n.*name="gora-cassandra"/!P;D' $NUTCH_ROOT/ivy/ivy.xml
# Run the ant "runtime" target from the checkout root.
RUN cd $NUTCH_ROOT && ant runtime
#native libs
# Remove every file directly under the checkout's lib/native directory.
# NOTE(review): the disabled steps below look like they were meant to install
# a replacement native build afterwards — confirm before re-enabling them.
RUN rm ${NUTCH_ROOT}/lib/native/*
#RUN mkdir -p $NUTCH_ROOT/lib/native/Linux-amd64-64
#RUN curl -Ls http://dl.bintray.com/meabed/hadoop-debian/hadoop-native-64-2.5.1.tar|tar -x -C $NUTCH_ROOT/lib/native/Linux-amd64-64/
#Modification and compilation again
# NOTE: in the two disabled sed steps below, the '#' in front of $NUTCH_ROOT
# would turn the file argument into a shell comment if they were re-enabled
# as written.
#ADD plugin/nutch2-index-html/src/plugin/ $NUTCH_ROOT/src/plugin/
#RUN sed -i '/dir="index-more" target="deploy".*/ s/.*/&\n <ant dir="index-html" target="deploy"\/>/' #$NUTCH_ROOT/src/plugin/build.xml
#RUN sed -i '/dir="index-more" target="clean".*/ s/.*/&\n <ant dir="index-html" target="clean"\/>/' #$NUTCH_ROOT/src/plugin/build.xml
#RUN cd $NUTCH_ROOT && ant runtime
# NUTCH_HOME (/opt/nutch) is a symlink to the checkout's runtime/local directory.
ENV NUTCH_HOME=/opt/nutch
# urls folder used when crawling, e.g.:
#   $NUTCH_HOME/bin/crawl urls crawlId(test01) elasticsearch_node_name(iData) iteration(1)
RUN ln -s /opt/apache-nutch-2.x/runtime/local $NUTCH_HOME \
    && mkdir -p $NUTCH_HOME/urls
# Test urls to use in crawling. COPY creates the destination directory itself,
# so no mkdir step is needed. The former `CMD mkdir -p` never ran at build
# time: CMD only sets the container's default command, and only the last CMD
# in the file takes effect.
COPY testUrls $NUTCH_HOME/testUrls
# Adding a rawcontent field, holding the html of the page, to the index schema for elasticsearch
#RUN sed -i '/field name="date" type.*/ s/.*/&\n\n <field name="rawcontent" type="text" sstored="true" indexed="true" multiValued="false"\/>\n/' $NUTCH_HOME/conf/schema.xml
# Replace the default nutch-site.xml with our configuration. COPY overwrites
# an existing destination file, so a separate `rm` layer is unnecessary (and
# it would fail the build if the default file were ever absent).
COPY config/nutch-site.xml $NUTCH_HOME/conf/nutch-site.xml
# Port that nutchserver will use
ENV NUTCHSERVER_PORT=8899
# Startup script: owned by root and accessible to root only (mode 700).
# Ownership and mode are set in a single layer.
COPY bootstrap.sh /etc/bootstrap.sh
RUN chown root:root /etc/bootstrap.sh \
    && chmod 700 /etc/bootstrap.sh
VOLUME ["/data"]
# Documents the server port; taken from NUTCHSERVER_PORT so the two cannot
# drift apart.
EXPOSE $NUTCHSERVER_PORT
# Default command: run the startup script with the -d flag.
CMD ["/etc/bootstrap.sh", "-d"]