blob: 49ce37cf02c3cbc87af0e5f356a66741f64f1a1a [file] [log] [blame]
#!/usr/bin/env bash
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################
# Script that is run on each EC2 instance on boot. It is passed in the EC2 user
# data, so should not exceed 16K in size.
################################################################################
################################################################################
# Initialize variables
################################################################################
# Slaves are started after the master, and are told its address by sending a
# modified copy of this file which sets the MASTER_HOST variable.
# A node knows if it is the master or not by inspecting the security group
# name. If it is the master then it retrieves its address using instance data.
MASTER_HOST=%MASTER_HOST% # Interpolated before being sent to EC2 node
SECURITY_GROUPS=`wget -q -O - http://169.254.169.254/latest/meta-data/security-groups`
IS_MASTER=`echo $SECURITY_GROUPS | awk '{ a = match ($0, "-master$"); if (a) print "true"; else print "false"; }'`
if [ "$IS_MASTER" == "true" ]; then
# use public hostnames for master. private hostnames can be used by substituting:
# MASTER_HOST=`wget -q -O - http://169.254.169.254/latest/meta-data/local-hostname`
MASTER_HOST=`wget -q -O - 'http://169.254.169.254/latest/meta-data/public-hostname'`
fi
HADOOP_HOME=`ls -d /usr/local/hadoop-*`
################################################################################
# Hadoop configuration
# Modify this section to customize your Hadoop cluster.
################################################################################
cat > $HADOOP_HOME/conf/hadoop-site.xml <<EOF
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>hadoop.tmp.dir</name>
<value>/mnt/hadoop</value>
</property>
<property>
<name>fs.default.name</name>
<value>hdfs://$MASTER_HOST:50001</value>
</property>
<property>
<name>mapred.job.tracker</name>
<value>hdfs://$MASTER_HOST:50002</value>
</property>
<property>
<name>tasktracker.http.threads</name>
<value>80</value>
</property>
<property>
<name>mapred.tasktracker.map.tasks.maximum</name>
<value>3</value>
</property>
<property>
<name>mapred.tasktracker.reduce.tasks.maximum</name>
<value>3</value>
</property>
<property>
<name>mapred.output.compress</name>
<value>true</value>
</property>
<property>
<name>mapred.output.compression.type</name>
<value>BLOCK</value>
</property>
<property>
<name>dfs.client.block.write.retries</name>
<value>3</value>
</property>
<property>
<name>hadoop.rpc.socket.factory.class.default</name>
<value>org.apache.hadoop.net.StandardSocketFactory</value>
<final>true</final>
</property>
</configuration>
EOF
# Configure Hadoop for Ganglia
# overwrite hadoop-metrics.properties
cat > $HADOOP_HOME/conf/hadoop-metrics.properties <<EOF
# Ganglia
# we push to the master gmond so hostnames show up properly
dfs.class=org.apache.hadoop.metrics.ganglia.GangliaContext
dfs.period=10
dfs.servers=$MASTER_HOST:8649
mapred.class=org.apache.hadoop.metrics.ganglia.GangliaContext
mapred.period=10
mapred.servers=$MASTER_HOST:8649
jvm.class=org.apache.hadoop.metrics.ganglia.GangliaContext
jvm.period=10
jvm.servers=$MASTER_HOST:8649
EOF
################################################################################
# Start services
################################################################################
[ ! -f /etc/hosts ] && echo "127.0.0.1 localhost" > /etc/hosts
mkdir -p /mnt/hadoop/logs
# not set on boot
export USER="root"
if [ "$IS_MASTER" == "true" ]; then
# MASTER
# Prep Ganglia
sed -i -e "s|\( *mcast_join *=.*\)|#\1|" \
-e "s|\( *bind *=.*\)|#\1|" \
-e "s|\( *mute *=.*\)| mute = yes|" \
-e "s|\( *location *=.*\)| location = \"master-node\"|" \
/etc/gmond.conf
mkdir -p /mnt/ganglia/rrds
chown -R ganglia:ganglia /mnt/ganglia/rrds
rm -rf /var/lib/ganglia; cd /var/lib; ln -s /mnt/ganglia ganglia; cd
service gmond start
service gmetad start
apachectl start
# Hadoop
# only format on first boot
[ ! -e /mnt/hadoop/dfs ] && "$HADOOP_HOME"/bin/hadoop namenode -format
"$HADOOP_HOME"/bin/hadoop-daemon.sh start namenode
"$HADOOP_HOME"/bin/hadoop-daemon.sh start jobtracker
else
# SLAVE
# Prep Ganglia
sed -i -e "s|\( *mcast_join *=.*\)|#\1|" \
-e "s|\( *bind *=.*\)|#\1|" \
-e "s|\(udp_send_channel {\)|\1\n host=$MASTER_HOST|" \
/etc/gmond.conf
service gmond start
# Hadoop
"$HADOOP_HOME"/bin/hadoop-daemon.sh start datanode
"$HADOOP_HOME"/bin/hadoop-daemon.sh start tasktracker
fi
# Run this script on next boot
rm -f /var/ec2/ec2-run-user-data.*