| #!/usr/bin/env bash |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| |
# Strict mode: abort on use of an unset variable and on any unhandled
# command failure.
set -o nounset
set -o errexit # exit script if any command exits with nonzero value

# Script identity and invocation context.
# Each value is assigned first and marked readonly afterwards, so that a
# failing command substitution is not hidden behind the (always successful)
# exit status of `readonly`. "$0" is quoted so that a script path containing
# whitespace is handled correctly.
PROG_NAME=$(basename -- "$0")
readonly PROG_NAME

PROG_DIR=$(realpath -- "$0")
PROG_DIR=$(dirname -- "$PROG_DIR")
readonly PROG_DIR

INVOKE_DIR=$(pwd)
readonly INVOKE_DIR

# All arguments joined into a single space-separated string. This is an
# informational snapshot only: it cannot be split back into the original words.
readonly ARGS="$*"
| |
# Overridable defaults; each one can be changed by a command-line option.
AWS=false
PARALLEL=true
MAX_PARALLEL=5
DEBUG=false

# One-line synopsis. --debug is listed inside the --aws group because the
# DEBUG setting is only ever handed to the aws bring-up path.
readonly USAGE="Usage: $PROG_NAME [-h | --help] [--aws [--no-parallel] [--max-parallel MAX] [--debug]]"

# Detailed help text. The here-document is expanded when the script starts, so
# the "default:" value shown is the built-in default of MAX_PARALLEL, not a
# value supplied later on the command line.
readonly HELP="$(cat <<EOF
Tool to bring up a vagrant cluster on local machine or aws.

-h | --help Show this help message
--aws Use if you are running in aws
--no-parallel Bring up machines not in parallel. Only applicable on aws
--max-parallel MAX Maximum number of machines to bring up in parallel. Note: only applicable on test worker machines on aws. default: $MAX_PARALLEL
--debug Enable debug information for vagrant
Approximately speaking, this wrapper script essentially wraps 2 commands:
vagrant up
vagrant hostmanager

The situation on aws is complicated by the fact that aws imposes a maximum request rate,
which effectively caps the number of machines we are able to bring up in parallel. Therefore, on aws,
this wrapper script attempts to bring up machines in small batches.

If you are seeing rate limit exceeded errors, you may need to use a reduced --max-parallel setting.

EOF
)"
| |
# Print the usage synopsis followed by the detailed help text, one after the
# other on stdout, then terminate the script with a success status.
function help {
  printf '%s\n' "$USAGE" "$HELP"
  exit 0
}
| |
# Parse the command-line options and record them in the AWS, PARALLEL,
# MAX_PARALLEL and DEBUG globals.
#
# Arguments: the script's command-line words
# Exits:     0 after printing help for -h/--help
#            1 (message on stderr) for an unknown option, or when
#              --max-parallel is not followed by a positive integer
function parse_args {
  local key

  # Numeric comparison: inside [[ ]] the ">" operator compares strings.
  while (( $# > 0 )); do
    key="$1"
    case "$key" in
      -h | --help)
        help
        ;;
      --aws)
        AWS=true
        ;;
      --no-parallel)
        PARALLEL=false
        ;;
      --max-parallel)
        # The value is later used as a batch size, so reject a missing value,
        # zero and anything non-numeric here with a clear message instead of
        # failing later in an obscure way.
        if (( $# < 2 )) || [[ ! "$2" =~ ^[1-9][0-9]*$ ]]; then
          echo "--max-parallel requires a positive integer argument" >&2
          exit 1
        fi
        MAX_PARALLEL="$2"
        shift # past the value; the shift below moves past the option itself
        ;;
      --debug)
        DEBUG=true
        ;;
      *)
        # unknown option
        echo "Unknown option $1" >&2
        exit 1
        ;;
    esac
    shift # past argument or value
  done
}

parse_args "$@"
| |
# Get a list of vagrant machines (in any state).
#
# Reads the output of `vagrant status` and prints the machine names as a
# single space-separated line on stdout.
#
# Parsing rules:
#   - lines before the first empty line are ignored
#   - the first empty line switches to reading mode
#   - in reading mode the first space-delimited field of every line is taken
#     as a machine name
#   - the next empty line, or the end of the output, ends the listing
#
# Nothing is printed if the output never contains an empty line.
function read_vagrant_machines {
  local line
  local in_listing=false
  local machines=""

  # `read` without a custom IFS also trims surrounding whitespace, so a line
  # made only of blanks counts as empty. The `|| [[ -n ... ]]` part keeps a
  # final line that is not terminated by a newline.
  while read -r line || [[ -n "$line" ]]; do
    if [[ -z "$line" ]]; then
      if [[ "$in_listing" == "true" ]]; then
        # second empty line: all done
        break
      fi
      in_listing=true
      continue
    fi

    if [[ "$in_listing" == "true" ]]; then
      # ${line%% *} keeps everything before the first space (the machine
      # name); ${machines:+...} adds a separator only after the first name.
      machines="${machines:+$machines }${line%% *}"
    fi
  done < <(vagrant status)

  if [[ "$in_listing" == "true" ]]; then
    echo "$machines"
  fi
}
| |
# Filter "list", returning the items that contain "pattern" as a substring.
#
# Arguments:
#   $1 - whitespace-separated list of items
#   $2 - text to look for; it is matched literally, not as a regular
#        expression, so characters such as "." or a leading "-" carry no
#        special meaning
# Outputs:
#   The matching items on one line, each one preceded by a single space (so a
#   non-empty result starts with a space). An empty line if nothing matches.
function filter {
  local list="$1"
  local pattern="$2"

  local item
  local result=""
  # The list is deliberately left unquoted so that it is split into items.
  for item in $list; do
    # Quoting the pattern inside the glob forces a literal substring match.
    if [[ "$item" == *"$pattern"* ]]; then
      result="$result $item"
    fi
  done
  echo "$result"
}
| |
# Given a list of machine names, return only test worker machines.
# The result is printed as one line of names separated by single spaces.
function worker {
  local all_machines="$1"
  local trimmed

  # xargs with no command re-emits its input words joined by single spaces,
  # which removes the leading/trailing whitespace left by filter
  trimmed=$(filter "$all_machines" "worker" | xargs)
  echo "$trimmed"
}
| |
# Given a list of machine names, return only zookeeper and broker machines.
# Zookeeper machines come first, then brokers; the result is printed as one
# line of names separated by single spaces.
function zk_broker {
  local all_machines="$1"
  local trimmed

  # Both filter results are fed to one xargs, which re-emits the words joined
  # by single spaces and so trims leading/trailing whitespace
  trimmed=$(
    {
      filter "$all_machines" "zk"
      filter "$all_machines" "broker"
    } | xargs
  )
  echo "$trimmed"
}
| |
# Run a vagrant command on batches of machines of size $group_size.
# This is annoying but necessary on aws to avoid errors due to AWS request rate
# throttling.
#
# Arguments:
#   $1 - command to run, as one string that is split on whitespace
#        (e.g. "vagrant up --provider=aws")
#   $2 - whitespace-separated list of machine names
#   $3 - maximum number of machines per invocation; must be a positive integer
# Returns:
#   1 (message on stderr, nothing run) if $3 is not a positive integer;
#   otherwise the status of the last command executed.
#
# Example
#   $ vagrant_batch_command "vagrant up" "m1 m2 m3 m4 m5" "2"
#
# This is equivalent to running "vagrant up" on groups of machines of size 2 or less, i.e.:
#   $ vagrant up m1 m2
#   $ vagrant up m3 m4
#   $ vagrant up m5
function vagrant_batch_command {
  local vagrant_cmd="$1"
  local machines="$2"
  local group_size="$3"

  # With a size of 0 or a non-number no group would ever be considered full,
  # and every machine would end up in one single invocation - exactly what
  # batching is meant to prevent. Refuse such input up front.
  if [[ ! "$group_size" =~ ^[1-9][0-9]*$ ]]; then
    echo "vagrant_batch_command: group size must be a positive integer, got '$group_size'" >&2
    return 1
  fi

  # NOTE(review): an earlier comment here described a --provision flag, but
  # this function adds no flags of its own; any such flag has to be part of
  # the command string supplied by the caller.

  # Intentionally unquoted: the command string is split into its words.
  local -a cmd=($vagrant_cmd)
  local -a batch=()
  local machine

  # Intentionally unquoted: the machine list is split into names.
  for machine in $machines; do
    batch+=("$machine")

    if (( ${#batch[@]} >= group_size )); then
      # We've reached a full group; run the command on this part of the cluster
      "${cmd[@]}" "${batch[@]}"
      batch=()
    fi
  done

  # Take care of any leftover partially complete group
  if (( ${#batch[@]} > 0 )); then
    "${cmd[@]}" "${batch[@]}"
  fi
}
| |
# Bring up the cluster on the local machine.
#
# vagrant-hostmanager is assumed to be installed, but it may or may not be
# disabled during vagrant up. Running the three steps separately ensures that
# hostmanager runs after the machines are up and before provisioning; this
# ordering is needed, for example, to bring up a multi-node zookeeper cluster.
function bring_up_local {
  local step
  for step in "up --no-provision" "hostmanager" "provision"; do
    # Intentionally unquoted: each step is split into subcommand and flags
    vagrant $step
  done
}
| |
# Bring up the cluster on aws.
#
# Arguments:
#   $1 - "true" to bring up test worker machines in parallel batches; any
#        other value brings everything up serially
#   $2 - maximum number of worker machines per parallel batch
#   $3 - "true" to pass --debug to `vagrant up`; any other value, or no value,
#        leaves debugging off
#
# Temporary directories created here are removed even when a vagrant command
# fails and errexit aborts the script.
function bring_up_aws {
  local parallel="$1"
  local max_parallel="$2"
  local debug=""
  if [[ "${3:-false}" == "true" ]]; then
    debug="--debug"
  fi

  local machines zk_broker_machines worker_machines
  machines="$(read_vagrant_machines)"
  zk_broker_machines=$(zk_broker "$machines")
  worker_machines=$(worker "$machines")

  local rsync_tmp cleanup_cmd worker_machine

  # NOTE: the machine lists and $debug are deliberately left unquoted below so
  # that they split into separate arguments (or vanish when empty).
  if [[ "$parallel" == "true" ]]; then
    if [[ -n "$zk_broker_machines" ]]; then
      # We still have to bring up zookeeper/broker nodes serially
      echo "Bringing up zookeeper/broker machines serially"
      vagrant up --provider=aws --no-parallel --no-provision $zk_broker_machines $debug
      vagrant hostmanager --provider=aws
      vagrant provision
    fi

    if [[ -n "$worker_machines" ]]; then
      echo "Bringing up test worker machines in parallel"
      # Try to isolate this job in its own /tmp space. See note
      # below about vagrant issue
      rsync_tmp=$(mktemp -d)
      # The directory name is expanded (shell-quoted) now, so the trap does
      # not depend on variable scope at the time it fires.
      cleanup_cmd="rm -rf -- $(printf '%q' "$rsync_tmp")"
      (
        # Subshell: its EXIT trap removes the directory on success and on
        # failure, and the TMPDIR export stays confined to this job.
        trap "$cleanup_cmd" EXIT
        export TMPDIR="$rsync_tmp"
        vagrant_batch_command "vagrant up $debug --provider=aws" "$worker_machines" "$max_parallel"
      )
      vagrant hostmanager --provider=aws
    fi
  else
    vagrant up --provider=aws --no-parallel --no-provision $debug
    vagrant hostmanager --provider=aws
    vagrant provision
  fi

  # Currently it seems that the AWS provider will always run rsync
  # as part of vagrant up. However,
  # https://github.com/mitchellh/vagrant/issues/7531 means it is not
  # safe to do so. Since the bug doesn't seem to cause any direct
  # errors, just missing data on some nodes, follow up with serial
  # rsyncing to ensure we're in a clean state. Use custom TMPDIR
  # values to ensure we're isolated from any other instances of this
  # script that are running/ran recently and may cause different
  # instances to sync to the wrong nodes
  for worker_machine in $worker_machines; do
    rsync_tmp=$(mktemp -d)
    cleanup_cmd="rm -rf -- $(printf '%q' "$rsync_tmp")"
    (
      trap "$cleanup_cmd" EXIT
      TMPDIR="$rsync_tmp" vagrant rsync "$worker_machine"
    )
  done
}
| |
# Entry point: choose the bring-up routine from the AWS flag. The aws path
# receives the PARALLEL, MAX_PARALLEL and DEBUG settings; every other value of
# AWS selects the local path.
function main {
  case "$AWS" in
    true)
      bring_up_aws "$PARALLEL" "$MAX_PARALLEL" "$DEBUG"
      ;;
    *)
      bring_up_local
      ;;
  esac
}

main