#!/bin/bash
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
# An integration test for the client, using the vagrant environment as a testbed.
# Determine if we are already in the vagrant environment. If not, start it up and invoke the script
# from within the environment.
if [[ "$USER" != "vagrant" ]]; then
vagrant up
time vagrant ssh -c /vagrant/src/test/sh/org/apache/aurora/e2e/test_end_to_end.sh "$@"
exit $?
fi
set -u -e -x
set -o pipefail
readonly TEST_SLAVE_IP=192.168.33.7
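# curl wrapper that fails on HTTP errors and retries transient failures,
# e.g. _curl "$TEST_SLAVE_IP:8081/health".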
_curl() { curl --silent --fail --retry 4 --retry-delay 10 "$@" ; }
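# Best-effort cleanup: abort any in-flight updates, kill all of the role's
# jobs, reset the quota, reactivate the test agent, and restore a saved
# clusters.json if one exists.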
tear_down() {
set +x # Disable command echo, as it makes it harder to see which command failed.
local _jobs=$(aurora job list $TEST_CLUSTER/$TEST_ROLE | grep $TEST_ROLE)
for job in ${_jobs[@]}; do
aurora update abort $job >/dev/null 2>&1 || true
aurora job killall --no-batching $job >/dev/null 2>&1
done
aurora_admin set_quota $TEST_CLUSTER $TEST_ROLE 0 0m 0m
aurora_admin host_activate --hosts=$TEST_SLAVE_IP $TEST_CLUSTER
sudo mv /etc/aurora/clusters.json.old /etc/aurora/clusters.json >/dev/null 2>&1 || true
}
collect_result() {
if [[ $RETCODE = 0 ]]
then
echo "OK (all tests passed)"
else
echo "!!! FAIL (something returned non-zero) for $BASH_COMMAND"
fi
# Attempt to clean up any state we left behind.
tear_down
exit $RETCODE
}
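# Succeeds only if the given URL responds with HTTP 200.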
check_url_live() {
[[ $(curl -sL -w '%{http_code}' $1 -o /dev/null) == 200 ]]
}
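# Polls for up to 10 seconds, waiting for the given file to disappear.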
test_file_removed() {
local _file=$1
local _success=0
for i in {1..10}; do
if [[ ! -e $_file ]]; then
_success=1
break
fi
sleep 1
done
if [[ $_success -ne 1 ]]; then
echo "File was not removed."
exit 1
fi
}
test_version() {
# The version number is written to stderr, making it necessary to redirect the output.
[[ $(aurora --version 2>&1) = $(cat /vagrant/.auroraversion) ]]
}
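# POSTing an empty JSON object to the Mesos /maintenance/schedule endpoint
# replaces any existing maintenance schedule with an empty one.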
clear_mesos_maintenance() {
curl http://"$TEST_SLAVE_IP":5050/maintenance/schedule \
-H "Content-type: application/json" \
-X POST \
-d "{}"
}
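# Exercises Mesos maintenance: schedules a drain of the test agent, verifies
# the running task is drained to PENDING, then clears the schedule and waits
# for the task to re-launch. Takes the BASE_ARGS layout (cluster role env
# base_config updated_config bad_healthcheck_config job), so $5 and $6 are
# unused here.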
test_mesos_maintenance() {
local _cluster=$1 _role=$2 _env=$3
local _base_config=$4
local _job=$7
local _jobkey="$_cluster/$_role/$_env/$_job"
# Clear any previous maintenance schedules before running this test.
clear_mesos_maintenance
test_create $_jobkey $_base_config
echo "Waiting job to enter RUNNING..."
wait_until_task_status $_jobkey "0" "RUNNING"
# Create the maintenance schedule
MAINTENANCE_SCHEDULE="/tmp/maintenance_schedule.json"
python \
/vagrant/src/test/sh/org/apache/aurora/e2e/generate_mesos_maintenance_schedule.py > "$MAINTENANCE_SCHEDULE"
echo "Creating maintenance with schedule"
jq . "$MAINTENANCE_SCHEDULE"
curl http://"$TEST_SLAVE_IP":5050/maintenance/schedule \
-H "Content-type: application/json" \
-X POST \
-d @"$MAINTENANCE_SCHEDULE"
trap clear_mesos_maintenance EXIT
# Posting of a maintenance schedule should not cause the task to drain right
# away.
assert_task_status $_jobkey "0" "RUNNING"
# When the scheduled drain window arrives, the task should be killed.
echo "Waiting for the scheduled drain to kill the task..."
wait_until_task_status $_jobkey "0" "PENDING"
clear_mesos_maintenance
echo "Waiting for drained task to re-launch..."
wait_until_task_status $_jobkey "0" "RUNNING"
test_kill $_jobkey
}
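# The scheduler's /health endpoint should respond with a plain 'OK'.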
test_health_check() {
[[ $(_curl "$TEST_SLAVE_IP:8081/health") == 'OK' ]]
}
test_config() {
local _config=$1 _jobkey=$2
joblist=$(aurora config list $_config | tr -dc '[[:print:]]')
[[ "$joblist" = *"$_jobkey"* ]]
}
test_inspect() {
local _jobkey=$1 _config=$2
shift 2
local _extra_args="${@}"
aurora job inspect $_jobkey $_config $_extra_args
}
test_create() {
local _jobkey=$1 _config=$2
shift 2
local _extra_args="${@}"
aurora job create $_jobkey $_config $_extra_args
}
test_job_status() {
local _cluster=$1 _role=$2 _env=$3 _job=$4
local _jobkey="$_cluster/$_role/$_env/$_job"
echo "== Checking job status"
aurora job list $_cluster/$_role/$_env | grep "$_jobkey"
aurora job status $_jobkey
}
test_scheduler_ui() {
local _role=$1 _env=$2 _job=$3
# Check that the scheduler UI pages are served.
base_url="$TEST_SLAVE_IP:8081"
check_url_live "$base_url/leaderhealth"
check_url_live "$base_url/scheduler"
check_url_live "$base_url/scheduler/$_role"
check_url_live "$base_url/scheduler/$_role/$_env/$_job"
}
test_observer_ui() {
local _cluster=$1 _role=$2 _job=$3
# Check the observer page
observer_url="$TEST_SLAVE_IP:1338"
check_url_live "$observer_url"
# Poll the observer, waiting for it to receive and show information about the task.
local _success=0
for i in $(seq 1 120); do
task_id=$(aurora_admin query -l '%taskId%' --shards=0 --states=RUNNING $_cluster $_role $_job)
if check_url_live "$observer_url/task/$task_id"; then
_success=1
break
else
sleep 1
fi
done
if [[ "$_success" -ne "1" ]]; then
echo "Observer task detail page is not available."
exit 1
fi
}
test_restart() {
local _jobkey=$1
aurora job restart --batch-size=2 --watch-secs=10 $_jobkey
}
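# Asserts on the third column of `aurora update list --status active` output,
# skipping the header line.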
assert_active_update_state() {
local _jobkey=$1 _expected_state=$2
local _state=$(aurora update list $_jobkey --status active | tail -n +2 | awk '{print $3}')
if [[ $_state != $_expected_state ]]; then
echo "Expected update to be in state $_expected_state, but found $_state"
exit 1
fi
}
assert_update_state_by_id() {
# Assert that a given update ID is in an expected state
local _jobkey=$1 _update_id=$2 _expected_state=$3
local _state=$(aurora update info $_jobkey $_update_id | grep 'Current status' | awk '{print $NF}')
if [[ $_state != $_expected_state ]]; then
echo "Update should have completed in $_expected_state state, but found $_state"
exit 1
fi
}
assert_task_status() {
local _jobkey=$1 _id=$2 _expected_state=$3
local _state=$(aurora job status $_jobkey --write-json | jq -r ".[0].active[$_id].status")
if [[ $_state != $_expected_state ]]; then
echo "Expected task to be in state $_expected_state, but found $_state"
exit 1
fi
}
wait_until_task_status() {
# Poll the task, waiting for it to enter the target state
local _jobkey=$1 _id=$2 _expected_state=$3
local _state=""
local _success=0
for i in $(seq 1 120); do
_state=$(aurora job status $_jobkey --write-json | jq -r ".[0].active[$_id].status")
if [[ $_state == $_expected_state ]]; then
_success=1
break
else
sleep 20
fi
done
if [[ "$_success" -ne "1" ]]; then
echo "Task did not transition to $_expected_state within timeout."
exit 1
fi
}
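# Parses the host's maintenance state from `aurora_admin host_status` output;
# assumes the state is the sixth whitespace-delimited field of the last line.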
assert_host_status() {
local _host=$1 _cluster=$2 _expected_state=$3
local _state=$(aurora_admin host_status --hosts=$_host $_cluster 2>&1 | tail -n1 | awk -F' ' '{print $6}')
if [[ $_state != $_expected_state ]]; then
echo "Expected host $_host to be in state $_expected_state, but found $_state"
exit 1
fi
}
wait_until_task_counts() {
# Poll the job, waiting for it to reach the target RUNNING and PENDING task counts
local _jobkey=$1 _expected_running=$2 _expected_pending=$3
local _num_running=0
local _num_pending=0
local _success=0
for i in $(seq 1 120); do
# The || guard keeps a non-zero grep exit (no matches) from tripping set -e and firing the collect_result EXIT trap.
_num_running=$(aurora job status $_jobkey --write-json | jq -r ".[0].active[].status" | grep "RUNNING" | wc -l) || echo $?
_num_pending=$(aurora job status $_jobkey --write-json | jq -r ".[0].active[].status" | grep "PENDING" | wc -l) || echo $?
if [[ $_num_running == $_expected_running ]] && [[ $_num_pending == $_expected_pending ]]; then
_success=1
break
else
echo "Waiting for job $_jobkey to have $_expected_running RUNNING and $_expected_pending PENDING tasks."
sleep 20
fi
done
if [[ "$_success" -ne "1" ]]; then
echo "Job $_jobkey did not have $_expected_running RUNNING tasks and $_expected_pending PENDING tasks within timeout."
exit 1
fi
}
test_update_add_only_kill_only() {
# Tests update functionality where we only add or kill instances
local _jobkey=$1 _config=$2 _cluster=$3
shift 3
local _extra_args="${@}"
# Create the initial update with 3 instances
aurora update start $_jobkey $_config $_extra_args --bind profile.instances=3
assert_active_update_state $_jobkey 'ROLLING_FORWARD'
local _update_id=$(aurora update list $_jobkey --status ROLLING_FORWARD \
| tail -n +2 | awk '{print $2}')
aurora update wait $_jobkey $_update_id
assert_update_state_by_id $_jobkey $_update_id 'ROLLED_FORWARD'
wait_until_task_counts $_jobkey 3 0
# Update and kill 2 instances only
aurora update start $_jobkey $_config $_extra_args --bind profile.instances=1
assert_active_update_state $_jobkey 'ROLLING_FORWARD'
local _update_id=$(aurora update list $_jobkey --status ROLLING_FORWARD \
| tail -n +2 | awk '{print $2}')
aurora update wait $_jobkey $_update_id
assert_update_state_by_id $_jobkey $_update_id 'ROLLED_FORWARD'
wait_until_task_counts $_jobkey 1 0
# Update and add 2 instances only
aurora update start $_jobkey $_config $_extra_args --bind profile.instances=3
assert_active_update_state $_jobkey 'ROLLING_FORWARD'
local _update_id=$(aurora update list $_jobkey --status ROLLING_FORWARD \
| tail -n +2 | awk '{print $2}')
aurora update wait $_jobkey $_update_id
assert_update_state_by_id $_jobkey $_update_id 'ROLLED_FORWARD'
wait_until_task_counts $_jobkey 3 0
# Clean up
aurora job killall $_jobkey
}
test_update() {
# Tests generic update functionality like pausing and resuming
local _jobkey=$1 _config=$2 _cluster=$3
shift 3
local _extra_args="${@}"
aurora update start $_jobkey $_config $_extra_args
assert_active_update_state $_jobkey 'ROLLING_FORWARD'
local _update_id=$(aurora update list $_jobkey --status ROLLING_FORWARD \
| tail -n +2 | awk '{print $2}')
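# Snapshot the storage and restart the scheduler to check that the in-flight update survives.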
aurora_admin scheduler_snapshot $_cluster
sudo systemctl restart aurora-scheduler
assert_active_update_state $_jobkey 'ROLLING_FORWARD'
aurora update pause $_jobkey --message='hello'
assert_active_update_state $_jobkey 'ROLL_FORWARD_PAUSED'
aurora update resume $_jobkey
assert_active_update_state $_jobkey 'ROLLING_FORWARD'
aurora update wait $_jobkey $_update_id
# Check that the update ended in ROLLED_FORWARD state. Assumes the status is the last column.
assert_update_state_by_id $_jobkey $_update_id 'ROLLED_FORWARD'
}
test_update_fail() {
local _jobkey=$1 _config=$2 _cluster=$3 _bad_healthcheck_config=$4
shift 4
local _extra_args="${@}"
# Make sure our update works.
aurora update start $_jobkey $_config $_extra_args
assert_active_update_state $_jobkey 'ROLLING_FORWARD'
local _update_id=$(aurora update list $_jobkey --status ROLLING_FORWARD \
| tail -n +2 | awk '{print $2}')
# Need to wait until the update finishes before we can start one that we want to fail.
aurora update wait $_jobkey $_update_id
# Start an update with a health check that is meant to fail. The expected behavior is a rollback.
aurora update start $_jobkey $_bad_healthcheck_config $_extra_args
local _update_id=$(aurora update list $_jobkey --status active \
| tail -n +2 | awk '{print $2}')
# The || guard keeps the expected non-zero exit of the failed update from tripping set -e and firing the collect_result EXIT trap.
aurora update wait $_jobkey $_update_id || echo $?
# Make sure we rolled back due to the failed health check.
assert_update_state_by_id $_jobkey $_update_id 'ROLLED_BACK'
}
test_partition_awareness() {
local _config=$1 _cluster=$2 _default_jobkey=$3 _disabled_jobkey=$4 _delay_jobkey=$5
# create three jobs with different partition policies
aurora update start --wait $_default_jobkey $_config
aurora update start --wait $_disabled_jobkey $_config
aurora update start --wait $_delay_jobkey $_config
# partition the agent
sudo systemctl stop mesos-slave
# the default job should become LOST and then transition to PENDING
wait_until_task_status $_default_jobkey "0" "PENDING"
# the other two should be PARTITIONED
assert_task_status $_disabled_jobkey "0" "PARTITIONED"
assert_task_status $_delay_jobkey "0" "PARTITIONED"
# start the agent back up
sudo systemctl start mesos-slave
# This can be removed when https://issues.apache.org/jira/browse/MESOS-6406 is resolved.
# We have to pause and let the agent reregister with Mesos, then ask Aurora to explicitly
# reconcile to get the RUNNING status update.
sleep 30
aurora_admin reconcile_tasks $_cluster
# the PARTITIONED tasks should now be running
assert_task_status $_disabled_jobkey "0" "RUNNING"
assert_task_status $_delay_jobkey "0" "RUNNING"
# Clean up
aurora job killall $_default_jobkey
aurora job killall $_disabled_jobkey
aurora job killall $_delay_jobkey
}
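# Drives one sla-aware maintenance cycle for a single two-instance job: drain
# with the default timeout (only the sla-allowed task drains), survive a
# scheduler restart, force-drain the remainder with a zero timeout, then
# reactivate the host.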
run_sla_aware_maintenance() {
local _config=$1
local _cluster=$2
local _jobkey=$3
aurora job create $_jobkey $_config --wait-until RUNNING
# assert the task counts: the job should have 2 RUNNING tasks
wait_until_task_counts $_jobkey 2 0
# check that the host starts with no maintenance mode
assert_host_status $TEST_SLAVE_IP $_cluster "NONE"
# trigger an sla-aware drain with the default timeout of 2 hours,
# so only the sla-allowed number of tasks (1 per job) should drain
aurora_admin sla_host_drain --hosts=$TEST_SLAVE_IP $_cluster
# force a scheduler restart and make sure that the maintenance request is still satisfied
sudo systemctl restart aurora-scheduler
# host must have maintenance mode set
assert_host_status $TEST_SLAVE_IP $_cluster "DRAINING"
# tasks get drained as allowed by the sla policy
wait_until_task_counts $_jobkey 1 1
# for coordinator sla check specific task states
if [[ $_jobkey == $TEST_JOB_COORDINATOR_SLA ]]; then
assert_task_status $_jobkey "0" PENDING
assert_task_status $_jobkey "1" RUNNING
fi
# host must have maintenance mode set and should be waiting in DRAINING
assert_host_status $TEST_SLAVE_IP $_cluster "DRAINING"
# force sla aware drain with zero timeout
aurora_admin sla_host_drain --force_drain_timeout=0s --hosts=$TEST_SLAVE_IP $_cluster
# tasks get drained as allowed by the sla policy
wait_until_task_counts $_jobkey 0 2
# activate host again
aurora_admin host_activate --hosts=$TEST_SLAVE_IP $_cluster
# assert the task counts: the job should again have 2 RUNNING tasks
wait_until_task_counts $_jobkey 2 0
# clean up
aurora job killall $_jobkey
}
test_sla_aware_maintenance() {
local _config=$1
local _cluster=$2
local _role=$3
local _count_jobkey=$4
local _percentage_jobkey=$5
local _coordinator_jobkey=$6
# add quota for each job (plus headroom for executor overhead), since only preferred jobs get an sla policy
aurora_admin increase_quota $_cluster $_role 1.0 10m 50m
run_sla_aware_maintenance $_config $_cluster $_count_jobkey
run_sla_aware_maintenance $_config $_cluster $_percentage_jobkey
run_sla_aware_maintenance $_config $_cluster $_coordinator_jobkey
}
test_announce() {
local _role=$1 _env=$2 _job=$3
# default python return code
local retcode=0
# launch aurora client in interpreter mode to get access to the kazoo client
env SERVERSET="/aurora/$_role/$_env/$_job" PEX_INTERPRETER=1 \
aurora /vagrant/src/test/sh/org/apache/aurora/e2e/validate_serverset.py || retcode=$?
if [[ $retcode = 1 ]]; then
echo "Validated announced job."
return 0
elif [[ $retcode = 2 ]]; then
echo "Job failed to announce in serverset."
elif [[ $retcode = 3 ]]; then
echo "Job failed to re-announce when expired."
else
echo "Unknown failure in test script."
fi
exit 1
}
setup_ssh() {
# Create an SSH public key so that local SSH works without a password.
local _ssh_key=~/.ssh/id_rsa
rm -f ${_ssh_key}*
ssh-keygen -t rsa -N "" -f $_ssh_key
# Ensure a new line for the new key to start on.
# See: https://issues.apache.org/jira/browse/AURORA-1728
echo >> ~/.ssh/authorized_keys
cat ${_ssh_key}.pub >> ~/.ssh/authorized_keys
}
test_run() {
local _jobkey=$1
# Using the sandbox contents as a proxy for functioning SSH. List sandbox contents, looking for
# the .logs directory. We expect to find 3 instances.
sandbox_contents=$(aurora task run $_jobkey 'ls -a' | awk '{print $2}' | grep ".logs" | sort | uniq -c)
echo "$sandbox_contents"
[[ "$sandbox_contents" = " 3 .logs" ]]
}
test_scp_success() {
local _jobkey=$1/0
local _filename=scp_success.txt
local _expected_return=" 1 scp_success.txt"
# Unset because grep can return 1 if the file does not exist
set +e
# Ensure the file does not exist before the scp
pre_sandbox_contents=$(aurora task run $_jobkey "ls" | awk '{print $2}' | grep ${_filename} | sort | uniq -c)
[[ "$pre_sandbox_contents" != $_expected_return ]]
# Reset -e after command has been run
set -e
# Create a file and move it to the sandbox of a job
touch $_filename
aurora task scp $_filename ${_jobkey}:
sandbox_contents=$(aurora task run $_jobkey "ls" | awk '{print $2}' | grep ${_filename} | sort | uniq -c)
[[ "$sandbox_contents" == $_expected_return ]]
}
test_scp_permissions() {
local _jobkey=$1/0
local _filename=scp_fail_permission.txt
local _retcode=0
local _sandbox_contents
# Create a file and try to move it, ensure we get permission denied
touch $_filename
# Unset because we are expecting an error
set +e
# $_filename is a path relative to "/var/lib/mesos/slaves/x/frameworks/y/executors/z/runs/latest/sandbox".
# We shouldn't have write permissions outside of the "/latest" executor scratch dir created by Mesos.
_sandbox_contents=$(aurora task scp $_filename ${_jobkey}:../../ 2>&1 > /dev/null)
_retcode=$?
# Reset -e after command has been run
set -e
if [[ "$_retcode" != 1 ]]; then
echo "Permission to exit chroot jail given when should have failed"
exit 1
fi
if [[ "$_sandbox_contents" != *"../scp_fail_permission.txt: Permission denied"* ]]; then
echo "Unexpected response from invalid scp command"
exit 1
fi
}
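# Kills instance 1 explicitly, then kills the remaining instances via killall.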
test_kill() {
local _jobkey=$1
shift 1
local _extra_args="${@}"
aurora job kill $_jobkey/1 $_extra_args
aurora job killall $_jobkey $_extra_args
}
test_quota() {
local _cluster=$1 _role=$2
aurora quota get $_cluster/$_role
}
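# Verifies the Mesos DiscoveryInfo published for the task by querying the
# master's /state endpoint: the discovery name must match and at least one
# port must be exposed.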
test_discovery_info() {
local _task_id_prefix=$1
local _discovery_name=$2
if ! [[ -x "$(command -v jq)" ]]; then
echo "jq is not installed, skipping discovery info test"
return 0
fi
framework_info=$(curl --silent '192.168.33.7:5050/state' | jq '.frameworks | map(select(.name == "Aurora"))')
if [[ -z $framework_info ]]; then
echo "Cannot get framework info for $framework"
exit 1
fi
task_info=$(echo $framework_info | jq --arg task_id_prefix "${_task_id_prefix}" '.[0]["tasks"] | map(select(.id | contains($task_id_prefix)))')
if [[ -z $task_info ]]; then
echo "Cannot get task blob json for task id prefix ${_task_id_prefix}"
exit 1
fi
discovery_info=$(echo $task_info | jq '.[0]["discovery"]')
if [[ -z $discovery_info ]]; then
echo "Cannot get discovery info json from task blob ${task_blob}"
exit 1
fi
name=$(echo $discovery_info | jq '.["name"]')
if [[ "$name" -ne "\"$_discovery_name\"" ]]; then
echo "discovery info name $name does not equal to expected \"$_discovery_name\""
exit 1
fi
num_ports=$(echo $discovery_info | jq '.["ports"]["ports"] | length')
if ! [[ "$num_ports" -gt 0 ]]; then
echo "num of ports in discovery info is $num_ports which is not greater than zero"
exit 1
fi
}
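# The job's .thermos_profile should have been sourced; the read_env process is
# expected to have echoed 'hello' as the last line of its stdout.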
test_thermos_profile() {
local _jobkey=$1
read_env_output=$(aurora task ssh $_jobkey/0 --command='tail -1 .logs/read_env/0/stdout' | tr -d '\r\n' 2>/dev/null)
echo "$read_env_output"
[[ "$read_env_output" = "hello" ]]
}
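# Exercises the offline recovery tool: take a backup, wipe and re-initialize
# the replicated log, restore from the newest backup, and verify that a
# pre-existing job update survived recovery.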
BACKUPS_DIR='/var/lib/aurora/backups'
REPLICATED_LOG_DIR='/var/db/aurora'
test_recovery_tool() {
local _cluster=$1
# As a cursory data validation step, fetch an arbitrary job update to ensure it exists after
# recovery completes.
update=$(aurora update list $_cluster --write-json | jq -r '.[0] | .job + " " + .id')
# Take a backup
aurora_admin scheduler_backup_now $_cluster
sudo systemctl stop aurora-scheduler
# Reset storage
sudo rm -r $REPLICATED_LOG_DIR
sudo mesos-log initialize --path=$REPLICATED_LOG_DIR
# Identify the newest backup file
backup=$(basename $(ls -dtr1 $BACKUPS_DIR/* | tail -n1))
# Recover
sudo /home/vagrant/aurora/dist/install/aurora-scheduler/bin/recovery-tool \
-from BACKUP \
-to LOG \
-backup $BACKUPS_DIR/$backup \
-native_log_zk_group_path=/aurora/replicated-log \
-native_log_file_path=$REPLICATED_LOG_DIR \
-zk_endpoints=localhost:2181
sudo systemctl start aurora-scheduler
# This command exits non-zero if the update is not found.
aurora update info $update
}
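# The full client test suite for a single job. The argument layout is
# BASE_ARGS plus the job name and optional --bind parameters, e.g.:
#   test_http_example devcluster vagrant test $TEST_CONFIG_FILE \
#     $TEST_CONFIG_UPDATED_FILE $TEST_BAD_HEALTHCHECK_CONFIG_UPDATED_FILE http_example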
test_http_example() {
local _cluster=$1 _role=$2 _env=$3
local _base_config=$4 _updated_config=$5
local _bad_healthcheck_config=$6
local _job=$7
local _bind_parameters=${8:-""}
local _jobkey="$_cluster/$_role/$_env/$_job"
local _task_id_prefix="${_role}-${_env}-${_job}-0"
local _discovery_name="${_job}.${_env}.${_role}"
test_config $_base_config $_jobkey
test_inspect $_jobkey $_base_config $_bind_parameters
test_create $_jobkey $_base_config $_bind_parameters
test_job_status $_cluster $_role $_env $_job
test_scheduler_ui $_role $_env $_job
test_observer_ui $_cluster $_role $_job
test_discovery_info $_task_id_prefix $_discovery_name
test_thermos_profile $_jobkey
test_file_mount $_cluster $_role $_env $_job
test_restart $_jobkey
test_update_add_only_kill_only $_jobkey $_base_config $_cluster $_bind_parameters
test_update $_jobkey $_updated_config $_cluster $_bind_parameters
test_update_fail $_jobkey $_base_config $_cluster $_bad_healthcheck_config $_bind_parameters
# Running test_update second time to change state to success.
test_update $_jobkey $_updated_config $_cluster $_bind_parameters
test_announce $_role $_env $_job
test_run $_jobkey
# TODO(AURORA-1926): 'aurora task scp' only works fully on Mesos containers (can only read for
# Docker). See if it is possible to enable write for Docker sandboxes as well then remove the
# 'if' guard below.
if [[ $_job != *"docker"* ]]; then
test_scp_success $_jobkey
test_scp_permissions $_jobkey
fi
test_kill $_jobkey
test_quota $_cluster $_role
}
test_http_example_basic() {
local _cluster=$1 _role=$2 _env=$3
local _base_config=$4
local _job=$7
local _jobkey="$_cluster/$_role/$_env/$_job"
test_create $_jobkey $_base_config
test_observer_ui $_cluster $_role $_job
test_kill $_jobkey
}
test_admin() {
local _cluster=$1
echo '== Testing admin commands'
echo '== Getting leading scheduler'
aurora_admin get_scheduler $_cluster | grep ":8081"
# host maintenance commands currently have a separate entry point and use their own api client.
# Until we address that, at least verify that the command group still works.
aurora_admin host_status --hosts=$TEST_SLAVE_IP $_cluster
}
test_ephemeral_daemon_with_final() {
local _cluster=$1 _role=$2 _env=$3 _job=$4 _config=$5
local _jobkey="$_cluster/$_role/$_env/$_job"
local _stop_file=$(mktemp)
local _extra_args="--bind stop_file=$_stop_file"
rm $_stop_file
test_create $_jobkey $_config $_extra_args
test_observer_ui $_cluster $_role $_job
test_job_status $_cluster $_role $_env $_job
touch $_stop_file # Stops 'main_process'.
test_file_removed $_stop_file # Removed by 'final_process'.
}
test_daemonizing_process() {
local _cluster=$1 _role=$2 _env=$3 _job=$4 _config=$5
local _jobkey="$_cluster/$_role/$_env/$_job"
local _term_file=$(mktemp)
local _extra_args="--bind term_file=$_term_file"
test_create $_jobkey $_config $_extra_args
test_observer_ui $_cluster $_role $_job
test_job_status $_cluster $_role $_env $_job
test_kill $_jobkey
test_file_removed $_term_file
}
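# Restores the ~/.netrc moved aside by test_basic_auth_unauthenticated.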
restore_netrc() {
mv ~/.netrc.bak ~/.netrc >/dev/null 2>&1 || true
}
test_basic_auth_unauthenticated() {
local _cluster=$1 _role=$2 _env=$3
local _config=$4
local _job=$7
local _jobkey="$_cluster/$_role/$_env/$_job"
mv ~/.netrc ~/.netrc.bak
trap restore_netrc EXIT
local retcode=0
aurora job create $_jobkey $_config || retcode=$?
if [[ $retcode != 30 ]]; then
echo "Expected auth error exit code, got $retcode"
exit 1
fi
restore_netrc
}
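# Builds the netcat test image and installs it into the local Mesos image
# stores: as a tarball for the Docker store and, via docker2aci, as an
# extracted ACI for the Appc store.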
setup_image_stores() {
TEMP_PATH=$(mktemp -d)
pushd "$TEMP_PATH"
# build the docker image and save it as a tarball.
sudo docker build -t http_example_netcat -f "${TEST_ROOT}/Dockerfile.netcat" ${TEST_ROOT}
docker save -o http_example_netcat-latest.tar http_example_netcat
DOCKER_IMAGE_DIRECTORY="/tmp/mesos/images/docker"
sudo mkdir -p "$DOCKER_IMAGE_DIRECTORY"
sudo cp http_example_netcat-latest.tar "$DOCKER_IMAGE_DIRECTORY/http_example_netcat:latest.tar"
# build the appc image from the docker image
docker2aci http_example_netcat-latest.tar
APPC_IMAGE_ID="sha512-$(sha512sum library-http_example_netcat-latest.aci | awk '{print $1}')"
export APPC_IMAGE_ID
APPC_IMAGE_DIRECTORY="/tmp/mesos/images/appc/images/$APPC_IMAGE_ID"
sudo mkdir -p "$APPC_IMAGE_DIRECTORY"
sudo tar -xf library-http_example_netcat-latest.aci -C "$APPC_IMAGE_DIRECTORY"
# This restart is necessary for mesos to pick up the image from the local store.
sudo systemctl restart mesos-slave
popd
rm -rf "$TEMP_PATH"
}
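# Builds the http_example image, pushes it to the local registry at
# aurora.local:5000, and rewrites clusters.json to point at that registry
# (the original is saved and later restored by tear_down).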
setup_docker_registry() {
# build the test docker image
sudo docker build -t http_example -f "${TEST_ROOT}/Dockerfile.python" ${TEST_ROOT}
docker tag http_example:latest aurora.local:5000/http_example:latest
docker login -p testpassword -u testuser http://aurora.local:5000
docker push aurora.local:5000/http_example:latest
sudo mv /etc/aurora/clusters.json /etc/aurora/clusters.json.old
sudo sh -c "cat /etc/aurora/clusters.json.old | jq 'map(. + {docker_registry:\"http://aurora.local:5000\"})' > /etc/aurora/clusters.json"
}
test_appc_unified() {
num_mounts_before=$(mount |wc -l |tr -d '\n')
TEST_JOB_APPC_ARGS=("${BASE_ARGS[@]}" "$TEST_JOB_UNIFIED_APPC" "--bind appc_image_id=$APPC_IMAGE_ID")
test_http_example "${TEST_JOB_APPC_ARGS[@]}"
num_mounts_after=$(mount |wc -l |tr -d '\n')
# We want to be sure that running the isolated task did not leak any mounts.
[[ "$num_mounts_before" = "$num_mounts_after" ]]
}
test_file_mount() {
local _cluster=$1 _role=$2 _env=$3 _job=$4
if [[ "$_job" = "$TEST_JOB_UNIFIED_DOCKER" ]]; then
local _jobkey="$_cluster/$_role/$_env/$_job"
verify_file_mount_output=$(aurora task ssh $_jobkey/0 --command='tail -1 .logs/verify_file_mount/0/stdout' |tr -d '\r\n' 2>/dev/null)
echo "$verify_file_mount_output"
[[ "$verify_file_mount_output" = "$(cat /vagrant/.auroraversion |tr -d '\r\n')" ]]
return $?
fi
return 0
}
test_docker_unified() {
num_mounts_before=$(mount |wc -l |tr -d '\n')
TEST_JOB_DOCKER_ARGS=("${BASE_ARGS[@]}" "$TEST_JOB_UNIFIED_DOCKER")
test_http_example "${TEST_JOB_DOCKER_ARGS[@]}"
num_mounts_after=$(mount |wc -l |tr -d '\n')
# We want to be sure that running the isolated task did not leak any mounts.
[[ "$num_mounts_before" = "$num_mounts_after" ]]
}
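# Main test sequence. RETCODE starts at 1 so that any premature exit is
# reported as a failure by the collect_result EXIT trap.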
RETCODE=1
# Set up shorthands for test
export TEST_ROOT=/vagrant/src/test/sh/org/apache/aurora/e2e
export EXAMPLE_DIR=${TEST_ROOT}/http
export DOCKER_DIR=${TEST_ROOT}/docker
TEST_CLUSTER=devcluster
TEST_ROLE=vagrant
TEST_ENV=test
TEST_JOB=http_example
TEST_MAINTENANCE_JOB=http_example_maintenance
TEST_JOB_WATCH_SECS=http_example_watch_secs
TEST_JOB_VAR_BATCH_UPDATE=http_example_var_batch_update
TEST_JOB_REVOCABLE=http_example_revocable
TEST_JOB_GPU=http_example_gpu
TEST_JOB_DOCKER=http_example_docker
TEST_JOB_UNIFIED_APPC=http_example_unified_appc
TEST_JOB_UNIFIED_DOCKER=http_example_unified_docker
TEST_CONFIG_FILE=$EXAMPLE_DIR/http_example.aurora
TEST_CONFIG_UPDATED_FILE=$EXAMPLE_DIR/http_example_updated.aurora
TEST_BAD_HEALTHCHECK_CONFIG_UPDATED_FILE=$EXAMPLE_DIR/http_example_bad_healthcheck.aurora
TEST_EPHEMERAL_DAEMON_WITH_FINAL_JOB=ephemeral_daemon_with_final
TEST_EPHEMERAL_DAEMON_WITH_FINAL_CONFIG_FILE=$TEST_ROOT/ephemeral_daemon_with_final.aurora
TEST_DAEMONIZING_PROCESS_JOB=daemonize
TEST_DAEMONIZING_PROCESS_CONFIG_FILE=$TEST_ROOT/test_daemonizing_process.aurora
TEST_PARTITION_AWARENESS_CONFIG_FILE=$TEST_ROOT/partition_aware.aurora
TEST_JOB_PA_DEFAULT=$TEST_CLUSTER/$TEST_ROLE/$TEST_ENV/partition_aware_default
TEST_JOB_PA_DISABLED=$TEST_CLUSTER/$TEST_ROLE/$TEST_ENV/partition_aware_disabled
TEST_JOB_PA_DELAY=$TEST_CLUSTER/$TEST_ROLE/$TEST_ENV/partition_aware_delay
TEST_SLA_POLICY_CONFIG_FILE=$TEST_ROOT/sla_policy.aurora
TEST_JOB_COUNT_SLA=$TEST_CLUSTER/$TEST_ROLE/$TEST_ENV/count
TEST_JOB_PERCENTAGE_SLA=$TEST_CLUSTER/$TEST_ROLE/$TEST_ENV/percentage
TEST_JOB_COORDINATOR_SLA=$TEST_CLUSTER/$TEST_ROLE/$TEST_ENV/coordinator
BASE_ARGS=(
$TEST_CLUSTER
$TEST_ROLE
$TEST_ENV
$TEST_CONFIG_FILE
$TEST_CONFIG_UPDATED_FILE
$TEST_BAD_HEALTHCHECK_CONFIG_UPDATED_FILE
)
TEST_JOB_ARGS=("${BASE_ARGS[@]}" "$TEST_JOB")
TEST_MAINTENANCE_JOB_ARGS=("${BASE_ARGS[@]}" "$TEST_MAINTENANCE_JOB")
TEST_JOB_WATCH_SECS_ARGS=("${BASE_ARGS[@]}" "$TEST_JOB_WATCH_SECS")
TEST_JOB_VAR_BATCH_UPDATE_ARGS=("${BASE_ARGS[@]}" "$TEST_JOB_VAR_BATCH_UPDATE")
TEST_JOB_REVOCABLE_ARGS=("${BASE_ARGS[@]}" "$TEST_JOB_REVOCABLE")
TEST_JOB_GPU_ARGS=("${BASE_ARGS[@]}" "$TEST_JOB_GPU")
TEST_JOB_DOCKER_ARGS=("${BASE_ARGS[@]}" "$TEST_JOB_DOCKER")
TEST_ADMIN_ARGS=($TEST_CLUSTER)
TEST_JOB_EPHEMERAL_DAEMON_WITH_FINAL_ARGS=(
$TEST_CLUSTER
$TEST_ROLE
$TEST_ENV
$TEST_EPHEMERAL_DAEMON_WITH_FINAL_JOB
$TEST_EPHEMERAL_DAEMON_WITH_FINAL_CONFIG_FILE
)
TEST_DAEMONIZING_PROCESS_ARGS=(
$TEST_CLUSTER
$TEST_ROLE
$TEST_ENV
$TEST_DAEMONIZING_PROCESS_JOB
$TEST_DAEMONIZING_PROCESS_CONFIG_FILE
)
TEST_PARTITION_AWARENESS_ARGS=(
$TEST_PARTITION_AWARENESS_CONFIG_FILE
$TEST_CLUSTER
$TEST_JOB_PA_DEFAULT
$TEST_JOB_PA_DISABLED
$TEST_JOB_PA_DELAY
)
TEST_SLA_AWARE_MAINTENANCE_ARGS=(
$TEST_SLA_POLICY_CONFIG_FILE
$TEST_CLUSTER
$TEST_ROLE
$TEST_JOB_COUNT_SLA
$TEST_JOB_PERCENTAGE_SLA
$TEST_JOB_COORDINATOR_SLA
)
TEST_JOB_KILL_MESSAGE_ARGS=("${TEST_JOB_ARGS[@]}" "--message='Test message'")
trap collect_result EXIT
aurorabuild all
setup_ssh
setup_docker_registry
test_sla_aware_maintenance "${TEST_SLA_AWARE_MAINTENANCE_ARGS[@]}"
test_partition_awareness "${TEST_PARTITION_AWARENESS_ARGS[@]}"
test_version
test_http_example "${TEST_JOB_ARGS[@]}"
test_http_example "${TEST_JOB_WATCH_SECS_ARGS[@]}"
# TODO(rdelvalle): Add verification that each batch has the right number of active instances.
test_http_example "${TEST_JOB_VAR_BATCH_UPDATE_ARGS[@]}"
test_health_check
test_mesos_maintenance "${TEST_MAINTENANCE_JOB_ARGS[@]}"
test_http_example_basic "${TEST_JOB_REVOCABLE_ARGS[@]}"
test_http_example_basic "${TEST_JOB_GPU_ARGS[@]}"
test_http_example_basic "${TEST_JOB_KILL_MESSAGE_ARGS[@]}"
test_http_example "${TEST_JOB_DOCKER_ARGS[@]}"
setup_image_stores
test_appc_unified
test_docker_unified
test_admin "${TEST_ADMIN_ARGS[@]}"
test_basic_auth_unauthenticated "${TEST_JOB_ARGS[@]}"
test_ephemeral_daemon_with_final "${TEST_JOB_EPHEMERAL_DAEMON_WITH_FINAL_ARGS[@]}"
test_daemonizing_process "${TEST_DAEMONIZING_PROCESS_ARGS[@]}"
test_recovery_tool $TEST_CLUSTER
/vagrant/src/test/sh/org/apache/aurora/e2e/test_kerberos_end_to_end.sh
/vagrant/src/test/sh/org/apache/aurora/e2e/test_bypass_leader_redirect_end_to_end.sh
RETCODE=0