Merge pull request #33 from jordanly/jly/fix-npe-in-sla-aware-updates

Fix possible NullPointerException in InstanceActionHandler
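
When an update only removes instances, there is no desired state, so
instructions.getDesiredState() returns null and the previous code could throw a
NullPointerException while looking up the task's SLA policy during a
ROLLING_FORWARD update. The fix wraps the nullable desired state in an Optional
before mapping into the task's SlaPolicy. A minimal sketch of the pattern, not
the full handler (assumes java.util.Optional and Aurora's thrift wrappers; the
ISlaPolicy return type is inferred from the surrounding handler code):

    // Before: throws if the update has no desired state (instances only removed).
    //   instructions.getDesiredState().getTask().getSlaPolicy()

    // After: an absent desired state yields Optional.empty() instead of an NPE;
    // a null SlaPolicy on the task also maps to Optional.empty().
    Optional<ISlaPolicy> slaPolicy = Optional
        .ofNullable(instructions.getDesiredState())
        .map(desiredState -> desiredState.getTask().getSlaPolicy());

The end-to-end config also gains an instances binding (profile.instances) so the
new test_update_add_only_kill_only case can exercise updates that only add or
only kill instances.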
diff --git a/src/main/java/org/apache/aurora/scheduler/updater/InstanceActionHandler.java b/src/main/java/org/apache/aurora/scheduler/updater/InstanceActionHandler.java
index d9c7e0a..97906b5 100644
--- a/src/main/java/org/apache/aurora/scheduler/updater/InstanceActionHandler.java
+++ b/src/main/java/org/apache/aurora/scheduler/updater/InstanceActionHandler.java
@@ -207,7 +207,12 @@
         IJobUpdateInstructions instructions) throws UpdateStateException {
 
       if (status == ROLLING_FORWARD) {
-        return Optional.ofNullable(instructions.getDesiredState().getTask().getSlaPolicy());
+        // It is possible that an update only removes instances, in which case there is no
+        // desired state. Otherwise, get the associated task (which should never be null) and
+        // return an Optional of the task's SlaPolicy (or empty if the policy is null).
+        return Optional
+            .ofNullable(instructions.getDesiredState())
+            .map(desiredState -> desiredState.getTask().getSlaPolicy());
       } else if (status == ROLLING_BACK) {
         return getConfig(instance.getInstanceId(), instructions.getInitialState())
             .map(ITaskConfig::getSlaPolicy);
diff --git a/src/test/sh/org/apache/aurora/e2e/http/http_example.aurora b/src/test/sh/org/apache/aurora/e2e/http/http_example.aurora
index 8878ab9..bd4d4a1 100644
--- a/src/test/sh/org/apache/aurora/e2e/http/http_example.aurora
+++ b/src/test/sh/org/apache/aurora/e2e/http/http_example.aurora
@@ -18,6 +18,7 @@
   role=Default(String, getpass.getuser())
   cmd=Default(String, 'cp /vagrant/src/test/sh/org/apache/aurora/e2e/http_example.py .')
   gpu=Default(Integer, 0)
+  instances=Default(Integer, 1)
 
 
 ContainerProfile = DefaultProfile(
@@ -115,7 +116,7 @@
 
 job = Service(
   cluster = 'devcluster',
-  instances = 1,
+  instances = '{{profile.instances}}',
   update_config = update_config,
   health_check_config = health_check_config,
   task = test_task,
diff --git a/src/test/sh/org/apache/aurora/e2e/test_end_to_end.sh b/src/test/sh/org/apache/aurora/e2e/test_end_to_end.sh
index 24f4448..7c68679 100755
--- a/src/test/sh/org/apache/aurora/e2e/test_end_to_end.sh
+++ b/src/test/sh/org/apache/aurora/e2e/test_end_to_end.sh
@@ -213,7 +213,7 @@
   aurora job restart --batch-size=2 --watch-secs=10 $_jobkey
 }
 
-assert_update_state() {
+assert_active_update_state() {
   local _jobkey=$1 _expected_state=$2
 
   local _state=$(aurora update list $_jobkey --status active | tail -n +2 | awk '{print $3}')
@@ -223,6 +223,17 @@
   fi
 }
 
+assert_update_state_by_id() {
+  # Assert that a given update ID is in an expected state
+  local _jobkey=$1 _update_id=$2 _expected_state=$3
+
+  local _state=$(aurora update info $_jobkey $_update_id | grep 'Current status' | awk '{print $NF}')
+  if [[ $_state != $_expected_state ]]; then
+    echo "Update should have completed in $_expected_state state, but found $_state"
+    exit 1
+  fi
+}
+
 assert_task_status() {
   local _jobkey=$1 _id=$2 _expected_state=$3
 
@@ -294,30 +305,64 @@
   fi
 }
 
+test_update_add_only_kill_only() {
+  # Tests update functionality where we only add or kill instances
+  local _jobkey=$1 _config=$2 _cluster=$3
+  shift 3
+  local _extra_args="${@}"
+
+  # Create the initial update with 3 instances
+  aurora update start $_jobkey $_config $_extra_args --bind profile.instances=3
+  assert_active_update_state $_jobkey 'ROLLING_FORWARD'
+  local _update_id=$(aurora update list $_jobkey --status ROLLING_FORWARD \
+      | tail -n +2 | awk '{print $2}')
+  aurora update wait $_jobkey $_update_id
+  assert_update_state_by_id $_jobkey $_update_id 'ROLLED_FORWARD'
+  wait_until_task_counts $_jobkey 3 0
+
+  # Update and kill 2 instances only
+  aurora update start $_jobkey $_config $_extra_args --bind profile.instances=1
+  assert_active_update_state $_jobkey 'ROLLING_FORWARD'
+  local _update_id=$(aurora update list $_jobkey --status ROLLING_FORWARD \
+      | tail -n +2 | awk '{print $2}')
+  aurora update wait $_jobkey $_update_id
+  assert_update_state_by_id $_jobkey $_update_id 'ROLLED_FORWARD'
+  wait_until_task_counts $_jobkey 1 0
+
+  # Update and add 2 instances only
+  aurora update start $_jobkey $_config $_extra_args --bind profile.instances=3
+  assert_active_update_state $_jobkey 'ROLLING_FORWARD'
+  local _update_id=$(aurora update list $_jobkey --status ROLLING_FORWARD \
+      | tail -n +2 | awk '{print $2}')
+  aurora update wait $_jobkey $_update_id
+  assert_update_state_by_id $_jobkey $_update_id 'ROLLED_FORWARD'
+  wait_until_task_counts $_jobkey 3 0
+
+  # Clean up
+  aurora job killall $_jobkey
+}
+
 test_update() {
+  # Tests generic update functionality like pausing and resuming
   local _jobkey=$1 _config=$2 _cluster=$3
   shift 3
   local _extra_args="${@}"
 
   aurora update start $_jobkey $_config $_extra_args
-  assert_update_state $_jobkey 'ROLLING_FORWARD'
+  assert_active_update_state $_jobkey 'ROLLING_FORWARD'
   local _update_id=$(aurora update list $_jobkey --status ROLLING_FORWARD \
       | tail -n +2 | awk '{print $2}')
   aurora_admin scheduler_snapshot devcluster
   sudo systemctl restart aurora-scheduler
-  assert_update_state $_jobkey 'ROLLING_FORWARD'
+  assert_active_update_state $_jobkey 'ROLLING_FORWARD'
   aurora update pause $_jobkey --message='hello'
-  assert_update_state $_jobkey 'ROLL_FORWARD_PAUSED'
+  assert_active_update_state $_jobkey 'ROLL_FORWARD_PAUSED'
   aurora update resume $_jobkey
-  assert_update_state $_jobkey 'ROLLING_FORWARD'
+  assert_active_update_state $_jobkey 'ROLLING_FORWARD'
   aurora update wait $_jobkey $_update_id
 
   # Check that the update ended in ROLLED_FORWARD state.  Assumes the status is the last column.
-  local status=$(aurora update info $_jobkey $_update_id | grep 'Current status' | awk '{print $NF}')
-  if [[ $status != "ROLLED_FORWARD" ]]; then
-    echo "Update should have completed in ROLLED_FORWARD state"
-    exit 1
-  fi
+  assert_update_state_by_id $_jobkey $_update_id 'ROLLED_FORWARD'
 }
 
 test_update_fail() {
@@ -327,7 +372,7 @@
 
   # Make sure our updates work.
   aurora update start $_jobkey $_config $_extra_args
-  assert_update_state $_jobkey 'ROLLING_FORWARD'
+  assert_active_update_state $_jobkey 'ROLLING_FORWARD'
   local _update_id=$(aurora update list $_jobkey --status ROLLING_FORWARD \
       | tail -n +2 | awk '{print $2}')
   # Need to wait until the update finishes before we can start one that we want to fail.
@@ -339,12 +384,8 @@
       | tail -n +2 | awk '{print $2}')
   # || swallows the non-zero exit status so that `trap collect_result` doesn't get triggered.
   aurora update wait $_jobkey $_update_id || echo $?
-  # Making sure we rolled back.
-  local status=$(aurora update info $_jobkey $_update_id | grep 'Current status' | awk '{print $NF}')
-  if [[ $status != "ROLLED_BACK" ]]; then
-    echo "Update should have completed in ROLLED_BACK state due to failed healthcheck."
-    exit 1
-  fi
+  # Making sure we rolled back due to a failed health check
+  assert_update_state_by_id $_jobkey $_update_id 'ROLLED_BACK'
 }
 
 test_partition_awareness() {
@@ -665,6 +706,7 @@
   test_thermos_profile $_jobkey
   test_file_mount $_cluster $_role $_env $_job
   test_restart $_jobkey
+  test_update_add_only_kill_only $_jobkey $_base_config $_cluster $_bind_parameters
   test_update $_jobkey $_updated_config $_cluster $_bind_parameters
   test_update_fail $_jobkey $_base_config  $_cluster $_bad_healthcheck_config $_bind_parameters
   # Running test_update second time to change state to success.