Periodically try to complete the lifecycle hook if we are Pending (#31) Thanks to now using the AMI, the instance can come up a lot quicker than previously, meaning it is possible the supervisor might start up when the instance state is still "Pending", which results in the complete_lifecycle_action call failing. Rather than blocking the actions.runner service from starting (and not running jobs) we continue, and check later if we are still pending, and try to complete the lifecycle action again.

commit: 18344ea79f874a9f503b70d2e07a7bf210e9c8bb [log] [tgz]
author: Ash Berlin-Taylor <ash_github@firemirror.com> Tue May 18 12:18:57 2021 +0100
committer: GitHub <noreply@github.com> Tue May 18 12:18:57 2021 +0100
tree: c10d4c6a4df9633d0e8317587dbfbfcab69faa26
parent: c5c38ee6c8fd68708016c92dfadb1f9ce49fbf91 [diff]
diff --git a/github-runner-ami/packer/files/runner-supervisor.py b/github-runner-ami/packer/files/runner-supervisor.py
index d3d0e0c..6f1fa30 100755
--- a/github-runner-ami/packer/files/runner-supervisor.py
+++ b/github-runner-ami/packer/files/runner-supervisor.py

@@ -162,7 +162,9 @@
                 write_credentials_to_files(repo, index, output_folder, user)
                 merge_in_settings(repo, output_folder)
                 notify(f"STATUS=Obtained lock on {index}")
-                complete_asg_lifecycle_hook()
+
+                if get_lifecycle_state() == "Pending:Wait":
+                    complete_asg_lifecycle_hook()
 
                 notify("READY=1")
                 log.info("Watching for Runner.Worker processes")
@@ -324,33 +326,40 @@
 INSTANCE_ID = None
 
 
-def complete_asg_lifecycle_hook(hook_name='WaitForInstanceReportReady'):
-    global OWN_ASG, INSTANCE_ID
-    # Notify the ASG LifeCycle hook that we are now In Service and ready to
-    # process requests/safe to be shut down
+def get_lifecycle_state() -> str:
+    global INSTANCE_ID, OWN_ASG
 
-    # Fetch current instance ID from where cloutinit writes it to
     if not INSTANCE_ID:
         with open('/var/lib/cloud/data/instance-id') as fh:
             INSTANCE_ID = fh.readline().strip()
 
-    # Get the ASG name we are attached to by looking at our own tags
+    asg_client = boto3.client('autoscaling')
+
+    try:
+        instances = asg_client.describe_auto_scaling_instances(
+            InstanceIds=[INSTANCE_ID],
+        )['AutoScalingInstances']
+    except asg_client.exceptions.ClientError:
+        return "UNKNOWN"
+
+    if len(instances) != 1:
+        return "UNKNOWN"
+
+    details = instances[0]
+
     if not OWN_ASG:
-        ec2 = boto3.client('ec2')
-        tags = ec2.describe_tags(
-            Filters=[
-                {'Name': 'key', 'Values': ['aws:autoscaling:groupName']},
-                {'Name': 'resource-id', 'Values': [INSTANCE_ID]},
-            ]
-        )
+        OWN_ASG = details['AutoScalingGroupName']
 
-        if len(tags['Tags']) == 0:
-            # Not part of an ASG.
-            return
+    return details['LifecycleState']
 
-        OWN_ASG = tags['Tags'][0]['Value']
+
+def complete_asg_lifecycle_hook(hook_name='WaitForInstanceReportReady', retry=False):
+    global OWN_ASG, INSTANCE_ID
+    # Notify the ASG LifeCycle hook that we are now InService and ready to
+    # process requests/safe to be shut down
 
     asg_client = boto3.client('autoscaling')
+
     try:
         asg_client.complete_lifecycle_action(
             AutoScalingGroupName=OWN_ASG,
@@ -364,7 +373,7 @@
         # completed, so this would fail. That is not an error
 
         # We don't want the stacktrace here, just the message
-        log.warning("Failed to complete lifecycle hook: %s", str(e))
+        log.warning("Failed to complete lifecycle hook %s: %s", hook_name, str(e))
         pass
 
 
@@ -564,9 +573,12 @@
     def check_still_alive(self):
         # Check ASG status
         if not self.in_termating_lifecycle:
-            if self.check_for_asg_termiate_requested():
+            state = get_lifecycle_state()
+            if state == 'Terminating:Wait':
                 self.in_termating_lifecycle = True
                 self.gracefully_terminate_runner()
+            elif state == 'Pending:Wait':
+                complete_asg_lifecycle_hook()
 
         # proc_connector is un-reliable (UDP) so periodically check if the processes are still alive
         if not self.interesting_processes:
@@ -593,27 +605,6 @@
     def gracefully_terminate_runner(self):
         check_call(['systemctl', 'stop', 'actions.runner', '--no-block'])
 
-    def check_for_asg_termiate_requested(self):
-        """Check if the ASG stat of this instance is in the "Terminating:Wait" lifecycle state"""
-        if not OWN_ASG:
-            # Not part of an ASG
-            return False
-
-        asg_client = boto3.client('autoscaling')
-        try:
-            instances = asg_client.describe_auto_scaling_instances(
-                InstanceIds=[INSTANCE_ID],
-            )['AutoScalingInstances']
-
-            if len(instances) != 1:
-                return False
-
-            state = instances[0]['LifecycleState']
-
-            return state == 'Terminating:Wait'
-        except asg_client.exceptions.ClientError:
-            return False
-
     def protect_from_scale_in(self, protect: bool = True):
         """ Set (or unset) ProtectedFromScaleIn on our instance"""
         if not OWN_ASG:
commit	18344ea79f874a9f503b70d2e07a7bf210e9c8bb	[log] [tgz]
author	Ash Berlin-Taylor <ash_github@firemirror.com>	Tue May 18 12:18:57 2021 +0100
committer	GitHub <noreply@github.com>	Tue May 18 12:18:57 2021 +0100
tree	c10d4c6a4df9633d0e8317587dbfbfcab69faa26
parent	c5c38ee6c8fd68708016c92dfadb1f9ce49fbf91 [diff]