Merge pull request #3168 from dandsager1/STORM-3539

STORM-3539 Add metric for worker start timeout
diff --git a/docs/ClusterMetrics.md b/docs/ClusterMetrics.md
index 4e4d0f1..f7f7b4f 100644
--- a/docs/ClusterMetrics.md
+++ b/docs/ClusterMetrics.md
@@ -185,6 +185,7 @@
 | supervisor:num-launched | meter | number of times the supervisor is launched. |
 | supervisor:num-shell-exceptions | meter | number of exceptions calling shell commands. |
 | supervisor:num-slots-used-gauge | gauge | number of slots used on the supervisor. |
+| supervisor:num-worker-start-timed-out | meter | number of times a worker start timed out. |
 | supervisor:num-worker-transitions-into-empty | meter | number of transitions into empty state. |
 | supervisor:num-worker-transitions-into-kill | meter | number of transitions into kill state. |
 | supervisor:num-worker-transitions-into-kill-and-relaunch | meter | number of transitions into kill-and-relaunch state |
diff --git a/storm-server/src/main/java/org/apache/storm/daemon/supervisor/Slot.java b/storm-server/src/main/java/org/apache/storm/daemon/supervisor/Slot.java
index 7575a91..df419b9 100644
--- a/storm-server/src/main/java/org/apache/storm/daemon/supervisor/Slot.java
+++ b/storm-server/src/main/java/org/apache/storm/daemon/supervisor/Slot.java
@@ -684,6 +684,7 @@
         long timeDiffms = (Time.currentTimeMillis() - dynamicState.startTime);
         long hbFirstTimeoutMs = getFirstHbTimeoutMs(staticState, dynamicState);
         if (timeDiffms > hbFirstTimeoutMs) {
+            staticState.slotMetrics.numWorkerStartTimedOut.mark();
             LOG.warn("SLOT {}: Container {} failed to launch in {} ms.", staticState.port, dynamicState.container,
                     hbFirstTimeoutMs);
             return killContainerFor(KillReason.HB_TIMEOUT, dynamicState, staticState);
diff --git a/storm-server/src/main/java/org/apache/storm/daemon/supervisor/SlotMetrics.java b/storm-server/src/main/java/org/apache/storm/daemon/supervisor/SlotMetrics.java
index f8e13fd..8b2f5f1 100644
--- a/storm-server/src/main/java/org/apache/storm/daemon/supervisor/SlotMetrics.java
+++ b/storm-server/src/main/java/org/apache/storm/daemon/supervisor/SlotMetrics.java
@@ -26,6 +26,7 @@
 class SlotMetrics {
 
     final Meter numWorkersLaunched;
+    final Meter numWorkerStartTimedOut;
     final Map<Slot.KillReason, Meter> numWorkersKilledFor;
     final Timer workerLaunchDuration;
     final Map<Slot.MachineState, Meter> transitionIntoState;
@@ -34,6 +35,7 @@
 
     SlotMetrics(StormMetricsRegistry metricsRegistry) {
         numWorkersLaunched = metricsRegistry.registerMeter("supervisor:num-workers-launched");
+        numWorkerStartTimedOut = metricsRegistry.registerMeter("supervisor:num-worker-start-timed-out");
         numWorkersKilledFor = Collections.unmodifiableMap(EnumUtil.toEnumMap(Slot.KillReason.class,
             killReason -> metricsRegistry.registerMeter("supervisor:num-workers-killed-" + killReason.toString())));
         workerLaunchDuration = metricsRegistry.registerTimer("supervisor:worker-launch-duration");