SLING-5965 : Metrics and a Health-Check for Scheduler to detect long-running Quartz-Jobs

git-svn-id: https://svn.apache.org/repos/asf/sling/trunk@1805720 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/pom.xml b/pom.xml
index a4457b5..858a4e3 100644
--- a/pom.xml
+++ b/pom.xml
@@ -84,12 +84,14 @@
                             org.apache.sling.discovery;resolution:=optional,
                             org.apache.sling.settings;resolution:=optional,
                             org.apache.sling.hc.api;resolution:=optional,
+                            org.apache.sling.hc.util;resolution:=optional,
                             *                        
                         </Import-Package>
                         <DynamicImport-Package>
                             org.apache.sling.discovery;version="[1.0,2)",
                             org.apache.sling.settings;version="[1.0,2)",
-                            org.apache.sling.hc.api
+                            org.apache.sling.hc.api,
+                            org.apache.sling.hc.util
                         </DynamicImport-Package>
                         <Embed-Dependency>
                             quartz
@@ -180,19 +182,16 @@
             <scope>test</scope>
         </dependency>
         <dependency>
-        	<groupId>org.apache.sling</groupId>
-        	<artifactId>org.apache.sling.commons.metrics</artifactId>
-        	<version>1.2.0</version>
+          	<groupId>org.apache.sling</groupId>
+          	<artifactId>org.apache.sling.hc.core</artifactId>
+         	<version>1.0.6</version>
+         	<scope>provided</scope>
         </dependency>
         <dependency>
-        	<groupId>org.apache.sling</groupId>
-        	<artifactId>org.apache.sling.hc.core</artifactId>
-        	<version>1.0.6</version>
-        </dependency>
-        <dependency>
-        	<groupId>org.apache.sling</groupId>
-        	<artifactId>org.apache.sling.commons.osgi</artifactId>
-        	<version>2.1.0</version>
+            <groupId>io.dropwizard.metrics</groupId>
+            <artifactId>metrics-core</artifactId>
+            <version>3.2.3</version>
+            <scope>provided</scope>
         </dependency>
     </dependencies>
 </project>
diff --git a/src/main/java/org/apache/sling/commons/scheduler/impl/GaugesSupport.java b/src/main/java/org/apache/sling/commons/scheduler/impl/GaugesSupport.java
index 12c1868..823fd6a 100644
--- a/src/main/java/org/apache/sling/commons/scheduler/impl/GaugesSupport.java
+++ b/src/main/java/org/apache/sling/commons/scheduler/impl/GaugesSupport.java
@@ -17,16 +17,13 @@
 package org.apache.sling.commons.scheduler.impl;
 
 import java.util.Date;
-import java.util.Dictionary;
 import java.util.HashMap;
-import java.util.Hashtable;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
 import java.util.NoSuchElementException;
 
-import org.apache.sling.commons.metrics.Gauge;
 import org.osgi.framework.BundleContext;
 import org.osgi.framework.Constants;
 import org.osgi.framework.ServiceRegistration;
@@ -41,11 +38,13 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.codahale.metrics.Gauge;
+import com.codahale.metrics.MetricRegistry;
+
 @Component(
         property = {
                 Constants.SERVICE_VENDOR + "=The Apache Software Foundation"
-        },
-        immediate = true
+        }
     )
 /**
  * This service creates gauges for getting how long the oldest running job is
@@ -64,20 +63,20 @@
 
     private final static String CLEANUP_JOB_NAME = "org.apache.sling.commons.scheduler.impl.GaugesSupport.CleanupJob";
 
+    @Reference
+    private MetricRegistry metricRegistry;
+
     @SuppressWarnings("rawtypes")
     private final class TemporaryGauge implements Gauge {
-        private final ServiceRegistration registration;
         private final JobExecutionContext jobExecutionContext;
         private final String gaugeName;
         private volatile boolean unregistered = false;
 
-        private TemporaryGauge(BundleContext ctx, JobExecutionContext jobExecutionContext, String gaugeName) {
+        private TemporaryGauge(final JobExecutionContext jobExecutionContext, final String gaugeName) {
             this.jobExecutionContext = jobExecutionContext;
             this.gaugeName = gaugeName;
 
-            Dictionary<String, String> p = new Hashtable<String, String>();
-            p.put(Gauge.NAME, gaugeName);
-            registration = ctx.registerService(Gauge.class.getName(), TemporaryGauge.this, p);
+            metricRegistry.register(gaugeName,  this);
         }
 
         private void unregister() {
@@ -97,7 +96,7 @@
                     logger.debug("unregister: unregistering dangling temporary gauge for slow job : " + gaugeName);
                 }
             }
-            registration.unregister();
+            metricRegistry.remove(gaugeName);
         }
 
         @Override
@@ -128,8 +127,8 @@
     private final Logger logger = LoggerFactory.getLogger(getClass());
 
     @SuppressWarnings("rawtypes")
-    private final Map<String, ServiceRegistration> gaugeRegistrations = new HashMap<String, ServiceRegistration>();
-    private final Map<String, TemporaryGauge> temporaryGauges = new HashMap<String, TemporaryGauge>();
+    private final Map<String, ServiceRegistration> gaugeRegistrations = new HashMap<>();
+    private final Map<String, TemporaryGauge> temporaryGauges = new HashMap<>();
 
     private volatile boolean active = true;
 
@@ -147,7 +146,7 @@
         active = true;
 
         // register the gauges
-        registerGauges(ctx);
+        registerGauges();
 
         bundleId = ctx.getBundle().getBundleId();
         try {
@@ -178,7 +177,7 @@
         final long start = System.currentTimeMillis();
         final Map<String, TemporaryGauge> localTemporaryGauges;
         synchronized (temporaryGauges) {
-            localTemporaryGauges = new HashMap<String, TemporaryGauge>(temporaryGauges);
+            localTemporaryGauges = new HashMap<>(temporaryGauges);
         }
         final Iterator<TemporaryGauge> it = localTemporaryGauges.values().iterator();
         while (it.hasNext()) {
@@ -195,47 +194,45 @@
                 + ", at end : " + endCount + ", cleanup took : " + diff + "ms)");
     }
 
-    private void registerGauges(BundleContext ctx) {
-        createGauge(ctx, configHolder, null, null, QuartzScheduler.METRICS_NAME_OLDEST_RUNNING_JOB_MILLIS);
-        createGauge(ctx, configHolder, configHolder.poolName(), null,
+    private void registerGauges() {
+        createGauge(configHolder, null, null, QuartzScheduler.METRICS_NAME_OLDEST_RUNNING_JOB_MILLIS);
+        createGauge(configHolder, configHolder.poolName(), null,
                 QuartzScheduler.METRICS_NAME_OLDEST_RUNNING_JOB_MILLIS + ".tp." + configHolder.poolName());
         if (configHolder.allowedPoolNames() != null) {
             for (String tpName : configHolder.allowedPoolNames()) {
-                createGauge(ctx, configHolder, tpName, null,
+                createGauge(configHolder, tpName, null,
                         QuartzScheduler.METRICS_NAME_OLDEST_RUNNING_JOB_MILLIS + ".tp." + tpName);
             }
         }
         for (Map.Entry<String, String> entry : configHolder.getFilterSuffixes().entrySet()) {
             final String name = entry.getKey();
             final String filterName = entry.getValue();
-            createGauge(ctx, configHolder, null, filterName,
+            createGauge(configHolder, null, filterName,
                     QuartzScheduler.METRICS_NAME_OLDEST_RUNNING_JOB_MILLIS + ".filter." + name);
         }
     }
 
     @SuppressWarnings("rawtypes")
-    private void createGauge(final BundleContext ctx, final ConfigHolder configHolder, final String tpName,
-            final String filterName, final String gaugeName) {
-        Dictionary<String, String> p = new Hashtable<String, String>();
-        p.put(Gauge.NAME, gaugeName);
+    private void createGauge(final ConfigHolder configHolder,
+            final String tpName,
+            final String filterName,
+            final String gaugeName) {
         final Gauge gauge = new Gauge() {
             @Override
             public Long getValue() {
                 if (!active) {
                     return -1L; // disabled case
                 }
-                return getOldestRunningJobMillis(configHolder, ctx, tpName, filterName);
+                return getOldestRunningJobMillis(configHolder, tpName, filterName);
             }
         };
         logger.debug("createGauge: registering gauge : " + gaugeName);
-        ServiceRegistration reg = ctx.registerService(Gauge.class.getName(), gauge, p);
-        synchronized (this.gaugeRegistrations) {
-            gaugeRegistrations.put(gaugeName, reg);
-        }
+        this.metricRegistry.register(gaugeName, gauge);
     }
 
-    private Long getOldestRunningJobMillis(ConfigHolder configHolder, BundleContext ctx, String threadPoolNameOrNull,
-            String filterNameOrNull) {
+    private Long getOldestRunningJobMillis(final ConfigHolder configHolder,
+            final String threadPoolNameOrNull,
+            final String filterNameOrNull) {
         final QuartzScheduler localQuartzScheduler = quartzScheduler;
         if (localQuartzScheduler == null) {
             // could happen during deactivation
@@ -252,7 +249,7 @@
             // if a threadPoolName is set and no filter then we go by
             // threadPoolName
             final SchedulerProxy schedulerProxy = schedulers.get(threadPoolNameOrNull);
-            oldestDate = getOldestRunningJobDate(configHolder, ctx, schedulerProxy, null);
+            oldestDate = getOldestRunningJobDate(configHolder, schedulerProxy, null);
         } else {
             // if nothing is set we iterate over everything
             // if both threadPoolName and filter is set, filter has precedence
@@ -260,7 +257,7 @@
             for (Map.Entry<String, SchedulerProxy> entry : schedulers.entrySet()) {
                 SchedulerProxy schedulerProxy = entry.getValue();
                 oldestDate = olderOf(oldestDate,
-                        getOldestRunningJobDate(configHolder, ctx, schedulerProxy, filterNameOrNull));
+                        getOldestRunningJobDate(configHolder, schedulerProxy, filterNameOrNull));
             }
         }
         if (oldestDate == null) {
@@ -270,8 +267,9 @@
         }
     }
 
-    private Date getOldestRunningJobDate(final ConfigHolder configHolder, final BundleContext ctx,
-            final SchedulerProxy schedulerProxy, final String filterNameOrNull) {
+    private Date getOldestRunningJobDate(final ConfigHolder configHolder,
+            final SchedulerProxy schedulerProxy,
+            final String filterNameOrNull) {
         if (schedulerProxy == null) {
             return null;
         }
@@ -315,7 +313,7 @@
             if (slowThresholdMillis > 0 && elapsedMillis > slowThresholdMillis) {
                 // then create a gauge for this slow job in case there isn't one
                 // yet
-                createTemporaryGauge(ctx, jobExecutionContext);
+                createTemporaryGauge(jobExecutionContext);
             }
             oldestDate = olderOf(oldestDate, fireTime);
         }
@@ -334,7 +332,7 @@
         }
     }
 
-    private void createTemporaryGauge(final BundleContext ctx, final JobExecutionContext jobExecutionContext) {
+    private void createTemporaryGauge(final JobExecutionContext jobExecutionContext) {
         final JobDataMap data = jobExecutionContext.getJobDetail().getJobDataMap();
         final String jobName = data.getString(QuartzScheduler.DATA_MAP_NAME);
         final String gaugeName = QuartzScheduler.METRICS_NAME_OLDEST_RUNNING_JOB_MILLIS + ".slow."
@@ -363,7 +361,7 @@
         }
         logger.debug("createTemporaryGauge: creating temporary gauge for slow job : " + gaugeName);
         synchronized (this.temporaryGauges) {
-            temporaryGauges.put(gaugeName, new TemporaryGauge(ctx, jobExecutionContext, gaugeName));
+            temporaryGauges.put(gaugeName, new TemporaryGauge(jobExecutionContext, gaugeName));
         }
     }
 
@@ -371,12 +369,12 @@
     private void unregisterGauges() {
         final Map<String, ServiceRegistration> localGaugeRegistrations;
         synchronized (gaugeRegistrations) {
-            localGaugeRegistrations = new HashMap<String, ServiceRegistration>(gaugeRegistrations);
+            localGaugeRegistrations = new HashMap<>(gaugeRegistrations);
             gaugeRegistrations.clear();
         }
         final Map<String, TemporaryGauge> localTemporaryGauges;
         synchronized (temporaryGauges) {
-            localTemporaryGauges = new HashMap<String, TemporaryGauge>(temporaryGauges);
+            localTemporaryGauges = new HashMap<>(temporaryGauges);
         }
 
         final Iterator<Entry<String, ServiceRegistration>> it = localGaugeRegistrations.entrySet().iterator();
diff --git a/src/main/java/org/apache/sling/commons/scheduler/impl/QuartzJobExecutor.java b/src/main/java/org/apache/sling/commons/scheduler/impl/QuartzJobExecutor.java
index 6431777..d47d97f 100644
--- a/src/main/java/org/apache/sling/commons/scheduler/impl/QuartzJobExecutor.java
+++ b/src/main/java/org/apache/sling/commons/scheduler/impl/QuartzJobExecutor.java
@@ -27,9 +27,6 @@
 import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicReference;
 
-import org.apache.sling.commons.metrics.Counter;
-import org.apache.sling.commons.metrics.MetricsService;
-import org.apache.sling.commons.metrics.Timer;
 import org.apache.sling.commons.scheduler.JobContext;
 import org.apache.sling.commons.scheduler.Scheduler;
 import org.quartz.Job;
@@ -39,6 +36,10 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.codahale.metrics.Counter;
+import com.codahale.metrics.MetricRegistry;
+import com.codahale.metrics.Timer;
+
 /**
  * This component is responsible to launch a {@link org.apache.sling.commons.scheduler.Job}
  * or {@link Runnable} in a Quartz Scheduler.
@@ -47,7 +48,7 @@
 public class QuartzJobExecutor implements Job {
 
     static final int DEFAULT_SLOW_JOB_THRESHOLD_MILLIS = 1000;
-    
+
     /** Is discovery available? */
     public static final AtomicBoolean DISCOVERY_AVAILABLE = new AtomicBoolean(false);
 
@@ -72,13 +73,13 @@
         public final String providedName;
         public final String name;
         public final String[] runOn;
-        
+
         // SLING-5965 : piggybacking metrics field onto JobDesc
         // to avoid having to create yet another object per job execution.
         // creating such an additional object would require a bit more JVM-GC.
         // but to keep JobDesc close to what it was originally intended for
         // ('describing a job') keeping everything additional private
-        private final MetricsService metricsService;
+        private final MetricRegistry metricRegistry;
         private final Counter runningJobsCounter;
         private final Counter overallRunningJobsCounter;
         private final Timer jobDurationTimer;
@@ -91,21 +92,19 @@
             this.name = (String) data.get(QuartzScheduler.DATA_MAP_NAME);
             this.providedName = (String)data.get(QuartzScheduler.DATA_MAP_PROVIDED_NAME);
             this.runOn = (String[])data.get(QuartzScheduler.DATA_MAP_RUN_ON);
-            
+
             // initialize metrics fields
             final QuartzScheduler localQuartzScheduler = (QuartzScheduler) data.get(QuartzScheduler.DATA_MAP_QUARTZ_SCHEDULER);
-            MetricsService localMetricsService = null;
+            MetricRegistry localMetricsService = null;
             ConfigHolder localConfigHolder = null;
             if (localQuartzScheduler != null) {
                 // shouldn't be null but for paranoia
-                localMetricsService = localQuartzScheduler.metricsService;
+                localMetricsService = localQuartzScheduler.metricsRegistry;
                 localConfigHolder = localQuartzScheduler.configHolder;
             }
-            // localMetricsService might be null during deactivation
-            metricsService = localMetricsService == null ? MetricsService.NOOP : localMetricsService;
             // mainConfiguration might be null during deactivation
             slowThresholdMillis = localConfigHolder != null ? localConfigHolder.slowThresholdMillis() : DEFAULT_SLOW_JOB_THRESHOLD_MILLIS;
-            
+
             String metricsSuffix = "";
             final String filterName = MetricsHelper.deriveFilterName(localConfigHolder, job);
             if (filterName != null) {
@@ -118,17 +117,29 @@
                     metricsSuffix = ".tp." + threadPoolName;
                 }
             }
-            
-            runningJobsCounter = metricsService.counter(QuartzScheduler.METRICS_NAME_RUNNING_JOBS + metricsSuffix);
-            jobDurationTimer = metricsService.timer(QuartzScheduler.METRICS_NAME_TIMER + metricsSuffix);
-            overallRunningJobsCounter = metricsSuffix.length() == 0 ? null
-                    : metricsService.counter(QuartzScheduler.METRICS_NAME_RUNNING_JOBS);
+
+            if ( localMetricsService != null ) {
+                metricRegistry = localMetricsService;
+                runningJobsCounter = metricRegistry.counter(QuartzScheduler.METRICS_NAME_RUNNING_JOBS + metricsSuffix);
+                jobDurationTimer = metricRegistry.timer(QuartzScheduler.METRICS_NAME_TIMER + metricsSuffix);
+                overallRunningJobsCounter = metricsSuffix.length() == 0 ? null
+                        : metricRegistry.counter(QuartzScheduler.METRICS_NAME_RUNNING_JOBS);
+            } else {
+                metricRegistry = null;
+                runningJobsCounter = null;
+                jobDurationTimer = null;
+                overallRunningJobsCounter = null;
+            }
         }
-        
+
         private void measureJobStart() {
             // measure job start
-            if (overallRunningJobsCounter != null) overallRunningJobsCounter.increment();
-            runningJobsCounter.increment();
+            if (overallRunningJobsCounter != null) {
+                overallRunningJobsCounter.inc();
+            }
+            if ( runningJobsCounter != null ) {
+                runningJobsCounter.inc();
+            }
             jobStart = System.currentTimeMillis();
         }
 
@@ -137,9 +148,13 @@
                 // then measureJobStart was never invoked - hence not measuring anything
                 return;
             }
-            
-            if (overallRunningJobsCounter != null) overallRunningJobsCounter.decrement();
-            runningJobsCounter.decrement();
+
+            if (overallRunningJobsCounter != null) {
+                overallRunningJobsCounter.dec();
+            }
+            if ( runningJobsCounter != null ) {
+                runningJobsCounter.dec();
+            }
             final long elapsedMillis = System.currentTimeMillis() - jobStart;
             // depending on slowness either measure via a separate 'slow' or the normal timer
             // (and this triage can only be done by manual measuring)
@@ -147,15 +162,19 @@
                 // if the job was slow we (only) add it to a separate '.slow.' timer
                 // the idea being to not "pollute" the normal timer which would
                 // get quite skewed metrics otherwise with slow jobs around
-                final String slowTimerName = QuartzScheduler.METRICS_NAME_TIMER + ".slow."
-                        + MetricsHelper.asMetricsSuffix(this.name);
-                metricsService.timer(slowTimerName).update(elapsedMillis, TimeUnit.MILLISECONDS);
+                if ( metricRegistry != null ) {
+                    final String slowTimerName = QuartzScheduler.METRICS_NAME_TIMER + ".slow."
+                            + MetricsHelper.asMetricsSuffix(this.name);
+                    metricRegistry.timer(slowTimerName).update(elapsedMillis, TimeUnit.MILLISECONDS);
+                }
             } else {
                 // if the job was not slow, then measure it normally
-                jobDurationTimer.update(elapsedMillis, TimeUnit.MILLISECONDS);
+                if ( jobDurationTimer != null ) {
+                    jobDurationTimer.update(elapsedMillis, TimeUnit.MILLISECONDS);
+                }
             }
         }
-        
+
         public boolean isKnownJob() {
             return this.job != null && this.name != null;
         }
diff --git a/src/main/java/org/apache/sling/commons/scheduler/impl/QuartzScheduler.java b/src/main/java/org/apache/sling/commons/scheduler/impl/QuartzScheduler.java
index b9e5309..a8360ac 100644
--- a/src/main/java/org/apache/sling/commons/scheduler/impl/QuartzScheduler.java
+++ b/src/main/java/org/apache/sling/commons/scheduler/impl/QuartzScheduler.java
@@ -25,7 +25,6 @@
 import java.util.Set;
 import java.util.UUID;
 
-import org.apache.sling.commons.metrics.MetricsService;
 import org.apache.sling.commons.scheduler.Job;
 import org.apache.sling.commons.scheduler.ScheduleOptions;
 import org.apache.sling.commons.scheduler.Scheduler;
@@ -54,6 +53,8 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.codahale.metrics.MetricRegistry;
+
 /**
  * The quartz based implementation of the scheduler.
  *
@@ -112,7 +113,7 @@
     private ThreadPoolManager threadPoolManager;
 
     @Reference
-    MetricsService metricsService;
+    MetricRegistry metricsRegistry;
 
     ConfigHolder configHolder;