HDFS-6403. Add metrics for log warnings reported by JVM pauses. Contributed by Yongjun Zhang.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1604074 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/source/JvmMetrics.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/source/JvmMetrics.java
index 7504991..c62caf3 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/source/JvmMetrics.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/source/JvmMetrics.java
@@ -38,6 +38,7 @@
import org.apache.hadoop.metrics2.lib.Interns;
import static org.apache.hadoop.metrics2.source.JvmMetricsInfo.*;
import static org.apache.hadoop.metrics2.impl.MsInfo.*;
+import org.apache.hadoop.util.JvmPauseMonitor;
/**
* JVM and logging related metrics.
@@ -65,6 +66,7 @@
ManagementFactory.getGarbageCollectorMXBeans();
final ThreadMXBean threadMXBean = ManagementFactory.getThreadMXBean();
final String processName, sessionId;
+ private JvmPauseMonitor pauseMonitor = null;
final ConcurrentHashMap<String, MetricsInfo[]> gcInfoCache =
new ConcurrentHashMap<String, MetricsInfo[]>();
@@ -73,6 +75,10 @@
this.sessionId = sessionId;
}
+ public void setPauseMonitor(final JvmPauseMonitor pauseMonitor) {
+ this.pauseMonitor = pauseMonitor;
+ }
+
public static JvmMetrics create(String processName, String sessionId,
MetricsSystem ms) {
return ms.register(JvmMetrics.name(), JvmMetrics.description(),
@@ -120,6 +126,15 @@
}
rb.addCounter(GcCount, count)
.addCounter(GcTimeMillis, timeMillis);
+
+ if (pauseMonitor != null) {
+ rb.addCounter(GcNumWarnThresholdExceeded,
pauseMonitor.getNumGcWarnThresholdExceeded());
+ rb.addCounter(GcNumInfoThresholdExceeded,
pauseMonitor.getNumGcInfoThresholdExceeded());
+ rb.addCounter(GcTotalExtraSleepTime,
pauseMonitor.getTotalGcExtraSleepTime());
+ }
}
private MetricsInfo[] getGcInfo(String gcName) {
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/source/JvmMetricsInfo.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/source/JvmMetricsInfo.java
index 64a59b6..55bb417 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/source/JvmMetricsInfo.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/source/JvmMetricsInfo.java
@@ -48,7 +48,10 @@
LogFatal("Total number of fatal log events"),
LogError("Total number of error log events"),
LogWarn("Total number of warning log events"),
- LogInfo("Total number of info log events");
+ LogInfo("Total number of info log events"),
+ GcNumWarnThresholdExceeded("Number of times that the GC warn threshold is exceeded"),
+ GcNumInfoThresholdExceeded("Number of times that the GC info threshold is exceeded"),
+ GcTotalExtraSleepTime("Total GC extra sleep time in milliseconds");
private final String desc;
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/JvmPauseMonitor.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/JvmPauseMonitor.java
index f7932a6..e8af45e 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/JvmPauseMonitor.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/JvmPauseMonitor.java
@@ -62,10 +62,13 @@
"jvm.pause.info-threshold.ms";
private static final long INFO_THRESHOLD_DEFAULT = 1000;
-
+ private long numGcWarnThresholdExceeded = 0;
+ private long numGcInfoThresholdExceeded = 0;
+ private long totalGcExtraSleepTime = 0;
+
private Thread monitorThread;
private volatile boolean shouldRun = true;
-
+
public JvmPauseMonitor(Configuration conf) {
this.warnThresholdMs = conf.getLong(WARN_THRESHOLD_KEY, WARN_THRESHOLD_DEFAULT);
this.infoThresholdMs = conf.getLong(INFO_THRESHOLD_KEY, INFO_THRESHOLD_DEFAULT);
@@ -87,6 +90,22 @@
Thread.currentThread().interrupt();
}
}
+
+ public boolean isStarted() {
+ return monitorThread != null;
+ }
+
+ public long getNumGcWarnThresholdExceeded() {
+ return numGcWarnThresholdExceeded;
+ }
+
+ public long getNumGcInfoThresholdExceeded() {
+ return numGcInfoThresholdExceeded;
+ }
+
+ public long getTotalGcExtraSleepTime() {
+ return totalGcExtraSleepTime;
+ }
private String formatMessage(long extraSleepTime,
Map<String, GcTimes> gcTimesAfterSleep,
@@ -166,13 +185,15 @@
Map<String, GcTimes> gcTimesAfterSleep = getGcTimes();
if (extraSleepTime > warnThresholdMs) {
+ ++numGcWarnThresholdExceeded;
LOG.warn(formatMessage(
extraSleepTime, gcTimesAfterSleep, gcTimesBeforeSleep));
} else if (extraSleepTime > infoThresholdMs) {
+ ++numGcInfoThresholdExceeded;
LOG.info(formatMessage(
extraSleepTime, gcTimesAfterSleep, gcTimesBeforeSleep));
}
-
+ totalGcExtraSleepTime += extraSleepTime;
gcTimesBeforeSleep = gcTimesAfterSleep;
}
}
diff --git a/hadoop-common-project/hadoop-common/src/site/apt/Metrics.apt.vm b/hadoop-common-project/hadoop-common/src/site/apt/Metrics.apt.vm
index 55e532d..14cc712 100644
--- a/hadoop-common-project/hadoop-common/src/site/apt/Metrics.apt.vm
+++ b/hadoop-common-project/hadoop-common/src/site/apt/Metrics.apt.vm
@@ -86,6 +86,14 @@
*-------------------------------------+--------------------------------------+
|<<<LogInfo>>> | Total number of INFO logs
*-------------------------------------+--------------------------------------+
+|<<<GcNumWarnThresholdExceeded>>> | Number of times that the GC warn
+ | threshold is exceeded
+*-------------------------------------+--------------------------------------+
+|<<<GcNumInfoThresholdExceeded>>> | Number of times that the GC info
+ | threshold is exceeded
+*-------------------------------------+--------------------------------------+
+|<<<GcTotalExtraSleepTime>>> | Total GC extra sleep time in msec
+*-------------------------------------+--------------------------------------+
rpc context
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/metrics2/source/TestJvmMetrics.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/metrics2/source/TestJvmMetrics.java
index 1fc91c3..3cb3384 100644
--- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/metrics2/source/TestJvmMetrics.java
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/metrics2/source/TestJvmMetrics.java
@@ -19,18 +19,25 @@
package org.apache.hadoop.metrics2.source;
import org.junit.Test;
+
import static org.mockito.Mockito.*;
import static org.apache.hadoop.test.MetricsAsserts.*;
+import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.metrics2.MetricsCollector;
import org.apache.hadoop.metrics2.MetricsRecordBuilder;
+import org.apache.hadoop.util.JvmPauseMonitor;
+
import static org.apache.hadoop.metrics2.source.JvmMetricsInfo.*;
import static org.apache.hadoop.metrics2.impl.MsInfo.*;
public class TestJvmMetrics {
@Test public void testPresence() {
- MetricsRecordBuilder rb = getMetrics(new JvmMetrics("test", "test"));
+ JvmPauseMonitor pauseMonitor = new JvmPauseMonitor(new Configuration());
+ JvmMetrics jvmMetrics = new JvmMetrics("test", "test");
+ jvmMetrics.setPauseMonitor(pauseMonitor);
+ MetricsRecordBuilder rb = getMetrics(jvmMetrics);
MetricsCollector mc = rb.parent();
verify(mc).addRecord(JvmMetrics);
diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
index baf6546..dae38c2 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
+++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
@@ -453,6 +453,9 @@
HDFS-6480. Move waitForReady() from FSDirectory to FSNamesystem. (wheat9)
+ HDFS-6403. Add metrics for log warnings reported by JVM pauses. (Yongjun
+ Zhang via atm)
+
OPTIMIZATIONS
HDFS-6214. Webhdfs has poor throughput for files >2GB (daryn)
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java
index 292259a..5797c00 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java
@@ -778,7 +778,8 @@
initIpcServer(conf);
metrics = DataNodeMetrics.create(conf, getDisplayName());
-
+ metrics.getJvmMetrics().setPauseMonitor(pauseMonitor);
+
blockPoolManager = new BlockPoolManager(this);
blockPoolManager.refreshNamenodes(conf);
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/metrics/DataNodeMetrics.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/metrics/DataNodeMetrics.java
index 9601bcf..b536e7e 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/metrics/DataNodeMetrics.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/metrics/DataNodeMetrics.java
@@ -90,13 +90,15 @@
final MutableQuantiles[] sendDataPacketBlockedOnNetworkNanosQuantiles;
@Metric MutableRate sendDataPacketTransferNanos;
final MutableQuantiles[] sendDataPacketTransferNanosQuantiles;
-
final MetricsRegistry registry = new MetricsRegistry("datanode");
final String name;
-
- public DataNodeMetrics(String name, String sessionId, int[] intervals) {
+ JvmMetrics jvmMetrics = null;
+
+ public DataNodeMetrics(String name, String sessionId, int[] intervals,
+ final JvmMetrics jvmMetrics) {
this.name = name;
+ this.jvmMetrics = jvmMetrics;
registry.tag(SessionId, sessionId);
final int len = intervals.length;
@@ -131,7 +133,7 @@
public static DataNodeMetrics create(Configuration conf, String dnName) {
String sessionId = conf.get(DFSConfigKeys.DFS_METRICS_SESSION_ID_KEY);
MetricsSystem ms = DefaultMetricsSystem.instance();
- JvmMetrics.create("DataNode", sessionId, ms);
+ JvmMetrics jm = JvmMetrics.create("DataNode", sessionId, ms);
String name = "DataNodeActivity-"+ (dnName.isEmpty()
? "UndefinedDataNodeName"+ DFSUtil.getRandom().nextInt()
: dnName.replace(':', '-'));
@@ -141,11 +143,15 @@
conf.getInts(DFSConfigKeys.DFS_METRICS_PERCENTILES_INTERVALS_KEY);
return ms.register(name, null, new DataNodeMetrics(name, sessionId,
- intervals));
+ intervals, jm));
}
public String name() { return name; }
+ public JvmMetrics getJvmMetrics() {
+ return jvmMetrics;
+ }
+
public void addHeartbeat(long latency) {
heartbeats.add(latency);
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java
index 9cdad26..9747f9c 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java
@@ -598,7 +598,8 @@
pauseMonitor = new JvmPauseMonitor(conf);
pauseMonitor.start();
-
+ metrics.getJvmMetrics().setPauseMonitor(pauseMonitor);
+
startCommonServices(conf);
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/metrics/NameNodeMetrics.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/metrics/NameNodeMetrics.java
index 52ad185..42942dc 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/metrics/NameNodeMetrics.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/metrics/NameNodeMetrics.java
@@ -98,7 +98,11 @@
@Metric("GetImageServlet putImage")
MutableRate putImage;
- NameNodeMetrics(String processName, String sessionId, int[] intervals) {
+ JvmMetrics jvmMetrics = null;
+
+ NameNodeMetrics(String processName, String sessionId, int[] intervals,
+ final JvmMetrics jvmMetrics) {
+ this.jvmMetrics = jvmMetrics;
registry.tag(ProcessName, processName).tag(SessionId, sessionId);
final int len = intervals.length;
@@ -124,14 +128,19 @@
String sessionId = conf.get(DFSConfigKeys.DFS_METRICS_SESSION_ID_KEY);
String processName = r.toString();
MetricsSystem ms = DefaultMetricsSystem.instance();
- JvmMetrics.create(processName, sessionId, ms);
+ JvmMetrics jm = JvmMetrics.create(processName, sessionId, ms);
// Percentile measurement is off by default, by watching no intervals
int[] intervals =
conf.getInts(DFSConfigKeys.DFS_METRICS_PERCENTILES_INTERVALS_KEY);
- return ms.register(new NameNodeMetrics(processName, sessionId, intervals));
+ return ms.register(new NameNodeMetrics(processName, sessionId,
+ intervals, jm));
}
+ public JvmMetrics getJvmMetrics() {
+ return jvmMetrics;
+ }
+
public void shutdown() {
DefaultMetricsSystem.shutdown();
}