blob: ed457e2f20957e69e16b2727129c59c4f6831499 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.ignite.internal.components;
import static org.apache.ignite.internal.lang.IgniteSystemProperties.getBoolean;
import static org.apache.ignite.internal.lang.IgniteSystemProperties.getInteger;
import static org.apache.ignite.internal.util.CompletableFutures.nullCompletedFuture;
import java.util.Map;
import java.util.TreeMap;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.atomic.AtomicReference;
import org.apache.ignite.internal.lang.IgniteBiTuple;
import org.apache.ignite.internal.logger.IgniteLogger;
import org.apache.ignite.internal.logger.Loggers;
import org.apache.ignite.internal.manager.IgniteComponent;
import org.apache.ignite.internal.thread.NamedThreadFactory;
import org.apache.ignite.internal.tostring.S;
import org.jetbrains.annotations.Nullable;
/**
* Class for detection of long JVM pauses. It has a worker thread, which wakes up in cycle every {@code PRECISION} (default is 50)
* milliseconds, and monitors a time values between awakenings. If worker pause exceeds the expected value more than {@code THRESHOLD}
* default is 500), the difference is considered as JVM pause, most likely STW, and event of long JVM pause is registered. The values of
* {@code PRECISION}, {@code THRESHOLD} and {@code EVT_CNT} (event window size, default is 20) can be configured in system or environment
* properties IGNITE_JVM_PAUSE_DETECTOR_PRECISION, IGNITE_JVM_PAUSE_DETECTOR_THRESHOLD and IGNITE_JVM_PAUSE_DETECTOR_LAST_EVENTS_COUNT
* accordingly.
*/
public class LongJvmPauseDetector implements IgniteComponent {
private final IgniteLogger log = Loggers.forClass(LongJvmPauseDetector.class);
/** Ignite JVM pause detector threshold default value. */
public static final int DEFAULT_JVM_PAUSE_DETECTOR_THRESHOLD = 500;
/** System property to change default is IGNITE_JVM_PAUSE_DETECTOR_PRECISION. */
public static final int DFLT_JVM_PAUSE_DETECTOR_PRECISION = 50;
/** System property to change default is IGNITE_JVM_PAUSE_DETECTOR_LAST_EVENTS_COUNT. */
public static final int DFLT_JVM_PAUSE_DETECTOR_LAST_EVENTS_COUNT = 20;
/** Precision. */
private static final int PRECISION = getInteger("IGNITE_JVM_PAUSE_DETECTOR_PRECISION", DFLT_JVM_PAUSE_DETECTOR_PRECISION);
/** Threshold. */
private final int threshold = getInteger("IGNITE_JVM_PAUSE_DETECTOR_THRESHOLD", DEFAULT_JVM_PAUSE_DETECTOR_THRESHOLD);
/** Event count. */
private static final int EVT_CNT = getInteger("IGNITE_JVM_PAUSE_DETECTOR_LAST_EVENTS_COUNT",
DFLT_JVM_PAUSE_DETECTOR_LAST_EVENTS_COUNT);
/** Disabled flag. */
private static final boolean DISABLED = getBoolean("IGNITE_JVM_PAUSE_DETECTOR_DISABLED");
/** Worker reference. */
private final AtomicReference<Thread> workerRef = new AtomicReference<>();
/** Long pause count. */
private long longPausesCnt;
/** Long pause total duration. */
private long longPausesTotalDuration;
/** Last detector's wake up time. */
private long lastWakeUpTime;
/** Long pauses timestamps. */
private final long[] longPausesTimestamps = new long[EVT_CNT];
/** Long pauses durations. */
private final long[] longPausesDurations = new long[EVT_CNT];
private final String nodeName;
public LongJvmPauseDetector(String nodeName) {
this.nodeName = nodeName;
}
/** {@inheritDoc} */
@Override
public CompletableFuture<Void> startAsync() {
if (DISABLED) {
log.debug("JVM Pause Detector is disabled");
return nullCompletedFuture();
}
final Thread worker = new Thread(NamedThreadFactory.threadPrefix(nodeName, "jvm-pause-detector-worker")) {
@Override
public void run() {
synchronized (LongJvmPauseDetector.this) {
lastWakeUpTime = System.currentTimeMillis();
}
log.debug("Detector worker has been started [thread={}]", getName());
while (true) {
try {
Thread.sleep(PRECISION);
final long now = System.currentTimeMillis();
final long pause = now - PRECISION - lastWakeUpTime;
if (pause >= threshold) {
log.warn("Possible too long JVM pause [duration={}ms]", pause);
synchronized (LongJvmPauseDetector.this) {
final int next = (int) (longPausesCnt % EVT_CNT);
longPausesCnt++;
longPausesTotalDuration += pause;
longPausesTimestamps[next] = now;
longPausesDurations[next] = pause;
lastWakeUpTime = now;
}
} else {
synchronized (LongJvmPauseDetector.this) {
lastWakeUpTime = now;
}
}
} catch (InterruptedException e) {
if (workerRef.compareAndSet(this, null)) {
log.debug("Thread has been interrupted [thread={}]", e, getName());
} else {
log.debug("Thread has been stopped [thread={}]", getName());
}
break;
}
}
}
};
if (!workerRef.compareAndSet(null, worker)) {
log.debug("{} already started", LongJvmPauseDetector.class.getSimpleName());
return nullCompletedFuture();
}
worker.setDaemon(true);
worker.start();
log.debug("{} was successfully started", LongJvmPauseDetector.class.getSimpleName());
return nullCompletedFuture();
}
/** {@inheritDoc} */
@Override
public CompletableFuture<Void> stopAsync() {
final Thread worker = workerRef.getAndSet(null);
if (worker != null && worker.isAlive() && !worker.isInterrupted()) {
worker.interrupt();
}
return nullCompletedFuture();
}
/**
* Gets a logging status.
*
* @return {@code false} if IGNITE_JVM_PAUSE_DETECTOR_DISABLED set to {@code true}, and {@code true} otherwise.
*/
public static boolean enabled() {
return !DISABLED;
}
/**
* Gets a pause count.
*
* @return Long JVM pauses count.
*/
synchronized long longPausesCount() {
return longPausesCnt;
}
/**
* Gets a total duration.
*
* @return Long JVM pauses total duration.
*/
synchronized long longPausesTotalDuration() {
return longPausesTotalDuration;
}
/**
* Gets a last wakeup time.
*
* @return Last checker's wake up time.
*/
public synchronized long getLastWakeUpTime() {
return lastWakeUpTime;
}
/**
* Gets long pause events.
*
* @return Last long JVM pause events.
*/
synchronized Map<Long, Long> longPauseEvents() {
final Map<Long, Long> evts = new TreeMap<>();
for (int i = 0; i < longPausesTimestamps.length && longPausesTimestamps[i] != 0; i++) {
evts.put(longPausesTimestamps[i], longPausesDurations[i]);
}
return evts;
}
/**
* Gets the last long pause event.
*
* @return Pair ({@code last long pause event time}, {@code pause time duration}) or {@code null}, if long pause wasn't occurred.
*/
public synchronized @Nullable IgniteBiTuple<Long, Long> getLastLongPause() {
int lastPauseIdx = (int) ((EVT_CNT + longPausesCnt - 1) % EVT_CNT);
if (longPausesTimestamps[lastPauseIdx] == 0) {
return null;
}
return new IgniteBiTuple<>(longPausesTimestamps[lastPauseIdx], longPausesDurations[lastPauseIdx]);
}
/**
* Return long JVM pause threshold in mills.
*/
public long longJvmPauseThreshold() {
return threshold;
}
/** {@inheritDoc} */
@Override
public String toString() {
return S.toString(LongJvmPauseDetector.class, this);
}
}