Support custom GRPCClient health checker logic. (#13353)
[NOTICE] Roll back score meaning in GraphQL health check API.
diff --git a/dist-material/release-docs/LICENSE b/dist-material/release-docs/LICENSE
index 60d6a59..43d5b2e 100644
--- a/dist-material/release-docs/LICENSE
+++ b/dist-material/release-docs/LICENSE
@@ -226,7 +226,7 @@
https://mvnrepository.com/artifact/com.fasterxml.jackson.datatype/jackson-datatype-jsr310/2.18.2 Apache-2.0
https://mvnrepository.com/artifact/com.fasterxml.jackson.module/jackson-module-kotlin/2.13.4 Apache-2.0
https://mvnrepository.com/artifact/com.fasterxml/classmate/1.5.1 Apache-2.0
- https://mvnrepository.com/artifact/com.google.api.grpc/proto-google-common-protos/2.41.0 Apache-2.0
+ https://mvnrepository.com/artifact/com.google.api.grpc/proto-google-common-protos/2.48.0 Apache-2.0
https://mvnrepository.com/artifact/com.google.auto.service/auto-service-annotations/1.0.1 Apache-2.0
https://mvnrepository.com/artifact/com.google.code.findbugs/jsr305/3.0.2 Apache-2.0
https://mvnrepository.com/artifact/com.google.code.gson/gson/2.9.0 Apache-2.0
@@ -290,16 +290,16 @@
https://mvnrepository.com/artifact/io.fabric8/kubernetes-model-scheduling/6.7.1 Apache-2.0
https://mvnrepository.com/artifact/io.fabric8/kubernetes-model-storageclass/6.7.1 Apache-2.0
https://mvnrepository.com/artifact/io.fabric8/zjsonpatch/0.3.0 Apache-2.0
- https://mvnrepository.com/artifact/io.grpc/grpc-api/1.68.1 Apache-2.0
- https://mvnrepository.com/artifact/io.grpc/grpc-context/1.68.1 Apache-2.0
- https://mvnrepository.com/artifact/io.grpc/grpc-core/1.68.1 Apache-2.0
- https://mvnrepository.com/artifact/io.grpc/grpc-grpclb/1.68.1 Apache-2.0
- https://mvnrepository.com/artifact/io.grpc/grpc-netty/1.68.1 Apache-2.0
- https://mvnrepository.com/artifact/io.grpc/grpc-protobuf/1.68.1 Apache-2.0
- https://mvnrepository.com/artifact/io.grpc/grpc-protobuf-lite/1.68.1 Apache-2.0
+ https://mvnrepository.com/artifact/io.grpc/grpc-api/1.70.0 Apache-2.0
+ https://mvnrepository.com/artifact/io.grpc/grpc-context/1.70.0 Apache-2.0
+ https://mvnrepository.com/artifact/io.grpc/grpc-core/1.70.0 Apache-2.0
+ https://mvnrepository.com/artifact/io.grpc/grpc-grpclb/1.70.0 Apache-2.0
+ https://mvnrepository.com/artifact/io.grpc/grpc-netty/1.70.0 Apache-2.0
+ https://mvnrepository.com/artifact/io.grpc/grpc-protobuf/1.70.0 Apache-2.0
+ https://mvnrepository.com/artifact/io.grpc/grpc-protobuf-lite/1.70.0 Apache-2.0
https://mvnrepository.com/artifact/io.grpc/grpc-services/1.70.0 Apache-2.0
- https://mvnrepository.com/artifact/io.grpc/grpc-stub/1.68.1 Apache-2.0
- https://mvnrepository.com/artifact/io.grpc/grpc-util/1.68.1 Apache-2.0
+ https://mvnrepository.com/artifact/io.grpc/grpc-stub/1.70.0 Apache-2.0
+ https://mvnrepository.com/artifact/io.grpc/grpc-util/1.70.0 Apache-2.0
https://mvnrepository.com/artifact/io.micrometer/micrometer-commons/1.14.4 Apache-2.0
https://mvnrepository.com/artifact/io.micrometer/micrometer-core/1.14.4 Apache-2.0
https://mvnrepository.com/artifact/io.micrometer/micrometer-observation/1.14.4 Apache-2.0
diff --git a/docs/en/changes/changes.md b/docs/en/changes/changes.md
index 03d0b20..2f8b4a3 100644
--- a/docs/en/changes/changes.md
+++ b/docs/en/changes/changes.md
@@ -37,7 +37,8 @@
* chore: add a warning log when connecting to ES takes too long.
* Fix the query time range in the metadata API.
* OAP gRPC-Client support `Health Check`.
-* [Break Change] `Health Check` make response 1 represents healthy, 0 represents unhealthy.
+* [Break Change] `health_check_xx` metrics now report 1 for healthy and 0 for unhealthy.
+* Bump up grpc to 1.70.0.
#### UI
diff --git a/docs/en/setup/backend/backend-health-check.md b/docs/en/setup/backend/backend-health-check.md
index a8201ab..c717851 100644
--- a/docs/en/setup/backend/backend-health-check.md
+++ b/docs/en/setup/backend/backend-health-check.md
@@ -36,7 +36,7 @@
{
"data": {
"checkHealth": {
- "score": 1,
+ "score": 0,
"details": ""
}
}
@@ -49,7 +49,7 @@
{
"data": {
"checkHealth": {
- "score": 0,
+ "score": 1,
"details": "storage_h2,"
}
}
diff --git a/oap-server-bom/pom.xml b/oap-server-bom/pom.xml
index 5ab5564..050d3ad 100644
--- a/oap-server-bom/pom.xml
+++ b/oap-server-bom/pom.xml
@@ -254,6 +254,11 @@
<version>${grpc.version}</version>
</dependency>
<dependency>
+ <groupId>io.grpc</groupId>
+ <artifactId>grpc-services</artifactId>
+ <version>${grpc.version}</version>
+ </dependency>
+ <dependency>
<groupId>io.netty</groupId>
<artifactId>netty-tcnative-boringssl-static</artifactId>
<version>${netty-tcnative-boringssl-static.version}</version>
diff --git a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/query/type/HealthStatus.java b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/query/type/HealthStatus.java
index ab073d4..6a65bc7 100644
--- a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/query/type/HealthStatus.java
+++ b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/query/type/HealthStatus.java
@@ -26,7 +26,7 @@
@Setter
@ToString
public class HealthStatus {
- // score == 1 means healthy, otherwise it's unhealthy.
+ // score == 0 means healthy (no unhealthy component or connection); any other value means unhealthy.
private int score;
private String details;
}
diff --git a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/remote/health/HealthCheckServiceHandler.java b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/remote/health/HealthCheckServiceHandler.java
index 53a8776..611da0a 100644
--- a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/remote/health/HealthCheckServiceHandler.java
+++ b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/remote/health/HealthCheckServiceHandler.java
@@ -18,17 +18,15 @@
package org.apache.skywalking.oap.server.core.remote.health;
-import grpc.health.v1.HealthCheckService;
-import grpc.health.v1.HealthGrpc;
+import io.grpc.health.v1.HealthCheckRequest;
+import io.grpc.health.v1.HealthCheckResponse;
+import io.grpc.health.v1.HealthGrpc;
import io.grpc.stub.StreamObserver;
+import lombok.extern.slf4j.Slf4j;
import org.apache.skywalking.oap.server.library.server.grpc.GRPCHandler;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+@Slf4j
public class HealthCheckServiceHandler extends HealthGrpc.HealthImplBase implements GRPCHandler {
-
- private static final Logger LOGGER = LoggerFactory.getLogger(HealthCheckServiceHandler.class);
-
/**
* By my test, consul didn't send the service.
*
@@ -36,15 +34,13 @@
* @param responseObserver status
*/
@Override
- public void check(HealthCheckService.HealthCheckRequest request,
- StreamObserver<HealthCheckService.HealthCheckResponse> responseObserver) {
-
- if (LOGGER.isDebugEnabled()) {
- LOGGER.debug("Received the gRPC server health check with the service name of {}", request.getService());
+ public void check(HealthCheckRequest request, StreamObserver<HealthCheckResponse> responseObserver) {
+ if (log.isDebugEnabled()) {
+ log.debug("Received the gRPC server health check with the service name of {}", request.getService());
}
- HealthCheckService.HealthCheckResponse.Builder response = HealthCheckService.HealthCheckResponse.newBuilder();
- response.setStatus(HealthCheckService.HealthCheckResponse.ServingStatus.SERVING);
+ HealthCheckResponse.Builder response = HealthCheckResponse.newBuilder();
+ response.setStatus(HealthCheckResponse.ServingStatus.SERVING);
responseObserver.onNext(response.build());
responseObserver.onCompleted();
diff --git a/oap-server/server-core/src/main/proto/HealthCheckService.proto b/oap-server/server-core/src/main/proto/HealthCheckService.proto
deleted file mode 100644
index ecae37c..0000000
--- a/oap-server/server-core/src/main/proto/HealthCheckService.proto
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- */
-
-//This is a health check proto, provided by gRPC team. Please don't change it.
-//https://github.com/grpc/grpc/blob/master/doc/health-checking.md
-syntax = "proto3";
-
-package grpc.health.v1;
-
-message HealthCheckRequest {
- string service = 1;
-}
-
-message HealthCheckResponse {
- enum ServingStatus {
- UNKNOWN = 0;
- SERVING = 1;
- NOT_SERVING = 2;
- }
- ServingStatus status = 1;
-}
-
-service Health {
- rpc Check (HealthCheckRequest) returns (HealthCheckResponse);
-}
\ No newline at end of file
diff --git a/oap-server/server-health-checker/src/main/java/org/apache/skywalking/oap/server/health/checker/provider/HealthCheckerHttpService.java b/oap-server/server-health-checker/src/main/java/org/apache/skywalking/oap/server/health/checker/provider/HealthCheckerHttpService.java
index 7b520fc..356d76a 100644
--- a/oap-server/server-health-checker/src/main/java/org/apache/skywalking/oap/server/health/checker/provider/HealthCheckerHttpService.java
+++ b/oap-server/server-health-checker/src/main/java/org/apache/skywalking/oap/server/health/checker/provider/HealthCheckerHttpService.java
@@ -37,7 +37,7 @@
final var status = healthQueryService.checkHealth();
log.info("Health status: {}", status);
- if (status.getScore() == 1) {
+ if (status.getScore() == 0) {
return HttpResponse.of(HttpStatus.OK);
}
return HttpResponse.of(HttpStatus.SERVICE_UNAVAILABLE);
diff --git a/oap-server/server-health-checker/src/main/java/org/apache/skywalking/oap/server/health/checker/provider/HealthCheckerProvider.java b/oap-server/server-health-checker/src/main/java/org/apache/skywalking/oap/server/health/checker/provider/HealthCheckerProvider.java
index 2051b5b..6bec385 100644
--- a/oap-server/server-health-checker/src/main/java/org/apache/skywalking/oap/server/health/checker/provider/HealthCheckerProvider.java
+++ b/oap-server/server-health-checker/src/main/java/org/apache/skywalking/oap/server/health/checker/provider/HealthCheckerProvider.java
@@ -25,7 +25,6 @@
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
-import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference;
import lombok.extern.slf4j.Slf4j;
import org.apache.skywalking.oap.server.core.CoreModule;
@@ -100,18 +99,22 @@
@Override public void notifyAfterCompleted() throws ServiceNotProvidedException, ModuleStartException {
ses.scheduleAtFixedRate(() -> {
StringBuilder unhealthyModules = new StringBuilder();
- AtomicBoolean hasUnhealthyModule = new AtomicBoolean(false);
+ AtomicDouble unhealthyModule = new AtomicDouble(0);
Stream.ofAll(collector.collect())
.flatMap(metricFamily -> metricFamily.samples)
.filter(sample -> metricsCreator.isHealthCheckerMetrics(sample.name))
.forEach(sample -> {
if (sample.value < 1) {
unhealthyModules.append(metricsCreator.extractModuleName(sample.name)).append(",");
- hasUnhealthyModule.set(true);
+ unhealthyModule.updateAndGet(v -> v + 1);
}
});
- score.set(hasUnhealthyModule.get() ? 0 : 1);
+ if (unhealthyModule.get() > 0) {
+ score.set(unhealthyModule.get());
+ } else {
+ score.set(0);
+ }
details.set(unhealthyModules.toString());
},
2, config.getCheckIntervalSeconds(), TimeUnit.SECONDS);
diff --git a/oap-server/server-library/library-client/pom.xml b/oap-server/server-library/library-client/pom.xml
index 94e9dd0..12952e3 100755
--- a/oap-server/server-library/library-client/pom.xml
+++ b/oap-server/server-library/library-client/pom.xml
@@ -54,6 +54,10 @@
<artifactId>grpc-netty</artifactId>
</dependency>
<dependency>
+ <groupId>io.grpc</groupId>
+ <artifactId>grpc-services</artifactId>
+ </dependency>
+ <dependency>
<groupId>io.netty</groupId>
<artifactId>netty-codec-http2</artifactId>
</dependency>
diff --git a/oap-server/server-library/library-client/src/main/java/org/apache/skywalking/oap/server/library/client/grpc/GRPCClient.java b/oap-server/server-library/library-client/src/main/java/org/apache/skywalking/oap/server/library/client/grpc/GRPCClient.java
index ce54de1..1ab0ee3 100644
--- a/oap-server/server-library/library-client/src/main/java/org/apache/skywalking/oap/server/library/client/grpc/GRPCClient.java
+++ b/oap-server/server-library/library-client/src/main/java/org/apache/skywalking/oap/server/library/client/grpc/GRPCClient.java
@@ -18,26 +18,27 @@
package org.apache.skywalking.oap.server.library.client.grpc;
-import io.grpc.ConnectivityState;
import io.grpc.ManagedChannel;
import io.grpc.ManagedChannelBuilder;
+import io.grpc.Status;
+import io.grpc.StatusRuntimeException;
+import io.grpc.health.v1.HealthCheckRequest;
+import io.grpc.health.v1.HealthCheckResponse;
+import io.grpc.health.v1.HealthGrpc;
import io.grpc.netty.NettyChannelBuilder;
import io.netty.handler.ssl.SslContext;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import lombok.Getter;
+import lombok.extern.slf4j.Slf4j;
import org.apache.skywalking.oap.server.library.client.Client;
import org.apache.skywalking.oap.server.library.client.healthcheck.DelegatedHealthChecker;
import org.apache.skywalking.oap.server.library.client.healthcheck.HealthCheckable;
import org.apache.skywalking.oap.server.library.util.HealthChecker;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+@Slf4j
public class GRPCClient implements Client, HealthCheckable {
-
- private static final Logger LOGGER = LoggerFactory.getLogger(GRPCClient.class);
-
@Getter
private final String host;
@@ -54,6 +55,35 @@
private boolean enableHealthCheck = false;
+ private long initialDelay = 5; // Initial delay for health check in seconds
+
+ private long period = 20; // Period for health check in seconds
+
+ // The default health check runnable: calls the remote gRPC Health service and reports the outcome to healthChecker.
+ private Runnable healthCheckRunnable = () -> {
+ if (getChannel() != null && !getChannel().isShutdown()) {
+ // Bound the blocking call with a deadline; without one, a server that never answers could block
+ // the single-threaded health check executor indefinitely.
+ HealthGrpc.HealthBlockingStub healthStub = HealthGrpc.newBlockingStub(getChannel())
+ .withDeadlineAfter(period, TimeUnit.SECONDS);
+ HealthCheckRequest request = HealthCheckRequest.newBuilder().setService("").build();
+ try {
+ HealthCheckResponse response = healthStub.check(request);
+ handleStateChange(response);
+ } catch (StatusRuntimeException s) {
+ if (s.getStatus().getCode() == Status.Code.UNIMPLEMENTED) {
+ // The remote server does not expose the Health service, so it cannot be probed; regard it as healthy.
+ log.warn("Health check is not implemented on the remote gRPC server, regard as healthy. Host: {}, Port: {}", getHost(), getPort());
+ healthChecker.health();
+ } else {
+ // Any other status (including an exceeded deadline) marks the client as unhealthy.
+ log.warn("Health check failed for gRPC channel. Host: {}, Port: {}", getHost(), getPort(), s);
+ healthChecker.unHealth(s);
+ }
+ } catch (Throwable t) {
+ log.warn("Health check failed for gRPC channel. Host: {}, Port: {}", getHost(), getPort(), t);
+ healthChecker.unHealth(t);
+ }
+ } else {
+ healthChecker.unHealth("gRPC channel is not available or shutting down. Host: " + getHost() + ", Port: " + getPort());
+ }
+ };
+
public GRPCClient(String host, int port) {
this.host = host;
this.port = port;
@@ -81,7 +111,7 @@
try {
channel.shutdownNow();
} catch (Throwable t) {
- LOGGER.error(t.getMessage(), t);
+ log.error(t.getMessage(), t);
} finally {
if (healthCheckExecutor != null) {
healthCheckExecutor.shutdownNow();
@@ -114,32 +144,51 @@
this.enableHealthCheck = true;
}
+ /**
+ * Override the default health check runnable with a custom one.
+ * Must be called before connect(). Arguments are validated before any state is changed.
+ *
+ * @param healthCheckRunnable The custom health check runnable.
+ * @param initialDelay Initial delay before the first health check, in seconds. Must be non-negative.
+ * @param period Period between subsequent health checks, in seconds. Must be positive (scheduleAtFixedRate rejects period <= 0).
+ * @throws IllegalArgumentException if initialDelay is negative or period is not positive.
+ */
+ public void overrideCheckerRunnable(final Runnable healthCheckRunnable, final long initialDelay, final long period) {
+ if (initialDelay < 0) {
+ throw new IllegalArgumentException("initialDelay must be non-negative. Provided value: " + initialDelay);
+ }
+ if (period <= 0) {
+ throw new IllegalArgumentException("period must be positive. Provided value: " + period);
+ }
+ this.healthCheckRunnable = healthCheckRunnable;
+ this.initialDelay = initialDelay;
+ this.period = period;
+ }
+
private void checkHealth() {
if (healthCheckExecutor == null) {
healthCheckExecutor = Executors.newSingleThreadScheduledExecutor();
- healthCheckExecutor.scheduleAtFixedRate(
- () -> {
- ConnectivityState currentState = channel.getState(true); // true means try to connect
- handleStateChange(currentState);
- }, 5, 10, TimeUnit.SECONDS
+ healthCheckExecutor.scheduleAtFixedRate(healthCheckRunnable, initialDelay, period, TimeUnit.SECONDS
);
}
}
- private void handleStateChange(ConnectivityState newState) {
- switch (newState) {
- case READY:
- case IDLE:
+ private void handleStateChange(HealthCheckResponse response) {
+ switch (response.getStatus()) {
+ case SERVING:
this.healthChecker.health();
break;
- case CONNECTING:
- this.healthChecker.unHealth("gRPC connecting, waiting for ready. Host: " + host + ", Port: " + port);
+ case NOT_SERVING:
+ this.healthChecker.unHealth("Remote gRPC Server NOT_SERVING. Host: " + host + ", Port: " + port);
break;
- case TRANSIENT_FAILURE:
- this.healthChecker.unHealth("gRPC connection failed, will retry. Host: " + host + ", Port: " + port);
+ case SERVICE_UNKNOWN:
+ this.healthChecker.unHealth("Remote gRPC Server SERVICE_UNKNOWN. Host: " + host + ", Port: " + port);
break;
- case SHUTDOWN:
- this.healthChecker.unHealth("gRPC channel is shutting down. Host: " + host + ", Port: " + port);
+ case UNKNOWN:
+ this.healthChecker.unHealth("Remote gRPC Server UNKNOWN. Host: " + host + ", Port: " + port);
+ break;
+ case UNRECOGNIZED:
+ this.healthChecker.unHealth("Remote gRPC Server UNRECOGNIZED. Host: " + host + ", Port: " + port);
break;
}
}
diff --git a/oap-server/server-query-plugin/query-graphql-plugin/src/main/resources/query-protocol b/oap-server/server-query-plugin/query-graphql-plugin/src/main/resources/query-protocol
index 021c0ad..e6d2b99 160000
--- a/oap-server/server-query-plugin/query-graphql-plugin/src/main/resources/query-protocol
+++ b/oap-server/server-query-plugin/query-graphql-plugin/src/main/resources/query-protocol
@@ -1 +1 @@
-Subproject commit 021c0ad768f8f6f64dceead9d79a3dd7e9ad8dd9
+Subproject commit e6d2b99597a6489ab58f8a2ff32f212c8ef9b388
diff --git a/oap-server/server-starter/pom.xml b/oap-server/server-starter/pom.xml
index 2b06383..d197157 100644
--- a/oap-server/server-starter/pom.xml
+++ b/oap-server/server-starter/pom.xml
@@ -317,6 +317,7 @@
<exclude>hierarchy-definition.yml</exclude>
<exclude>bydb.dependencies.properties</exclude>
<exclude>bydb.yml</exclude>
+ <exclude>bydb-topn.yml</exclude>
<exclude>oal/</exclude>
<exclude>fetcher-prom-rules/</exclude>
<exclude>envoy-metrics-rules/</exclude>
diff --git a/pom.xml b/pom.xml
index 07648f1..5cffda4 100755
--- a/pom.xml
+++ b/pom.xml
@@ -165,7 +165,7 @@
<byte-buddy.version>1.14.9</byte-buddy.version>
<!-- core lib dependency -->
- <grpc.version>1.68.1</grpc.version>
+ <grpc.version>1.70.0</grpc.version>
<netty.version>4.1.118.Final</netty.version>
<netty-tcnative-boringssl-static.version>2.0.69.Final</netty-tcnative-boringssl-static.version>
<gson.version>2.9.0</gson.version>