Use smarter probes for SolrCloud and SolrPrometheusExporter (#511)
SolrCloud - 60 seconds to start, 10-20 seconds of downtime to become not-ready, 40-60 seconds of downtime to become not-live.
SolrPrometheusExporter - 22 seconds to start, 10-15 seconds of downtime to become not-ready, 20-30 seconds of downtime to become not-live.
diff --git a/controllers/util/prometheus_exporter_util.go b/controllers/util/prometheus_exporter_util.go
index dc77ed6..633b436 100644
--- a/controllers/util/prometheus_exporter_util.go
+++ b/controllers/util/prometheus_exporter_util.go
@@ -180,6 +180,14 @@
containerImage = solrCloudImage
}
+ defaultProbeHandler := corev1.ProbeHandler{
+ HTTPGet: &corev1.HTTPGetAction{
+ Scheme: corev1.URISchemeHTTP,
+ Path: "/metrics",
+ Port: intstr.FromInt(SolrMetricsPort),
+ },
+ }
+
containers := []corev1.Container{
{
Name: "solr-prometheus-exporter",
@@ -191,19 +199,27 @@
Args: exporterArgs,
Env: envVars,
- LivenessProbe: &corev1.Probe{
- ProbeHandler: corev1.ProbeHandler{
- HTTPGet: &corev1.HTTPGetAction{
- Scheme: corev1.URISchemeHTTP,
- Path: "/metrics",
- Port: intstr.FromInt(SolrMetricsPort),
- },
- },
- InitialDelaySeconds: 20,
- TimeoutSeconds: 1,
- PeriodSeconds: 10,
+ StartupProbe: &corev1.Probe{
+ ProbeHandler: defaultProbeHandler,
+ InitialDelaySeconds: 2,
+ TimeoutSeconds: 2,
+ PeriodSeconds: 2,
SuccessThreshold: 1,
- FailureThreshold: 3,
+ FailureThreshold: 10,
+ },
+ LivenessProbe: &corev1.Probe{
+ ProbeHandler: defaultProbeHandler,
+ TimeoutSeconds: 2,
+ PeriodSeconds: 10,
+ SuccessThreshold: 1,
+ FailureThreshold: 3,
+ },
+ ReadinessProbe: &corev1.Probe{
+ ProbeHandler: defaultProbeHandler,
+ TimeoutSeconds: 2,
+ PeriodSeconds: 5,
+ SuccessThreshold: 1,
+ FailureThreshold: 3,
},
},
}
diff --git a/controllers/util/solr_util.go b/controllers/util/solr_util.go
index 690ba9c..4ae2ba5 100644
--- a/controllers/util/solr_util.go
+++ b/controllers/util/solr_util.go
@@ -406,22 +406,31 @@
Protocol: "TCP",
},
},
- LivenessProbe: &corev1.Probe{
- InitialDelaySeconds: 20,
+ // Wait up to 60 seconds for Solr to start up
+ StartupProbe: &corev1.Probe{
+ InitialDelaySeconds: 10,
TimeoutSeconds: defaultProbeTimeout,
SuccessThreshold: 1,
- FailureThreshold: 3,
- PeriodSeconds: 10,
- ProbeHandler: defaultHandler,
- },
- ReadinessProbe: &corev1.Probe{
- InitialDelaySeconds: 15,
- TimeoutSeconds: defaultProbeTimeout,
- SuccessThreshold: 1,
- FailureThreshold: 3,
+ FailureThreshold: 10,
PeriodSeconds: 5,
ProbeHandler: defaultHandler,
},
+ // Kill Solr if it is unavailable for any 60-second period
+ LivenessProbe: &corev1.Probe{
+ TimeoutSeconds: defaultProbeTimeout,
+ SuccessThreshold: 1,
+ FailureThreshold: 3,
+ PeriodSeconds: 20,
+ ProbeHandler: defaultHandler,
+ },
+ // Do not route requests to Solr if it is unavailable for any 20-second period
+ ReadinessProbe: &corev1.Probe{
+ TimeoutSeconds: defaultProbeTimeout,
+ SuccessThreshold: 1,
+ FailureThreshold: 2,
+ PeriodSeconds: 10,
+ ProbeHandler: defaultHandler,
+ },
VolumeMounts: volumeMounts,
Env: envVars,
Lifecycle: &corev1.Lifecycle{
diff --git a/helm/solr-operator/Chart.yaml b/helm/solr-operator/Chart.yaml
index c091e59..d9bfd9e 100644
--- a/helm/solr-operator/Chart.yaml
+++ b/helm/solr-operator/Chart.yaml
@@ -108,6 +108,13 @@
url: https://github.com/apache/solr-operator/pull/516
- name: GitHub Issue
url: https://github.com/apache/solr-operator/issues/515
+ - kind: changed
+ description: Use better default startup, liveness and readiness probes for SolrCloud and SolrPrometheusExporter
+ links:
+ - name: GitHub PR
+ url: https://github.com/apache/solr-operator/pull/511
+ - name: GitHub Issue
+ url: https://github.com/apache/solr-operator/issues/510
artifacthub.io/images: |
- name: solr-operator
image: apache/solr-operator:v0.7.0-prerelease