feat: Add default alerting rule for build error SLO
diff --git a/deploy/operator-prometheus-rule.yaml b/deploy/operator-prometheus-rule.yaml
index ab4e837..83f056a 100644
--- a/deploy/operator-prometheus-rule.yaml
+++ b/deploy/operator-prometheus-rule.yaml
@@ -70,7 +70,7 @@
for {{ $labels.job }} have their duration above 5m.
- alert: CamelKBuildFailure
expr: |
- sum(rate(camel_k_build_duration_seconds_count{result="Error"}[5m])) by (job)
+ sum(rate(camel_k_build_duration_seconds_count{result="Failed"}[5m])) by (job)
/
sum(rate(camel_k_build_duration_seconds_count[5m])) by (job)
* 100
@@ -81,6 +81,19 @@
annotations:
message: |
{{ printf "%0.0f" $value }}% of the builds for {{ $labels.job }} have failed.
+ - alert: CamelKBuildError  # builds with result="Error" as a percentage of all builds, per job
+ expr: |
+ sum(rate(camel_k_build_duration_seconds_count{result="Error"}[5m])) by (job)
+ /
+ sum(rate(camel_k_build_duration_seconds_count[5m])) by (job)
+ * 100
+ > 1
+ for: 10m  # the expression must keep exceeding the 1 (percent) threshold for 10m before the alert fires
+ labels:
+ severity: critical
+ annotations:
+ message: |
+ {{ printf "%0.0f" $value }}% of the builds for {{ $labels.job }} have errored.
- alert: CamelKBuildQueueDuration1m
expr: |
(