SCB-1092 Add more detailed metrics: subscriber gauge and status code table (#536)
diff --git a/integration/health-metrics-grafana.json b/integration/health-metrics-grafana.json
index e7acfb7..f730b2c 100644
--- a/integration/health-metrics-grafana.json
+++ b/integration/health-metrics-grafana.json
@@ -57,6 +57,12 @@
},
{
"type": "panel",
+ "id": "table",
+ "name": "Table",
+ "version": "5.0.0"
+ },
+ {
+ "type": "panel",
"id": "text",
"name": "Text",
"version": "5.0.0"
@@ -299,7 +305,7 @@
"format": "time_series",
"instant": false,
"intervalFactor": 2,
- "legendFormat": "instances",
+ "legendFormat": "microservices",
"refId": "A"
}
],
@@ -1334,178 +1340,6 @@
{
"cacheTimeout": null,
"colorBackground": false,
- "colorValue": true,
- "colors": [
- "#d44a3a",
- "rgba(237, 129, 40, 0.89)",
- "#299c46"
- ],
- "datasource": "${DS_LOCAL}",
- "format": "percentunit",
- "gauge": {
- "maxValue": 1,
- "minValue": 0,
- "show": true,
- "thresholdLabels": false,
- "thresholdMarkers": true
- },
- "gridPos": {
- "h": 6,
- "w": 4,
- "x": 0,
- "y": 19
- },
- "id": 12,
- "interval": null,
- "links": [],
- "mappingType": 1,
- "mappingTypes": [
- {
- "name": "value to text",
- "value": 1
- },
- {
- "name": "range to text",
- "value": 2
- }
- ],
- "maxDataPoints": 100,
- "minSpan": 4,
- "nullPointMode": "connected",
- "nullText": null,
- "postfix": "",
- "postfixFontSize": "50%",
- "prefix": "",
- "prefixFontSize": "50%",
- "rangeMaps": [
- {
- "from": "null",
- "text": "N/A",
- "to": "null"
- }
- ],
- "sparkline": {
- "fillColor": "rgba(31, 118, 189, 0.18)",
- "full": true,
- "lineColor": "rgb(31, 120, 193)",
- "show": true
- },
- "tableColumn": "",
- "targets": [
- {
- "expr": "sum(service_center_http_success_total{job=\"service-center\"})/sum(service_center_http_request_total{job=\"service-center\"})",
- "format": "time_series",
- "intervalFactor": 2,
- "refId": "A"
- }
- ],
- "thresholds": ".5,.8",
- "title": "Global Success Rate",
- "transparent": false,
- "type": "singlestat",
- "valueFontSize": "80%",
- "valueMaps": [
- {
- "op": "=",
- "text": "N/A",
- "value": "null"
- }
- ],
- "valueName": "current"
- },
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "${DS_LOCAL}",
- "fill": 1,
- "gridPos": {
- "h": 6,
- "w": 8,
- "x": 4,
- "y": 19
- },
- "height": "",
- "id": 28,
- "legend": {
- "alignAsTable": true,
- "avg": true,
- "current": true,
- "max": true,
- "min": true,
- "show": true,
- "sort": null,
- "sortDesc": null,
- "total": false,
- "values": true
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "minSpan": 4,
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "sum(service_center_http_success_total{job=\"service-center\"})/sum(service_center_http_request_total{job=\"service-center\"})",
- "format": "time_series",
- "instant": false,
- "intervalFactor": 1,
- "legendFormat": "rate",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeShift": null,
- "title": "Success Rate",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "percentunit",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ],
- "yaxis": {
- "align": false,
- "alignLevel": null
- }
- },
- {
- "cacheTimeout": null,
- "colorBackground": false,
"colorValue": false,
"colors": [
"#299c46",
@@ -1524,7 +1358,7 @@
"gridPos": {
"h": 6,
"w": 4,
- "x": 12,
+ "x": 0,
"y": 19
},
"id": 10,
@@ -1607,7 +1441,7 @@
"gridPos": {
"h": 6,
"w": 4,
- "x": 16,
+ "x": 4,
"y": 19
},
"id": 11,
@@ -1668,6 +1502,232 @@
"valueName": "avg"
},
{
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": true,
+ "colors": [
+ "#d44a3a",
+ "rgba(237, 129, 40, 0.89)",
+ "#299c46"
+ ],
+ "datasource": "${DS_LOCAL}",
+ "format": "percentunit",
+ "gauge": {
+ "maxValue": 1,
+ "minValue": 0,
+ "show": true,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 4,
+ "x": 8,
+ "y": 19
+ },
+ "id": 12,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "minSpan": 4,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": true,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": true
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "sum(service_center_http_success_total{job=\"service-center\"})/sum(service_center_http_request_total{job=\"service-center\"})",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "refId": "A"
+ }
+ ],
+ "thresholds": ".5,.8",
+ "title": "Global Success Rate",
+ "transparent": false,
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_LOCAL}",
+ "fill": 1,
+ "gridPos": {
+ "h": 6,
+ "w": 5,
+ "x": 12,
+ "y": 19
+ },
+ "height": "",
+ "id": 28,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": false,
+ "max": true,
+ "min": true,
+ "show": true,
+ "sort": null,
+ "sortDesc": null,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "minSpan": 4,
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(service_center_http_success_total{job=\"service-center\"})/sum(service_center_http_request_total{job=\"service-center\"})",
+ "format": "time_series",
+ "instant": false,
+ "intervalFactor": 1,
+ "legendFormat": "rate",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Success Rate",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "percentunit",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "columns": [],
+ "datasource": "${DS_LOCAL}",
+ "fontSize": "100%",
+ "gridPos": {
+ "h": 6,
+ "w": 7,
+ "x": 17,
+ "y": 19
+ },
+ "id": 46,
+ "links": [],
+ "pageSize": null,
+ "scroll": true,
+ "showHeader": true,
+ "sort": {
+ "col": 3,
+ "desc": true
+ },
+ "styles": [
+ {
+ "alias": "Time",
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "pattern": "Time",
+ "type": "date"
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "decimals": 2,
+ "pattern": "/.*/",
+ "thresholds": [],
+ "type": "number",
+ "unit": "short"
+ }
+ ],
+ "targets": [
+ {
+ "expr": "sum(service_center_http_request_total{job=\"service-center\"}) by (api,code)",
+ "format": "table",
+ "instant": true,
+ "intervalFactor": 1,
+ "refId": "A"
+ }
+ ],
+ "title": "Status Code",
+ "transform": "table",
+ "type": "table"
+ },
+ {
"content": "<div class=\"text-center dashboard-header\">\n <span>INSTANCE METRICS</span>\n</div>\n",
"gridPos": {
"h": 3,
@@ -2642,6 +2702,100 @@
}
},
{
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_LOCAL}",
+ "fill": 1,
+ "gridPos": {
+ "h": 6,
+ "w": 6,
+ "x": 12,
+ "y": 40
+ },
+ "height": "",
+ "id": 43,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "hideEmpty": true,
+ "hideZero": true,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": false,
+ "sort": "avg",
+ "sortDesc": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "minSpan": 4,
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "service_center_notify_subscriber_total{job=\"service-center\"}",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}> {{domain}} {{scheme}}",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Subscribers",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "transparent": false,
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": 0,
+ "format": "none",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
"content": "<div class=\"text-center dashboard-header\">\n <span>RESOURCES</span>\n</div>\n",
"gridPos": {
"h": 3,
@@ -3068,7 +3222,7 @@
},
"yaxes": [
{
- "format": "none",
+ "format": "short",
"label": null,
"logBase": 1,
"max": null,
@@ -3311,5 +3465,5 @@
"timezone": "",
"title": "ServiceCenter",
"uid": "Zg6NoHGiz",
- "version": 9
+ "version": 12
}
\ No newline at end of file
diff --git a/server/notify/common.go b/server/notify/common.go
index a234398..d0db027 100644
--- a/server/notify/common.go
+++ b/server/notify/common.go
@@ -24,4 +24,6 @@
SendTimeout = 5 * time.Second
InstanceEventQueueSize = 5000
ReadMaxBody = 64
+ Websocket = "Websocket"
+ GRPC = "gRPC"
)
diff --git a/server/notify/metrics.go b/server/notify/metrics.go
index 94fa7d9..39d88c3 100644
--- a/server/notify/metrics.go
+++ b/server/notify/metrics.go
@@ -44,10 +44,18 @@
Help: "Latency of publishing instance events",
Objectives: prometheus.DefObjectives,
}, []string{"instance", "source", "status"})
+
+ subscriberGauge = prometheus.NewGaugeVec(
+ prometheus.GaugeOpts{
+ Namespace: metric.FamilyName,
+ Subsystem: "notify",
+ Name: "subscriber_total",
+ Help: "Gauge of subscribers",
+ }, []string{"instance", "domain", "scheme"})
)
func init() {
- prometheus.MustRegister(notifyCounter, notifyLatency)
+ prometheus.MustRegister(notifyCounter, notifyLatency, subscriberGauge)
}
func ReportPublishCompleted(evt notify.Event, err error) {
@@ -60,3 +68,11 @@
notifyLatency.WithLabelValues(instance, evt.Type().String(), status).Observe(elapsed)
notifyCounter.WithLabelValues(instance, evt.Type().String(), status).Inc()
}
+
+// ReportSubscriber adds n to the subscriber gauge series labelled with this
+// instance's name (metric.InstanceName), the given domain and the given
+// scheme. n is a delta, not an absolute value: pass a positive number when
+// subscribers are added and a negative number when they go away.
+func ReportSubscriber(domain, scheme string, n float64) {
+ subscriberGauge.WithLabelValues(metric.InstanceName(), domain, scheme).Add(n)
+}
diff --git a/server/notify/stream.go b/server/notify/stream.go
index 3fe7371..8d2f1b4 100644
--- a/server/notify/stream.go
+++ b/server/notify/stream.go
@@ -64,12 +64,22 @@
}
}
+// DoStreamListAndWatch registers an instance event list watcher for serviceId
+// with the notify center and serves it over the gRPC stream via HandleWatchJob.
+// It returns the AddSubscriber error if registration fails, otherwise the
+// result of HandleWatchJob. The gRPC subscriber gauge for the domain parsed
+// from ctx is incremented while the watch job runs.
func DoStreamListAndWatch(ctx context.Context, serviceId string, f func() ([]*pb.WatchInstanceResponse, int64), stream pb.ServiceInstanceCtrl_WatchServer) error {
domainProject := util.ParseDomainProject(ctx)
+ domain := util.ParseDomain(ctx)
watcher := NewInstanceEventListWatcher(serviceId, apt.GetInstanceRootKey(domainProject)+"/", f)
err := NotifyCenter().AddSubscriber(watcher)
if err != nil {
return err
}
+ // Count the watcher only after it is registered, and defer the decrement so
+ // the gauge stays balanced even if HandleWatchJob panics.
+ ReportSubscriber(domain, GRPC, 1)
+ defer ReportSubscriber(domain, GRPC, -1)
return HandleWatchJob(watcher, stream)
}
diff --git a/server/notify/websocket.go b/server/notify/websocket.go
index 033ab12..6eaac0c 100644
--- a/server/notify/websocket.go
+++ b/server/notify/websocket.go
@@ -268,12 +268,22 @@
+// DoWebSocketListAndWatch wraps conn in a WebSocket together with a new
+// instance event list watcher for serviceId and hands it to process. The
+// Websocket subscriber gauge for the domain parsed from ctx is incremented
+// for the duration of the process call.
func DoWebSocketListAndWatch(ctx context.Context, serviceId string, f func() ([]*pb.WatchInstanceResponse, int64), conn *websocket.Conn) {
domainProject := util.ParseDomainProject(ctx)
+ domain := util.ParseDomain(ctx)
socket := &WebSocket{
ctx: ctx,
conn: conn,
watcher: NewInstanceEventListWatcher(serviceId, domainProject, f),
}
+
+ // Defer the decrement so the gauge stays balanced even if process panics
+ // instead of returning normally.
+ ReportSubscriber(domain, Websocket, 1)
+ defer ReportSubscriber(domain, Websocket, -1)
process(socket)
}
func process(socket *WebSocket) {