% blob: 9e4a74e49915db29c91a8cd547ea7797bc3e1605 [file] [log] [blame]
% Licensed under the Apache License, Version 2.0 (the "License"); you may not
% use this file except in compliance with the License. You may obtain a copy of
% the License at
%
% http://www.apache.org/licenses/LICENSE-2.0
%
% Unless required by applicable law or agreed to in writing, software
% distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
% WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
% License for the specific language governing permissions and limitations under
% the License.
-module(couch_prometheus_util).
-export([
couch_to_prom/3,
to_bin/1,
to_prom/4,
to_prom_summary/2
]).
-include("couch_prometheus.hrl").
% Convert one stat into a list of Prometheus exposition lines.
%
% Path is the stat key (a list of atoms, or integers for status codes), Info
% is that stat's property list (the clauses read its `value`, `desc` and
% `type` entries) and All is the complete list of {Path, Info} stats, which is
% consulted when a metric is the sum of two stats (e.g. success + failure).
%
% Clauses go from most to least specific. Labelled metric families (log
% level, request method, status code, rexi timeout stage) emit their
% # HELP / # TYPE header from one fixed member and bare samples from the rest.
% A `couchdb` path prefix is dropped, and anything left is rendered
% generically from its `type` entry. A stat whose type is not counter, gauge
% or histogram crashes with case_clause.
couch_to_prom([couch_log, level, alert], Info, _All) ->
    to_prom(couch_log_requests_total, counter, "number of logged messages", {
        [{level, alert}], val(Info)
    });
couch_to_prom([couch_log, level, Level], Info, _All) ->
    to_prom(couch_log_requests_total, {[{level, Level}], val(Info)});
couch_to_prom([couch_replicator, checkpoints, failure], Info, _All) ->
    to_prom(couch_replicator_checkpoints_failure_total, counter, desc(Info), val(Info));
couch_to_prom([couch_replicator, checkpoints, success], Info, All) ->
    Total = val(Info) + val([couch_replicator, checkpoints, failure], All),
    to_prom(couch_replicator_checkpoints_total, counter, "number of checkpoint saves", Total);
couch_to_prom([couch_replicator, responses, failure], Info, _All) ->
    to_prom(couch_replicator_responses_failure_total, counter, desc(Info), val(Info));
couch_to_prom([couch_replicator, responses, success], Info, All) ->
    Total = val(Info) + val([couch_replicator, responses, failure], All),
    to_prom(
        couch_replicator_responses_total,
        counter,
        "number of HTTP responses received by the replicator",
        Total
    );
couch_to_prom([couch_replicator, stream_responses, failure], Info, _All) ->
    to_prom(couch_replicator_stream_responses_failure_total, counter, desc(Info), val(Info));
couch_to_prom([couch_replicator, stream_responses, success], Info, All) ->
    Total = val(Info) + val([couch_replicator, stream_responses, failure], All),
    to_prom(
        couch_replicator_stream_responses_total,
        counter,
        "number of streaming HTTP responses received by the replicator",
        Total
    );
couch_to_prom([couchdb, auth_cache_hits], Info, All) ->
    Total = val(Info) + val([couchdb, auth_cache_misses], All),
    to_prom(auth_cache_requests_total, counter, "number of authentication cache requests", Total);
couch_to_prom([couchdb, auth_cache_misses], Info, _All) ->
    to_prom(auth_cache_misses_total, counter, desc(Info), val(Info));
% force a # TYPE and # HELP definition for httpd_request_methods
couch_to_prom([couchdb, httpd_request_methods, 'COPY'], Info, _All) ->
    to_prom(httpd_request_methods, counter, "number of HTTP requests by method", {
        [{method, 'COPY'}], val(Info)
    });
couch_to_prom([couchdb, httpd_request_methods, Method], Info, _All) ->
    to_prom(httpd_request_methods, {[{method, Method}], val(Info)});
% force a # TYPE and # HELP definition for httpd_status_codes
couch_to_prom([couchdb, httpd_status_codes, 200], Info, _All) ->
    to_prom(httpd_status_codes, counter, "number of HTTP responses by status code", {
        [{code, 200}], val(Info)
    });
couch_to_prom([couchdb, httpd_status_codes, Code], Info, _All) ->
    to_prom(httpd_status_codes, {[{code, Code}], val(Info)});
couch_to_prom([ddoc_cache, hit], Info, All) ->
    Total = val(Info) + val([ddoc_cache, miss], All),
    to_prom(ddoc_cache_requests_total, counter, "number of design doc cache requests", Total);
couch_to_prom([ddoc_cache, miss], Info, _All) ->
    to_prom(ddoc_cache_requests_failures_total, counter, desc(Info), val(Info));
couch_to_prom([ddoc_cache, recovery], Info, _All) ->
    to_prom(ddoc_cache_requests_recovery_total, counter, desc(Info), val(Info));
couch_to_prom([fabric, read_repairs, failure], Info, _All) ->
    to_prom(fabric_read_repairs_failures_total, counter, desc(Info), val(Info));
couch_to_prom([fabric, read_repairs, success], Info, All) ->
    Total = val(Info) + val([fabric, read_repairs, failure], All),
    to_prom(fabric_read_repairs_total, counter, "number of fabric read repairs", Total);
% force a # TYPE and # HELP definition for rexi_streams_timeout_total
couch_to_prom([rexi, streams, timeout, init_stream], Info, _All) ->
    to_prom(rexi_streams_timeout_total, counter, "number of rexi stream timeouts", {
        [{stage, init_stream}], val(Info)
    });
% The remaining stages must use the same [rexi, streams, timeout, _] path
% shape as the header clause above so they join the same labelled family.
couch_to_prom([rexi, streams, timeout, Stage], Info, _All) ->
    to_prom(rexi_streams_timeout_total, {[{stage, Stage}], val(Info)});
% Drop the leading `couchdb` segment and retry with the shorter path.
couch_to_prom([couchdb | Rest], Info, All) ->
    couch_to_prom(Rest, Info, All);
% Generic fallback: derive the metric name from the path and render by type.
couch_to_prom(Path, Info, _All) ->
    case lists:keyfind(type, 1, Info) of
        {type, counter} ->
            Metric = counter_metric(Path),
            to_prom(Metric, counter, desc(Info), val(Info));
        {type, gauge} ->
            to_prom(path_to_name(Path), gauge, desc(Info), val(Info));
        {type, histogram} ->
            to_prom_summary(Path, Info)
    end.
% Build the two-element header for a metric: a "# HELP" binary followed by a
% "# TYPE" binary, both using the couchdb_-prefixed metric name.
%
% NOTE(review): the HELP binary starts with "\n" and ends with "\r" (not
% "\n"); presumably the caller concatenates both binaries onto one output
% entry - confirm the "\r" separator is what the scraper expects.
type_def(Metric, Type, Desc) ->
    PromName = to_prom_name(Metric),
    HelpLine = io_lib:format("\n# HELP ~s ~s\r", [PromName, Desc]),
    TypeLine = io_lib:format("# TYPE ~s ~s", [PromName, Type]),
    [to_bin(HelpLine), to_bin(TypeLine)].
% Render a metric together with its header.
%
% Returns a list whose first element is the [Help, Type] pair produced by
% type_def/3 (kept nested as a single entry), followed by the sample lines
% that to_prom/2 produces for Data.
to_prom(Metric, Type, Desc, Data) ->
    Header = type_def(Metric, Type, Desc),
    [Header | to_prom(Metric, Data)].
% Render the sample line(s) of a metric as a list of binaries.
%
% Data may be:
%   * a list of instances - each one is rendered in turn and the results are
%     concatenated;
%   * {Labels, Value} - Labels is a list of {Key, Value} pairs rendered as
%     name{k="v", ...}; an empty label list yields the bare name;
%   * any other term - rendered as "name Value".
% Values are printed with ~p.
to_prom(Metric, Instances) when is_list(Instances) ->
    lists:append([to_prom(Metric, Instance) || Instance <- Instances]);
to_prom(Metric, {Labels, Value}) ->
    Pairs = lists:map(fun label_pair/1, Labels),
    Series = series_name(to_prom_name(Metric), Pairs),
    [to_bin(io_lib:format("~s ~p", [Series, Value]))];
to_prom(Metric, Value) ->
    Line = io_lib:format("~s ~p", [to_prom_name(Metric), Value]),
    [to_bin(Line)].

% Format a single label as key="value" chardata.
label_pair({Key, Val}) ->
    io_lib:format("~s=\"~s\"", [to_bin(Key), to_bin(Val)]).

% Append the {...} label section to the metric name when labels are present.
series_name(Name, []) ->
    Name;
series_name(Name, Pairs) ->
    [Name, "{", lists:join(", ", Pairs), "}"].
% Render a histogram stat as a Prometheus summary.
%
% Path is the stat key; the metric is named <path>_seconds. Info must hold a
% `desc` entry and a `value` property list with `arithmetic_mean`,
% `percentile` (a list of {Percentile, Value}) and `n` (sample count).
%
% Returns the header and one quantile sample per percentile, followed by the
% <path>_seconds_sum and <path>_seconds_count samples (each kept as the
% one-element list that to_prom/2 returns). Percentiles other than
% 50/75/90/95/99/999 crash with function_clause.
to_prom_summary(Path, Info) ->
    Metric = path_to_name(Path ++ ["seconds"]),
    {value, Value} = lists:keyfind(value, 1, Info),
    {arithmetic_mean, Mean} = lists:keyfind(arithmetic_mean, 1, Value),
    {percentile, Percentiles} = lists:keyfind(percentile, 1, Value),
    {n, Count} = lists:keyfind(n, 1, Value),
    Quantiles = lists:map(
        fun({Perc, Millis}) ->
            % Prometheus uses seconds, so we need to convert milliseconds to seconds
            Seconds = Millis / 1000,
            {[{quantile, quantile_label(Perc)}], Seconds}
        end,
        Percentiles
    ),
    SumMetric = path_to_name(Path ++ ["seconds", "sum"]),
    % The mean is in the same unit as the percentiles, so the reconstructed
    % sum needs the same milliseconds -> seconds conversion to match its name.
    SumStat = to_prom(SumMetric, Count * Mean / 1000),
    CountMetric = path_to_name(Path ++ ["seconds", "count"]),
    CountStat = to_prom(CountMetric, Count),
    to_prom(Metric, summary, desc(Info), Quantiles) ++ [SumStat, CountStat].

% Map a percentile key to the value of its `quantile` label.
quantile_label(50) -> <<"0.5">>;
quantile_label(75) -> <<"0.75">>;
quantile_label(90) -> <<"0.9">>;
quantile_label(95) -> <<"0.95">>;
quantile_label(99) -> <<"0.99">>;
quantile_label(999) -> <<"0.999">>.
% Return the exported metric name as a binary: Metric with the "couchdb_"
% prefix. Metric is anything ~s can print (atom, string, binary, iolist).
to_prom_name(Metric) ->
    Prefixed = io_lib:format("couchdb_~s", [Metric]),
    to_bin(Prefixed).
% Join the segments of a stat path with underscores.
%
% Each segment is printed with ~s, so atoms, strings and binaries are all
% accepted. The result is chardata built from io_lib:format/2 output and may
% be a nested list; an empty path yields [].
path_to_name(Path) ->
    SegmentToChars = fun(Segment) -> io_lib:format("~s", [Segment]) end,
    string:join(lists:map(SegmentToChars, Path), "_").
% Return the counter metric name for a stat path as a binary, making sure it
% ends in "_total" exactly once.
%
% path_to_name/1 returns chardata that may be nested (it is assembled from
% io_lib:format/2 output), and lists:suffix/2 only compares top-level
% elements. The name is therefore flattened first; otherwise an existing
% "_total" suffix goes undetected and gets appended a second time.
counter_metric(Path) ->
    Name = lists:flatten(path_to_name(Path)),
    case lists:suffix("_total", Name) of
        true -> to_bin(Name);
        false -> to_bin(Name ++ "_total")
    end.
% Convert a term to a binary.
%
% Binaries are returned unchanged, atoms are encoded as UTF-8, integers are
% rendered in decimal and lists are treated as iolists. Any other term fails
% with function_clause.
to_bin(Bin) when is_binary(Bin) ->
    Bin;
to_bin(Atom) when is_atom(Atom) ->
    atom_to_binary(Atom, utf8);
to_bin(Int) when is_integer(Int) ->
    integer_to_binary(Int);
to_bin(IoList) when is_list(IoList) ->
    iolist_to_binary(IoList).
% Return the term stored under `value` in the property list Info.
% Crashes with badmatch when there is no {value, _} pair.
val(Info) ->
    {value, Value} = lists:keyfind(value, 1, Info),
    Value.
% Look up the stat stored under Key in Stats (a list of {Key, Info} pairs)
% and return its `value` entry. Crashes with badmatch when Key is absent.
val(Key, Stats) ->
    {Key, Info} = lists:keyfind(Key, 1, Stats),
    val(Info).
% Return the term stored under `desc` in the property list Info.
% Crashes with badmatch when there is no {desc, _} pair.
desc(Info) ->
    {desc, Description} = lists:keyfind(desc, 1, Info),
    Description.
-ifdef(TEST).
-include_lib("couch/include/couch_eunit.hrl").
% A counter renders as "name value" without labels and as
% name{label="value"} value with them.
to_prom_counter_test() ->
    Unlabelled = test_to_prom_output(ddoc_cache, counter, "size of ddoc cache", 10),
    Labelled = test_to_prom_output(
        httpd_status_codes,
        counter,
        "HTTP request status by code",
        {[{code, 200}], 3}
    ),
    [
        ?assertEqual(<<"couchdb_ddoc_cache 10">>, Unlabelled),
        ?assertEqual(<<"couchdb_httpd_status_codes{code=\"200\"} 3">>, Labelled)
    ].
% A gauge renders its sample line as "couchdb_<name> <value>".
to_prom_gauge_test() ->
    Sample = test_to_prom_output(temperature_celsius, gauge, "temp", 36),
    ?assertEqual(<<"couchdb_temperature_celsius 36">>, Sample).
% The 75th percentile of a histogram (4500) is emitted as the 0.75 quantile
% of the <name>_seconds summary, divided by 1000 (4.5).
to_prom_summary_test() ->
    Percentiles = [
        {50, 0.0},
        {75, 4500},
        {90, 0.0},
        {95, 0.0},
        {99, 0.0},
        {999, 0.0}
    ],
    Stats = [
        {min, 0.0},
        {max, 0.0},
        {arithmetic_mean, 0.0},
        {geometric_mean, 0.0},
        {harmonic_mean, 0.0},
        {median, 0.0},
        {variance, 0.0},
        {standard_deviation, 0.0},
        {skewness, 0.0},
        {kurtosis, 0.0},
        {percentile, Percentiles},
        {histogram, [{0, 0}]},
        {n, 0}
    ],
    Info = [
        {value, Stats},
        {type, histogram},
        {desc, <<"length of time processing a mango query">>}
    ],
    ?assertEqual(
        <<"couchdb_mango_query_time_seconds{quantile=\"0.75\"} 4.5">>,
        test_to_prom_summary_output([mango_query_time], Info)
    ).
% Counter names end in "_total" exactly once, whether or not the path already
% ends with a `total` segment.
counter_metric_test_() ->
    Expected = <<"document_purges_total">>,
    WithSuffix = [document_purges, total],
    WithoutSuffix = [document_purges],
    [
        ?_assertEqual(Expected, counter_metric(WithSuffix)),
        ?_assertEqual(Expected, counter_metric(WithoutSuffix))
    ].
% Return the second element of the to_prom/4 output, i.e. the first sample
% line after the header entry.
test_to_prom_output(Metric, Type, Desc, Val) ->
    lists:nth(2, to_prom(Metric, Type, Desc, Val)).
% Return the third element of the to_prom_summary/2 output: the header entry
% comes first, then one sample per percentile, so this is the second quantile.
test_to_prom_summary_output(Metric, Info) ->
    lists:nth(3, to_prom_summary(Metric, Info)).
-endif.