% src/couch_prometheus/src/couch_prometheus_util.erl - couchdb - Git at Google

 % Licensed under the Apache License, Version 2.0 (the "License"); you may not
 % use this file except in compliance with the License. You may obtain a copy of
 % the License at
 %
 %   http://www.apache.org/licenses/LICENSE-2.0
 %
 % Unless required by applicable law or agreed to in writing, software
 % distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 % WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 % License for the specific language governing permissions and limitations under
 % the License.

 -module(couch_prometheus_util).

 -export([
     couch_to_prom/3,
     to_bin/1,
     to_prom/4,
     to_prom_summary/2
 ]).

 -include("couch_prometheus.hrl").

 % Convert a single couch_stats entry into Prometheus text-format lines.
 %
 % Path is the stat key (a list, mostly of atoms), Info is that stat's property
 % list (it must contain `value`, and `desc`/`type` for the clauses that read
 % them) and All is the full list of {Path, Info} entries, used by the clauses
 % that report the sum of two stats as a "total".
 %
 % Clauses calling to_prom/4 emit the "# HELP"/"# TYPE" header followed by the
 % sample; clauses calling to_prom/2 emit only a labelled sample for a metric
 % whose header is produced by a sibling clause. A leading `couchdb` path element
 % that no specific clause handles is dropped and the remainder re-dispatched.
 % Any other path is rendered from its `type`; types other than counter, gauge
 % and histogram crash with case_clause.
 couch_to_prom([couch_log, level, alert], Info, _All) ->
     to_prom(couch_log_requests_total, counter, "number of logged messages", {
         [{level, alert}], val(Info)
     });
 couch_to_prom([couch_log, level, Level], Info, _All) ->
     to_prom(couch_log_requests_total, {[{level, Level}], val(Info)});
 couch_to_prom([couch_replicator, checkpoints, failure], Info, _All) ->
     to_prom(couch_replicator_checkpoints_failure_total, counter, desc(Info), val(Info));
 couch_to_prom([couch_replicator, checkpoints, success], Info, All) ->
     Total = val(Info) + val([couch_replicator, checkpoints, failure], All),
     to_prom(couch_replicator_checkpoints_total, counter, "number of checkpoint saves", Total);
 couch_to_prom([couch_replicator, responses, failure], Info, _All) ->
     to_prom(couch_replicator_responses_failure_total, counter, desc(Info), val(Info));
 couch_to_prom([couch_replicator, responses, success], Info, All) ->
     Total = val(Info) + val([couch_replicator, responses, failure], All),
     to_prom(
         couch_replicator_responses_total,
         counter,
         "number of HTTP responses received by the replicator",
         Total
     );
 couch_to_prom([couch_replicator, stream_responses, failure], Info, _All) ->
     to_prom(couch_replicator_stream_responses_failure_total, counter, desc(Info), val(Info));
 couch_to_prom([couch_replicator, stream_responses, success], Info, All) ->
     Total = val(Info) + val([couch_replicator, stream_responses, failure], All),
     to_prom(
         couch_replicator_stream_responses_total,
         counter,
         "number of streaming HTTP responses received by the replicator",
         Total
     );
 couch_to_prom([couchdb, auth_cache_hits], Info, All) ->
     Total = val(Info) + val([couchdb, auth_cache_misses], All),
     to_prom(auth_cache_requests_total, counter, "number of authentication cache requests", Total);
 couch_to_prom([couchdb, auth_cache_misses], Info, _All) ->
     to_prom(auth_cache_misses_total, counter, desc(Info), val(Info));
 % force a # TYPE and # HELP definition for httpd_request_methods
 couch_to_prom([couchdb, httpd_request_methods, 'COPY'], Info, _All) ->
     to_prom(httpd_request_methods, counter, "number of HTTP requests by method", {
         [{method, 'COPY'}], val(Info)
     });
 couch_to_prom([couchdb, httpd_request_methods, Method], Info, _All) ->
     to_prom(httpd_request_methods, {[{method, Method}], val(Info)});
 % force a # TYPE and # HELP definition for httpd_status_codes
 couch_to_prom([couchdb, httpd_status_codes, 200], Info, _All) ->
     to_prom(httpd_status_codes, counter, "number of HTTP responses by status code", {
         [{code, 200}], val(Info)
     });
 couch_to_prom([couchdb, httpd_status_codes, Code], Info, _All) ->
     to_prom(httpd_status_codes, {[{code, Code}], val(Info)});
 couch_to_prom([ddoc_cache, hit], Info, All) ->
     Total = val(Info) + val([ddoc_cache, miss], All),
     to_prom(ddoc_cache_requests_total, counter, "number of design doc cache requests", Total);
 couch_to_prom([ddoc_cache, miss], Info, _All) ->
     to_prom(ddoc_cache_requests_failures_total, counter, desc(Info), val(Info));
 couch_to_prom([ddoc_cache, recovery], Info, _All) ->
     to_prom(ddoc_cache_requests_recovery_total, counter, desc(Info), val(Info));
 couch_to_prom([fabric, read_repairs, failure], Info, _All) ->
     to_prom(fabric_read_repairs_failures_total, counter, desc(Info), val(Info));
 couch_to_prom([fabric, read_repairs, success], Info, All) ->
     Total = val(Info) + val([fabric, read_repairs, failure], All),
     to_prom(fabric_read_repairs_total, counter, "number of fabric read repairs", Total);
 % force a # TYPE and # HELP definition for rexi_streams_timeout_total
 couch_to_prom([rexi, streams, timeout, init_stream], Info, _All) ->
     to_prom(rexi_streams_timeout_total, counter, "number of rexi stream timeouts", {
         [{stage, init_stream}], val(Info)
     });
 % Must use the same four-element path shape as the init_stream clause above;
 % a three-element [rexi_streams, timeout, Stage] pattern never matches it, so
 % the remaining stages would lose their `stage` label.
 couch_to_prom([rexi, streams, timeout, Stage], Info, _All) ->
     to_prom(rexi_streams_timeout_total, {[{stage, Stage}], val(Info)});
 couch_to_prom([couchdb | Rest], Info, All) ->
     couch_to_prom(Rest, Info, All);
 couch_to_prom(Path, Info, _All) ->
     case lists:keyfind(type, 1, Info) of
         {type, counter} ->
             Metric = counter_metric(Path),
             to_prom(Metric, counter, desc(Info), val(Info));
         {type, gauge} ->
             to_prom(path_to_name(Path), gauge, desc(Info), val(Info));
         {type, histogram} ->
             to_prom_summary(Path, Info)
     end.

 % Build the two metadata lines ("# HELP" and "# TYPE") for a metric.
 %
 % Metric is prefixed through to_prom_name/1; Type and Desc are formatted with ~s.
 % Returns a two-element list of binaries: [HelpLine, TypeLine].
 % NOTE(review): the HELP line starts with "\n" and ends with "\r"; presumably
 % the caller joins lines itself - confirm before touching these characters.
 type_def(Metric, Type, Desc) ->
     PromName = to_prom_name(Metric),
     HelpLine = to_bin(io_lib:format("\n# HELP ~s ~s\r", [PromName, Desc])),
     TypeLine = to_bin(io_lib:format("# TYPE ~s ~s", [PromName, Type])),
     [HelpLine, TypeLine].

 % Render a metric together with its HELP/TYPE header.
 %
 % Returns a list whose first element is the header from type_def/3 (itself a
 % list of two binaries) and whose remaining elements are the sample lines that
 % to_prom/2 produces for Data.
 to_prom(Metric, Type, Desc, Data) ->
     [type_def(Metric, Type, Desc) | to_prom(Metric, Data)].

 % Render sample lines (no header) for a metric.
 %
 % The second argument may be:
 %   * a list of instances - each one is rendered and the results concatenated;
 %   * {Labels, Value}     - Labels is a list of {Key, LabelValue} pairs rendered
 %                           as name{k="v", ...}; an empty list renders a bare name;
 %   * any other term      - rendered as "name Value".
 % Values are printed with ~p. Always returns a list of binaries.
 to_prom(Metric, Instances) when is_list(Instances) ->
     lists:append(lists:map(fun(Inst) -> to_prom(Metric, Inst) end, Instances));
 to_prom(Metric, {Labels, Value}) ->
     Rendered = lists:map(
         fun({Key, LabelValue}) ->
             lists:flatten(io_lib:format("~s=\"~s\"", [to_bin(Key), to_bin(LabelValue)]))
         end,
         Labels
     ),
     Name = to_prom_name(Metric),
     Series =
         case Rendered of
             [] -> Name;
             [_ | _] -> [Name, ${, string:join(Rendered, ", "), $}]
         end,
     [to_bin(io_lib:format("~s ~p", [Series, Value]))];
 to_prom(Metric, Value) ->
     Name = to_prom_name(Metric),
     [to_bin(io_lib:format("~s ~p", [Name, Value]))].

 % Render a couch_stats histogram as a Prometheus summary.
 %
 % Path is the stat key as a list; Info is its property list and must contain
 % `desc` plus a `value` proplist holding arithmetic_mean, percentile and n.
 %
 % Returns the HELP/TYPE header and one quantile sample per percentile for
 % couchdb_<path>_seconds, followed by the couchdb_<path>_seconds_sum and
 % couchdb_<path>_seconds_count samples (each a one-element list of binaries).
 % Percentiles other than 50/75/90/95/99/999 crash with case_clause.
 to_prom_summary(Path, Info) ->
     Metric = path_to_name(Path ++ ["seconds"]),
     {value, Value} = lists:keyfind(value, 1, Info),
     {arithmetic_mean, Mean} = lists:keyfind(arithmetic_mean, 1, Value),
     {percentile, Percentiles} = lists:keyfind(percentile, 1, Value),
     {n, Count} = lists:keyfind(n, 1, Value),
     Quantiles = lists:map(
         fun({Perc, Val0}) ->
             % Prometheus uses seconds, so we need to convert milliseconds to seconds
             Val = Val0 / 1000,
             case Perc of
                 50 -> {[{quantile, <<"0.5">>}], Val};
                 75 -> {[{quantile, <<"0.75">>}], Val};
                 90 -> {[{quantile, <<"0.9">>}], Val};
                 95 -> {[{quantile, <<"0.95">>}], Val};
                 99 -> {[{quantile, <<"0.99">>}], Val};
                 999 -> {[{quantile, <<"0.999">>}], Val}
             end
         end,
         Percentiles
     ),
     SumMetric = path_to_name(Path ++ ["seconds", "sum"]),
     % The mean comes from the same histogram as the percentiles, so it needs the
     % same milliseconds-to-seconds conversion to match the "_seconds_sum" name.
     SumStat = to_prom(SumMetric, Count * Mean / 1000),
     CountMetric = path_to_name(Path ++ ["seconds", "count"]),
     CountStat = to_prom(CountMetric, Count),
     to_prom(Metric, summary, desc(Info), Quantiles) ++ [SumStat, CountStat].

 % Prefix a metric name with "couchdb_" and return it as a binary.
 % Metric may be anything the ~s format directive accepts.
 to_prom_name(Metric) ->
     Prefixed = io_lib:format("couchdb_~s", [Metric]),
     to_bin(Prefixed).

 % Join the elements of a stat path with underscores.
 %
 % Each element is formatted with ~s, so the result is character data as
 % returned by io_lib:format/2 and string:join/2, not a binary.
 path_to_name(Path) ->
     Render = fun(Segment) -> io_lib:format("~s", [Segment]) end,
     string:join(lists:map(Render, Path), "_").

 % Build the metric name for a counter stat, guaranteeing a single "_total"
 % suffix. Returns a binary.
 %
 % The joined path is converted to a binary before the suffix check because
 % io_lib:format/2 may return nested character data, on which an element-wise
 % lists:suffix/2 comparison would miss an existing "_total" and append a
 % second one.
 counter_metric(Path) ->
     Name = to_bin(path_to_name(Path)),
     Suffix = <<"_total">>,
     case binary:longest_common_suffix([Name, Suffix]) =:= byte_size(Suffix) of
         true -> Name;
         false -> <<Name/binary, Suffix/binary>>
     end.

 % Convert a binary, atom, integer or iolist to a binary.
 % Binaries pass through untouched; atoms are encoded as UTF-8. Any other term
 % (for example a float) crashes with function_clause.
 to_bin(Bin) when is_binary(Bin) ->
     Bin;
 to_bin(Atom) when is_atom(Atom) ->
     atom_to_binary(Atom, utf8);
 to_bin(Int) when is_integer(Int) ->
     integer_to_binary(Int);
 to_bin(IoList) when is_list(IoList) ->
     iolist_to_binary(IoList).

 % Return the `value` entry of a stat property list.
 % Crashes with badmatch when the key is absent.
 val(Props) ->
     {value, Found} = lists:keyfind(value, 1, Props),
     Found.

 % Look up the stat stored under Key in the full stats list and return its
 % `value`. Crashes with badmatch when Key is not present in Stats.
 val(Key, Stats) ->
     {Key, Props} = lists:keyfind(Key, 1, Stats),
     val(Props).

 % Return the `desc` entry of a stat property list.
 % Crashes with badmatch when the key is absent.
 desc(Info) ->
     {desc, Text} = lists:keyfind(desc, 1, Info),
     Text.

 -ifdef(TEST).
 -include_lib("couch/include/couch_eunit.hrl").

 % Counter samples render as "name value", or "name{label} value" when labelled.
 to_prom_counter_test() ->
     LabelledSample = {[{code, 200}], 3},
     [
         ?assertEqual(
             <<"couchdb_ddoc_cache 10">>,
             test_to_prom_output(ddoc_cache, counter, "size of ddoc cache", 10)
         ),
         ?assertEqual(
             <<"couchdb_httpd_status_codes{code=\"200\"} 3">>,
             test_to_prom_output(
                 httpd_status_codes, counter, "HTTP request status by code", LabelledSample
             )
         )
     ].

 % A gauge sample renders as "name value".
 to_prom_gauge_test() ->
     Rendered = test_to_prom_output(temperature_celsius, gauge, "temp", 36),
     ?assertEqual(<<"couchdb_temperature_celsius 36">>, Rendered).

 % The 75th percentile of 4500 must be reported as 4.5 under quantile="0.75".
 to_prom_summary_test() ->
     Percentiles = [
         {50, 0.0},
         {75, 4500},
         {90, 0.0},
         {95, 0.0},
         {99, 0.0},
         {999, 0.0}
     ],
     Stats = [
         {min, 0.0},
         {max, 0.0},
         {arithmetic_mean, 0.0},
         {geometric_mean, 0.0},
         {harmonic_mean, 0.0},
         {median, 0.0},
         {variance, 0.0},
         {standard_deviation, 0.0},
         {skewness, 0.0},
         {kurtosis, 0.0},
         {percentile, Percentiles},
         {histogram, [{0, 0}]},
         {n, 0}
     ],
     Info = [
         {value, Stats},
         {type, histogram},
         {desc, <<"length of time processing a mango query">>}
     ],
     ?assertEqual(
         <<"couchdb_mango_query_time_seconds{quantile=\"0.75\"} 4.5">>,
         test_to_prom_summary_output([mango_query_time], Info)
     ).

 % "_total" is appended when missing and not duplicated when already present.
 counter_metric_test_() ->
     Expected = <<"document_purges_total">>,
     [
         ?_assertEqual(Expected, counter_metric(Path))
      || Path <- [[document_purges, total], [document_purges]]
     ].

 % Test helper: the second element of to_prom/4's result, i.e. the first sample
 % line after the HELP/TYPE header.
 test_to_prom_output(Metric, Type, Desc, Val) ->
     lists:nth(2, to_prom(Metric, Type, Desc, Val)).

 % Test helper: the third element of to_prom_summary/2's result, i.e. the second
 % quantile line after the HELP/TYPE header.
 test_to_prom_summary_output(Metric, Info) ->
     lists:nth(3, to_prom_summary(Metric, Info)).

 -endif.
	% Licensed under the Apache License, Version 2.0 (the "License"); you may not
	% use this file except in compliance with the License. You may obtain a copy of
	% the License at
	%
	% http://www.apache.org/licenses/LICENSE-2.0
	%
	% Unless required by applicable law or agreed to in writing, software
	% distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
	% WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
	% License for the specific language governing permissions and limitations under
	% the License.

	-module(couch_prometheus_util).

	-export([
	couch_to_prom/3,
	to_bin/1,
	to_prom/4,
	to_prom_summary/2
	]).

	-include("couch_prometheus.hrl").

	% Convert a single couch_stats entry into Prometheus text-format lines.
	%
	% Path is the stat key (a list, mostly of atoms), Info is that stat's property
	% list (it must contain `value`, and `desc`/`type` for the clauses that read
	% them) and All is the full list of {Path, Info} entries, used by the clauses
	% that report the sum of two stats as a "total".
	%
	% Clauses calling to_prom/4 emit the "# HELP"/"# TYPE" header followed by the
	% sample; clauses calling to_prom/2 emit only a labelled sample for a metric
	% whose header is produced by a sibling clause. A leading `couchdb` path element
	% that no specific clause handles is dropped and the remainder re-dispatched.
	% Any other path is rendered from its `type`; types other than counter, gauge
	% and histogram crash with case_clause.
	couch_to_prom([couch_log, level, alert], Info, _All) ->
	    to_prom(couch_log_requests_total, counter, "number of logged messages", {
	        [{level, alert}], val(Info)
	    });
	couch_to_prom([couch_log, level, Level], Info, _All) ->
	    to_prom(couch_log_requests_total, {[{level, Level}], val(Info)});
	couch_to_prom([couch_replicator, checkpoints, failure], Info, _All) ->
	    to_prom(couch_replicator_checkpoints_failure_total, counter, desc(Info), val(Info));
	couch_to_prom([couch_replicator, checkpoints, success], Info, All) ->
	    Total = val(Info) + val([couch_replicator, checkpoints, failure], All),
	    to_prom(couch_replicator_checkpoints_total, counter, "number of checkpoint saves", Total);
	couch_to_prom([couch_replicator, responses, failure], Info, _All) ->
	    to_prom(couch_replicator_responses_failure_total, counter, desc(Info), val(Info));
	couch_to_prom([couch_replicator, responses, success], Info, All) ->
	    Total = val(Info) + val([couch_replicator, responses, failure], All),
	    to_prom(
	        couch_replicator_responses_total,
	        counter,
	        "number of HTTP responses received by the replicator",
	        Total
	    );
	couch_to_prom([couch_replicator, stream_responses, failure], Info, _All) ->
	    to_prom(couch_replicator_stream_responses_failure_total, counter, desc(Info), val(Info));
	couch_to_prom([couch_replicator, stream_responses, success], Info, All) ->
	    Total = val(Info) + val([couch_replicator, stream_responses, failure], All),
	    to_prom(
	        couch_replicator_stream_responses_total,
	        counter,
	        "number of streaming HTTP responses received by the replicator",
	        Total
	    );
	couch_to_prom([couchdb, auth_cache_hits], Info, All) ->
	    Total = val(Info) + val([couchdb, auth_cache_misses], All),
	    to_prom(auth_cache_requests_total, counter, "number of authentication cache requests", Total);
	couch_to_prom([couchdb, auth_cache_misses], Info, _All) ->
	    to_prom(auth_cache_misses_total, counter, desc(Info), val(Info));
	% force a # TYPE and # HELP definition for httpd_request_methods
	couch_to_prom([couchdb, httpd_request_methods, 'COPY'], Info, _All) ->
	    to_prom(httpd_request_methods, counter, "number of HTTP requests by method", {
	        [{method, 'COPY'}], val(Info)
	    });
	couch_to_prom([couchdb, httpd_request_methods, Method], Info, _All) ->
	    to_prom(httpd_request_methods, {[{method, Method}], val(Info)});
	% force a # TYPE and # HELP definition for httpd_status_codes
	couch_to_prom([couchdb, httpd_status_codes, 200], Info, _All) ->
	    to_prom(httpd_status_codes, counter, "number of HTTP responses by status code", {
	        [{code, 200}], val(Info)
	    });
	couch_to_prom([couchdb, httpd_status_codes, Code], Info, _All) ->
	    to_prom(httpd_status_codes, {[{code, Code}], val(Info)});
	couch_to_prom([ddoc_cache, hit], Info, All) ->
	    Total = val(Info) + val([ddoc_cache, miss], All),
	    to_prom(ddoc_cache_requests_total, counter, "number of design doc cache requests", Total);
	couch_to_prom([ddoc_cache, miss], Info, _All) ->
	    to_prom(ddoc_cache_requests_failures_total, counter, desc(Info), val(Info));
	couch_to_prom([ddoc_cache, recovery], Info, _All) ->
	    to_prom(ddoc_cache_requests_recovery_total, counter, desc(Info), val(Info));
	couch_to_prom([fabric, read_repairs, failure], Info, _All) ->
	    to_prom(fabric_read_repairs_failures_total, counter, desc(Info), val(Info));
	couch_to_prom([fabric, read_repairs, success], Info, All) ->
	    Total = val(Info) + val([fabric, read_repairs, failure], All),
	    to_prom(fabric_read_repairs_total, counter, "number of fabric read repairs", Total);
	% force a # TYPE and # HELP definition for rexi_streams_timeout_total
	couch_to_prom([rexi, streams, timeout, init_stream], Info, _All) ->
	    to_prom(rexi_streams_timeout_total, counter, "number of rexi stream timeouts", {
	        [{stage, init_stream}], val(Info)
	    });
	% Must use the same four-element path shape as the init_stream clause above;
	% a three-element [rexi_streams, timeout, Stage] pattern never matches it, so
	% the remaining stages would lose their `stage` label.
	couch_to_prom([rexi, streams, timeout, Stage], Info, _All) ->
	    to_prom(rexi_streams_timeout_total, {[{stage, Stage}], val(Info)});
	% The list-cons bar must be a plain `|`; a backslash-escaped bar does not compile.
	couch_to_prom([couchdb | Rest], Info, All) ->
	    couch_to_prom(Rest, Info, All);
	couch_to_prom(Path, Info, _All) ->
	    case lists:keyfind(type, 1, Info) of
	        {type, counter} ->
	            Metric = counter_metric(Path),
	            to_prom(Metric, counter, desc(Info), val(Info));
	        {type, gauge} ->
	            to_prom(path_to_name(Path), gauge, desc(Info), val(Info));
	        {type, histogram} ->
	            to_prom_summary(Path, Info)
	    end.

	% Build the two metadata lines ("# HELP" and "# TYPE") for a metric.
	%
	% Metric is prefixed through to_prom_name/1; Type and Desc are formatted with ~s.
	% Returns a two-element list of binaries: [HelpLine, TypeLine].
	% NOTE(review): the HELP line starts with "\n" and ends with "\r"; presumably
	% the caller joins lines itself - confirm before touching these characters.
	type_def(Metric, Type, Desc) ->
	    PromName = to_prom_name(Metric),
	    HelpLine = to_bin(io_lib:format("\n# HELP ~s ~s\r", [PromName, Desc])),
	    TypeLine = to_bin(io_lib:format("# TYPE ~s ~s", [PromName, Type])),
	    [HelpLine, TypeLine].

	% Render a metric together with its HELP/TYPE header.
	%
	% Returns a list whose first element is the header from type_def/3 (itself a
	% list of two binaries) and whose remaining elements are the sample lines that
	% to_prom/2 produces for Data.
	to_prom(Metric, Type, Desc, Data) ->
	    [type_def(Metric, Type, Desc) | to_prom(Metric, Data)].

	% Render sample lines (no header) for a metric.
	%
	% The second argument may be:
	%   * a list of instances - each one is rendered and the results concatenated;
	%   * {Labels, Value}     - Labels is a list of {Key, LabelValue} pairs rendered
	%                           as name{k="v", ...}; an empty list renders a bare name;
	%   * any other term      - rendered as "name Value".
	% Values are printed with ~p. Always returns a list of binaries.
	to_prom(Metric, Instances) when is_list(Instances) ->
	    lists:append(lists:map(fun(Inst) -> to_prom(Metric, Inst) end, Instances));
	to_prom(Metric, {Labels, Value}) ->
	    Rendered = lists:map(
	        fun({Key, LabelValue}) ->
	            lists:flatten(io_lib:format("~s=\"~s\"", [to_bin(Key), to_bin(LabelValue)]))
	        end,
	        Labels
	    ),
	    Name = to_prom_name(Metric),
	    Series =
	        case Rendered of
	            [] -> Name;
	            [_ | _] -> [Name, ${, string:join(Rendered, ", "), $}]
	        end,
	    [to_bin(io_lib:format("~s ~p", [Series, Value]))];
	to_prom(Metric, Value) ->
	    Name = to_prom_name(Metric),
	    [to_bin(io_lib:format("~s ~p", [Name, Value]))].

	% Render a couch_stats histogram as a Prometheus summary.
	%
	% Path is the stat key as a list; Info is its property list and must contain
	% `desc` plus a `value` proplist holding arithmetic_mean, percentile and n.
	%
	% Returns the HELP/TYPE header and one quantile sample per percentile for
	% couchdb_<path>_seconds, followed by the couchdb_<path>_seconds_sum and
	% couchdb_<path>_seconds_count samples (each a one-element list of binaries).
	% Percentiles other than 50/75/90/95/99/999 crash with case_clause.
	to_prom_summary(Path, Info) ->
	    Metric = path_to_name(Path ++ ["seconds"]),
	    {value, Value} = lists:keyfind(value, 1, Info),
	    {arithmetic_mean, Mean} = lists:keyfind(arithmetic_mean, 1, Value),
	    {percentile, Percentiles} = lists:keyfind(percentile, 1, Value),
	    {n, Count} = lists:keyfind(n, 1, Value),
	    Quantiles = lists:map(
	        fun({Perc, Val0}) ->
	            % Prometheus uses seconds, so we need to convert milliseconds to seconds
	            Val = Val0 / 1000,
	            case Perc of
	                50 -> {[{quantile, <<"0.5">>}], Val};
	                75 -> {[{quantile, <<"0.75">>}], Val};
	                90 -> {[{quantile, <<"0.9">>}], Val};
	                95 -> {[{quantile, <<"0.95">>}], Val};
	                99 -> {[{quantile, <<"0.99">>}], Val};
	                999 -> {[{quantile, <<"0.999">>}], Val}
	            end
	        end,
	        Percentiles
	    ),
	    SumMetric = path_to_name(Path ++ ["seconds", "sum"]),
	    % The mean comes from the same histogram as the percentiles, so it needs the
	    % same milliseconds-to-seconds conversion to match the "_seconds_sum" name.
	    SumStat = to_prom(SumMetric, Count * Mean / 1000),
	    CountMetric = path_to_name(Path ++ ["seconds", "count"]),
	    CountStat = to_prom(CountMetric, Count),
	    to_prom(Metric, summary, desc(Info), Quantiles) ++ [SumStat, CountStat].

	% Prefix a metric name with "couchdb_" and return it as a binary.
	% Metric may be anything the ~s format directive accepts.
	to_prom_name(Metric) ->
	    Prefixed = io_lib:format("couchdb_~s", [Metric]),
	    to_bin(Prefixed).

	% Join the elements of a stat path with underscores.
	%
	% Each element is formatted with ~s, so the result is character data as
	% returned by io_lib:format/2 and string:join/2, not a binary.
	path_to_name(Path) ->
	    Render = fun(Segment) -> io_lib:format("~s", [Segment]) end,
	    string:join(lists:map(Render, Path), "_").

	% Build the metric name for a counter stat, guaranteeing a single "_total"
	% suffix. Returns a binary.
	%
	% The joined path is converted to a binary before the suffix check because
	% io_lib:format/2 may return nested character data, on which an element-wise
	% lists:suffix/2 comparison would miss an existing "_total" and append a
	% second one.
	counter_metric(Path) ->
	    Name = to_bin(path_to_name(Path)),
	    Suffix = <<"_total">>,
	    case binary:longest_common_suffix([Name, Suffix]) =:= byte_size(Suffix) of
	        true -> Name;
	        false -> <<Name/binary, Suffix/binary>>
	    end.

	% Convert a binary, atom, integer or iolist to a binary.
	% Binaries pass through untouched; atoms are encoded as UTF-8. Any other term
	% (for example a float) crashes with function_clause.
	to_bin(Bin) when is_binary(Bin) ->
	    Bin;
	to_bin(Atom) when is_atom(Atom) ->
	    atom_to_binary(Atom, utf8);
	to_bin(Int) when is_integer(Int) ->
	    integer_to_binary(Int);
	to_bin(IoList) when is_list(IoList) ->
	    iolist_to_binary(IoList).

	% Return the `value` entry of a stat property list.
	% Crashes with badmatch when the key is absent.
	val(Props) ->
	    {value, Found} = lists:keyfind(value, 1, Props),
	    Found.

	% Look up the stat stored under Key in the full stats list and return its
	% `value`. Crashes with badmatch when Key is not present in Stats.
	val(Key, Stats) ->
	    {Key, Props} = lists:keyfind(Key, 1, Stats),
	    val(Props).

	% Return the `desc` entry of a stat property list.
	% Crashes with badmatch when the key is absent.
	desc(Info) ->
	    {desc, Text} = lists:keyfind(desc, 1, Info),
	    Text.

	-ifdef(TEST).
	-include_lib("couch/include/couch_eunit.hrl").

	% Counter samples render as "name value", or "name{label} value" when labelled.
	to_prom_counter_test() ->
	    LabelledSample = {[{code, 200}], 3},
	    [
	        ?assertEqual(
	            <<"couchdb_ddoc_cache 10">>,
	            test_to_prom_output(ddoc_cache, counter, "size of ddoc cache", 10)
	        ),
	        ?assertEqual(
	            <<"couchdb_httpd_status_codes{code=\"200\"} 3">>,
	            test_to_prom_output(
	                httpd_status_codes, counter, "HTTP request status by code", LabelledSample
	            )
	        )
	    ].

	% A gauge sample renders as "name value".
	to_prom_gauge_test() ->
	    Rendered = test_to_prom_output(temperature_celsius, gauge, "temp", 36),
	    ?assertEqual(<<"couchdb_temperature_celsius 36">>, Rendered).

	% The 75th percentile of 4500 must be reported as 4.5 under quantile="0.75".
	to_prom_summary_test() ->
	    Percentiles = [
	        {50, 0.0},
	        {75, 4500},
	        {90, 0.0},
	        {95, 0.0},
	        {99, 0.0},
	        {999, 0.0}
	    ],
	    Stats = [
	        {min, 0.0},
	        {max, 0.0},
	        {arithmetic_mean, 0.0},
	        {geometric_mean, 0.0},
	        {harmonic_mean, 0.0},
	        {median, 0.0},
	        {variance, 0.0},
	        {standard_deviation, 0.0},
	        {skewness, 0.0},
	        {kurtosis, 0.0},
	        {percentile, Percentiles},
	        {histogram, [{0, 0}]},
	        {n, 0}
	    ],
	    Info = [
	        {value, Stats},
	        {type, histogram},
	        {desc, <<"length of time processing a mango query">>}
	    ],
	    ?assertEqual(
	        <<"couchdb_mango_query_time_seconds{quantile=\"0.75\"} 4.5">>,
	        test_to_prom_summary_output([mango_query_time], Info)
	    ).

	% "_total" is appended when missing and not duplicated when already present.
	counter_metric_test_() ->
	    Expected = <<"document_purges_total">>,
	    [
	        ?_assertEqual(Expected, counter_metric(Path))
	     || Path <- [[document_purges, total], [document_purges]]
	    ].

	% Test helper: the second element of to_prom/4's result, i.e. the first sample
	% line after the HELP/TYPE header.
	test_to_prom_output(Metric, Type, Desc, Val) ->
	    lists:nth(2, to_prom(Metric, Type, Desc, Val)).

	% Test helper: the third element of to_prom_summary/2's result, i.e. the second
	% quantile line after the HELP/TYPE header.
	test_to_prom_summary_output(Metric, Info) ->
	    lists:nth(3, to_prom_summary(Metric, Info)).

	-endif.