blob: dfffc9a1651d5c6725a405c3d952683e59915330 [file] [log] [blame]
% Licensed under the Apache License, Version 2.0 (the "License"); you may not
% use this file except in compliance with the License. You may obtain a copy of
% the License at
%
% http://www.apache.org/licenses/LICENSE-2.0
%
% Unless required by applicable law or agreed to in writing, software
% distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
% WARRANTIES OR CONDITIONS OF ANY KIND, eithe r express or implied. See the
% License for the specific language governing permissions and limitations under
% the License.
-module(couch_stats_math).
-export([
summary/2
]).
% Stats are computed over two passes. First one, with acc1, gathers some
% basics, then the second pass, with acc2, computes additional, more complex,
% statistics.
%
-record(acc1, {
% Non-zero bins = [{BinIndex, Counts}, ...]
bins = [],
n = 0,
sum = 0,
sum_log = 0,
sum_inv = 0
}).
-record(acc2, {
diff_2_sum = 0,
diff_3_sum = 0,
diff_4_sum = 0,
% pXX = {Rank, PercentileValue}
p50 = {0, 0},
p75 = {0, 0},
p90 = {0, 0},
p95 = {0, 0},
p99 = {0, 0},
p999 = {0, 0}
}).
summary(Counter, BinCount) when is_tuple(Counter), is_integer(BinCount) ->
calc_stats(pass1(Counter, BinCount)).
calc_stats(#acc1{n = 0}) ->
% Can't do much with 0 items. Instead of inserting division checks for
% 0 everywhere, just return a empty stats object
n0_stats();
calc_stats(#acc1{n = N} = Stats) when is_integer(N), N > 0 ->
#acc1{bins = Bins, sum = Sum, sum_log = SumLog, sum_inv = SumInv} = Stats,
Mean = Sum / N,
Acc2 = pass2(Bins, N, Mean),
#acc2{
diff_2_sum = Diff2Sum,
diff_3_sum = Diff3Sum,
diff_4_sum = Diff4Sum,
p50 = {_, P50},
p75 = {_, P75},
p90 = {_, P90},
p95 = {_, P95},
p99 = {_, P99},
p999 = {_, P999}
} = Acc2,
Variance = Diff2Sum / N,
StdDev = math:sqrt(Variance),
[
{n, N},
{min, couch_stats_histogram:bin_min(element(1, hd(Bins)))},
{max, couch_stats_histogram:bin_max(element(1, lists:last(Bins)))},
{arithmetic_mean, Mean},
{geometric_mean, math:exp(SumLog / N)},
{harmonic_mean, N / SumInv},
{median, P50},
{variance, Variance},
{standard_deviation, StdDev},
{skewness, skewness(N, Diff3Sum, StdDev)},
{kurtosis, kurtosis(N, Diff4Sum, StdDev)},
{percentile, [
{50, P50},
{75, P75},
{90, P90},
{95, P95},
{99, P99},
{999, P999}
]},
% Emit for compatibility reasons
{histogram, [{0, 0}]}
].
% Fist pass removes 0 bins and calculates some basics like sums, counts, sum
% logs. Some of these can only be used after the second pass over the data.
%
pass1(Counter, BinCount) ->
lists:foldl(
fun(Index, #acc1{} = Acc) ->
case counters:get(Counter, Index) of
Count when is_integer(Count), Count =< 0 ->
Acc;
Count when is_integer(Count), Count > 0 ->
Val = couch_stats_histogram:bin_middle(Index),
Acc#acc1{
bins = [{Index, Count} | Acc#acc1.bins],
n = Acc#acc1.n + Count,
sum = Acc#acc1.sum + Count * Val,
sum_log = Acc#acc1.sum_log + Count * math:log(Val),
sum_inv = Acc#acc1.sum_inv + Count / Val
}
end
end,
#acc1{},
lists:seq(BinCount, 1, -1)
).
% Second statistics pass. This calculates the diff squared, cubed and 4th power
% sums. These are used later to get the 2nd, 3rd and 4th central momements to
% calcuate variance, skewness and kurtosis. In this pass we also calculate
% percentiles.
%
pass2(Bins, N, Mean) ->
% Initialize each percentile's rank value as N * Q. During traversal each
% corresponding rank will be decremeneted by the bin's count value until we
% get to the bin where percentile rank < bin count value. After we
% calculate a percentile (as a non-0 value), we ignore that percentile
% entry and stop updating its rank.
%
Acc0 = #acc2{
p50 = {N * 0.50, 0},
p75 = {N * 0.75, 0},
p90 = {N * 0.90, 0},
p95 = {N * 0.95, 0},
p99 = {N * 0.99, 0},
p999 = {N * 0.999, 0}
},
lists:foldl(
fun({Index, Count}, #acc2{} = Acc) ->
Acc1 = percentiles(Acc, Index, Count),
Diff = couch_stats_histogram:bin_middle(Index) - Mean,
Diff2 = Diff * Diff,
Diff3 = Diff2 * Diff,
Diff4 = Diff2 * Diff2,
Acc1#acc2{
diff_2_sum = Acc1#acc2.diff_2_sum + Count * Diff2,
diff_3_sum = Acc1#acc2.diff_3_sum + Count * Diff3,
diff_4_sum = Acc1#acc2.diff_4_sum + Count * Diff4
}
end,
Acc0,
Bins
).
%% erlfmt-ignore
percentiles(#acc2{} = Acc1, Index, Count) ->
Acc2 = Acc1#acc2{p50 = percentile(Acc1#acc2.p50, Index, Count)},
Acc3 = Acc2#acc2{p75 = percentile(Acc2#acc2.p75, Index, Count)},
Acc4 = Acc3#acc2{p90 = percentile(Acc3#acc2.p90, Index, Count)},
Acc5 = Acc4#acc2{p95 = percentile(Acc4#acc2.p95, Index, Count)},
Acc6 = Acc5#acc2{p99 = percentile(Acc5#acc2.p99, Index, Count)},
Acc7 = Acc6#acc2{p999 = percentile(Acc6#acc2.p999, Index, Count)},
Acc7.
percentile({Rank, 0}, Index, Count) when Rank < Count ->
Min = couch_stats_histogram:bin_min(Index),
Max = couch_stats_histogram:bin_max(Index),
Width = Max - Min,
% Count should not be 0, we already filtered out Count == 0 bins
Frac = Rank / Count,
% Do a bit of extra work to get a nicer interpolated percentile value. Frac
% is the fractional part of the bin width based on the rank left-over. For
% example, if the Count = 1000:
%
% If Rank = 1, then Frac = 0.001 : We're closer to bin min
% If Rank = 500, then Frac = 0.5 : We're closer the middle
% If Rank = 950, then Frac = 0.95 : We're closer to bin max
%
Percentile = Min + Width * Frac,
{Rank, Percentile};
percentile({Rank, 0}, _Index, Count) ->
% Haven't reached our bin yet, reduce this percentile's rank by Count
% amount and keep going.
{Rank - Count, 0};
percentile({Rank, Percentile}, _, _) when is_number(Percentile), Percentile > 0 ->
% Nothing left to do, we already calculated this percentile value.
{Rank, Percentile}.
skewness(N, Diff3Sum, StdDev) ->
% https://en.wikipedia.org/wiki/Skewness
% Skewness = M3 / StdDev^3
% M3 = mean(Diff3Sum) = Diff3Sum/N. (3rd central moment)
case math:pow(StdDev, 3) of
StdDev3 when StdDev3 < 1.0e-12 ->
% If StdDev is too low avoid dividing by 0, assume skewness = 0
0;
StdDev3 ->
M3 = Diff3Sum / N,
M3 / StdDev3
end.
kurtosis(N, Diff4Sum, StdDev) ->
% http://en.wikipedia.org/wiki/Kurtosis
% Kurtosis = M4 / StdDev^4 - 3
% M4 = mean(Diff4Sum) = Diff4Sum/4. (4th central momement)
%
% Normal distribution kurtosis is 3 so we subtract 3 to get excess kurtosis
% to show how it's different from a normal distribution.
case math:pow(StdDev, 4) of
StdDev4 when StdDev4 < 1.0e-12 ->
0;
StdDev4 ->
M4 = Diff4Sum / N,
M4 / StdDev4 - 3
end.
n0_stats() ->
[
{n, 0},
{min, 0},
{max, 0},
{arithmetic_mean, 0},
{geometric_mean, 0},
{harmonic_mean, 0},
{median, 0},
{variance, 0},
{standard_deviation, 0},
{skewness, 0},
{kurtosis, 0},
{percentile, [
{50, 0},
{75, 0},
{90, 0},
{95, 0},
{99, 0},
{999, 0}
]},
{histogram, [{0, 0}]}
].
-ifdef(TEST).
-include_lib("couch/include/couch_eunit.hrl").
basic_test() ->
H = couch_stats_histogram:new(),
Vals = [0.05, 0.9, 0.7, 0.7, 10.1, 11, 100.5, 0.10, 13.5],
[couch_stats_histogram:update(H, V) || V <- Vals],
Stats = couch_stats_histogram:stats(H),
Percentiles = prop(percentile, Stats),
?assertEqual(length(Vals), prop(n, Stats)),
?assert(flim(0.05, prop(min, Stats))),
?assert(flim(104, prop(max, Stats))),
?assert(flim(15.3, prop(arithmetic_mean, Stats))),
?assert(flim(1.9, prop(geometric_mean, Stats))),
?assert(flim(0.25, prop(harmonic_mean, Stats))),
?assert(flim(0.9, prop(median, Stats))),
?assert(flim(923, prop(variance, Stats))),
?assert(flim(30.4, prop(standard_deviation, Stats))),
% Values are skewed toward the left so we should have a positive skew
% https://en.wikipedia.org/wiki/Skewness
?assert(flim(2.3, prop(skewness, Stats))),
% We have more extreme tail outliers compared to a normal distribution, so
% excess kurtosis should be > 0. In stats-speak the distribution would
% be "leptokurtic".
?assert(flim(3.7, prop(kurtosis, Stats))),
?assert(flim(0.9, prop(50, Percentiles))),
?assert(flim(11.7, prop(75, Percentiles))),
?assert(flim(97, prop(90, Percentiles))),
?assert(flim(100, prop(95, Percentiles))),
?assert(flim(103, prop(99, Percentiles))),
?assert(flim(104, prop(999, Percentiles))).
min_extreme_test() ->
% All the values in the smallest bin
H = couch_stats_histogram:new(),
N = 1000000,
[couch_stats_histogram:update(H, 0) || _ <- lists:seq(1, N)],
Stats = couch_stats_histogram:stats(H),
Percentiles = prop(percentile, Stats),
?assertEqual(N, prop(n, Stats)),
?assert(flim(0, prop(min, Stats))),
?assert(flim(0, prop(max, Stats))),
?assert(flim(0, prop(arithmetic_mean, Stats))),
?assert(flim(0, prop(geometric_mean, Stats))),
?assert(flim(0, prop(harmonic_mean, Stats))),
?assert(flim(0, prop(median, Stats))),
?assert(flim(0, prop(variance, Stats))),
?assert(flim(0, prop(standard_deviation, Stats))),
?assert(flim(0, prop(skewness, Stats))),
?assert(flim(0, prop(kurtosis, Stats))),
?assert(flim(0, prop(50, Percentiles))),
?assert(flim(0, prop(75, Percentiles))),
?assert(flim(0, prop(90, Percentiles))),
?assert(flim(0, prop(95, Percentiles))),
?assert(flim(0, prop(99, Percentiles))),
?assert(flim(0, prop(999, Percentiles))).
max_extreme_test() ->
% All the values are in the largest bin
H = couch_stats_histogram:new(),
N = 1000000,
% ?BIN_COUNT in couch_stats_histogram.erl
HighestBin = 230,
[couch_stats_histogram:update(H, 10000000) || _ <- lists:seq(1, N)],
Stats = couch_stats_histogram:stats(H),
Percentiles = prop(percentile, Stats),
?assertEqual(N, prop(n, Stats)),
% Min would be the lower bound of the highest bin
BinMin = couch_stats_histogram:bin_min(HighestBin),
?assert(flim(BinMin, prop(min, Stats))),
% Max would be the highest bound of the highest bin
BinMax = couch_stats_histogram:bin_max(HighestBin),
?assert(flim(BinMax, prop(max, Stats))),
BinMid = couch_stats_histogram:bin_middle(HighestBin),
?assert(flim(BinMid, prop(arithmetic_mean, Stats))),
?assert(flim(BinMid, prop(geometric_mean, Stats))),
?assert(flim(BinMid, prop(harmonic_mean, Stats))),
?assert(flim(BinMid, prop(median, Stats))),
?assert(flim(0, prop(variance, Stats))),
?assert(flim(0, prop(standard_deviation, Stats))),
?assert(flim(0, prop(skewness, Stats))),
?assert(flim(0, prop(kurtosis, Stats))),
?assert(flim(BinMid, prop(50, Percentiles))),
?assert(flim(4128767, prop(75, Percentiles))),
?assert(flim(4168089, prop(90, Percentiles))),
?assert(flim(4181196, prop(95, Percentiles))),
?assert(flim(4191682, prop(99, Percentiles))),
?assert(flim(4194041, prop(999, Percentiles))).
normal_dist_test() ->
H = couch_stats_histogram:new(),
rand:seed(default, {1, 2, 3}),
N = 1000000,
Mean = 50,
Var = 100,
[couch_stats_histogram:update(H, rand:normal(Mean, Var)) || _ <- lists:seq(1, N)],
Stats = couch_stats_histogram:stats(H),
Percentiles = prop(percentile, Stats),
?assertEqual(N, prop(n, Stats)),
?assert(flim(3.7, prop(min, Stats))),
?assert(flim(104, prop(max, Stats))),
?assert(flim(Mean, prop(arithmetic_mean, Stats))),
?assert(flim(49, prop(geometric_mean, Stats))),
?assert(flim(48, prop(harmonic_mean, Stats))),
% Median and mean of a normal distribution should be the same
?assert(flim(Mean, prop(median, Stats))),
?assert(flim(Var, prop(variance, Stats))),
?assert(flim(math:sqrt(Var), prop(standard_deviation, Stats))),
% Skewness should be close to 0 as the distribution is symmetric
?assert(flim(0.0, prop(skewness, Stats))),
% Excess kurtosis should be 0. In stats-speak normal distribution is
% "mesokurtic".
?assert(flim(0.0, prop(kurtosis, Stats))),
% P50 = Median = Mean
?assert(flim(Mean, prop(50, Percentiles))),
?assert(flim(56, prop(75, Percentiles))),
?assert(flim(63, prop(90, Percentiles))),
?assert(flim(68, prop(95, Percentiles))),
?assert(flim(74, prop(99, Percentiles))),
?assert(flim(82, prop(999, Percentiles))).
uniform_dist_test() ->
H = couch_stats_histogram:new(),
rand:seed(default, {1, 2, 3}),
N = 1000000,
% rand:uniform/1 returns values in [1,N], so subtract 1 to get values closer to 0
RandFun = fun() -> rand:uniform(10000001) / 10 - 1 end,
[couch_stats_histogram:update(H, RandFun()) || _ <- lists:seq(1, N)],
Stats = couch_stats_histogram:stats(H),
Percentiles = prop(percentile, Stats),
?assertEqual(N, prop(n, Stats)),
?assert(flim(0, prop(min, Stats))),
?assert(flim(1040000, prop(max, Stats))),
?assert(flim(500000, prop(arithmetic_mean, Stats))),
?assert(flim(368000, prop(geometric_mean, Stats))),
?assert(flim(8800, prop(harmonic_mean, Stats))),
?assert(flim(500000, prop(median, Stats))),
% Variance and stddev should be large for a uniform distribution
?assert(flim(83.0e9, prop(variance, Stats))),
?assert(flim(290000, prop(standard_deviation, Stats))),
% Skewness should be close to 0 as the distribution is symmetric
?assert(flim(0.0, prop(skewness, Stats))),
% Uniform distribution would be platykurtic. Excess kurtosis should be
% negative we'd have fewer extreme outliers (at the tails) than a normal
% distribution might have.
?assert(flim(-1.2, prop(kurtosis, Stats))),
?assert(flim(500000, prop(50, Percentiles))),
?assert(flim(750000, prop(75, Percentiles))),
?assert(flim(900000, prop(90, Percentiles))),
?assert(flim(950000, prop(95, Percentiles))),
?assert(flim(1010000, prop(99, Percentiles))),
?assert(flim(1040000, prop(999, Percentiles))).
prop(Prop, KVs) ->
proplists:get_value(Prop, KVs).
% Since we can't compare float exactly we use
% a tollerance of 5% and a minimum of 0.05
%
flim(X, Y) ->
flim(X, Y, max(0.05, abs(X * 0.05))).
flim(X, Y, Tol) ->
abs(X - Y) < Tol.
-endif.