attempt at better dynamic histogram binning

commit: 11a246d1f404b7e756aed92c8fe4e1117d92ac91 [log] [tgz]
author: joewilliams <williams.joe@gmail.com> Fri Dec 16 11:42:17 2011 -0800
committer: joewilliams <williams.joe@gmail.com> Fri Dec 16 11:42:17 2011 -0800
tree: cd3272cb60e1f4f37fef047d46708e8d551d46ad
parent: 1d66048d3983640d8326cc020527cb082689c499 [diff]
diff --git a/src/folsom_statistics.erl b/src/folsom_statistics.erl
index 5d6f197..efd8cfc 100644
--- a/src/folsom_statistics.erl
+++ b/src/folsom_statistics.erl

@@ -65,7 +65,7 @@
        {999, percentile(SortedValues, Scan_res, 0.999)}
       ]
      },
-     {histogram, get_histogram(Values, Scan_res)}
+     {histogram, get_histogram(Values, Scan_res, Scan_res2)}
      ].
 
 get_statistics(Values1, Values2) ->
@@ -154,13 +154,19 @@
 kurtosis(#scan_result{n=N}=Scan_res, #scan_result2{x4=X4}=Scan_res2) ->
     (X4/N)/(math:pow(std_deviation(Scan_res,Scan_res2), 4)) - 3.
 
-get_histogram(Values, Scan_res) ->
-    Bins = get_hist_bins(Scan_res#scan_result.min, Scan_res#scan_result.max),
+get_histogram(Values, Scan_res, Scan_res2) ->
+    Bins = get_hist_bins(Scan_res#scan_result.min,
+                         Scan_res#scan_result.max,
+                         std_deviation(Scan_res, Scan_res2),
+                         length(Values)
+                        ),
+
     Dict = lists:foldl(fun (Value, Dict) ->
              update_bin(Value, Bins, Dict)
            end,
            dict:from_list([{Bin, 0} || Bin <- Bins]),
            Values),
+
     lists:sort(dict:to_list(Dict)).
 
 update_bin(Value, [Bin|_Bins], Dict) when Value =< Bin ->
@@ -273,13 +279,14 @@
 inverse(X) ->
     1/X.
 
-get_hist_bins(Min, Max) ->
-    Width = round((Max - Min) / ?HIST_BINS),
-    get_bin_list(Width, ?HIST_BINS, []).
+get_hist_bins(Min, Max, StdDev, Count) ->
+    BinWidth = get_bin_width(StdDev, Count),
+    BinCount = get_bin_count(Min, Max, BinWidth),
+    get_bin_list(BinWidth, BinCount, []).
 
 get_bin_list(Width, Bins, Acc) when Bins > length(Acc) ->
     Bin = ((length(Acc) + 1) * Width ),
-    get_bin_list(Width, Bins, [round_bin(Bin) | Acc]);
+    get_bin_list(Width, Bins, [round_bin(Bin)| Acc]);
 get_bin_list(_, _, Acc) ->
     lists:usort(Acc).
 
@@ -291,3 +298,18 @@
     Bin;
 round_bin(Bin, Base) ->
     Bin + Base - (Bin rem Base).
+
+% The best method of choosing bin counts and widths is up for debate.
+% The approaches below seem to work *well enough*
+% in my testing.
+
+% bin width based on Scott's normal reference rule, 3.5 * StdDev / Count^(1/3)
+% (originally attributed to Sturges, http://www.jstor.org/pss/2965501)
+get_bin_width(StdDev, Count) ->
+    round((3.5 * StdDev) / math:pow(Count, 0.3333333)).
+
+% based on the simple ceiling function at
+% http://en.wikipedia.org/wiki/Histograms#Number_of_bins_and_width
+% with a modification to attempt to get one bin beyond the max value
+get_bin_count(Min, Max, Width) ->
+    round((Max - Min) / Width) + 1.

diff --git a/test/folsom_erlang_checks.erl b/test/folsom_erlang_checks.erl
index 7c8e2d5..24122f0 100644
--- a/test/folsom_erlang_checks.erl
+++ b/test/folsom_erlang_checks.erl

@@ -236,16 +236,7 @@
     5000 = proplists:get_value(999, List).
 
 histogram_check(List) ->
-    [
-     {500,7},
-     {1000,2},
-     {1500,0},
-     {2000,1},
-     {2500,0},
-     {3000,0},
-     {4000,0},
-     {5000,1}
-    ] = List.
+    [{2400,10},{5000,1},{8000,0}] = List.
 
 counter_inc_dec(Counter) ->
     ok = folsom_metrics:notify({Counter, {inc, 1}}),
commit	11a246d1f404b7e756aed92c8fe4e1117d92ac91	[log] [tgz]
author	joewilliams <williams.joe@gmail.com>	Fri Dec 16 11:42:17 2011 -0800
committer	joewilliams <williams.joe@gmail.com>	Fri Dec 16 11:42:17 2011 -0800
tree	cd3272cb60e1f4f37fef047d46708e8d551d46ad
parent	1d66048d3983640d8326cc020527cb082689c499 [diff]