Merge pull request #13 from Feuerlabs/uw-boundary-stats-subset

adjust get_statistics to allow for requesting specific stats to calculate
diff --git a/src/bear.erl b/src/bear.erl
index 04593e7..b211a54 100644
--- a/src/bear.erl
+++ b/src/bear.erl
@@ -41,32 +41,7 @@
 
 -compile([native]).
 
-get_statistics(Values) when length(Values) < ?STATS_MIN ->
-    [
-     {min, 0.0},
-     {max, 0.0},
-     {arithmetic_mean, 0.0},
-     {geometric_mean, 0.0},
-     {harmonic_mean, 0.0},
-     {median, 0.0},
-     {variance, 0.0},
-     {standard_deviation, 0.0},
-     {skewness, 0.0},
-     {kurtosis, 0.0},
-     {percentile,
-      [
-       {50, 0.0},
-       {75, 0.0},
-       {90, 0.0},
-       {95, 0.0},
-       {99, 0.0},
-       {999, 0.0}
-      ]
-     },
-     {histogram, [{0, 0}]},
-     {n, 0}
-     ];
-get_statistics(Values) ->
+get_statistics([_,_,_,_,_|_] = Values) ->
     Scan_res = scan_values(Values),
     Scan_res2 = scan_values2(Values, Scan_res),
     Variance = variance(Scan_res, Scan_res2),
@@ -94,7 +69,86 @@
      },
      {histogram, get_histogram(Values, Scan_res, Scan_res2)},
      {n, Scan_res#scan_result.n}
-     ].
+    ];
+get_statistics(Values) when is_list(Values) ->
+    [
+     {min, 0.0},
+     {max, 0.0},
+     {arithmetic_mean, 0.0},
+     {geometric_mean, 0.0},
+     {harmonic_mean, 0.0},
+     {median, 0.0},
+     {variance, 0.0},
+     {standard_deviation, 0.0},
+     {skewness, 0.0},
+     {kurtosis, 0.0},
+     {percentile,
+      [
+       {50, 0.0},
+       {75, 0.0},
+       {90, 0.0},
+       {95, 0.0},
+       {99, 0.0},
+       {999, 0.0}
+      ]
+     },
+     {histogram, [{0, 0}]},
+     {n, 0}
+    ].
+
+%% Computes only the statistics named in Items; percentiles are requested
+%% as {percentile, Ps}. Inputs shorter than ?STATS_MIN get zero defaults.
+get_statistics_subset(Values, Items) ->
+    Length = length(Values),
+    if Length < ?STATS_MIN ->
+            %% Select by item name so that a {percentile, Ps} request keeps
+            %% the percentile entry (K==percentiles could never match).
+            Names = [case I of {N, _} -> N; _ -> I end || I <- Items],
+            [E || {K, _} = E <- get_statistics([]), lists:member(K, Names)];
+       true ->
+            SortedValues = lists:sort(Values),
+            Steps = calc_steps(Items),
+            Scan_res = if Steps > 1 -> scan_values(Values); true -> [] end,
+            Scan_res2 = if Steps > 2 -> scan_values2(Values, Scan_res);
+                           true -> [] end,
+            report_subset(Items, Length, SortedValues, Scan_res, Scan_res2)
+    end.
+
+%% Highest computation level (see level/1) needed by any item in Items;
+%% a {Name, _} item is ranked by Name. Returns 1 for an empty list.
+calc_steps(Items) ->
+    Name = fun({Key, _}) -> Key; (Key) -> Key end,
+    Levels = [level(Name(Item)) || Item <- Items],
+    lists:max([1 | Levels]).
+
+%% Work level of a statistic: 3 needs scan_values2, 2 needs scan_values,
+%% 1 needs neither.
+level(Stat) when Stat =:= standard_deviation; Stat =:= variance;
+                 Stat =:= skewness; Stat =:= kurtosis; Stat =:= histogram ->
+    3;
+level(Stat) when Stat =:= arithmetic_mean; Stat =:= geometric_mean;
+                 Stat =:= harmonic_mean ->
+    2;
+level(_Other) -> 1.
+
+%% Produces one {Name, Value} pair per requested item, preserving the order
+%% of Items. Scan_res / Scan_res2 are only read by the items that need them.
+report_subset(Items, N, SortedValues, Scan_res, Scan_res2) ->
+    Stat = fun(n) -> {n, N};
+              (min) -> {min, hd(SortedValues)};
+              (max) -> {max, lists:last(SortedValues)};
+              (median) -> {median, percentile(SortedValues, #scan_result{n = N}, 0.5)};
+              ({percentile, Ps}) -> {percentile, percentiles(Ps, N, SortedValues)};
+              (arithmetic_mean = K) -> {K, arithmetic_mean(Scan_res)};
+              (geometric_mean = K) -> {K, geometric_mean(Scan_res)};
+              (harmonic_mean = K) -> {K, harmonic_mean(Scan_res)};
+              (variance = K) -> {K, variance(Scan_res, Scan_res2)};
+              (standard_deviation = K) -> {K, std_deviation(Scan_res, Scan_res2)};
+              (skewness = K) -> {K, skewness(Scan_res, Scan_res2)};
+              (kurtosis = K) -> {K, kurtosis(Scan_res, Scan_res2)};
+              (histogram = K) -> {K, get_histogram(SortedValues, Scan_res, Scan_res2)}
+           end,
+    [Stat(Item) || Item <- Items].
 
 get_statistics(Values, _) when length(Values) < ?STATS_MIN ->
     0.0;
@@ -446,3 +500,36 @@
             end
     end.
 
+%% [{P, Value}] for each P in Ps, in request order; Values is the sorted sample
+%% of size N. Each P is looked up on its own, so Ps need not be ascending.
+percentiles(Ps, N, Values) ->
+    [hd(pick_items(Values, 1, [{P, perc(P, N)}])) || P <- Ps].
+
+%% Walks L from 1-based position P, pairing each {Tag, Pos} with the element
+%% at Pos; a Pos not found from here on yields undefined. Stops once done.
+pick_items(_, _, []) -> [];
+pick_items([H|_] = L, P, [{Tag,P}|Ps]) -> [{Tag,H} | pick_items(L, P, Ps)];
+pick_items([_|T], P, Ps) -> pick_items(T, P+1, Ps);
+pick_items([], _, Ps) -> [{Tag,undefined} || {Tag,_} <- Ps].
+
+%% 1-based position (never below 1) of percentile P in a sample of Len items.
+%% P: integer percent 0..100, integer per-mille 101..1000, or float 0..1.
+perc(P, Len) when is_integer(P), P >= 0, P =< 100 ->
+    erlang:max(1, round(P * Len / 100));
+perc(P, Len) when is_integer(P), P > 100, P =< 1000 ->
+    erlang:max(1, round(P * Len / 1000));
+perc(P, Len) when is_float(P), P >= 0, P =< 1 ->
+    erlang:max(1, round(P * Len)).
+
+%% Fixed sample of integers 1..9 used as input by the tests in bear_test.
+test_values() ->
+    [1,1,1,1,1,1,1,
+     2,2,2,2,2,2,2,
+     3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+     4,4,4,4,4,4,4,4,4,4,4,4,4,4,
+     5,5,5,5,5,5,5,5,5,5,5,5,5,5,
+     6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+     7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+     8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+     9,9,9,9,9,9,9].
+
diff --git a/test/bear_test.erl b/test/bear_test.erl
index 2cca076..fc37c6b 100644
--- a/test/bear_test.erl
+++ b/test/bear_test.erl
@@ -235,3 +235,28 @@
     ?assertEqual([{2.0,5},{2.0,5},{2.0,5},{2.0,5}], bear:tied_rank_worker([], [{2.0,5}], {[1,2,3], 5})),
     ?assertEqual([{2.0,5},{2.0,5},{2.0,5},{2.0,5},{2.0,5},{2.0,5}],
                  bear:tied_rank_worker([{2.0,5},{2.0,5}], [{2.0,5}], {[1,2,3], 5})).
+
+%% Each statistic requested on its own must match the full computation.
+subset_test() ->
+    match_values(bear:get_statistics(bear:test_values())).
+
+%% Requesting every statistic at once must match the full computation.
+full_subset_test() ->
+    match_values2(bear:get_statistics(bear:test_values())).
+
+%% Requests each statistic on its own; it must equal its full-run value.
+match_values(Stats) ->
+    Values = bear:test_values(),
+    lists:foreach(fun(Stat) ->
+        ?assertEqual([Stat], bear:get_statistics_subset(Values, [mk_item(Stat)]))
+    end, Stats).
+
+%% Turns a {Name, Value} result back into its request item; a percentile
+%% result becomes {percentile, [P]} listing the percentiles it contains.
+mk_item({percentile, Results}) -> {percentile, [P || {P, _} <- Results]};
+mk_item({Name, _Value}) -> Name.
+
+%% Requests every statistic in one call; the result must equal the full run.
+match_values2(Stats) ->
+    Items = lists:map(fun mk_item/1, Stats),
+    ?assertEqual(Stats, bear:get_statistics_subset(bear:test_values(), Items)).