src/folsom_statistics_scutil.erl - couchdb-folsom - Git at Google

 %% taken from http://crunchyd.com/scutil/
 %% All code here is MIT Licensed
 %%  http://scutil.com/license.html

 -module(folsom_statistics_scutil).

 -export([
          arithmetic_mean/1,
          geometric_mean/1,
          harmonic_mean/1,
          kendall_correlation/2,
          spearman_correlation/2,
          pearson_correlation/2
         ]).

 -compile([native]).

 % http://en.wikipedia.org/wiki/Arithmetic_mean
 arithmetic_mean([]) ->
     0.0;
 arithmetic_mean(List) when is_list(List) ->
     lists:sum(List) / length(List).

 % http://en.wikipedia.org/wiki/Geometric_mean
 geometric_mean([]) ->
     0.0;
 geometric_mean(List) when is_list(List) ->
     math:exp(arithmetic_mean([ math:log(X) || X<-List ])).

 % http://en.wikipedia.org/wiki/Harmonic_mean
 harmonic_mean([]) ->
     0.0;
 harmonic_mean(List) when is_list(List) ->
   length(List) / lists:sum([ 1/X || X<-List ]).


 % seems to match the value returned by the 'cor' (method="kendal") R function
 % http://en.wikipedia.org/wiki/Kendall_tau_rank_correlation_coefficient
 kendall_correlation(List1, List2) when is_list(List1), is_list(List2) ->
     {RA,_} = lists:unzip(tied_ordered_ranking(List1)),
     {RB,_} = lists:unzip(tied_ordered_ranking(List2)),

     Ordering = lists:keysort(1, lists:zip(RA,RB)),
     {_,OrdB} = lists:unzip(Ordering),

     N = length(List1),
     P = lists:sum(kendall_right_of(OrdB, [])),

     -(( (4*P) / (N * (N - 1))) - 1).

 % seems to match the value returned by the 'cor' (method="spearman") R function
 % http://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient
 spearman_correlation(List1, List2) when is_list(List1), is_list(List2) ->
     {TR1,_} = lists:unzip(simple_ranking(List1)),
     {TR2,_} = lists:unzip(simple_ranking(List2)),

     Numerator   = 6 * lists:sum([ square(D1-D2) || {D1,D2} <- lists:zip(TR1,TR2) ]),
     Denominator = math:pow(length(List1),3)-length(List1),

     1-(Numerator/Denominator).

 % seems to match the value returned by the 'cor' (method="pearson") R function
 % http://en.wikipedia.org/wiki/Pearson_correlation_coefficient
 pearson_correlation(List1, List2) when is_list(List1), is_list(List2) ->
     SumXY = lists:sum([A*B || {A,B} <- lists:zip(List1,List2) ]), % the sum of the products of each matched pair

     SumX  = lists:sum(List1),
     SumY  = lists:sum(List2),

     SumXX = lists:sum([L*L || L<-List1]), % the sums of the squared items
     SumYY = lists:sum([L*L || L<-List2]),

     N     = length(List1),

     case math:sqrt(   ( (N*SumXX)-(SumX*SumX) )   *   ( (N*SumYY)-(SumY*SumY) )   ) of
         0 ->
             0.0; % some nasty value sets otherwise cause divide by zero
         0.0 ->
             0.0; % eg [ [1,1,1,1,1], [1,1,2,1,2] ]
         Denom ->
           Numer = (N*SumXY) - (SumX * SumY),
           (Numer/Denom)
     end.

 %%%===================================================================
 %%% Internal functions
 %%%==================================================================

 simple_ranking(List) when is_list(List) ->
     lists:zip(lists:seq(1,length(List)),lists:reverse(lists:sort(List))).

 tied_ranking(List) ->
     tied_rank_worker(simple_ranking(List), [], no_prev_value).

 tied_ordered_ranking(List) when is_list(List) ->
     tied_ordered_ranking(List, tied_ranking(List), []).

 tied_ordered_ranking([], [], Work) ->
     lists:reverse(Work);

 tied_ordered_ranking([Front|Rem], Ranks, Work) ->
     {value,Item}  = lists:keysearch(Front,2,Ranks),
     {IRank,Front} = Item,
     tied_ordered_ranking(Rem, Ranks--[Item], [{IRank,Front}]++Work).

 square(Value) ->
     math:pow(Value, 2).

 kendall_right_of([], Work) ->
     lists:reverse(Work);
 kendall_right_of([F|R], Work) ->
     kendall_right_of(R, [kendall_right_of_item(F,R)]++Work).

 kendall_right_of_item(B, Rem) ->
     length([R || R <- Rem, R < B]).

 tied_add_prev(Work, {FoundAt, NewValue}) ->
     lists:duplicate( length(FoundAt), {lists:sum(FoundAt)/length(FoundAt), NewValue} ) ++ Work.

 tied_rank_worker([], Work, PrevValue) ->
     lists:reverse(tied_add_prev(Work, PrevValue));

 tied_rank_worker([Item|Remainder], Work, PrevValue) ->
     case PrevValue of
         no_prev_value ->
             {BaseRank,BaseVal} = Item,
             tied_rank_worker(Remainder, Work, {[BaseRank],BaseVal});
         {FoundAt,OldVal} ->
             case Item of
                 {Id,OldVal} ->
                     tied_rank_worker(Remainder, Work, {[Id]++FoundAt,OldVal});
                 {Id,NewVal} ->
                     tied_rank_worker(Remainder, tied_add_prev(Work, PrevValue), {[Id],NewVal})

             end
     end.
	%% taken from http://crunchyd.com/scutil/
	%% All code here is MIT Licensed
	%% http://scutil.com/license.html

	-module(folsom_statistics_scutil).

	-export([
	arithmetic_mean/1,
	geometric_mean/1,
	harmonic_mean/1,
	kendall_correlation/2,
	spearman_correlation/2,
	pearson_correlation/2
	]).

	-compile([native]).

	% http://en.wikipedia.org/wiki/Arithmetic_mean
	arithmetic_mean([]) ->
	0.0;
	arithmetic_mean(List) when is_list(List) ->
	lists:sum(List) / length(List).

	% http://en.wikipedia.org/wiki/Geometric_mean
	geometric_mean([]) ->
	0.0;
	geometric_mean(List) when is_list(List) ->
	math:exp(arithmetic_mean([ math:log(X) \|\| X<-List ])).

	% http://en.wikipedia.org/wiki/Harmonic_mean
	harmonic_mean([]) ->
	0.0;
	harmonic_mean(List) when is_list(List) ->
	length(List) / lists:sum([ 1/X \|\| X<-List ]).


	% seems to match the value returned by the 'cor' (method="kendal") R function
	% http://en.wikipedia.org/wiki/Kendall_tau_rank_correlation_coefficient
	kendall_correlation(List1, List2) when is_list(List1), is_list(List2) ->
	{RA,_} = lists:unzip(tied_ordered_ranking(List1)),
	{RB,_} = lists:unzip(tied_ordered_ranking(List2)),

	Ordering = lists:keysort(1, lists:zip(RA,RB)),
	{_,OrdB} = lists:unzip(Ordering),

	N = length(List1),
	P = lists:sum(kendall_right_of(OrdB, [])),

	-(( (4P) / (N (N - 1))) - 1).

	% seems to match the value returned by the 'cor' (method="spearman") R function
	% http://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient
	spearman_correlation(List1, List2) when is_list(List1), is_list(List2) ->
	{TR1,_} = lists:unzip(simple_ranking(List1)),
	{TR2,_} = lists:unzip(simple_ranking(List2)),

	Numerator = 6 * lists:sum([ square(D1-D2) \|\| {D1,D2} <- lists:zip(TR1,TR2) ]),
	Denominator = math:pow(length(List1),3)-length(List1),

	1-(Numerator/Denominator).

	% seems to match the value returned by the 'cor' (method="pearson") R function
	% http://en.wikipedia.org/wiki/Pearson_correlation_coefficient
	pearson_correlation(List1, List2) when is_list(List1), is_list(List2) ->
	SumXY = lists:sum([A*B \|\| {A,B} <- lists:zip(List1,List2) ]), % the sum of the products of each matched pair

	SumX = lists:sum(List1),
	SumY = lists:sum(List2),

	SumXX = lists:sum([L*L \|\| L<-List1]), % the sums of the squared items
	SumYY = lists:sum([L*L \|\| L<-List2]),

	N = length(List1),

	case math:sqrt( ( (NSumXX)-(SumXSumX) ) * ( (NSumYY)-(SumYSumY) ) ) of
	0 ->
	0.0; % some nasty value sets otherwise cause divide by zero
	0.0 ->
	0.0; % eg [ [1,1,1,1,1], [1,1,2,1,2] ]
	Denom ->
	Numer = (NSumXY) - (SumX SumY),
	(Numer/Denom)
	end.

	%%%===================================================================
	%%% Internal functions
	%%%==================================================================

	simple_ranking(List) when is_list(List) ->
	lists:zip(lists:seq(1,length(List)),lists:reverse(lists:sort(List))).

	tied_ranking(List) ->
	tied_rank_worker(simple_ranking(List), [], no_prev_value).

	tied_ordered_ranking(List) when is_list(List) ->
	tied_ordered_ranking(List, tied_ranking(List), []).

	tied_ordered_ranking([], [], Work) ->
	lists:reverse(Work);

	tied_ordered_ranking([Front\|Rem], Ranks, Work) ->
	{value,Item} = lists:keysearch(Front,2,Ranks),
	{IRank,Front} = Item,
	tied_ordered_ranking(Rem, Ranks--[Item], [{IRank,Front}]++Work).

	square(Value) ->
	math:pow(Value, 2).

	kendall_right_of([], Work) ->
	lists:reverse(Work);
	kendall_right_of([F\|R], Work) ->
	kendall_right_of(R, [kendall_right_of_item(F,R)]++Work).

	kendall_right_of_item(B, Rem) ->
	length([R \|\| R <- Rem, R < B]).

	tied_add_prev(Work, {FoundAt, NewValue}) ->
	lists:duplicate( length(FoundAt), {lists:sum(FoundAt)/length(FoundAt), NewValue} ) ++ Work.

	tied_rank_worker([], Work, PrevValue) ->
	lists:reverse(tied_add_prev(Work, PrevValue));

	tied_rank_worker([Item\|Remainder], Work, PrevValue) ->
	case PrevValue of
	no_prev_value ->
	{BaseRank,BaseVal} = Item,
	tied_rank_worker(Remainder, Work, {[BaseRank],BaseVal});
	{FoundAt,OldVal} ->
	case Item of
	{Id,OldVal} ->
	tied_rank_worker(Remainder, Work, {[Id]++FoundAt,OldVal});
	{Id,NewVal} ->
	tied_rank_worker(Remainder, tied_add_prev(Work, PrevValue), {[Id],NewVal})

	end
	end.