blob: 386b5aef3638f32035c67fd8af691d2e60f9a0bb [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
/*
*
* For a given pair of attribute sets, compute bivariate statistics between all attribute pairs
* Given, S_1 = {A_11, A_12, ... A_1m} and S_2 = {A_21, A_22, ... A_2n}
* compute bivariate stats for m*n pairs (A_1i, A_2j), (1<= i <=m) and (1<= j <=n)
*
* Seven inputs:
* $1) D - input data
* $2) S1 - First attribute set {A_11, A_12, ... A_1m}
* $3) S2 - Second attribute set {A_21, A_22, ... A_2n}
* $4) K1 - kind for attributes in S1
* $5) K2 - kind for attributes in S2
* kind=1 for scale, kind=2 for nominal, kind=3 for ordinal
* $6) numPairs - total number of pairs (m*n)
* $7) maxC - maximum number of categories in any categorical attribute
*
* One output:
* $6) output directory in which following four statistics files are created
* + bivar.stats - matrix with all 8 bivariate statistics computed for different attribute pairs
* (R, (chi-sq, df, pval, cramersv), spearman, Eta, F)
* + categorical.counts -
* + categorical.means -
* + categorical.variances -
* -> Values in these three matrices are applicable only for scale-categorical attribute pairs.
* k^th column in these matrices denote the attribute pair (A_1i,A_2j) where i*j = k.
*/
D = read($1, rows=$7, cols=$8); # input data set
S1 = read($2, rows=1, cols=$9); # attribute set 1
S2 = read($3, rows=1, cols=$9); # attribute set 2
K1 = read($4, rows=1, cols=$9); # kind for attributes in S1
K2 = read($5, rows=1, cols=$9); # kind for attributes in S2
numPairs = $10; # number of attribute pairs (|S1|*|S2|)
maxC = $11; # max number of categories in any categorical attribute
s1size = ncol(S1);
s2size = ncol(S2);
#numpairs = s1size * s2size;
#print(s1size + ", " + s2size + ", " + numpairs);
# R, chisq, cramers, spearman, eta, anovaf
numstats = 8;
basestats = matrix(0, rows=numstats, cols=numPairs);
cat_counts = matrix(0, rows=maxC, cols=numPairs);
cat_means = matrix(0, rows=maxC, cols=numPairs);
cat_vars = matrix(0, rows=maxC, cols=numPairs);
dummy = matrix(1, rows=1, cols=1);
parfor( i in 1:s1size, check=0, opt=HEURISTIC) {
a1 = as.scalar(S1[,i]);
k1 = as.scalar(K1[1,i]);
A1 = D[,a1];
parfor( j in 1:s2size, check=0) {
pairID = (i-1)*s2size+j;
a2 = as.scalar(S2[,j]);
k2 = as.scalar(K2[1,j]);
A2 = D[,a2];
if (k1 == k2) {
if (k1 == 1) {
# scale-scale
print("[" + i + "," + j + "] scale-scale");
r = bivar_ss(A1,A2);
basestats[1,pairID] = dummy*r;
} else {
# nominal-nominal or ordinal-ordinal
print("[" + i + "," + j + "] categorical-categorical");
[chisq, df, pval, cramersv] = bivar_cc(A1,A2);
basestats[2,pairID] = dummy*chisq;
basestats[3,pairID] = dummy*df;
basestats[4,pairID] = dummy*pval;
basestats[5,pairID] = dummy*cramersv;
if ( k1 == 3 ) {
# ordinal-ordinal
print("[" + i + "," + j + "] ordinal-ordinal");
sp = bivar_oo(A1, A2);
basestats[6,pairID] = dummy*sp;
}
}
}
else {
if (k1 == 1 | k2 == 1) {
# Scale-nominal/ordinal
print("[" + i + "," + j + "] scale-categorical");
if ( k1 == 1 ) {
[eta,f, counts, means, vars] = bivar_sc(A1,A2);
}
else {
[eta,f, counts, means, vars] = bivar_sc(A2,A1);
}
basestats[7,pairID] = dummy*eta;
basestats[8,pairID] = dummy*f;
cat_counts[1:nrow(counts),pairID] = counts;
cat_means[1:nrow(means),pairID] = means;
cat_vars[1:nrow(vars),pairID] = vars;
}
else {
# nominal-ordinal or ordinal-nominal
print("[" + i + "," + j + "] categorical-categorical");
[chisq, df, pval, cramersv] = bivar_cc(A1,A2);
basestats[2,pairID] = dummy*chisq;
basestats[3,pairID] = dummy*df;
basestats[4,pairID] = dummy*pval;
basestats[5,pairID] = dummy*cramersv;
}
}
}
}
write(basestats, $6 + "/bivar.stats");
write(cat_counts, $6 + "/category.counts");
write(cat_means, $6 + "/category.means");
write(cat_vars, $6 + "/category.variances");
# -----------------------------------------------------------------------------------------------------------
bivar_cc = function(Matrix[Double] A, Matrix[Double] B) return (Double chisq, Double df, Double pval, Double cramersv) {
# Contingency Table
F = table(A,B);
# Chi-Squared
W = sum(F);
r = rowSums(F);
c = colSums(F);
E = (r %*% c)/W;
T = (F-E)^2/E;
chi_squared = sum(T);
# compute p-value
degFreedom = (nrow(F)-1)*(ncol(F)-1);
pValue = pchisq(target=chi_squared, df=degFreedom, lower.tail=FALSE);
# Cramer's V
R = nrow(F);
C = ncol(F);
q = min(R,C);
cramers_v = sqrt(chi_squared/(W*(q-1)));
# Assign return values
chisq = chi_squared;
df = degFreedom;
pval = pValue;
cramersv = cramers_v;
}
# -----------------------------------------------------------------------------------------------------------
bivar_ss = function(Matrix[Double] X, Matrix[Double] Y) return (Double R) {
# Unweighted co-variance
covXY = cov(X,Y);
# compute standard deviations for both X and Y by computing 2^nd central moment
W = nrow(X);
m2X = moment(X,2);
m2Y = moment(Y,2);
sigmaX = sqrt(m2X * (W/(W-1.0)) );
sigmaY = sqrt(m2Y * (W/(W-1.0)) );
# Pearson's R
R = covXY / (sigmaX*sigmaY);
}
# -----------------------------------------------------------------------------------------------------------
# Y points to SCALE variable
# A points to CATEGORICAL variable
bivar_sc = function(Matrix[Double] Y, Matrix[Double] A) return (Double Eta, Double AnovaF, Matrix[Double] CFreqs, Matrix[Double] CMeans, Matrix[Double] CVars ) {
# mean and variance in target variable
W = nrow(A);
my = mean(Y);
varY = moment(Y,2) * W/(W-1.0)
# category-wise (frequencies, means, variances)
CFreqs = aggregate(target=Y, groups=A, fn="count");
CMeans = aggregate(target=Y, groups=A, fn="mean");
CVars = aggregate(target=Y, groups=A, fn="variance");
# number of categories
R = nrow(CFreqs);
Eta = sqrt(1 - ( sum((CFreqs-1)*CVars) / ((W-1)*varY) ));
anova_num = sum( (CFreqs*(CMeans-my)^2) )/(R-1);
anova_den = sum( (CFreqs-1)*CVars )/(W-R);
AnovaF = anova_num/anova_den;
}
# -----------------------------------------------------------------------------------------------------------
# -----------------------------------------------------------------------------------------------------------
# Function to compute ranks
# takes a column vector as input, and produces a vector of same size in which each cell denotes to the computed score for that category
computeRanks = function(Matrix[Double] X) return (Matrix[Double] Ranks) {
dummy = matrix(1, rows=1, cols=1);
Rks = X;
size = nrow(X);
for(i in 1:size) {
prefixSum = 0.0;
if( i>1 ){
prefixSum = sum(X[1:(i-1),1]);
}
Rks[i,1] = dummy * (prefixSum + ((as.scalar(X[i,1])+1)/2));
}
Ranks = Rks;
}
#-------------------------------------------------------------------------
bivar_oo = function(Matrix[Double] A, Matrix[Double] B) return (Double sp) {
# compute contingency table
F = table(A,B);
catA = nrow(F); # number of categories in A
catB = ncol(F); # number of categories in B
# compute category-wise counts for both the attributes
R = rowSums(F);
S = colSums(F);
# compute scores, both are column vectors
[C] = computeRanks(R);
meanX = mean(C,R);
columnS = t(S);
[D] = computeRanks(columnS);
# scores (C,D) are individual values, and counts (R,S) act as weights
meanY = mean(D,columnS);
W = sum(F); # total weight, or total #cases
varX = moment(C,R,2)*(W/(W-1.0));
varY = moment(D,columnS,2)*(W/(W-1.0));
covXY = 0.0;
for(i in 1:catA) {
covXY = covXY + sum((F[i,]/(W-1)) * (as.scalar(C[i,1])-meanX) * (t(D[,1])-meanY));
}
sp = covXY/(sqrt(varX)*sqrt(varY));
}
# -----------------------------------------------------------------------------------------------------------