| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| |
| |
| /* |
| * |
| * For a given pair of attribute sets, compute bivariate statistics between all attribute pairs |
| * Given, S_1 = {A_11, A_12, ... A_1m} and S_2 = {A_21, A_22, ... A_2n} |
| * compute bivariate stats for m*n pairs (A_1i, A_2j), (1<= i <=m) and (1<= j <=n) |
| * |
| * Seven inputs: |
| * $1) D - input data |
| * $2) S1 - First attribute set {A_11, A_12, ... A_1m} |
| * $3) S2 - Second attribute set {A_21, A_22, ... A_2n} |
| * $4) K1 - kind for attributes in S1 |
| * $5) K2 - kind for attributes in S2 |
| * kind=1 for scale, kind=2 for nominal, kind=3 for ordinal |
| * $6) numPairs - total number of pairs (m*n) |
| * $7) maxC - maximum number of categories in any categorical attribute |
| * |
| * One output: |
| * $6) output directory in which following four statistics files are created |
| * + bivar.stats - matrix with all 8 bivariate statistics computed for different attribute pairs |
| * (R, (chi-sq, df, pval, cramersv), spearman, Eta, F) |
| * + categorical.counts - |
| * + categorical.means - |
| * + categorical.variances - |
| * -> Values in these three matrices are applicable only for scale-categorical attribute pairs. |
| * k^th column in these matrices denote the attribute pair (A_1i,A_2j) where i*j = k. |
| */ |
| |
| D = read($1, rows=$7, cols=$8); # input data set |
| S1 = read($2, rows=1, cols=$9); # attribute set 1 |
| S2 = read($3, rows=1, cols=$9); # attribute set 2 |
| K1 = read($4, rows=1, cols=$9); # kind for attributes in S1 |
| K2 = read($5, rows=1, cols=$9); # kind for attributes in S2 |
| numPairs = $10; # number of attribute pairs (|S1|*|S2|) |
| maxC = $11; # max number of categories in any categorical attribute |
| |
| s1size = ncol(S1); |
| s2size = ncol(S2); |
| |
| #numpairs = s1size * s2size; |
| #print(s1size + ", " + s2size + ", " + numpairs); |
| |
| # R, chisq, cramers, spearman, eta, anovaf |
| numstats = 8; |
| basestats = matrix(0, rows=numstats, cols=numPairs); |
| cat_counts = matrix(0, rows=maxC, cols=numPairs); |
| cat_means = matrix(0, rows=maxC, cols=numPairs); |
| cat_vars = matrix(0, rows=maxC, cols=numPairs); |
| |
| dummy = matrix(1, rows=1, cols=1); |
| |
| |
| parfor( i in 1:s1size, check=0, opt=HEURISTIC) { |
| a1 = as.scalar(S1[,i]); |
| k1 = as.scalar(K1[1,i]); |
| A1 = D[,a1]; |
| |
| parfor( j in 1:s2size, check=0) { |
| pairID = (i-1)*s2size+j; |
| a2 = as.scalar(S2[,j]); |
| k2 = as.scalar(K2[1,j]); |
| A2 = D[,a2]; |
| |
| if (k1 == k2) { |
| if (k1 == 1) { |
| # scale-scale |
| print("[" + i + "," + j + "] scale-scale"); |
| r = bivar_ss(A1,A2); |
| basestats[1,pairID] = dummy*r; |
| } else { |
| # nominal-nominal or ordinal-ordinal |
| print("[" + i + "," + j + "] categorical-categorical"); |
| [chisq, df, pval, cramersv] = bivar_cc(A1,A2); |
| basestats[2,pairID] = dummy*chisq; |
| basestats[3,pairID] = dummy*df; |
| basestats[4,pairID] = dummy*pval; |
| basestats[5,pairID] = dummy*cramersv; |
| |
| if ( k1 == 3 ) { |
| # ordinal-ordinal |
| print("[" + i + "," + j + "] ordinal-ordinal"); |
| sp = bivar_oo(A1, A2); |
| basestats[6,pairID] = dummy*sp; |
| } |
| } |
| } |
| else { |
| if (k1 == 1 | k2 == 1) { |
| # Scale-nominal/ordinal |
| print("[" + i + "," + j + "] scale-categorical"); |
| |
| if ( k1 == 1 ) { |
| [eta,f, counts, means, vars] = bivar_sc(A1,A2); |
| } |
| else { |
| [eta,f, counts, means, vars] = bivar_sc(A2,A1); |
| } |
| basestats[7,pairID] = dummy*eta; |
| basestats[8,pairID] = dummy*f; |
| cat_counts[1:nrow(counts),pairID] = counts; |
| cat_means[1:nrow(means),pairID] = means; |
| cat_vars[1:nrow(vars),pairID] = vars; |
| } |
| else { |
| # nominal-ordinal or ordinal-nominal |
| print("[" + i + "," + j + "] categorical-categorical"); |
| [chisq, df, pval, cramersv] = bivar_cc(A1,A2); |
| basestats[2,pairID] = dummy*chisq; |
| basestats[3,pairID] = dummy*df; |
| basestats[4,pairID] = dummy*pval; |
| basestats[5,pairID] = dummy*cramersv; |
| } |
| } |
| } |
| } |
| |
| write(basestats, $6 + "/bivar.stats"); |
| write(cat_counts, $6 + "/category.counts"); |
| write(cat_means, $6 + "/category.means"); |
| write(cat_vars, $6 + "/category.variances"); |
| |
| |
| # ----------------------------------------------------------------------------------------------------------- |
| |
| bivar_cc = function(Matrix[Double] A, Matrix[Double] B) return (Double chisq, Double df, Double pval, Double cramersv) { |
| |
| # Contingency Table |
| F = table(A,B); |
| |
| # Chi-Squared |
| W = sum(F); |
| r = rowSums(F); |
| c = colSums(F); |
| E = (r %*% c)/W; |
| T = (F-E)^2/E; |
| chi_squared = sum(T); |
| |
| # compute p-value |
| degFreedom = (nrow(F)-1)*(ncol(F)-1); |
| pValue = pchisq(target=chi_squared, df=degFreedom, lower.tail=FALSE); |
| |
| # Cramer's V |
| R = nrow(F); |
| C = ncol(F); |
| q = min(R,C); |
| cramers_v = sqrt(chi_squared/(W*(q-1))); |
| |
| # Assign return values |
| chisq = chi_squared; |
| df = degFreedom; |
| pval = pValue; |
| cramersv = cramers_v; |
| } |
| |
| # ----------------------------------------------------------------------------------------------------------- |
| |
| bivar_ss = function(Matrix[Double] X, Matrix[Double] Y) return (Double R) { |
| |
| # Unweighted co-variance |
| covXY = cov(X,Y); |
| |
| # compute standard deviations for both X and Y by computing 2^nd central moment |
| W = nrow(X); |
| m2X = moment(X,2); |
| m2Y = moment(Y,2); |
| sigmaX = sqrt(m2X * (W/(W-1.0)) ); |
| sigmaY = sqrt(m2Y * (W/(W-1.0)) ); |
| |
| # Pearson's R |
| R = covXY / (sigmaX*sigmaY); |
| } |
| |
| # ----------------------------------------------------------------------------------------------------------- |
| |
| # Y points to SCALE variable |
| # A points to CATEGORICAL variable |
| bivar_sc = function(Matrix[Double] Y, Matrix[Double] A) return (Double Eta, Double AnovaF, Matrix[Double] CFreqs, Matrix[Double] CMeans, Matrix[Double] CVars ) { |
| |
| # mean and variance in target variable |
| W = nrow(A); |
| my = mean(Y); |
| varY = moment(Y,2) * W/(W-1.0) |
| |
| # category-wise (frequencies, means, variances) |
| CFreqs = aggregate(target=Y, groups=A, fn="count"); |
| CMeans = aggregate(target=Y, groups=A, fn="mean"); |
| CVars = aggregate(target=Y, groups=A, fn="variance"); |
| |
| # number of categories |
| R = nrow(CFreqs); |
| |
| Eta = sqrt(1 - ( sum((CFreqs-1)*CVars) / ((W-1)*varY) )); |
| |
| anova_num = sum( (CFreqs*(CMeans-my)^2) )/(R-1); |
| anova_den = sum( (CFreqs-1)*CVars )/(W-R); |
| AnovaF = anova_num/anova_den; |
| } |
| |
| # ----------------------------------------------------------------------------------------------------------- |
| |
| |
| # ----------------------------------------------------------------------------------------------------------- |
| # Function to compute ranks |
| # takes a column vector as input, and produces a vector of same size in which each cell denotes to the computed score for that category |
| computeRanks = function(Matrix[Double] X) return (Matrix[Double] Ranks) { |
| dummy = matrix(1, rows=1, cols=1); |
| Rks = X; |
| size = nrow(X); |
| for(i in 1:size) { |
| prefixSum = 0.0; |
| if( i>1 ){ |
| prefixSum = sum(X[1:(i-1),1]); |
| } |
| Rks[i,1] = dummy * (prefixSum + ((as.scalar(X[i,1])+1)/2)); |
| } |
| Ranks = Rks; |
| } |
| |
| #------------------------------------------------------------------------- |
| |
| bivar_oo = function(Matrix[Double] A, Matrix[Double] B) return (Double sp) { |
| |
| # compute contingency table |
| F = table(A,B); |
| |
| catA = nrow(F); # number of categories in A |
| catB = ncol(F); # number of categories in B |
| |
| # compute category-wise counts for both the attributes |
| R = rowSums(F); |
| S = colSums(F); |
| |
| # compute scores, both are column vectors |
| [C] = computeRanks(R); |
| meanX = mean(C,R); |
| |
| columnS = t(S); |
| [D] = computeRanks(columnS); |
| |
| # scores (C,D) are individual values, and counts (R,S) act as weights |
| meanY = mean(D,columnS); |
| |
| W = sum(F); # total weight, or total #cases |
| varX = moment(C,R,2)*(W/(W-1.0)); |
| varY = moment(D,columnS,2)*(W/(W-1.0)); |
| |
| covXY = 0.0; |
| for(i in 1:catA) { |
| covXY = covXY + sum((F[i,]/(W-1)) * (as.scalar(C[i,1])-meanX) * (t(D[,1])-meanY)); |
| } |
| |
| sp = covXY/(sqrt(varX)*sqrt(varY)); |
| } |
| |
| # ----------------------------------------------------------------------------------------------------------- |