| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| # |
| # DML Script to compute univariate statistics for all attributes |
| # in a given data set |
| # |
| # Three inputs: |
| # $1) A - input data |
| # $2) K - row matrix that denotes the "kind" for each |
| # attribute |
| # kind=1 for scale, kind=2 for nominal, |
| # kind=3 for ordinal |
| # $3) maxC - maximum number of categories in any categorical |
| # attribute |
| # |
| # One output: |
| # $4) output directory in which the following two statistics |
| # files are created |
| # + base.stats - matrix with all 17 statistics (14 scale, |
| # 3 categorical) computed for all attributes |
| # + categorical.counts - matrix in which each column |
| # gives the category-wise counts for all categories in |
| # that attribute |
| # |
| # |
| |
| # ---- Inputs -------------------------------------------------- |
| A = read($1); # data file: one row per record, one column per attribute |
| K = read($2); # attribute kind file: 1 x n row (1=scale, 2=nominal, 3=ordinal) |
| maxC = $3; # max number of categories in any categorical attribute |
| |
| # maxC = 0 is accepted (data set without categorical attributes); |
| # only negative values are rejected. |
| if (maxC < 0) { |
|     print("ERROR: maximum number maxC of categories must be a non-negative value."); |
| } |
| else { |
| |
|     # number of features/attributes |
|     n = ncol(A); |
| |
|     # number of data records |
|     m = nrow(A); |
| |
|     # number of statistics: rows 1-14 hold scale statistics, |
|     # rows 15-17 hold categorical statistics |
|     numBaseStats = 17; |
| |
|     max_kind = max(K); |
| |
|     # matrix to store computed statistics (one column per attribute) |
|     baseStats = matrix(0, rows=numBaseStats, cols=n); |
| |
|     # Category counts are collected only if at least one attribute is |
|     # categorical AND there is room to store them. The same flag guards |
|     # allocation, use and output of countsArray so that the matrix is |
|     # never referenced without having been created. |
|     collectCounts = ((max_kind > 1) & (maxC > 0)); |
|     if (collectCounts) { |
|         countsArray = matrix(0, rows=maxC, cols=n); |
|     } |
| |
|     # Attributes are processed independently; each iteration writes only |
|     # column i of the result matrices. |
|     parfor(i in 1:n, check=0) { |
| |
|         # project out the i^th column |
|         F = A[,i]; |
| |
|         kind = as.scalar(K[1,i]); |
| |
|         if ( kind == 1 ) { |
|             print("[" + i + "] Scale"); |
|             # compute SCALE statistics on the projected column |
|             minimum = min(F); |
|             maximum = max(F); |
|             rng = maximum - minimum; |
| |
|             mu = mean(F); |
|             m2 = moment(F, 2); |
|             m3 = moment(F, 3); |
|             m4 = moment(F, 4); |
| |
|             # sample variance obtained by rescaling the 2nd moment by m/(m-1) |
|             var = m/(m-1.0)*m2; |
|             std_dev = sqrt(var); |
|             se = std_dev/sqrt(m); # standard error of the mean |
|             cv = std_dev/mu; # coefficient of variation |
| |
|             g1 = m3/(std_dev^3); # skewness |
|             g2 = m4/(std_dev^4) - 3; # kurtosis minus 3 |
| |
|             # Standard errors of g1 and g2. The products are factored into |
|             # small ratios; the single-fraction forms are kept for reference: |
|             #se_g1=sqrt( 6*m*(m-1.0) / ((m-2.0)*(m+1.0)*(m+3.0)) ); |
|             se_g1 = sqrt( (6/(m-2.0)) * (m/(m+1.0)) * ((m-1.0)/(m+3.0)) ); |
| |
|             #se_g2= sqrt( (4*(m^2-1)*se_g1^2)/((m+5.0)*(m-3.0)) ); |
|             se_g2 = sqrt( (4/(m+5.0)) * ((m^2-1)/(m-3.0)) * se_g1^2 ); |
| |
|             md = median(F); #quantile(F, 0.5); |
|             iqm = interQuartileMean(F); |
| |
|             # place the computed statistics in output matrices |
|             baseStats[1,i] = minimum; |
|             baseStats[2,i] = maximum; |
|             baseStats[3,i] = rng; |
| |
|             baseStats[4,i] = mu; |
|             baseStats[5,i] = var; |
|             baseStats[6,i] = std_dev; |
|             baseStats[7,i] = se; |
|             baseStats[8,i] = cv; |
| |
|             baseStats[9,i] = g1; |
|             baseStats[10,i] = g2; |
|             baseStats[11,i] = se_g1; |
|             baseStats[12,i] = se_g2; |
| |
|             baseStats[13,i] = md; |
|             baseStats[14,i] = iqm; |
|         } |
|         else { |
|             if (kind == 2 | kind == 3) { |
|                 print("[" + i + "] Categorical"); |
| |
|                 # check if the categorical column has valid values |
|                 minF = min(F); |
|                 if (minF <=0) { |
|                     print("ERROR: Categorical attributes can only take values starting from 1. Encountered a value " + minF + " in attribute " + i); |
|                 } |
|                 else { |
|                     # compute CATEGORICAL statistics on the projected column |
|                     cat_counts = table(F,1); # counts for each category |
|                     num_cat = nrow(cat_counts); # number of categories |
| |
|                     # mode = category with the highest count; numModes = |
|                     # number of categories that share that highest count |
|                     mode = rowIndexMax(t(cat_counts)); |
|                     mx = max(cat_counts); |
|                     modeArr = ppred(cat_counts, mx, "=="); |
|                     numModes = sum(modeArr); |
| |
|                     # place the computed statistics in output matrices |
|                     baseStats[15,i] = num_cat; |
|                     baseStats[16,i] = mode; |
|                     baseStats[17,i] = numModes; |
| |
|                     if (collectCounts) { |
|                         if (num_cat <= maxC) { |
|                             # cat_counts has num_cat rows, which may be fewer |
|                             # than maxC; write only the rows that exist and |
|                             # leave the remaining rows of the column at 0. |
|                             countsArray[1:num_cat,i] = cat_counts; |
|                         } |
|                         else { |
|                             print("ERROR: attribute " + i + " has " + num_cat + " categories, which exceeds maxC = " + maxC + "; its counts are not recorded."); |
|                         } |
|                     } |
|                 } |
|             } |
|         } |
|     } |
| |
|     # ---- Outputs --------------------------------------------- |
|     write(baseStats, $4+"/base.stats"); |
|     if (collectCounts) { |
|         write(countsArray, $4+"/categorical.counts"); |
|     } |
| |
| } |