| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| # |
| # DML Script to compute univariate statistics for all attributes in a given data set |
| # |
| # INPUT PARAMETERS: |
| # ------------------------------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT MEANING |
| # ------------------------------------------------------------------------------------------------- |
| # X String --- Location of INPUT data matrix |
| # TYPES String --- Location of INPUT matrix whose first row lists the type of each feature |
| # (one entry per column of X): 1 for scale, 2 for nominal, 3 for ordinal |
| # CONSOLE_OUTPUT Boolean FALSE If TRUE, print summary statistics to console |
| # STATS String --- Location of OUTPUT matrix with summary statistics computed for |
| # all features (17 statistics - 14 scale, 3 categorical) |
| # ------------------------------------------------------------------------------------------------- |
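| # OUTPUT: Matrix of summary statistics with 17 rows and one column per feature of X; |
| # rows 1-14 hold the scale statistics, rows 15-17 the categorical ones, and |
| # entries that are not computed for a feature stay 0 |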
| # |
| # HOW TO INVOKE THIS SCRIPT - EXAMPLE: |
| # hadoop jar SystemDS.jar -f Univar-Stats.dml -nvargs X=data/haberman.data TYPES=data/types.csv |
| # STATS=data/univarOut.mtx CONSOLE_OUTPUT=TRUE |
| # |
| |
| # Optional flag: when TRUE the per-feature summary is also printed to the console. |
| consoleOutput = ifdef($CONSOLE_OUTPUT, FALSE); |
| |
| A = read($X); # data file: one row per record, one column per feature |
| K = read($TYPES); # attribute kind file, indexed below as K[1,i] |
| n = ncol(A); # number of features/attributes |
| m = nrow(A); # number of data records |
| numBaseStats = 17; # number of statistics (14 scale, 3 categorical) |
| |
| # matrix to store computed statistics: one row per statistic, one column per feature |
| baseStats = matrix(0, rows=numBaseStats, cols=n); |
| |
| # Compute max domain size among all categorical attributes. |
| # The (K > 1) indicator zeroes out the column maxima of scale attributes, so only |
| # nominal/ordinal columns contribute; the result sizes every category count table. |
| maxDomain = as.integer(max((K > 1) * colMaxs(A))); |
| |
| # Standard errors of skewness (se_g1) and kurtosis (se_g2) depend only on the |
| # record count m, so they are computed once here instead of once per scale column. |
| se_g1 = sqrt( (6/(m-2.0)) * (m/(m+1.0)) * ((m-1.0)/(m+3.0)) ); |
| se_g2 = sqrt( (4/(m+5.0)) * ((m^2-1)/(m-3.0)) * se_g1^2 ); |
| |
| # Compute the statistics of every feature. Iteration i reads column i of A and |
| # writes only column i of baseStats. |
| # NOTE(review): check=0 is understood to skip parfor's loop-dependency analysis - |
| # confirm against the DML language reference. |
| parfor(i in 1:n, check=0) { |
|   # project out the i^th column |
|   F = A[,i]; |
| |
|   kind = as.scalar(K[1,i]); |
|   minF = min(F); |
|   maxF = max(F); |
| |
|   if ( kind == 1 ) { |
|     # compute SCALE statistics on the projected column (rows 1-14) |
|     rng = maxF - minF; |
| |
|     mu = mean(F); |
|     m2 = moment(F, 2); |
|     m3 = moment(F, 3); |
|     m4 = moment(F, 4); |
| |
|     # variance is the 2nd moment rescaled by m/(m-1) |
|     var = m/(m-1.0)*m2; |
|     std_dev = sqrt(var); |
|     se = std_dev/sqrt(m); # standard error of the mean |
|     cv = std_dev/mu; # coefficient of variation |
| |
|     g1 = m3/(std_dev^3); # skewness |
|     g2 = m4/(std_dev^4) - 3; # kurtosis (shifted by -3) |
| |
|     md = median(F); |
|     iqm = interQuartileMean(F); |
| |
|     baseStats[1:14,i] = as.matrix(list(minF, maxF, rng, |
|       mu, var, std_dev, se, cv, g1, g2, se_g1, se_g2, md, iqm)); |
|   } |
|   else if (kind == 2 | kind == 3) { |
|     # check if the categorical column has valid values |
|     if( minF <= 0 ) { |
|       # report the problem and leave rows 15-17 of this column at 0 |
|       print("ERROR: Categorical attributes can only take values starting from 1. Encountered a value " + minF + " in attribute " + i); |
|     } |
|     else { |
|       # compute CATEGORICAL statistics on the projected column (rows 15-17) |
|       cat_counts = table(F,1, maxDomain, 1); # counts for each category |
|       mode = as.scalar(rowIndexMax(t(cat_counts))); # category with the highest count |
|       numModes = sum(cat_counts == max(cat_counts)); # how many categories share that count |
| |
|       baseStats[15:17,i] = as.matrix(list(maxF, mode, numModes)); |
|     } |
|   } |
|   # any other kind value: nothing is written for column i |
| } |
| |
| # Optionally echo the collected statistics to the console, one feature at a time. |
| if (consoleOutput == TRUE) { |
|   for(j in 1:n) { |
|     print("-------------------------------------------------"); |
|     ftype = as.scalar(K[1,j]); |
|     col = baseStats[,j]; # all 17 statistics of feature j |
|     if (ftype == 1) { |
|       # scale feature: rows 1-14 |
|       print("Feature [" + j + "]: Scale"); |
|       print(" (01) Minimum | " + as.scalar(col[1,1])); |
|       print(" (02) Maximum | " + as.scalar(col[2,1])); |
|       print(" (03) Range | " + as.scalar(col[3,1])); |
|       print(" (04) Mean | " + as.scalar(col[4,1])); |
|       print(" (05) Variance | " + as.scalar(col[5,1])); |
|       print(" (06) Std deviation | " + as.scalar(col[6,1])); |
|       print(" (07) Std err of mean | " + as.scalar(col[7,1])); |
|       print(" (08) Coeff of variation | " + as.scalar(col[8,1])); |
|       print(" (09) Skewness | " + as.scalar(col[9,1])); |
|       print(" (10) Kurtosis | " + as.scalar(col[10,1])); |
|       print(" (11) Std err of skewness | " + as.scalar(col[11,1])); |
|       print(" (12) Std err of kurtosis | " + as.scalar(col[12,1])); |
|       print(" (13) Median | " + as.scalar(col[13,1])); |
|       print(" (14) Interquartile mean | " + as.scalar(col[14,1])); |
|     } |
|     else if (ftype == 2 | ftype == 3) { |
|       # categorical feature: rows 15-17, printed as integers |
|       if (ftype == 2) { |
|         print("Feature [" + j + "]: Categorical (Nominal)"); |
|       } |
|       else { |
|         print("Feature [" + j + "]: Categorical (Ordinal)"); |
|       } |
|       print(" (15) Num of categories | " + as.integer(as.scalar(col[15,1]))); |
|       print(" (16) Mode | " + as.integer(as.scalar(col[16,1]))); |
|       print(" (17) Num of modes | " + as.integer(as.scalar(col[17,1]))); |
|     } |
|   } |
| } |
| |
| # Persist the full statistics matrix to the location given by STATS. |
| write(baseStats, $STATS); |