# scripts/algorithms/Univar-Stats.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------

 #
 # DML Script to compute univariate statistics for all attributes in a given data set
 #
 # INPUT PARAMETERS:
 # -------------------------------------------------------------------------------------------------
 # NAME           TYPE     DEFAULT  MEANING
 # -------------------------------------------------------------------------------------------------
 # X              String   ---      Location of INPUT data matrix
 # TYPES          String   ---      Location of INPUT single-row matrix (one entry per feature) that
 #                                  lists the feature types: 1 for scale, 2 for nominal, 3 for ordinal
 # CONSOLE_OUTPUT Boolean  FALSE    If TRUE, print summary statistics to console
 # STATS          String   ---      Location of OUTPUT matrix with summary statistics computed for
 #                                  all features (17 statistics - 14 scale, 3 categorical)
 # -------------------------------------------------------------------------------------------------
 # OUTPUT: Matrix of summary statistics
 #
 # HOW TO INVOKE THIS SCRIPT - EXAMPLE:
 # hadoop jar SystemML.jar -f Univar-Stats.dml -nvargs X=data/haberman.data TYPES=data/types.csv
 #    STATS=data/univarOut.mtx CONSOLE_OUTPUT=TRUE
 #

 # Optional flag: echo the per-feature summary on the console (off by default).
 consoleOutput = ifdef($CONSOLE_OUTPUT, FALSE);

 # Inputs: the data matrix and the matrix of per-feature type codes.
 A = read($X);
 K = read($TYPES);

 # Dimensions of the data: m records (rows) by n features (columns).
 m = nrow(A);
 n = ncol(A);

 # Largest type code present in K.
 max_kind = max(K);

 # One output column per feature; rows 1-14 receive the scale statistics and
 # rows 15-17 the categorical ones.
 numBaseStats = 17;
 baseStats = matrix(0, rows=numBaseStats, cols=n);

 # Largest value found in any column whose type code is > 1 (nominal/ordinal);
 # used below as the common row count of every category-count table.
 maxDomain = as.integer(max((K > 1) * colMaxs(A)));

 # The standard errors of skewness and kurtosis are functions of the record
 # count m alone, so they are evaluated once here and shared by all iterations
 # instead of being recomputed for every scale column.
 #se_g1=sqrt( 6*m*(m-1.0) / ((m-2.0)*(m+1.0)*(m+3.0)) );
 se_g1 = sqrt( (6/(m-2.0)) * (m/(m+1.0)) * ((m-1.0)/(m+3.0)) );
 #se_g2= sqrt( (4*(m^2-1)*se_g1^2)/((m+5.0)*(m-3.0)) );
 se_g2 = sqrt( (4/(m+5.0)) * ((m^2-1)/(m-3.0)) * se_g1^2 );

 # Fill column i of baseStats for every feature i. Each iteration writes only
 # to its own column. (check=0: see the parfor options in the DML reference.)
 parfor(i in 1:n, check=0) {

 	# project out the i^th column
 	F = A[,i];

 	# type code of feature i; K is indexed as a single-row matrix
 	kind = as.scalar(K[1,i]);

 	if ( kind == 1 ) {
 		# SCALE feature: fills rows 1-14 of column i
 		minimum = min(F);
 		maximum = max(F);
 		rng = maximum - minimum;

 		mu = mean(F);
 		m2 = moment(F, 2);
 		m3 = moment(F, 3);
 		m4 = moment(F, 4);

 		# variance rescales the 2nd moment by m/(m-1)
 		var = m/(m-1.0)*m2;
 		std_dev = sqrt(var);
 		se = std_dev/sqrt(m);   # standard error of the mean
 		cv = std_dev/mu;        # coefficient of variation

 		g1 = m3/(std_dev^3);      # skewness
 		g2 = m4/(std_dev^4) - 3;  # kurtosis (3 subtracted)

 		md = median(F); #quantile(F, 0.5);
 		iqm = interQuartileMean(F);

 		# place the computed statistics in output matrices
 		baseStats[1,i] = minimum;
 		baseStats[2,i] = maximum;
 		baseStats[3,i] = rng;

 		baseStats[4,i] = mu;
 		baseStats[5,i] = var;
 		baseStats[6,i] = std_dev;
 		baseStats[7,i] = se;
 		baseStats[8,i] = cv;

 		baseStats[9,i] = g1;
 		baseStats[10,i] = g2;
 		baseStats[11,i] = se_g1;
 		baseStats[12,i] = se_g2;

 		baseStats[13,i] = md;
 		baseStats[14,i] = iqm;
 	}
 	else if (kind == 2 | kind == 3) {
 		# CATEGORICAL feature (nominal or ordinal): fills rows 15-17 of column i

 		# check if the categorical column has valid values
 		minF = min(F);
 		if (minF <= 0) {
 			# invalid column: report it and leave its statistics at 0
 			print("ERROR: Categorical attributes can only take values starting from 1. Encountered a value " + minF + " in attribute " + i);
 		}
 		else {
 			num_cat = max(F); # number of categories

 			# counts for each category, as a (maxDomain x 1) table
 			cat_counts = table(F, 1, maxDomain, 1);

 			# index of the most frequent category, and how many categories
 			# share that highest count
 			mode = rowIndexMax(t(cat_counts));
 			numModes = sum(cat_counts == max(cat_counts));

 			# place the computed statistics in output matrices
 			baseStats[15,i] = num_cat;
 			baseStats[16,i] = mode;
 			baseStats[17,i] = numModes;
 		}
 	}
 }

 # Optional console report: one section per feature, in column order.
 if (consoleOutput == TRUE) {
 	for(j in 1:n) {
 		print("-------------------------------------------------");
 		S = baseStats[,j];   # the 17 statistics stored for feature j
 		kind = as.scalar(K[1,j]);
 		if (kind == 1) {
 			# scale feature: rows 1-14
 			print("Feature [" + j + "]: Scale");
 			print(" (01) Minimum             | " + as.scalar(S[1,1]));
 			print(" (02) Maximum             | " + as.scalar(S[2,1]));
 			print(" (03) Range               | " + as.scalar(S[3,1]));
 			print(" (04) Mean                | " + as.scalar(S[4,1]));
 			print(" (05) Variance            | " + as.scalar(S[5,1]));
 			print(" (06) Std deviation       | " + as.scalar(S[6,1]));
 			print(" (07) Std err of mean     | " + as.scalar(S[7,1]));
 			print(" (08) Coeff of variation  | " + as.scalar(S[8,1]));
 			print(" (09) Skewness            | " + as.scalar(S[9,1]));
 			print(" (10) Kurtosis            | " + as.scalar(S[10,1]));
 			print(" (11) Std err of skewness | " + as.scalar(S[11,1]));
 			print(" (12) Std err of kurtosis | " + as.scalar(S[12,1]));
 			print(" (13) Median              | " + as.scalar(S[13,1]));
 			print(" (14) Interquartile mean  | " + as.scalar(S[14,1]));
 		}
 		else if (kind == 2 | kind == 3) {
 			# categorical feature: rows 15-17, printed as integers
 			if (kind == 2) {
 				print("Feature [" + j + "]: Categorical (Nominal)");
 			}
 			else {
 				print("Feature [" + j + "]: Categorical (Ordinal)");
 			}
 			print(" (15) Num of categories   | " + as.integer(as.scalar(S[15,1])));
 			print(" (16) Mode                | " + as.integer(as.scalar(S[16,1])));
 			print(" (17) Num of modes        | " + as.integer(as.scalar(S[17,1])));
 		}
 	}
 }

 # Persist the full statistics matrix to the location given by STATS.
 write(baseStats, $STATS);
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	#-------------------------------------------------------------

	#
	# DML Script to compute univariate statistics for all attributes in a given data set
	#
	# INPUT PARAMETERS:
	# -------------------------------------------------------------------------------------------------
	# NAME           TYPE     DEFAULT  MEANING
	# -------------------------------------------------------------------------------------------------
	# X              String   ---      Location of INPUT data matrix
	# TYPES          String   ---      Location of INPUT matrix that lists the types of the features:
	#                                  1 for scale, 2 for nominal, 3 for ordinal
	# CONSOLE_OUTPUT Boolean  FALSE    If TRUE, print summary statistics to console
	# STATS          String   ---      Location of OUTPUT matrix with summary statistics computed for
	#                                  all features (17 statistics - 14 scale, 3 categorical)
	# -------------------------------------------------------------------------------------------------
	# OUTPUT: Matrix of summary statistics
	#
	# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
	# hadoop jar SystemML.jar -f Univar-Stats.dml -nvargs X=data/haberman.data TYPES=data/types.csv
	#    STATS=data/univarOut.mtx CONSOLE_OUTPUT=TRUE
	#

	# Optional flag: echo the per-feature summary on the console (off by default).
	consoleOutput = ifdef($CONSOLE_OUTPUT, FALSE);

	# Inputs: the data matrix and the matrix of per-feature type codes.
	A = read($X);
	K = read($TYPES);

	# Dimensions of the data: m records (rows) by n features (columns).
	m = nrow(A);
	n = ncol(A);

	# Largest type code present in K.
	max_kind = max(K);

	# One output column per feature; rows 1-14 receive the scale statistics and
	# rows 15-17 the categorical ones.
	numBaseStats = 17;
	baseStats = matrix(0, rows=numBaseStats, cols=n);

	# Largest value found in any column whose type code is > 1 (nominal/ordinal);
	# used below as the common row count of every category-count table.
	maxDomain = as.integer(max((K > 1) * colMaxs(A)));

	# The standard errors of skewness and kurtosis are functions of the record
	# count m alone, so they are evaluated once here and shared by all iterations
	# instead of being recomputed for every scale column.
	#se_g1=sqrt( 6*m*(m-1.0) / ((m-2.0)*(m+1.0)*(m+3.0)) );
	se_g1 = sqrt( (6/(m-2.0)) * (m/(m+1.0)) * ((m-1.0)/(m+3.0)) );
	#se_g2= sqrt( (4*(m^2-1)*se_g1^2)/((m+5.0)*(m-3.0)) );
	se_g2 = sqrt( (4/(m+5.0)) * ((m^2-1)/(m-3.0)) * se_g1^2 );

	# Fill column i of baseStats for every feature i. Each iteration writes only
	# to its own column. (check=0: see the parfor options in the DML reference.)
	parfor(i in 1:n, check=0) {

		# project out the i^th column
		F = A[,i];

		# type code of feature i; K is indexed as a single-row matrix
		kind = as.scalar(K[1,i]);

		if ( kind == 1 ) {
			# SCALE feature: fills rows 1-14 of column i
			minimum = min(F);
			maximum = max(F);
			rng = maximum - minimum;

			mu = mean(F);
			m2 = moment(F, 2);
			m3 = moment(F, 3);
			m4 = moment(F, 4);

			# variance rescales the 2nd moment by m/(m-1)
			var = m/(m-1.0)*m2;
			std_dev = sqrt(var);
			se = std_dev/sqrt(m);   # standard error of the mean
			cv = std_dev/mu;        # coefficient of variation

			g1 = m3/(std_dev^3);      # skewness
			g2 = m4/(std_dev^4) - 3;  # kurtosis (3 subtracted)

			md = median(F); #quantile(F, 0.5);
			iqm = interQuartileMean(F);

			# place the computed statistics in output matrices
			baseStats[1,i] = minimum;
			baseStats[2,i] = maximum;
			baseStats[3,i] = rng;

			baseStats[4,i] = mu;
			baseStats[5,i] = var;
			baseStats[6,i] = std_dev;
			baseStats[7,i] = se;
			baseStats[8,i] = cv;

			baseStats[9,i] = g1;
			baseStats[10,i] = g2;
			baseStats[11,i] = se_g1;
			baseStats[12,i] = se_g2;

			baseStats[13,i] = md;
			baseStats[14,i] = iqm;
		}
		else if (kind == 2 | kind == 3) {
			# CATEGORICAL feature (nominal or ordinal): fills rows 15-17 of column i

			# check if the categorical column has valid values
			minF = min(F);
			if (minF <= 0) {
				# invalid column: report it and leave its statistics at 0
				print("ERROR: Categorical attributes can only take values starting from 1. Encountered a value " + minF + " in attribute " + i);
			}
			else {
				num_cat = max(F); # number of categories

				# counts for each category, as a (maxDomain x 1) table
				cat_counts = table(F, 1, maxDomain, 1);

				# index of the most frequent category, and how many categories
				# share that highest count
				mode = rowIndexMax(t(cat_counts));
				numModes = sum(cat_counts == max(cat_counts));

				# place the computed statistics in output matrices
				baseStats[15,i] = num_cat;
				baseStats[16,i] = mode;
				baseStats[17,i] = numModes;
			}
		}
	}

	# Optional console report: one section per feature, in column order.
	# The "|" separators and label padding below are written without the stray
	# backslashes so the condition parses and the value column lines up.
	if (consoleOutput == TRUE) {
		for(i in 1:n) {
			print("-------------------------------------------------");
			kind = as.scalar(K[1,i]);
			if (kind == 1) {
				# scale feature: rows 1-14 of column i
				print("Feature [" + i + "]: Scale");
				print(" (01) Minimum             | " + as.scalar(baseStats[1,i]));
				print(" (02) Maximum             | " + as.scalar(baseStats[2,i]));
				print(" (03) Range               | " + as.scalar(baseStats[3,i]));
				print(" (04) Mean                | " + as.scalar(baseStats[4,i]));
				print(" (05) Variance            | " + as.scalar(baseStats[5,i]));
				print(" (06) Std deviation       | " + as.scalar(baseStats[6,i]));
				print(" (07) Std err of mean     | " + as.scalar(baseStats[7,i]));
				print(" (08) Coeff of variation  | " + as.scalar(baseStats[8,i]));
				print(" (09) Skewness            | " + as.scalar(baseStats[9,i]));
				print(" (10) Kurtosis            | " + as.scalar(baseStats[10,i]));
				print(" (11) Std err of skewness | " + as.scalar(baseStats[11,i]));
				print(" (12) Std err of kurtosis | " + as.scalar(baseStats[12,i]));
				print(" (13) Median              | " + as.scalar(baseStats[13,i]));
				print(" (14) Interquartile mean  | " + as.scalar(baseStats[14,i]));
			} else {
				if (kind == 2 | kind == 3) {
					# categorical feature: rows 15-17, printed as integers
					if (kind == 2) {
						print("Feature [" + i + "]: Categorical (Nominal)");
					} else {
						print("Feature [" + i + "]: Categorical (Ordinal)");
					}
					print(" (15) Num of categories   | " + as.integer(as.scalar(baseStats[15,i])));
					print(" (16) Mode                | " + as.integer(as.scalar(baseStats[16,i])));
					print(" (17) Num of modes        | " + as.integer(as.scalar(baseStats[17,i])));
				}
			}
		}
	}

	# Persist the full statistics matrix to the location given by STATS.
	write(baseStats, $STATS);