src/test/scripts/applications/parfor/parfor_univariate4.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------

 #
 # DML Script to compute univariate statistics for all attributes
 # in a given data set
 #
 # Three inputs:
 #     $1) A - input data
 #     $2) K - row matrix that denotes the "kind" for each
 #              attribute
 #             kind=1 for scale, kind=2 for nominal,
 #             kind=3 for ordinal
 #     $3) maxC - maximum number of categories in any categorical
 #         attribute
 #
 # One output:
 #     $4) output directory in which the following two statistics
 #         files are created
 #         + base.stats - matrix with all 17 statistics (14 scale,
 #         3 categorical) computed for all attributes
 #         + categorical.counts - matrix in which each column
 #         gives the category-wise counts for all categories in
 #         that attribute
 #
 #

 # Computes per-attribute univariate statistics for every column of A in a
 # parfor loop. Scale attributes (kind 1) fill rows 1-14 of baseStats,
 # categorical attributes (kind 2 or 3) fill rows 15-17 and one column of
 # countsArray. Columns with any other kind are left as zeros.
 A = read($1); # data file
 K = read($2); # attribute kind file
 maxC = $3;  # max number of categories in any categorical attribute


 # A negative category bound is rejected: nothing is computed or written.
 if (maxC < 0) {
 	print("ERROR: maximum number maxC of categories must be a positve value.");
 }
 else {


 	# number of features/attributes
 	n = ncol(A);

 	# number of data records
 	m = nrow(A);

 	# number of statistics
 	numBaseStats = 17; # (14 scale stats, 3 categorical stats)

 	# largest kind present; a value > 1 means at least one categorical attribute
 	max_kind = max(K);

 	# matrices to store computed statistics (one column per attribute)
 	baseStats = matrix(0, rows=numBaseStats, cols=n);

 	if (maxC > 0) {
 		countsArray = matrix(0, rows=maxC, cols=n);
 	}

 	# Every iteration only writes column i of the result matrices, so the
 	# iterations are independent; check=0 disables the parfor dependency analysis.
 	parfor(i in 1:n, check=0) {

 		# project out the i^th column
 		F = A[,i];

 		kind = as.scalar(K[1,i]);

 		if ( kind == 1 ) {
 			print("[" + i + "] Scale");
 			# compute SCALE statistics on the projected column
 			minimum = min(F);
 			maximum = max(F);
 			rng = maximum - minimum;

 			# mean and 2nd/3rd/4th moments as returned by moment()
 			mu = mean(F);
 			m2 = moment(F, 2);
 			m3 = moment(F, 3);
 			m4 = moment(F, 4);

 			# variance rescaled by m/(m-1), then the derived dispersion measures
 			var = m/(m-1.0)*m2;
 			std_dev = sqrt(var);
 			se = std_dev/sqrt(m);
 			cv = std_dev/mu;

 			# skewness (g1) and excess kurtosis (g2) with their standard errors
 			g1 = m3/(std_dev^3);
 			g2 = m4/(std_dev^4) - 3;
 			#se_g1=sqrt( 6*m*(m-1.0) / ((m-2.0)*(m+1.0)*(m+3.0)) );
 			se_g1=sqrt( (6/(m-2.0)) * (m/(m+1.0)) * ((m-1.0)/(m+3.0)) );

 			#se_g2= sqrt( (4*(m^2-1)*se_g1^2)/((m+5.0)*(m-3.0)) );
 			se_g2=sqrt( (4/(m+5.0)) * ((m^2-1)/(m-3.0)) * se_g1^2 );

 			md = median(F) #quantile(F, 0.5);
 			iqm = interQuartileMean(F);

 			# place the computed statistics in output matrices
 			baseStats[1,i] = minimum;
 			baseStats[2,i] = maximum;
 			baseStats[3,i] = rng;

 			baseStats[4,i] = mu;
 			baseStats[5,i] = var;
 			baseStats[6,i] = std_dev;
 			baseStats[7,i] = se;
 			baseStats[8,i] = cv;

 			baseStats[9,i] = g1;
 			baseStats[10,i] = g2;
 			baseStats[11,i] = se_g1;
 			baseStats[12,i] = se_g2;

 			baseStats[13,i] = md;
 			baseStats[14,i] = iqm;
 		}
 		else {
 			if (kind == 2 | kind == 3) {
 				print("[" + i + "] Categorical");

 				# check if the categorical column has valid values
 				minF = min(F);
 				if (minF <=0) {
 					print("ERROR: Categorical attributes can only take values starting from 1. Encountered a value " + minF + " in attribute " + i);
 				}
 				else {
 					# compute CATEGORICAL statistics on the projected column

 					# number of categories = largest category id in the column
 					# (this is the row count that table(F,1) would produce)
 					num_cat = max(F);

 					# Counts for each category, sized to exactly maxC x 1 so the
 					# result always matches the height of countsArray. A plain
 					# table(F,1) has max(F) rows, which makes the column
 					# assignment below fail whenever max(F) != maxC.
 					cat_counts = table(F, 1, maxC, 1);

 					# index of the first most frequent category
 					mode = rowIndexMax(t(cat_counts));
 					# number of categories that share the maximum count
 					mx = max(cat_counts)
 					modeArr = (cat_counts == mx)
 					numModes = sum(modeArr);

 					# place the computed statistics in output matrices
 					baseStats[15,i] = num_cat;
 					baseStats[16,i] = mode;
 					baseStats[17,i] = numModes;

 					if (max_kind > 1) {
 						countsArray[,i] = cat_counts;
 					}
 				}
 			}
 		}
 	}

 	write(baseStats, $4+"/base.stats");
 	# the counts file is only produced when a categorical attribute exists
 	if (max_kind > 1) {
 		write(countsArray, $4+"/categorical.counts");
 	}

 }
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	#-------------------------------------------------------------

	#
	# DML Script to compute univariate statistics for all attributes
	# in a given data set
	#
	# Three inputs:
	# $1) A - input data
	# $2) K - row matrix that denotes the "kind" for each
	# attribute
	# kind=1 for scale, kind=2 for nominal,
	# kind=3 for ordinal
	# $3) maxC - maximum number of categories in any categorical
	# attribute
	#
	# One output:
	# $4) output directory in which the following two statistics
	# files are created
	# + base.stats - matrix with all 17 statistics (14 scale,
	# 3 categorical) computed for all attributes
	# + categorical.counts - matrix in which each column
	# gives the category-wise counts for all categories in
	# that attribute
	#
	#

	# Computes per-attribute univariate statistics for every column of A in a
	# parfor loop. Scale attributes (kind 1) fill rows 1-14 of baseStats,
	# categorical attributes (kind 2 or 3) fill rows 15-17 and one column of
	# countsArray. Columns with any other kind are left as zeros.
	A = read($1); # data file
	K = read($2); # attribute kind file
	maxC = $3; # max number of categories in any categorical attribute


	# A negative category bound is rejected: nothing is computed or written.
	if (maxC < 0) {
		print("ERROR: maximum number maxC of categories must be a positve value.");
	}
	else {


		# number of features/attributes
		n = ncol(A);

		# number of data records
		m = nrow(A);

		# number of statistics
		numBaseStats = 17; # (14 scale stats, 3 categorical stats)

		# largest kind present; a value > 1 means at least one categorical attribute
		max_kind = max(K);

		# matrices to store computed statistics (one column per attribute)
		baseStats = matrix(0, rows=numBaseStats, cols=n);

		if (maxC > 0) {
			countsArray = matrix(0, rows=maxC, cols=n);
		}

		# Every iteration only writes column i of the result matrices, so the
		# iterations are independent; check=0 disables the parfor dependency analysis.
		parfor(i in 1:n, check=0) {

			# project out the i^th column
			F = A[,i];

			kind = as.scalar(K[1,i]);

			if ( kind == 1 ) {
				print("[" + i + "] Scale");
				# compute SCALE statistics on the projected column
				minimum = min(F);
				maximum = max(F);
				rng = maximum - minimum;

				# mean and 2nd/3rd/4th moments as returned by moment()
				mu = mean(F);
				m2 = moment(F, 2);
				m3 = moment(F, 3);
				m4 = moment(F, 4);

				# variance rescaled by m/(m-1), then the derived dispersion measures
				var = m/(m-1.0)*m2;
				std_dev = sqrt(var);
				se = std_dev/sqrt(m);
				cv = std_dev/mu;

				# skewness (g1) and excess kurtosis (g2) with their standard errors
				g1 = m3/(std_dev^3);
				g2 = m4/(std_dev^4) - 3;
				#se_g1=sqrt( 6*m*(m-1.0) / ((m-2.0)*(m+1.0)*(m+3.0)) );
				se_g1=sqrt( (6/(m-2.0)) * (m/(m+1.0)) * ((m-1.0)/(m+3.0)) );

				#se_g2= sqrt( (4*(m^2-1)*se_g1^2)/((m+5.0)*(m-3.0)) );
				se_g2=sqrt( (4/(m+5.0)) * ((m^2-1)/(m-3.0)) * se_g1^2 );

				md = median(F) #quantile(F, 0.5);
				iqm = interQuartileMean(F);

				# place the computed statistics in output matrices
				baseStats[1,i] = minimum;
				baseStats[2,i] = maximum;
				baseStats[3,i] = rng;

				baseStats[4,i] = mu;
				baseStats[5,i] = var;
				baseStats[6,i] = std_dev;
				baseStats[7,i] = se;
				baseStats[8,i] = cv;

				baseStats[9,i] = g1;
				baseStats[10,i] = g2;
				baseStats[11,i] = se_g1;
				baseStats[12,i] = se_g2;

				baseStats[13,i] = md;
				baseStats[14,i] = iqm;
			}
			else {
				if (kind == 2 | kind == 3) {
					print("[" + i + "] Categorical");

					# check if the categorical column has valid values
					minF = min(F);
					if (minF <=0) {
						print("ERROR: Categorical attributes can only take values starting from 1. Encountered a value " + minF + " in attribute " + i);
					}
					else {
						# compute CATEGORICAL statistics on the projected column

						# number of categories = largest category id in the column
						# (this is the row count that table(F,1) would produce)
						num_cat = max(F);

						# Counts for each category, sized to exactly maxC x 1 so the
						# result always matches the height of countsArray. A plain
						# table(F,1) has max(F) rows, which makes the column
						# assignment below fail whenever max(F) != maxC.
						cat_counts = table(F, 1, maxC, 1);

						# index of the first most frequent category
						mode = rowIndexMax(t(cat_counts));
						# number of categories that share the maximum count
						mx = max(cat_counts)
						modeArr = (cat_counts == mx)
						numModes = sum(modeArr);

						# place the computed statistics in output matrices
						baseStats[15,i] = num_cat;
						baseStats[16,i] = mode;
						baseStats[17,i] = numModes;

						if (max_kind > 1) {
							countsArray[,i] = cat_counts;
						}
					}
				}
			}
		}

		write(baseStats, $4+"/base.stats");
		# the counts file is only produced when a categorical attribute exists
		if (max_kind > 1) {
			write(countsArray, $4+"/categorical.counts");
		}

	}