# scripts/builtin/univar.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------
 #
 # Computes univariate statistics for all attributes in a given data set
 #
 # INPUT PARAMETERS:
 # -------------------------------------------------------------------------------------------------
 # NAME           TYPE               DEFAULT  MEANING
 # -------------------------------------------------------------------------------------------------
 # X              Matrix[Double]     ---      Input matrix of the shape (N, D)
 # types          Matrix[Double]     ---      Matrix of the shape (1, D) with feature types:
 #                                            1 for scale, 2 for nominal, 3 for ordinal
 # -------------------------------------------------------------------------------------------------
 # OUTPUT: Matrix of summary statistics

 m_univar = function(Matrix[Double] X, Matrix[Double] types)
 return(Matrix[Double] univarStats)
 {
   # Computes per-column univariate statistics of X.
   #
   # X      data matrix of shape (N, D), one attribute per column
   # types  matrix of shape (1, D): 1 = scale, 2 = nominal, 3 = ordinal
   #
   # Returns univarStats of shape (17, D):
   #   rows  1-14 (filled for scale columns): min, max, range, mean,
   #     variance, standard deviation, standard error of the mean,
   #     coefficient of variation, skewness, excess kurtosis, standard error
   #     of skewness, standard error of kurtosis, median, inter-quartile mean
   #   rows 15-17 (filled for nominal/ordinal columns): largest category
   #     value, mode, number of modes
   # Rows that do not apply to a column's type keep their initial value 0;
   # a column whose type is none of 1, 2, 3 is left entirely 0.

   N = nrow(X);
   D = ncol(X);

   # Number of statistics (14 scale, 3 categorical)
   numBaseStats = 17;
   univarStats = matrix(0, rows=numBaseStats, cols=D);

   # Compute max domain size among all categorical attributes; the
   # (types > 1) mask zeroes out the maxima of scale columns.
   maxDomain = as.integer(max((types > 1) * colMaxs(X)));

   # Every iteration writes only to column i of univarStats.
   parfor(i in 1:D, check=0) {
     F = X[,i];

     type = as.scalar(types[1,i]);
     minF = min(F);
     maxF = max(F);

     if (type == 1) {
       # compute SCALE statistics on the projected column
       rng = maxF - minF;

       mu = mean(F);
       m2 = moment(F, 2);
       m3 = moment(F, 3);
       m4 = moment(F, 4);

       # variance is the second moment rescaled by N/(N-1)
       var = N/(N-1.0)*m2;
       std_dev = sqrt(var);
       se = std_dev/sqrt(N);
       cv = std_dev/mu;

       # skewness, excess kurtosis, and their standard errors
       g1 = m3/(std_dev^3);
       g2 = m4/(std_dev^4) - 3;
       se_g1=sqrt( (6/(N-2.0)) * (N/(N+1.0)) * ((N-1.0)/(N+3.0)) );
       se_g2=sqrt( (4/(N+5.0)) * ((N^2-1)/(N-3.0)) * se_g1^2 );

       md = median(F);
       iqm = interQuartileMean(F);

       univarStats[1:14,i] = as.matrix(list(minF, maxF, rng,
         mu, var, std_dev, se, cv, g1, g2, se_g1, se_g2, md, iqm));
     }

     if (type == 2 | type == 3) {
       # check if the categorical column has valid values; the problem is
       # only reported via print, execution continues afterwards
       if( minF <= 0 ) {
         print("ERROR: Categorical attributes can only take values starting from 1. Encountered a value " + minF + " in attribute " + i);
       }

       # compute CATEGORICAL statistics on the projected column:
       # per-category counts, index of the largest count (mode), and how
       # many categories share that largest count
       cat_counts = table(F, 1, maxDomain, 1);
       mode = as.scalar(rowIndexMax(t(cat_counts)));
       numModes = sum(cat_counts == max(cat_counts));
       univarStats[15:17,i] = as.matrix(list(maxF, mode, numModes));
     }
   }
 }
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	#-------------------------------------------------------------
	#
	# Computes univariate statistics for all attributes in a given data set
	#
	# INPUT PARAMETERS:
	# -------------------------------------------------------------------------------------------------
	# NAME TYPE DEFAULT MEANING
	# -------------------------------------------------------------------------------------------------
	# X Matrix[Double] --- Input matrix of the shape (N, D)
	# types Matrix[Double] --- Matrix of the shape (1, D) with feature types:
	# 1 for scale, 2 for nominal, 3 for ordinal
	# -------------------------------------------------------------------------------------------------
	# OUTPUT: Matrix of summary statistics

	m_univar = function(Matrix[Double] X, Matrix[Double] types)
	return(Matrix[Double] univarStats)
	{
	  # Computes per-column univariate statistics of X.
	  #
	  # X      data matrix of shape (N, D), one attribute per column
	  # types  matrix of shape (1, D): 1 = scale, 2 = nominal, 3 = ordinal
	  #
	  # Returns univarStats of shape (17, D):
	  #   rows  1-14 (filled for scale columns): min, max, range, mean,
	  #     variance, standard deviation, standard error of the mean,
	  #     coefficient of variation, skewness, excess kurtosis, standard error
	  #     of skewness, standard error of kurtosis, median, inter-quartile mean
	  #   rows 15-17 (filled for nominal/ordinal columns): largest category
	  #     value, mode, number of modes
	  # Rows that do not apply to a column's type keep their initial value 0;
	  # a column whose type is none of 1, 2, 3 is left entirely 0.

	  N = nrow(X);
	  D = ncol(X);

	  # Number of statistics (14 scale, 3 categorical)
	  numBaseStats = 17;
	  univarStats = matrix(0, rows=numBaseStats, cols=D);

	  # Compute max domain size among all categorical attributes; the
	  # (types > 1) mask zeroes out the maxima of scale columns.
	  maxDomain = as.integer(max((types > 1) * colMaxs(X)));

	  # Every iteration writes only to column i of univarStats.
	  parfor(i in 1:D, check=0) {
	    F = X[,i];

	    type = as.scalar(types[1,i]);
	    minF = min(F);
	    maxF = max(F);

	    if (type == 1) {
	      # compute SCALE statistics on the projected column
	      rng = maxF - minF;

	      mu = mean(F);
	      m2 = moment(F, 2);
	      m3 = moment(F, 3);
	      m4 = moment(F, 4);

	      # variance is the second moment rescaled by N/(N-1)
	      var = N/(N-1.0)*m2;
	      std_dev = sqrt(var);
	      se = std_dev/sqrt(N);
	      cv = std_dev/mu;

	      # skewness, excess kurtosis, and their standard errors
	      g1 = m3/(std_dev^3);
	      g2 = m4/(std_dev^4) - 3;
	      se_g1=sqrt( (6/(N-2.0)) * (N/(N+1.0)) * ((N-1.0)/(N+3.0)) );
	      se_g2=sqrt( (4/(N+5.0)) * ((N^2-1)/(N-3.0)) * se_g1^2 );

	      md = median(F);
	      iqm = interQuartileMean(F);

	      univarStats[1:14,i] = as.matrix(list(minF, maxF, rng,
	        mu, var, std_dev, se, cv, g1, g2, se_g1, se_g2, md, iqm));
	    }

	    if (type == 2 | type == 3) {
	      # check if the categorical column has valid values; the problem is
	      # only reported via print, execution continues afterwards
	      if( minF <= 0 ) {
	        print("ERROR: Categorical attributes can only take values starting from 1. Encountered a value " + minF + " in attribute " + i);
	      }

	      # compute CATEGORICAL statistics on the projected column:
	      # per-category counts, index of the largest count (mode), and how
	      # many categories share that largest count
	      cat_counts = table(F, 1, maxDomain, 1);
	      mode = as.scalar(rowIndexMax(t(cat_counts)));
	      numModes = sum(cat_counts == max(cat_counts));
	      univarStats[15:17,i] = as.matrix(list(maxF, mode, numModes));
	    }
	  }
	}