blob: 80a4606ae445b8c5cbc10a08873b1a8216653d59 [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# Computes univariate statistics for all attributes in a given data set
#
# INPUT PARAMETERS:
# -------------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# -------------------------------------------------------------------------------------------------
# X Matrix[Double] --- Input matrix of the shape (N, D)
# types Matrix[Double] --- Matrix of the shape (1, D) with feature types:
# 1 for scale, 2 for nominal, 3 for ordinal
# -------------------------------------------------------------------------------------------------
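# OUTPUT: Matrix of summary statistics of the shape (17, D): rows 1-14 hold the
# statistics of scale attributes, rows 15-17 those of categorical attributes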
m_univar = function(Matrix[Double] X, Matrix[Double] types)
  return(Matrix[Double] univarStats)
{
  # Layout of univarStats (one column per attribute of X; rows that do not
  # apply to an attribute's type stay 0):
  #   rows  1-14, scale (type 1): min, max, range, mean, variance,
  #       standard deviation, standard error of the mean, coefficient of
  #       variation, skewness (g1), excess kurtosis (g2), standard error of
  #       skewness, standard error of kurtosis, median, inter-quartile mean
  #   rows 15-17, categorical (type 2 or 3): largest category value, mode,
  #       number of modes
  N = nrow(X);
  D = ncol(X);

  # Number of statistics (14 scale, 3 categorical)
  numBaseStats = 17;
  univarStats = matrix(0, rows=numBaseStats, cols=D);

  # Compute max domain size among all categorical attributes
  # (scale columns are masked out by the (types > 1) indicator)
  maxDomain = as.integer(max((types > 1) * colMaxs(X)));

  # Each iteration reads column i of X and writes only column i of univarStats
  parfor(i in 1:D, check=0) {
    F = X[,i];
    type = as.scalar(types[1,i]);
    minF = min(F);
    maxF = max(F);

    if (type == 1) {
      # compute SCALE statistics on the projected column
      rng = maxF - minF;
      mu = mean(F);
      m2 = moment(F, 2);
      m3 = moment(F, 3);
      m4 = moment(F, 4);
      # rescale the 2nd moment by N/(N-1) to obtain the sample variance
      var = N/(N-1.0)*m2;
      std_dev = sqrt(var);
      se = std_dev/sqrt(N);
      cv = std_dev/mu;
      # skewness: 3rd moment over the cubed standard deviation
      g1 = m3/(std_dev^3);
      # kurtosis: 4th moment over std_dev^4, shifted by -3 (excess kurtosis)
      g2 = m4/(std_dev^4) - 3;
      se_g1=sqrt( (6/(N-2.0)) * (N/(N+1.0)) * ((N-1.0)/(N+3.0)) );
      se_g2=sqrt( (4/(N+5.0)) * ((N^2-1)/(N-3.0)) * se_g1^2 );
      md = median(F);
      iqm = interQuartileMean(F);
      univarStats[1:14,i] = as.matrix(list(minF, maxF, rng,
        mu, var, std_dev, se, cv, g1, g2, se_g1, se_g2, md, iqm));
    }

    if (type == 2 | type == 3) {
      # check if the categorical column has valid values
      if( minF <= 0 ) {
        # Report the invalid column and skip it: building the contingency
        # table on non-positive category values would fail, so rows 15-17
        # of this column are left at 0.
        print("ERROR: Categorical attributes can only take values starting from 1. Encountered a value " + minF + " in attribute " + i);
      }
      else {
        # compute CATEGORICAL statistics on the projected column
        # cat_counts is a (maxDomain x 1) vector of per-category frequencies
        cat_counts = table(F, 1, maxDomain, 1);
        mode = as.scalar(rowIndexMax(t(cat_counts)));
        # number of categories that share the maximum frequency
        numModes = sum(cat_counts == max(cat_counts));
        univarStats[15:17,i] = as.matrix(list(maxF, mode, numModes));
      }
    }
  }
}