#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# THIS SCRIPT APPLIES THE ESTIMATED PARAMETERS OF A GLM-TYPE REGRESSION TO A NEW (TEST) DATASET
#
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME  TYPE   DEFAULT  MEANING
# ---------------------------------------------------------------------------------------------
# X     String ---      Location to read the matrix X of records (feature vectors)
# B     String ---      Location to read GLM regression parameters (the betas), with dimensions
#                           ncol(X)   x k: do not add intercept
#                           ncol(X)+1 x k: add intercept as given by the last B-row
#                           if k > 1, use only B[, 1] unless it is Multinomial Logit (dfam=3)
# M     String " "      Location to write the matrix of predicted response means/probabilities:
#                           nrow(X) x 1  : for Power-type distributions (dfam=1)
#                           nrow(X) x 2  : for Binomial distribution (dfam=2), column 2 is "No"
#                           nrow(X) x k+1: for Multinomial Logit (dfam=3), col# k+1 is baseline
# Y     String " "      Location to read response matrix Y, with the following dimensions:
#                           nrow(X) x 1  : for all distributions (dfam=1 or 2 or 3)
#                           nrow(X) x 2  : for Binomial (dfam=2) given by (#pos, #neg) counts
#                           nrow(X) x k+1: for Multinomial (dfam=3) given by category counts
# O     String " "      Location to write the printed statistics; defaults to standard output
# dfam  Int    1        GLM distribution family: 1 = Power, 2 = Binomial, 3 = Multinomial Logit
# vpow  Double 0.0      Power for Variance defined as (mean)^power (ignored if dfam != 1):
#                           0.0 = Gaussian, 1.0 = Poisson, 2.0 = Gamma, 3.0 = Inverse Gaussian
# link  Int    0        Link function code: 0 = canonical (depends on distribution), 1 = Power,
#                           2 = Logit, 3 = Probit, 4 = Cloglog, 5 = Cauchit; ignored if Multinomial
# lpow  Double 1.0      Power for Link function defined as (mean)^power (ignored if link != 1):
#                           -2.0 = 1/mu^2, -1.0 = reciprocal, 0.0 = log, 0.5 = sqrt, 1.0 = identity
# disp  Double 1.0      Dispersion value, when available
# fmt   String "text"   Matrix output format, usually "text" or "csv" (for matrices only)
# ---------------------------------------------------------------------------------------------
# OUTPUT: Matrix M of predicted means/probabilities, plus statistics in CSV format (see below)
# The statistics are printed one per line, in the following CSV format:
# NAME,[COLUMN],[SCALED],VALUE
# NAME is the string identifier for the statistic, see the table below.
# COLUMN is an optional integer value that specifies the Y-column for per-column statistics;
# note that a Binomial/Multinomial one-column Y input is converted into multi-column.
# SCALED is an optional Boolean value (TRUE or FALSE) indicating whether the input
# dispersion parameter (disp) has been applied to scale this statistic.
# VALUE is the value of the statistic.
#
# NAME                 COLUMN  SCALED  MEANING
# ---------------------------------------------------------------------------------------------
# LOGLHOOD_Z                     +     Log-Likelihood Z-score (in st. dev.'s from the mean)
# LOGLHOOD_Z_PVAL                +     Log-Likelihood Z-score p-value
# PEARSON_X2                     +     Pearson residual X^2 statistic
# PEARSON_X2_BY_DF               +     Pearson X^2 divided by degrees of freedom
# PEARSON_X2_PVAL                +     Pearson X^2 p-value
# DEVIANCE_G2                    +     Deviance from saturated model G^2 statistic
# DEVIANCE_G2_BY_DF              +     Deviance G^2 divided by degrees of freedom
# DEVIANCE_G2_PVAL               +     Deviance G^2 p-value
# AVG_TOT_Y              +             Average of Y column for a single response value
# STDEV_TOT_Y            +             St. Dev. of Y column for a single response value
# AVG_RES_Y              +             Average of column residual, i.e. of Y - mean(Y|X)
# STDEV_RES_Y            +             St. Dev. of column residual, i.e. of Y - mean(Y|X)
# PRED_STDEV_RES         +       +     Model-predicted St. Dev. of column residual
# R2                     +             R^2 of Y column residual with bias included
# ADJUSTED_R2            +             Adjusted R^2 of Y column residual with bias included
# R2_NOBIAS              +             R^2 of Y column residual with bias subtracted
# ADJUSTED_R2_NOBIAS     +             Adjusted R^2 of Y column residual with bias subtracted
# ---------------------------------------------------------------------------------------------
#
# Example with distribution = "Poisson.log":
# hadoop jar SystemDS.jar -f GLM_HOME/GLM-predict.dml -nvargs dfam=1 vpow=1.0 link=1 lpow=0.0
# disp=3.0 fmt=csv X=INPUT_DIR/X B=INPUT_DIR/B Y=INPUT_DIR/Y M=OUTPUT_DIR/M O=OUTPUT_DIR/out.csv
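#
# A second, hypothetical invocation for distribution = "Binomial.logit" (dfam=2, link=2;
# the paths below are placeholders following the example above, not fixed names):
# hadoop jar SystemDS.jar -f GLM_HOME/GLM-predict.dml -nvargs dfam=2 link=2 fmt=csv
# X=INPUT_DIR/X B=INPUT_DIR/B Y=INPUT_DIR/Y M=OUTPUT_DIR/M O=OUTPUT_DIR/out.csv
#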
# Default values for input parameters:
fileX = $X;
fileB = $B;
fileM = ifdef ($M, " ");
fileY = ifdef ($Y, " ");
fileO = ifdef ($O, " ");
fmtM = ifdef ($fmt, "text");
dist_type = ifdef ($dfam, 1); # $dfam = 1;
var_power = ifdef ($vpow, 0.0); # $vpow = 0.0;
link_type = ifdef ($link, 0); # $link = 0;
link_power = ifdef ($lpow, 1.0); # $lpow = 1.0;
dispersion = ifdef ($disp, 1.0); # $disp = 1.0;
var_power = as.double (var_power);
link_power = as.double (link_power);
dispersion = as.double (dispersion);
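# Resolve the link function: Multinomial Logit always uses the logit link; otherwise
# link = 0 selects the canonical link of the chosen distribution (power link with
# power 1 - vpow for Power families, logit for Binomial).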
if (dist_type == 3) {
link_type = 2;
} else if (link_type == 0) { # Canonical Link
if (dist_type == 1) {
link_type = 1;
link_power = 1.0 - var_power;
} else if (dist_type == 2) {
link_type = 2;
} }
X = read (fileX);
num_records = nrow (X);
num_features = ncol (X);
B_full = read (fileB);
if (dist_type == 3) {
beta = B_full [1 : ncol (X), ];
intercept = B_full [nrow(B_full), ];
} else {
beta = B_full [1 : ncol (X), 1];
intercept = B_full [nrow(B_full), 1];
}
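# If B has exactly ncol(X) rows, it carries no intercept row: zero out the intercept
# extracted above.  Otherwise, count the intercept as one extra model feature.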
if (nrow (B_full) == ncol (X)) {
intercept = 0.0 * intercept;
is_intercept = FALSE;
} else {
num_features = num_features + 1;
is_intercept = TRUE;
}
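# Linear predictor: eta = X %*% beta + intercept, broadcast to every record.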
ones_rec = matrix (1, rows = num_records, cols = 1);
linear_terms = X %*% beta + ones_rec %*% intercept;
[means, vars] =
glm_means_and_vars (linear_terms, dist_type, var_power, link_type, link_power);
if (fileM != " ") {
write (means, fileM, format=fmtM);
}
if (fileY != " ")
{
Y = read (fileY);
ones_ctg = matrix (1, rows = ncol(Y), cols = 1);
# Statistics To Compute:
Z_logl = NaN;
Z_logl_pValue = NaN;
X2_pearson = NaN;
df_pearson = -1;
G2_deviance = NaN;
df_deviance = -1;
X2_pearson_pValue = NaN;
G2_deviance_pValue = NaN;
Z_logl_scaled = NaN;
Z_logl_scaled_pValue = NaN;
X2_scaled = NaN;
X2_scaled_pValue = NaN;
G2_scaled = NaN;
G2_scaled_pValue = NaN;
# set Y_counts to avoid 'Initialization of Y_counts depends on if-else execution' warning
Y_counts = matrix(0.0, rows=1, cols=1);
if (dist_type == 1 & link_type == 1) {
#
# POWER DISTRIBUTIONS (GAUSSIAN, POISSON, GAMMA, ETC.)
#
if (link_power == 0) {
is_zero_Y = (Y == 0);
lt_saturated = log (Y + is_zero_Y) - is_zero_Y / (1.0 - is_zero_Y);
} else {
lt_saturated = Y ^ link_power;
}
Y_counts = ones_rec;
X2_pearson = sum ((Y - means) ^ 2 / vars);
df_pearson = num_records - num_features;
log_l_part =
glm_partial_loglikelihood_for_power_dist_and_link (linear_terms, Y, var_power, link_power);
log_l_part_saturated =
glm_partial_loglikelihood_for_power_dist_and_link (lt_saturated, Y, var_power, link_power);
G2_deviance = 2 * sum (log_l_part_saturated) - 2 * sum (log_l_part);
df_deviance = num_records - num_features;
} else if (dist_type >= 2) {
#
# BINOMIAL AND MULTINOMIAL DISTRIBUTIONS
#
if (ncol (Y) == 1) {
num_categories = ncol (beta) + 1;
if (min (Y) <= 0) {
# Category labels "0", "-1" etc. are converted into the baseline label
Y = Y + (- Y + num_categories) * (Y <= 0);
}
Y_size = min (num_categories, max(Y));
Y_unsized = table (seq (1, num_records, 1), Y);
Y = matrix (0, rows = num_records, cols = num_categories);
Y [, 1 : Y_size] = Y_unsized [, 1 : Y_size];
Y_counts = ones_rec;
} else {
Y_counts = rowSums (Y);
}
P = means;
zero_Y = (Y == 0);
zero_P = (P == 0);
ones_ctg = matrix (1, rows = ncol(Y), cols = 1);
logl_vec = rowSums (Y * log (P + zero_Y) );
ent1_vec = rowSums (P * log (P + zero_P) );
ent2_vec = rowSums (P * (log (P + zero_P))^2);
E_logl = sum (Y_counts * ent1_vec);
V_logl = sum (Y_counts * (ent2_vec - ent1_vec ^ 2));
Z_logl = (sum (logl_vec) - E_logl) / sqrt (V_logl);
means = means * (Y_counts %*% t(ones_ctg));
vars = vars * (Y_counts %*% t(ones_ctg));
frac_below_5 = sum (means < 5) / (nrow (means) * ncol (means));
frac_below_1 = sum (means < 1) / (nrow (means) * ncol (means));
if (frac_below_5 > 0.2 | frac_below_1 > 0) {
print ("WARNING: residual statistics are inaccurate here due to low cell means.");
}
X2_pearson = sum ((Y - means) ^ 2 / means);
df_pearson = (num_records - num_features) * (ncol(Y) - 1);
G2_deviance = 2 * sum (Y * log ((Y + zero_Y) / (means + zero_Y)));
df_deviance = (num_records - num_features) * (ncol(Y) - 1);
}
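# Note: x == x is FALSE exactly when x is NaN, so the guards below skip any
# statistic that was never computed for this distribution family.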
if (Z_logl == Z_logl) {
Z_logl_absneg = - abs (Z_logl);
Z_logl_pValue = 2.0 * pnorm(target = Z_logl_absneg);
}
if (X2_pearson == X2_pearson & df_pearson > 0) {
X2_pearson_pValue = pchisq(target = X2_pearson, df = df_pearson, lower.tail=FALSE);
}
if (G2_deviance == G2_deviance & df_deviance > 0) {
G2_deviance_pValue = pchisq(target = G2_deviance, df = df_deviance, lower.tail=FALSE);
}
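# Rescale the statistics by the user-supplied dispersion and recompute their p-values.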
Z_logl_scaled = Z_logl / sqrt (dispersion);
X2_scaled = X2_pearson / dispersion;
G2_scaled = G2_deviance / dispersion;
if (Z_logl_scaled == Z_logl_scaled) {
Z_logl_scaled_absneg = - abs (Z_logl_scaled);
Z_logl_scaled_pValue = 2.0 * pnorm(target = Z_logl_scaled_absneg);
}
if (X2_scaled == X2_scaled & df_pearson > 0) {
X2_scaled_pValue = pchisq(target = X2_scaled, df = df_pearson, lower.tail=FALSE);
}
if (G2_scaled == G2_scaled & df_deviance > 0) {
G2_scaled_pValue = pchisq(target = G2_scaled, df = df_deviance, lower.tail=FALSE);
}
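# Per-column response statistics: totals, residuals, and (adjusted) R^2 with and without bias.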
avg_tot_Y = colSums ( Y ) / sum (Y_counts);
avg_res_Y = colSums (Y - means) / sum (Y_counts);
ss_avg_tot_Y = colSums (( Y - Y_counts %*% avg_tot_Y) ^ 2);
ss_res_Y = colSums ((Y - means) ^ 2);
ss_avg_res_Y = colSums ((Y - means - Y_counts %*% avg_res_Y) ^ 2);
df_ss_res_Y = sum (Y_counts) - num_features;
if (is_intercept) {
df_ss_avg_res_Y = df_ss_res_Y;
} else {
df_ss_avg_res_Y = df_ss_res_Y - 1;
}
var_tot_Y = ss_avg_tot_Y / (sum (Y_counts) - 1);
if (df_ss_avg_res_Y > 0) {
var_res_Y = ss_avg_res_Y / df_ss_avg_res_Y;
} else {
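# No residual degrees of freedom left: 0.0 / 0.0 deliberately yields NaN entries.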
var_res_Y = matrix (0.0, rows = 1, cols = ncol (Y)) / 0.0;
}
R2_nobias = 1 - ss_avg_res_Y / ss_avg_tot_Y;
adjust_R2_nobias = 1 - var_res_Y / var_tot_Y;
R2 = 1 - ss_res_Y / ss_avg_tot_Y;
if (df_ss_res_Y > 0) {
adjust_R2 = 1 - (ss_res_Y / df_ss_res_Y) / var_tot_Y;
} else {
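# As above, 0.0 / 0.0 reports NaN when no degrees of freedom remain.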
adjust_R2 = matrix (0.0, rows = 1, cols = ncol (Y)) / 0.0;
}
predicted_avg_var_res_Y = dispersion * colSums (vars) / sum (Y_counts);
# PREPARING THE OUTPUT CSV STATISTICS FILE
str = "LOGLHOOD_Z,,FALSE," + Z_logl;
str = append (str, "LOGLHOOD_Z_PVAL,,FALSE," + Z_logl_pValue);
str = append (str, "PEARSON_X2,,FALSE," + X2_pearson);
str = append (str, "PEARSON_X2_BY_DF,,FALSE," + (X2_pearson / df_pearson));
str = append (str, "PEARSON_X2_PVAL,,FALSE," + X2_pearson_pValue);
str = append (str, "DEVIANCE_G2,,FALSE," + G2_deviance);
str = append (str, "DEVIANCE_G2_BY_DF,,FALSE," + (G2_deviance / df_deviance));
str = append (str, "DEVIANCE_G2_PVAL,,FALSE," + G2_deviance_pValue);
str = append (str, "LOGLHOOD_Z,,TRUE," + Z_logl_scaled);
str = append (str, "LOGLHOOD_Z_PVAL,,TRUE," + Z_logl_scaled_pValue);
str = append (str, "PEARSON_X2,,TRUE," + X2_scaled);
str = append (str, "PEARSON_X2_BY_DF,,TRUE," + (X2_scaled / df_pearson));
str = append (str, "PEARSON_X2_PVAL,,TRUE," + X2_scaled_pValue);
str = append (str, "DEVIANCE_G2,,TRUE," + G2_scaled);
str = append (str, "DEVIANCE_G2_BY_DF,,TRUE," + (G2_scaled / df_deviance));
str = append (str, "DEVIANCE_G2_PVAL,,TRUE," + G2_scaled_pValue);
for (i in 1:ncol(Y)) {
str = append (str, "AVG_TOT_Y," + i + ",," + as.scalar (avg_tot_Y [1, i]));
str = append (str, "STDEV_TOT_Y," + i + ",," + as.scalar (sqrt (var_tot_Y [1, i])));
str = append (str, "AVG_RES_Y," + i + ",," + as.scalar (avg_res_Y [1, i]));
str = append (str, "STDEV_RES_Y," + i + ",," + as.scalar (sqrt (var_res_Y [1, i])));
str = append (str, "PRED_STDEV_RES," + i + ",TRUE," + as.scalar (sqrt (predicted_avg_var_res_Y [1, i])));
str = append (str, "R2," + i + ",," + as.scalar (R2 [1, i]));
str = append (str, "ADJUSTED_R2," + i + ",," + as.scalar (adjust_R2 [1, i]));
str = append (str, "R2_NOBIAS," + i + ",," + as.scalar (R2_nobias [1, i]));
str = append (str, "ADJUSTED_R2_NOBIAS," + i + ",," + as.scalar (adjust_R2_nobias [1, i]));
}
if (fileO != " ") {
write (str, fileO);
} else {
print (str);
}
}
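# Helper: compute the predicted means and the variance function V(mu) for each record,
# given the linear terms and the distribution/link specification.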
glm_means_and_vars =
function (Matrix[double] linear_terms, int dist_type, double var_power, int link_type, double link_power)
return (Matrix[double] means, Matrix[double] vars)
# NOTE: "vars" represents the variance without dispersion, i.e. the V(mu) function.
{
num_points = nrow (linear_terms);
if (dist_type == 1 & link_type == 1) {
# POWER DISTRIBUTION
if (link_power == 0) {
y_mean = exp (linear_terms);
} else if (link_power == 1.0) {
y_mean = linear_terms;
} else if (link_power == -1.0) {
y_mean = 1.0 / linear_terms;
} else {
y_mean = linear_terms ^ (1.0 / link_power);
}
if (var_power == 0) {
var_function = matrix (1.0, rows = num_points, cols = 1);
} else if (var_power == 1.0) {
var_function = y_mean;
} else {
var_function = y_mean ^ var_power;
}
means = y_mean;
vars = var_function;
} else if (dist_type == 2 & link_type >= 1 & link_type <= 5) {
# BINOMIAL/BERNOULLI DISTRIBUTION
y_prob = matrix (0.0, rows = num_points, cols = 2);
if (link_type == 1 & link_power == 0) { # Binomial.log
y_prob [, 1] = exp (linear_terms);
y_prob [, 2] = 1.0 - y_prob [, 1];
} else if (link_type == 1 & link_power != 0) { # Binomial.power_nonlog
y_prob [, 1] = linear_terms ^ (1.0 / link_power);
y_prob [, 2] = 1.0 - y_prob [, 1];
} else if (link_type == 2) { # Binomial.logit
elt = exp (linear_terms);
y_prob [, 1] = elt / (1.0 + elt);
y_prob [, 2] = 1.0 / (1.0 + elt);
} else if (link_type == 3) { # Binomial.probit
sign_lt = 2 * (linear_terms >= 0) - 1;
t_gp = 1.0 / (1.0 + abs (linear_terms) * 0.231641888); # 0.231641888 = 0.3275911 / sqrt (2.0)
erf_corr =
t_gp * ( 0.254829592
+ t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun,
+ t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299
+ t_gp * (-1.453152027
+ t_gp * 1.061405429)))) * sign_lt * exp (- (linear_terms ^ 2) / 2.0);
y_prob [, 1] = (1 + sign_lt) - erf_corr;
y_prob [, 2] = (1 - sign_lt) + erf_corr;
y_prob = y_prob / 2;
} else if (link_type == 4) { # Binomial.cloglog
elt = exp (linear_terms);
is_too_small = ((10000000 + elt) == 10000000);
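# (10000000 + elt) == 10000000 flags elt values below machine precision (roughly 1e-9 here);
# for those, use the Taylor expansion 1 - exp(-elt) ~ elt * (1 - elt / 2) to avoid cancellation.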
y_prob [, 2] = exp (- elt);
y_prob [, 1] = (1 - is_too_small) * (1.0 - y_prob [, 2]) + is_too_small * elt * (1.0 - elt / 2);
} else if (link_type == 5) { # Binomial.cauchit
atan_linear_terms = atan (linear_terms);
y_prob [, 1] = 0.5 + atan_linear_terms / pi;
y_prob [, 2] = 0.5 - atan_linear_terms / pi;
}
means = y_prob;
ones_ctg = matrix (1, rows = 2, cols = 1);
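# Per-category variance: each probability times the sum of the other categories'
# probabilities, i.e. p * (1 - p) per category (the zero-diagonal matrix swaps columns).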
vars = means * (means %*% (1 - diag (ones_ctg)));
} else if (dist_type == 3) {
# MULTINOMIAL LOGIT DISTRIBUTION
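# Append the baseline category (linear term 0, hence exp(.) = 1) as the last column,
# then normalize each row of exp(linear_terms) into probabilities.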
elt = exp (linear_terms);
ones_pts = matrix (1, rows = num_points, cols = 1);
elt = cbind (elt, ones_pts);
ones_ctg = matrix (1, rows = ncol (elt), cols = 1);
means = elt / (rowSums (elt) %*% t(ones_ctg));
vars = means * (means %*% (1 - diag (ones_ctg)));
} else {
means = matrix (0.0, rows = num_points, cols = 1);
vars = matrix (0.0, rows = num_points, cols = 1);
} }
glm_partial_loglikelihood_for_power_dist_and_link = # Assumes: dist_type == 1 & link_type == 1
function (Matrix[double] linear_terms, Matrix[double] Y, double var_power, double link_power)
return (Matrix[double] log_l_part)
{
num_records = nrow (Y);
if (var_power == 1.0) { # Poisson
if (link_power == 0) { # Poisson.log
is_natural_parameter_log_zero = (linear_terms == -Inf);
natural_parameters = replace (target = linear_terms, pattern = -Inf, replacement = 0);
b_cumulant = exp (linear_terms);
} else { # Poisson.power_nonlog
is_natural_parameter_log_zero = (linear_terms == 0);
natural_parameters = log (linear_terms + is_natural_parameter_log_zero) / link_power;
b_cumulant = (linear_terms + is_natural_parameter_log_zero) ^ (1.0 / link_power) - is_natural_parameter_log_zero;
}
is_minus_infinity = (Y > 0) * is_natural_parameter_log_zero;
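# Where Y > 0 but the natural parameter is log(0), subtract 1/0 to force the
# log-likelihood of that record to -Infinity.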
log_l_part = Y * natural_parameters - b_cumulant - is_minus_infinity / (1 - is_minus_infinity);
} else {
if (var_power == 2.0 & link_power == 0) { # Gamma.log
natural_parameters = - exp (- linear_terms);
b_cumulant = linear_terms;
} else if (var_power == 2.0) { # Gamma.power_nonlog
natural_parameters = - linear_terms ^ (- 1.0 / link_power);
b_cumulant = log (linear_terms) / link_power;
} else if (link_power == 0) { # PowerDist.log
natural_parameters = exp (linear_terms * (1.0 - var_power)) / (1.0 - var_power);
b_cumulant = exp (linear_terms * (2.0 - var_power)) / (2.0 - var_power);
} else { # PowerDist.power_nonlog
power_np = (1.0 - var_power) / link_power;
natural_parameters = (linear_terms ^ power_np) / (1.0 - var_power);
power_cu = (2.0 - var_power) / link_power;
b_cumulant = (linear_terms ^ power_cu) / (2.0 - var_power);
}
log_l_part = Y * natural_parameters - b_cumulant;
} }