| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| # Default values for input parameters: |
| fileX = $X; |
| fileB = $B; |
| fileM = ifdef ($M, " "); |
| fileY = ifdef ($Y, " "); |
| fileO = ifdef ($O, " "); |
| fmtM = ifdef ($fmt, "text"); |
| |
| dist_type = ifdef ($dfam, 1); # $dfam = 1; |
| var_power = ifdef ($vpow, 0.0); # $vpow = 0.0; |
| link_type = ifdef ($link, 0); # $link = 0; |
| link_power = ifdef ($lpow, 1.0); # $lpow = 1.0; |
| dispersion = ifdef ($disp, 1.0); # $disp = 1.0; |
| |
| var_power = as.double (var_power); |
| link_power = as.double (link_power); |
| dispersion = as.double (dispersion); |
| |
| if (dist_type == 3) { |
| link_type = 2; |
| } else { if (link_type == 0) { # Canonical Link |
| if (dist_type == 1) { |
| link_type = 1; |
| link_power = 1.0 - var_power; |
| } else { if (dist_type == 2) { |
| link_type = 2; |
| }}} } |
| |
| X = read(fileX); |
| #X = table(X[,1], X[,2], X[,3]) |
| |
| num_records = nrow (X); |
| num_features = ncol (X); |
| |
| W = read (fileB); |
| if (dist_type == 3) { |
| beta = W [1 : ncol (X), ]; |
| intercept = W [nrow(W), ]; |
| } else { |
| beta = W [1 : ncol (X), 1]; |
| intercept = W [nrow(W), 1]; |
| } |
| if (nrow (W) == ncol (X)) { |
| intercept = 0.0 * intercept; |
| is_intercept = FALSE; |
| } else { |
| num_features = num_features + 1; |
| is_intercept = TRUE; |
| } |
| |
| ones_rec = matrix (1, rows = num_records, cols = 1); |
| linear_terms = X %*% beta + ones_rec %*% intercept; |
| [means, vars] = |
| glm_means_and_vars (linear_terms, dist_type, var_power, link_type, link_power); |
| |
| if (fileM != " ") { |
| write (means, fileM, format=fmtM); |
| } |
| |
| predicted_y = rowIndexMax(means) |
| write(predicted_y, $P, format=fmtM) |
| |
| if (fileY != " ") |
| { |
| Y = read (fileY); |
| ones_ctg = matrix (1, rows = ncol(Y), cols = 1); |
| |
| # Statistics To Compute: |
| |
| Z_logl = NaN; |
| Z_logl_pValue = NaN; |
| X2_pearson = NaN; |
| df_pearson = -1; |
| G2_deviance = NaN; |
| df_deviance = -1; |
| X2_pearson_pValue = NaN; |
| G2_deviance_pValue = NaN; |
| Z_logl_scaled = NaN; |
| Z_logl_scaled_pValue = NaN; |
| X2_scaled = NaN; |
| X2_scaled_pValue = NaN; |
| G2_scaled = NaN; |
| G2_scaled_pValue = NaN; |
| |
| # set Y_counts to avoid 'Initialization of Y_counts depends on if-else execution' warning |
| Y_counts = matrix(0.0, rows=1, cols=1); |
| |
| if (dist_type == 1 & link_type == 1) { |
| # |
| # POWER DISTRIBUTIONS (GAUSSIAN, POISSON, GAMMA, ETC.) |
| # |
| if (link_power == 0.0) { |
| is_zero_Y = (Y == 0.0); |
| lt_saturated = log (Y + is_zero_Y) - is_zero_Y / (1.0 - is_zero_Y); |
| } else { |
| lt_saturated = Y ^ link_power; |
| } |
| Y_counts = ones_rec; |
| |
| X2_pearson = sum ((Y - means) ^ 2 / vars); |
| df_pearson = num_records - num_features; |
| |
| log_l_part = |
| glm_partial_loglikelihood_for_power_dist_and_link (linear_terms, Y, var_power, link_power); |
| log_l_part_saturated = |
| glm_partial_loglikelihood_for_power_dist_and_link (lt_saturated, Y, var_power, link_power); |
| |
| G2_deviance = 2 * sum (log_l_part_saturated) - 2 * sum (log_l_part); |
| df_deviance = num_records - num_features; |
| |
| } else { if (dist_type >= 2) { |
| # |
| # BINOMIAL AND MULTINOMIAL DISTRIBUTIONS |
| # |
| if (ncol (Y) == 1) { |
| num_categories = ncol (beta) + 1; |
| if (min (Y) <= 0) { |
| # Category labels "0", "-1" etc. are converted into the baseline label |
| Y = Y + (- Y + num_categories) * (Y <= 0); |
| } |
| Y_size = min (num_categories, max(Y)); |
| Y_unsized = table (seq (1, num_records, 1), Y); |
| Y = matrix (0, rows = num_records, cols = num_categories); |
| Y [, 1 : Y_size] = Y_unsized [, 1 : Y_size]; |
| Y_counts = ones_rec; |
| } else { |
| Y_counts = rowSums (Y); |
| } |
| |
| P = means; |
| zero_Y = (Y == 0.0); |
| zero_P = (P == 0.0); |
| ones_ctg = matrix (1, rows = ncol(Y), cols = 1); |
| |
| logl_vec = rowSums (Y * log (P + zero_Y) ); |
| ent1_vec = rowSums (P * log (P + zero_P) ); |
| ent2_vec = rowSums (P * (log (P + zero_P))^2); |
| E_logl = sum (Y_counts * ent1_vec); |
| V_logl = sum (Y_counts * (ent2_vec - ent1_vec ^ 2)); |
| Z_logl = (sum (logl_vec) - E_logl) / sqrt (V_logl); |
| |
| means = means * (Y_counts %*% t(ones_ctg)); |
| vars = vars * (Y_counts %*% t(ones_ctg)); |
| |
| frac_below_5 = sum (means < 5) / (nrow (means) * ncol (means)); |
| frac_below_1 = sum (means < 1) / (nrow (means) * ncol (means)); |
| |
| if (frac_below_5 > 0.2 | frac_below_1 > 0.0) { |
| print ("WARNING: residual statistics are inaccurate here due to low cell means."); |
| } |
| |
| X2_pearson = sum ((Y - means) ^ 2 / means); |
| df_pearson = (num_records - num_features) * (ncol(Y) - 1); |
| |
| G2_deviance = 2 * sum (Y * log ((Y + zero_Y) / (means + zero_Y))); |
| df_deviance = (num_records - num_features) * (ncol(Y) - 1); |
| }} |
| |
| if (Z_logl == Z_logl) { |
| Z_logl_absneg = - abs (Z_logl); |
| Z_logl_pValue = 2.0 * pnorm(target = Z_logl_absneg); |
| } |
| if (X2_pearson == X2_pearson & df_pearson > 0) { |
| X2_pearson_pValue = pchisq(target = X2_pearson, df = df_pearson, lower.tail=FALSE); |
| } |
| if (G2_deviance == G2_deviance & df_deviance > 0) { |
| G2_deviance_pValue = pchisq(target = G2_deviance, df = df_deviance, lower.tail=FALSE); |
| } |
| |
| Z_logl_scaled = Z_logl / sqrt (dispersion); |
| X2_scaled = X2_pearson / dispersion; |
| G2_scaled = G2_deviance / dispersion; |
| |
| if (Z_logl_scaled == Z_logl_scaled) { |
| Z_logl_scaled_absneg = - abs (Z_logl_scaled); |
| Z_logl_scaled_pValue = 2.0 * pnorm(target = Z_logl_scaled_absneg); |
| } |
| if (X2_scaled == X2_scaled & df_pearson > 0) { |
| X2_scaled_pValue = pchisq(target = X2_scaled, df = df_pearson, lower.tail=FALSE); |
| } |
| if (G2_scaled == G2_scaled & df_deviance > 0) { |
| G2_scaled_pValue = pchisq(target = G2_scaled, df = df_deviance, lower.tail=FALSE); |
| } |
| |
| avg_tot_Y = colSums ( Y ) / sum (Y_counts); |
| avg_res_Y = colSums (Y - means) / sum (Y_counts); |
| |
| ss_avg_tot_Y = colSums (( Y - Y_counts %*% avg_tot_Y) ^ 2); |
| ss_res_Y = colSums ((Y - means) ^ 2); |
| ss_avg_res_Y = colSums ((Y - means - Y_counts %*% avg_res_Y) ^ 2); |
| |
| df_ss_res_Y = sum (Y_counts) - num_features; |
| if (is_intercept) { |
| df_ss_avg_res_Y = df_ss_res_Y; |
| } else { |
| df_ss_avg_res_Y = df_ss_res_Y - 1; |
| } |
| |
| var_tot_Y = ss_avg_tot_Y / (sum (Y_counts) - 1); |
| if (df_ss_avg_res_Y > 0) { |
| var_res_Y = ss_avg_res_Y / df_ss_avg_res_Y; |
| } else { |
| var_res_Y = matrix (0.0, rows = 1, cols = ncol (Y)) / 0.0; |
| } |
| plain_R2_nobias = 1 - ss_avg_res_Y / ss_avg_tot_Y; |
| adjust_R2_nobias = 1 - var_res_Y / var_tot_Y; |
| plain_R2 = 1 - ss_res_Y / ss_avg_tot_Y; |
| if (df_ss_res_Y > 0) { |
| adjust_R2 = 1 - (ss_res_Y / df_ss_res_Y) / var_tot_Y; |
| } else { |
| adjust_R2 = matrix (0.0, rows = 1, cols = ncol (Y)) / 0.0; |
| } |
| |
| predicted_avg_var_res_Y = dispersion * colSums (vars) / sum (Y_counts); |
| |
| # PREPARING THE OUTPUT CSV STATISTICS FILE |
| |
| str = "LOGLHOOD_Z,,FALSE," + Z_logl; |
| str = append (str, "LOGLHOOD_Z_PVAL,,FALSE," + Z_logl_pValue); |
| str = append (str, "PEARSON_X2,,FALSE," + X2_pearson); |
| str = append (str, "PEARSON_X2_BY_DF,,FALSE," + (X2_pearson / df_pearson)); |
| str = append (str, "PEARSON_X2_PVAL,,FALSE," + X2_pearson_pValue); |
| str = append (str, "DEVIANCE_G2,,FALSE," + G2_deviance); |
| str = append (str, "DEVIANCE_G2_BY_DF,,FALSE," + (G2_deviance / df_deviance)); |
| str = append (str, "DEVIANCE_G2_PVAL,,FALSE," + G2_deviance_pValue); |
| str = append (str, "LOGLHOOD_Z,,TRUE," + Z_logl_scaled); |
| str = append (str, "LOGLHOOD_Z_PVAL,,TRUE," + Z_logl_scaled_pValue); |
| str = append (str, "PEARSON_X2,,TRUE," + X2_scaled); |
| str = append (str, "PEARSON_X2_BY_DF,,TRUE," + (X2_scaled / df_pearson)); |
| str = append (str, "PEARSON_X2_PVAL,,TRUE," + X2_scaled_pValue); |
| str = append (str, "DEVIANCE_G2,,TRUE," + G2_scaled); |
| str = append (str, "DEVIANCE_G2_BY_DF,,TRUE," + (G2_scaled / df_deviance)); |
| str = append (str, "DEVIANCE_G2_PVAL,,TRUE," + G2_scaled_pValue); |
| |
| for (i in 1:ncol(Y)) { |
| str = append (str, "AVG_TOT_Y," + i + ",," + as.scalar (avg_tot_Y [1, i])); |
| str = append (str, "STDEV_TOT_Y," + i + ",," + as.scalar (sqrt (var_tot_Y [1, i]))); |
| str = append (str, "AVG_RES_Y," + i + ",," + as.scalar (avg_res_Y [1, i])); |
| str = append (str, "STDEV_RES_Y," + i + ",," + as.scalar (sqrt (var_res_Y [1, i]))); |
| str = append (str, "PRED_STDEV_RES," + i + ",TRUE," + as.scalar (sqrt (predicted_avg_var_res_Y [1, i]))); |
| str = append (str, "PLAIN_R2," + i + ",," + as.scalar (plain_R2 [1, i])); |
| str = append (str, "ADJUSTED_R2," + i + ",," + as.scalar (adjust_R2 [1, i])); |
| str = append (str, "PLAIN_R2_NOBIAS," + i + ",," + as.scalar (plain_R2_nobias [1, i])); |
| str = append (str, "ADJUSTED_R2_NOBIAS," + i + ",," + as.scalar (adjust_R2_nobias [1, i])); |
| } |
| |
| if (fileO != " ") { |
| write (str, fileO); |
| } else { |
| print (str); |
| } |
| } |
| |
| glm_means_and_vars = |
| function (Matrix[double] linear_terms, int dist_type, double var_power, int link_type, double link_power) |
| return (Matrix[double] means, Matrix[double] vars) |
| # NOTE: "vars" represents the variance without dispersion, i.e. the V(mu) function. |
| { |
| num_points = nrow (linear_terms); |
| if (dist_type == 1 & link_type == 1) { |
| # POWER DISTRIBUTION |
| if (link_power == 0.0) { |
| y_mean = exp (linear_terms); |
| } else { if (link_power == 1.0) { |
| y_mean = linear_terms; |
| } else { if (link_power == -1.0) { |
| y_mean = 1.0 / linear_terms; |
| } else { |
| y_mean = linear_terms ^ (1.0 / link_power); |
| }}} |
| if (var_power == 0.0) { |
| var_function = matrix (1.0, rows = num_points, cols = 1); |
| } else { if (var_power == 1.0) { |
| var_function = y_mean; |
| } else { |
| var_function = y_mean ^ var_power; |
| }} |
| means = y_mean; |
| vars = var_function; |
| } else { if (dist_type == 2 & link_type >= 1 & link_type <= 5) { |
| # BINOMIAL/BERNOULLI DISTRIBUTION |
| y_prob = matrix (0.0, rows = num_points, cols = 2); |
| if (link_type == 1 & link_power == 0.0) { # Binomial.log |
| y_prob [, 1] = exp (linear_terms); |
| y_prob [, 2] = 1.0 - y_prob [, 1]; |
| } else { if (link_type == 1 & link_power != 0.0) { # Binomial.power_nonlog |
| y_prob [, 1] = linear_terms ^ (1.0 / link_power); |
| y_prob [, 2] = 1.0 - y_prob [, 1]; |
| } else { if (link_type == 2) { # Binomial.logit |
| elt = exp (linear_terms); |
| y_prob [, 1] = elt / (1.0 + elt); |
| y_prob [, 2] = 1.0 / (1.0 + elt); |
| } else { if (link_type == 3) { # Binomial.probit |
| sign_lt = 2 * (linear_terms >= 0.0) - 1; |
| t_gp = 1.0 / (1.0 + abs (linear_terms) * 0.231641888); # 0.231641888 = 0.3275911 / sqrt (2.0) |
| erf_corr = |
| t_gp * ( 0.254829592 |
| + t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun, |
| + t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299 |
| + t_gp * (-1.453152027 |
| + t_gp * 1.061405429)))) * sign_lt * exp (- (linear_terms ^ 2) / 2.0); |
| y_prob [, 1] = (1 + sign_lt) - erf_corr; |
| y_prob [, 2] = (1 - sign_lt) + erf_corr; |
| y_prob = y_prob / 2; |
| } else { if (link_type == 4) { # Binomial.cloglog |
| elt = exp (linear_terms); |
| is_too_small = ((10000000 + elt) == 10000000); |
| y_prob [, 2] = exp (- elt); |
| y_prob [, 1] = (1 - is_too_small) * (1.0 - y_prob [, 2]) + is_too_small * elt * (1.0 - elt / 2); |
| } else { if (link_type == 5) { # Binomial.cauchit |
| atan_linear_terms = atan (linear_terms); |
| y_prob [, 1] = 0.5 + atan_linear_terms / pi; |
| y_prob [, 2] = 0.5 - atan_linear_terms / pi; |
| }}}}}} |
| means = y_prob; |
| ones_ctg = matrix (1, rows = 2, cols = 1); |
| vars = means * (means %*% (1 - diag (ones_ctg))); |
| } else { if (dist_type == 3) { |
| # MULTINOMIAL LOGIT DISTRIBUTION |
| elt = exp (linear_terms); |
| ones_pts = matrix (1, rows = num_points, cols = 1); |
| elt = cbind (elt, ones_pts); |
| ones_ctg = matrix (1, rows = ncol (elt), cols = 1); |
| means = elt / (rowSums (elt) %*% t(ones_ctg)); |
| vars = means * (means %*% (1 - diag (ones_ctg))); |
| } else { |
| means = matrix (0.0, rows = num_points, cols = 1); |
| vars = matrix (0.0, rows = num_points, cols = 1); |
| } }}} |
| |
| glm_partial_loglikelihood_for_power_dist_and_link = # Assumes: dist_type == 1 & link_type == 1 |
| function (Matrix[double] linear_terms, Matrix[double] Y, double var_power, double link_power) |
| return (Matrix[double] log_l_part) |
| { |
| num_records = nrow (Y); |
| if (var_power == 1.0) { # Poisson |
| if (link_power == 0.0) { # Poisson.log |
| is_natural_parameter_log_zero = (linear_terms == -Inf); |
| natural_parameters = replace (target = linear_terms, pattern = -Inf, replacement = 0); |
| b_cumulant = exp (linear_terms); |
| } else { # Poisson.power_nonlog |
| is_natural_parameter_log_zero = (linear_terms == 0.0); |
| natural_parameters = log (linear_terms + is_natural_parameter_log_zero) / link_power; |
| b_cumulant = (linear_terms + is_natural_parameter_log_zero) ^ (1.0 / link_power) - is_natural_parameter_log_zero; |
| } |
| is_minus_infinity = (Y > 0) * is_natural_parameter_log_zero; |
| log_l_part = Y * natural_parameters - b_cumulant - is_minus_infinity / (1 - is_minus_infinity); |
| } else { |
| if (var_power == 2.0 & link_power == 0.0) { # Gamma.log |
| natural_parameters = - exp (- linear_terms); |
| b_cumulant = linear_terms; |
| } else { if (var_power == 2.0) { # Gamma.power_nonlog |
| natural_parameters = - linear_terms ^ (- 1.0 / link_power); |
| b_cumulant = log (linear_terms) / link_power; |
| } else { if (link_power == 0.0) { # PowerDist.log |
| natural_parameters = exp (linear_terms * (1.0 - var_power)) / (1.0 - var_power); |
| b_cumulant = exp (linear_terms * (2.0 - var_power)) / (2.0 - var_power); |
| } else { # PowerDist.power_nonlog |
| power_np = (1.0 - var_power) / link_power; |
| natural_parameters = (linear_terms ^ power_np) / (1.0 - var_power); |
| power_cu = (2.0 - var_power) / link_power; |
| b_cumulant = (linear_terms ^ power_cu) / (2.0 - var_power); |
| }}} |
| log_l_part = Y * natural_parameters - b_cumulant; |
| } } |