| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| # |
| # THIS SCRIPT CHOOSES A GLM REGRESSION MODEL IN A STEPWISE ALGIRITHM USING AIC |
| # EACH GLM REGRESSION IS SOLVED USING NEWTON/FISHER SCORING WITH TRUST REGIONS |
| # |
| # INPUT PARAMETERS: |
| # --------------------------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT MEANING |
| # --------------------------------------------------------------------------------------------- |
| # X String --- Location to read the matrix X of feature vectors |
| # Y String --- Location to read response matrix Y with 1 column |
| # B String --- Location to store estimated regression parameters (the betas) |
| # S String --- Location to write the selected features ordered as computed by the algorithm |
| # O String " " Location to write the printed statistics; by default is standard output |
| # link Int 2 Link function code: 1 = log, 2 = Logit, 3 = Probit, 4 = Cloglog |
| # yneg Double 0.0 Response value for Bernoulli "No" label, usually 0.0 or -1.0 |
| # icpt Int 0 Intercept presence, X columns shifting and rescaling: |
| # 0 = no intercept, no shifting, no rescaling; |
| # 1 = add intercept, but neither shift nor rescale X; |
| # 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1 |
| # tol Double 0.000001 Tolerance (epsilon) |
| # disp Double 0.0 (Over-)dispersion value, or 0.0 to estimate it from data |
| # moi Int 200 Maximum number of outer (Newton / Fisher Scoring) iterations |
| # mii Int 0 Maximum number of inner (Conjugate Gradient) iterations, 0 = no maximum |
| # thr Double 0.01 Threshold to stop the algorithm: if the decrease in the value of AIC falls below thr |
| # no further features are being checked and the algorithm stops |
| # fmt String "text" The betas matrix output format, such as "text" or "csv" |
| # --------------------------------------------------------------------------------------------- |
| # OUTPUT: Matrix beta, whose size depends on icpt: |
| # icpt=0: ncol(X) x 1; icpt=1: (ncol(X) + 1) x 1; icpt=2: (ncol(X) + 1) x 2 |
| # |
| # In addition, in the last run of GLM some statistics are provided in CSV format, one comma-separated name-value |
| # pair per each line, as follows: |
| # |
| # NAME MEANING |
| # ------------------------------------------------------------------------------------------- |
| # TERMINATION_CODE A positive integer indicating success/failure as follows: |
| # 1 = Converged successfully; 2 = Maximum number of iterations reached; |
| # 3 = Input (X, Y) out of range; 4 = Distribution/link is not supported |
| # BETA_MIN Smallest beta value (regression coefficient), excluding the intercept |
| # BETA_MIN_INDEX Column index for the smallest beta value |
| # BETA_MAX Largest beta value (regression coefficient), excluding the intercept |
| # BETA_MAX_INDEX Column index for the largest beta value |
| # INTERCEPT Intercept value, or NaN if there is no intercept (if icpt=0) |
| # DISPERSION Dispersion used to scale deviance, provided as "disp" input parameter |
| # or estimated (same as DISPERSION_EST) if the "disp" parameter is <= 0 |
| # DISPERSION_EST Dispersion estimated from the dataset |
| # DEVIANCE_UNSCALED Deviance from the saturated model, assuming dispersion == 1.0 |
| # DEVIANCE_SCALED Deviance from the saturated model, scaled by the DISPERSION value |
| # ------------------------------------------------------------------------------------------- |
| # |
| # HOW TO INVOKE THIS SCRIPT - EXAMPLE: |
| # hadoop jar SystemDS.jar -f StepGLM.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/betas |
| # S=OUTPUT_DIR_S/selected O=OUTPUT_DIR/stats link=2 yneg=-1.0 icpt=2 tol=0.00000001 |
| # disp=1.0 moi=100 mii=10 thr=0.01 fmt=csv |
| # |
| # THE StepGLM SCRIPT CURRENTLY SUPPORTS BERNOULLI DISTRIBUTION FAMILY AND THE FOLLOWING LINK FUNCTIONS ONLY! |
| # - LOG |
| # - LOGIT |
| # - PROBIT |
| # - CLOGLOG |
| |
| fileX = $X; |
| fileY = $Y; |
| fileB = $B; |
| intercept_status = ifdef ($icpt, 0); |
| thr = ifdef ($thr, 0.01); |
| bernoulli_No_label = ifdef ($yneg, 0.0); # $yneg = 0.0; |
| distribution_type = 2; |
| |
| bernoulli_No_label = as.double (bernoulli_No_label); |
| |
| # currently only the forward selection strategy in supported: start from one feature and iteratively add |
| # features until AIC improves |
| dir = "forward"; |
| |
| print("BEGIN STEPWISE GLM SCRIPT"); |
| print ("Reading X and Y..."); |
| X_orig = read (fileX); |
| Y = read (fileY); |
| |
| if (distribution_type == 2 & ncol(Y) == 1) { |
| is_Y_negative = (Y == bernoulli_No_label); |
| Y = cbind (1 - is_Y_negative, is_Y_negative); |
| count_Y_negative = sum (is_Y_negative); |
| if (count_Y_negative == 0) { |
| stop ("StepGLM Input Error: all Y-values encode Bernoulli YES-label, none encode NO-label"); |
| } |
| if (count_Y_negative == nrow(Y)) { |
| stop ("StepGLM Input Error: all Y-values encode Bernoulli NO-label, none encode YES-label"); |
| } |
| } |
| |
| num_records = nrow (X_orig); |
| num_features = ncol (X_orig); |
| |
| # BEGIN STEPWISE GENERALIZED LINEAR MODELS |
| |
| if (dir == "forward") { |
| |
| continue = TRUE; |
| columns_fixed = matrix (0, rows = 1, cols = num_features); |
| columns_fixed_ordered = matrix (0, rows = 1, cols = 1); |
| |
| # X_global stores the best model found at each step |
| X_global = matrix (0, rows = num_records, cols = 1); |
| |
| if (intercept_status == 0) { |
| # Compute AIC of an empty model with no features and no intercept (all Ys are zero) |
| [AIC_best] = glm_fit (X_global, Y, 0, num_features, columns_fixed_ordered, " "); |
| } else { |
| # compute AIC of an empty model with only intercept (all Ys are constant) |
| all_ones = matrix (1, rows = num_records, cols = 1); |
| [AIC_best] = glm_fit (all_ones, Y, 0, num_features, columns_fixed_ordered, " "); |
| } |
| print ("Best AIC without any features: " + AIC_best); |
| |
| # First pass to examine single features |
| AICs = matrix (AIC_best, rows = 1, cols = num_features); |
| parfor (i in 1:num_features) { |
| [AIC_1] = glm_fit (X_orig[,i], Y, intercept_status, num_features, columns_fixed_ordered, " "); |
| AICs[1,i] = AIC_1; |
| } |
| |
| # Determine the best AIC |
| column_best = 0; |
| for (k in 1:num_features) { |
| AIC_cur = as.scalar (AICs[1,k]); |
| if ( (AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) ) { |
| column_best = k; |
| AIC_best = as.scalar(AICs[1,k]); |
| } |
| } |
| |
| if (column_best == 0) { |
| print ("AIC of an empty model is " + AIC_best + " and adding no feature achieves more than " + (thr * 100) + "% decrease in AIC!"); |
| if (intercept_status == 0) { |
| # Compute AIC of an empty model with no features and no intercept (all Ys are zero) |
| [AIC_best] = glm_fit (X_global, Y, 0, num_features, columns_fixed_ordered, fileB); |
| } else { |
| # compute AIC of an empty model with only intercept (all Ys are constant) |
| ###all_ones = matrix (1, rows = num_records, cols = 1); |
| [AIC_best] = glm_fit (all_ones, Y, 0, num_features, columns_fixed_ordered, fileB); |
| } |
| }; |
| |
| print ("Best AIC " + AIC_best + " achieved with feature: " + column_best); |
| columns_fixed[1,column_best] = 1; |
| columns_fixed_ordered[1,1] = column_best; |
| X_global = X_orig[,column_best]; |
| |
| while (continue) { |
| # Subsequent passes over the features |
| parfor (i in 1:num_features) { |
| if (as.scalar(columns_fixed[1,i]) == 0) { |
| |
| # Construct the feature matrix |
| X = cbind (X_global, X_orig[,i]); |
| |
| [AIC_2] = glm_fit (X, Y, intercept_status, num_features, columns_fixed_ordered, " "); |
| AICs[1,i] = AIC_2; |
| } |
| } |
| |
| # Determine the best AIC |
| for (k in 1:num_features) { |
| AIC_cur = as.scalar (AICs[1,k]); |
| if ( (AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) & (as.scalar(columns_fixed[1,k]) == 0) ) { |
| column_best = k; |
| AIC_best = as.scalar(AICs[1,k]); |
| } |
| } |
| |
| # cbind best found features (i.e., columns) to X_global |
| if (as.scalar(columns_fixed[1,column_best]) == 0) { # new best feature found |
| print ("Best AIC " + AIC_best + " achieved with feature: " + column_best); |
| columns_fixed[1,column_best] = 1; |
| columns_fixed_ordered = cbind (columns_fixed_ordered, as.matrix(column_best)); |
| if (ncol(columns_fixed_ordered) == num_features) { # all features examined |
| X_global = cbind (X_global, X_orig[,column_best]); |
| continue = FALSE; |
| } else { |
| X_global = cbind (X_global, X_orig[,column_best]); |
| } |
| } else { |
| continue = FALSE; |
| } |
| } |
| |
| # run GLM with selected set of features |
| print ("Running GLM with selected features..."); |
| [AIC] = glm_fit (X_global, Y, intercept_status, num_features, columns_fixed_ordered, fileB); |
| |
| } else { |
| stop ("Currently only forward selection strategy is supported!"); |
| } |
| |
| |
| ################### UDFS USED IN THIS SCRIPT ################## |
| |
| glm_fit = function (Matrix[Double] X, Matrix[Double] Y, Int intercept_status, Double num_features_orig, Matrix[Double] Selected, String fileB) return (Double AIC) { |
| |
| # distribution family code: 1 = Power, 2 = Bernoulli/Binomial; currently only Bernouli distribution family is supported! |
| distribution_type = 2; # $dfam = 2; |
| variance_as_power_of_the_mean = 0.0; # $vpow = 0.0; |
| # link function code: 0 = canonical (depends on distribution), 1 = Power, 2 = Logit, 3 = Probit, 4 = Cloglog, 5 = Cauchit; |
| # currently only log (link = 1), logit (link = 2), probit (link = 3), and cloglog (link = 4) are supported! |
| link_type = ifdef ($link, 2); # $link = 2; |
| link_as_power_of_the_mean = 0.0; # $lpow = 0.0; |
| |
| dispersion = ifdef ($disp, 0.0); # $disp = 0.0; |
| eps = ifdef ($tol, 0.000001); # $tol = 0.000001; |
| max_iteration_IRLS = ifdef ($moi, 200); # $moi = 200; |
| max_iteration_CG = ifdef ($mii, 0); # $mii = 0; |
| |
| variance_as_power_of_the_mean = as.double (variance_as_power_of_the_mean); |
| link_as_power_of_the_mean = as.double (link_as_power_of_the_mean); |
| |
| dispersion = as.double (dispersion); |
| eps = as.double (eps); |
| |
| # Default values for output statistics: |
| regularization = 0.0; |
| termination_code = 0.0; |
| min_beta = NaN; |
| i_min_beta = NaN; |
| max_beta = NaN; |
| i_max_beta = NaN; |
| intercept_value = NaN; |
| dispersion = NaN; |
| estimated_dispersion = NaN; |
| deviance_nodisp = NaN; |
| deviance = NaN; |
| |
| ##### INITIALIZE THE PARAMETERS ##### |
| |
| num_records = nrow (X); |
| num_features = ncol (X); |
| zeros_r = matrix (0, rows = num_records, cols = 1); |
| ones_r = 1 + zeros_r; |
| |
| # Introduce the intercept, shift and rescale the columns of X if needed |
| |
| if (intercept_status == 1 | intercept_status == 2) { # add the intercept column |
| X = cbind (X, ones_r); |
| num_features = ncol (X); |
| } |
| |
| scale_lambda = matrix (1, rows = num_features, cols = 1); |
| if (intercept_status == 1 | intercept_status == 2) { |
| scale_lambda [num_features, 1] = 0; |
| } |
| |
| if (intercept_status == 2) { # scale-&-shift X columns to mean 0, variance 1 |
| # Important assumption: X [, num_features] = ones_r |
| avg_X_cols = t(colSums(X)) / num_records; |
| var_X_cols = (t(colSums (X ^ 2)) - num_records * (avg_X_cols ^ 2)) / (num_records - 1); |
| is_unsafe = (var_X_cols <= 0); |
| scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe); |
| scale_X [num_features, 1] = 1; |
| shift_X = - avg_X_cols * scale_X; |
| shift_X [num_features, 1] = 0; |
| rowSums_X_sq = (X ^ 2) %*% (scale_X ^ 2) + X %*% (2 * scale_X * shift_X) + sum (shift_X ^ 2); |
| } else { |
| scale_X = matrix (1, rows = num_features, cols = 1); |
| shift_X = matrix (0, rows = num_features, cols = 1); |
| rowSums_X_sq = rowSums (X ^ 2); |
| } |
| |
| # Henceforth we replace "X" with "X %*% (SHIFT/SCALE TRANSFORM)" and rowSums(X ^ 2) |
| # with "rowSums_X_sq" in order to preserve the sparsity of X under shift and scale. |
| # The transform is then associatively applied to the other side of the expression, |
| # and is rewritten via "scale_X" and "shift_X" as follows: |
| # |
| # ssX_A = (SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as: |
| # ssX_A = diag (scale_X) %*% A; |
| # ssX_A [num_features, ] = ssX_A [num_features, ] + t(shift_X) %*% A; |
| # |
| # tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as: |
| # tssX_A = diag (scale_X) %*% A + shift_X %*% A [num_features, ]; |
| |
| # Initialize other input-dependent parameters |
| |
| lambda = scale_lambda * regularization; |
| if (max_iteration_CG == 0) { |
| max_iteration_CG = num_features; |
| } |
| |
| # Set up the canonical link, if requested [Then we have: Var(mu) * (d link / d mu) = const] |
| |
| if (link_type == 0) { |
| if (distribution_type == 1) { |
| link_type = 1; |
| link_as_power_of_the_mean = 1.0 - variance_as_power_of_the_mean; |
| } else { |
| if (distribution_type == 2) { |
| link_type = 2; |
| } |
| } |
| } |
| |
| # For power distributions and/or links, we use two constants, |
| # "variance as power of the mean" and "link_as_power_of_the_mean", |
| # to specify the variance and the link as arbitrary powers of the |
| # mean. However, the variance-powers of 1.0 (Poisson family) and |
| # 2.0 (Gamma family) have to be treated as special cases, because |
| # these values integrate into logarithms. The link-power of 0.0 |
| # is also special as it represents the logarithm link. |
| |
| num_response_columns = ncol (Y); |
| is_supported = 0; |
| if (num_response_columns == 2 & distribution_type == 2 & link_type >= 1 & link_type <= 4) { # BERNOULLI DISTRIBUTION |
| is_supported = 1; |
| } |
| if (num_response_columns == 1 & distribution_type == 2) { |
| print ("Error: Bernoulli response matrix has not been converted into two-column format."); |
| } |
| |
| if (is_supported == 1) { |
| |
| ##### INITIALIZE THE BETAS ##### |
| |
| [beta, saturated_log_l, isNaN] = |
| glm_initialize (X, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean, intercept_status, max_iteration_CG); |
| |
| # print(" --- saturated logLik " + saturated_log_l); |
| |
| if (isNaN == 0) { |
| |
| ##### START OF THE MAIN PART ##### |
| |
| sum_X_sq = sum (rowSums_X_sq); |
| trust_delta = 0.5 * sqrt (num_features) / max (sqrt (rowSums_X_sq)); |
| ### max_trust_delta = trust_delta * 10000.0; |
| log_l = 0.0; |
| deviance_nodisp = 0.0; |
| new_deviance_nodisp = 0.0; |
| isNaN_log_l = 2; |
| newbeta = beta; |
| g = matrix (0.0, rows = num_features, cols = 1); |
| g_norm = sqrt (sum ((g + lambda * beta) ^ 2)); |
| accept_new_beta = 1; |
| reached_trust_boundary = 0; |
| neg_log_l_change_predicted = 0.0; |
| i_IRLS = 0; |
| |
| # print ("BEGIN IRLS ITERATIONS..."); |
| |
| ssX_newbeta = diag (scale_X) %*% newbeta; |
| ssX_newbeta [num_features, ] = ssX_newbeta [num_features, ] + t(shift_X) %*% newbeta; |
| all_linear_terms = X %*% ssX_newbeta; |
| |
| [new_log_l, isNaN_new_log_l] = glm_log_likelihood_part |
| (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean); |
| |
| if (isNaN_new_log_l == 0) { |
| new_deviance_nodisp = 2.0 * (saturated_log_l - new_log_l); |
| new_log_l = new_log_l - 0.5 * sum (lambda * newbeta ^ 2); |
| } |
| |
| while (termination_code == 0) { |
| accept_new_beta = 1; |
| |
| if (i_IRLS > 0) { |
| if (isNaN_log_l == 0) { |
| accept_new_beta = 0; |
| } |
| |
| # Decide whether to accept a new iteration point and update the trust region |
| # See Alg. 4.1 on p. 69 of "Numerical Optimization" 2nd ed. by Nocedal and Wright |
| |
| rho = (- new_log_l + log_l) / neg_log_l_change_predicted; |
| if (rho < 0.25 | isNaN_new_log_l == 1) { |
| trust_delta = 0.25 * trust_delta; |
| } |
| if (rho > 0.75 & isNaN_new_log_l == 0 & reached_trust_boundary == 1) { |
| trust_delta = 2 * trust_delta; |
| |
| ### if (trust_delta > max_trust_delta) { |
| ### trust_delta = max_trust_delta; |
| ### } |
| } |
| if (rho > 0.1 & isNaN_new_log_l == 0) { |
| accept_new_beta = 1; |
| } |
| } |
| |
| if (accept_new_beta == 1) { |
| beta = newbeta; log_l = new_log_l; deviance_nodisp = new_deviance_nodisp; isNaN_log_l = isNaN_new_log_l; |
| |
| [g_Y, w] = glm_dist (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean); |
| |
| # We introduced these variables to avoid roundoff errors: |
| # g_Y = y_residual / (y_var * link_grad); |
| # w = 1.0 / (y_var * link_grad * link_grad); |
| |
| gXY = - t(X) %*% g_Y; |
| g = diag (scale_X) %*% gXY + shift_X %*% gXY [num_features, ]; |
| g_norm = sqrt (sum ((g + lambda * beta) ^ 2)); |
| } |
| |
| [z, neg_log_l_change_predicted, num_CG_iters, reached_trust_boundary] = |
| get_CG_Steihaug_point (X, scale_X, shift_X, w, g, beta, lambda, trust_delta, max_iteration_CG); |
| |
| newbeta = beta + z; |
| |
| ssX_newbeta = diag (scale_X) %*% newbeta; |
| ssX_newbeta [num_features, ] = ssX_newbeta [num_features, ] + t(shift_X) %*% newbeta; |
| all_linear_terms = X %*% ssX_newbeta; |
| |
| [new_log_l, isNaN_new_log_l] = glm_log_likelihood_part |
| (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean); |
| |
| if (isNaN_new_log_l == 0) { |
| new_deviance_nodisp = 2.0 * (saturated_log_l - new_log_l); |
| new_log_l = new_log_l - 0.5 * sum (lambda * newbeta ^ 2); |
| } |
| |
| log_l_change = new_log_l - log_l; # R's criterion for termination: |dev - devold|/(|dev| + 0.1) < eps |
| |
| if (reached_trust_boundary == 0 & isNaN_new_log_l == 0 & |
| (2.0 * abs (log_l_change) < eps * (deviance_nodisp + 0.1) | abs (log_l_change) < (abs (log_l) + abs (new_log_l)) * 0.00000000000001) ) { |
| termination_code = 1; |
| } |
| rho = - log_l_change / neg_log_l_change_predicted; |
| z_norm = sqrt (sum (z * z)); |
| |
| i_IRLS = i_IRLS + 1; |
| |
| if (i_IRLS == max_iteration_IRLS) { |
| termination_code = 2; |
| } |
| } |
| |
| beta = newbeta; |
| log_l = new_log_l; |
| deviance_nodisp = new_deviance_nodisp; |
| |
| #---------------------------- last part |
| |
| if (termination_code != 1) { |
| print ("One of the runs of GLM did not converged in " + i_IRLS + " steps!"); |
| } |
| |
| ##### COMPUTE AIC ##### |
| |
| if (distribution_type == 2 & link_type >= 1 & link_type <= 4) { |
| AIC = -2 * log_l; |
| if (sum (X) != 0) { |
| AIC = AIC + 2 * num_features; |
| } |
| } else { |
| stop ("Currently only the Bernoulli distribution family the following link functions are supported: log, logit, probit, and cloglog!"); |
| } |
| |
| if (fileB != " ") { |
| fileO = ifdef ($O, " "); |
| fileS = $S; |
| fmt = ifdef ($fmt, "text"); |
| |
| # Output which features give the best AIC and are being used for linear regression |
| write (Selected, fileS, format=fmt); |
| |
| ssX_beta = diag (scale_X) %*% beta; |
| ssX_beta [num_features, ] = ssX_beta [num_features, ] + t(shift_X) %*% beta; |
| if (intercept_status == 2) { |
| beta_out = cbind (ssX_beta, beta); |
| } else { |
| beta_out = ssX_beta; |
| } |
| |
| if (intercept_status == 0 & num_features == 1) { |
| p = sum (X == 1); |
| if (p == num_records) { |
| beta_out = beta_out[1,]; |
| } |
| } |
| |
| |
| if (intercept_status == 1 | intercept_status == 2) { |
| intercept_value = as.scalar (beta_out [num_features, 1]); |
| beta_noicept = beta_out [1 : (num_features - 1), 1]; |
| } else { |
| beta_noicept = beta_out [1 : num_features, 1]; |
| } |
| min_beta = min (beta_noicept); |
| max_beta = max (beta_noicept); |
| tmp_i_min_beta = rowIndexMin (t(beta_noicept)) |
| i_min_beta = as.scalar (tmp_i_min_beta [1, 1]); |
| tmp_i_max_beta = rowIndexMax (t(beta_noicept)) |
| i_max_beta = as.scalar (tmp_i_max_beta [1, 1]); |
| |
| ##### OVER-DISPERSION PART ##### |
| |
| all_linear_terms = X %*% ssX_beta; |
| [g_Y, w] = glm_dist (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean); |
| |
| pearson_residual_sq = g_Y ^ 2 / w; |
| pearson_residual_sq = replace (target = pearson_residual_sq, pattern = NaN, replacement = 0); |
| # pearson_residual_sq = (y_residual ^ 2) / y_var; |
| |
| if (num_records > num_features) { |
| estimated_dispersion = sum (pearson_residual_sq) / (num_records - num_features); |
| } |
| if (dispersion <= 0) { |
| dispersion = estimated_dispersion; |
| } |
| deviance = deviance_nodisp / dispersion; |
| |
| ##### END OF THE MAIN PART ##### |
| |
| str = "BETA_MIN," + min_beta; |
| str = append (str, "BETA_MIN_INDEX," + i_min_beta); |
| str = append (str, "BETA_MAX," + max_beta); |
| str = append (str, "BETA_MAX_INDEX," + i_max_beta); |
| str = append (str, "INTERCEPT," + intercept_value); |
| str = append (str, "DISPERSION," + dispersion); |
| str = append (str, "DISPERSION_EST," + estimated_dispersion); |
| str = append (str, "DEVIANCE_UNSCALED," + deviance_nodisp); |
| str = append (str, "DEVIANCE_SCALED," + deviance); |
| |
| if (fileO != " ") { |
| write (str, fileO); |
| } |
| else { |
| print (str); |
| } |
| |
| # Prepare the output matrix |
| print ("Writing the output matrix..."); |
| if (intercept_status == 0 & num_features == 1) { |
| if (p == num_records) { |
| beta_out_tmp = matrix (0, rows = num_features_orig + 1, cols = 1); |
| beta_out_tmp[num_features_orig + 1,] = beta_out; |
| beta_out = beta_out_tmp; |
| write (beta_out, fileB, format=fmt); |
| stop (""); |
| } else if (sum (X) == 0){ |
| beta_out = matrix (0, rows = num_features_orig, cols = 1); |
| write (beta_out, fileB, format=fmt); |
| stop (""); |
| } |
| } |
| |
| no_selected = ncol (Selected); |
| max_selected = max (Selected); |
| last = max_selected + 1; |
| |
| if (intercept_status != 0) { |
| |
| Selected_ext = cbind (Selected, as.matrix (last)); |
| P1 = table (seq (1, ncol (Selected_ext)), t(Selected_ext)); |
| |
| if (intercept_status == 2) { |
| |
| P1_ssX_beta = P1 * ssX_beta; |
| P2_ssX_beta = colSums (P1_ssX_beta); |
| P1_beta = P1 * beta; |
| P2_beta = colSums (P1_beta); |
| |
| if (max_selected < num_features_orig) { |
| |
| P2_ssX_beta = cbind (P2_ssX_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected))); |
| P2_beta = cbind (P2_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected))); |
| |
| P2_ssX_beta[1, num_features_orig+1] = P2_ssX_beta[1, max_selected + 1]; |
| P2_ssX_beta[1, max_selected + 1] = 0; |
| |
| P2_beta[1, num_features_orig+1] = P2_beta[1, max_selected + 1]; |
| P2_beta[1, max_selected + 1] = 0; |
| |
| } |
| beta_out = cbind (t(P2_ssX_beta), t(P2_beta)); |
| |
| } else { |
| |
| P1_beta = P1 * beta; |
| P2_beta = colSums (P1_beta); |
| |
| if (max_selected < num_features_orig) { |
| P2_beta = cbind (P2_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected))); |
| P2_beta[1, num_features_orig+1] = P2_beta[1, max_selected + 1] ; |
| P2_beta[1, max_selected + 1] = 0; |
| } |
| beta_out = t(P2_beta); |
| |
| } |
| } else { |
| |
| P1 = table (seq (1, no_selected), t(Selected)); |
| P1_beta = P1 * beta; |
| P2_beta = colSums (P1_beta); |
| |
| if (max_selected < num_features_orig) { |
| P2_beta = cbind (P2_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected))); |
| } |
| |
| beta_out = t(P2_beta); |
| } |
| |
| write ( beta_out, fileB, format=fmt ); |
| |
| } |
| |
| } else { |
| stop ("Input matrices X and/or Y are out of range!"); |
| } |
| } else { |
| stop ("Response matrix with " + num_response_columns + " columns, distribution family (" + distribution_type + ", " + variance_as_power_of_the_mean |
| + ") and link family (" + link_type + ", " + link_as_power_of_the_mean + ") are NOT supported together."); |
| } |
| } |
| |
| glm_initialize = function (Matrix[double] X, Matrix[double] Y, int dist_type, double var_power, int link_type, double link_power, int icept_status, int max_iter_CG) |
| return (Matrix[double] beta, double saturated_log_l, int isNaN) |
| { |
| saturated_log_l = 0.0; |
| isNaN = 0; |
| y_corr = Y [, 1]; |
| if (dist_type == 2) { |
| n_corr = rowSums (Y); |
| is_n_zero = (n_corr == 0); |
| y_corr = Y [, 1] / (n_corr + is_n_zero) + (0.5 - Y [, 1]) * is_n_zero; |
| } |
| linear_terms = y_corr; |
| if (dist_type == 1 & link_type == 1) { # POWER DISTRIBUTION |
| if (link_power == 0) { |
| if (sum (y_corr < 0) == 0) { |
| is_zero_y_corr = (y_corr == 0); |
| linear_terms = log (y_corr + is_zero_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); |
| } else { isNaN = 1; } |
| } else { if (link_power == 1.0) { |
| linear_terms = y_corr; |
| } else { if (link_power == -1.0) { |
| linear_terms = 1.0 / y_corr; |
| } else { if (link_power == 0.5) { |
| if (sum (y_corr < 0) == 0) { |
| linear_terms = sqrt (y_corr); |
| } else { isNaN = 1; } |
| } else { if (link_power > 0) { |
| if (sum (y_corr < 0) == 0) { |
| is_zero_y_corr = (y_corr == 0); |
| linear_terms = (y_corr + is_zero_y_corr) ^ link_power - is_zero_y_corr; |
| } else { isNaN = 1; } |
| } else { |
| if (sum (y_corr <= 0) == 0) { |
| linear_terms = y_corr ^ link_power; |
| } else { isNaN = 1; } |
| }}}}} |
| } |
| if (dist_type == 2 & link_type >= 1 & link_type <= 5) |
| { # BINOMIAL/BERNOULLI DISTRIBUTION |
| if (link_type == 1 & link_power == 0) { # Binomial.log |
| if (sum (y_corr < 0) == 0) { |
| is_zero_y_corr = (y_corr == 0); |
| linear_terms = log (y_corr + is_zero_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); |
| } else { isNaN = 1; } |
| } else { if (link_type == 1 & link_power > 0) { # Binomial.power_nonlog pos |
| if (sum (y_corr < 0) == 0) { |
| is_zero_y_corr = (y_corr == 0); |
| linear_terms = (y_corr + is_zero_y_corr) ^ link_power - is_zero_y_corr; |
| } else { isNaN = 1; } |
| } else { if (link_type == 1) { # Binomial.power_nonlog neg |
| if (sum (y_corr <= 0) == 0) { |
| linear_terms = y_corr ^ link_power; |
| } else { isNaN = 1; } |
| } else { |
| is_zero_y_corr = (y_corr <= 0); |
| is_one_y_corr = (y_corr >= 1.0); |
| y_corr = y_corr * (1.0 - is_zero_y_corr) * (1.0 - is_one_y_corr) + 0.5 * (is_zero_y_corr + is_one_y_corr); |
| if (link_type == 2) { # Binomial.logit |
| linear_terms = log (y_corr / (1.0 - y_corr)) |
| + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); |
| } else { if (link_type == 3) { # Binomial.probit |
| y_below_half = y_corr + (1.0 - 2.0 * y_corr) * (y_corr > 0.5); |
| t = sqrt (- 2.0 * log (y_below_half)); |
| approx_inv_Gauss_CDF = - t + (2.515517 + t * (0.802853 + t * 0.010328)) / (1.0 + t * (1.432788 + t * (0.189269 + t * 0.001308))); |
| linear_terms = approx_inv_Gauss_CDF * (1.0 - 2.0 * (y_corr > 0.5)) |
| + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); |
| } else { if (link_type == 4) { # Binomial.cloglog |
| linear_terms = log (- log (1.0 - y_corr)) |
| - log (- log (0.5)) * (is_zero_y_corr + is_one_y_corr) |
| + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); |
| } else { if (link_type == 5) { # Binomial.cauchit |
| linear_terms = tan ((y_corr - 0.5) * pi) |
| + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); |
| }} }}}}} |
| } |
| |
| if (isNaN == 0) { |
| [saturated_log_l, isNaN] = |
| glm_log_likelihood_part (linear_terms, Y, dist_type, var_power, link_type, link_power); |
| } |
| |
| if ((dist_type == 1 & link_type == 1 & link_power == 0) | |
| (dist_type == 2 & link_type >= 2)) |
| { |
| desired_eta = 0.0; |
| } else { if (link_type == 1 & link_power == 0) { |
| desired_eta = log (0.5); |
| } else { if (link_type == 1) { |
| desired_eta = 0.5 ^ link_power; |
| } else { |
| desired_eta = 0.5; |
| }}} |
| |
| beta = matrix (0.0, rows = ncol(X), cols = 1); |
| |
| if (desired_eta != 0) { |
| if (icept_status == 1 | icept_status == 2) { |
| beta [nrow(beta), 1] = desired_eta; |
| } else { |
| # We want: avg (X %*% ssX_transform %*% beta) = desired_eta |
| # Note that "ssX_transform" is trivial here, hence ignored |
| |
| beta = straightenX (X, 0.000001, max_iter_CG); |
| beta = beta * desired_eta; |
| } } } |
| |
| |
| glm_dist = function (Matrix[double] linear_terms, Matrix[double] Y, |
| int dist_type, double var_power, int link_type, double link_power) |
| return (Matrix[double] g_Y, Matrix[double] w) |
| # ORIGINALLY we returned more meaningful vectors, namely: |
| # Matrix[double] y_residual : y - y_mean, i.e. y observed - y predicted |
| # Matrix[double] link_gradient : derivative of the link function |
| # Matrix[double] var_function : variance without dispersion, i.e. the V(mu) function |
| # BUT, this caused roundoff errors, so we had to compute "directly useful" vectors |
| # and skip over the "meaningful intermediaries". Now we output these two variables: |
| # g_Y = y_residual / (var_function * link_gradient); |
| # w = 1.0 / (var_function * link_gradient ^ 2); |
| { |
| num_records = nrow (linear_terms); |
| zeros_r = matrix (0.0, rows = num_records, cols = 1); |
| ones_r = 1 + zeros_r; |
| g_Y = zeros_r; |
| w = zeros_r; |
| |
| # Some constants |
| |
| one_over_sqrt_two_pi = 0.39894228040143267793994605993438; |
| ones_2 = matrix (1.0, rows = 1, cols = 2); |
| p_one_m_one = ones_2; |
| p_one_m_one [1, 2] = -1.0; |
| m_one_p_one = ones_2; |
| m_one_p_one [1, 1] = -1.0; |
| zero_one = ones_2; |
| zero_one [1, 1] = 0.0; |
| one_zero = ones_2; |
| one_zero [1, 2] = 0.0; |
| flip_pos = matrix (0, rows = 2, cols = 2); |
| flip_neg = flip_pos; |
| flip_pos [1, 2] = 1; |
| flip_pos [2, 1] = 1; |
| flip_neg [1, 2] = -1; |
| flip_neg [2, 1] = 1; |
| |
| if (dist_type == 1 & link_type == 1) { # POWER DISTRIBUTION |
| y_mean = zeros_r; |
| if (link_power == 0) { |
| y_mean = exp (linear_terms); |
| y_mean_pow = y_mean ^ (1 - var_power); |
| w = y_mean_pow * y_mean; |
| g_Y = y_mean_pow * (Y - y_mean); |
| } else { if (link_power == 1.0) { |
| y_mean = linear_terms; |
| w = y_mean ^ (- var_power); |
| g_Y = w * (Y - y_mean); |
| } else { |
| y_mean = linear_terms ^ (1.0 / link_power); |
| c1 = (1 - var_power) / link_power - 1; |
| c2 = (2 - var_power) / link_power - 2; |
| g_Y = (linear_terms ^ c1) * (Y - y_mean) / link_power; |
| w = (linear_terms ^ c2) / (link_power ^ 2); |
| } }} |
| if (dist_type == 2 & link_type >= 1 & link_type <= 5) |
| { # BINOMIAL/BERNOULLI DISTRIBUTION |
| if (link_type == 1) { # BINOMIAL.POWER LINKS |
| if (link_power == 0) { # Binomial.log |
| vec1 = 1 / (exp (- linear_terms) - 1); |
| g_Y = Y [, 1] - Y [, 2] * vec1; |
| w = rowSums (Y) * vec1; |
| } else { # Binomial.nonlog |
| vec1 = zeros_r; |
| if (link_power == 0.5) { |
| vec1 = 1 / (1 - linear_terms ^ 2); |
| } else { if (sum (linear_terms < 0) == 0) { |
| vec1 = linear_terms ^ (- 2 + 1 / link_power) / (1 - linear_terms ^ (1 / link_power)); |
| } else {isNaN = 1;}} |
| # We want a "zero-protected" version of |
| # vec2 = Y [, 1] / linear_terms; |
| is_y_0 = (Y [, 1] == 0); |
| vec2 = (Y [, 1] + is_y_0) / (linear_terms * (1 - is_y_0) + is_y_0) - is_y_0; |
| g_Y = (vec2 - Y [, 2] * vec1 * linear_terms) / link_power; |
| w = rowSums (Y) * vec1 / link_power ^ 2; |
| } |
| } else { |
| is_LT_pos_infinite = (linear_terms == Inf); |
| is_LT_neg_infinite = (linear_terms == -Inf); |
| is_LT_infinite = is_LT_pos_infinite %*% one_zero + is_LT_neg_infinite %*% zero_one; |
| finite_linear_terms = replace (target = linear_terms, pattern = Inf, replacement = 0); |
| finite_linear_terms = replace (target = finite_linear_terms, pattern = -Inf, replacement = 0); |
| if (link_type == 2) { # Binomial.logit |
| Y_prob = exp (finite_linear_terms) %*% one_zero + ones_r %*% zero_one; |
| Y_prob = Y_prob / (rowSums (Y_prob) %*% ones_2); |
| Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite; |
| g_Y = rowSums (Y * (Y_prob %*% flip_neg)); ### = y_residual; |
| w = rowSums (Y * (Y_prob %*% flip_pos) * Y_prob); ### = y_variance; |
| } else { if (link_type == 3) { # Binomial.probit |
| is_lt_pos = (linear_terms >= 0); |
| t_gp = 1.0 / (1.0 + abs (finite_linear_terms) * 0.231641888); # 0.231641888 = 0.3275911 / sqrt (2.0) |
| pt_gp = t_gp * ( 0.254829592 |
| + t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun, |
| + t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299 |
| + t_gp * (-1.453152027 |
| + t_gp * 1.061405429)))); |
| the_gauss_exp = exp (- (linear_terms ^ 2) / 2.0); |
| vec1 = 0.25 * pt_gp * (2 - the_gauss_exp * pt_gp); |
| vec2 = Y [, 1] - rowSums (Y) * is_lt_pos + the_gauss_exp * pt_gp * rowSums (Y) * (is_lt_pos - 0.5); |
| w = the_gauss_exp * (one_over_sqrt_two_pi ^ 2) * rowSums (Y) / vec1; |
| g_Y = one_over_sqrt_two_pi * vec2 / vec1; |
| } else { if (link_type == 4) { # Binomial.cloglog |
| the_exp = exp (linear_terms) |
| the_exp_exp = exp (- the_exp); |
| is_too_small = ((10000000 + the_exp) == 10000000); |
| the_exp_ratio = (1 - is_too_small) * (1 - the_exp_exp) / (the_exp + is_too_small) + is_too_small * (1 - the_exp / 2); |
| g_Y = (rowSums (Y) * the_exp_exp - Y [, 2]) / the_exp_ratio; |
| w = the_exp_exp * the_exp * rowSums (Y) / the_exp_ratio; |
| } else { if (link_type == 5) { # Binomial.cauchit |
| Y_prob = 0.5 + (atan (finite_linear_terms) %*% p_one_m_one) / pi; |
| Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite; |
| y_residual = Y [, 1] * Y_prob [, 2] - Y [, 2] * Y_prob [, 1]; |
| var_function = rowSums (Y) * Y_prob [, 1] * Y_prob [, 2]; |
| link_gradient_normalized = (1 + linear_terms ^ 2) * pi; |
| g_Y = rowSums (Y) * y_residual / (var_function * link_gradient_normalized); |
| w = (rowSums (Y) ^ 2) / (var_function * link_gradient_normalized ^ 2); |
| }}}} |
| } |
| } |
| } |
| |
| |
| glm_log_likelihood_part = function (Matrix[double] linear_terms, Matrix[double] Y, |
| int dist_type, double var_power, int link_type, double link_power) |
| return (double log_l, int isNaN) |
| { |
| isNaN = 0; |
| log_l = 0.0; |
| num_records = nrow (Y); |
| zeros_r = matrix (0.0, rows = num_records, cols = 1); |
| |
| if (dist_type == 1 & link_type == 1) |
| { # POWER DISTRIBUTION |
| b_cumulant = zeros_r; |
| natural_parameters = zeros_r; |
| is_natural_parameter_log_zero = zeros_r; |
| if (var_power == 1.0 & link_power == 0) { # Poisson.log |
| b_cumulant = exp (linear_terms); |
| is_natural_parameter_log_zero = (linear_terms == -Inf); |
| natural_parameters = replace (target = linear_terms, pattern = -Inf, replacement = 0); |
| } else { if (var_power == 1.0 & link_power == 1.0) { # Poisson.id |
| if (sum (linear_terms < 0) == 0) { |
| b_cumulant = linear_terms; |
| is_natural_parameter_log_zero = (linear_terms == 0); |
| natural_parameters = log (linear_terms + is_natural_parameter_log_zero); |
| } else {isNaN = 1;} |
| } else { if (var_power == 1.0 & link_power == 0.5) { # Poisson.sqrt |
| if (sum (linear_terms < 0) == 0) { |
| b_cumulant = linear_terms ^ 2; |
| is_natural_parameter_log_zero = (linear_terms == 0); |
| natural_parameters = 2.0 * log (linear_terms + is_natural_parameter_log_zero); |
| } else {isNaN = 1;} |
| } else { if (var_power == 1.0 & link_power > 0) { # Poisson.power_nonlog, pos |
| if (sum (linear_terms < 0) == 0) { |
| is_natural_parameter_log_zero = (linear_terms == 0); |
| b_cumulant = (linear_terms + is_natural_parameter_log_zero) ^ (1.0 / link_power) - is_natural_parameter_log_zero; |
| natural_parameters = log (linear_terms + is_natural_parameter_log_zero) / link_power; |
| } else {isNaN = 1;} |
| } else { if (var_power == 1.0) { # Poisson.power_nonlog, neg |
| if (sum (linear_terms <= 0) == 0) { |
| b_cumulant = linear_terms ^ (1.0 / link_power); |
| natural_parameters = log (linear_terms) / link_power; |
| } else {isNaN = 1;} |
| } else { if (var_power == 2.0 & link_power == -1.0) { # Gamma.inverse |
| if (sum (linear_terms <= 0) == 0) { |
| b_cumulant = - log (linear_terms); |
| natural_parameters = - linear_terms; |
| } else {isNaN = 1;} |
| } else { if (var_power == 2.0 & link_power == 1.0) { # Gamma.id |
| if (sum (linear_terms <= 0) == 0) { |
| b_cumulant = log (linear_terms); |
| natural_parameters = - 1.0 / linear_terms; |
| } else {isNaN = 1;} |
| } else { if (var_power == 2.0 & link_power == 0) { # Gamma.log |
| b_cumulant = linear_terms; |
| natural_parameters = - exp (- linear_terms); |
| } else { if (var_power == 2.0) { # Gamma.power_nonlog |
| if (sum (linear_terms <= 0) == 0) { |
| b_cumulant = log (linear_terms) / link_power; |
| natural_parameters = - linear_terms ^ (- 1.0 / link_power); |
| } else {isNaN = 1;} |
| } else { if (link_power == 0) { # PowerDist.log |
| natural_parameters = exp (linear_terms * (1.0 - var_power)) / (1.0 - var_power); |
| b_cumulant = exp (linear_terms * (2.0 - var_power)) / (2.0 - var_power); |
| } else { # PowerDist.power_nonlog |
| if (-2 * link_power == 1.0 - var_power) { |
| natural_parameters = 1.0 / (linear_terms ^ 2) / (1.0 - var_power); |
| } else { if (-1 * link_power == 1.0 - var_power) { |
| natural_parameters = 1.0 / linear_terms / (1.0 - var_power); |
| } else { if ( link_power == 1.0 - var_power) { |
| natural_parameters = linear_terms / (1.0 - var_power); |
| } else { if ( 2 * link_power == 1.0 - var_power) { |
| natural_parameters = linear_terms ^ 2 / (1.0 - var_power); |
| } else { |
| if (sum (linear_terms <= 0) == 0) { |
| power = (1.0 - var_power) / link_power; |
| natural_parameters = (linear_terms ^ power) / (1.0 - var_power); |
| } else {isNaN = 1;} |
| }}}} |
| if (-2 * link_power == 2.0 - var_power) { |
| b_cumulant = 1.0 / (linear_terms ^ 2) / (2.0 - var_power); |
| } else { if (-1 * link_power == 2.0 - var_power) { |
| b_cumulant = 1.0 / linear_terms / (2.0 - var_power); |
| } else { if ( link_power == 2.0 - var_power) { |
| b_cumulant = linear_terms / (2.0 - var_power); |
| } else { if ( 2 * link_power == 2.0 - var_power) { |
| b_cumulant = linear_terms ^ 2 / (2.0 - var_power); |
| } else { |
| if (sum (linear_terms <= 0) == 0) { |
| power = (2.0 - var_power) / link_power; |
| b_cumulant = (linear_terms ^ power) / (2.0 - var_power); |
| } else {isNaN = 1;} |
| }}}} |
| }}}}} }}}}} |
| if (sum (is_natural_parameter_log_zero * abs (Y)) > 0) { |
| log_l = -Inf; |
| isNaN = 1; |
| } |
| if (isNaN == 0) |
| { |
| log_l = sum (Y * natural_parameters - b_cumulant); |
| if (log_l != log_l | (log_l == log_l + 1.0 & log_l == log_l * 2.0)) { |
| log_l = -Inf; |
| isNaN = 1; |
| } } } |
| |
| if (dist_type == 2 & link_type >= 1 & link_type <= 5) |
| { # BINOMIAL/BERNOULLI DISTRIBUTION |
| |
| [Y_prob, isNaN] = binomial_probability_two_column (linear_terms, link_type, link_power); |
| |
| if (isNaN == 0) { |
| does_prob_contradict = (Y_prob <= 0); |
| if (sum (does_prob_contradict * abs (Y)) == 0) { |
| log_l = sum (Y * log (Y_prob * (1 - does_prob_contradict) + does_prob_contradict)); |
| if (log_l != log_l | (log_l == log_l + 1.0 & log_l == log_l * 2.0)) { |
| isNaN = 1; |
| } |
| } else { |
| log_l = -Inf; |
| isNaN = 1; |
| } } } |
| |
| if (isNaN == 1) { |
| log_l = - Inf; |
| } |
| } |
| |
| |
| |
| binomial_probability_two_column = |
| function (Matrix[double] linear_terms, int link_type, double link_power) |
| return (Matrix[double] Y_prob, int isNaN) |
| { |
| isNaN = 0; |
| num_records = nrow (linear_terms); |
| |
| # Define some auxiliary matrices |
| |
| ones_2 = matrix (1.0, rows = 1, cols = 2); |
| p_one_m_one = ones_2; |
| p_one_m_one [1, 2] = -1.0; |
| m_one_p_one = ones_2; |
| m_one_p_one [1, 1] = -1.0; |
| zero_one = ones_2; |
| zero_one [1, 1] = 0.0; |
| one_zero = ones_2; |
| one_zero [1, 2] = 0.0; |
| |
| zeros_r = matrix (0.0, rows = num_records, cols = 1); |
| ones_r = 1.0 + zeros_r; |
| |
| # Begin the function body |
| |
| Y_prob = zeros_r %*% ones_2; |
| if (link_type == 1) { # Binomial.power |
| if (link_power == 0) { # Binomial.log |
| Y_prob = exp (linear_terms) %*% p_one_m_one + ones_r %*% zero_one; |
| } else { if (link_power == 0.5) { # Binomial.sqrt |
| Y_prob = (linear_terms ^ 2) %*% p_one_m_one + ones_r %*% zero_one; |
| } else { # Binomial.power_nonlog |
| if (sum (linear_terms < 0) == 0) { |
| Y_prob = (linear_terms ^ (1.0 / link_power)) %*% p_one_m_one + ones_r %*% zero_one; |
| } else {isNaN = 1;} |
| }} |
| } else { # Binomial.non_power |
| is_LT_pos_infinite = (linear_terms == Inf); |
| is_LT_neg_infinite = (linear_terms == -Inf); |
| is_LT_infinite = is_LT_pos_infinite %*% one_zero + is_LT_neg_infinite %*% zero_one; |
| finite_linear_terms = replace (target = linear_terms, pattern = Inf, replacement = 0); |
| finite_linear_terms = replace (target = finite_linear_terms, pattern = -Inf, replacement = 0); |
| if (link_type == 2) { # Binomial.logit |
| Y_prob = exp (finite_linear_terms) %*% one_zero + ones_r %*% zero_one; |
| Y_prob = Y_prob / (rowSums (Y_prob) %*% ones_2); |
| } else { if (link_type == 3) { # Binomial.probit |
| lt_pos_neg = (finite_linear_terms >= 0) %*% p_one_m_one + ones_r %*% zero_one; |
| t_gp = 1.0 / (1.0 + abs (finite_linear_terms) * 0.231641888); # 0.231641888 = 0.3275911 / sqrt (2.0) |
| pt_gp = t_gp * ( 0.254829592 |
| + t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun, |
| + t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299 |
| + t_gp * (-1.453152027 |
| + t_gp * 1.061405429)))); |
| the_gauss_exp = exp (- (finite_linear_terms ^ 2) / 2.0); |
| Y_prob = lt_pos_neg + ((the_gauss_exp * pt_gp) %*% ones_2) * (0.5 - lt_pos_neg); |
| } else { if (link_type == 4) { # Binomial.cloglog |
| the_exp = exp (finite_linear_terms); |
| the_exp_exp = exp (- the_exp); |
| is_too_small = ((10000000 + the_exp) == 10000000); |
| Y_prob [, 1] = (1 - is_too_small) * (1 - the_exp_exp) + is_too_small * the_exp * (1 - the_exp / 2); |
| Y_prob [, 2] = the_exp_exp; |
| } else { if (link_type == 5) { # Binomial.cauchit |
| Y_prob = 0.5 + (atan (finite_linear_terms) %*% p_one_m_one) / pi; |
| } else { |
| isNaN = 1; |
| }}}} |
| Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite; |
| } } |
| |
| |
| # THE CG-STEIHAUG PROCEDURE SCRIPT |
| |
| # Apply Conjugate Gradient - Steihaug algorithm in order to approximately minimize |
| # 0.5 z^T (X^T diag(w) X + diag (lambda)) z + (g + lambda * beta)^T z |
| # under constraint: ||z|| <= trust_delta. |
| # See Alg. 7.2 on p. 171 of "Numerical Optimization" 2nd ed. by Nocedal and Wright |
| # IN THE ABOVE, "X" IS UNDERSTOOD TO BE "X %*% (SHIFT/SCALE TRANSFORM)"; this transform |
| # is given separately because sparse "X" may become dense after applying the transform. |
| # |
| get_CG_Steihaug_point = |
| function (Matrix[double] X, Matrix[double] scale_X, Matrix[double] shift_X, Matrix[double] w, |
| Matrix[double] g, Matrix[double] beta, Matrix[double] lambda, double trust_delta, int max_iter_CG) |
| return (Matrix[double] z, double neg_log_l_change, int i_CG, int reached_trust_boundary) |
| { |
| trust_delta_sq = trust_delta ^ 2; |
| size_CG = nrow (g); |
| z = matrix (0.0, rows = size_CG, cols = 1); |
| neg_log_l_change = 0.0; |
| reached_trust_boundary = 0; |
| g_reg = g + lambda * beta; |
| r_CG = g_reg; |
| p_CG = -r_CG; |
| rr_CG = sum(r_CG * r_CG); |
| eps_CG = rr_CG * min (0.25, sqrt (rr_CG)); |
| converged_CG = 0; |
| if (rr_CG < eps_CG) { |
| converged_CG = 1; |
| } |
| |
| max_iteration_CG = max_iter_CG; |
| if (max_iteration_CG <= 0) { |
| max_iteration_CG = size_CG; |
| } |
| i_CG = 0; |
| while (converged_CG == 0) |
| { |
| i_CG = i_CG + 1; |
| ssX_p_CG = diag (scale_X) %*% p_CG; |
| ssX_p_CG [size_CG, ] = ssX_p_CG [size_CG, ] + t(shift_X) %*% p_CG; |
| temp_CG = t(X) %*% (w * (X %*% ssX_p_CG)); |
| q_CG = (lambda * p_CG) + diag (scale_X) %*% temp_CG + shift_X %*% temp_CG [size_CG, ]; |
| pq_CG = sum (p_CG * q_CG); |
| if (pq_CG <= 0) { |
| pp_CG = sum (p_CG * p_CG); |
| if (pp_CG > 0) { |
| [z, neg_log_l_change] = |
| get_trust_boundary_point (g_reg, z, p_CG, q_CG, r_CG, pp_CG, pq_CG, trust_delta_sq); |
| reached_trust_boundary = 1; |
| } else { |
| neg_log_l_change = 0.5 * sum (z * (r_CG + g_reg)); |
| } |
| converged_CG = 1; |
| } |
| if (converged_CG == 0) { |
| alpha_CG = rr_CG / pq_CG; |
| new_z = z + alpha_CG * p_CG; |
| if (sum(new_z * new_z) >= trust_delta_sq) { |
| pp_CG = sum (p_CG * p_CG); |
| [z, neg_log_l_change] = |
| get_trust_boundary_point (g_reg, z, p_CG, q_CG, r_CG, pp_CG, pq_CG, trust_delta_sq); |
| reached_trust_boundary = 1; |
| converged_CG = 1; |
| } |
| if (converged_CG == 0) { |
| z = new_z; |
| old_rr_CG = rr_CG; |
| r_CG = r_CG + alpha_CG * q_CG; |
| rr_CG = sum(r_CG * r_CG); |
| if (i_CG == max_iteration_CG | rr_CG < eps_CG) { |
| neg_log_l_change = 0.5 * sum (z * (r_CG + g_reg)); |
| reached_trust_boundary = 0; |
| converged_CG = 1; |
| } |
| if (converged_CG == 0) { |
| p_CG = -r_CG + (rr_CG / old_rr_CG) * p_CG; |
| } } } } } |
| |
| |
| # An auxiliary function used twice inside the CG-STEIHAUG loop: |
| get_trust_boundary_point = |
| function (Matrix[double] g, Matrix[double] z, Matrix[double] p, |
| Matrix[double] q, Matrix[double] r, double pp, double pq, |
| double trust_delta_sq) |
| return (Matrix[double] new_z, double f_change) |
| { |
| zz = sum (z * z); pz = sum (p * z); |
| sq_root_d = sqrt (pz * pz - pp * (zz - trust_delta_sq)); |
| tau_1 = (- pz + sq_root_d) / pp; |
| tau_2 = (- pz - sq_root_d) / pp; |
| zq = sum (z * q); gp = sum (g * p); |
| f_extra = 0.5 * sum (z * (r + g)); |
| f_change_1 = f_extra + (0.5 * tau_1 * pq + zq + gp) * tau_1; |
| f_change_2 = f_extra + (0.5 * tau_2 * pq + zq + gp) * tau_2; |
| ind1 = as.integer(f_change_1 < f_change_2); |
| ind2 = as.integer(f_change_1 >= f_change_2); |
| new_z = z + ((ind1 * tau_1 + ind2 * tau_2) * p); |
| f_change = ind1 * f_change_1 + ind2 * f_change_2; |
| } |
| |
| |
| # Computes vector w such that ||X %*% w - 1|| -> MIN given avg(X %*% w) = 1 |
| # We find z_LS such that ||X %*% z_LS - 1|| -> MIN unconditionally, then scale |
| # it to compute w = c * z_LS such that sum(X %*% w) = nrow(X). |
| straightenX = |
| function (Matrix[double] X, double eps, int max_iter_CG) |
| return (Matrix[double] w) |
| { |
| w_X = t(colSums(X)); |
| lambda_LS = 0.000001 * sum(X ^ 2) / ncol(X); |
| eps_LS = eps * nrow(X); |
| |
| # BEGIN LEAST SQUARES |
| |
| r_LS = - w_X; |
| z_LS = matrix (0.0, rows = ncol(X), cols = 1); |
| p_LS = - r_LS; |
| norm_r2_LS = sum (r_LS ^ 2); |
| i_LS = 0; |
| while (i_LS < max_iter_CG & i_LS < ncol(X) & norm_r2_LS >= eps_LS) |
| { |
| q_LS = t(X) %*% X %*% p_LS; |
| q_LS = q_LS + lambda_LS * p_LS; |
| alpha_LS = norm_r2_LS / sum (p_LS * q_LS); |
| z_LS = z_LS + alpha_LS * p_LS; |
| old_norm_r2_LS = norm_r2_LS; |
| r_LS = r_LS + alpha_LS * q_LS; |
| norm_r2_LS = sum (r_LS ^ 2); |
| p_LS = -r_LS + (norm_r2_LS / old_norm_r2_LS) * p_LS; |
| i_LS = i_LS + 1; |
| } |
| |
| # END LEAST SQUARES |
| |
| w = (nrow(X) / sum (w_X * z_LS)) * z_LS; |
| } |
| |
| |