| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| # |
| # Intended to solve GLM Regression using R, in order to compare against the DML implementation |
| # INPUT 1: Matrix X [rows, columns] |
| # and Matrix y [rows, 1] |
| # INPUT 2-5: Distribution family and link, see below: |
| # --------------------------------------------- |
| # Dst Var Lnk Lnk Distribution Cano- |
| # typ pow typ pow Family.link nical? |
| # --------------------------------------------- |
| # 1 0.0 1 -1.0 Gaussian.inverse |
| # 1 0.0 1 0.0 Gaussian.log |
| # 1 0.0 1 1.0 Gaussian.id Yes |
| # 1 1.0 1 0.0 Poisson.log Yes |
| # 1 1.0 1 0.5 Poisson.sqrt |
| # 1 1.0 1 1.0 Poisson.id |
| # 1 2.0 1 -1.0 Gamma.inverse Yes |
| # 1 2.0 1 0.0 Gamma.log |
| # 1 2.0 1 1.0 Gamma.id |
| # 1 3.0 1 -2.0 InvGaussian.1/mu^2 Yes |
| # 1 3.0 1 -1.0 InvGaussian.inverse |
| # 1 3.0 1 0.0 InvGaussian.log |
| # 1 3.0 1 1.0 InvGaussian.id |
| # 1 * 1 * AnyVariance.AnyLink |
| # --------------------------------------------- |
| # 2 -1.0 * * Binomial {-1, 1} |
| # 2 0.0 * * Binomial { 0, 1} |
| # 2 1.0 * * Binomial two-column |
| # 2 * 1 0.0 Binomial.log |
| # 2 * 2 * Binomial.logit Yes |
| # 2 * 3 * Binomial.probit |
| # 2 * 4 * Binomial.cloglog |
| # 2 * 5 * Binomial.cauchit |
| # --------------------------------------------- |
| # INPUT 2: (int) Distribution type |
| # INPUT 3: (double) For Power families: Variance power of the mean |
| # INPUT 4: (int) Link function type |
| # INPUT 5: (double) Link as power of the mean |
| # INPUT 6: (int) Intercept: 0 = no, 1 = yes |
| # INPUT 7: (double) tolerance (epsilon) |
| # INPUT 8: the regression coefficients output file |
| # OUTPUT : Matrix beta [columns, 1] |
| # |
| # Assume that $GLMR_HOME is set to the home of the R script |
| # Assume input and output directories are $GLMR_HOME/in/ and $GLMR_HOME/expected/ |
| # Rscript $GLMR_HOME/GLM.R $GLMR_HOME/in/X.mtx $GLMR_HOME/in/y.mtx 2 0.0 2 0.0 1 0.00000001 $GLMR_HOME/expected/w.mtx |
| |
| args <- commandArgs (TRUE); |
| |
| library ("Matrix"); |
| # library ("batch"); |
| |
| options (warn = -1); |
| |
| X_here <- as.matrix(readMM(args[1])) |
| y_here <- as.matrix(readMM(args[2])) |
| |
| num_records <- nrow (X_here); |
| num_features <- ncol (X_here); |
| dist_type <- as.integer (args[3]); |
| dist_param <- as.numeric (args[4]); |
| link_type <- as.integer (args[5]); |
| link_power <- as.numeric (args[6]); |
| icept <- as.integer (args[7]); |
| eps_n <- as.numeric (args[8]); |
| |
| f_ly <- gaussian (); |
| var_power <- dist_param; |
| |
| if (dist_type == 1 & var_power == 0.0 & link_type == 1 & link_power == 1.0) { f_ly <- gaussian (link = "identity"); } else |
| if (dist_type == 1 & var_power == 0.0 & link_type == 1 & link_power == -1.0) { f_ly <- gaussian (link = "inverse"); } else |
| if (dist_type == 1 & var_power == 0.0 & link_type == 1 & link_power == 0.0) { f_ly <- gaussian (link = "log"); } else |
| if (dist_type == 1 & var_power == 1.0 & link_type == 1 & link_power == 1.0) { f_ly <- poisson (link = "identity"); } else |
| if (dist_type == 1 & var_power == 1.0 & link_type == 1 & link_power == 0.0) { f_ly <- poisson (link = "log"); } else |
| if (dist_type == 1 & var_power == 1.0 & link_type == 1 & link_power == 0.5) { f_ly <- poisson (link = "sqrt"); } else |
| if (dist_type == 1 & var_power == 2.0 & link_type == 1 & link_power == 1.0) { f_ly <- Gamma (link = "identity"); } else |
| if (dist_type == 1 & var_power == 2.0 & link_type == 1 & link_power == -1.0) { f_ly <- Gamma (link = "inverse"); } else |
| if (dist_type == 1 & var_power == 2.0 & link_type == 1 & link_power == 0.0) { f_ly <- Gamma (link = "log"); } else |
| if (dist_type == 1 & var_power == 3.0 & link_type == 1 & link_power == 1.0) { f_ly <- inverse.gaussian (link = "identity"); } else |
| if (dist_type == 1 & var_power == 3.0 & link_type == 1 & link_power == -1.0) { f_ly <- inverse.gaussian (link = "inverse"); } else |
| if (dist_type == 1 & var_power == 3.0 & link_type == 1 & link_power == 0.0) { f_ly <- inverse.gaussian (link = "log"); } else |
| if (dist_type == 1 & var_power == 3.0 & link_type == 1 & link_power == -2.0) { f_ly <- inverse.gaussian (link = "1/mu^2"); } else |
| if (dist_type == 2 & link_type == 1 & link_power == 0.0) { f_ly <- binomial (link = "log"); } else |
| if (dist_type == 2 & link_type == 1 & link_power == 1.0) { f_ly <- binomial (link = "identity"); } else |
| if (dist_type == 2 & link_type == 1 & link_power == 0.5) { f_ly <- binomial (link = "sqrt"); } else |
| if (dist_type == 2 & link_type == 2 ) { f_ly <- binomial (link = "logit"); } else |
| if (dist_type == 2 & link_type == 3 ) { f_ly <- binomial (link = "probit"); } else |
| if (dist_type == 2 & link_type == 4 ) { f_ly <- binomial (link = "cloglog"); } else |
| if (dist_type == 2 & link_type == 5 ) { f_ly <- binomial (link = "cauchit"); } |
| |
| # quasi(link = "identity", variance = "constant") |
| # quasibinomial(link = "logit") |
| # quasipoisson(link = "log") |
| |
| if (dist_type == 2 & dist_param != 1.0) { |
| y_here <- (y_here - dist_param) / (1.0 - dist_param); |
| } |
| |
| # epsilon tolerance: the iterations converge when |dev - devold|/(|dev| + 0.1) < epsilon. |
| # maxit integer giving the maximal number of IWLS iterations. |
| # trace logical indicating if output should be produced for each iteration. |
| # |
| c_rol <- glm.control (epsilon = eps_n, maxit = 100, trace = FALSE); |
| |
| X_matrix = as.matrix (X_here); |
| y_matrix = as.matrix (y_here); |
| |
| if (icept == 0) { |
| glmOut <- glm (y_matrix ~ X_matrix - 1, family = f_ly, control = c_rol); # zero intercept |
| betas <- coef (glmOut); |
| } else { |
| glmOut <- glm (y_matrix ~ X_matrix , family = f_ly, control = c_rol); |
| betas <- coef (glmOut); |
| beta_intercept = betas [1]; |
| betas [1 : num_features] = betas [2 : (num_features + 1)]; |
| betas [num_features + 1] = beta_intercept; |
| } |
| |
| print (c("Deviance", glmOut$deviance)); |
| writeMM (as (betas, "CsparseMatrix"), args[9], format = "text"); |