src/test/scripts/functions/builtin/glmTest.R - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------
 #
 # Intended to solve GLM Regression using R, in order to compare against the DML implementation
 # INPUT 1: Matrix X [rows, columns]
 # and Matrix y [rows, 1]
 # INPUT 2-5: Distribution family and link, see below:
 # ---------------------------------------------
 #   Dst Var Lnk Lnk   Distribution       Cano-
 #   typ pow typ pow   Family.link        nical?
 # ---------------------------------------------
 #    1  0.0  1 -1.0   Gaussian.inverse
 #    1  0.0  1  0.0   Gaussian.log
 #    1  0.0  1  1.0   Gaussian.id         Yes
 #    1  1.0  1  0.0   Poisson.log         Yes
 #    1  1.0  1  0.5   Poisson.sqrt
 #    1  1.0  1  1.0   Poisson.id
 #    1  2.0  1 -1.0   Gamma.inverse       Yes
 #    1  2.0  1  0.0   Gamma.log
 #    1  2.0  1  1.0   Gamma.id
 #    1  3.0  1 -2.0   InvGaussian.1/mu^2  Yes
 #    1  3.0  1 -1.0   InvGaussian.inverse
 #    1  3.0  1  0.0   InvGaussian.log
 #    1  3.0  1  1.0   InvGaussian.id
 #    1   *   1   *    AnyVariance.AnyLink
 # ---------------------------------------------
 #    2 -1.0  *   *    Binomial {-1, 1}
 #    2  0.0  *   *    Binomial { 0, 1}
 #    2  1.0  *   *    Binomial two-column
 #    2   *   1  0.0   Binomial.log
 #    2   *   2   *    Binomial.logit      Yes
 #    2   *   3   *    Binomial.probit
 #    2   *   4   *    Binomial.cloglog
 #    2   *   5   *    Binomial.cauchit
 # ---------------------------------------------
 # INPUT 2: (int) Distribution type
 # INPUT 3: (double) For Power families: Variance power of the mean
 # INPUT 4: (int) Link function type
 # INPUT 5: (double) Link as power of the mean
 # INPUT 6: (int) Intercept: 0 = no, 1 = yes
 # INPUT 7: (double) tolerance (epsilon)
 # INPUT 8: the regression coefficients output file
 # OUTPUT : Matrix beta [columns, 1]
 #
 # Assume that $GLMR_HOME is set to the home of the R script
 # Assume input and output directories are $GLMR_HOME/in/ and $GLMR_HOME/expected/
 # Rscript $GLMR_HOME/GLM.R $GLMR_HOME/in/X.mtx $GLMR_HOME/in/y.mtx 2 0.0 2 0.0 1 0.00000001 $GLMR_HOME/expected/w.mtx

 args <- commandArgs (TRUE);

 library ("Matrix");
 # library ("batch");

 options (warn = -1);

 X_here <- as.matrix(readMM(args[1]))
 y_here <- as.matrix(readMM(args[2]))

 num_records  <- nrow (X_here);
 num_features <- ncol (X_here);
 dist_type  <- as.integer (args[3]);
 dist_param <- as.numeric (args[4]);
 link_type  <- as.integer (args[5]);
 link_power <- as.numeric (args[6]);
 icept <- as.integer (args[7]);
 eps_n <- as.numeric (args[8]);

 f_ly <- gaussian ();
 var_power <- dist_param;

 if (dist_type == 1 & var_power == 0.0 & link_type == 1 & link_power ==  1.0) { f_ly <- gaussian (link = "identity");         } else
 if (dist_type == 1 & var_power == 0.0 & link_type == 1 & link_power == -1.0) { f_ly <- gaussian (link = "inverse");          } else
 if (dist_type == 1 & var_power == 0.0 & link_type == 1 & link_power ==  0.0) { f_ly <- gaussian (link = "log");              } else
 if (dist_type == 1 & var_power == 1.0 & link_type == 1 & link_power ==  1.0) { f_ly <-  poisson (link = "identity");         } else
 if (dist_type == 1 & var_power == 1.0 & link_type == 1 & link_power ==  0.0) { f_ly <-  poisson (link = "log");              } else
 if (dist_type == 1 & var_power == 1.0 & link_type == 1 & link_power ==  0.5) { f_ly <-  poisson (link = "sqrt");             } else
 if (dist_type == 1 & var_power == 2.0 & link_type == 1 & link_power ==  1.0) { f_ly <-    Gamma (link = "identity");         } else
 if (dist_type == 1 & var_power == 2.0 & link_type == 1 & link_power == -1.0) { f_ly <-    Gamma (link = "inverse");          } else
 if (dist_type == 1 & var_power == 2.0 & link_type == 1 & link_power ==  0.0) { f_ly <-    Gamma (link = "log");              } else
 if (dist_type == 1 & var_power == 3.0 & link_type == 1 & link_power ==  1.0) { f_ly <- inverse.gaussian (link = "identity"); } else
 if (dist_type == 1 & var_power == 3.0 & link_type == 1 & link_power == -1.0) { f_ly <- inverse.gaussian (link = "inverse");  } else
 if (dist_type == 1 & var_power == 3.0 & link_type == 1 & link_power ==  0.0) { f_ly <- inverse.gaussian (link = "log");      } else
 if (dist_type == 1 & var_power == 3.0 & link_type == 1 & link_power == -2.0) { f_ly <- inverse.gaussian (link = "1/mu^2");   } else
 if (dist_type == 2                    & link_type == 1 & link_power ==  0.0) { f_ly <- binomial (link = "log");              } else
 if (dist_type == 2                    & link_type == 1 & link_power ==  1.0) { f_ly <- binomial (link = "identity");         } else
 if (dist_type == 2                    & link_type == 1 & link_power ==  0.5) { f_ly <- binomial (link = "sqrt");             } else
 if (dist_type == 2                    & link_type == 2                     ) { f_ly <- binomial (link = "logit");            } else
 if (dist_type == 2                    & link_type == 3                     ) { f_ly <- binomial (link = "probit");           } else
 if (dist_type == 2                    & link_type == 4                     ) { f_ly <- binomial (link = "cloglog");          } else
 if (dist_type == 2                    & link_type == 5                     ) { f_ly <- binomial (link = "cauchit");          }

 # quasi(link = "identity", variance = "constant")
 # quasibinomial(link = "logit")
 # quasipoisson(link = "log")

 if (dist_type == 2 & dist_param != 1.0) {
     y_here <- (y_here - dist_param) / (1.0 - dist_param);
 }

 # epsilon 	tolerance: the iterations converge when |dev - devold|/(|dev| + 0.1) < epsilon.
 # maxit 	integer giving the maximal number of IWLS iterations.
 # trace 	logical indicating if output should be produced for each iteration.
 #
 c_rol <- glm.control (epsilon = eps_n, maxit = 100, trace = FALSE);

 X_matrix = as.matrix (X_here);
 y_matrix = as.matrix (y_here);

 if (icept == 0) {
     glmOut <- glm (y_matrix ~ X_matrix - 1, family = f_ly, control = c_rol); # zero intercept
     betas <- coef (glmOut);
 } else {
     glmOut <- glm (y_matrix ~ X_matrix    , family = f_ly, control = c_rol);
     betas <- coef (glmOut);
     beta_intercept = betas [1];
     betas [1 : num_features] = betas [2 : (num_features + 1)];
     betas [num_features + 1] = beta_intercept;
 }

 print (c("Deviance", glmOut$deviance));
 writeMM (as (betas, "CsparseMatrix"), args[9], format = "text");
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	#-------------------------------------------------------------
	#
	# Intended to solve GLM Regression using R, in order to compare against the DML implementation
	# INPUT 1: Matrix X [rows, columns]
	# and Matrix y [rows, 1]
	# INPUT 2-5: Distribution family and link, see below:
	# ---------------------------------------------
	# Dst Var Lnk Lnk Distribution Cano-
	# typ pow typ pow Family.link nical?
	# ---------------------------------------------
	# 1 0.0 1 -1.0 Gaussian.inverse
	# 1 0.0 1 0.0 Gaussian.log
	# 1 0.0 1 1.0 Gaussian.id Yes
	# 1 1.0 1 0.0 Poisson.log Yes
	# 1 1.0 1 0.5 Poisson.sqrt
	# 1 1.0 1 1.0 Poisson.id
	# 1 2.0 1 -1.0 Gamma.inverse Yes
	# 1 2.0 1 0.0 Gamma.log
	# 1 2.0 1 1.0 Gamma.id
	# 1 3.0 1 -2.0 InvGaussian.1/mu^2 Yes
	# 1 3.0 1 -1.0 InvGaussian.inverse
	# 1 3.0 1 0.0 InvGaussian.log
	# 1 3.0 1 1.0 InvGaussian.id
	# 1 * 1 * AnyVariance.AnyLink
	# ---------------------------------------------
	# 2 -1.0 * * Binomial {-1, 1}
	# 2 0.0 * * Binomial { 0, 1}
	# 2 1.0 * * Binomial two-column
	# 2 * 1 0.0 Binomial.log
	# 2 * 2 * Binomial.logit Yes
	# 2 * 3 * Binomial.probit
	# 2 * 4 * Binomial.cloglog
	# 2 * 5 * Binomial.cauchit
	# ---------------------------------------------
	# INPUT 2: (int) Distribution type
	# INPUT 3: (double) For Power families: Variance power of the mean
	# INPUT 4: (int) Link function type
	# INPUT 5: (double) Link as power of the mean
	# INPUT 6: (int) Intercept: 0 = no, 1 = yes
	# INPUT 7: (double) tolerance (epsilon)
	# INPUT 8: the regression coefficients output file
	# OUTPUT : Matrix beta [columns, 1]
	#
	# Assume that $GLMR_HOME is set to the home of the R script
	# Assume input and output directories are $GLMR_HOME/in/ and $GLMR_HOME/expected/
	# Rscript $GLMR_HOME/GLM.R $GLMR_HOME/in/X.mtx $GLMR_HOME/in/y.mtx 2 0.0 2 0.0 1 0.00000001 $GLMR_HOME/expected/w.mtx

	args <- commandArgs (TRUE);

	library ("Matrix");
	# library ("batch");

	options (warn = -1);

	X_here <- as.matrix(readMM(args[1]))
	y_here <- as.matrix(readMM(args[2]))

	num_records <- nrow (X_here);
	num_features <- ncol (X_here);
	dist_type <- as.integer (args[3]);
	dist_param <- as.numeric (args[4]);
	link_type <- as.integer (args[5]);
	link_power <- as.numeric (args[6]);
	icept <- as.integer (args[7]);
	eps_n <- as.numeric (args[8]);

	f_ly <- gaussian ();
	var_power <- dist_param;

	if (dist_type == 1 & var_power == 0.0 & link_type == 1 & link_power == 1.0) { f_ly <- gaussian (link = "identity"); } else
	if (dist_type == 1 & var_power == 0.0 & link_type == 1 & link_power == -1.0) { f_ly <- gaussian (link = "inverse"); } else
	if (dist_type == 1 & var_power == 0.0 & link_type == 1 & link_power == 0.0) { f_ly <- gaussian (link = "log"); } else
	if (dist_type == 1 & var_power == 1.0 & link_type == 1 & link_power == 1.0) { f_ly <- poisson (link = "identity"); } else
	if (dist_type == 1 & var_power == 1.0 & link_type == 1 & link_power == 0.0) { f_ly <- poisson (link = "log"); } else
	if (dist_type == 1 & var_power == 1.0 & link_type == 1 & link_power == 0.5) { f_ly <- poisson (link = "sqrt"); } else
	if (dist_type == 1 & var_power == 2.0 & link_type == 1 & link_power == 1.0) { f_ly <- Gamma (link = "identity"); } else
	if (dist_type == 1 & var_power == 2.0 & link_type == 1 & link_power == -1.0) { f_ly <- Gamma (link = "inverse"); } else
	if (dist_type == 1 & var_power == 2.0 & link_type == 1 & link_power == 0.0) { f_ly <- Gamma (link = "log"); } else
	if (dist_type == 1 & var_power == 3.0 & link_type == 1 & link_power == 1.0) { f_ly <- inverse.gaussian (link = "identity"); } else
	if (dist_type == 1 & var_power == 3.0 & link_type == 1 & link_power == -1.0) { f_ly <- inverse.gaussian (link = "inverse"); } else
	if (dist_type == 1 & var_power == 3.0 & link_type == 1 & link_power == 0.0) { f_ly <- inverse.gaussian (link = "log"); } else
	if (dist_type == 1 & var_power == 3.0 & link_type == 1 & link_power == -2.0) { f_ly <- inverse.gaussian (link = "1/mu^2"); } else
	if (dist_type == 2 & link_type == 1 & link_power == 0.0) { f_ly <- binomial (link = "log"); } else
	if (dist_type == 2 & link_type == 1 & link_power == 1.0) { f_ly <- binomial (link = "identity"); } else
	if (dist_type == 2 & link_type == 1 & link_power == 0.5) { f_ly <- binomial (link = "sqrt"); } else
	if (dist_type == 2 & link_type == 2 ) { f_ly <- binomial (link = "logit"); } else
	if (dist_type == 2 & link_type == 3 ) { f_ly <- binomial (link = "probit"); } else
	if (dist_type == 2 & link_type == 4 ) { f_ly <- binomial (link = "cloglog"); } else
	if (dist_type == 2 & link_type == 5 ) { f_ly <- binomial (link = "cauchit"); }

	# quasi(link = "identity", variance = "constant")
	# quasibinomial(link = "logit")
	# quasipoisson(link = "log")

	if (dist_type == 2 & dist_param != 1.0) {
	y_here <- (y_here - dist_param) / (1.0 - dist_param);
	}

	# epsilon tolerance: the iterations converge when \|dev - devold\|/(\|dev\| + 0.1) < epsilon.
	# maxit integer giving the maximal number of IWLS iterations.
	# trace logical indicating if output should be produced for each iteration.
	#
	c_rol <- glm.control (epsilon = eps_n, maxit = 100, trace = FALSE);

	X_matrix = as.matrix (X_here);
	y_matrix = as.matrix (y_here);

	if (icept == 0) {
	glmOut <- glm (y_matrix ~ X_matrix - 1, family = f_ly, control = c_rol); # zero intercept
	betas <- coef (glmOut);
	} else {
	glmOut <- glm (y_matrix ~ X_matrix , family = f_ly, control = c_rol);
	betas <- coef (glmOut);
	beta_intercept = betas [1];
	betas [1 : num_features] = betas [2 : (num_features + 1)];
	betas [num_features + 1] = beta_intercept;
	}

	print (c("Deviance", glmOut$deviance));
	writeMM (as (betas, "CsparseMatrix"), args[9], format = "text");