scripts/algorithms/MultiLogReg.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------

 # Solves Multinomial Logistic Regression using Trust Region methods.
 # (See: Trust Region Newton Method for Logistic Regression, Lin, Weng and Keerthi, JMLR 9 (2008) 627-650)

 # INPUT PARAMETERS:
 # --------------------------------------------------------------------------------------------
 # NAME  TYPE   DEFAULT  MEANING
 # --------------------------------------------------------------------------------------------
 # X     String  ---     Location to read the matrix of feature vectors
 # Y     String  ---     Location to read the matrix with category labels
 # B     String  ---     Location to store estimated regression parameters (the betas)
 # Log   String  " "     Location to write per-iteration variables for log/debugging purposes
 # icpt  Int      0      Intercept presence, shifting and rescaling X columns:
 #                       0 = no intercept, no shifting, no rescaling;
 #                       1 = add intercept, but neither shift nor rescale X;
 #                       2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
 # reg   Double  0.0     regularization parameter (lambda = 1/C); intercept is not regularized
 # tol   Double 0.000001 tolerance ("epsilon")
 # moi   Int     100     max. number of outer (Newton) iterations
 # mii   Int      0      max. number of inner (conjugate gradient) iterations, 0 = no max
 # fmt   String "text"   Matrix output format, usually "text" or "csv" (for matrices only)
 # --------------------------------------------------------------------------------------------
 # The largest label represents the baseline category; if label -1 or 0 is present, then it is
 # the baseline label (and it is converted to the largest label).
 #
 # The Log file, when requested, contains the following per-iteration variables in CSV format,
 # each line containing triple (NAME, ITERATION, VALUE) with ITERATION = 0 for initial values:
 #
 # NAME                  MEANING
 # -------------------------------------------------------------------------------------------
 # LINEAR_TERM_MIN       The minimum value of X %*% B, used to check for overflows
 # LINEAR_TERM_MAX       The maximum value of X %*% B, used to check for overflows
 # NUM_CG_ITERS          Number of inner (Conj.Gradient) iterations in this outer iteration
 # IS_TRUST_REACHED      1 = trust region boundary was reached, 0 = otherwise
 # POINT_STEP_NORM       L2-norm of iteration step from old point (i.e. matrix B) to new point
 # OBJECTIVE             The loss function we minimize (negative regularized log-likelihood)
 # OBJ_DROP_REAL         Reduction in the objective during this iteration, actual value
 # OBJ_DROP_PRED         Reduction in the objective predicted by a quadratic approximation
 # OBJ_DROP_RATIO        Actual-to-predicted reduction ratio, used to update the trust region
 # IS_POINT_UPDATED      1 = new point accepted; 0 = new point rejected, old point restored
 # GRADIENT_NORM         L2-norm of the loss function gradient (omitted if point is rejected)
 # TRUST_DELTA           Updated trust region size, the "delta"
 # -------------------------------------------------------------------------------------------
 #
 # Script invocation example:
 # hadoop jar SystemDS.jar -f MultiLogReg.dml -nvargs icpt=2 reg=1.0 tol=0.000001 moi=100 mii=20
 #     X=INPUT_DIR/X123 Y=INPUT_DIR/Y123 B=OUTPUT_DIR/B123 fmt=csv Log=OUTPUT_DIR/log

 fileX = $X;
 fileY = $Y;
 fileB = $B;
 fileLog = ifdef ($Log, " ");
 fmtB = ifdef ($fmt, "text");

 intercept_status = ifdef ($icpt, 0); # $icpt = 0;
 regularization = ifdef ($reg, 0.0);  # $reg  = 0.0;
 tol = ifdef ($tol, 0.000001);        # $tol  = 0.000001;
 maxiter = ifdef ($moi, 100);         # $moi  = 100;
 maxinneriter = ifdef ($mii, 0);      # $mii  = 0;

 tol = as.double (tol);

 print ("BEGIN MULTINOMIAL LOGISTIC REGRESSION SCRIPT");
 print ("Reading X...");
 X = read (fileX);
 print ("Reading Y...");
 Y_vec = read (fileY);

 eta0 = 0.0001;
 eta1 = 0.25;
 eta2 = 0.75;
 sigma1 = 0.25;
 sigma2 = 0.5;
 sigma3 = 4.0;
 psi = 0.1;

 N = nrow (X);
 D = ncol (X);

 # Introduce the intercept, shift and rescale the columns of X if needed
 if (intercept_status == 1 | intercept_status == 2)  # add the intercept column
 {
     X = cbind (X, matrix (1, rows = N, cols = 1));
     D = ncol (X);
 }

 scale_lambda = matrix (1, rows = D, cols = 1);
 if (intercept_status == 1 | intercept_status == 2)
 {
     scale_lambda [D, 1] = 0;
 }

 if (intercept_status == 2)  # scale-&-shift X columns to mean 0, variance 1
 {                           # Important assumption: X [, D] = matrix (1, rows = N, cols = 1)
     avg_X_cols = t(colSums(X)) / N;
     var_X_cols = (t(colSums (X ^ 2)) - N * (avg_X_cols ^ 2)) / (N - 1);
     is_unsafe = var_X_cols <= 0;
     scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe);
     scale_X [D, 1] = 1;
     shift_X = - avg_X_cols * scale_X;
     shift_X [D, 1] = 0;
     rowSums_X_sq = (X ^ 2) %*% (scale_X ^ 2) + X %*% (2 * scale_X * shift_X) + sum (shift_X ^ 2);
 } else {
     scale_X = matrix (1, rows = D, cols = 1);
     shift_X = matrix (0, rows = D, cols = 1);
     rowSums_X_sq = rowSums (X ^ 2);
 }

 # Henceforth we replace "X" with "X %*% (SHIFT/SCALE TRANSFORM)" and rowSums(X ^ 2)
 # with "rowSums_X_sq" in order to preserve the sparsity of X under shift and scale.
 # The transform is then associatively applied to the other side of the expression,
 # and is rewritten via "scale_X" and "shift_X" as follows:
 #
 # ssX_A  = (SHIFT/SCALE TRANSFORM) %*% A    --- is rewritten as:
 # ssX_A  = diag (scale_X) %*% A;
 # ssX_A [D, ] = ssX_A [D, ] + t(shift_X) %*% A;
 #
 # tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A   --- is rewritten as:
 # tssX_A = diag (scale_X) %*% A + shift_X %*% A [D, ];

 # Convert "Y_vec" into indicator matrix:
 max_y = max (Y_vec);
 if (min (Y_vec) <= 0) {
     # Category labels "0", "-1" etc. are converted into the largest label
     Y_vec  = Y_vec  + (- Y_vec  + max_y + 1) * (Y_vec <= 0);
     max_y = max_y + 1;
 }
 Y = table (seq (1, N, 1), Y_vec, N, max_y);
 K = ncol (Y) - 1;   # The number of  non-baseline categories

 lambda = (scale_lambda %*% matrix (1, rows = 1, cols = K)) * regularization;
 delta = 0.5 * sqrt (D) / max (sqrt (rowSums_X_sq));

 B = matrix (0, rows = D, cols = K);     ### LT = X %*% (SHIFT/SCALE TRANSFORM) %*% B;
                                         ### LT = cbind (LT, matrix (0, rows = N, cols = 1));
                                         ### LT = LT - rowMaxs (LT) %*% matrix (1, rows = 1, cols = K+1);
 P = matrix (1, rows = N, cols = K+1);   ### exp_LT = exp (LT);
 P = P / (K + 1);                        ### P =  exp_LT / (rowSums (exp_LT) %*% matrix (1, rows = 1, cols = K+1));
 obj = N * log (K + 1);                  ### obj = - sum (Y * LT) + sum (log (rowSums (exp_LT))) + 0.5 * sum (lambda * (B_new ^ 2));

 Grad = t(X) %*% (P [, 1:K] - Y [, 1:K]);
 if (intercept_status == 2) {
     Grad = diag (scale_X) %*% Grad + shift_X %*% Grad [D, ];
 }
 Grad = Grad + lambda * B;
 norm_Grad = sqrt (sum (Grad ^ 2));
 norm_Grad_initial = norm_Grad;

 if (maxinneriter == 0) {
     maxinneriter = D * K;
 }
 iter = 1;

 # boolean for convergence check
 converge = (norm_Grad < tol) | (iter > maxiter);

 print ("-- Initially:  Objective = " + obj + ",  Gradient Norm = " + norm_Grad + ",  Trust Delta = " + delta);

 if (fileLog != " ") {
     log_str = "OBJECTIVE,0," + obj;
     log_str = append (log_str, "GRADIENT_NORM,0," + norm_Grad);
     log_str = append (log_str, "TRUST_DELTA,0," + delta);
 } else {
     log_str = " ";
 }

 while (! converge)
 {
 	# SOLVE TRUST REGION SUB-PROBLEM
 	S = matrix (0, rows = D, cols = K);
 	R = - Grad;
 	V = R;
 	delta2 = delta ^ 2;
 	inneriter = 1;
 	norm_R2 = sum (R ^ 2);
 	innerconverge = (sqrt (norm_R2) <= psi * norm_Grad);
 	is_trust_boundary_reached = 0;

 	while (! innerconverge)
 	{
 	    if (intercept_status == 2) {
 	        ssX_V = diag (scale_X) %*% V;
 	        ssX_V [D, ] = ssX_V [D, ] + t(shift_X) %*% V;
 	    } else {
 	        ssX_V = V;
 	    }
         Q = P [, 1:K] * (X %*% ssX_V);
         HV = t(X) %*% (Q - P [, 1:K] * (rowSums (Q) %*% matrix (1, rows = 1, cols = K)));
         if (intercept_status == 2) {
             HV = diag (scale_X) %*% HV + shift_X %*% HV [D, ];
         }
         HV = HV + lambda * V;
 		alpha = norm_R2 / sum (V * HV);
 		Snew = S + alpha * V;
 		norm_Snew2 = sum (Snew ^ 2);
 		if (norm_Snew2 <= delta2)
 		{
 			S = Snew;
 			R = R - alpha * HV;
 			old_norm_R2 = norm_R2
 			norm_R2 = sum (R ^ 2);
 			V = R + (norm_R2 / old_norm_R2) * V;
 			innerconverge = (sqrt (norm_R2) <= psi * norm_Grad);
 		} else {
 	        is_trust_boundary_reached = 1;
 			sv = sum (S * V);
 			v2 = sum (V ^ 2);
 			s2 = sum (S ^ 2);
 			rad = sqrt (sv ^ 2 + v2 * (delta2 - s2));
 			if (sv >= 0) {
 				alpha = (delta2 - s2) / (sv + rad);
 			} else {
 				alpha = (rad - sv) / v2;
 			}
 			S = S + alpha * V;
 			R = R - alpha * HV;
 			innerconverge = TRUE;
 		}
 	    inneriter = inneriter + 1;
 	    innerconverge = innerconverge | (inneriter > maxinneriter);
 	}

 	# END TRUST REGION SUB-PROBLEM

 	# compute rho, update B, obtain delta
 	gs = sum (S * Grad);
 	qk = - 0.5 * (gs - sum (S * R));
 	B_new = B + S;
 	if (intercept_status == 2) {
 	    ssX_B_new = diag (scale_X) %*% B_new;
 	    ssX_B_new [D, ] = ssX_B_new [D, ] + t(shift_X) %*% B_new;
     } else {
         ssX_B_new = B_new;
     }

     LT = cbind ((X %*% ssX_B_new), matrix (0, rows = N, cols = 1));
     if (fileLog != " ") {
         log_str = append (log_str, "LINEAR_TERM_MIN,"  + iter + "," + min (LT));
         log_str = append (log_str, "LINEAR_TERM_MAX,"  + iter + "," + max (LT));
     }
     LT = LT - rowMaxs (LT) %*% matrix (1, rows = 1, cols = K+1);
     exp_LT = exp (LT);
     P_new  = exp_LT / (rowSums (exp_LT) %*% matrix (1, rows = 1, cols = K+1));
     obj_new = - sum (Y * LT) + sum (log (rowSums (exp_LT))) + 0.5 * sum (lambda * (B_new ^ 2));

 	# Consider updating LT in the inner loop
 	# Consider the big "obj" and "obj_new" rounding-off their small difference below:

 	actred = (obj - obj_new);

 	rho = actred / qk;
 	is_rho_accepted = (rho > eta0);
 	snorm = sqrt (sum (S ^ 2));

     if (fileLog != " ") {
         log_str = append (log_str, "NUM_CG_ITERS,"     + iter + "," + (inneriter - 1));
         log_str = append (log_str, "IS_TRUST_REACHED," + iter + "," + is_trust_boundary_reached);
         log_str = append (log_str, "POINT_STEP_NORM,"  + iter + "," + snorm);
         log_str = append (log_str, "OBJECTIVE,"        + iter + "," + obj_new);
         log_str = append (log_str, "OBJ_DROP_REAL,"    + iter + "," + actred);
         log_str = append (log_str, "OBJ_DROP_PRED,"    + iter + "," + qk);
         log_str = append (log_str, "OBJ_DROP_RATIO,"   + iter + "," + rho);
     }

 	if (iter == 1) {
 	   delta = min (delta, snorm);
 	}

 	alpha2 = obj_new - obj - gs;
     alpha = ifelse(alpha2 <= 0, sigma3, max(sigma1, -0.5 * gs / alpha2));

     if (rho < eta0)
         delta = min (max (alpha, sigma1) * snorm, sigma2 * delta);
     else if (rho < eta1)
         delta = max (sigma1 * delta, min (alpha * snorm, sigma2 * delta));
     else if (rho < eta2)
         delta = max (sigma1 * delta, min (alpha * snorm, sigma3 * delta));
     else
         delta = max (delta, min (alpha * snorm, sigma3 * delta));

 	if (is_trust_boundary_reached == 1)
 	{
 	    print ("-- Outer Iteration " + iter + ": Had " + (inneriter - 1) + " CG iterations, trust bound REACHED");
 	} else {
 	    print ("-- Outer Iteration " + iter + ": Had " + (inneriter - 1) + " CG iterations");
 	}
 	print ("   -- Obj.Reduction:  Actual = " + actred + ",  Predicted = " + qk +
 	       "  (A/P: " + (round (10000.0 * rho) / 10000.0) + "),  Trust Delta = " + delta);

 	if (is_rho_accepted)
 	{
 		B = B_new;
 		P = P_new;
 		Grad = t(X) %*% (P [, 1:K] - Y [, 1:K]);
 		if (intercept_status == 2) {
 		    Grad = diag (scale_X) %*% Grad + shift_X %*% Grad [D, ];
 		}
 		Grad = Grad + lambda * B;
 		norm_Grad = sqrt (sum (Grad ^ 2));
 		obj = obj_new;
 	    print ("   -- New Objective = " + obj + ",  Beta Change Norm = " + snorm + ",  Gradient Norm = " + norm_Grad);
         if (fileLog != " ") {
             log_str = append (log_str, "IS_POINT_UPDATED," + iter + ",1");
             log_str = append (log_str, "GRADIENT_NORM,"    + iter + "," + norm_Grad);
         }
 	} else {
         if (fileLog != " ") {
             log_str = append (log_str, "IS_POINT_UPDATED," + iter + ",0");
         }
     }

     if (fileLog != " ") {
         log_str = append (log_str, "TRUST_DELTA," + iter + "," + delta);
     }

 	iter = iter + 1;
 	converge = ((norm_Grad < (tol * norm_Grad_initial)) | (iter > maxiter) |
 	    ((is_trust_boundary_reached == 0) & (abs (actred) < (abs (obj) + abs (obj_new)) * 0.00000000000001)));
     if (converge) { print ("Termination / Convergence condition satisfied."); } else { print (" "); }
 }

 if (intercept_status == 2) {
     B_out = diag (scale_X) %*% B;
     B_out [D, ] = B_out [D, ] + t(shift_X) %*% B;
 } else {
     B_out = B;
 }
 write (B_out, fileB, format=fmtB);

 if (fileLog != " ") {
     write (log_str, fileLog);
 }
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	#-------------------------------------------------------------

	# Solves Multinomial Logistic Regression using Trust Region methods.
	# (See: Trust Region Newton Method for Logistic Regression, Lin, Weng and Keerthi, JMLR 9 (2008) 627-650)

	# INPUT PARAMETERS:
	# --------------------------------------------------------------------------------------------
	# NAME TYPE DEFAULT MEANING
	# --------------------------------------------------------------------------------------------
	# X String --- Location to read the matrix of feature vectors
	# Y String --- Location to read the matrix with category labels
	# B String --- Location to store estimated regression parameters (the betas)
	# Log String " " Location to write per-iteration variables for log/debugging purposes
	# icpt Int 0 Intercept presence, shifting and rescaling X columns:
	# 0 = no intercept, no shifting, no rescaling;
	# 1 = add intercept, but neither shift nor rescale X;
	# 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
	# reg Double 0.0 regularization parameter (lambda = 1/C); intercept is not regularized
	# tol Double 0.000001 tolerance ("epsilon")
	# moi Int 100 max. number of outer (Newton) iterations
	# mii Int 0 max. number of inner (conjugate gradient) iterations, 0 = no max
	# fmt String "text" Matrix output format, usually "text" or "csv" (for matrices only)
	# --------------------------------------------------------------------------------------------
	# The largest label represents the baseline category; if label -1 or 0 is present, then it is
	# the baseline label (and it is converted to the largest label).
	#
	# The Log file, when requested, contains the following per-iteration variables in CSV format,
	# each line containing triple (NAME, ITERATION, VALUE) with ITERATION = 0 for initial values:
	#
	# NAME MEANING
	# -------------------------------------------------------------------------------------------
	# LINEAR_TERM_MIN The minimum value of X %*% B, used to check for overflows
	# LINEAR_TERM_MAX The maximum value of X %*% B, used to check for overflows
	# NUM_CG_ITERS Number of inner (Conj.Gradient) iterations in this outer iteration
	# IS_TRUST_REACHED 1 = trust region boundary was reached, 0 = otherwise
	# POINT_STEP_NORM L2-norm of iteration step from old point (i.e. matrix B) to new point
	# OBJECTIVE The loss function we minimize (negative regularized log-likelihood)
	# OBJ_DROP_REAL Reduction in the objective during this iteration, actual value
	# OBJ_DROP_PRED Reduction in the objective predicted by a quadratic approximation
	# OBJ_DROP_RATIO Actual-to-predicted reduction ratio, used to update the trust region
	# IS_POINT_UPDATED 1 = new point accepted; 0 = new point rejected, old point restored
	# GRADIENT_NORM L2-norm of the loss function gradient (omitted if point is rejected)
	# TRUST_DELTA Updated trust region size, the "delta"
	# -------------------------------------------------------------------------------------------
	#
	# Script invocation example:
	# hadoop jar SystemDS.jar -f MultiLogReg.dml -nvargs icpt=2 reg=1.0 tol=0.000001 moi=100 mii=20
	# X=INPUT_DIR/X123 Y=INPUT_DIR/Y123 B=OUTPUT_DIR/B123 fmt=csv Log=OUTPUT_DIR/log

	fileX = $X;
	fileY = $Y;
	fileB = $B;
	fileLog = ifdef ($Log, " ");
	fmtB = ifdef ($fmt, "text");

	intercept_status = ifdef ($icpt, 0); # $icpt = 0;
	regularization = ifdef ($reg, 0.0); # $reg = 0.0;
	tol = ifdef ($tol, 0.000001); # $tol = 0.000001;
	maxiter = ifdef ($moi, 100); # $moi = 100;
	maxinneriter = ifdef ($mii, 0); # $mii = 0;

	tol = as.double (tol);

	print ("BEGIN MULTINOMIAL LOGISTIC REGRESSION SCRIPT");
	print ("Reading X...");
	X = read (fileX);
	print ("Reading Y...");
	Y_vec = read (fileY);

	eta0 = 0.0001;
	eta1 = 0.25;
	eta2 = 0.75;
	sigma1 = 0.25;
	sigma2 = 0.5;
	sigma3 = 4.0;
	psi = 0.1;

	N = nrow (X);
	D = ncol (X);

	# Introduce the intercept, shift and rescale the columns of X if needed
	if (intercept_status == 1 \| intercept_status == 2) # add the intercept column
	{
	X = cbind (X, matrix (1, rows = N, cols = 1));
	D = ncol (X);
	}

	scale_lambda = matrix (1, rows = D, cols = 1);
	if (intercept_status == 1 \| intercept_status == 2)
	{
	scale_lambda [D, 1] = 0;
	}

	if (intercept_status == 2) # scale-&-shift X columns to mean 0, variance 1
	{ # Important assumption: X [, D] = matrix (1, rows = N, cols = 1)
	avg_X_cols = t(colSums(X)) / N;
	var_X_cols = (t(colSums (X ^ 2)) - N * (avg_X_cols ^ 2)) / (N - 1);
	is_unsafe = var_X_cols <= 0;
	scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe);
	scale_X [D, 1] = 1;
	shift_X = - avg_X_cols * scale_X;
	shift_X [D, 1] = 0;
	rowSums_X_sq = (X ^ 2) %% (scale_X ^ 2) + X %% (2 * scale_X * shift_X) + sum (shift_X ^ 2);
	} else {
	scale_X = matrix (1, rows = D, cols = 1);
	shift_X = matrix (0, rows = D, cols = 1);
	rowSums_X_sq = rowSums (X ^ 2);
	}

	# Henceforth we replace "X" with "X %*% (SHIFT/SCALE TRANSFORM)" and rowSums(X ^ 2)
	# with "rowSums_X_sq" in order to preserve the sparsity of X under shift and scale.
	# The transform is then associatively applied to the other side of the expression,
	# and is rewritten via "scale_X" and "shift_X" as follows:
	#
	# ssX_A = (SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as:
	# ssX_A = diag (scale_X) %*% A;
	# ssX_A [D, ] = ssX_A [D, ] + t(shift_X) %*% A;
	#
	# tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as:
	# tssX_A = diag (scale_X) %% A + shift_X %% A [D, ];

	# Convert "Y_vec" into indicator matrix:
	max_y = max (Y_vec);
	if (min (Y_vec) <= 0) {
	# Category labels "0", "-1" etc. are converted into the largest label
	Y_vec = Y_vec + (- Y_vec + max_y + 1) * (Y_vec <= 0);
	max_y = max_y + 1;
	}
	Y = table (seq (1, N, 1), Y_vec, N, max_y);
	K = ncol (Y) - 1; # The number of non-baseline categories

	lambda = (scale_lambda %% matrix (1, rows = 1, cols = K)) regularization;
	delta = 0.5 * sqrt (D) / max (sqrt (rowSums_X_sq));

	B = matrix (0, rows = D, cols = K); ### LT = X %% (SHIFT/SCALE TRANSFORM) %% B;
	### LT = cbind (LT, matrix (0, rows = N, cols = 1));
	### LT = LT - rowMaxs (LT) %*% matrix (1, rows = 1, cols = K+1);
	P = matrix (1, rows = N, cols = K+1); ### exp_LT = exp (LT);
	P = P / (K + 1); ### P = exp_LT / (rowSums (exp_LT) %*% matrix (1, rows = 1, cols = K+1));
	obj = N * log (K + 1); ### obj = - sum (Y * LT) + sum (log (rowSums (exp_LT))) + 0.5 * sum (lambda * (B_new ^ 2));

	Grad = t(X) %*% (P [, 1:K] - Y [, 1:K]);
	if (intercept_status == 2) {
	Grad = diag (scale_X) %% Grad + shift_X %% Grad [D, ];
	}
	Grad = Grad + lambda * B;
	norm_Grad = sqrt (sum (Grad ^ 2));
	norm_Grad_initial = norm_Grad;

	if (maxinneriter == 0) {
	maxinneriter = D * K;
	}
	iter = 1;

	# boolean for convergence check
	converge = (norm_Grad < tol) \| (iter > maxiter);

	print ("-- Initially: Objective = " + obj + ", Gradient Norm = " + norm_Grad + ", Trust Delta = " + delta);

	if (fileLog != " ") {
	log_str = "OBJECTIVE,0," + obj;
	log_str = append (log_str, "GRADIENT_NORM,0," + norm_Grad);
	log_str = append (log_str, "TRUST_DELTA,0," + delta);
	} else {
	log_str = " ";
	}

	while (! converge)
	{
	# SOLVE TRUST REGION SUB-PROBLEM
	S = matrix (0, rows = D, cols = K);
	R = - Grad;
	V = R;
	delta2 = delta ^ 2;
	inneriter = 1;
	norm_R2 = sum (R ^ 2);
	innerconverge = (sqrt (norm_R2) <= psi * norm_Grad);
	is_trust_boundary_reached = 0;

	while (! innerconverge)
	{
	if (intercept_status == 2) {
	ssX_V = diag (scale_X) %*% V;
	ssX_V [D, ] = ssX_V [D, ] + t(shift_X) %*% V;
	} else {
	ssX_V = V;
	}
	Q = P [, 1:K] * (X %*% ssX_V);
	HV = t(X) %% (Q - P [, 1:K] (rowSums (Q) %*% matrix (1, rows = 1, cols = K)));
	if (intercept_status == 2) {
	HV = diag (scale_X) %% HV + shift_X %% HV [D, ];
	}
	HV = HV + lambda * V;
	alpha = norm_R2 / sum (V * HV);
	Snew = S + alpha * V;
	norm_Snew2 = sum (Snew ^ 2);
	if (norm_Snew2 <= delta2)
	{
	S = Snew;
	R = R - alpha * HV;
	old_norm_R2 = norm_R2
	norm_R2 = sum (R ^ 2);
	V = R + (norm_R2 / old_norm_R2) * V;
	innerconverge = (sqrt (norm_R2) <= psi * norm_Grad);
	} else {
	is_trust_boundary_reached = 1;
	sv = sum (S * V);
	v2 = sum (V ^ 2);
	s2 = sum (S ^ 2);
	rad = sqrt (sv ^ 2 + v2 * (delta2 - s2));
	if (sv >= 0) {
	alpha = (delta2 - s2) / (sv + rad);
	} else {
	alpha = (rad - sv) / v2;
	}
	S = S + alpha * V;
	R = R - alpha * HV;
	innerconverge = TRUE;
	}
	inneriter = inneriter + 1;
	innerconverge = innerconverge \| (inneriter > maxinneriter);
	}

	# END TRUST REGION SUB-PROBLEM

	# compute rho, update B, obtain delta
	gs = sum (S * Grad);
	qk = - 0.5 * (gs - sum (S * R));
	B_new = B + S;
	if (intercept_status == 2) {
	ssX_B_new = diag (scale_X) %*% B_new;
	ssX_B_new [D, ] = ssX_B_new [D, ] + t(shift_X) %*% B_new;
	} else {
	ssX_B_new = B_new;
	}

	LT = cbind ((X %*% ssX_B_new), matrix (0, rows = N, cols = 1));
	if (fileLog != " ") {
	log_str = append (log_str, "LINEAR_TERM_MIN," + iter + "," + min (LT));
	log_str = append (log_str, "LINEAR_TERM_MAX," + iter + "," + max (LT));
	}
	LT = LT - rowMaxs (LT) %*% matrix (1, rows = 1, cols = K+1);
	exp_LT = exp (LT);
	P_new = exp_LT / (rowSums (exp_LT) %*% matrix (1, rows = 1, cols = K+1));
	obj_new = - sum (Y * LT) + sum (log (rowSums (exp_LT))) + 0.5 * sum (lambda * (B_new ^ 2));

	# Consider updating LT in the inner loop
	# Consider the big "obj" and "obj_new" rounding-off their small difference below:

	actred = (obj - obj_new);

	rho = actred / qk;
	is_rho_accepted = (rho > eta0);
	snorm = sqrt (sum (S ^ 2));

	if (fileLog != " ") {
	log_str = append (log_str, "NUM_CG_ITERS," + iter + "," + (inneriter - 1));
	log_str = append (log_str, "IS_TRUST_REACHED," + iter + "," + is_trust_boundary_reached);
	log_str = append (log_str, "POINT_STEP_NORM," + iter + "," + snorm);
	log_str = append (log_str, "OBJECTIVE," + iter + "," + obj_new);
	log_str = append (log_str, "OBJ_DROP_REAL," + iter + "," + actred);
	log_str = append (log_str, "OBJ_DROP_PRED," + iter + "," + qk);
	log_str = append (log_str, "OBJ_DROP_RATIO," + iter + "," + rho);
	}

	if (iter == 1) {
	delta = min (delta, snorm);
	}

	alpha2 = obj_new - obj - gs;
	alpha = ifelse(alpha2 <= 0, sigma3, max(sigma1, -0.5 * gs / alpha2));

	if (rho < eta0)
	delta = min (max (alpha, sigma1) * snorm, sigma2 * delta);
	else if (rho < eta1)
	delta = max (sigma1 * delta, min (alpha * snorm, sigma2 * delta));
	else if (rho < eta2)
	delta = max (sigma1 * delta, min (alpha * snorm, sigma3 * delta));
	else
	delta = max (delta, min (alpha * snorm, sigma3 * delta));

	if (is_trust_boundary_reached == 1)
	{
	print ("-- Outer Iteration " + iter + ": Had " + (inneriter - 1) + " CG iterations, trust bound REACHED");
	} else {
	print ("-- Outer Iteration " + iter + ": Had " + (inneriter - 1) + " CG iterations");
	}
	print (" -- Obj.Reduction: Actual = " + actred + ", Predicted = " + qk +
	" (A/P: " + (round (10000.0 * rho) / 10000.0) + "), Trust Delta = " + delta);

	if (is_rho_accepted)
	{
	B = B_new;
	P = P_new;
	Grad = t(X) %*% (P [, 1:K] - Y [, 1:K]);
	if (intercept_status == 2) {
	Grad = diag (scale_X) %% Grad + shift_X %% Grad [D, ];
	}
	Grad = Grad + lambda * B;
	norm_Grad = sqrt (sum (Grad ^ 2));
	obj = obj_new;
	print (" -- New Objective = " + obj + ", Beta Change Norm = " + snorm + ", Gradient Norm = " + norm_Grad);
	if (fileLog != " ") {
	log_str = append (log_str, "IS_POINT_UPDATED," + iter + ",1");
	log_str = append (log_str, "GRADIENT_NORM," + iter + "," + norm_Grad);
	}
	} else {
	if (fileLog != " ") {
	log_str = append (log_str, "IS_POINT_UPDATED," + iter + ",0");
	}
	}

	if (fileLog != " ") {
	log_str = append (log_str, "TRUST_DELTA," + iter + "," + delta);
	}

	iter = iter + 1;
	converge = ((norm_Grad < (tol * norm_Grad_initial)) \| (iter > maxiter) \|
	((is_trust_boundary_reached == 0) & (abs (actred) < (abs (obj) + abs (obj_new)) * 0.00000000000001)));
	if (converge) { print ("Termination / Convergence condition satisfied."); } else { print (" "); }
	}

	if (intercept_status == 2) {
	B_out = diag (scale_X) %*% B;
	B_out [D, ] = B_out [D, ] + t(shift_X) %*% B;
	} else {
	B_out = B;
	}
	write (B_out, fileB, format=fmtB);

	if (fileLog != " ") {
	write (log_str, fileLog);
	}