blob: 5afd5e701d7f68713c878ab72bc1f810c8a03043 [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# Solves Multinomial Logistic Regression using Trust Region method.
# (See: Trust Region Newton Method for Logistic Regression, Lin, Weng and Keerthi, JMLR 9 (2008) 627-650)
# INPUT PARAMETERS:
# --------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# --------------------------------------------------------------------------------------------
# X String --- Location to read the matrix of feature vectors
# Y String --- Location to read the matrix with category labels
# icpt Int 0 Intercept presence, shifting and rescaling X columns:
# 0 = no intercept, no shifting, no rescaling;
# 1 = add intercept, but neither shift nor rescale X;
# 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
# reg Double 0.0 regularization parameter (lambda = 1/C); intercept is not regularized
# tol Double 0.000001 tolerance ("epsilon")
# maxi Int 100 max. number of outer (Newton) iterations
# maxii Int 0 max. number of inner (conjugate gradient) iterations, 0 = no max
# --------------------------------------------------------------------------------------------
# The largest label represents the baseline category; if label -1 or 0 is present, then it is
# the baseline label (and it is converted to the largest label).
#
# NAME TYPE MEANING
# -------------------------------------------------------------------------------------------
# betas Double regression betas as output for prediction
# -------------------------------------------------------------------------------------------
m_multiLogReg = function(Matrix[Double] X, Matrix[Double] Y, Integer icpt = 2, Double tol = 0.000001,
Double reg = 1.0, Integer maxi = 100, Integer maxii = 20, Boolean verbose = TRUE)
return(Matrix[Double] betas)
{
eta0 = 0.0001;
eta1 = 0.25;
eta2 = 0.75;
sigma1 = 0.25;
sigma2 = 0.5;
sigma3 = 4.0;
psi = 0.1;
N = nrow (X);
D = ncol (X);
# Introduce the intercept, shift and rescale the columns of X if needed
if (icpt == 1 | icpt == 2) { # add the intercept column
X = cbind (X, matrix (1, N, 1));
D = ncol (X);
}
scale_lambda = matrix (1, D, 1);
if (icpt == 1 | icpt == 2)
scale_lambda [D, 1] = 0;
if (icpt == 2) # scale-&-shift X columns to mean 0, variance 1
{ # Important assumption: X [, D] = matrix (1, N, 1)
avg_X_cols = t(colSums(X)) / N;
var_X_cols = (t(colSums (X ^ 2)) - N * (avg_X_cols ^ 2)) / (N - 1);
is_unsafe = var_X_cols <= 0;
scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe);
scale_X [D, 1] = 1;
shift_X = - avg_X_cols * scale_X;
shift_X [D, 1] = 0;
rowSums_X_sq = (X ^ 2) %*% (scale_X ^ 2) + X %*% (2 * scale_X * shift_X) + sum (shift_X ^ 2);
}
else {
scale_X = matrix (1, D, 1);
shift_X = matrix (0, D, 1);
rowSums_X_sq = rowSums (X ^ 2);
}
# Henceforth we replace "X" with "X %*% (SHIFT/SCALE TRANSFORM)" and rowSums(X ^ 2)
# with "rowSums_X_sq" in order to preserve the sparsity of X under shift and scale.
# The transform is then associatively applied to the other side of the expression,
# and is rewritten via "scale_X" and "shift_X" as follows:
# ssX_A = (SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as:
# ssX_A = diag (scale_X) %*% A;
# ssX_A [D, ] = ssX_A [D, ] + t(shift_X) %*% A;
# tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as:
# tssX_A = diag (scale_X) %*% A + shift_X %*% A [D, ];
# Convert "Y" into indicator matrix:
max_y = max (Y);
if (min (Y) <= 0) {
# Category labels "0", "-1" etc. are converted into the largest label
Y = Y + (- Y + max_y + 1) * (Y <= 0);
max_y = max_y + 1;
}
Y = table (seq (1, N, 1), Y, N, max_y);
K = ncol (Y) - 1; # The number of non-baseline categories
lambda = (scale_lambda %*% matrix (1, 1, K)) * reg;
delta = 0.5 * sqrt (D) / max (sqrt (rowSums_X_sq));
B = matrix (0, D, K); ### LT = X %*% (SHIFT/SCALE TRANSFORM) %*% B;
### LT = cbind (LT, matrix (0, rows = N, cols = 1));
### LT = LT - rowMaxs (LT) %*% matrix (1, rows = 1, cols = K+1);
P = matrix (1, N, K+1); ### exp_LT = exp (LT);
P = P / (K + 1); ### P = exp_LT / (rowSums (exp_LT) %*% matrix (1, rows = 1, cols = K+1));
obj = N * log (K + 1); ### obj = - sum (Y * LT) + sum (log (rowSums (exp_LT))) + 0.5 * sum (lambda * (B_new ^ 2));
Grad = t(X) %*% (P [, 1:K] - Y [, 1:K]);
if (icpt == 2)
Grad = diag (scale_X) %*% Grad + shift_X %*% Grad [D, ];
Grad = Grad + lambda * B;
norm_Grad = sqrt (sum (Grad ^ 2));
norm_Grad_initial = norm_Grad;
if (maxii == 0)
maxii = D * K;
iter = 1;
# boolean for convergence check
converge = (norm_Grad < tol) | (iter > maxi);
if(verbose)
print ("-- Initially: Objective = " + obj + ", Gradient Norm = " + norm_Grad + ", Trust Delta = " + delta);
while (! converge)
{
# SOLVE TRUST REGION SUB-PROBLEM
S = matrix (0, D, K);
R = - Grad;
V = R;
delta2 = delta ^ 2;
inneriter = 1;
norm_R2 = sum (R ^ 2);
innerconverge = (sqrt (norm_R2) <= psi * norm_Grad);
is_trust_boundary_reached = 0;
while (! innerconverge)
{
if (icpt == 2) {
ssX_V = diag (scale_X) %*% V;
ssX_V [D, ] = ssX_V [D, ] + t(shift_X) %*% V;
}
else
ssX_V = V;
Q = P [, 1:K] * (X %*% ssX_V);
HV = t(X) %*% (Q - P [, 1:K] * (rowSums (Q) %*% matrix (1, 1, K)));
if (icpt == 2)
HV = diag (scale_X) %*% HV + shift_X %*% HV [D, ];
HV = HV + lambda * V;
alpha = norm_R2 / sum (V * HV);
Snew = S + alpha * V;
norm_Snew2 = sum (Snew ^ 2);
if (norm_Snew2 <= delta2) {
S = Snew;
R = R - alpha * HV;
old_norm_R2 = norm_R2
norm_R2 = sum (R ^ 2);
V = R + (norm_R2 / old_norm_R2) * V;
innerconverge = (sqrt (norm_R2) <= psi * norm_Grad);
}
else {
is_trust_boundary_reached = 1;
sv = sum (S * V);
v2 = sum (V ^ 2);
s2 = sum (S ^ 2);
rad = sqrt (sv ^ 2 + v2 * (delta2 - s2));
if (sv >= 0)
alpha = (delta2 - s2) / (sv + rad);
else
alpha = (rad - sv) / v2;
S = S + alpha * V;
R = R - alpha * HV;
innerconverge = TRUE;
}
inneriter = inneriter + 1;
innerconverge = innerconverge | (inneriter > maxii);
}
# END TRUST REGION SUB-PROBLEM
# compute rho, update B, obtain delta
gs = sum (S * Grad);
qk = - 0.5 * (gs - sum (S * R));
B_new = B + S;
if (icpt == 2) {
ssX_B_new = diag (scale_X) %*% B_new;
ssX_B_new [D, ] = ssX_B_new [D, ] + t(shift_X) %*% B_new;
}
else
ssX_B_new = B_new;
LT = cbind ((X %*% ssX_B_new), matrix (0, N, 1));
LT = LT - rowMaxs (LT) %*% matrix (1, 1, K+1);
exp_LT = exp (LT);
P_new = exp_LT / (rowSums (exp_LT) %*% matrix (1, 1, K+1));
obj_new = - sum (Y * LT) + sum (log (rowSums (exp_LT))) + 0.5 * sum (lambda * (B_new ^ 2));
# Consider updating LT in the inner loop
# Consider the big "obj" and "obj_new" rounding-off their small difference below:
actred = (obj - obj_new);
rho = actred / qk;
is_rho_accepted = (rho > eta0);
snorm = sqrt (sum (S ^ 2));
if (iter == 1)
delta = min (delta, snorm);
alpha2 = obj_new - obj - gs;
alpha = ifelse(alpha2 <= 0, sigma3, max(sigma1, -0.5 * gs / alpha2));
if (rho < eta0)
delta = min (max (alpha, sigma1) * snorm, sigma2 * delta);
else if (rho < eta1)
delta = max (sigma1 * delta, min (alpha * snorm, sigma2 * delta));
else if (rho < eta2)
delta = max (sigma1 * delta, min (alpha * snorm, sigma3 * delta));
else
delta = max (delta, min (alpha * snorm, sigma3 * delta));
if(verbose) {
if (is_trust_boundary_reached == 1)
print ("-- Outer Iteration " + iter + ": Had " + (inneriter - 1) + " CG iterations, trust bound REACHED");
else print ("-- Outer Iteration " + iter + ": Had " + (inneriter - 1) + " CG iterations");
print (" -- Obj.Reduction: Actual = " + actred + ", Predicted = " + qk +
" (A/P: " + (round (10000.0 * rho) / 10000.0) + "), Trust Delta = " + delta);
}
if (is_rho_accepted) {
B = B_new;
P = P_new;
Grad = t(X) %*% (P [, 1:K] - Y [, 1:K]);
if (icpt == 2)
Grad = diag (scale_X) %*% Grad + shift_X %*% Grad [D, ];
Grad = Grad + lambda * B;
norm_Grad = sqrt (sum (Grad ^ 2));
obj = obj_new;
if(verbose)
print (" -- New Objective = " + obj + ", Beta Change Norm = " + snorm + ", Gradient Norm = " + norm_Grad);
}
iter = iter + 1;
converge = ((norm_Grad < (tol * norm_Grad_initial)) | (iter > maxi) |
((is_trust_boundary_reached == 0) & (abs (actred) < (abs (obj) + abs (obj_new)) * 0.00000000000001)));
if (verbose & converge)
print ("Termination / Convergence condition satisfied.");
}
if (icpt == 2) {
B_out = diag (scale_X) %*% B;
B_out [D, ] = B_out [D, ] + t(shift_X) %*% B;
}
else {
B_out = B;
}
betas = B_out
}