blob: b16ed3f4cf2ad1cbdba01fa0a944ea07900fa58a [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# 10K Dataset w/ 4-fold CV
# Total time: 5.933374000000001 sec.
# Number of executed MR Jobs: 0.
# hadoop jar SystemDS.jar -f CV_LogisticRegression.dml -args itau/logreg/X_10k_10 itau/logreg/y_10k 4 0 50 1000.0 1.0 1.0 1.0 1.0
# 200K Dataset w/ 4-fold CV
# Total time: 66.17157 sec.
# Number of executed MR Jobs: 4.
# hadoop jar SystemDS.jar -f CV_LogisticRegression.dml -args itau/logreg/X_200k_50 itau/logreg/y_200k 4 0 50 1000.0 1.0 1.0 1.0 1.0
#X = Rand(rows=1000000, cols=100);
#y = Rand(rows=1000000, cols=1);
X = read( $1 );
y = read( $2 );
m = nrow( X );
n = ncol( X );
k = $3;
#parameters for model training
intercept = $4
maxiter = $5
C = $6
#parameters for model scoring
value_TP = $7;
cost_FP = $8;
value_TN = $9;
cost_FN = $10;
P = Rand(rows=m, cols=1, min=0.0, max=1.0, pdf = "uniform");
P = round(0.5+P*k);
ones = matrix(1, rows=1, cols=n);
stats = matrix(0, rows=k, cols=40); #k-folds x 40-stats
parfor( i in 1:k )
{
#prepare train/test fold projections
vPxi = (P == i);
mPxi = (vPxi %*% ones);
#nvPxi = (P != i);
#nmPxi = (nvPxi %*% ones); #note: inefficient for sparse data
#create train/test folds
Xi = X * mPxi; # Create the TEST set with 1/k of all the rows
yi = y * vPxi; # Create the labels for the TEST set
nXi = X - Xi; # Create the TRAINING set with (k-1)/k of the rows
nyi = y - yi; # Create the labels for the TRAINING set
Xyi = cbind(Xi,yi); #keep alignment on removeEmpty
Xyi = removeEmpty( target=Xyi, margin="rows" );
Xi = Xyi[ , 1:n];
yi = Xyi[ , n+1];
nXyi = cbind(nXi,nyi); #keep alignment on removeEmpty
nXyi = removeEmpty( target=nXyi, margin="rows" );
nXi = nXyi[ , 1:n];
nyi = nXyi[ , n+1];
#train logistic regession model per fold, use the TRAINING set
wi = logisticRegression( nXi, nyi, intercept, maxiter, C );
#score logistic regression model per fold, use both the TRAINING and the TEST sets
score = scoreLogRegModel(nXi, nyi, wi, Xi, yi, value_TP, cost_FP, value_TN, cost_FN );
stats[i,] = score;
}
# printed output of stats
z = printFoldStatistics( stats );
################################################################################
logisticRegression = function (Matrix[double] X, Matrix[double] y, Integer in_intercept, Integer in_maxiter, Double in_C)
return (Matrix[double] ret)
{
# internal parameters
tol = 0.001
eta0 = 0.0001
eta1 = 0.25
eta2 = 0.75
sigma1 = 0.25
sigma2 = 0.5
sigma3 = 4.0
psi = 0.1
# read training data files
intercept = in_intercept
D = ncol(X)
#initialize w
w = Rand(rows=D, cols=1, min=0.0, max=0.0);
zeros_D = Rand(rows = D, cols = 1, min = 0.0, max = 0.0);
if (intercept == 1) {
num_samples = nrow(X);
ones = Rand(rows=num_samples, cols=1, min=1, max=1, pdf="uniform");
X = cbind(X, ones);
zero_matrix = Rand(rows=1, cols=1, min=0.0, max=0.0);
w = t(cbind(t(w), zero_matrix));
zeros_D = t(cbind(t(zeros_D), zero_matrix));
}
N = nrow(X)
maxiter = in_maxiter
maxinneriter = 1000
C = in_C
e = Rand(rows=1, cols=1, min=1.0, max=1.0);
o = X %*% w
logistic = 1.0/(1.0 + exp( -y * o))
obj = 0.5 * t(w) %*% w + C*sum(-log(logistic))
grad = w + C*t(X) %*% ((logistic - 1)*y)
logisticD = logistic*(1-logistic)
delta = sqrt(sum(grad*grad))
# number of iterations
iter = 0
# starting point for CG
# VS: change
zeros_N = Rand(rows = N, cols = 1, min = 0.0, max = 0.0);
# boolean for convergence check
converge = (delta < tol) | (iter > maxiter)
norm_r2 = sum(grad*grad)
# VS: change
norm_grad = sqrt(norm_r2)
norm_grad_initial = norm_grad
alpha = t(w) %*% w
alpha2 = alpha
while(!converge) {
norm_grad = sqrt(sum(grad*grad))
#print("-- Outer Iteration = " + iter)
objScalar = as.scalar(obj)
#print(" Iterations = " + iter + ", Objective = " + objScalar + ", Gradient Norm = " + norm_grad)
# SOLVE TRUST REGION SUB-PROBLEM
s = zeros_D
os = zeros_N
r = -grad
d = r
inneriter = 0
innerconverge = ( sqrt(sum(r*r)) <= psi * norm_grad)
while (!innerconverge) {
inneriter = inneriter + 1
norm_r2 = sum(r*r)
od = X %*% d
Hd = d + C*(t(X) %*% (logisticD*od))
alpha_deno = t(d) %*% Hd
alpha = norm_r2 / alpha_deno
s = s + as.scalar(alpha) * d
os = os + as.scalar(alpha) * od
sts = t(s) %*% s
delta2 = delta*delta
stsScalar = as.scalar(sts)
shouldBreak = FALSE; # to mimic "break" in the following 'if' condition
if (stsScalar > delta2) {
#print(" --- cg reaches trust region boundary")
s = s - as.scalar(alpha) * d
os = os - as.scalar(alpha) * od
std = t(s) %*% d
dtd = t(d) %*% d
sts = t(s) %*% s
rad = sqrt(std*std + dtd*(delta2 - sts))
stdScalar = as.scalar(std)
tau = 0; #TODO
if(stdScalar >= 0) {
tau = (delta2 - sts)/(std + rad)
}
else {
tau = (rad - std)/dtd
}
s = s + as.scalar(tau) * d
os = os + as.scalar(tau) * od
r = r - as.scalar(tau) * Hd
#break
shouldBreak = TRUE;
innerconverge = TRUE;
}
if (!shouldBreak) {
r = r - as.scalar(alpha) * Hd
old_norm_r2 = norm_r2
norm_r2 = sum(r*r)
beta = norm_r2/old_norm_r2
d = r + beta*d
innerconverge = (sqrt(norm_r2) <= psi * norm_grad) | (inneriter > maxinneriter)
}
}
#print(" --- Inner CG Iteration = " + inneriter)
# END TRUST REGION SUB-PROBLEM
# compute rho, update w, obtain delta
gs = t(s) %*% grad
qk = -0.5*(gs - (t(s) %*% r))
wnew = w + s
onew = o + os
logisticnew = 1.0/(1.0 + exp(-y * onew ))
objnew = 0.5 * t(wnew) %*% wnew + C * sum(-log(logisticnew))
actred = (obj - objnew)
actredScalar = as.scalar(actred)
rho = actred / qk
qkScalar = as.scalar(qk)
rhoScalar = as.scalar(rho);
snorm = sqrt(sum( s * s ))
#print(" Actual = " + actredScalar)
#print(" Predicted = " + qkScalar)
if (iter==0) {
delta = min(delta, snorm)
}
alpha2 = objnew - obj - gs
alpha2Scalar = as.scalar(alpha2)
if (alpha2Scalar <= 0) {
alpha = sigma3*e
}
else {
ascalar = max(sigma1, -0.5*as.scalar(gs)/alpha2Scalar)
alpha = ascalar*e
}
if (rhoScalar > eta0) {
w = wnew
o = onew
grad = w + C*t(X) %*% ((logisticnew - 1) * y )
norm_grad = sqrt(sum(grad*grad))
logisticD = logisticnew * (1 - logisticnew)
obj = objnew
}
alphaScalar = as.scalar(alpha)
if (rhoScalar < eta0){
delta = min(max( alphaScalar , sigma1) * snorm, sigma2 * delta )
}
else {
if (rhoScalar < eta1){
delta = max(sigma1 * delta, min( alphaScalar * snorm, sigma2 * delta))
}
else {
if (rhoScalar < eta2) {
delta = max(sigma1 * delta, min( alphaScalar * snorm, sigma3 * delta))
}
else {
delta = max(delta, min( alphaScalar * snorm, sigma3 * delta))
}
}
}
o2 = y * o
correct = sum(o2 > 0)
accuracy = correct*100.0/N
iter = iter + 1
#converge = (norm_grad < (tol * norm_grad_initial)) | (iter > maxiter)
converge = (norm_grad < tol) | (iter > maxiter)
#print(" Delta = " + delta)
#print(" Training Accuracy = " + accuracy)
#print(" Correct = " + correct)
#print(" OuterIter = " + iter)
#print(" Converge = " + converge)
}
ret = w; #return
}
################################################################################
scoreLogRegModel = function (Matrix[double] X_train, Matrix[double] y_train, Matrix[double] w_train,
Matrix[double] X_test, Matrix[double] y_test,
double value_TP, double cost_FP, double value_TN, double cost_FN )
return (Matrix[double] stats)
# X_train = training-set feature matrix file
# y_train = training-set label vector file (each label is -1 or +1)
# w_train = training-set logistic regression weights file; may have an intercept weight at the end
# X_test = test-set feature matrix file
# y_test = test-set label vector file
# value_TP = the value of a true positive (label +1 predicted as +1)
# cost_FP = the cost of a false positive (label -1 predicted as +1), enter as a positive number
# value_TN = the value of a true negative (label -1 predicted as -1)
# cost_FN = the cost of a false negative (label +1 predicted as -1), enter as a positive number
# fold = the row in the output matrix that should contain all statistics
# num_folds= the number of folds = the number of rows in the output matrix
{
one = matrix(1, rows=1, cols=1);
stats = matrix(0, rows=1, cols=40);
num_train_records = nrow (X_train);
num_test_records = nrow (X_test);
num_features = ncol (X_test);
w = w_train;
if (nrow (w) == 1) {
w = t(w);
}
b = 0.0;
if (nrow (w) > num_features) {
b = as.scalar (w [num_features + 1, 1]);
}
# TRAINING DATA - ESTIMATE PROBABILITIES:
# Estimate the label probabilities and assign predicted labels to maximize the estimated $-value
linear_train = X_train %*% w [1:num_features, 1] + b;
prob_train = 1.0 / (1.0 + exp (- linear_train));
est_value_POS_train = value_TP * prob_train - cost_FP * (1.0 - prob_train);
est_value_NEG_train = value_TN * (1.0 - prob_train) - cost_FN * prob_train;
y_train_pred = 2 * (est_value_POS_train > est_value_NEG_train) - 1;
# Compute the estimated number of true/false positives/negatives
est_num_TP_train = sum ((y_train_pred + 1) * prob_train) / 2;
est_num_FP_train = sum ((y_train_pred + 1) * (1 - prob_train)) / 2;
est_num_TN_train = sum ((1 - y_train_pred) * (1 - prob_train)) / 2;
est_num_FN_train = sum ((1 - y_train_pred) * prob_train) / 2;
stats [1, 1] = one*est_num_TP_train;
stats [1, 2] = one*est_num_FP_train;
stats [1, 3] = one*est_num_TN_train;
stats [1, 4] = one*est_num_FN_train;
# Compute the estimated precision, recall = sensitivity, specificity, and value
est_precision_train = 100.0 * est_num_TP_train / (est_num_TP_train + est_num_FP_train);
est_recall_train = 100.0 * est_num_TP_train / (est_num_TP_train + est_num_FN_train);
est_specificity_train = 100.0 * est_num_TN_train / (est_num_TN_train + est_num_FP_train);
est_value_train =
est_num_TP_train * value_TP - est_num_FP_train * cost_FP + est_num_TN_train * value_TN - est_num_FN_train * cost_FN;
stats [1, 5] = one*est_precision_train;
stats [1, 6] = one*est_recall_train;
stats [1, 7] = one*est_specificity_train;
stats [1, 8] = one*est_value_train;
# TRAINING DATA - COMPARE WITH ACTUAL LABELS:
# Compute the actual number of true/false positives/negatives
num_TP_train = sum ((y_train_pred + 1) * (y_train + 1)) / 4;
num_FP_train = sum ((y_train_pred + 1) * (1 - y_train)) / 4;
num_TN_train = sum ((1 - y_train_pred) * (1 - y_train)) / 4;
num_FN_train = sum ((1 - y_train_pred) * (y_train + 1)) / 4;
stats [1, 11] = one*num_TP_train;
stats [1, 12] = one*num_FP_train;
stats [1, 13] = one*num_TN_train;
stats [1, 14] = one*num_FN_train;
# Compute the actual precision, recall = sensitivity, specificity, and value
precision_train = 100.0 * num_TP_train / (num_TP_train + num_FP_train);
recall_train = 100.0 * num_TP_train / (num_TP_train + num_FN_train);
specificity_train = 100.0 * num_TN_train / (num_TN_train + num_FP_train);
value_train = num_TP_train * value_TP - num_FP_train * cost_FP + num_TN_train * value_TN - num_FN_train * cost_FN;
stats [1, 15] = one*precision_train;
stats [1, 16] = one*recall_train;
stats [1, 17] = one*specificity_train;
stats [1, 18] = one*value_train;
# TEST DATA - ESTIMATE PROBABILITIES:
# Estimate the label probabilities and assign predicted labels to maximize the estimated value
linear_test = X_test %*% w [1:num_features, 1] + b;
prob_test = 1.0 / (1.0 + exp (- linear_test));
est_value_POS_test = value_TP * prob_test - cost_FP * (1.0 - prob_test);
est_value_NEG_test = value_TN * (1.0 - prob_test) - cost_FN * prob_test;
y_test_pred = 2 * (est_value_POS_test > est_value_NEG_test) - 1;
# Compute the estimated number of true/false positives/negatives
est_num_TP_test = sum ((y_test_pred + 1) * prob_test) / 2;
est_num_FP_test = sum ((y_test_pred + 1) * (1 - prob_test)) / 2;
est_num_TN_test = sum ((1 - y_test_pred) * (1 - prob_test)) / 2;
est_num_FN_test = sum ((1 - y_test_pred) * prob_test) / 2;
stats [1, 21] = one*est_num_TP_test;
stats [1, 22] = one*est_num_FP_test;
stats [1, 23] = one*est_num_TN_test;
stats [1, 24] = one*est_num_FN_test;
# Compute the estimated precision, recall = sensitivity, specificity, and value
est_precision_test = 100.0 * est_num_TP_test / (est_num_TP_test + est_num_FP_test);
est_recall_test = 100.0 * est_num_TP_test / (est_num_TP_test + est_num_FN_test);
est_specificity_test = 100.0 * est_num_TN_test / (est_num_TN_test + est_num_FP_test);
est_value_test = est_num_TP_test * value_TP - est_num_FP_test * cost_FP + est_num_TN_test * value_TN - est_num_FN_test * cost_FN;
stats [1, 25] = one*est_precision_test;
stats [1, 26] = one*est_recall_test;
stats [1, 27] = one*est_specificity_test;
stats [1, 28] = one*est_value_test;
# TEST DATA - COMPARE WITH ACTUAL LABELS:
# Compute the actual number of true/false positives/negatives
num_TP_test = sum ((y_test_pred + 1) * (y_test + 1)) / 4;
num_FP_test = sum ((y_test_pred + 1) * (1 - y_test)) / 4;
num_TN_test = sum ((1 - y_test_pred) * (1 - y_test)) / 4;
num_FN_test = sum ((1 - y_test_pred) * (y_test + 1)) / 4;
stats [1, 31] = one*num_TP_test;
stats [1, 32] = one*num_FP_test;
stats [1, 33] = one*num_TN_test;
stats [1, 34] = one*num_FN_test;
# Compute the actual precision, recall = sensitivity, specificity, and value
precision_test = 100.0 * num_TP_test / (num_TP_test + num_FP_test);
recall_test = 100.0 * num_TP_test / (num_TP_test + num_FN_test);
specificity_test = 100.0 * num_TN_test / (num_TN_test + num_FP_test);
value_test = num_TP_test * value_TP - num_FP_test * cost_FP + num_TN_test * value_TN - num_FN_test * cost_FN;
stats [1, 35] = one*precision_test;
stats [1, 36] = one*recall_test;
stats [1, 37] = one*specificity_test;
stats [1, 38] = one*value_test;
}
printFoldStatistics = function (Matrix[double] stats ) return( Integer err )
{
stats_avg = round (colMeans(stats) * 10000.0) / 10000.0;
stats_max = round (colMaxs (stats) * 10000.0) / 10000.0;
stats_min = round (colMins (stats) * 10000.0) / 10000.0;
print ("Training Data, Label comparison statistics:");
print (" True Positives: Min = " + as.scalar (stats_min [1, 11]) + ", Avg = " + as.scalar (stats_avg [1, 11]) + ", Max = " + as.scalar (stats_max [1, 11]));
print (" False Positives: Min = " + as.scalar (stats_min [1, 12]) + ", Avg = " + as.scalar (stats_avg [1, 12]) + ", Max = " + as.scalar (stats_max [1, 12]));
print (" True Negatives: Min = " + as.scalar (stats_min [1, 13]) + ", Avg = " + as.scalar (stats_avg [1, 13]) + ", Max = " + as.scalar (stats_max [1, 13]));
print (" False Negatives: Min = " + as.scalar (stats_min [1, 14]) + ", Avg = " + as.scalar (stats_avg [1, 14]) + ", Max = " + as.scalar (stats_max [1, 14]));
print (" Precision %: Min = " + as.scalar (stats_min [1, 15]) + ", Avg = " + as.scalar (stats_avg [1, 15]) + ", Max = " + as.scalar (stats_max [1, 15]));
print ("Recall (Sensit-y)%: Min = " + as.scalar (stats_min [1, 16]) + ", Avg = " + as.scalar (stats_avg [1, 16]) + ", Max = " + as.scalar (stats_max [1, 16]));
print (" Specificity %: Min = " + as.scalar (stats_min [1, 17]) + ", Avg = " + as.scalar (stats_avg [1, 17]) + ", Max = " + as.scalar (stats_max [1, 17]));
print (" Value - Cost: Min = " + as.scalar (stats_min [1, 18]) + ", Avg = " + as.scalar (stats_avg [1, 18]) + ", Max = " + as.scalar (stats_max [1, 18]));
print (" ");
print(" ")
while(FALSE){}
print ("TEST Data, Label comparison statistics:");
print (" True Positives: Min = " + as.scalar (stats_min [1, 31]) + ", Avg = " + as.scalar (stats_avg [1, 31]) + ", Max = " + as.scalar (stats_max [1, 31]));
print (" False Positives: Min = " + as.scalar (stats_min [1, 32]) + ", Avg = " + as.scalar (stats_avg [1, 32]) + ", Max = " + as.scalar (stats_max [1, 32]));
print (" True Negatives: Min = " + as.scalar (stats_min [1, 33]) + ", Avg = " + as.scalar (stats_avg [1, 33]) + ", Max = " + as.scalar (stats_max [1, 33]));
print (" False Negatives: Min = " + as.scalar (stats_min [1, 34]) + ", Avg = " + as.scalar (stats_avg [1, 34]) + ", Max = " + as.scalar (stats_max [1, 34]));
print (" Precision %: Min = " + as.scalar (stats_min [1, 35]) + ", Avg = " + as.scalar (stats_avg [1, 35]) + ", Max = " + as.scalar (stats_max [1, 35]));
print ("Recall (Sensit-y)%: Min = " + as.scalar (stats_min [1, 36]) + ", Avg = " + as.scalar (stats_avg [1, 36]) + ", Max = " + as.scalar (stats_max [1, 36]));
print (" Specificity %: Min = " + as.scalar (stats_min [1, 37]) + ", Avg = " + as.scalar (stats_avg [1, 37]) + ", Max = " + as.scalar (stats_max [1, 37]));
print (" Value - Cost: Min = " + as.scalar (stats_min [1, 38]) + ", Avg = " + as.scalar (stats_avg [1, 38]) + ", Max = " + as.scalar (stats_max [1, 38]));
err = 0;
}