blob: 0981cb8b4d744c243298de1ef55d9e63bf4ede97 [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# How to invoke this dml script L2SVM.dml?
# Assume L2SVM_HOME is set to the home of the dml script
# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR
# hadoop jar SystemDS.jar -f $L2SVM_HOME/L2SVM.dml -nvargs X="$INPUT_DIR/X" Y="$INPUT_DIR/Y" icpt=0 tol=1.0E-8 reg=1.0 maxiter=3 model="$OUTPUT_DIR/w" Log="$OUTPUT_DIR/Log"
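# Note about inputs:
# Labels (entries in Y) must take exactly two distinct values.
# If they are not already -1 and +1 (e.g. they are the result of recoding),
# they are rescaled so that the smaller value becomes -1 and the larger +1.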
# Optional named arguments; ifdef() supplies the default when the argument
# is not passed on the command line.
# fmt: file format used when writing the model (default "text")
cmdLine_fmt=ifdef($fmt,"text")
# icpt: 1 appends a column of ones to X so that w gets an intercept entry
cmdLine_icpt=ifdef($icpt, 0)
# tol: tolerance used in the outer-loop termination test
cmdLine_tol=ifdef($tol, 0.001)
# reg: regularization constant (lambda)
cmdLine_reg=ifdef($reg, 1.0)
# maxiter: upper bound on the number of outer iterations
cmdLine_maxiter=ifdef($maxiter, 100)
# Load the feature matrix and the label vector; label entries are rounded.
X = read($X)
Y = round(read($Y))

# Smallest and largest label value, and how many rows carry each of them.
label_lo = min(Y)
label_hi = max(Y)
count_lo = sum(Y == label_lo)
count_hi = sum(Y == label_hi)

if (count_lo + count_hi == nrow(Y)) {
  # Exactly two distinct labels: map them affinely onto -1 / +1 unless they
  # already have those values.
  if (!(label_lo == -1 & label_hi == 1)) {
    label_span = label_hi - label_lo
    Y = 2/label_span*Y - (label_lo + label_hi)/label_span
  }
}
else {
  # Either a single label value or more than two: cannot train a binary SVM.
  stop("Error please please check Y, it should contain only 2 labels")
}
# Solver settings taken from the command-line defaults.
epsilon = cmdLine_tol
lambda = cmdLine_reg
maxiterations = cmdLine_maxiter
intercept = cmdLine_icpt

# The weight vector has one entry per feature column, plus one for the
# constant column that is appended to X when an intercept is requested.
w_len = ncol(X)
if (intercept == 1) {
  X = cbind(X, matrix(1, rows=nrow(X), cols=1))
  w_len = w_len + 1
}

# Start from w = 0, so the margins X %*% w are all zero as well.
w = matrix(0, rows=w_len, cols=1)
Xw = matrix(0, rows=nrow(X), cols=1)

# Initial gradient term; it doubles as the first search direction.
g_old = t(X) %*% Y
s = g_old

# Iteration log header and outer-loop control state.
debug_str = "# Iter, Obj"
iter = 0
continue = 1
# Outer loop: nonlinear conjugate gradient on the primal L2-SVM objective
#   obj = 0.5 * sum(max(0, 1 - Y * (X %*% w))^2) + lambda/2 * sum(w^2)
# Each pass does a 1-D Newton line search along direction s, updates w,
# logs the objective, tests for convergence and builds the next direction.
#
# Optional argument maxii: cap on Newton iterations per line search. The
# default is large enough not to affect a converging search; it only
# prevents an endless loop when the search cannot converge (e.g. NaN values).
max_inner_iter = ifdef($maxii, 1000)
while(continue == 1 & iter < maxiterations) {
  # minimizing primal obj along direction s
  step_sz = 0
  Xd = X %*% s
  wd = lambda * sum(w * s)
  dd = lambda * sum(s * s)
  continue1 = 1
  inner_iter = 0
  while(continue1 == 1 & inner_iter < max_inner_iter){
    tmp_Xw = Xw + step_sz*Xd
    out = 1 - Y * (tmp_Xw)
    # sv marks rows that currently violate the margin
    sv = (out > 0)
    out = out * sv
    # g, h: first and second derivative of the objective w.r.t. step_sz
    g = wd + step_sz*dd - sum(out * Y * Xd)
    h = dd + sum(Xd * sv * Xd)
    if (h > 0) {
      step_sz = step_sz - g/h
      if (g*g/h < 0.0000000001){
        continue1 = 0
      }
    }
    else {
      # Zero (or NaN) curvature: the Newton step g/h is undefined and the
      # convergence test could never succeed, so end the line search here.
      continue1 = 0
    }
    inner_iter = inner_iter + 1
  }

  #update weights
  w = w + step_sz*s
  Xw = Xw + step_sz*Xd

  out = 1 - Y * Xw
  sv = (out > 0)
  out = sv * out
  obj = 0.5 * sum(out * out) + lambda/2 * sum(w * w)
  g_new = t(X) %*% (out * Y) - lambda * w

  print("OBJ = " + obj)
  debug_str = append(debug_str, iter + "," + obj)

  # Stop once the step taken is small relative to the objective value.
  tmp = sum(s * g_old)
  if(step_sz*tmp < epsilon*obj){
    continue = 0
  }

  #non-linear CG step
  # be = |g_new|^2 / |g_old|^2; when the previous gradient is exactly zero
  # the ratio is undefined, so restart from the plain gradient direction
  # instead of propagating NaN/Inf into s.
  g_old_sq = sum(g_old * g_old)
  be = 0
  if (g_old_sq > 0) {
    be = sum(g_new * g_new)/g_old_sq
  }
  s = be * s + g_new
  g_old = g_new

  iter = iter + 1
}
# Write the learned weight vector to $model in the format given by fmt
# (default "text"); when icpt=1 its last entry belongs to the appended
# column of ones.
write(w, $model, format=cmdLine_fmt)
# Write the accumulated "iter,obj" log string to $Log.
write(debug_str, $Log)