| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| # How to invoke this dml script L2SVM.dml? |
| # Assume L2SVM_HOME is set to the home of the dml script |
| # Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR |
| # hadoop jar SystemDS.jar -f $L2SVM_HOME/L2SVM.dml -nvargs X="$INPUT_DIR/X" Y="$INPUT_DIR/Y" icpt=0 tol=1.0E-8 reg=1.0 maxiter=3 model="$OUTPUT_DIR/w" Log="$OUTPUT_DIR/Log" |
| |
| # Note about inputs: |
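| # Labels (entries in Y) must take exactly two distinct values. |
| # They are expected to be either -1 and +1, or the result of recoding |
| # (e.g. 1 and 2); any other pair is mapped to -1 (smaller) and +1 (larger). |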
| |
| # Optional named arguments (-nvargs) and the defaults used when they are absent: |
| #   fmt     - format passed to write() for the model (default "text") |
| #   icpt    - 1 appends a column of ones to X (intercept), otherwise none (default 0) |
| #   tol     - tolerance of the outer-loop termination test (default 0.001) |
| #   reg     - regularization weight, called lambda below (default 1.0) |
| #   maxiter - upper bound on the number of outer iterations (default 100) |
| cmdLine_fmt = ifdef($fmt, "text") |
| cmdLine_icpt = ifdef($icpt, 0) |
| cmdLine_tol = ifdef($tol, 0.001) |
| cmdLine_reg = ifdef($reg, 1.0) |
| cmdLine_maxiter = ifdef($maxiter, 100) |
| |
| # Load the feature matrix X and the label vector Y. |
| # Labels are rounded to the nearest integer before they are inspected. |
| X = read($X) |
| Y = round(read($Y)) |
| |
| # Fail early with a clear message when X and Y disagree on the number of |
| # examples; otherwise the mismatch only surfaces later in t(X) %*% Y. |
| if (nrow(X) != nrow(Y)) { |
|     stop("Error: X and Y must have the same number of rows") |
| } |
| |
| # Y must hold exactly two distinct labels: every entry has to equal either |
| # the minimum or the maximum. If all labels are identical, min == max, both |
| # counts equal nrow(Y), their sum is 2*nrow(Y) and the check fails as well, |
| # which also keeps (check_max - check_min) below from being zero. |
| check_min = min(Y) |
| check_max = max(Y) |
| num_min = sum(Y == check_min) |
| num_max = sum(Y == check_max) |
| if (num_min + num_max != nrow(Y)) { |
|     stop("Error please check Y, it should contain only 2 labels") |
| } |
| else { |
|     # Linearly rescale so that the smaller label becomes -1 and the larger +1. |
|     if (check_min != -1 | check_max != +1) { |
|         Y = 2/(check_max - check_min)*Y - (check_min + check_max)/(check_max - check_min) |
|     } |
| } |
| |
| # Short aliases for the optimizer settings taken from the command line. |
| epsilon = cmdLine_tol |
| lambda = cmdLine_reg |
| maxiterations = cmdLine_maxiter |
| intercept = cmdLine_icpt |
| |
| # Shape of the raw feature matrix, measured before any intercept column is added. |
| num_samples = nrow(X) |
| dimensions = ncol(X) |
| |
| # With an intercept, X gains a trailing column of ones and w one extra row, |
| # so the last entry of w acts as the bias term. |
| num_rows_in_w = dimensions |
| if (intercept == 1) { |
|     X = cbind(X, matrix(1, rows=num_samples, cols=1)) |
|     num_rows_in_w = dimensions + 1 |
| } |
| |
| # The optimization starts from the zero weight vector. |
| w = matrix(0, rows=num_rows_in_w, cols=1) |
| |
| # Nonlinear conjugate-gradient minimization of |
| #   obj(w) = 0.5 * sum(max(0, 1 - Y * (X %*% w))^2) + lambda/2 * sum(w * w) |
| # |
| # g_old / g_new hold the NEGATIVE gradient of obj. At w = 0 every example |
| # violates the margin (1 - Y * Xw = 1), so the negative gradient is t(X) %*% Y, |
| # which is also used as the first search direction s. |
| g_old = t(X) %*% Y |
| s = g_old |
| |
| # Xw caches X %*% w (w is all zeros at this point) and is updated incrementally. |
| Xw = matrix(0, rows=nrow(X), cols=1) |
| debug_str = "# Iter, Obj" |
| iter = 0 |
| continue = 1 |
| while(continue == 1 & iter < maxiterations) { |
|     # minimizing primal obj along direction s |
|     # (1-D Newton iteration on the step size; g and h are the first and |
|     # second derivative of obj(w + step_sz*s) with respect to step_sz) |
|     step_sz = 0 |
|     Xd = X %*% s |
|     wd = lambda * sum(w * s) |
|     dd = lambda * sum(s * s) |
|     continue1 = 1 |
|     while(continue1 == 1){ |
|         tmp_Xw = Xw + step_sz*Xd |
|         out = 1 - Y * (tmp_Xw) |
|         # sv marks the examples that currently violate the margin |
|         sv = (out > 0) |
|         out = out * sv |
|         g = wd + step_sz*dd - sum(out * Y * Xd) |
|         h = dd + sum(Xd * sv * Xd) |
|         if (h > 0) { |
|             step_sz = step_sz - g/h |
|             if (g*g/h < 0.0000000001){ |
|                 continue1 = 0 |
|             } |
|         } |
|         else { |
|             # h is 0 or NaN (e.g. s is all zeros, or lambda is 0 and no |
|             # example violates the margin along s). g/h would be NaN and the |
|             # test above could never succeed, so stop the line search and |
|             # keep the current step size instead of looping forever. |
|             continue1 = 0 |
|         } |
|     } |
| |
|     # update weights and the cached product X %*% w |
|     w = w + step_sz*s |
|     Xw = Xw + step_sz*Xd |
| |
|     # objective value and negative gradient at the new w |
|     out = 1 - Y * Xw |
|     sv = (out > 0) |
|     out = sv * out |
|     obj = 0.5 * sum(out * out) + lambda/2 * sum(w * w) |
|     g_new = t(X) %*% (out * Y) - lambda * w |
| |
|     print("OBJ = " + obj) |
|     debug_str = append(debug_str, iter + "," + obj) |
| |
|     # stop once step_sz * (s . g_old) is small relative to the objective |
|     tmp = sum(s * g_old) |
|     if(step_sz*tmp < epsilon*obj){ |
|         continue = 0 |
|     } |
| |
|     # non-linear CG step: be = |g_new|^2 / |g_old|^2. |
|     # If the previous gradient is exactly zero the ratio is undefined; use |
|     # be = 0 (plain gradient direction) so that s and w are not turned into NaN. |
|     g_old_sq = sum(g_old * g_old) |
|     if (g_old_sq > 0) { |
|         be = sum(g_new * g_new)/g_old_sq |
|     } |
|     else { |
|         be = 0 |
|     } |
|     s = be * s + g_new |
|     g_old = g_new |
| |
|     iter = iter + 1 |
| } |
| |
| # Persist the learned weight vector to the location given by $model, in the |
| # format selected via fmt. When icpt=1 the last row of w is the weight of the |
| # appended column of ones. |
| write(w, $model, format=cmdLine_fmt) |
| # Persist the log built up in debug_str (header plus one "iter,obj" entry per |
| # outer iteration) to the location given by $Log. |
| write(debug_str, $Log) |