blob: 59565315d2b8397f99143194d7a74c7025cb13b7 [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# This script can be used to compute label predictions
# Meant for use with a model learnt using l2-svm.dml
#
# Given ground truth labels, the script will compute an
# accuracy (%) for the predictions
#
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# X String --- Location to read the matrix X of feature vectors
# model String --- Location of the existing model generated by l2-svm
# fmt String "text" Format used when writing the scores, such as "text" or "csv"
# Y String --- [OPTIONAL] Location to read the true label matrix Y. Only needed
# for evaluating performance (accuracy, confusion) of the model.
# confusion String --- [OPTIONAL] Location to write the 2x2 confusion matrix (always written as CSV), valid if Y supplied
# accuracy String --- [OPTIONAL] Location to write the accuracy string "Accuracy (%): <value>", valid if Y supplied
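# scores String --- [OPTIONAL] Location to write the raw decision scores (X %*% w + b)
# scoring_only Boolean FALSE [OPTIONAL] If TRUE, only compute the scores; Y is not read and no accuracy/confusion is produced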
# ---------------------------------------------------------------------------------------------
#
# Example Usage:
# hadoop jar SystemDS.jar -f l2-svm-predict.dml -nvargs X=data Y=labels model=model scores=scores accuracy=accuracy confusion=confusion fmt="text"
#
# Note about inputs:
# labels (entries in Y) should either be set to +1/-1
# or be the result of recoding
# anything else may prompt an error message from this script
# ---- Optional named arguments; a single blank " " stands for "not supplied" ----
cmdLine_Y = ifdef($Y, " ")
cmdLine_confusion = ifdef($confusion, " ")
cmdLine_accuracy = ifdef($accuracy, " ")
cmdLine_scores = ifdef($scores, " ")
cmdLine_scoring_only = ifdef($scoring_only, FALSE)
cmdLine_fmt = ifdef($fmt, "text")

# ---- Load the feature matrix and the stored model vector ----
X = read($X)
w = read($model)

# The last four rows of the model vector carry metadata instead of weights.
# Counting from the bottom: feature count, intercept flag, negative label,
# positive label.
num_model_rows = nrow(w)
num_features = ncol(X)

model_dims = as.scalar(w[num_model_rows,1])
if (model_dims != num_features) {
    stop("Stopping due to invalid input: Model dimensions do not seem to match input data dimensions")
}

intercept = as.scalar(w[num_model_rows-1,1])
negative_label = as.scalar(w[num_model_rows-2,1])
positive_label = as.scalar(w[num_model_rows-3,1])

# Strip the four metadata rows so that only the learnt coefficients remain.
w = w[1:(num_model_rows-4),]

# When the intercept flag is set, the last remaining row is the bias term.
bias = 0.0
if (intercept == 1) {
    bias = as.scalar(w[nrow(w),1])
}

# Raw decision values: bias plus the product of X with the per-feature weights.
scores = bias + (X %*% w[1:num_features,])

# Persist the scores only when an output location was requested.
if (cmdLine_scores != " ") {
    write(scores, cmdLine_scores, format=cmdLine_fmt)
}
# ---- Evaluate the predictions against ground-truth labels ----
# Y is optional: evaluation runs only when scoring_only is FALSE and a location
# for Y was supplied. Without Y there is nothing to compare against, and
# reading the placeholder path " " would fail.
if ((!cmdLine_scoring_only) & (cmdLine_Y != " ")) {
    Y = read(cmdLine_Y)

    # A score >= 0 is predicted as the positive class (pred is a 0/1 indicator).
    pred = (scores >= 0)

    # Translate the indicator into the label values stored with the model.
    pred_labels = pred*positive_label + (1-pred)*negative_label

    # Accuracy: percentage of rows whose predicted label equals the true label.
    num_correct = sum(pred_labels == Y)
    acc = 100*num_correct/nrow(X)

    acc_str = "Accuracy (%): " + acc
    print(acc_str)
    if (cmdLine_accuracy != " ") {
        write(acc_str, cmdLine_accuracy)
    }

    if (cmdLine_confusion != " ") {
        # Class indicators for the predictions; pred is already 1 for the
        # positive class and 0 for the negative class.
        pred_is_plus = pred
        pred_is_minus = 1 - pred

        # Class indicators for the true labels. Y is compared directly with
        # the stored negative label instead of first being mapped onto -1/+1
        # with floating-point arithmetic, so the equality test cannot be
        # thrown off by rounding and never divides by
        # (positive_label - negative_label). Every entry that is not the
        # negative label counts as positive.
        y_is_minus = (Y == negative_label)
        y_is_plus = 1 - y_is_minus

        # 2x2 confusion matrix: rows are predicted (negative, positive),
        # columns are actual (negative, positive).
        confusion_mat = matrix(0, rows=2, cols=2)
        confusion_mat[1,1] = sum(pred_is_minus*y_is_minus)
        confusion_mat[1,2] = sum(pred_is_minus*y_is_plus)
        confusion_mat[2,1] = sum(pred_is_plus*y_is_minus)
        confusion_mat[2,2] = sum(pred_is_plus*y_is_plus)

        # The confusion matrix is always written as CSV, independent of fmt.
        write(confusion_mat, cmdLine_confusion, format="csv")
    }
}