blob: 8b05456fcbd7c74184843af1605c820af29c238d [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# This builtin function computes the area under the ROC curve (AUC)
# for binary classifiers.
#
# INPUT:
# ------------------------------------------------------------------------------
# Y Binary response vector (shape: n x 1), in -1/+1 or 0/1 encoding
# P Prediction scores (predictor such as estimated probabilities)
# for true class (shape: n x 1), assumed in [0,1]
# ------------------------------------------------------------------------------
#
# OUTPUT:
# ------------------------------------------------------------------------------
# auc Area under the ROC curve (AUC)
# ------------------------------------------------------------------------------
m_auc = function(Matrix[Double] Y, Matrix[Double] P)
  return(Double auc)
{
  # Computes the area under the ROC curve for binary labels Y and scores P.
  # Stops with an error if Y and P differ in their number of rows, or if Y
  # does not contain exactly two distinct label values.

  # fail early with a clear message instead of a later cbind dimension error
  if( nrow(Y) != nrow(P) )
    stop("AUC: stopping because Y and P have different numbers of rows")

  minv = min(Y)
  maxv = max(Y)

  # check input parameter assertions
  if(minv == maxv)
    stop("AUC: stopping because only one class label existing in Y")
  if(sum(Y==minv) + sum(Y==maxv) < nrow(Y))
    stop("AUC: stopping because more than two class labels existing in Y")

  # normalize labels to 0/1: the larger label becomes the positive class (1).
  # For 0/1 and -1/+1 inputs this yields the same result as before, and it
  # additionally handles other two-label encodings (e.g., 1/2).
  Y = (Y == maxv);

  # compute the rate vectors per unique score threshold (ascending thresholds)
  [tpr, fpr] = cumsumROC(Y, P);

  # Prepend the implicit start of the curve (threshold below all scores):
  # no sample has score <= threshold (fpr = 0) and all positives have a
  # score > threshold (tpr = 1). Including this point makes the first
  # segment a proper trapezoid (correct when the lowest score is shared by
  # both classes) and keeps the index ranges valid for a single threshold.
  tprX = rbind(matrix(1, rows=1, cols=1), tpr);
  fprX = rbind(matrix(0, rows=1, cols=1), fpr);

  # compute AUC via Trapezoidal rule over all consecutive curve points
  nd = nrow(tprX);
  auc = sum((fprX[2:nd,] - fprX[1:(nd-1),]) * (tprX[2:nd,] + tprX[1:(nd-1),]) / 2);
}
# Computes the ROC rate vectors for all distinct score thresholds in a
# vectorized way (sort by score, then cumulative sums).
#
# Y    label vector (n x 1); sum(Y) is used as the number of positives,
#      so Y is expected in 0/1 encoding
# P    prediction scores (n x 1)
#
# tpr  per distinct threshold t (ascending): fraction of positives with
#      score > t (non-increasing, ends at 0)
# fpr  per distinct threshold t (ascending): fraction of negatives with
#      score <= t (non-decreasing, ends at 1)
cumsumROC = function(Matrix[Double] Y, Matrix[Double] P)
  return(Matrix[Double] tpr, Matrix[Double] fpr)
{
  pos = sum(Y);
  neg = nrow(Y) - pos;

  # compute ROC curve for distinct threshold scores
  # (cut-offs > and <= chosen to match R-pROC-package behavior)
  # vectorized implementation via cumsum of ordered scores P
  YP = order(target=cbind(Y, P), by=2);
  oY = YP[,1]; oP = YP[,2];
  tp = pos - cumsum(oY); # positives with score above the row's score
  fp = cumsum(!oY);      # negatives with score up to the row's score

  # indicator of the last row of each run of equal scores; the final row
  # always closes a run, so it is kept unconditionally instead of being
  # compared against a sentinel value (a sentinel of 0 dropped the final
  # point whenever the maximum score was exactly 0)
  n = nrow(oP);
  uI = matrix(1, rows=n, cols=1);
  if( n > 1 )
    uI[1:(n-1),] = (oP[1:(n-1),] != oP[2:n,]);

  # extract the counts at the unique thresholds
  tp = removeEmpty(target=tp, margin="rows", select=uI);
  fp = removeEmpty(target=fp, margin="rows", select=uI);
  tpr = tp / pos; # non-increasing over ascending thresholds
  fpr = fp / neg; # non-decreasing over ascending thresholds
}
# Straightforward per-threshold computation of the ROC rate vectors: one
# pass over the data for every distinct score. It is not called by the other
# functions visible in this file.
#
# Y    label vector (n x 1); sum(Y) is used as the number of positives
# P    prediction scores (n x 1)
#
# tpr  per distinct threshold t (ascending): fraction of positives with
#      score > t
# fpr  per distinct threshold t (ascending): fraction of negatives with
#      score <= t
naiveROC = function(Matrix[Double] Y, Matrix[Double] P)
  return(Matrix[Double] tpr, Matrix[Double] fpr)
{
  # distinct scores serve as cut-offs, sorted in increasing order
  # (cut-offs > and <= chosen to match R-pROC-package behavior)
  cutoffs = order(target=unique(P));
  numCut = nrow(cutoffs)

  # class totals used to normalize the counts into rates
  numPos = sum(Y);
  numNeg = nrow(Y) - numPos;

  # count, independently per cut-off, the positives strictly above it and
  # the negatives at or below it
  tpCnt = matrix(0, rows=numCut, cols=1);
  fpCnt = matrix(0, rows=numCut, cols=1);
  parfor(k in 1:numCut) {
    thr = as.scalar(cutoffs[k,1])
    tpCnt[k,1] = sum((P > thr) & Y)
    fpCnt[k,1] = sum((P <= thr) & !Y)
  }

  tpr = tpCnt / numPos; # shrinks as the cut-off grows
  fpr = fpCnt / numNeg; # grows as the cut-off grows
}