#!/usr/bin/perl
#
# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>
use strict;
use warnings;
use Getopt::Long;

# Default lambda value for the TCR equation, representing the cost of an FP
# vs. the cost of a FN. Some example values are: 1 = tagged only, 9 = mailed
# back to sender asking for token, 999 = blocking or deleting a message.
#
# We roughly aim for a value representing "moved to infrequently-read folder".
use constant DEFAULT_LAMBDA => 50;

# ratio($numerator, $denominator)
#
# Returns $numerator / $denominator, or 0 when the denominator is zero, so
# that an empty class (e.g. no ham at all) reports 0% instead of aborting
# the whole run with "Illegal division by zero".
sub ratio {
    my ($numerator, $denominator) = @_;
    return $denominator ? $numerator / $denominator : 0;
}

# compute_metrics($lambda, $n_yy, $n_yn, $n_ny, $n_nn)
#
# Pure calculation, no I/O.  The counts use the naming found in the
# published TCR literature (first letter = true class, second = verdict):
#   $n_yy  spam classified as spam      (true positives)
#   $n_yn  spam classified as ham       (false negatives)
#   $n_ny  ham classified as spam       (false positives)
#   $n_nn  ham classified as ham        (true negatives)
#
# Returns a hash reference.  'n_ham' and 'n_spam' are the class totals,
# 'tcr' is the total cost ratio, and every other value is a fraction in
# the range 0..1 (multiply by 100 for a percentage).
sub compute_metrics {
    my ($lambda, $n_yy, $n_yn, $n_ny, $n_nn) = @_;

    my $n_ham  = $n_nn + $n_ny;
    my $n_spam = $n_yy + $n_yn;

    # Weighted error rate of the filter, and of the "no filter" baseline
    # in which every spam gets through and no ham is lost.
    my $weighted_total = $lambda * $n_ham + $n_spam;
    my $werr           = ratio($lambda * $n_ny + $n_yn, $weighted_total);
    my $werr_base      = ratio($n_spam, $weighted_total);
    $werr ||= 0.000001;    # avoid / by 0

    return {
        n_ham        => $n_ham,
        n_spam       => $n_spam,
        tcr          => $werr_base / $werr,
        ham_ok_rate  => ratio($n_nn, $n_ham),
        spam_ok_rate => ratio($n_yy, $n_spam),
        fp_rate      => ratio($n_ny, $n_ham),
        fn_rate      => ratio($n_yn, $n_spam),
        recall       => ratio($n_yy, $n_spam),
        precision    => ratio($n_yy, $n_yy + $n_ny),

        # These are built from the correctly-classified counts (TP/TN),
        # not from the class totals: the totals already contain the
        # errors, so using them would count each error twice.
        ppv          => ratio($n_yy, $n_yy + $n_ny),
        npv          => ratio($n_nn, $n_nn + $n_yn),
        sensitivity  => ratio($n_yy, $n_yy + $n_yn),
        specificity  => ratio($n_nn, $n_nn + $n_ny),
        efficiency   => ratio($n_yy + $n_nn,
                              $n_yy + $n_nn + $n_ny + $n_yn),
    };
}

# main()
#
# Command-line entry point: parses the options from @ARGV, computes the
# metrics and prints the report to STDOUT.  Dies with the usage message
# when option parsing fails or when neither -spam nor -ham is given.
sub main {
    my $usage =
        "usage: fn-fp-to-tcr [-lambda l] -fn n -fp n -spam n -ham n\n\n";

    my %opt;
    GetOptions(\%opt, "lambda=f", "fn=f", "fp=f", "spam=f", "ham=f")
        or die $usage;

    if (!$opt{spam} && !$opt{ham}) {
        die $usage;
    }

    # A missing (or zero) -lambda falls back to the default.
    my $lambda = $opt{lambda} || DEFAULT_LAMBDA;

    # convert to the TCR metrics used in the published lit; counts that
    # were not given on the command line are treated as 0
    my $n_yy = $opt{spam} || 0;
    my $n_yn = $opt{fn}   || 0;
    my $n_ny = $opt{fp}   || 0;
    my $n_nn = $opt{ham}  || 0;

    my $m = compute_metrics($lambda, $n_yy, $n_yn, $n_ny, $n_nn);

    printf "# TCR(l=%d): %3.6f\n\n",
        $lambda, $m->{tcr};

    printf "# Correctly non-spam: %6d %6.3f%%\n",
        $n_nn, $m->{ham_ok_rate} * 100.0;
    printf "# Correctly spam: %6d %6.3f%%\n",
        $n_yy, $m->{spam_ok_rate} * 100.0;
    printf "# False positives: %6d %6.3f%%\n",
        $n_ny, $m->{fp_rate} * 100.0;
    printf "# False negatives: %6d %6.3f%%\n\n",
        $n_yn, $m->{fn_rate} * 100.0;

    printf "# SpamRecall: %6.3f%%\n",
        $m->{recall} * 100.0;
    printf "# SpamPrecision: %6.3f%%\n\n",
        $m->{precision} * 100.0;

    printf "# PPV: %6.3f%%\n", $m->{ppv} * 100;
    printf "# NPV: %6.3f%%\n", $m->{npv} * 100;
    printf "# Sensitivity: %6.3f%%\n", $m->{sensitivity} * 100;
    printf "# Specificity: %6.3f%%\n", $m->{specificity} * 100;
    printf "# Efficiency: %6.3f%%\n\n", $m->{efficiency} * 100;

    printf "# Batting Average: %.5f/%.5f\n",
        (1 - $m->{fn_rate}),
        $m->{fp_rate};

    return;
}

# Run the CLI only when executed as a script, so the calculation subs can
# be loaded (e.g. by a test) without parsing @ARGV or printing anything.
main() unless caller;