blob: 161b4badf464079d0868d38bcce60070a87ee7b5 [file] [log] [blame]
#!/usr/bin/perl -w
# hacked version of hit-frequencies - Allen
# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>
use Getopt::Std;
getopts("l:L:h");
our ($opt_h, $opt_l, $opt_L);
sub usage {
die "find-extremes [-l LC] [-L LC] [spam log] [nonspam log]
-l LC also print language specific rules for lang code LC (or 'all')
-L LC only print language specific rules for lang code LC (or 'all')
options -l and -L are mutually exclusive.
if either the spam or and nonspam logs are unspecified, the defaults
are \"spam.log\" and \"nonspam.log\" in the cwd.
";
}
usage() if($opt_h || ($opt_l && $opt_L));
$lower = 1;
#$threshold = 5;
$higher = 9;
$min_expected = 2; # Should not be set to more than 5 or less than 2
my %freq_spam = (); # how often non-nice found in spam
my %freq_over_higher_falsepos = (); # how often non-nice found in ones over
# higher threshold that are false positives
my %freq_nonspam = (); # how often nice found in nonspam
my %freq_under_lower_falseneg = (); # how often nice found in ones under
# lower threshold that are false negatives
my %over_expected_falsepos = (); # how much over/under non-nice are than
# their expected freq_over_higher_falsepos
my %over_expected_falseneg = (); # how much over/under nice are than
# their expected freq_under_lower_falseneg
my %ratio_expected_falsepos = (); # ratio version of above
my %ratio_expected_falseneg = (); # ditto
my $num_spam = 0;
my $num_nonspam = 0;
my $num_over_higher_falsepos = 0;
my $num_under_lower_falseneg = 0;
my $ok_lang = '';
readscores();
$ok_lang = lc ($opt_l || $opt_L || '');
if ($ok_lang eq 'all') { $ok_lang = '.'; }
foreach my $key (keys %rules) {
next if ($key eq '_scoreset');
if ( ($opt_L && !$rules{$key}->{lang}) ||
($rules{$key}->{lang} &&
(!$ok_lang || $rules{$key}->{lang} !~ /^$ok_lang/i)
) ) {
delete $rules{$key} ; next;
}
if ($rules{$key}->{tflags} =~ m/net/) {
delete $rules{$key};
next;
}
if ($rules{$key}->{tflags} !~ m/userconf/) {
if ($rules{$key}->{tflags} =~ m/nice/) {
$freq_nonspam{$key} = 0;
$freq_under_lower_falseneg{$key} = 0;
} else {
$freq_spam{$key} = 0;
$freq_over_higher_falsepos{$key} = 0;
}
}
}
readlogs();
unless (($num_over_higher_falsepos >= $min_expected)
&& ($num_under_lower_falseneg >= $min_expected)) {
die "Insufficient extremes in dataset (" . $num_over_higher_falsepos .
" " . $num_under_lower_falseneg . "); stopped";
}
# http://bmj.com/collections/statsbk/8.shtml
sub chisquare {
my($a,$b,$c,$d) = @_;
my $chisquare =
((($a*$d - $b*$c)**2)*($a + $b + $c + $d))/
(($a + $d)*($c + $d)*($b + $d)*($a + $c));
if ($chisquare >= 6.64) {
return $chisquare,0.01;
} elsif ($chisquare >= 3.841) {
return $chisquare,0.05;
} else {
return $chisquare,.5;
}
}
my $ratio_falsepos = $num_over_higher_falsepos/$num_spam;
my $ratio_falseneg = $num_under_lower_falseneg/$num_nonspam;
my $skipped_non_nice = 0;
foreach $rule (keys %freq_spam) {
my $expected = $freq_spam{$rule}*$ratio_falsepos;
if ($expected <= $min_expected) {
$skipped_non_nice++;
next;
}
$over_expected_falsepos{$rule} =
$freq_over_higher_falsepos{$rule} - $expected;
$ratio_expected_falsepos{$rule} =
$freq_over_higher_falsepos{$rule}/$expected;
($chisquare{$rule},$prob{$rule}) =
chisquare($num_spam,$num_over_higher_falsepos,
$freq_spam{$rule},$freq_over_higher_falsepos{$rule});
if ($freq_over_higher_falsepos{$rule} < $expected) {
$chisquare{$rule} *= -1;
}
}
warn "Skipped non-nice: $skipped_non_nice\n";
my $skipped_nice = 0;
foreach $rule (keys %freq_nonspam) {
my $expected = $freq_nonspam{$rule}*$ratio_falseneg;
if ($expected <= $min_expected) {
$skipped_nice++;
next;
}
$over_expected_falseneg{$rule} =
$freq_under_lower_falseneg{$rule} - $expected;
$ratio_expected_falseneg{$rule} =
$freq_under_lower_falseneg{$rule}/$expected;
($chisquare{$rule},$prob{$rule}) =
chisquare($num_nonspam,$num_under_lower_falseneg,
$freq_nonspam{$rule},$freq_under_lower_falseneg{$rule});
if ($freq_under_lower_falseneg{$rule} < $expected) {
$chisquare{$rule} *= -1;
}
}
warn "Skipped nice: $skipped_nice\n";
@rules_falsepos = grep {$prob{$_} < .5} (keys %over_expected_falsepos);
if (scalar(@rules_falsepos)) {
print "RULE\t\tCHISQUARE\tRATIO_FALSEPOS\tOVER_FALSEPOS\tFREQ_OVER ($num_over_higher_falsepos)\n";
my(@rules_falsepos_bad) =
grep {$chisquare{$_} > 0} (@rules_falsepos);
if (scalar(@rules_falsepos_bad)) {
@rules_falsepos_bad =
sort {
($chisquare{$b} <=> $chisquare{$a}) ||
($ratio_expected_falsepos{$b} <=>
$ratio_expected_falsepos{$a}) ||
($over_expected_falsepos{$b} <=>
$over_expected_falsepos{$a}) ||
($freq_over_higher_falsepos{$b} <=>
$freq_over_higher_falsepos{$a})} (@rules_falsepos_bad);
foreach $rule (@rules_falsepos_bad) {
print $rule . "\t" . $prob{$rule} . "\t" .
$ratio_expected_falsepos{$rule} . "\t" .
$over_expected_falsepos{$rule} . "\t" .
$freq_over_higher_falsepos{$rule} . "\n";
}
}
my(@rules_falsepos_good) =
grep {$chisquare{$_} < 0} (@rules_falsepos);
if (scalar(@rules_falsepos_good)) {
print "###\n";
@rules_falsepos_good =
sort {
($chisquare{$a} <=> $chisquare{$b}) ||
($ratio_expected_falsepos{$a} <=>
$ratio_expected_falsepos{$b}) ||
($freq_spam{$b} <=>
$freq_spam{$a})} (@rules_falsepos_good);
foreach $rule (@rules_falsepos_good) {
print $rule . "\t" . $prob{$rule} . "\t" .
$ratio_expected_falsepos{$rule} . "\t" .
$over_expected_falsepos{$rule} . "\t" .
$freq_over_higher_falsepos{$rule} . "\n";
}
}
} else {
warn "No over-falsepos to print\n";
}
@rules_falseneg = grep {$prob{$_} < .5} (keys %over_expected_falseneg);
if (scalar(@rules_falseneg)) {
print "RULE\t\tCHISQUARE\tRATIO_FALSENEG\tOVER_FALSENEG\tFREQ_UNDER ($num_under_lower_falseneg)\n";
my(@rules_falseneg_bad) =
grep {$chisquare{$_} > 0} (@rules_falseneg);
if (scalar(@rules_falseneg_bad)) {
@rules_falseneg_bad =
sort {
($chisquare{$b} <=> $chisquare{$a}) ||
($ratio_expected_falseneg{$b} <=>
$ratio_expected_falseneg{$a}) ||
($over_expected_falseneg{$b} <=>
$over_expected_falseneg{$a}) ||
($freq_under_lower_falseneg{$b} <=>
$freq_under_lower_falseneg{$a})} (@rules_falseneg_bad);
foreach $rule (@rules_falseneg_bad) {
print $rule . "\t" . $prob{$rule} . "\t" .
$ratio_expected_falseneg{$rule} . "\t" .
$over_expected_falseneg{$rule} . "\t" .
$freq_under_lower_falseneg{$rule} . "\n";
}
}
my(@rules_falseneg_good) =
grep {$chisquare{$_} < 0} (@rules_falseneg);
if (scalar(@rules_falseneg_good)) {
print "###\n";
@rules_falseneg_good =
sort {
($chisquare{$a} <=> $chisquare{$b}) ||
($ratio_expected_falseneg{$a} <=>
$ratio_expected_falseneg{$b}) ||
($freq_spam{$b} <=>
$freq_spam{$a})} (@rules_falseneg_good);
foreach $rule (@rules_falseneg_good) {
print $rule . "\t" . $prob{$rule} . "\t" .
$ratio_expected_falseneg{$rule} . "\t" .
$over_expected_falseneg{$rule} . "\t" .
$freq_under_lower_falseneg{$rule} . "\n";
}
}
} else {
warn "No over-falseneg to print\n";
}
exit;
sub readlogs {
my $spam = $ARGV[0] || "spam.log";
my $nonspam = $ARGV[1] || (-f "good.log" ? "good.log" : "nonspam.log");
(open(NONSPAM,$nonspam)) ||
(die "Couldn't open file '$nonspam': $!; stopped");
while (defined($line = <NONSPAM>)) {
if ($line =~ m/^\s*\#/) {
next;
} elsif ($line =~ m/^.\s+-?\d+\s+\S+\s*(\S*)/) {
my $tests = $1;
my $hits = 0;
my(@tests) = ();
foreach $test (grep {length($_)} (split(/,+/,$tests))) {
if (exists($rules{$test})) {
push @tests, $test;
$hits += $rules{$test}->{score};
}
}
if (scalar(@tests)) {
$num_nonspam++;
foreach $test (grep {exists($freq_nonspam{$_})} (@tests)) {
$freq_nonspam{$test}++;
}
if ($hits >= $higher) {
$num_over_higher_falsepos++;
foreach $test (grep
{exists($freq_over_higher_falsepos{$_})} (@tests)) {
$freq_over_higher_falsepos{$test}++;
}
}
}
} elsif ($line =~ m/\S/) {
chomp($line);
warn "Can't interpret line '$line'; skipping";
}
}
close(NONSPAM);
(open(SPAM,$spam)) || (die "Couldn't open file '$spam': $!; stopped");
while (defined($line = <SPAM>)) {
if ($line =~ m/^\s*\#/) {
next;
} elsif ($line =~ m/^.\s+-?\d+\s+\S+\s*(\S*)/) {
my $tests = $1;
my $hits = 0;
my $plus_hits = 0;
my(@tests) = ();
foreach $test (grep {length($_)} (split(/,+/,$tests))) {
if (exists($rules{$test})) {
push @tests, $test;
$hits += $rules{$test}->{score};
if ($rules{$test}->{score} > 0) {
$plus_hits += $rules{$test}->{score};
}
}
}
if (scalar(@tests)) {
$num_spam++;
foreach $test (grep {exists($freq_spam{$_})} (@tests)) {
$freq_spam{$test}++;
}
if (($hits <= $lower) && $plus_hits &&
($plus_hits >= $lower)) {
$num_under_lower_falseneg++;
foreach $test (grep
{exists($freq_under_lower_falseneg{$_})} (@tests)) {
$freq_under_lower_falseneg{$test}++;
}
}
}
} elsif ($line =~ m/\S/) {
chomp($line);
warn "Can't interpret line '$line'; skipping";
}
}
close(SPAM);
}
sub readscores {
system ("../build/parse-rules-for-masses") and
die "Couldn't do parse-rules-for-masses: $?; stopped";
require "./tmp/rules.pl";
}