| #!/usr/bin/perl -w |
| |
| # hacked version of hit-frequencies - Allen |
| # <@LICENSE> |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to you under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at: |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # </@LICENSE> |
| |
| use Getopt::Std; |
| getopts("l:L:h"); |
| |
| our ($opt_h, $opt_l, $opt_L); |
| |
| sub usage { |
| die "find-extremes [-l LC] [-L LC] [spam log] [nonspam log] |
| |
| -l LC also print language specific rules for lang code LC (or 'all') |
| -L LC only print language specific rules for lang code LC (or 'all') |
| |
| options -l and -L are mutually exclusive. |
| |
| if either the spam or and nonspam logs are unspecified, the defaults |
| are \"spam.log\" and \"nonspam.log\" in the cwd. |
| |
| "; |
| } |
| |
| usage() if($opt_h || ($opt_l && $opt_L)); |
| |
| $lower = 1; |
| #$threshold = 5; |
| $higher = 9; |
| $min_expected = 2; # Should not be set to more than 5 or less than 2 |
| |
| my %freq_spam = (); # how often non-nice found in spam |
| my %freq_over_higher_falsepos = (); # how often non-nice found in ones over |
| # higher threshold that are false positives |
| my %freq_nonspam = (); # how often nice found in nonspam |
| my %freq_under_lower_falseneg = (); # how often nice found in ones under |
| # lower threshold that are false negatives |
| |
| my %over_expected_falsepos = (); # how much over/under non-nice are than |
| # their expected freq_over_higher_falsepos |
| my %over_expected_falseneg = (); # how much over/under nice are than |
| # their expected freq_under_lower_falseneg |
| my %ratio_expected_falsepos = (); # ratio version of above |
| my %ratio_expected_falseneg = (); # ditto |
| |
| my $num_spam = 0; |
| my $num_nonspam = 0; |
| my $num_over_higher_falsepos = 0; |
| my $num_under_lower_falseneg = 0; |
| my $ok_lang = ''; |
| |
| readscores(); |
| |
| $ok_lang = lc ($opt_l || $opt_L || ''); |
| if ($ok_lang eq 'all') { $ok_lang = '.'; } |
| |
| foreach my $key (keys %rules) { |
| next if ($key eq '_scoreset'); |
| |
| if ( ($opt_L && !$rules{$key}->{lang}) || |
| ($rules{$key}->{lang} && |
| (!$ok_lang || $rules{$key}->{lang} !~ /^$ok_lang/i) |
| ) ) { |
| delete $rules{$key} ; next; |
| } |
| |
| if ($rules{$key}->{tflags} =~ m/net/) { |
| delete $rules{$key}; |
| next; |
| } |
| if ($rules{$key}->{tflags} !~ m/userconf/) { |
| if ($rules{$key}->{tflags} =~ m/nice/) { |
| $freq_nonspam{$key} = 0; |
| $freq_under_lower_falseneg{$key} = 0; |
| } else { |
| $freq_spam{$key} = 0; |
| $freq_over_higher_falsepos{$key} = 0; |
| } |
| } |
| } |
| |
| readlogs(); |
| |
| unless (($num_over_higher_falsepos >= $min_expected) |
| && ($num_under_lower_falseneg >= $min_expected)) { |
| die "Insufficient extremes in dataset (" . $num_over_higher_falsepos . |
| " " . $num_under_lower_falseneg . "); stopped"; |
| } |
| |
| # http://bmj.com/collections/statsbk/8.shtml |
| |
| sub chisquare { |
| my($a,$b,$c,$d) = @_; |
| my $chisquare = |
| ((($a*$d - $b*$c)**2)*($a + $b + $c + $d))/ |
| (($a + $d)*($c + $d)*($b + $d)*($a + $c)); |
| if ($chisquare >= 6.64) { |
| return $chisquare,0.01; |
| } elsif ($chisquare >= 3.841) { |
| return $chisquare,0.05; |
| } else { |
| return $chisquare,.5; |
| } |
| } |
| |
| my $ratio_falsepos = $num_over_higher_falsepos/$num_spam; |
| my $ratio_falseneg = $num_under_lower_falseneg/$num_nonspam; |
| |
| my $skipped_non_nice = 0; |
| |
| foreach $rule (keys %freq_spam) { |
| my $expected = $freq_spam{$rule}*$ratio_falsepos; |
| if ($expected <= $min_expected) { |
| $skipped_non_nice++; |
| next; |
| } |
| |
| $over_expected_falsepos{$rule} = |
| $freq_over_higher_falsepos{$rule} - $expected; |
| $ratio_expected_falsepos{$rule} = |
| $freq_over_higher_falsepos{$rule}/$expected; |
| ($chisquare{$rule},$prob{$rule}) = |
| chisquare($num_spam,$num_over_higher_falsepos, |
| $freq_spam{$rule},$freq_over_higher_falsepos{$rule}); |
| if ($freq_over_higher_falsepos{$rule} < $expected) { |
| $chisquare{$rule} *= -1; |
| } |
| } |
| |
| warn "Skipped non-nice: $skipped_non_nice\n"; |
| |
| my $skipped_nice = 0; |
| |
| foreach $rule (keys %freq_nonspam) { |
| my $expected = $freq_nonspam{$rule}*$ratio_falseneg; |
| if ($expected <= $min_expected) { |
| $skipped_nice++; |
| next; |
| } |
| |
| $over_expected_falseneg{$rule} = |
| $freq_under_lower_falseneg{$rule} - $expected; |
| $ratio_expected_falseneg{$rule} = |
| $freq_under_lower_falseneg{$rule}/$expected; |
| ($chisquare{$rule},$prob{$rule}) = |
| chisquare($num_nonspam,$num_under_lower_falseneg, |
| $freq_nonspam{$rule},$freq_under_lower_falseneg{$rule}); |
| if ($freq_under_lower_falseneg{$rule} < $expected) { |
| $chisquare{$rule} *= -1; |
| } |
| } |
| |
| warn "Skipped nice: $skipped_nice\n"; |
| |
| @rules_falsepos = grep {$prob{$_} < .5} (keys %over_expected_falsepos); |
| |
| if (scalar(@rules_falsepos)) { |
| print "RULE\t\tCHISQUARE\tRATIO_FALSEPOS\tOVER_FALSEPOS\tFREQ_OVER ($num_over_higher_falsepos)\n"; |
| my(@rules_falsepos_bad) = |
| grep {$chisquare{$_} > 0} (@rules_falsepos); |
| if (scalar(@rules_falsepos_bad)) { |
| @rules_falsepos_bad = |
| sort { |
| ($chisquare{$b} <=> $chisquare{$a}) || |
| ($ratio_expected_falsepos{$b} <=> |
| $ratio_expected_falsepos{$a}) || |
| ($over_expected_falsepos{$b} <=> |
| $over_expected_falsepos{$a}) || |
| ($freq_over_higher_falsepos{$b} <=> |
| $freq_over_higher_falsepos{$a})} (@rules_falsepos_bad); |
| foreach $rule (@rules_falsepos_bad) { |
| print $rule . "\t" . $prob{$rule} . "\t" . |
| $ratio_expected_falsepos{$rule} . "\t" . |
| $over_expected_falsepos{$rule} . "\t" . |
| $freq_over_higher_falsepos{$rule} . "\n"; |
| } |
| } |
| my(@rules_falsepos_good) = |
| grep {$chisquare{$_} < 0} (@rules_falsepos); |
| if (scalar(@rules_falsepos_good)) { |
| print "###\n"; |
| @rules_falsepos_good = |
| sort { |
| ($chisquare{$a} <=> $chisquare{$b}) || |
| ($ratio_expected_falsepos{$a} <=> |
| $ratio_expected_falsepos{$b}) || |
| ($freq_spam{$b} <=> |
| $freq_spam{$a})} (@rules_falsepos_good); |
| foreach $rule (@rules_falsepos_good) { |
| print $rule . "\t" . $prob{$rule} . "\t" . |
| $ratio_expected_falsepos{$rule} . "\t" . |
| $over_expected_falsepos{$rule} . "\t" . |
| $freq_over_higher_falsepos{$rule} . "\n"; |
| } |
| } |
| } else { |
| warn "No over-falsepos to print\n"; |
| } |
| |
| @rules_falseneg = grep {$prob{$_} < .5} (keys %over_expected_falseneg); |
| |
| if (scalar(@rules_falseneg)) { |
| print "RULE\t\tCHISQUARE\tRATIO_FALSENEG\tOVER_FALSENEG\tFREQ_UNDER ($num_under_lower_falseneg)\n"; |
| my(@rules_falseneg_bad) = |
| grep {$chisquare{$_} > 0} (@rules_falseneg); |
| if (scalar(@rules_falseneg_bad)) { |
| @rules_falseneg_bad = |
| sort { |
| ($chisquare{$b} <=> $chisquare{$a}) || |
| ($ratio_expected_falseneg{$b} <=> |
| $ratio_expected_falseneg{$a}) || |
| ($over_expected_falseneg{$b} <=> |
| $over_expected_falseneg{$a}) || |
| ($freq_under_lower_falseneg{$b} <=> |
| $freq_under_lower_falseneg{$a})} (@rules_falseneg_bad); |
| foreach $rule (@rules_falseneg_bad) { |
| print $rule . "\t" . $prob{$rule} . "\t" . |
| $ratio_expected_falseneg{$rule} . "\t" . |
| $over_expected_falseneg{$rule} . "\t" . |
| $freq_under_lower_falseneg{$rule} . "\n"; |
| } |
| } |
| my(@rules_falseneg_good) = |
| grep {$chisquare{$_} < 0} (@rules_falseneg); |
| if (scalar(@rules_falseneg_good)) { |
| print "###\n"; |
| @rules_falseneg_good = |
| sort { |
| ($chisquare{$a} <=> $chisquare{$b}) || |
| ($ratio_expected_falseneg{$a} <=> |
| $ratio_expected_falseneg{$b}) || |
| ($freq_spam{$b} <=> |
| $freq_spam{$a})} (@rules_falseneg_good); |
| foreach $rule (@rules_falseneg_good) { |
| print $rule . "\t" . $prob{$rule} . "\t" . |
| $ratio_expected_falseneg{$rule} . "\t" . |
| $over_expected_falseneg{$rule} . "\t" . |
| $freq_under_lower_falseneg{$rule} . "\n"; |
| } |
| } |
| } else { |
| warn "No over-falseneg to print\n"; |
| } |
| |
| exit; |
| |
| sub readlogs { |
| my $spam = $ARGV[0] || "spam.log"; |
| my $nonspam = $ARGV[1] || (-f "good.log" ? "good.log" : "nonspam.log"); |
| |
| |
| (open(NONSPAM,$nonspam)) || |
| (die "Couldn't open file '$nonspam': $!; stopped"); |
| |
| while (defined($line = <NONSPAM>)) { |
| if ($line =~ m/^\s*\#/) { |
| next; |
| } elsif ($line =~ m/^.\s+-?\d+\s+\S+\s*(\S*)/) { |
| my $tests = $1; |
| my $hits = 0; |
| my(@tests) = (); |
| foreach $test (grep {length($_)} (split(/,+/,$tests))) { |
| if (exists($rules{$test})) { |
| push @tests, $test; |
| $hits += $rules{$test}->{score}; |
| } |
| } |
| |
| if (scalar(@tests)) { |
| $num_nonspam++; |
| foreach $test (grep {exists($freq_nonspam{$_})} (@tests)) { |
| $freq_nonspam{$test}++; |
| } |
| if ($hits >= $higher) { |
| $num_over_higher_falsepos++; |
| foreach $test (grep |
| {exists($freq_over_higher_falsepos{$_})} (@tests)) { |
| $freq_over_higher_falsepos{$test}++; |
| } |
| } |
| } |
| } elsif ($line =~ m/\S/) { |
| chomp($line); |
| warn "Can't interpret line '$line'; skipping"; |
| } |
| } |
| |
| close(NONSPAM); |
| |
| (open(SPAM,$spam)) || (die "Couldn't open file '$spam': $!; stopped"); |
| |
| while (defined($line = <SPAM>)) { |
| if ($line =~ m/^\s*\#/) { |
| next; |
| } elsif ($line =~ m/^.\s+-?\d+\s+\S+\s*(\S*)/) { |
| my $tests = $1; |
| my $hits = 0; |
| my $plus_hits = 0; |
| my(@tests) = (); |
| foreach $test (grep {length($_)} (split(/,+/,$tests))) { |
| if (exists($rules{$test})) { |
| push @tests, $test; |
| $hits += $rules{$test}->{score}; |
| if ($rules{$test}->{score} > 0) { |
| $plus_hits += $rules{$test}->{score}; |
| } |
| } |
| } |
| |
| if (scalar(@tests)) { |
| $num_spam++; |
| foreach $test (grep {exists($freq_spam{$_})} (@tests)) { |
| $freq_spam{$test}++; |
| } |
| if (($hits <= $lower) && $plus_hits && |
| ($plus_hits >= $lower)) { |
| $num_under_lower_falseneg++; |
| foreach $test (grep |
| {exists($freq_under_lower_falseneg{$_})} (@tests)) { |
| $freq_under_lower_falseneg{$test}++; |
| } |
| } |
| } |
| } elsif ($line =~ m/\S/) { |
| chomp($line); |
| warn "Can't interpret line '$line'; skipping"; |
| } |
| } |
| |
| close(SPAM); |
| } |
| |
| |
| sub readscores { |
| system ("../build/parse-rules-for-masses") and |
| die "Couldn't do parse-rules-for-masses: $?; stopped"; |
| require "./tmp/rules.pl"; |
| } |
| |