blob: 24c201c24a31e870f38382194c22e00d0c2ca0b8 [file] [log] [blame]
#!/usr/bin/perl -w
#
# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>

=head1 NAME

logs-to-c - Convert a mass-check log into perceptron format

=head1 SYNOPSIS

logs-to-c [options]

 Options:
    -c,--cffile=path    Use path as the rules directory
    -s,--scoreset=n     Use scoreset n
    --spam=file         Location of spam mass-check log
    --ham=file          Location of ham mass-check log

=head1 DESCRIPTION

B<logs-to-c> will read the mass-check logs F<spam.log> and F<ham.log>,
or the files specified by the B<--spam> and B<--ham> options, and convert
them into the format needed by the perceptron. This is a format that is
simple for the perceptron to parse, but is not very readable to
humans.

=head1 BUGS

Please report bugs to http://bugzilla.spamassassin.org/

=head1 SEE ALSO

L<mass-check(1)>, L<perceptron(1)>

=cut
use Getopt::Long qw(:config auto_help bundling);
use strict;
# Option defaults.  These are package variables (`our`) on purpose:
# GetOptions() below is called without explicit destinations, in which
# case Getopt::Long stores each value in the caller's $opt_<name>.
our $opt_cffile = "../rules";
our $opt_spam = 'spam.log';
our $opt_ham = 'ham.log';
our $opt_scoreset = 0;

# -c and -s are promised by the SYNOPSIS, so declare them as aliases:
# with "bundling" enabled a single-dash option is looked up as a
# one-letter option name, and no such names were declared before.
# A command line that fails to parse is now fatal instead of silently
# running with whatever defaults are left in place.
GetOptions("cffile|c=s", "spam=s", "ham=s", "scoreset|s=i")
  or die "logs-to-c: invalid command line options (try --help)\n";

my $is_spam = '';          # vec aligned with @tests_hit: 1 = spam, 0 = ham
my @tests_hit = ();        # one packed ("w*") list of rule ids per log line
my %mutable_tests = ();    # rule name => true if its score may be tuned

# %allrules is copied from %rules by readscores(); %rules and %scores are
# presumably populated by the rules file that readscores() require()s.
our (%rules, %allrules, %scores);

my (%ignored_rule, %range_lo, %range_hi);   # filled in by read_ranges()
my %rule_to_index;                          # filled in by writescores_c()

readscores();

print "Reading per-message hit stat logs and scores...\n";
my ($num_tests, $num_spam, $num_ham);       # set by readlogs()
read_ranges();
readlogs();

print "Writing logs and current scores as C code...\n";
writescores_c();

# show memory usage before we exit
# print "Running \"ps aux\"...\n";
# open(PS, "ps aux|");
# while(<PS>) {
# print if $. == 1 || /\b$$\b/;
# }
# close(PS);

exit 0;
# Compact encoding of per-message test lists.
#
# Every rule name is mapped to a small positive integer so that a list of
# rule names can be stored as one pack("w*") string.  This is slower than
# keeping the names around, but uses far less memory than arrayrefs or
# comma-separated strings.
#
# NOTE: these declarations come after the script's "exit 0", so the "= 1"
# initialiser is never executed when the script runs normally; the counter
# starts out undefined and the first id handed out is therefore 1.
my $short_index = 1;
my %long_to_short;      # rule name => id
my @short_to_long;      # id => rule name

# new_short($name): allocate the next id for $name, record the mapping in
# both directions and return the id.
sub new_short {
    my ($long_name) = @_;

    my $id = ++$short_index;
    $long_to_short{$long_name} = $id;
    $short_to_long[$id] = $long_name;

    return $id;
}
# freeze_tests(\@names): encode a list of rule names as a single packed
# string of their ids (BER compressed integers, pack template "w*").
# Names seen for the first time get an id from new_short().
#
# This takes less than half the memory of join(',', ...) and compares even
# better against Storable::freeze.
sub freeze_tests {
    my @ids;
    foreach my $name (@{$_[0]}) {
        push @ids, ($long_to_short{$name} || new_short($name));
    }
    return pack("w*", @ids);
}
# thaw_tests($frozen): decode a string produced by pack("w*", ...) back
# into the list of rule names, in their original order.
sub thaw_tests {
    my ($frozen) = @_;

    my @names;
    foreach my $id (unpack("w*", $frozen)) {
        push @names, $short_to_long[$id];
    }
    return @names;
}
# readlogs(): load the spam log ($opt_spam) and then the ham log ($opt_ham).
#
# For every accepted log line this appends one entry to @tests_hit (the
# packed ids of the rules that hit) and one bit to $is_spam, and it sets
# $num_spam, $num_ham and $num_tests.  Rules without a score in %scores and
# sub-rules are dropped from the hit list.  Dies if a log cannot be opened.
sub readlogs {
    my $count = 0;
    $num_spam = $num_ham = 0;

    # Pair each log with its class explicitly instead of comparing file
    # names, so the ham log is still counted as ham when both options
    # happen to name the same file.
    foreach my $source ([$opt_spam, 1], [$opt_ham, 0]) {
        my ($file, $isspam) = @$source;

        # three-argument open: the user-supplied name is never parsed for
        # a mode prefix or a pipe character
        open(my $in, '<', $file)
          or die "Could not open file '$file': $!";

        while (defined(my $msgline = <$in>)) {
            # faster log-reading code from hit-frequencies.
            # the additional split() is for this case:
            # ". -20 /path time=1112116980,scantime=0,format=f,reuse=no"
            # in other words, no hits. split(' ') cannot deal with this
            # correctly, seeing (".", "-20", "/path", "time=...etc"). Work
            # around this by using a literal / / regexp split to discard
            # the csv stuff we don't want out of the rest of the line.
            my ($caught, undef, $restofline) = split(' ', $msgline, 3);

            # $caught is undefined for blank lines; skip those (and any
            # other line that is not a "Y"/"." result) without warnings
            next unless (defined $caught && $caught =~ /^[Y\.]$/ && $restofline);

            my (undef, $rules) = split(/ /, $restofline, 3);
            $rules = '' unless defined $rules;    # line ended after the path

            # get tests, but ignore unknown tests and subrules
            my @tests;
            foreach my $r (split(/,/, $rules)) {
                my $hits = 1;
                # Support compacted RULE(hitcount) format
                if ($r =~ s/\((\d+)\)$//) {
                    $hits = $1;
                }
                next unless (defined $scores{$r} && !$allrules{$r}->{issubrule});
                push @tests, $r for (1 .. $hits);
            }

            if ($isspam) {
                $num_spam++;
                vec($is_spam, $count, 1) = 1;
            }
            else {
                $num_ham++;
                vec($is_spam, $count, 1) = 0;
            }

            # inlined for speed.
            # ORIGINAL: $tests_hit[$count] = freeze_tests(\@tests);
            $tests_hit[$count] = pack("w*", map
                {
                    $long_to_short{$_} || new_short($_);
                } @tests);
            # TODO: benchmark using foreach(), map() is often slower

            $count++;    # increment line
        }
        close $in;
    }
    $num_tests = $count;
}
# readscores(): run ../build/parse-rules-for-masses over the rules
# directory ($opt_cffile) for score set $opt_scoreset, load the Perl file it
# writes, and copy the resulting %rules into %allrules.
#
# Dies with a diagnostic if the helper cannot be run, exits non-zero, or
# writes a file that cannot be loaded.  The temporary file is removed in
# either case.
sub readscores {
    print "Reading scores from \"$opt_cffile\"...\n";
    my $tmpf = "./tmp/rules$$.pl";

    # List form: the arguments reach the helper verbatim, so a rules path
    # containing quotes, spaces or other shell metacharacters is safe.
    my @cmd = ("../build/parse-rules-for-masses",
               "-d", $opt_cffile,
               "-s", $opt_scoreset,
               "-o", $tmpf);
    if (system(@cmd) != 0) {
        die(($? == -1)
            ? "could not run $cmd[0]: $!\n"
            : "$cmd[0] failed (wait status $?)\n");
    }

    # Load the generated file, but always clean it up, even when loading
    # fails.
    my $loaded = eval { require $tmpf; 1 };
    my $err = $@;
    unlink $tmpf;
    die "could not load parsed rules from $tmpf: $err" unless $loaded;

    %allrules = %rules;    # ensure it stays global
}
# writescores_c(): assign a C array index to every rule in %scores and write
#   tmp/scores.data - one record group per non-ignored rule:
#                     ".<index>", "n<name>", "b<score>", "m<mutable>",
#                     "l<range lo>", "h<range hi>"
#   tmp/scores.h    - C declarations sized for those rules
# then call writetests_c() with the largest number of mutable hits on any
# one message plus one.
#
# Fills in %rule_to_index and may adjust %scores, %range_lo and %range_hi.
# Dies if either output file cannot be written.
sub writescores_c {
    my $output = '';
    my $size = 0;
    my $mutable = 0;

    # jm: now, score-ranges-from-freqs has tflags to work from, so
    # it will always list all mutable tests.
    #
    # Ignored rules sort last, so the rules that are actually written out
    # get the contiguous indexes 0 .. $size-1; mutable rules come first.
    my @index_to_rule = sort {
        ($ignored_rule{$a} <=> $ignored_rule{$b}) ||
        ($mutable_tests{$b} <=> $mutable_tests{$a}) ||
        ($a cmp $b)
    } (keys %scores);

    my $max_hits_per_msg = 0;
    for (my $file = 0; $file < $num_tests; $file++) {
        my @hits =
          grep { (! $ignored_rule{$_}) && $mutable_tests{$_} }
               (thaw_tests($tests_hit[$file]));
        if ((scalar(@hits) + 1) > $max_hits_per_msg) {
            $max_hits_per_msg = scalar(@hits) + 1;
        }
    }

    for my $i (0 .. $#index_to_rule) {
        my $name = $index_to_rule[$i];
        $rule_to_index{$name} = $i;

        if ($ignored_rule{$name}) { next; }

        if ($mutable_tests{$name} == 0) {
            # immutable: pin the range to the current score
            $range_lo{$name} = $range_hi{$name} = $scores{$name};
        } else {
            $mutable++;
            if ($range_lo{$name} > $range_hi{$name}) {
                ($range_lo{$name}, $range_hi{$name}) =
                  ($range_hi{$name}, $range_lo{$name});
            }
            #$range_lo{$name} ||= 0.1;
            #$range_hi{$name} ||= 1.5;

            # no default score found? set it to max and let GA adjust downwards. this
            # seems to help avoid a load of really good rules getting 1.0 scores
            if ($allrules{$name}->{no_score_found}) {
                $scores{$name} = ($range_hi{$name} + $range_lo{$name}) / 2.0;
            }
        }

        $output .= "." . $i . "\n" .
                   "n" . $name . "\n" .
                   "b" . $scores{$name} . "\n" .
                   "m" . $mutable_tests{$name} . "\n" .
                   "l" . $range_lo{$name} . "\n" .
                   "h" . $range_hi{$name} . "\n";
        $size++;
    }

    # Write errors can surface at open or, because output is buffered,
    # only at close; check both so a missing or full tmp/ is reported.
    open(my $dat, '>', "tmp/scores.data")
      or die "Could not write 'tmp/scores.data': $!";
    print {$dat} "N$size\n", "M$mutable\n",    # informational only
                 $output;
    close($dat)
      or die "Could not finish writing 'tmp/scores.data': $!";

    open(my $hdr, '>', "tmp/scores.h")
      or die "Could not write 'tmp/scores.h': $!";
    print {$hdr} "
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
int num_scores = $size;
int num_mutable = $mutable;
unsigned char is_mutable[$size];
double range_lo[$size];
double range_hi[$size];
double bestscores[$size];
char *score_names[$size];
double tmp_scores[$size][2];
unsigned char ny_hit[$mutable];
unsigned char yn_hit[$mutable];
double lookup[$mutable];
/* readscores() is defined in tests.h */
";
    close($hdr)
      or die "Could not finish writing 'tmp/scores.h': $!";

    writetests_c($max_hits_per_msg);    # make sure $rule_to_index is around
}
# writetests_c($max_hits_per_msg): collapse log lines that have the same
# class and the same set of indexed rule hits into one entry each, then write
#   tmp/tests.h    - C declarations sized for the collapsed entries, followed
#                    by the C source stored in this script's __DATA__ section
#   tmp/tests.data - per entry: ".<slot>", "n<number of mutable hits>",
#                    "s<is spam>", "t<rule index>" for each mutable hit,
#                    "b<summed score of immutable hits>", "c<line count>"
#
# Must run after writescores_c() has filled in %rule_to_index.  Dies if an
# output file cannot be written, or if an entry has $max_hits_per_msg or
# more mutable hits.
sub writetests_c {
    my $max_hits_per_msg = $_[0];

    my %uniq_files = ();    # first line of each distinct pattern => output slot
    my %count_keys = ();    # pattern key => number of lines with that pattern
    my %file_key = ();      # first line of each distinct pattern => pattern key

    for (my $file = 0; $file < $num_tests; $file++) {
        my $uniq_key = vec($is_spam, $file, 1) . " ";

        my @good_tests =
          grep { length($_) && (! $ignored_rule{$_}) &&
                 (defined($rule_to_index{$_})) }
               (thaw_tests($tests_hit[$file]));
        @good_tests = sort { $a <=> $b } (map { $rule_to_index{$_} } (@good_tests));
        $uniq_key .= join(" ", @good_tests);

        if (exists($count_keys{$uniq_key})) {
            $count_keys{$uniq_key}++;
        } else {
            $count_keys{$uniq_key} = 1;
            $file_key{$file} = $uniq_key;
            $uniq_files{$file} = scalar(keys(%count_keys)) - 1;
        }
    }

    my $num_nondup = scalar(keys(%uniq_files));

    # Write errors can surface at open or, because output is buffered,
    # only at close; check both.
    open(my $top, '>', "tmp/tests.h")
      or die "Could not write 'tmp/tests.h': $!";
    print {$top} "
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
int num_tests = $num_tests;
int num_nondup = $num_nondup;
int num_spam = $num_spam;
int num_ham = $num_ham;
int max_hits_per_msg = $max_hits_per_msg;
unsigned char num_tests_hit[$num_nondup];
unsigned char is_spam[$num_nondup];
unsigned short tests_hit[$num_nondup][$max_hits_per_msg];
double scores[$num_nondup];
double tmp_total[$num_nondup];
int tests_count[$num_nondup];
";
    # append the C loader functions; done without going through the
    # global $_ so the caller's $_ is left alone
    print {$top} join('', <DATA>);
    close($top)
      or die "Could not finish writing 'tmp/tests.h': $!";

    open(my $dat, '>', "tmp/tests.data")
      or die "Could not write 'tmp/tests.data': $!";

    foreach my $file (sort { $a <=> $b } (keys %uniq_files)) {
        print {$dat} "." . $uniq_files{$file} . "\n";

        my $out = '';
        $out .= "s" . vec($is_spam, $file, 1) . "\n";

        my $base_score = 0;
        my $num_tests_hit = 0;
        foreach my $test (thaw_tests($tests_hit[$file])) {
            if ($test eq '') { next; }

            if ($ignored_rule{$test}) {
                # this is not a log-worthy event anymore, since we have a lot
                # of T_ test rules that are ignored during perceptron runs
                # warn "ignored rule $test got a hit in $file!\n";
                next;
            }

            if (!defined $rule_to_index{$test}) {
                warn "test with no C index: $test\n";
                next;
            }

            if ($mutable_tests{$test}) {
                $num_tests_hit++;
                $out .= "t" . $rule_to_index{$test} . "\n";
                if ($num_tests_hit >= $max_hits_per_msg) {
                    die "Need to increase \$max_hits_per_msg";
                }
            } else {
                $base_score += $scores{$test};
            }
        }

        $out .= "b" . $base_score . "\n";    # score to add in for non-mutable tests
        $out .= "c" . $count_keys{$file_key{$file}} . "\n";
        print {$dat} "n" . $num_tests_hit . "\n" . $out;
    }

    close($dat)
      or die "Could not finish writing 'tmp/tests.data': $!";
}
# read_ranges(): read tmp/ranges.data and classify every known rule.
#
# Each usable line has the form "<lo> <hi> <mutable flag> <rule name>".
# On return, for the rules seen there and for all remaining keys of
# %allrules:
#   %range_lo / %range_hi - allowed score range (where one is known)
#   %mutable_tests        - 1 if the score may be tuned, else 0
#   %ignored_rule         - 1 if the rule is left out of the output
# and %scores has been adjusted so each non-ignored score lies inside its
# range.  Dies if tmp/ranges.data is missing or cannot be opened.
sub read_ranges {
    if (!-f 'tmp/ranges.data') {
        die "need to make 'tmp/ranges.data' first";
    }

    # True if the rule's tflags contain $flag as a whole word.  A rule
    # without tflags simply has no flags (and triggers no warning).
    my $has_tflag = sub {
        my ($rule, $flag) = @_;
        my $tflags = $allrules{$rule}->{tflags};
        return (defined($tflags) && $tflags =~ m/\b\Q$flag\E\b/i);
    };

    # A "nice" rule still carrying the positive placeholder score (1, or
    # 0.01 for T_ rules) gets the matching negative score instead.
    my $flip_nice_default = sub {
        my ($rule) = @_;
        if (($scores{$rule} == 1) && $has_tflag->($rule, 'nice')) {
            $scores{$rule} = -1;
        } elsif (($scores{$rule} == 0.01) && ($rule =~ m/^T_/) &&
                 $has_tflag->($rule, 'nice')) {
            $scores{$rule} = -0.01;
        }
        return;
    };

    # read ranges, and mutableness, from ranges.data.
    open(my $in, '<', 'tmp/ranges.data')
      or die "need to run score-ranges-from-freqs first! (tmp/ranges.data: $!)";

    while (defined(my $line = <$in>)) {
        $line =~ /^(\S+) (\S+) (\d+) (\S+)$/ or next;
        my $t = $4;
        $range_lo{$t} = $1+0;
        $range_hi{$t} = $2+0;
        my $mut = $3+0;

        if ($allrules{$t}->{issubrule}) {
            # warn "$t: ignoring, is sub-rule\n"; # no need to warn
            $ignored_rule{$t} = 1;
            $mutable_tests{$t} = 0;
            next;
        }

        if ($t =~ /^T_/) {
            # warn "$t: ignoring, is T_ test rule\n"; # no need to warn
            $ignored_rule{$t} = 1;
            $mutable_tests{$t} = 0;
            $range_lo{$t} = 0.01;    # clamp to insignificant range
            $range_hi{$t} = 0.01;
            next;
        }

        if (($range_lo{$t} == $range_hi{$t}) && (! $range_lo{$t})) {
            warn "$t: ignoring, score and range == 0\n";
            $ignored_rule{$t} = 1;
            $mutable_tests{$t} = 0;
            next;
        }

        $ignored_rule{$t} = 0;

        # mutable only if the file says so, the range is not a single
        # point, and the rule is not a userconf rule
        if (!$mut) {
            $mutable_tests{$t} = 0;
        } elsif ($range_lo{$t} == $range_hi{$t}) {
            $mutable_tests{$t} = 0;
        } elsif ($has_tflag->($t, 'userconf')) {
            $mutable_tests{$t} = 0;
        } else {
            $mutable_tests{$t} = 1;
        }

        unless ($mutable_tests{$t} || $scores{$t}) {
            warn "$t: ignoring, immutable and score == 0 in this scoreset\n";
            $ignored_rule{$t} = 1;
        }
    }
    close $in;

    # catch up on the ones missed; seems to be userconf or 0-hitters mostly.
    foreach my $t (sort keys %allrules) {
        next if ($t eq '_scoreset');
        next if (exists($range_lo{$t}));

        if ($allrules{$t}->{issubrule}) {
            if (!$ignored_rule{$t}) {
                # warn "$t: ignoring, is sub-rule\n"; # no need to warn here
                $ignored_rule{$t} = 1;
            }
            $mutable_tests{$t} = 0;
            next;
        }

        if ($t =~ /^T_/) {
            if (!$ignored_rule{$t}) {
                # warn "$t: ignoring, is T_ test rule\n"; # no need to warn here
                $ignored_rule{$t} = 1;
                $range_lo{$t} = 0.01;    # clamp to insignificant range
                $range_hi{$t} = 0.01;
            }
            $mutable_tests{$t} = 0;
            next;
        }

        $ignored_rule{$t} = 0;

        unless (exists($mutable_tests{$t}) &&
                !$has_tflag->($t, 'userconf')) {
            $mutable_tests{$t} = 0;
        }

        unless ($mutable_tests{$t} || $scores{$t}) {
            if (!$ignored_rule{$t}) {
                warn "$t: ignoring, immutable and score == 0 in this scoreset\n";
                $ignored_rule{$t} = 1;
            }
        }
    }

    # pull every remaining score inside its range
    foreach my $t (keys %range_lo) {
        next if ($ignored_rule{$t});

        if ($mutable_tests{$t}) {
            $flip_nice_default->($t);

            # mutable scores are kept strictly inside the range
            if ($scores{$t} >= $range_hi{$t}) {
                $scores{$t} = $range_hi{$t} - 0.001;
            } elsif ($scores{$t} <= $range_lo{$t}) {
                $scores{$t} = $range_lo{$t} + 0.001;
            }
        } else {
            if ($has_tflag->($t, 'userconf')) {
                next;
            } elsif ($range_lo{$t} == $range_hi{$t}) {
                $scores{$t} = $range_lo{$t};
                next;
            }

            $flip_nice_default->($t);

            # immutable scores may sit on the range boundary
            if ($scores{$t} > $range_hi{$t}) {
                $scores{$t} = $range_hi{$t};
            } elsif ($scores{$t} < $range_lo{$t}) {
                $scores{$t} = $range_lo{$t};
            }
        }
    }
}
__DATA__
/*
 * loadtests: read tmp/tests.data into the global test arrays.
 *
 * Each line is a one-character command followed by a number:
 *   '.'  select the entry that the following lines describe
 *   'n'  number of hits for the entry (also restarts the hit counter)
 *   's'  value stored in is_spam[]
 *   'b'  value stored in scores[]
 *   't'  next rule index stored in tests_hit[] for the entry
 *   'c'  value stored in tests_count[]
 * Lines starting with any other character are ignored.
 * Exits with status 1 if the file cannot be opened.
 */
void loadtests (void) {
  FILE *fin = fopen ("tmp/tests.data", "r");
  char buf[256];
  int file = 0;
  int tnum = 0;

  /* without this check a missing file is a NULL dereference in fgets */
  if (fin == NULL) {
    perror ("tmp/tests.data");
    exit (1);
  }

  while (fgets (buf, 255, fin) != NULL) {
    char cmd;
    long arg;
    float argd;

    /* parse the argument both ways; the command decides which is used */
    cmd = (char) *buf;
    arg = strtol (buf+1, NULL, 10);
    argd = (float)strtod (buf+1, NULL);

    if (cmd == '.') {
      file = arg;
    } else if (cmd == 'n') {
      tnum = 0;
      num_tests_hit[file] = arg;
    } else if (cmd == 's') {
      is_spam[file] = arg;
    } else if (cmd == 'b') {
      scores[file] = argd;
    } else if (cmd == 't') {
      tests_hit[file][tnum] = arg; tnum++;
    } else if (cmd == 'c') {
      tests_count[file] = arg;
    }
  }
  fclose(fin);

  printf ("Read test results for %d messages (%d total).\n", file+1,
          num_tests);
}
/*
 * loadscores: read tmp/scores.data into the global score arrays.
 *
 * Each line is a one-character command followed by its argument:
 *   '.'  select the score slot that the following lines describe
 *   'b'  value stored in bestscores[]
 *   'l'  value stored in range_lo[]
 *   'h'  value stored in range_hi[]
 *   'n'  rule name, copied into score_names[]
 *   'm'  value stored in is_mutable[]
 * Lines starting with any other character are ignored.
 * Exits with status 1 if the file cannot be opened.
 */
void loadscores (void) {
  FILE *fin = fopen ("tmp/scores.data", "r");
  char buf[256];
  int snum = 0;

  /* without this check a missing file is a NULL dereference in fgets */
  if (fin == NULL) {
    perror ("tmp/scores.data");
    exit (1);
  }

  while (fgets (buf, 255, fin) != NULL) {
    char cmd;
    long arg;
    float argd;
    char *str, *white;

    /* parse the argument both ways; the command decides which is used */
    cmd = (char) *buf;
    arg = strtol (buf+1, NULL, 10);
    argd = (float)strtod (buf+1, NULL);

    /* cut the string argument at the line's newline */
    str = buf+1;
    if ((white = strchr (str, '\n')) != NULL) {
      *white = '\0';
    }

    if (cmd == '.') {
      snum = arg;
    } else if (cmd == 'b') {
      bestscores[snum] = argd;
    } else if (cmd == 'l') {
      range_lo[snum] = argd;
    } else if (cmd == 'h') {
      range_hi[snum] = argd;
    } else if (cmd == 'n') {
      score_names[snum] = strdup (str); /* leaky leak ;) */
    } else if (cmd == 'm') {
      is_mutable[snum] = arg;
    }
  }
  fclose(fin);

  printf ("Read scores for %d tests.\n", num_scores);
}