| #!/usr/bin/perl -w |
| # |
| # <@LICENSE> |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to you under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at: |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # </@LICENSE> |
| |
| =head1 NAME |
| |
| logs-to-c - Convert a mass-check log into perceptron format |
| |
| =head1 SYNOPSIS |
| |
| logs-to-c [options] |
| |
| Options: |
| -c,--cffile=path Use path as the rules directory |
| -s,--scoreset=n Use scoreset n |
| --spam=file Location of spam mass-check log |
| --ham=file Location of ham mass-check log |
| |
| =head1 DESCRIPTION |
| |
B<logs-to-c> reads the mass-check logs F<spam.log> and F<ham.log> (or the
files given by the B<--spam> and B<--ham> options) and converts them into
the format needed by the perceptron. This format is simple for the
perceptron to parse, but is not very readable to humans.
| |
| =head1 BUGS |
| |
| Please report bugs to http://bugzilla.spamassassin.org/ |
| |
| =head1 SEE ALSO |
| |
| L<mass-check(1)>, L<perceptron(1)> |
| |
| =cut |
| |
| use Getopt::Long qw(:config auto_help bundling); |
| use strict; |
| |
# Command-line defaults.  These have to be package variables ("our"): with
# option specs that name no destination, Getopt::Long stores each value in
# $opt_<primary-name> of the calling package.
our $opt_cffile = "../rules";
our $opt_spam = 'spam.log';
our $opt_ham = 'ham.log';
our $opt_scoreset = 0;

# "cffile|c" and "scoreset|s" supply the -c / -s short forms that the
# SYNOPSIS advertises; the primary names (and so the $opt_* variables) are
# unchanged.  A false return means Getopt::Long already printed a
# diagnostic, so stop instead of running with half-parsed options.
GetOptions("cffile|c=s", "spam=s", "ham=s", "scoreset|s=i")
  or die "logs-to-c: invalid command-line options; try --help\n";

my $is_spam = '';               # bit vector aligned with @tests_hit
my @tests_hit = ();             # one packed test list per log line
my %mutable_tests = ();         # rule name -> 1 if its score may be tuned

# %rules and %scores are expected to be filled in by the file that
# readscores() require()s; %allrules is our copy of %rules.
our (%rules, %allrules, %scores);

my (%ignored_rule, %range_lo, %range_hi);
my %rule_to_index;              # rule name -> index used in the C tables

readscores();

print "Reading per-message hit stat logs and scores...\n";
my ($num_tests, $num_spam, $num_ham);

# read_ranges() must run after readscores() (it consults %allrules and
# %scores) and before readlogs()/writescores_c() (they consult the
# %ignored_rule / %mutable_tests tables it builds).
read_ranges();
readlogs();

print "Writing logs and current scores as C code...\n";
writescores_c();

# show memory usage before we exit
# print "Running \"ps aux\"...\n";
# open(PS, "ps aux|");
# while(<PS>) {
#   print if $. == 1 || /\b$$\b/;
# }
# close(PS);

exit 0;
| |
# code to freeze/thaw test lines in as little space as possible
# this could be faster, but improves memory usage by a phenomenal
# amount over arrayrefs or strings of comma-separated-values
#
# State shared by new_short(), freeze_tests(), thaw_tests() and the inlined
# copy of freeze_tests() in readlogs():
#   $short_index    last short integer handed out
#   %long_to_short  rule name     -> short integer
#   @short_to_long  short integer -> rule name
my $short_index;
my %long_to_short;
my @short_to_long;

# The main program calls readlogs() and then runs "exit 0" before execution
# ever reaches this point of the file, so a run-time initializer on the
# declaration above would never be executed.  Seed the counter at compile
# time instead.  Starting from 0 makes the first allocated index 1; indices
# must be non-zero because callers test "$long_to_short{$_} || new_short($_)".
BEGIN { $short_index = 0; }

# Allocate the next short integer for rule name $_[0], record it in both
# lookup tables and return it.  The caller is responsible for checking
# %long_to_short first; calling this twice for one name allocates twice.
sub new_short {
    my ($name) = @_;

    $short_index++;
    $long_to_short{$name} = $short_index;
    $short_to_long[$short_index] = $name;
    return $short_index;
}
| |
# Pack a list of rule names (passed as an array reference) into a compact
# string of BER-compressed integers, allocating a short index for any name
# not seen before.
#
# uses less than half the memory of join on ',' and even better
# compared to Storable::freeze
sub freeze_tests {
    my ($names) = @_;

    my @ids;
    foreach my $name (@{$names}) {
        push @ids, ($long_to_short{$name} || new_short($name));
    }
    return pack("w*", @ids);
}
| |
# Inverse of freeze_tests(): unpack the BER-compressed integers in $_[0]
# and translate each one back to its rule name via @short_to_long.
sub thaw_tests {
    my ($frozen) = @_;

    my @names;
    foreach my $id (unpack("w*", $frozen)) {
        push @names, $short_to_long[$id];
    }
    return @names;
}
| |
# Read the spam and ham mass-check logs into memory.
#
# For every accepted log line this stores one entry in @tests_hit (the
# packed list of tests that hit, restricted to rules that have a score and
# are not sub-rules) and records the message class at the same index in the
# $is_spam bit vector.  On return $num_spam, $num_ham and $num_tests hold
# the respective message counts.
#
# Dies if either log file cannot be opened.
sub readlogs {
    my $count = 0;
    $num_spam = $num_ham = 0;

    # Carry the class alongside the filename rather than deriving it from a
    # filename comparison, which would label both passes as spam whenever
    # --spam and --ham name the same file.
    foreach my $source ([$opt_spam, 1], [$opt_ham, 0]) {
        my ($file, $isspam) = @{$source};

        open(my $in, '<', $file)
            or die "Could not open file '$file': $!";

        while (defined(my $msgline = <$in>)) {
            # faster log-reading code from hit-frequencies.
            # the additional split() is for this case:
            # ". -20 /path  time=1112116980,scantime=0,format=f,reuse=no"
            # in other words, no hits.  split(' ') cannot deal with this
            # correctly, seeing (".", "-20", "/path", "time=...etc").  Work
            # around this by using a literal / / regexp split to discard
            # the csv stuff we don't want out of the rest of the line.

            # $caught is the 1st field of the log line; the 2nd is unused.
            my ($caught, undef, $restofline) = split(' ', $msgline, 3);
            next unless (defined $caught
                         && $caught =~ /^[Y\.]$/
                         && $restofline);

            # the rule list is the 4th field of the log line
            my (undef, $rules) = split(/ /, $restofline, 3);

            # A line that stops after the path has no rule field at all, and
            # one that stops after the rule field keeps its newline there;
            # normalize both so the last rule name is still recognized.
            $rules = '' unless defined $rules;
            chomp $rules;

            # get tests, but ignore unknown tests and subrules
            my @tests;
            foreach my $r (split(/,/, $rules)) {
                my $hits = 1;
                # Support compacted RULE(hitcount) format
                if ($r =~ s/\((\d+)\)$//) {
                    $hits = $1;
                }
                next unless (defined $scores{$r}
                             && !$allrules{$r}->{issubrule});
                push @tests, ($r) x $hits;
            }

            if ($isspam) {
                $num_spam++;
                vec($is_spam, $count, 1) = 1;
            }
            else {
                $num_ham++;
                vec($is_spam, $count, 1) = 0;
            }

            # inlined for speed.
            # ORIGINAL: $tests_hit[$count] = freeze_tests(\@tests);
            $tests_hit[$count] = pack("w*", map
                {
                    $long_to_short{$_} || new_short($_);
                } @tests);

            # TODO: benchmark using foreach(), map() is often slower

            $count++;           # increment line
        }
        close $in;
    }
    $num_tests = $count;
}
| |
# Run ../build/parse-rules-for-masses over the rules directory for the
# selected scoreset, load the Perl file it writes, and copy %rules into
# %allrules.
#
# NOTE(review): the generated file presumably assigns the package variables
# %rules and %scores -- confirm against parse-rules-for-masses.
#
# Dies if the helper cannot be run or exits non-zero, or if the generated
# file fails to load.  The temporary file is removed in either case.
sub readscores {
    print "Reading scores from \"$opt_cffile\"...\n";
    my $tmpf = "./tmp/rules$$.pl";

    # List form: no shell is involved, so a rules path containing quotes,
    # spaces or other shell metacharacters is passed through untouched.
    my @cmd = ("../build/parse-rules-for-masses",
               "-d", $opt_cffile, "-s", $opt_scoreset, "-o", $tmpf);
    if (system(@cmd) != 0) {
        # $? == -1 means the program could not be started at all.
        my $why = ($? == -1) ? "could not run: $!"
                             : "exit status " . ($? >> 8);
        die "$cmd[0] failed ($why)\n";
    }

    # Load the generated file, but make sure it is deleted even if loading
    # it dies; re-throw the original error afterwards.
    my $loaded = eval { require $tmpf; 1 };
    my $err = $@;
    unlink $tmpf or warn "could not remove '$tmpf': $!\n";
    die $err unless $loaded;

    %allrules = %rules;         # ensure it stays global
}
| |
# Write the per-rule tables for the perceptron:
#   tmp/scores.data  one record per non-ignored rule (index, name, current
#                    score, mutable flag, range low/high)
#   tmp/scores.h     C declarations sized to match
# and then call writetests_c() to write the per-message tables.
#
# Side effects: fills %rule_to_index for every key of %scores; for immutable
# rules sets range_lo == range_hi == score; for mutable rules swaps an
# inverted range and, when the rule is flagged no_score_found, replaces the
# score with the midpoint of its range.
#
# Dies if either output file cannot be written.
sub writescores_c {
    my $output = '';
    my $size = 0;
    my $mutable = 0;

    # jm: now, score-ranges-from-freqs has tflags to work from, so
    # it will always list all mutable tests.

    # The sort order defines the C-side rule index: non-ignored rules come
    # before ignored ones, mutable before immutable, then by name.  Because
    # ignored rules sort last and are skipped below, the indices actually
    # written out are 0 .. $size-1.
    my @index_to_rule = sort {
        ($ignored_rule{$a} <=> $ignored_rule{$b}) ||
        ($mutable_tests{$b} <=> $mutable_tests{$a}) ||
        ($a cmp $b)
    } (keys %scores);

    # Largest number of mutable, non-ignored hits on any one message, plus
    # one; this becomes the second dimension of tests_hit[][] in tests.h.
    my $max_hits_per_msg = 0;
    foreach my $file (0 .. $num_tests - 1) {
        my @hits =
            grep { (! $ignored_rule{$_}) && $mutable_tests{$_} }
            (thaw_tests($tests_hit[$file]));
        if ((scalar(@hits) + 1) > $max_hits_per_msg) {
            $max_hits_per_msg = scalar(@hits) + 1;
        }
    }

    foreach my $i (0 .. $#index_to_rule) {
        my $name = $index_to_rule[$i];
        $rule_to_index{$name} = $i;

        next if $ignored_rule{$name};

        if ($mutable_tests{$name} == 0) {
            # immutable: pin the range to the current score
            $range_lo{$name} = $range_hi{$name} = $scores{$name};
        }
        else {
            $mutable++;
            if ($range_lo{$name} > $range_hi{$name}) {
                ($range_lo{$name}, $range_hi{$name}) =
                    ($range_hi{$name}, $range_lo{$name});
            }
            #$range_lo{$name} ||= 0.1;
            #$range_hi{$name} ||= 1.5;

            # no default score found?  start from the midpoint of the
            # allowed range and let the perceptron adjust it from there.
            if ($allrules{$name}->{no_score_found}) {
                $scores{$name} =
                    ($range_hi{$name} + $range_lo{$name}) / 2.0;
            }
        }

        $output .= "." . $i . "\n" .
                   "n" . $name . "\n" .
                   "b" . $scores{$name} . "\n" .
                   "m" . $mutable_tests{$name} . "\n" .
                   "l" . $range_lo{$name} . "\n" .
                   "h" . $range_hi{$name} . "\n";
        $size++;
    }

    open(my $dat, '>', 'tmp/scores.data')
        or die "Could not write 'tmp/scores.data': $!";
    print {$dat} "N$size\n", "M$mutable\n",     # informational only
        $output;
    close($dat)
        or die "Could not finish writing 'tmp/scores.data': $!";

    open(my $out, '>', 'tmp/scores.h')
        or die "Could not write 'tmp/scores.h': $!";
    print {$out} "
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

int num_scores = $size;
int num_mutable = $mutable;
unsigned char is_mutable[$size];
double range_lo[$size];
double range_hi[$size];
double bestscores[$size];
char *score_names[$size];
double tmp_scores[$size][2];
unsigned char ny_hit[$mutable];
unsigned char yn_hit[$mutable];

double lookup[$mutable];

/* readscores() is defined in tests.h */

";
    close($out)
        or die "Could not finish writing 'tmp/scores.h': $!";

    writetests_c($max_hits_per_msg); # make sure $rule_to_index is around
}
| |
# Write the per-message tables for the perceptron:
#   tmp/tests.h     C declarations sized to match, followed by the C code
#                   held in this script's __DATA__ section
#   tmp/tests.data  one record per distinct (class, set of rule hits)
#                   pattern, with the number of messages sharing it
#
# $_[0] is the max_hits_per_msg value computed by writescores_c().
# Relies on %rule_to_index having been filled in by writescores_c().
#
# Dies if an output file cannot be written, or if a message has at least
# $max_hits_per_msg mutable hits.
sub writetests_c {
    my ($max_hits_per_msg) = @_;

    my %uniq_files;     # first message of each pattern -> dense C index
    my %count_keys;     # pattern key -> number of messages showing it
    my %file_key;       # first message of each pattern -> its pattern key

    # Collapse messages that have the same class and the same set of
    # indexed, non-ignored hits into a single record with a count.
    foreach my $file (0 .. $num_tests - 1) {
        my @good_tests =
            grep { length($_) && (! $ignored_rule{$_}) &&
                   (defined($rule_to_index{$_})) }
            (thaw_tests($tests_hit[$file]));

        @good_tests = sort { $a <=> $b }
                      (map { $rule_to_index{$_} } (@good_tests));

        my $uniq_key = vec($is_spam, $file, 1) . " " . join(" ", @good_tests);

        if (exists($count_keys{$uniq_key})) {
            $count_keys{$uniq_key}++;
        }
        else {
            $count_keys{$uniq_key} = 1;
            $file_key{$file} = $uniq_key;
            $uniq_files{$file} = scalar(keys(%count_keys)) - 1;
        }
    }

    my $num_nondup = scalar(keys(%uniq_files));

    # num_tests_hit[] is declared "unsigned char" below, so a per-message
    # hit count above 255 would be truncated when the C side loads it.
    if ($max_hits_per_msg - 1 > 255) {
        warn "a message has more than 255 mutable hits; " .
             "num_tests_hit[] in tmp/tests.h cannot hold that count\n";
    }

    open(my $top, '>', 'tmp/tests.h')
        or die "Could not write 'tmp/tests.h': $!";
    print {$top} "
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

int num_tests = $num_tests;
int num_nondup = $num_nondup;
int num_spam = $num_spam;
int num_ham = $num_ham;
int max_hits_per_msg = $max_hits_per_msg;
unsigned char num_tests_hit[$num_nondup];
unsigned char is_spam[$num_nondup];
unsigned short tests_hit[$num_nondup][$max_hits_per_msg];
double scores[$num_nondup];
double tmp_total[$num_nondup];
int tests_count[$num_nondup];

";
    # Append the C loader functions stored after __DATA__, without
    # disturbing the caller's $_.
    print {$top} join('', <DATA>);
    close($top)
        or die "Could not finish writing 'tmp/tests.h': $!";

    open(my $dat, '>', 'tmp/tests.data')
        or die "Could not write 'tmp/tests.data': $!";

    foreach my $file (sort { $a <=> $b } (keys %uniq_files)) {
        print {$dat} "." . $uniq_files{$file} . "\n";

        my $out = '';
        $out .= "s" . vec($is_spam, $file, 1) . "\n";

        my $base_score = 0;     # summed scores of the immutable hits
        my $num_tests_hit = 0;  # number of mutable hits written as "t" lines
        foreach my $test (thaw_tests($tests_hit[$file])) {
            next if ($test eq '');

            if ($ignored_rule{$test}) {
                # this is not a log-worthy event anymore, since we have a lot
                # of T_ test rules that are ignored during perceptron runs
                # warn "ignored rule $test got a hit in $file!\n";
                next;
            }

            if (!defined $rule_to_index{$test}) {
                warn "test with no C index: $test\n";
                next;
            }

            if ($mutable_tests{$test}) {
                $num_tests_hit++;
                $out .= "t" . $rule_to_index{$test} . "\n";

                if ($num_tests_hit >= $max_hits_per_msg) {
                    die "Need to increase \$max_hits_per_msg";
                }
            }
            else {
                $base_score += $scores{$test};
            }
        }

        $out .= "b" . $base_score . "\n"; # score to add in for non-mutable tests
        $out .= "c" . $count_keys{$file_key{$file}} . "\n";

        # the "n" line goes first so the reader can reset its hit cursor
        print {$dat} "n" . $num_tests_hit . "\n" . $out;
    }
    close($dat)
        or die "Could not finish writing 'tmp/tests.data': $!";
}
| |
# Return 1 if rule $t has the word $flag (case-insensitively) in its
# tflags, 0 otherwise -- including when the rule has no tflags at all.
sub _has_tflag {
    my ($t, $flag) = @_;

    my $tflags = $allrules{$t}->{tflags};
    return 0 unless defined $tflags;
    return ($tflags =~ m/\b\Q$flag\E\b/i) ? 1 : 0;
}

# For a rule tagged "nice", turn a score of exactly 1 into -1, or a score
# of exactly 0.01 on a T_ rule into -0.01.  Other scores are left alone.
sub _flip_nice_default {
    my ($t) = @_;

    return unless _has_tflag($t, 'nice');
    if ($scores{$t} == 1) {
        $scores{$t} = -1;
    }
    elsif (($scores{$t} == 0.01) && ($t =~ m/^T_/)) {
        # read_ranges() marks every T_ rule as ignored before it gets here,
        # so this arm looks unreachable in practice; kept for parity.
        $scores{$t} = -0.01;
    }
    return;
}

# Build the %range_lo, %range_hi, %mutable_tests and %ignored_rule tables
# from tmp/ranges.data, fill in entries for rules that file does not list,
# and finally pull each non-ignored listed rule's score into its range.
#
# Each accepted line of tmp/ranges.data has the form "LO HI MUTABLE NAME";
# lines that do not match are skipped.
#
# Dies if tmp/ranges.data does not exist or cannot be opened.
sub read_ranges {
    if (!-f 'tmp/ranges.data') {
        die "need to make 'tmp/ranges.data' first";
    }

    # read ranges, and mutableness, from ranges.data.
    open(my $in, '<', 'tmp/ranges.data')
        or die "need to run score-ranges-from-freqs first!";

    while (my $line = <$in>) {
        $line =~ /^(\S+) (\S+) (\d+) (\S+)$/ or next;
        my $t = $4;
        $range_lo{$t} = $1 + 0;
        $range_hi{$t} = $2 + 0;
        my $mut = $3 + 0;

        if ($allrules{$t}->{issubrule}) {
            # warn "$t: ignoring, is sub-rule\n"; # no need to warn
            $ignored_rule{$t} = 1;
            $mutable_tests{$t} = 0;
            next;
        }
        if ($t =~ /^T_/) {
            # warn "$t: ignoring, is T_ test rule\n"; # no need to warn
            $ignored_rule{$t} = 1;
            $mutable_tests{$t} = 0;
            $range_lo{$t} = 0.01; # clamp to insignificant range
            $range_hi{$t} = 0.01;
            next;
        }
        if (($range_lo{$t} == $range_hi{$t}) && (! $range_lo{$t})) {
            warn "$t: ignoring, score and range == 0\n";
            $ignored_rule{$t} = 1;
            $mutable_tests{$t} = 0;
            next;
        }

        $ignored_rule{$t} = 0;

        # A rule is mutable only if ranges.data says so, its range is not a
        # single point, and it is not a userconf rule.
        if (!$mut) {
            $mutable_tests{$t} = 0;
        }
        elsif ($range_lo{$t} == $range_hi{$t}) {
            $mutable_tests{$t} = 0;
        }
        elsif (_has_tflag($t, 'userconf')) {
            $mutable_tests{$t} = 0;
        }
        else {
            $mutable_tests{$t} = 1;
        }
        unless ($mutable_tests{$t} || $scores{$t}) {
            warn "$t: ignoring, immutable and score == 0 in this scoreset\n";
            $ignored_rule{$t} = 1;
        }
    }
    close $in;

    # catch up on the ones missed; seems to be userconf or 0-hitters mostly.
    foreach my $t (sort keys %allrules) {
        next if ($t eq '_scoreset');
        next if (exists($range_lo{$t}));

        if ($allrules{$t}->{issubrule}) {
            if (!$ignored_rule{$t}) {
                # warn "$t: ignoring, is sub-rule\n"; # no need to warn here
                $ignored_rule{$t} = 1;
            }
            $mutable_tests{$t} = 0;
            next;
        }
        if ($t =~ /^T_/) {
            if (!$ignored_rule{$t}) {
                # warn "$t: ignoring, is T_ test rule\n"; # no need to warn here
                $ignored_rule{$t} = 1;
                $range_lo{$t} = 0.01; # clamp to insignificant range
                $range_hi{$t} = 0.01;
            }
            $mutable_tests{$t} = 0;
            next;
        }
        $ignored_rule{$t} = 0;
        unless (exists($mutable_tests{$t}) && !_has_tflag($t, 'userconf')) {
            $mutable_tests{$t} = 0;
        }
        unless ($mutable_tests{$t} || $scores{$t}) {
            if (!$ignored_rule{$t}) {
                warn "$t: ignoring, immutable and score == 0 in this scoreset\n";
                $ignored_rule{$t} = 1;
            }
        }
    }

    # Pull the current score of every non-ignored rule that has a range
    # into that range.
    foreach my $t (keys %range_lo) {
        next if ($ignored_rule{$t});

        if ($mutable_tests{$t}) {
            _flip_nice_default($t);
            # mutable scores are kept strictly inside the range
            if ($scores{$t} >= $range_hi{$t}) {
                $scores{$t} = $range_hi{$t} - 0.001;
            }
            elsif ($scores{$t} <= $range_lo{$t}) {
                $scores{$t} = $range_lo{$t} + 0.001;
            }
        }
        else {
            if (_has_tflag($t, 'userconf')) {
                next;
            }
            elsif ($range_lo{$t} == $range_hi{$t}) {
                $scores{$t} = $range_lo{$t};
                next;
            }
            _flip_nice_default($t);
            # immutable scores may sit on the range boundary
            if ($scores{$t} > $range_hi{$t}) {
                $scores{$t} = $range_hi{$t};
            }
            elsif ($scores{$t} < $range_lo{$t}) {
                $scores{$t} = $range_lo{$t};
            }
        }
    }
}
| |
| |
| __DATA__ |
| |
| void loadtests (void) { |
| FILE *fin = fopen ("tmp/tests.data", "r"); |
| char buf[256]; |
| int file = 0; |
| int tnum = 0; |
| |
| while (fgets (buf, 255, fin) != NULL) { |
| char cmd; |
| long arg; |
| float argd; |
| |
| cmd = (char) *buf; |
| arg = strtol (buf+1, NULL, 10); |
| argd = (float)strtod (buf+1, NULL); |
| |
| if (cmd == '.') { |
| file = arg; |
| |
| } else if (cmd == 'n') { |
| tnum = 0; |
| num_tests_hit[file] = arg; |
| |
| } else if (cmd == 's') { |
| is_spam[file] = arg; |
| |
| } else if (cmd == 'b') { |
| scores[file] = argd; |
| |
| } else if (cmd == 't') { |
| tests_hit[file][tnum] = arg; tnum++; |
| |
| } else if (cmd == 'c') { |
| tests_count[file] = arg; |
| |
| } |
| } |
| fclose(fin); |
| |
| printf ("Read test results for %d messages (%d total).\n", file+1, |
| num_tests); |
| } |
| |
| void loadscores (void) { |
| FILE *fin = fopen ("tmp/scores.data", "r"); |
| char buf[256]; |
| int snum = 0; |
| |
| while (fgets (buf, 255, fin) != NULL) { |
| char cmd; |
| long arg; |
| float argd; |
| char *str, *white; |
| |
| cmd = (char) *buf; |
| arg = strtol (buf+1, NULL, 10); |
| argd = (float)strtod (buf+1, NULL); |
| str = buf+1; |
| |
| while ((white = strchr (str, '\n')) != NULL) { |
| *white = '\0'; |
| } |
| |
| if (cmd == '.') { |
| snum = arg; |
| |
| } else if (cmd == 'b') { |
| bestscores[snum] = argd; |
| |
| } else if (cmd == 'l') { |
| range_lo[snum] = argd; |
| |
| } else if (cmd == 'h') { |
| range_hi[snum] = argd; |
| |
| } else if (cmd == 'n') { |
| score_names[snum] = strdup (str); /* leaky leak ;) */ |
| |
| } else if (cmd == 'm') { |
| is_mutable[snum] = arg; |
| } |
| } |
| fclose(fin); |
| |
| printf ("Read scores for %d tests.\n", num_scores); |
| } |