| #!/usr/bin/perl -w |
| # <@LICENSE> |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to you under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at: |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # </@LICENSE> |
| # |
| # Produces an output file containing a sparse matrix to be loaded into |
| # evolve_metarules. This particular configuration looks for the __FRAUD_AAA |
| # rules, but you can change the regex to be whatever you want. |
| # |
| # Usage: preproc.pl {ham.log} {spam.log} {rules.dat} {hits.dat} |
| # |
| # Output file format for rules.dat: |
| # rule_name |
| # ... |
| # |
| # Output file format (unsigned ascii integers) for hits.dat: |
| # num_rules |
| # max_hits |
| # num_patterns |
| # is_spam pattern_count pattern_size (rule_no){pattern_size} |
| # ... |
| |
| use strict; |
| |
# Search for matching rules in the SpamAssassin rules directory.
#
# %rules maps each matching rule name to its zero-based column number.
# Names are written to the rules output file in the same order, one per
# line, so a rule's column number is also its zero-based line position there.
my %rules;
my $rules_path = $ARGV[2] || "rules.dat";
open(my $rule_out, ">", $rules_path)
  or die "Cannot open $rules_path for writing: $!";

foreach my $file (<../../rules/*.cf>) {
  open(my $config, "<", $file) or die "Cannot open $file: $!";

  while (my $line = <$config>) {
    next unless $line =~ /^(?:header|body|uri|rawbody|full|meta)\s+(__FRAUD_[A-Z]{3})\s/;
    my $name = $1;

    # A rule that is defined more than once keeps the column number it got
    # the first time. Renumbering it would hand out an index equal to the
    # number of distinct rules (one past the last valid column) and would
    # list the name twice in the rules file.
    next if exists $rules{$name};

    # Take the count before inserting, so the first rule gets column 0.
    my $column = scalar(keys %rules);
    $rules{$name} = $column;
    print {$rule_out} "$name\n";
  }

  close($config) or die "Error closing $file: $!";
}

# Buffered write errors only show up when the handle is closed.
close($rule_out) or die "Error closing $rules_path: $!";
| |
# Reads one log file and tallies the distinct patterns of rule hits in it.
#
# Arguments:
#   $log_path    - name of the log file to read
#   $rules       - reference to the hash mapping rule name => column number
#   $patterns    - reference to a hash that receives, per pattern, the number
#                  of log lines that produced it
#   $pattern_len - reference to a hash that receives, per pattern, the number
#                  of entries in it
#
# A pattern is the space-separated list of column numbers hit by one log
# line, in ascending numeric order; a rule hit several times appears several
# times. Lines that hit none of the rules in $rules are not recorded.
#
# Returns the size of the largest pattern seen in this log (0 if none).
# Dies if the log file cannot be opened.
sub tally_patterns {
  my ($log_path, $rules, $patterns, $pattern_len) = @_;
  my $largest = 0;

  open(my $log, "<", $log_path) or die "Cannot open $log_path: $!";

  while (my $line = <$log>) {
    # Ignore comments.
    next if $line =~ /^#/;

    # Rule hits are in the fourth field. Splitting on ' ' treats a run of
    # whitespace as a single separator, so padded columns do not shift the
    # field positions.
    my (undef, undef, undef, $test_str) = split ' ', $line;

    # Blank or truncated lines have no fourth field.
    next unless defined $test_str;

    # Extract the relevant rule hits as column numbers.
    my @columns;
    foreach my $r (split(/,/, $test_str)) {
      my $hits = 1;
      # Support compacted RULE(hitcount) format
      if ($r =~ s/\((\d+)\)$//) {
        $hits = $1;
      }
      next unless exists $rules->{$r};
      push @columns, ($rules->{$r}) x $hits;
    }

    # Nothing relevant hit on this line.
    next unless @columns;

    # Sort numerically: the default string sort would put column 10 ahead
    # of column 2.
    my $pattern = join(' ', sort { $a <=> $b } @columns);

    # Count the number of occurrences and size of this pattern.
    $patterns->{$pattern}++;
    $pattern_len->{$pattern} = scalar(@columns);

    # Keep track of the largest pattern that we have seen thus far.
    $largest = scalar(@columns) if scalar(@columns) > $largest;
  }

  close($log);
  return $largest;
}

# This is to find the pattern hitting the most rules.
my $largest_pattern = 0;

# ham_patterns: Hash containing all of the unique ham patterns that we have
# seen so far and how many of each we have seen.
# ham_pattern_len: How many entries are in each pattern.
my (%ham_patterns, %ham_pattern_len);
my $ham_largest = tally_patterns($ARGV[0] || "ham.log", \%rules,
                                 \%ham_patterns, \%ham_pattern_len);
$largest_pattern = $ham_largest if $ham_largest > $largest_pattern;

# spam_patterns: Hash containing all of the unique spam patterns that we have
# seen so far and how many of each we have seen.
# spam_pattern_len: How many entries are in each pattern.
my (%spam_patterns, %spam_pattern_len);
my $spam_largest = tally_patterns($ARGV[1] || "spam.log", \%rules,
                                  \%spam_patterns, \%spam_pattern_len);
$largest_pattern = $spam_largest if $spam_largest > $largest_pattern;
| |
# Write things out to the data file in the format mentioned above.
my $dat_path = $ARGV[3] || "hits.dat";
open(my $dat, ">", $dat_path)
  or die "Cannot open $dat_path for writing: $!";

# Header: number of rules, size of the largest pattern, and the number of
# pattern lines that follow (distinct ham patterns plus distinct spam ones).
printf {$dat} "%d\n", scalar(keys %rules);
printf {$dat} "%d\n", $largest_pattern;
printf {$dat} "%d\n", scalar(keys %ham_patterns) + scalar(keys %spam_patterns);

# One line per distinct pattern: is_spam flag, occurrence count, pattern size
# and the pattern itself. Keys are sorted only so that repeated runs on the
# same input produce the same file; hash order varies from run to run.
foreach my $pattern (sort keys %ham_patterns) {
  printf {$dat} "0 %d %d %s\n",
    $ham_patterns{$pattern}, $ham_pattern_len{$pattern}, $pattern;
}

foreach my $pattern (sort keys %spam_patterns) {
  printf {$dat} "1 %d %d %s\n",
    $spam_patterns{$pattern}, $spam_pattern_len{$pattern}, $pattern;
}

# Buffered write errors only show up when the handle is closed, so a failed
# close means the data file may be incomplete.
close($dat) or die "Error closing $dat_path: $!";