blob: f362915389fc74a6c27887713267faef051b8d41 [file] [log] [blame]
#!/usr/bin/perl
=head1 NAME
seek-phrases-in-log - extract good-looking rules from a text-dump mc log
=cut
# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>
# ---------------------------------------------------------------------------
sub usage {
die "
usage: seek-phrases-in-log [--reqhitrate n] [--reqpatlength n] [ --maxreqpatlength n]
[--rules] [--ruletype 'type'] [--ruleprefix FOO]
[--maxtextread n] --ham hamlog --spam spamlog
--reqhitrate: percentage hit-rate against spam required (default: 0.5)
(multiple values can be specified, separated by spaces)
--reqpatlength: required pattern length, in characters (default: 0)
--maxreqpatlength: maximum pattern length, in characters (default: 1024)
--maxtextread: bytes of message text examined (default: 32768)
--rules: generate SpamAssassin rule output (default: 0)
--ruleprefix: specify prefix string for rules (default: 'SEEK_')
--ruletype 'type': generate rules of type: 'header', 'body'
";
}
# ---------------------------------------------------------------------------
use warnings;
use strict;
use Getopt::Long qw(:config no_ignore_case);
use Data::Dumper;
use Digest::SHA qw(sha1_base64);
sub logmsg;
my %opt = ();
$opt{reqhitrate} = 0.5;
$opt{reqpatlength} = 0;
$opt{maxreqpatlength} = 1024;
$opt{maxtextread} = 32768;
$opt{rules} = 0;
$opt{ruleprefix} = 'SEEK_';
$opt{ruletype} = 'body';
my $fs;
my $fh;
my @files = ();
GetOptions(
"rules" => \$opt{rules},
"ruleprefix=s" => \$opt{ruleprefix},
"reqhitrate=s" => \$opt{reqhitrate},
"reqpatlength=s" => \$opt{reqpatlength},
"maxreqpatlength=s" => \$opt{maxreqpatlength},
"ruletype=s" => \$opt{ruletype},
"maxtextread=s" => \$opt{maxtextread},
"phase2=s" => \$opt{phase2},
"ham=s" => \$fh,
"spam=s" => \$fs,
'help' => \&usage
) or usage();
usage() unless (($fs && $fh) || $opt{phase2});
my @hitratethresholds = ();
if ($opt{reqhitrate} =~ /\S\s+\S/) { # multiple values
@hitratethresholds = reverse sort {$a<=>$b} split (' ', $opt{reqhitrate});
$opt{reqhitrate} = pop @hitratethresholds; # the lowest
# @hitratethresholds is now sorted, highest to lowest
}
# n-gram reading state
my %word2sym = ('' => '');
my %sym2word = ('' => '');
my $sym_acc = 'a'; # symbols are represented using IDs from this counter
my $msgcount = 0;
# these are shared with the assembly stage (via $asmstate)
my @text_string = ();
my %ngram_count = ();
my %msg_subset_hit = ();
my $asmstate;
if ($opt{phase2}) {
$asmstate = load_state($opt{phase2});
}
else {
logmsg "reading $fs...";
open IN, "<$fs" or die "cannot open spam log $fs";
while (<IN>) {
/^text: (.*)$/ and proc_text_spam($1);
}
close IN;
# only do this if we have read enough spam messages, otherwise we could
# discard tokens for which reqhitrate has been achieved
if ($msgcount > 2 * (100 / $opt{reqhitrate})) {
discard_hapaxes();
}
logmsg "n-grams active: ".(scalar keys %ngram_count);
logmsg "reading $fh...";
open IN, "<$fh" or die "cannot open ham log $fh";
while (<IN>) {
/^text: (.*)$/ and proc_text_ham($1);
}
close IN;
logmsg "n-grams active: ".(scalar keys %ngram_count);
# move onto the next step; assembly. free stuff we no longer need here
undef %word2sym; # free this, no longer needed
# create the assembly state object
$asmstate = { };
filter_into_message_subsets();
undef %sym2word; # no longer needed after that
write_state($asmstate, "assemble.state");
}
assemble_regexps();
exit;
# ---------------------------------------------------------------------------
# PHASE 1: PARSING, NGRAM PROBABILITIES
sub proc_text_spam {
my ($text) = @_;
# we only need to chop off the end of spam samples
if (length($text) > $opt{maxtextread}) {
$text = substr $text, 0, $opt{maxtextread}; # chop!
}
$text =~ s/ +/ /gs; # single spaces, please
# we only need to save spam samples in memory, ignore ham samples
push @text_string, $text;
my $cp = pack "l", $msgcount;
$msgcount++;
(($msgcount % 1000) == 999) and discard_hapaxes();
my %tokens = ();
foreach my $line (split(/\[p\]/, $text)) {
my $w1 = '';
my $w2 = '';
my $w3 = '';
foreach my $w (split(' ', $line)) {
# if (length $w > 20) { $w = "sk:".substr($w, 0, 5); }
$w3 = $w2;
$w2 = $w1;
$w1 = $word2sym{$w};
if (!$w1) {
$word2sym{$w} = $w1 = $sym_acc;
$sym2word{$sym_acc} = $w;
$sym_acc++;
}
# simple bayesian N-grams to start
$tokens{"$w3.$w2.$w1"} = $tokens{"$w3.$w2"} = 1;
}
# deal with leftovers
if ($w2 && $w1) {
$tokens{"$w2.$w1"} = 1;
}
}
foreach my $tok (keys %tokens) {
#warn "JMD adding $tok => ".decode_sym2words($tok);
# incr the counter for this token
$ngram_count{$tok}++;
$msg_subset_hit{$tok} .= $cp; # the message subset hit by this tok
}
}
sub discard_hapaxes {
my $before = (scalar keys %ngram_count);
foreach my $tok (keys %ngram_count) {
if ($ngram_count{$tok} <= 1) {
delete $ngram_count{$tok};
delete $msg_subset_hit{$tok};
}
}
my $after = (scalar keys %ngram_count);
my $killed = ($before - $after);
logmsg "shrunk dbs: $killed hapaxes killed, kept $after entries";
}
sub proc_text_ham {
my ($text) = @_;
my %tokens = ();
foreach my $line (split(/\[p\]/, $text)) {
my $w1 = '';
my $w2 = '';
my $w3 = '';
foreach my $w (split(' ', $line)) {
$w3 = $w2;
$w2 = $w1;
$w1 = $word2sym{$w};
if (!$w1) {
# since we're deleting, there's no need to add new words
# to the dictionary; just use the final $sym_acc to mean
# "unknown ham word", and don't increment it
$w1 = $sym_acc;
}
$tokens{"$w3.$w2.$w1"} = $tokens{"$w3.$w2"} = $tokens{".$w3"} = 1;
}
# deal with leftovers
if ($w2 && $w1) {
$tokens{"$w2.$w1"} = 1;
}
}
foreach my $tok (keys %tokens) {
# we're not tracking hits; we're killing false positives.
# as soon as a single FP appears, kill all record of that token,
# it cannot be used
remove_fp_ing_token($tok);
}
}
sub remove_fp_ing_token {
my $tok = shift;
#warn "JMD removing $tok => ".decode_sym2words($tok);
delete $ngram_count{$tok};
delete $msg_subset_hit{$tok};
delete $ngram_count{".".$tok};
delete $msg_subset_hit{".".$tok};
delete $ngram_count{"..".$tok};
delete $msg_subset_hit{"..".$tok};
}
sub filter_into_message_subsets {
logmsg "filtering into message subsets...";
$asmstate = { };
# hash all msg_subset_hit lists; we don't need the full data, so this
# saves space
foreach my $id (keys %msg_subset_hit) {
$msg_subset_hit{$id} = unpack("%32C*", $msg_subset_hit{$id});
}
# note: we don't care about stuff that hits *any* ham at all
$asmstate->{msg_count_spam} = scalar @text_string;
$asmstate->{msg_count_spam} ||= 0.000001;
$asmstate->{all_patterns_for_set} = { };
foreach my $id (keys %ngram_count) {
my $count = $ngram_count{$id};
my $bad;
# must occur more than once!
if (!defined $count || $count <= 1) {
$bad++;
}
# require N% spam hits
elsif (($count*100) / $asmstate->{msg_count_spam} < $opt{reqhitrate}) {
$bad++;
}
if ($bad) {
# we don't need to remember anything about this pattern after here
remove_fp_ing_token($id);
next;
}
my $set = $msg_subset_hit{$id};
push @{$asmstate->{all_patterns_for_set}->{$set}}, decode_sym2words($id);
}
logmsg "message subsets found: ".(scalar
keys %{$asmstate->{all_patterns_for_set}});
$asmstate->{ngram_count} = \%ngram_count;
$asmstate->{msg_subset_hit} = \%msg_subset_hit;
$asmstate->{text_string} = \@text_string;
}
sub decode_sym2words {
my $ids = shift;
my $r;
if ($ids =~ /^([^.]*)\.([^.]*)\.([^.]*)$/) {
$r = "$sym2word{$1} $sym2word{$2} $sym2word{$3}";
}
elsif ($ids =~ /^([^.]*)\.([^.]*)$/) {
$r = "$sym2word{$1} $sym2word{$2}";
}
else {
warn "bad decode_sym2words format: '$ids'";
}
$r =~ s/^\s+//;
return $r;
}
sub write_state {
my ($state, $statefile) = @_;
my $dump = Data::Dumper->new ([ $state ]);
$dump->Deepcopy(1);
$dump->Purity(1);
$dump->Indent(1);
my $text = $dump->Dump.";1;";
open (OUT, ">$statefile") or die "cannot write $statefile";
print OUT $text;
close OUT or die "cannot write $statefile";
}
# ---------------------------------------------------------------------------
# PHASE 2: REGEXP ASSEMBLY
sub load_state {
my $file = shift;
if (open(IN, "<".$file)) {
my $str = join("", <IN>); close IN;
my $VAR1; # Data::Dumper
if (eval $str) {
return $VAR1; # Data::Dumper's naming
}
}
die "failed to load state from $file";
}
sub assemble_regexps {
my %done_set = ();
my @done_pats = ();
logmsg "deduping and assembling regexps...";
if (!$opt{rules}) {
printf ("%6s %6s %6s %s\n", "RATIO", "SPAM%", "HAM%", "DATA");
}
$| = 1;
my $count = 0;
my $count_out = 0;
foreach my $id (sort {
$asmstate->{ngram_count}->{$b} <=> $asmstate->{ngram_count}->{$a}
} keys %{$asmstate->{ngram_count}})
{
my $set = $asmstate->{msg_subset_hit}->{$id};
next if $done_set{$set}; $done_set{$set}++;
if ($count++ % 200 == 0) {
logmsg "working on message subset $count ($count_out)...";
}
# we now have several patterns. see if we can expand them sideways
# to make the pattern bigger, and collapse into a smaller number of
# pats at the same time
my @pats = collapse_pats($asmstate->{all_patterns_for_set}->{$set});
# my @pats = @{$asmstate->{all_patterns_for_set}->{$set}};
@pats = quote_all_metas(@pats);
@pats = subsume_with_dotstars(@pats);
@pats = expand_with_dots(@pats);
@pats = ensure_reqpatlength(@pats);
# at this point, patterns are all valid regexps
# now check to see if any of these pats have been subsumed in an
# already-output pattern (one with more hits!)
my @pats_new = ();
foreach my $pat (@pats) {
my $subsumed = 0;
foreach my $done (@done_pats, @pats_new) {
# pattern == existing pattern, or existing pattern is contained by
# pattern, or pattern is contained in existing pattern
if ($pat eq $done || $pat =~ /\Q${done}\E/ || $done =~ /\Q${pat}\E/) {
$subsumed=1;
last;
}
# or one pattern contains the other (but interpreted as a regexp!)
# this deals with /foo.{0,10} bar/ vs /foo ish bar/
if ($pat =~ /$done/) {
$subsumed=1;
last;
}
if ($done =~ /$pat/) {
$subsumed=1;
last;
}
}
next if $subsumed;
# add to the current list
push @pats_new, $pat;
}
@pats = @pats_new;
push @done_pats, @pats_new;
# if we have no non-subsumed pats at this stage, skip this set
next unless @pats;
if (($asmstate->{ngram_count}->{$id}*100) / $asmstate->{msg_count_spam} <
$opt{reqhitrate})
{
# quit if we hit the final hitrate threshold
return;
}
if (defined($hitratethresholds[0]) &&
($asmstate->{ngram_count}->{$id}*100) / $asmstate->{msg_count_spam} < $hitratethresholds[0])
{
print "# passed hit-rate threshold: $hitratethresholds[0]\n";
shift @hitratethresholds;
}
if ($opt{rules}) {
printf "# %6.3f %6.3f %6.3f\n",
1.0, ($asmstate->{ngram_count}->{$id}*100) / $asmstate->{msg_count_spam}, 0;
# sort, to ensure ordering always remains the same
foreach my $pat (sort @pats) {
my $name = generate_rule_name($pat);
if ($opt{ruletype} eq 'header') {
# deal with header-specific munging.
# "\[\\n\]" is the result of "[\n]", at this stage
$pat =~ s/\Q\[\\n\]\E/\\n/gs;
$pat =~ s/\Q\[\\t\]\E/\\t/gs;
}
if ($opt{ruletype} eq 'body') {
$pat =~ s/^\s/\\s/;
}
if ($opt{ruletype} eq 'header') {
print "$opt{ruletype} $opt{ruleprefix}${name} ALL =~ /$pat/\n";
} else {
print "$opt{ruletype} $opt{ruleprefix}${name} /$pat/\n";
}
$count_out++;
}
} else {
my $pats = '/'.join ('/, /', map { s/\//\\\//gs; $_; } @pats).'/';
printf "%6.3f %6.3f %6.3f %s\n",
1.0, ($asmstate->{ngram_count}->{$id}*100) / $asmstate->{msg_count_spam}, 0, $pats;
$count_out++;
}
}
}
sub collapse_pats {
my $pataryref = $_[0];
my @ret = ();
while (1) {
my $pat = shift(@{$pataryref});
last unless defined($pat);
#warn "JMD collapse [$pat]";
$pat =~ s/^\s+//;
# TODO: optimise, second-slowest line
my @hits = grep /\Q$pat\E/, @{$asmstate->{text_string}};
if (scalar @hits == 0) {
warn "supposed pattern /$pat/ is 0-hitter";
warn "JMD strings:\n ".join("\n ", @{$asmstate->{text_string}})."\n";
push @ret, "[BROKEN: 0 hitter]$pat";
next;
}
# we don't have all day!
my $pat_maxlen = $opt{maxtextread};
my $s = $hits[0];
# Now, expand the pattern using a BLAST-style algorithm
# expand towards start of string
while (1) {
my $l = length($pat);
last if ($l > $pat_maxlen); # too long
my $found;
if ($s =~ /(.)\Q$pat\E/s && $s !~ /\[p\]\Q$pat\E/s) { $found = $1; }
if (!defined $found) {
# start of string. break
last;
}
# give up if there are a differing number of hits for the new pat
my $newpat = $found.$pat;
# note: we can search just in @hits, instead of in
# @{$asmstate->{text_string}}, because there's no way we can
# *increase* the hitrate in the corpus; we can only reduce it.
# this is double-checked after these 2 expansion loops anyway
if (scalar (grep /\Q$newpat\E/, @hits) != scalar @hits) { last; }
$pat = $newpat; # and carry on
}
# warn "JMD $pat";
# expand towards end of string
while (1) {
if (length($pat) > $pat_maxlen
|| $s =~ /\Q$pat\E\[p\]/s
|| $s !~ /\Q$pat\E(.)/s)
{
# end of string. break
last;
}
my $newpat = $pat.$1;
if (scalar (grep /\Q$newpat\E/, @hits) != scalar @hits) { last; }
$pat = $newpat; # and carry on
}
# double-check to ensure we haven't somehow INCREASED our hitrate
# beyond the initial pattern
if (scalar (grep /\Q$pat\E/, @{$asmstate->{text_string}}) != scalar @hits) {
die "oops! went too far";
}
# now remove subsumed patterns
@{$pataryref} = grep { $pat !~ /\Q$_\E/s } @{$pataryref};
# warn "JMD $pat";
# skip recording this if it's already inside one of the results
next if grep { $_ =~ /\Q$pat\E/s } @ret;
# also, remove cases where this pattern contains previous results
@ret = grep { $pat !~ /\Q$_\E/s } @ret;
# warn "JMD $pat";
push (@ret, $pat);
}
return @ret;
}
sub quote_all_metas {
return map {
s/([!-+\`\?\^\~\\\/\|\.\{\}\(\)\[\]\@])/\\$1/gs;
$_;
} @_;
}
sub subsume_with_dotstars {
my @working = @_;
# attempt to resolve . and .*
my @ret = ();
while (1) {
my $p1 = shift(@working);
last unless defined($p1);
# TODO: optimise, 4th-slowest line
my @hits = grep /$p1/, @{$asmstate->{text_string}};
if (scalar @hits == 0) {
warn "supposed pattern /$p1/ is 0-hitter";
push @ret, "[BROKEN: 0 hitter]$p1";
next;
}
foreach my $p2 (@working) {
next if ($p1 eq $p2);
DOTLOOP: for my $dotstar
( ".", ".?", ".{0,3}", ".{0,5}", ".{0,20}", ".{0,40}" )
{
my $newpatcapture = $p1."(".$dotstar.")".$p2;
my $newpat = $p1.$dotstar.$p2;
# drop the attempt if any of the existing hits becomes a miss,
# or the .* includes a paragraph boundary in any of the hits
foreach my $t (@hits) {
next DOTLOOP if ($t !~ /$newpatcapture/);
next DOTLOOP if ($1 =~ /\[p\]/);
}
next if (scalar (grep /$newpat/, @{$asmstate->{text_string}}) != scalar @hits);
# it works! yay
# TODO: for /./, see if we can capture the options and
# construct a /[oO0]/ char class instead
$p1 = $newpat;
}
}
# skip recording this if it's already inside one of the results
next if grep { $_ =~ /\Q$p1\E/s } @ret;
push (@ret, $p1);
}
return @ret;
}
sub expand_with_dots {
my @working = @_;
my @ret = ();
my $pat_maxlen = $opt{maxtextread};
while (1) {
my $p1 = shift(@working);
last unless defined($p1);
# TODO: optimise, 5th-slowest line
my @hits = grep /$p1/, @{$asmstate->{text_string}};
if (scalar @hits == 0) {
warn "supposed pattern /$p1/ is 0-hitter";
push @ret, "[BROKEN: 0 hitter]$p1";
next;
}
my $s = $hits[0];
# expand towards end of string (with .)
while (1) {
last if (length($p1) >= $pat_maxlen);
if ($s !~ /$p1\[p\]/s && $s =~ /$p1(.)/s)
{
my $extn = $1; quote_all_metas($extn);
my $newpat = $p1.$extn;
# TODO: optimise, 3rd-slowest line
if (scalar (grep /$newpat/, @{$asmstate->{text_string}}) == scalar @hits) {
$p1 = $newpat; next;
}
}
if ($s !~ /$p1.?\[p\]/s && $s =~ /$p1[^\\](.)/s)
{
my $extn = $1; quote_all_metas($extn);
my $newpat = $p1.".".$extn;
if (scalar (grep /$newpat/, @{$asmstate->{text_string}}) == scalar @hits) {
$p1 = $newpat; next;
}
}
if ($s !~ /$p1.{0,2}\[p\]/s && $s =~ /$p1[^\\][^\\](.)/s)
{
my $extn = $1; quote_all_metas($extn);
my $newpat = $p1."..".$extn;
if (scalar (grep /$newpat/, @{$asmstate->{text_string}}) == scalar @hits) {
$p1 = $newpat; next;
}
}
last;
}
push (@ret, $p1);
}
@ret;
}
sub ensure_reqpatlength {
my @ret = @_;
if ($opt{reqpatlength}) {
@ret = grep { (length($_) >= $opt{reqpatlength}) && (length($_) < $opt{maxreqpatlength}) } @ret;
return () unless @ret;
}
return @ret;
}
sub generate_rule_name {
my $str = shift;
$str = sha1_base64($str);
$str =~ s/^(.{6}).*$/$1/gs;
$str =~ tr/a-z/A-Z/;
$str =~ s/[^A-Z0-9]/_/gs;
return $str;
}
# ---------------------------------------------------------------------------
# used by all phases
sub logmsg {
warn "".(scalar localtime time).": $_[0]\n";
}