blob: 89b980ddf87e0ad4030f38fd4cc138a7b9d9afd6 [file] [log] [blame]
#!/usr/bin/perl
=head1 NAME
seek-phrases-in-corpus - given a corpus of spam, seek out common phrases
=head1 SYNOPSIS
seek-phrases-in-corpus [--grep 'pattern'] ham:dir:/path spam:dir:/path2 ...
=head1 DESCRIPTION
Given a _small_ corpus of ham and spam mails (specified in mass-check format),
this will attempt to find patterns that appear in at least 2 spams, then list
out all the patterns that have a 1.0 S/O ratio (ie. hit spam and no ham).
The output format looks like:
1.000 8.633 0.000 /pattern/, /pattern2/, /pattern3/
1.000 8.633 0.000 /pattern4/
1.000 10.000 0.000 /pattern5/
First field is S/O (and will always be 1.000). Second, the SPAM%
figure -- how much of the spam corpus, as a percentage, contains the
pattern. Third is the list of one or more pattern(s) that hit this
subset of messages.
Note that patterns that hit a different subset of the messages in the spam
corpus, are listed on separate lines; e.g., in the example above, /pattern3/
and /pattern4/ both hit 8.633% of the spam corpus -- however, they hit a
different 8.633%, not the same subset of messages. On the other hand,
/pattern2/ and /pattern3/ are hitting exactly the same messages.
The patterns are simple substrings, not regular expressions; don't
be misled by the use of "/" as a delimiter. The body text is rendered
as SpamAssassin "body" rendering.
=cut
# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>
# ---------------------------------------------------------------------------
use Getopt::Long;
use Carp qw(croak);
use FindBin;
use strict;
use warnings;
my $sadir = "$FindBin::Bin/../..";
my $tmpdir = "/tmp/findpats.tmp.$$";
my %opt = ();
GetOptions(
'grep=s' => \$opt{grep},
) or die "see perldoc for usage";
my $mcargs = ' '.join(' ', @ARGV).' ';
# extract just the ham or spam targets
my $mcargs_h = $mcargs; $mcargs_h =~ s/ spam:\S+ //gs;
my $mcargs_s = $mcargs; $mcargs_s =~ s/ ham:\S+ //gs;
if ($mcargs_h !~ /\bham:/) {
die "seek-phrases-in-corpus: no 'ham:type:path' corpus specifier found!\n";
}
if ($mcargs_s !~ /\bspam:/) {
die "seek-phrases-in-corpus: no 'spam:type:path' corpus specifier found!\n";
}
my $re = $opt{grep};
# ---------------------------------------------------------------------------
(-d "$tmpdir/cor") and run ("rm -rf $tmpdir/cor");
(-d "$tmpdir/cor") or run ("mkdir -p $tmpdir/cor");
# note: -c=/dev/null so no rules ever run
# don't grep the ham set!
run("cd $sadir/masses && ".
"./mass-check --cf='loadplugin Dumptext plugins/Dumptext.pm' ".
" --cf='loadplugin Mail::SpamAssassin::Plugin::Check' ".
" --cf='loadplugin GrepRenderedBody plugins/GrepRenderedBody.pm' ".
" -n -o --showdots -c=/dev/null ".
" $mcargs_h > $tmpdir/w.h");
# *do* grep the spam, though
run("cd $sadir/masses && ".
"./mass-check --cf='loadplugin Dumptext plugins/Dumptext.pm' ".
" --cf='loadplugin Mail::SpamAssassin::Plugin::Check' ".
" --cf='loadplugin GrepRenderedBody plugins/GrepRenderedBody.pm' ".
($re ? " --cf='grep $re' " : "").
" -n -o --showdots -c=/dev/null ".
" $mcargs_s > $tmpdir/w.s");
run("perl -w $sadir/masses/rule-dev/seek-phrases-in-log ".
"--ham $tmpdir/w.h --spam $tmpdir/w.s > $tmpdir/result");
run("cat $tmpdir/result");
exit;
# ---------------------------------------------------------------------------
sub run {
my $cmd = shift;
warn "[$cmd]\n";
system $cmd;
($? >> 8 != 0) and Carp::croak("command failed");
}