blob: ac4cefcf52d1eb4ed645514c2e6a5f5ac1e45aa1 [file] [log] [blame]
#!/usr/bin/perl -w
use FindBin;
use lib "$FindBin::Bin/../lib";
use strict;
use Mail::SpamAssassin::ArchiveIterator;
use Getopt::Std;
use FileHandle;
###########
# Print the command-line synopsis to STDERR and terminate the program with
# exit status 1.  This sub never returns to its caller.
sub usage {
    my @synopsis = (
        "split-corpora [-n num_buckets] [-p outfile_prefix] ",
        "[-l max_messages] ",
        "folder1 ....\n",
    );
    print STDERR join('', @synopsis);
    exit(1);
} # usage()
###########
# Command-line switches, populated by getopts() below:
#   -n  number of output buckets            (defaults to 2)
#   -p  prefix of the output filenames      (defaults to "bucket")
#   -l  maximum number of messages to split (no limit when unset or 0)
#   -h  print the usage message and exit
our ($opt_n, $opt_p, $opt_h, $opt_l);

# getopts() is the Getopt::Std function that understands the "x:" notation
# (switch x takes an argument).  getopt() would treat every listed character,
# including 'h', as a switch that consumes the following argument.
# getopts() returns false on an unknown switch, in which case we show usage.
getopts('n:p:l:h') or usage();
usage() if ($opt_h);

my $num_buckets = $opt_n || 2;
my $prefix = $opt_p || "bucket";

# Reject bucket counts that would leave us with no output files (or a modulus
# by zero in wanted()) instead of failing obscurely on the first message.
if ($num_buckets !~ /^\d+$/ || $num_buckets < 1) {
    die "Invalid number of buckets '$num_buckets': expected a positive integer\n";
}
if (defined $opt_l && $opt_l !~ /^\d+$/) {
    die "Invalid message limit '$opt_l': expected a non-negative integer\n";
}

my @IN_FILES = @ARGV;
usage() if (@IN_FILES == 0);

# Build one target specification per input: directories are tagged "dir",
# everything else is tagged "mbox".  All inputs are labelled "ham"; wanted()
# ignores its first argument, so the label has no effect on the split.
my @targets = ();
foreach my $in_file (@IN_FILES) {
    if (-d $in_file) {
        push(@targets, "ham:dir:$in_file");
    } else {
        push(@targets, "ham:mbox:$in_file");
    }
}

# Open one output file per bucket, named "<prefix>.1" .. "<prefix>.N".
# The explicit "w" mode keeps the filename from being parsed for mode
# characters, which the ">$name" form would do.
my @bucket_fhs = ();
foreach my $bucket (1 .. $num_buckets) {
    my $bucket_fh = FileHandle->new();
    if (!$bucket_fh->open("$prefix.$bucket", 'w')) {
        die "Could not open '$prefix.$bucket' for writing: $!\n";
    }
    push(@bucket_fhs, $bucket_fh);
} # foreach my $bucket (1 .. $num_buckets)

# Index into @bucket_fhs of the bucket that receives the next message;
# advanced round-robin by wanted().
my $current_bucket = 0;

my $iter = Mail::SpamAssassin::ArchiveIterator->new({
    'opt_all' => 1,
});
# wanted() is invoked for the messages found; the second callback is a no-op.
# NOTE(review): presumably the second callback is the per-message result
# handler -- confirm against the ArchiveIterator documentation.
$iter->set_functions(\&wanted, sub { });

# Message counter maintained by wanted().
my $messagecount = 0;

# wanted() dies with 'HITLIMIT' once the -l limit is reached; that is the
# expected way to stop early, so only other exceptions are re-thrown.
eval {
    $iter->run(@targets);
};
if ($@) { die $@ unless ($@ =~ /HITLIMIT/); }

# Buffered write errors only surface at close time, so check every close.
foreach my $bucket (1 .. $num_buckets) {
    $bucket_fhs[$bucket - 1]->close()
        or die "Could not close '$prefix.$bucket': $!\n";
}

if ($opt_l && $messagecount < $opt_l) {
    warn "warning: only found $messagecount messages instead of $opt_l\n";
}
#############################################
# Per-message callback registered with the archive iterator.
#
# Arguments (as unpacked from @_): only the fourth, a reference to an array
# holding the message text, is used; the first three are ignored.
#
# Appends the message to the current bucket file and advances
# $current_bucket round-robin.  When a -l limit is set, dies with 'HITLIMIT'
# as soon as a message beyond the limit arrives, so exactly $opt_l messages
# are written.  Also dies if the write to the bucket file fails.
sub wanted {
    my (undef, undef, undef, $data_ref) = @_;

    # Check the limit BEFORE counting this message: the previous
    # post-increment/greater-than test let $opt_l + 1 messages through.
    if ($opt_l && $messagecount >= $opt_l) { die 'HITLIMIT'; }
    $messagecount++;

    # Make sure message can be used for outputting mbox format: prepend a
    # placeholder "From " separator line when the message does not already
    # start with one (an empty message gets just the separator).
    if (!@$data_ref || $data_ref->[0] !~ /^From \S+ +... ... /) {
        unshift(@$data_ref, "From abc\@xyz.com Mon Jan 1 00:00:00 2000\n");
    }

    $bucket_fhs[$current_bucket]->print( join("", @$data_ref) )
        or die "Could not write to bucket " . ($current_bucket + 1) . ": $!\n";
    $current_bucket = ($current_bucket + 1) % $num_buckets;
    return;
} # wanted()