| #!/usr/bin/perl -w |
| |
| use FindBin; |
| use lib "$FindBin::Bin/../lib"; |
| |
| use strict; |
| |
| use Mail::SpamAssassin::ArchiveIterator; |
| use Getopt::Std; |
| use FileHandle; |
| |
| ########### |
| |
# usage(): print the command-line synopsis to STDERR and terminate the
# script with exit status 1.  Never returns.
sub usage {
  my @synopsis = (
    'split-corpora',
    '[-n num_buckets]',
    '[-p outfile_prefix]',
    '[-l max_messages]',
    'folder1',
    '....',
  );

  print STDERR join(' ', @synopsis), "\n";
  exit(1);
} # usage()
| |
| ########### |
| |
# Command-line switches, filled in by Getopt::Std:
#   -n num_buckets     number of output files to spread messages over
#   -p outfile_prefix  output files are named "<prefix>.<bucket number>"
#   -l max_messages    stop after this many messages
#   -h                 show usage
our ($opt_n, $opt_p, $opt_h, $opt_l);

# getopts() (not getopt()) is the Getopt::Std entry point that understands
# the "x:" notation: n, p and l take a value, h is a plain flag.  With
# getopt() every listed character, including 'h', would swallow the next
# argument.  getopts() returns false on an unknown switch.
getopts('n:p:l:h') or usage();

usage() if ($opt_h);

my $num_buckets = $opt_n || 2;
# Accept any non-empty prefix, including "0".
my $prefix = (defined $opt_p && length $opt_p) ? $opt_p : "bucket";
my @IN_FILES = @ARGV;

# The bucket count is used as a loop bound and as a modulus, so it has to
# be a positive integer; the message limit, when given, must be a number.
usage() if ($num_buckets !~ /^[1-9]\d*$/);
usage() if ($opt_l && $opt_l !~ /^\d+$/);

# At least one input folder is required.
usage() if (@IN_FILES == 0);
| |
# Turn every input path into a target string for the iterator run:
# directories become "ham:dir:<path>", anything else "ham:mbox:<path>".
my @targets = map {
  (-d $_ ? "ham:dir:" : "ham:mbox:") . $_
} @IN_FILES;
| |
# Open one output file per bucket, named "<prefix>.<N>" with N counting
# from 1.  The handles are stored in @bucket_fhs in bucket order so that
# wanted() can pick one by index.
my @bucket_fhs = ();
foreach my $bucket (1 .. $num_buckets) {
  my $bucket_file = "$prefix.$bucket";
  my $bucket_fh = FileHandle->new();

  # Pass the mode as a separate argument: with the mode glued onto the
  # filename, characters in the user-supplied prefix (a leading '>',
  # surrounding whitespace, a trailing '|') would be interpreted by open()
  # instead of being taken as part of the name.
  if (!$bucket_fh->open($bucket_file, '>')) {
    die "Could not open '$bucket_file' for writing: $!\n";
  }

  push(@bucket_fhs, $bucket_fh);
} # foreach my $bucket (1 .. $num_buckets)
| |
# Index into @bucket_fhs of the bucket that receives the next message;
# advanced round-robin by wanted().
my $current_bucket = 0;

# NOTE(review): the meaning of 'opt_all' is defined by ArchiveIterator and
# is not visible here -- confirm against its documentation.
my $iter = Mail::SpamAssassin::ArchiveIterator->new({
  'opt_all' => 1,
});

# wanted() is called for the messages found; the second callback is a
# deliberate no-op.
$iter->set_functions(\&wanted, sub { });

# Number of messages accepted so far; only maintained by wanted() when a
# limit (-l) was given.
my $messagecount = 0;

# wanted() aborts the run by dying with 'HITLIMIT' once the -l limit is
# reached; that is the expected way out, anything else is a real error.
eval {
  $iter->run(@targets);
};
my $run_error = $@;   # copy at once, $@ is easily clobbered
if ($run_error) { die $run_error unless ($run_error =~ /HITLIMIT/); }

# Buffered write errors (e.g. a full disk) only surface at close(), so a
# failed close means the bucket file is incomplete.
foreach my $idx (0 .. $#bucket_fhs) {
  $bucket_fhs[$idx]->close()
    or die "Could not close '$prefix." . ($idx + 1) . "': $!\n";
}

if ($opt_l && $messagecount < $opt_l) {
  warn "warning: only found $messagecount messages instead of $opt_l\n";
}
| |
| ############################################# |
| |
# wanted(): per-message callback registered with the iterator.
#
# Arguments (positional, as unpacked from @_): three values that are not
# used here, followed by a reference to the array of message lines.
#
# Appends the message to the current bucket file in mbox form and then
# advances $current_bucket round-robin over $num_buckets.
#
# Dies with 'HITLIMIT' -- without writing anything -- once $opt_l messages
# have already been written; the caller treats that as normal termination.
sub wanted {
  my (undef, undef, undef, $data_ref) = @_;

  # Enforce the -l limit before writing, so that exactly $opt_l messages
  # end up in the buckets and $messagecount equals the number written.
  if ($opt_l) {
    die 'HITLIMIT' if ($messagecount >= $opt_l);
    $messagecount++;
  }

  # Make sure message can be used for outputting mbox format: if it does
  # not start with a "From " separator line (or is empty), prepend a
  # placeholder one.
  if (!@$data_ref || $data_ref->[0] !~ /^From \S+ +... ... /) {
    unshift(@$data_ref, "From abc\@xyz.com Mon Jan 1 00:00:00 2000\n");
  }

  $bucket_fhs[$current_bucket]->print( join("", @$data_ref) );

  $current_bucket = ($current_bucket + 1) % $num_buckets;
} # wanted()
| |