masses/corpora/mk-corpus-link-farm - spamassassin - Git at Google

 #!/usr/bin/perl
 #
 # mk-corpus-link-farm - distribute a bunch of mail tidily into a set of corpora
 # (see EOF for an example/testcase)
 #
 # Note: creates symbolic links only; renaming/moving the originals will
 # cause breakage.
 #
 # <@LICENSE>
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to you under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at:
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # </@LICENSE>


 use strict;
 use warnings;

 sub usage {
   die "
 usage: mk-corpus-link-farm [options] [dest] src [...]

 dest:
   -dest outputdir [-num num]

 options:
   -most_recent:  select the most recent messages (default)
   -after=N       only test mails received after time_t N (negative values
                  are an offset from current time, e.g. -86400 = last day)
                  or after date as parsed by Time::ParseDate (e.g. '-6 months')

 ";
 }

 use Time::ParseDate;
 use Time::Local;


 use Cwd;
 use File::Path;
 use File::Find;
 use File::Basename;
 use Data::Dumper;
 use SDBM_File;
 use Fcntl;

 my $DEBUG;# $DEBUG=1;

 my @classes = qw(ham spam);
 my $srcs = [ ];
 my $dests = [ ];
 my $mbox_tmpdir = $ENV{TMPDIR} || "/tmp";

 sub dbg;

 use Getopt::Long;
 our ($opt_most_recent, $opt_after, $opt_before);

 $opt_most_recent = 0;
 tz_init();

 my $curdest;
 GetOptions(
     'dest=s' => sub {
       my ($switch, $dir) = @_;
       $curdest = {
           ham => { },
           spam => { },
           dir => $dir
         };
       push (@$dests, $curdest);
     },

     'num=i' => sub {
       my ($switch, $num) = @_;
       $curdest->{num_msgs} = $num;
     },

     'most_recent' => \$opt_most_recent,
     'before=s' => \$opt_before,
     'after=s' => \$opt_after,
 ) or usage();

 foreach my $arg (@ARGV) {
   push (@$srcs, { dir => $arg });
 }

 # Deal with --before and --after
 foreach my $time ($opt_before, $opt_after) {
   if ($time && $time =~ /^-\d+$/) {
     $time = time + $time;
   }
   elsif ($time && $time !~ /^-?\d+$/) {
     $time = Time::ParseDate::parsedate($time, GMT => 1, PREFER_PAST => 1);
   }
 }


 # test data: $srcs = [ { dir => "/src1", ham => { dests => [ ], dir =>
 # "/src1/ham", num => 100 }, spam => { dests => [ ], dir => "/src1/spam", num
 # => 100 }, }, { dir => "/src2", ham => { dests => [ ], dir => "/src2/ham", num
 # => 300 }, spam => { dests => [ ], dir => "/src2/spam", num => 300 }, }, { dir
 # => "/src3", ham => { dests => [ ], dir => "/src3/ham", num => 500 }, spam =>
 # { dests => [ ], dir => "/src3/spam", num => 500 }, } ];

 my $cwd = cwd();

 # use an on-disk file -- this list can get pretty big!
 my $poss_del_path = "$mbox_tmpdir/dels.$$";
 my $poss_delete;

 {
   my %h;
   tie(%h, 'SDBM_File', $poss_del_path, O_RDWR|O_CREAT, 0600)
       or die "Couldn't tie SDBM file $poss_del_path: $!; aborting";
   $poss_delete = \%h;
 }

 my $mbox_work = "$mbox_tmpdir/mboxes.d";
 if (-d $mbox_work) {
   mark_for_poss_deletion($mbox_work);
 }

 main();

 # and clean up again.
 unlink $poss_del_path;
 unlink "$poss_del_path.dir";
 unlink "$poss_del_path.pag";
 exit;


 sub main {
   find_srcs();
   dist_across_dests();
   make_links_in_dests();
   perform_poss_deletion();
 }


 sub find_srcs {
   foreach my $src (@$srcs) {
     my $num_files;
     my @mboxes = ();

     my $cb = sub {
       if (-f $_ && -r _) {
         if ($_ =~ /\.mbox/i) {
          push @mboxes, $File::Find::name;
         } else {
           $num_files++;
         }
       }
     };

     $src->{ham}  = { num => 0, dests => [ ] };
     $src->{spam} = { num => 0, dests => [ ] };

     my $try_dir = "$src->{dir}/ham";
     if (-d $try_dir) {
       $num_files = 0;
       @mboxes = ();

       File::Find::find({ wanted => $cb, follow => 1 }, $try_dir);

       foreach my $mbox (@mboxes) {
        $num_files += mbox_count($mbox);
       }

       $src->{ham}{subdir} = $try_dir;
       $src->{ham}{num} = $num_files;
     }

     $try_dir = "$src->{dir}/spam";
     if (-d $try_dir) {
       $num_files = 0;
       @mboxes = ();

       File::Find::find({ wanted => $cb, follow => 1 }, $try_dir);

       foreach my $mbox (@mboxes) {
        $num_files += mbox_count($mbox);
       }

       $src->{spam}{subdir} = $try_dir;
       $src->{spam}{num} = $num_files;
     }

     print "$src->{dir}: found $src->{ham}{num} ham, $src->{spam}{num} spam\n";
   }
 }

 sub dist_across_dests {
   my @srcorder = @$srcs;

   foreach my $dest (@$dests) {
     my %want = ();
 my $wantnum = $dest->{num_msgs} || 99999999;
     foreach my $class (@classes) {
       $want{$class} = $wantnum;
     }

     $dest->{srcs} = [ ];
     print "\n$dest->{dir}: want $wantnum messages\n";

     foreach my $class (@classes) {
       foreach my $src (@srcorder) {
         last unless ($want{$class} > 0);
        allocate ($src, $dest, \$want{$class}, $class);
       }
     }

     foreach my $class (@classes) {
       print "$class:";
       ($class eq 'ham') and print " ";
       my $added = 0;
       foreach my $src (@{$dest->{$class}{srcs}}) {
         print "  $src->{num} of $src->{from}{$class}{subdir}";
         $added += $src->{num};
       }
       print "\n";

       if ($want{$class} > 0) {
         warn "       WARNING: failed to fill $dest->{dir}/$class: ".
                 "only $added, wanted $want{$class} more\n";
       }
     }

     # for the next dest, try to take some more entries from
     # other sources as well.  do this by moving the source that's
     # currently at the head of the list, to the end.

     my $first = shift @srcorder;
     @srcorder = (@srcorder, $first);
   }
 }

 sub make_links_in_dests {
   foreach my $class (@classes) {
     foreach my $dest (@$dests) {
       my $dir = $dest->{dir}.'/'.$class;
       if (-d $dir) {
         mark_for_poss_deletion($dir);
       }
       else {
         mkpath($dir) or warn "cannot mkdir $dir: $!";
       }
     }
     foreach my $src (@$srcs) {
       _mklink($class, $src);
     }
   }
 }

 sub _mklink {
   my ($class, $src) = @_;

   my $srcdir = $src->{$class}{subdir};
   if (!$srcdir) {
     dbg "no srcdir, skipping $src";
     return;
   }
   if (!-d $srcdir) {
     warn "cannot read $srcdir, ignoring: $!";
     return;
   }

   dbg "linking from $srcdir";

   # create a hash of modtime -> filepath, so we can be sure we pick up
   # "new" files first if so desired. note that -M gives (now - modtime) in
   # days, so larger numbers means earlier.

   my %files = ();
   File::Find::find({ follow => 1, wanted => sub {
     return unless (-f $_ && -r _);    # not a file

     my @stat = stat _;
     my $mtime = $stat[9];
     return unless message_is_useful_by_date($mtime);

     if (!exists $files{$mtime}) {
       $files{$mtime} = [ ];
     }
     if ($_ =~ /\.mbox/i) {
       push(@{$files{$mtime}}, mbox_extract_all($_));
     } else {
       push(@{$files{$mtime}}, $File::Find::name);
     }

   } }, $srcdir);

   my @files = ();
   foreach my $key (sort { $b <=> $a } keys %files) {
     push (@files, @{$files{$key}});
   }
   undef %files;     # no longer need that

   # @files is now sorted with the "youngest" files first.  check:
   if (scalar @files && $files[0] && $files[1] && -M $files[0] > -M $files[-1])
   {
     warn "oops! files out of order, should be youngest first: ".
                     join(' ',@files);
   }

   foreach my $destobj (@{$src->{$class}{dests}}) {
     my $dest = $destobj->{dest};
     my $num = $destobj->{num};
     my $destdir = $dest->{dir};

     dbg "  linking $num into $destdir";

     my $i;
     for ($i = 0; $i < $num; $i++)
     {
       my $srcname = shift @files;
       if (!$srcname) {
         # die "oops! ran out of srcs. at $i / $num. dump: ".Dumper($destobj);
         last;
       }

       my $dstname = $srcname;
       $dstname =~ s/[^-_\.A-Za-z0-9]/_/gs;
       $dstname =~ s/_+/_/gs;
       $dstname =~ s/^_//gs;
       $dstname = $destdir."/".$class."/".$dstname;

       if ($srcname !~ m,^/,) {           # unrooted.  root it
         $srcname = $cwd.'/'.$srcname;
       }

       remove_from_poss_delete($dstname);


       if (-l $dstname) {
         my $link = readlink($dstname);
         if ($link eq $srcname) {
           dbg "  $srcname already linked to $dstname";
           next;
         }
         unlink $dstname;
       }

       if (symlink($srcname, $dstname)) {
         dbg "  $srcname -> $dstname";
       } else {
         warn "symlink $srcname -> $dstname failed: $!";
       }
     }
   }
 }

 sub allocate {
   my ($src, $dest, $nhamref, $class) = @_;
   my $nsrc = $src->{$class}{num};

   dbg "$class nsrc=$nsrc nwanted=$$nhamref";
   if ($nsrc == 0) {
     dbg "already exhausted src";
   }
   elsif ($nsrc <= $$nhamref) {
     dbg "exhausted src";
     push (@{$dest->{$class}{srcs}}, { from => $src, num => $nsrc });
     push (@{$src->{$class}{dests}}, { dest => $dest, num => $nsrc });
     $$nhamref -= $nsrc;
     $src->{$class}{num} = 0;
   }
   else {
     dbg "filled dest, some left in src";
     push (@{$dest->{$class}{srcs}}, { from => $src, num => $$nhamref });
     push (@{$src->{$class}{dests}}, { dest => $dest, num => $$nhamref });
     $src->{$class}{num} -= $$nhamref;
     $$nhamref = 0;
   }
 }

 sub mark_for_poss_deletion {
   my ($dir) = @_;

   File::Find::find({ follow => 1, wanted => sub {

       return if (/mboxcountcache$/);
       if (!-d $_) {
         my $fname = $File::Find::name;
         $poss_delete->{$fname} = 1;
         dbg("marked as deleteable: $fname");
       } else {
         # TODO: delete dirs?  for now, leave 'em behind
       }

   } }, $dir);
 }

 sub perform_poss_deletion {
   foreach my $fname (keys %{$poss_delete}) {
     unlink $fname or warn "cannot unlink $fname";
   }
 }

 sub remove_from_poss_delete {
   my ($fname) = @_;
   if (exists $poss_delete->{$fname}) {
     delete $poss_delete->{$fname};
     return 1;
   } else {
     return 0;
   }
 }

 sub mbox_count {
   my ($mboxpath) = @_;
   print "counting mbox: $mboxpath\n";
   return _mbox_extract_all($mboxpath, 1);
 }

 sub mbox_extract_all {
   my ($mboxpath) = @_;
   print "extracting mbox: $mboxpath\n";
   return _mbox_extract_all($mboxpath, 0);
 }

 sub _mbox_extract_all {
   my ($mboxpath, $justcount) = @_;

   # create an area to hold extracted mbox files
   # this cannot use $$, it must remain the same between runs
   if (!-d $mbox_work) {
     mkdir $mbox_work or die "cannot create tmpdir: $mbox_work";
     # fatal error, could be an attack
   }

   my $countcache = get_mbox_name ($mboxpath, 0);
   $countcache =~ s/OFF\d+$/mboxcountcache/gs;
   if ($justcount && -f $countcache && -M $countcache < -M $mboxpath) {
     open (CACHE, "<$countcache");
     my $count = <CACHE> + 0;
     close CACHE;
     return $count;
   }

   my $counter = 0;
   my @created_files = ();

   open (INPUT, "<$mboxpath") or die "cannot read $mboxpath";
   binmode INPUT;

   # get stat details for the input mbox
   my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
                $atime,$mtime,$ctime) = stat INPUT;

   # assumption: get_mbox_name() uses the same dir for all offsets
   my $newname = get_mbox_name ($mboxpath, 0);
   my $dir = dirname ($newname);
   if (!-d $dir) {
     mkdir $dir or die "cannot mkdir $dir";
   }
   chmod 0755, $dir or warn "cannot chmod $dir";

   my $start = 0;            # start of a message
   my $where = 0;            # current byte offset
   my $in_header = 0;        # are in we a header?
   my $fromline;

   while (!eof INPUT) {
     my $offset = $start;    # byte offset of this message

     while (<INPUT>) {

 nextfrom:
       last unless defined($_);
       if (substr($_,0,5) eq "From ") {
         $in_header = 1;
         $start = $where;
         $where = tell INPUT;
         $fromline = $_;
         last;
       }
     }
     last unless defined($_);

     # dbg "mbox From: $counter $start $where $fromline";

     if ($fromline && mbox_new_enough($fromline))
     {
       $counter++;

       if (!$justcount) {
         $newname = get_mbox_name ($mboxpath, $offset);

         if (-f $newname && (-M _ >= -M INPUT)) {
           # no need to recreate it, it's fresh

           my $past = 0;
           while (<INPUT>) {
             if ($past) {
               last if (!defined($_) || substr($_,0,5) eq "From ");
             } else {
               $past = 1;
             }
           }
         }
         else {
           seek (INPUT, $where, 0);
           open (OUTPUT, ">$newname") or die "cannot write to $newname";
           binmode OUTPUT;

           my $past = 0;
           while (<INPUT>) {
             if ($past) {
               last if (!defined($_) || substr($_,0,5) eq "From ");
             } else {
               $past = 1;
             }
             print OUTPUT;
           }

           close OUTPUT or die "failed to write to $newname";

           chmod 0644, $newname or warn "cannot chmod $newname";

           utime $atime, $mtime, $newname
                           or warn "failed to touch $newname";
         }

         push @created_files, $newname;
         remove_from_poss_delete($newname);

         $where = tell INPUT;
         $offset = $where;

         # we've already read the next "From " line, parse it now
         goto nextfrom;
       }
     }
   }
   close INPUT;

   if ($justcount) {
     open (CACHE, ">$countcache");
     print CACHE $counter;
     close CACHE;
     return $counter;
   }
   else {
     print "extracted: $mboxpath: $counter files\n";
     return @created_files;
   }
 }

 sub get_mbox_name {
   my ($mboxpath, $where) = @_;

   my $dstname = $mboxpath;
   $dstname =~ s/[^-_\.A-Za-z0-9]/_/gs;
   $dstname =~ s/_+/_/gs;
   $dstname =~ s/^_//gs;
   $dstname = $mbox_work."/".$dstname."/OFF".$where;
   return $dstname;
 }

 sub mbox_new_enough {
   my ($fromline) = @_;

   # From xscludshmkjgc@yahoo.com  Thu Apr 29 20:02:18 2004
   return unless ($fromline && $fromline =~ /^From \S+ +(.*)$/);

   $fromline = $1;
   $fromline .= " ".local_tz() unless $fromline =~ /(?:[-+]\d{4}|\b[A-Z]{2,4}\b)/;
   my $time = first_date($fromline);
   return message_is_useful_by_date($time);
 }

 sub message_is_useful_by_date {
   my ($date) = @_;

   return 0 unless $date;        # undef or 0 date = unusable

   if (!$opt_after && !$opt_before) {
     # Not using the feature
     return 1;
   }
   elsif (!$opt_before) {
     # Just care about after
     return $date > $opt_after;
   }
   else {
     return (($date < $opt_before) && ($date > $opt_after));
   }
 }

 sub dbg {
   return unless $DEBUG;
   warn "debug: ".join("", @_)."\n";
 }

 sub first_date {
   my (@strings) = @_;

   foreach my $string (@strings) {
     my $time = parse_rfc822_date($string);
     return $time if defined($time) && $time;
   }
   return undef;
 }

 ###########################################################################

 my %TZ;
 my %MONTH;
 my $LOCALTZ;

 sub tz_init {

 # timezone mappings: in case of conflicts, use RFC 2822, then most
 # common and least conflicting mapping
 %TZ = (
 	# standard
 	'UT'   => '+0000',
 	'UTC'  => '+0000',
 	# US and Canada
 	'NDT'  => '-0230',
 	'AST'  => '-0400',
 	'ADT'  => '-0300',
 	'NST'  => '-0330',
 	'EST'  => '-0500',
 	'EDT'  => '-0400',
 	'CST'  => '-0600',
 	'CDT'  => '-0500',
 	'MST'  => '-0700',
 	'MDT'  => '-0600',
 	'PST'  => '-0800',
 	'PDT'  => '-0700',
 	'HST'  => '-1000',
 	'AKST' => '-0900',
 	'AKDT' => '-0800',
 	'HADT' => '-0900',
 	'HAST' => '-1000',
 	# Europe
 	'GMT'  => '+0000',
 	'BST'  => '+0100',
 	'IST'  => '+0100',
 	'WET'  => '+0000',
 	'WEST' => '+0100',
 	'CET'  => '+0100',
 	'CEST' => '+0200',
 	'EET'  => '+0200',
 	'EEST' => '+0300',
 	'MSK'  => '+0300',
 	'MSD'  => '+0400',
 	'MET'  => '+0100',
 	'MEZ'  => '+0100',
 	'MEST' => '+0200',
 	'MESZ' => '+0200',
 	# South America
 	'BRST' => '-0200',
 	'BRT'  => '-0300',
 	# Australia
 	'AEST' => '+1000',
 	'AEDT' => '+1100',
 	'ACST' => '+0930',
 	'ACDT' => '+1030',
 	'AWST' => '+0800',
 	# New Zealand
 	'NZST' => '+1200',
 	'NZDT' => '+1300',
 	# Asia
 	'JST'  => '+0900',
 	'KST'  => '+0900',
 	'HKT'  => '+0800',
 	'SGT'  => '+0800',
 	'PHT'  => '+0800',
 	# Middle East
 	'IDT'  => '+0300',
 	);

 # month mappings
 %MONTH = (jan => 1, feb => 2, mar => 3, apr => 4, may => 5, jun => 6,
 	     jul => 7, aug => 8, sep => 9, oct => 10, nov => 11, dec => 12);

 }

 sub local_tz {
   return $LOCALTZ if defined($LOCALTZ);

   # standard method for determining local timezone
   my $time = time;
   my @g = gmtime($time);
   my @t = localtime($time);
   my $z = $t[1]-$g[1]+($t[2]-$g[2])*60+($t[7]-$g[7])*1440+($t[5]-$g[5])*525600;
   $LOCALTZ = sprintf("%+.2d%.2d", $z/60, $z%60);
   return $LOCALTZ;
 }

 sub parse_rfc822_date {
   my ($date) = @_;
   local ($_);
   my ($yyyy, $mmm, $dd, $hh, $mm, $ss, $mon, $tzoff);

   # make it a bit easier to match
   $_ = " $date "; s/, */ /gs; s/\s+/ /gs;

   # now match it in parts.  Date part first:
   if (s/ (\d+) (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) (\d{4}) / /i) {
     $dd = $1; $mon = lc($2); $yyyy = $3;
   } elsif (s/ (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) +(\d+) \d+:\d+:\d+ (\d{4}) / /i) {
     $dd = $2; $mon = lc($1); $yyyy = $3;
   } elsif (s/ (\d+) (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) (\d{2,3}) / /i) {
     $dd = $1; $mon = lc($2); $yyyy = $3;
   } else {
     dbg("util: time cannot be parsed: $date");
     return undef;
   }

   # handle two and three digit dates as specified by RFC 2822
   if (defined $yyyy) {
     if (length($yyyy) == 2 && $yyyy < 50) {
       $yyyy += 2000;
     }
     elsif (length($yyyy) != 4) {
       # three digit years and two digit years with values between 50 and 99
       $yyyy += 1900;
     }
   }

   # hh:mm:ss
   if (s/ (\d?\d):(\d\d)(:(\d\d))? / /) {
     $hh = $1; $mm = $2; $ss = $4 || 0;
   }

   # numeric timezones
   if (s/ ([-+]\d{4}) / /) {
     $tzoff = $1;
   }
   # common timezones
   elsif (s/\b([A-Z]{2,4}(?:-DST)?)\b/ / && exists $TZ{$1}) {
     $tzoff = $TZ{$1};
   }
   # all other timezones are considered equivalent to "-0000"
   $tzoff ||= '-0000';

   # months
   if (exists $MONTH{$mon}) {
     $mmm = $MONTH{$mon};
   }

   $hh ||= 0; $mm ||= 0; $ss ||= 0; $dd ||= 0; $mmm ||= 0; $yyyy ||= 0;

   # Time::Local (v1.10 at least) throws warnings when the dates cause
   # a 32-bit overflow.  So force a min/max for year.
   if ($yyyy > 2037) {
     dbg("util: date after supported range, forcing year to 2037: $date");
     $yyyy = 2037;
   }
   elsif ($yyyy < 1970) {
     dbg("util: date before supported range, forcing year to 1970: $date");
     $yyyy = 1971;
   }

   my $time;
   eval {		# could croak
     $time = timegm($ss, $mm, $hh, $dd, $mmm-1, $yyyy);
   };

   if ($@) {
     dbg("util: time cannot be parsed: $date, $yyyy-$mmm-$dd $hh:$mm:$ss: $@");
     return undef;
   }

   if ($tzoff =~ /([-+])(\d\d)(\d\d)$/)	# convert to seconds difference
   {
     $tzoff = (($2 * 60) + $3) * 60;
     if ($1 eq '-') {
       $time += $tzoff;
     } else {
       $time -= $tzoff;
     }
   }

   return $time;
 }

 sub time_to_rfc822_date {
   my($time) = @_;

   my @days = qw/Sun Mon Tue Wed Thu Fri Sat/;
   my @months = qw/Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec/;
   my @localtime = localtime($time || time);
   $localtime[5]+=1900;

   sprintf("%s, %02d %s %4d %02d:%02d:%02d %s", $days[$localtime[6]], $localtime[3],
     $months[$localtime[4]], @localtime[5,2,1,0], local_tz());
 }

 ###########################################################################

 __DATA__

 Quick test/demo.   Given the following input structure:

   src1/{ham,spam}/{1,2,3}
   src2/{ham,spam}/{1,2}
   src4/{ham,spam}/1

 and this command:

   ../mk-corpus-link-farm \
       -dest ./out1 -num 1 -dest ./out2 -num 2 -dest ./out3 -num 5 \
       src*

 we want:

   out1/{ham,spam}/1
   out2/{ham,spam}/{1,2}
   out3/{ham,spam}/{1,2,3}

   [and a warning that we exhausted the sources, because we actually
   asked for 5 mails in each class of out3.]

 test commands:

   mkdir t_splitcorpus; cd t_splitcorpus; mkdir -p src{1,2,3}/{ham,spam}
   for f in src1/{ham,spam}/{1,2,3} src2/{ham,spam}/{1,2} src3/{ham,spam}/1
   do echo > $f ; done;
   ../mk-corpus-link-farm \
       -dest ./out1 -num 1 -dest ./out2 -num 2 -dest ./out3 -num 5 \
       src*

   ../mk-corpus-link-farm \
       -dest ./out1 -num 1 -dest ./out2 -num 2 -dest ./out3 -num 5 \
       src1/*.mbox src2 src3
	#!/usr/bin/perl
	#
	# mk-corpus-link-farm - distribute a bunch of mail tidily into a set of corpora
	# (see EOF for an example/testcase)
	#
	# Note: creates symbolic links only; renaming/moving the originals will
	# cause breakage.
	#
	# <@LICENSE>
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to you under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at:
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	# </@LICENSE>


	use strict;
	use warnings;

	sub usage {
	die "
	usage: mk-corpus-link-farm [options] [dest] src [...]

	dest:
	-dest outputdir [-num num]

	options:
	-most_recent: select the most recent messages (default)
	-after=N only test mails received after time_t N (negative values
	are an offset from current time, e.g. -86400 = last day)
	or after date as parsed by Time::ParseDate (e.g. '-6 months')

	";
	}

	use Time::ParseDate;
	use Time::Local;


	use Cwd;
	use File::Path;
	use File::Find;
	use File::Basename;
	use Data::Dumper;
	use SDBM_File;
	use Fcntl;

	my $DEBUG;# $DEBUG=1;

	my @classes = qw(ham spam);
	my $srcs = [ ];
	my $dests = [ ];
	my $mbox_tmpdir = $ENV{TMPDIR} \|\| "/tmp";

	sub dbg;

	use Getopt::Long;
	our ($opt_most_recent, $opt_after, $opt_before);

	$opt_most_recent = 0;
	tz_init();

	my $curdest;
	GetOptions(
	'dest=s' => sub {
	my ($switch, $dir) = @_;
	$curdest = {
	ham => { },
	spam => { },
	dir => $dir
	};
	push (@$dests, $curdest);
	},

	'num=i' => sub {
	my ($switch, $num) = @_;
	$curdest->{num_msgs} = $num;
	},

	'most_recent' => \$opt_most_recent,
	'before=s' => \$opt_before,
	'after=s' => \$opt_after,
	) or usage();

	foreach my $arg (@ARGV) {
	push (@$srcs, { dir => $arg });
	}

	# Deal with --before and --after
	foreach my $time ($opt_before, $opt_after) {
	if ($time && $time =~ /^-\d+$/) {
	$time = time + $time;
	}
	elsif ($time && $time !~ /^-?\d+$/) {
	$time = Time::ParseDate::parsedate($time, GMT => 1, PREFER_PAST => 1);
	}
	}


	# test data: $srcs = [ { dir => "/src1", ham => { dests => [ ], dir =>
	# "/src1/ham", num => 100 }, spam => { dests => [ ], dir => "/src1/spam", num
	# => 100 }, }, { dir => "/src2", ham => { dests => [ ], dir => "/src2/ham", num
	# => 300 }, spam => { dests => [ ], dir => "/src2/spam", num => 300 }, }, { dir
	# => "/src3", ham => { dests => [ ], dir => "/src3/ham", num => 500 }, spam =>
	# { dests => [ ], dir => "/src3/spam", num => 500 }, } ];

	my $cwd = cwd();

	# use an on-disk file -- this list can get pretty big!
	my $poss_del_path = "$mbox_tmpdir/dels.$$";
	my $poss_delete;

	{
	my %h;
	tie(%h, 'SDBM_File', $poss_del_path, O_RDWR\|O_CREAT, 0600)
	or die "Couldn't tie SDBM file $poss_del_path: $!; aborting";
	$poss_delete = \%h;
	}

	my $mbox_work = "$mbox_tmpdir/mboxes.d";
	if (-d $mbox_work) {
	mark_for_poss_deletion($mbox_work);
	}

	main();

	# and clean up again.
	unlink $poss_del_path;
	unlink "$poss_del_path.dir";
	unlink "$poss_del_path.pag";
	exit;


	sub main {
	find_srcs();
	dist_across_dests();
	make_links_in_dests();
	perform_poss_deletion();
	}



	sub find_srcs {
	foreach my $src (@$srcs) {
	my $num_files;
	my @mboxes = ();

	my $cb = sub {
	if (-f $_ && -r _) {
	if ($_ =~ /\.mbox/i) {
	push @mboxes, $File::Find::name;
	} else {
	$num_files++;
	}
	}
	};

	$src->{ham} = { num => 0, dests => [ ] };
	$src->{spam} = { num => 0, dests => [ ] };

	my $try_dir = "$src->{dir}/ham";
	if (-d $try_dir) {
	$num_files = 0;
	@mboxes = ();

	File::Find::find({ wanted => $cb, follow => 1 }, $try_dir);

	foreach my $mbox (@mboxes) {
	$num_files += mbox_count($mbox);
	}

	$src->{ham}{subdir} = $try_dir;
	$src->{ham}{num} = $num_files;
	}

	$try_dir = "$src->{dir}/spam";
	if (-d $try_dir) {
	$num_files = 0;
	@mboxes = ();

	File::Find::find({ wanted => $cb, follow => 1 }, $try_dir);

	foreach my $mbox (@mboxes) {
	$num_files += mbox_count($mbox);
	}

	$src->{spam}{subdir} = $try_dir;
	$src->{spam}{num} = $num_files;
	}

	print "$src->{dir}: found $src->{ham}{num} ham, $src->{spam}{num} spam\n";
	}
	}

	sub dist_across_dests {
	my @srcorder = @$srcs;

	foreach my $dest (@$dests) {
	my %want = ();
	my $wantnum = $dest->{num_msgs} \|\| 99999999;
	foreach my $class (@classes) {
	$want{$class} = $wantnum;
	}

	$dest->{srcs} = [ ];
	print "\n$dest->{dir}: want $wantnum messages\n";

	foreach my $class (@classes) {
	foreach my $src (@srcorder) {
	last unless ($want{$class} > 0);
	allocate ($src, $dest, \$want{$class}, $class);
	}
	}

	foreach my $class (@classes) {
	print "$class:";
	($class eq 'ham') and print " ";
	my $added = 0;
	foreach my $src (@{$dest->{$class}{srcs}}) {
	print " $src->{num} of $src->{from}{$class}{subdir}";
	$added += $src->{num};
	}
	print "\n";

	if ($want{$class} > 0) {
	warn " WARNING: failed to fill $dest->{dir}/$class: ".
	"only $added, wanted $want{$class} more\n";
	}
	}

	# for the next dest, try to take some more entries from
	# other sources as well. do this by moving the source that's
	# currently at the head of the list, to the end.

	my $first = shift @srcorder;
	@srcorder = (@srcorder, $first);
	}
	}

	sub make_links_in_dests {
	foreach my $class (@classes) {
	foreach my $dest (@$dests) {
	my $dir = $dest->{dir}.'/'.$class;
	if (-d $dir) {
	mark_for_poss_deletion($dir);
	}
	else {
	mkpath($dir) or warn "cannot mkdir $dir: $!";
	}
	}
	foreach my $src (@$srcs) {
	_mklink($class, $src);
	}
	}
	}

	sub _mklink {
	my ($class, $src) = @_;

	my $srcdir = $src->{$class}{subdir};
	if (!$srcdir) {
	dbg "no srcdir, skipping $src";
	return;
	}
	if (!-d $srcdir) {
	warn "cannot read $srcdir, ignoring: $!";
	return;
	}

	dbg "linking from $srcdir";

	# create a hash of modtime -> filepath, so we can be sure we pick up
	# "new" files first if so desired. note that -M gives (now - modtime) in
	# days, so larger numbers means earlier.

	my %files = ();
	File::Find::find({ follow => 1, wanted => sub {
	return unless (-f $_ && -r _); # not a file

	my @stat = stat _;
	my $mtime = $stat[9];
	return unless message_is_useful_by_date($mtime);

	if (!exists $files{$mtime}) {
	$files{$mtime} = [ ];
	}
	if ($_ =~ /\.mbox/i) {
	push(@{$files{$mtime}}, mbox_extract_all($_));
	} else {
	push(@{$files{$mtime}}, $File::Find::name);
	}

	} }, $srcdir);

	my @files = ();
	foreach my $key (sort { $b <=> $a } keys %files) {
	push (@files, @{$files{$key}});
	}
	undef %files; # no longer need that

	# @files is now sorted with the "youngest" files first. check:
	if (scalar @files && $files[0] && $files[1] && -M $files[0] > -M $files[-1])
	{
	warn "oops! files out of order, should be youngest first: ".
	join(' ',@files);
	}

	foreach my $destobj (@{$src->{$class}{dests}}) {
	my $dest = $destobj->{dest};
	my $num = $destobj->{num};
	my $destdir = $dest->{dir};

	dbg " linking $num into $destdir";

	my $i;
	for ($i = 0; $i < $num; $i++)
	{
	my $srcname = shift @files;
	if (!$srcname) {
	# die "oops! ran out of srcs. at $i / $num. dump: ".Dumper($destobj);
	last;
	}

	my $dstname = $srcname;
	$dstname =~ s/[^-_\.A-Za-z0-9]/_/gs;
	$dstname =~ s/_+/_/gs;
	$dstname =~ s/^_//gs;
	$dstname = $destdir."/".$class."/".$dstname;

	if ($srcname !~ m,^/,) { # unrooted. root it
	$srcname = $cwd.'/'.$srcname;
	}

	remove_from_poss_delete($dstname);


	if (-l $dstname) {
	my $link = readlink($dstname);
	if ($link eq $srcname) {
	dbg " $srcname already linked to $dstname";
	next;
	}
	unlink $dstname;
	}

	if (symlink($srcname, $dstname)) {
	dbg " $srcname -> $dstname";
	} else {
	warn "symlink $srcname -> $dstname failed: $!";
	}
	}
	}
	}

	sub allocate {
	my ($src, $dest, $nhamref, $class) = @_;
	my $nsrc = $src->{$class}{num};

	dbg "$class nsrc=$nsrc nwanted=$$nhamref";
	if ($nsrc == 0) {
	dbg "already exhausted src";
	}
	elsif ($nsrc <= $$nhamref) {
	dbg "exhausted src";
	push (@{$dest->{$class}{srcs}}, { from => $src, num => $nsrc });
	push (@{$src->{$class}{dests}}, { dest => $dest, num => $nsrc });
	$$nhamref -= $nsrc;
	$src->{$class}{num} = 0;
	}
	else {
	dbg "filled dest, some left in src";
	push (@{$dest->{$class}{srcs}}, { from => $src, num => $$nhamref });
	push (@{$src->{$class}{dests}}, { dest => $dest, num => $$nhamref });
	$src->{$class}{num} -= $$nhamref;
	$$nhamref = 0;
	}
	}

	sub mark_for_poss_deletion {
	my ($dir) = @_;

	File::Find::find({ follow => 1, wanted => sub {

	return if (/mboxcountcache$/);
	if (!-d $_) {
	my $fname = $File::Find::name;
	$poss_delete->{$fname} = 1;
	dbg("marked as deleteable: $fname");
	} else {
	# TODO: delete dirs? for now, leave 'em behind
	}

	} }, $dir);
	}

	sub perform_poss_deletion {
	foreach my $fname (keys %{$poss_delete}) {
	unlink $fname or warn "cannot unlink $fname";
	}
	}

	sub remove_from_poss_delete {
	my ($fname) = @_;
	if (exists $poss_delete->{$fname}) {
	delete $poss_delete->{$fname};
	return 1;
	} else {
	return 0;
	}
	}

	sub mbox_count {
	my ($mboxpath) = @_;
	print "counting mbox: $mboxpath\n";
	return _mbox_extract_all($mboxpath, 1);
	}

	sub mbox_extract_all {
	my ($mboxpath) = @_;
	print "extracting mbox: $mboxpath\n";
	return _mbox_extract_all($mboxpath, 0);
	}

	sub _mbox_extract_all {
	my ($mboxpath, $justcount) = @_;

	# create an area to hold extracted mbox files
	# this cannot use $$, it must remain the same between runs
	if (!-d $mbox_work) {
	mkdir $mbox_work or die "cannot create tmpdir: $mbox_work";
	# fatal error, could be an attack
	}

	my $countcache = get_mbox_name ($mboxpath, 0);
	$countcache =~ s/OFF\d+$/mboxcountcache/gs;
	if ($justcount && -f $countcache && -M $countcache < -M $mboxpath) {
	open (CACHE, "<$countcache");
	my $count = <CACHE> + 0;
	close CACHE;
	return $count;
	}

	my $counter = 0;
	my @created_files = ();

	open (INPUT, "<$mboxpath") or die "cannot read $mboxpath";
	binmode INPUT;

	# get stat details for the input mbox
	my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
	$atime,$mtime,$ctime) = stat INPUT;

	# assumption: get_mbox_name() uses the same dir for all offsets
	my $newname = get_mbox_name ($mboxpath, 0);
	my $dir = dirname ($newname);
	if (!-d $dir) {
	mkdir $dir or die "cannot mkdir $dir";
	}
	chmod 0755, $dir or warn "cannot chmod $dir";

	my $start = 0; # start of a message
	my $where = 0; # current byte offset
	my $in_header = 0; # are in we a header?
	my $fromline;

	while (!eof INPUT) {
	my $offset = $start; # byte offset of this message

	while (<INPUT>) {

	nextfrom:
	last unless defined($_);
	if (substr($_,0,5) eq "From ") {
	$in_header = 1;
	$start = $where;
	$where = tell INPUT;
	$fromline = $_;
	last;
	}
	}
	last unless defined($_);

	# dbg "mbox From: $counter $start $where $fromline";

	if ($fromline && mbox_new_enough($fromline))
	{
	$counter++;

	if (!$justcount) {
	$newname = get_mbox_name ($mboxpath, $offset);

	if (-f $newname && (-M _ >= -M INPUT)) {
	# no need to recreate it, it's fresh

	my $past = 0;
	while (<INPUT>) {
	if ($past) {
	last if (!defined($_) \|\| substr($_,0,5) eq "From ");
	} else {
	$past = 1;
	}
	}
	}
	else {
	seek (INPUT, $where, 0);
	open (OUTPUT, ">$newname") or die "cannot write to $newname";
	binmode OUTPUT;

	my $past = 0;
	while (<INPUT>) {
	if ($past) {
	last if (!defined($_) \|\| substr($_,0,5) eq "From ");
	} else {
	$past = 1;
	}
	print OUTPUT;
	}

	close OUTPUT or die "failed to write to $newname";

	chmod 0644, $newname or warn "cannot chmod $newname";

	utime $atime, $mtime, $newname
	or warn "failed to touch $newname";
	}

	push @created_files, $newname;
	remove_from_poss_delete($newname);

	$where = tell INPUT;
	$offset = $where;

	# we've already read the next "From " line, parse it now
	goto nextfrom;
	}
	}
	}
	close INPUT;

	if ($justcount) {
	open (CACHE, ">$countcache");
	print CACHE $counter;
	close CACHE;
	return $counter;
	}
	else {
	print "extracted: $mboxpath: $counter files\n";
	return @created_files;
	}
	}

	sub get_mbox_name {
	my ($mboxpath, $where) = @_;

	my $dstname = $mboxpath;
	$dstname =~ s/[^-_\.A-Za-z0-9]/_/gs;
	$dstname =~ s/_+/_/gs;
	$dstname =~ s/^_//gs;
	$dstname = $mbox_work."/".$dstname."/OFF".$where;
	return $dstname;
	}

	sub mbox_new_enough {
	my ($fromline) = @_;

	# From xscludshmkjgc@yahoo.com Thu Apr 29 20:02:18 2004
	return unless ($fromline && $fromline =~ /^From \S+ +(.*)$/);

	$fromline = $1;
	$fromline .= " ".local_tz() unless $fromline =~ /(?:[-+]\d{4}\|\b[A-Z]{2,4}\b)/;
	my $time = first_date($fromline);
	return message_is_useful_by_date($time);
	}

	sub message_is_useful_by_date {
	my ($date) = @_;

	return 0 unless $date; # undef or 0 date = unusable

	if (!$opt_after && !$opt_before) {
	# Not using the feature
	return 1;
	}
	elsif (!$opt_before) {
	# Just care about after
	return $date > $opt_after;
	}
	else {
	return (($date < $opt_before) && ($date > $opt_after));
	}
	}

	sub dbg {
	return unless $DEBUG;
	warn "debug: ".join("", @_)."\n";
	}

	sub first_date {
	my (@strings) = @_;

	foreach my $string (@strings) {
	my $time = parse_rfc822_date($string);
	return $time if defined($time) && $time;
	}
	return undef;
	}

	###########################################################################

	my %TZ;
	my %MONTH;
	my $LOCALTZ;

	sub tz_init {

	# timezone mappings: in case of conflicts, use RFC 2822, then most
	# common and least conflicting mapping
	%TZ = (
	# standard
	'UT' => '+0000',
	'UTC' => '+0000',
	# US and Canada
	'NDT' => '-0230',
	'AST' => '-0400',
	'ADT' => '-0300',
	'NST' => '-0330',
	'EST' => '-0500',
	'EDT' => '-0400',
	'CST' => '-0600',
	'CDT' => '-0500',
	'MST' => '-0700',
	'MDT' => '-0600',
	'PST' => '-0800',
	'PDT' => '-0700',
	'HST' => '-1000',
	'AKST' => '-0900',
	'AKDT' => '-0800',
	'HADT' => '-0900',
	'HAST' => '-1000',
	# Europe
	'GMT' => '+0000',
	'BST' => '+0100',
	'IST' => '+0100',
	'WET' => '+0000',
	'WEST' => '+0100',
	'CET' => '+0100',
	'CEST' => '+0200',
	'EET' => '+0200',
	'EEST' => '+0300',
	'MSK' => '+0300',
	'MSD' => '+0400',
	'MET' => '+0100',
	'MEZ' => '+0100',
	'MEST' => '+0200',
	'MESZ' => '+0200',
	# South America
	'BRST' => '-0200',
	'BRT' => '-0300',
	# Australia
	'AEST' => '+1000',
	'AEDT' => '+1100',
	'ACST' => '+0930',
	'ACDT' => '+1030',
	'AWST' => '+0800',
	# New Zealand
	'NZST' => '+1200',
	'NZDT' => '+1300',
	# Asia
	'JST' => '+0900',
	'KST' => '+0900',
	'HKT' => '+0800',
	'SGT' => '+0800',
	'PHT' => '+0800',
	# Middle East
	'IDT' => '+0300',
	);

	# month mappings
	%MONTH = (jan => 1, feb => 2, mar => 3, apr => 4, may => 5, jun => 6,
	jul => 7, aug => 8, sep => 9, oct => 10, nov => 11, dec => 12);

	}

	sub local_tz {
	return $LOCALTZ if defined($LOCALTZ);

	# standard method for determining local timezone
	my $time = time;
	my @g = gmtime($time);
	my @t = localtime($time);
	my $z = $t[1]-$g[1]+($t[2]-$g[2])60+($t[7]-$g[7])1440+($t[5]-$g[5])*525600;
	$LOCALTZ = sprintf("%+.2d%.2d", $z/60, $z%60);
	return $LOCALTZ;
	}

	sub parse_rfc822_date {
	my ($date) = @_;
	local ($_);
	my ($yyyy, $mmm, $dd, $hh, $mm, $ss, $mon, $tzoff);

	# make it a bit easier to match
	$_ = " $date "; s/, */ /gs; s/\s+/ /gs;

	# now match it in parts. Date part first:
	if (s/ (\d+) (Jan\|Feb\|Mar\|Apr\|May\|Jun\|Jul\|Aug\|Sep\|Oct\|Nov\|Dec) (\d{4}) / /i) {
	$dd = $1; $mon = lc($2); $yyyy = $3;
	} elsif (s/ (Jan\|Feb\|Mar\|Apr\|May\|Jun\|Jul\|Aug\|Sep\|Oct\|Nov\|Dec) +(\d+) \d+:\d+:\d+ (\d{4}) / /i) {
	$dd = $2; $mon = lc($1); $yyyy = $3;
	} elsif (s/ (\d+) (Jan\|Feb\|Mar\|Apr\|May\|Jun\|Jul\|Aug\|Sep\|Oct\|Nov\|Dec) (\d{2,3}) / /i) {
	$dd = $1; $mon = lc($2); $yyyy = $3;
	} else {
	dbg("util: time cannot be parsed: $date");
	return undef;
	}

	# handle two and three digit dates as specified by RFC 2822
	if (defined $yyyy) {
	if (length($yyyy) == 2 && $yyyy < 50) {
	$yyyy += 2000;
	}
	elsif (length($yyyy) != 4) {
	# three digit years and two digit years with values between 50 and 99
	$yyyy += 1900;
	}
	}

	# hh:mm:ss
	if (s/ (\d?\d):(\d\d)(:(\d\d))? / /) {
	$hh = $1; $mm = $2; $ss = $4 \|\| 0;
	}

	# numeric timezones
	if (s/ ([-+]\d{4}) / /) {
	$tzoff = $1;
	}
	# common timezones
	elsif (s/\b([A-Z]{2,4}(?:-DST)?)\b/ / && exists $TZ{$1}) {
	$tzoff = $TZ{$1};
	}
	# all other timezones are considered equivalent to "-0000"
	$tzoff \|\|= '-0000';

	# months
	if (exists $MONTH{$mon}) {
	$mmm = $MONTH{$mon};
	}

	$hh \|\|= 0; $mm \|\|= 0; $ss \|\|= 0; $dd \|\|= 0; $mmm \|\|= 0; $yyyy \|\|= 0;

	# Time::Local (v1.10 at least) throws warnings when the dates cause
	# a 32-bit overflow. So force a min/max for year.
	if ($yyyy > 2037) {
	dbg("util: date after supported range, forcing year to 2037: $date");
	$yyyy = 2037;
	}
	elsif ($yyyy < 1970) {
	dbg("util: date before supported range, forcing year to 1970: $date");
	$yyyy = 1971;
	}

	my $time;
	eval { # could croak
	$time = timegm($ss, $mm, $hh, $dd, $mmm-1, $yyyy);
	};

	if ($@) {
	dbg("util: time cannot be parsed: $date, $yyyy-$mmm-$dd $hh:$mm:$ss: $@");
	return undef;
	}

	if ($tzoff =~ /([-+])(\d\d)(\d\d)$/) # convert to seconds difference
	{
	$tzoff = (($2 * 60) + $3) * 60;
	if ($1 eq '-') {
	$time += $tzoff;
	} else {
	$time -= $tzoff;
	}
	}

	return $time;
	}

	sub time_to_rfc822_date {
	my($time) = @_;

	my @days = qw/Sun Mon Tue Wed Thu Fri Sat/;
	my @months = qw/Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec/;
	my @localtime = localtime($time \|\| time);
	$localtime[5]+=1900;

	sprintf("%s, %02d %s %4d %02d:%02d:%02d %s", $days[$localtime[6]], $localtime[3],
	$months[$localtime[4]], @localtime[5,2,1,0], local_tz());
	}

	###########################################################################

	__DATA__

	Quick test/demo. Given the following input structure:

	src1/{ham,spam}/{1,2,3}
	src2/{ham,spam}/{1,2}
	src4/{ham,spam}/1

	and this command:

	../mk-corpus-link-farm \
	-dest ./out1 -num 1 -dest ./out2 -num 2 -dest ./out3 -num 5 \
	src*

	we want:

	out1/{ham,spam}/1
	out2/{ham,spam}/{1,2}
	out3/{ham,spam}/{1,2,3}

	[and a warning that we exhausted the sources, because we actually
	asked for 5 mails in each class of out3.]

	test commands:

	mkdir t_splitcorpus; cd t_splitcorpus; mkdir -p src{1,2,3}/{ham,spam}
	for f in src1/{ham,spam}/{1,2,3} src2/{ham,spam}/{1,2} src3/{ham,spam}/1
	do echo > $f ; done;
	../mk-corpus-link-farm \
	-dest ./out1 -num 1 -dest ./out2 -num 2 -dest ./out3 -num 5 \
	src*

	../mk-corpus-link-farm \
	-dest ./out1 -num 1 -dest ./out2 -num 2 -dest ./out3 -num 5 \
	src1/*.mbox src2 src3