#!/usr/bin/perl
#
# split-log-into-buckets-cached input x:output [y:output2 ...]
#
# Split a mass-check log into several similarly-sized buckets, evenly
# taking messages from all checked corpora and preserving comments,
# writing output to the files listed on the command line.
# It does this evenly by running through all buckets sequentially
# as each line is read.
#
# Each output file must be listed in the form "n:file", where "n" is the
# number of buckets written to that file; so, for example, 1 file can
# contain 9 buckets.  Any argument not of that form names the input log.
#
# This variant operates randomly (as per -random) but caches results: if
# every output file is newer than the input, nothing is rewritten.

use strict;
use warnings;

# ---------------------------------------------------------------------------
# Command line.
#
# An argument of the form "N:FILE" requests an output FILE that receives N
# buckets' worth of lines.  Any other argument names the input log; if more
# than one is given, the last one wins.
# ---------------------------------------------------------------------------
my $input;
my @outs = ();
foreach my $arg (@ARGV) {
    if ($arg =~ /^(\d+):(.*)$/) {
        my ($count, $file) = ($1, $2);
        print "Creating $file with $count buckets\n";
        push @outs, { c => $count, out => $file };
    }
    else {
        print "Reading from $arg\n";
        $input = $arg;
    }
}

die "usage\n" unless defined $input && length $input;

# ---------------------------------------------------------------------------
# Cache check.
#
# The outputs count as up to date only if every one of them exists and has a
# modification time strictly newer than the input's.  A failed stat() yields
# an undefined mtime, which forces a rebuild.
# ---------------------------------------------------------------------------
my $input_mtime = (stat($input))[9];
my $rebuild     = 0;
foreach my $out (@outs) {
    my $output_mtime = (stat($out->{out}))[9];
    unless ($output_mtime && $input_mtime && $input_mtime < $output_mtime) {
        $rebuild = 1;
        last;
    }
}

if (!$rebuild) {
    print "Existing outputs are up-to-date\n";
    exit;
}

# Open the input before touching any output, so that an unreadable input
# does not leave stray temporary files behind.
open(my $in, '<', $input) or die "cannot open $input: $!";

my $numbuckets = 0;
$numbuckets += $_->{c} foreach @outs;
die "no buckets requested\n" unless $numbuckets;

# ---------------------------------------------------------------------------
# Output handles.
#
# Each output is written to "<name>.tmp" and renamed into place at the end.
# One handle is opened per distinct file (truncating any stale temp file from
# an aborted run), and that handle is entered into @bucket_fh once for every
# bucket the file owns.  Buckets are therefore numbered 0 .. $numbuckets-1.
# ---------------------------------------------------------------------------
my %fh_for;       # output file name => write handle on its ".tmp" file
my @bucket_fh;    # bucket index     => write handle
foreach my $out (@outs) {
    my $name = $out->{out};
    unless ($fh_for{$name}) {
        open(my $fh, '>', "$name.tmp") or die "cannot write $name.tmp: $!";
        $fh_for{$name} = $fh;
    }
    push @bucket_fh, ($fh_for{$name}) x $out->{c};
}

srand(1);    # explicitly static seed, for reproducibility

# One rand() draw per input line.  rand() is in [0,1), so the index is always
# in 0 .. $numbuckets-1 and every line lands in an existing bucket.
while (my $line = <$in>) {
    my $bucket = int(rand() * $numbuckets);
    print { $bucket_fh[$bucket] } $line
        or die "cannot write to output: $!";
}
close $in;

# Buffered write errors only surface at close(), so check it before the
# temporary files are moved over the real outputs.
foreach my $name (sort keys %fh_for) {
    close $fh_for{$name} or die "error closing $name.tmp: $!";
}
foreach my $name (sort keys %fh_for) {
    rename("$name.tmp", $name)
        or die "cannot rename $name.tmp to $name: $!";
}