blob: 0d2b44226d084fa9966cba3a761bc1203a9e1ed3 [file] [log] [blame]
#!/usr/bin/perl
#
# split-log-into-buckets-cached x:output [y:output2 ...]
#
# Split a mass-check log into several identically-sized buckets, evenly
# taking messages from all checked corpora and preserving comments,
# writing output to the files listed on the command line.
# It does this evenly by running through all buckets sequentially
# as each line is read.
#
# Each output file must be listed, and the count of the "n" buckets
# for that file specified; so for example 1 file can contain
# 9 buckets.
#
# This variant operates randomly (as per -random) but caches results.
use strict;
use warnings;

# Parse the command line.  Arguments of the form "count:filename" declare
# an output file holding that many buckets; any other argument is taken
# as the input mass-check log to split.
my $input;
my @outs = ();
foreach my $arg (@ARGV) {
  if ($arg =~ /^(\d+):(.*)$/) {
    my $c = $1;
    my $out = $2;
    print "Creating $out with $c buckets\n";
    push (@outs, { c => $c, out => $out });
  }
  else {
    print "Reading from $arg\n";
    $input = $arg;
  }
}
# An input log is mandatory; output files without an input make no sense.
die "usage\n" unless $input;
# Decide whether a rebuild is needed by comparing modification times
# (stat field 9): every output must exist and be strictly newer than
# the input log, otherwise we regenerate everything.
my @instat = stat($input);
my $rebuild = 0;
foreach my $out (@outs) {
  my $out_mtime = (stat($out->{out}))[9];
  my $fresh = ($out_mtime && $instat[9] && $instat[9] < $out_mtime);
  $rebuild = 1 unless $fresh;
}
if ($rebuild == 0) {
  print "Existing outputs are up-to-date\n";
  exit;
}
# Open one append-mode temp filehandle per bucket.  Buckets are numbered
# 0 .. $numbuckets-1 globally, partitioned across the output files in the
# order they were listed on the command line (file 1 gets the first
# $out->{c} bucket slots, and so on).
my %buckets = ();
my $numbuckets = 0;
foreach my $out (@outs) {
  my $last = $numbuckets + $out->{c};
  for ( ; $numbuckets < $last; $numbuckets++) {
    # exploit the auto-syncing semantics of >>
    # (3-arg open, checked: a silent open failure here would lose data
    # for every line later routed to this bucket)
    open ($buckets{$numbuckets}, '>>', $out->{out}.".tmp")
      or die "cannot open $out->{out}.tmp for append: $!";
  }
}
srand (1); # explicitly static seed, for reproducibility
# Route each input line to a uniformly-random bucket, flushing as we go.
open (my $in, '<', $input) or die "cannot open $input: $!";
while (<$in>) {
  # Buckets are keyed 0 .. $numbuckets-1, so pick from that range.
  # (Was 1+int(rand()*$numbuckets), an off-by-one that never wrote to
  # bucket 0 and could select the nonexistent bucket $numbuckets.)
  select $buckets{int(rand($numbuckets))}; $| = 1;
  print $_;
}
close $in;
select STDOUT;  # restore the default output handle
foreach my $i (0 .. $numbuckets - 1) {
  # check close: buffered-write errors only surface here
  close $buckets{$i} or die "cannot close bucket $i: $!";
}
# Move each completed temp file into place; rename is atomic on POSIX
# filesystems, so readers see either the old or the new file, never a
# partial one.  A failed rename must not pass silently — the caller
# would otherwise keep using a stale output.
foreach my $out (@outs) {
  rename $out->{out}.".tmp", $out->{out}
    or die "cannot rename $out->{out}.tmp to $out->{out}: $!";
}