masses/tenpass/split-log-into-buckets-cached - spamassassin - Git at Google

 #!/usr/bin/perl
 #
 # split-log-into-buckets-cached input x:output [y:output2 ...]
 #
 # Split a mass-check log into several identically-sized buckets, evenly
 # taking messages from all checked corpora and preserving comments,
 # writing output to the files listed on the command line.
 # It does this evenly by running through all buckets sequentially
 # as each line is read.
 #
 # Each output file must be listed, and the count of the "n" buckets
 # for that file specified; so for example 1 file can contain
 # 9 buckets.
 #
 # This variant operates randomly (as per -random) but caches results.

 # Parse the command line.  An argument of the form "N:file" requests an
 # output file holding N buckets; any other argument names the input log
 # (if several are given, the last one wins).
 my $input;
 my @outs = ();
 foreach my $arg (@ARGV) {
   if ($arg =~ /^(\d+):(.*)$/) {
     my ($c, $out) = ($1, $2);
     print "Creating $out with $c buckets\n";
     push (@outs, { c => $c, out => $out });
   }
   else {
     print "Reading from $arg\n";
     $input = $arg;
   }
 }

 # Test definedness/length rather than truth so that an input file literally
 # named "0" is accepted, and insist on at least one output specification:
 # without one there is nothing to split into.
 die "usage: split-log-into-buckets-cached input x:output [y:output2 ...]\n"
   unless defined $input && length $input && @outs;

 # Decide whether the cached outputs can be reused.  An output counts as
 # fresh only if it can be stat()ed and its modification time is strictly
 # newer than the input's; a single stale or missing output forces a rebuild.
 my @instat = stat($input);
 my $rebuild = (grep {
     my $out_mtime = (stat($_->{out}))[9];
     !($out_mtime && $instat[9] && $instat[9] < $out_mtime);
   } @outs) ? 1 : 0;

 unless ($rebuild) {
   print "Existing outputs are up-to-date\n";
   exit;
 }

 # Open the bucket handles.  Buckets are numbered 0 .. $numbuckets-1; an
 # output file asked to hold N buckets gets N handles, all appending to the
 # same temporary file.
 my %buckets = ();
 my $numbuckets = 0;
 my %tmp_for = ();       # final output name => temporary file being filled

 # The temporary files are opened for append, so remove leftovers from an
 # interrupted run first.  This is a separate pass so that a file listed
 # twice is never unlinked while a handle on it is already open.
 foreach my $out (@outs) {
   my $tmp = $out->{out}.".tmp";
   if (-e $tmp) {
     unlink ($tmp) or die "cannot remove stale $tmp: $!\n";
   }
 }

 foreach my $out (@outs) {
   my $tmp = $out->{out}.".tmp";
   my $last = $numbuckets + $out->{c};
   for ( ; $numbuckets < $last; $numbuckets++) {
     # exploit the auto-syncing semantics of >>
     open ($buckets{$numbuckets}, ">>", $tmp)
       or die "cannot open $tmp for appending: $!\n";
     # Unbuffer the handle so each line is written out as soon as it is
     # printed; several handles may share one file.  The previously
     # selected handle is restored immediately.
     select ((select ($buckets{$numbuckets}), $| = 1)[0]);
     $tmp_for{$out->{out}} = $tmp;
   }
 }
 die "no buckets requested\n" unless $numbuckets > 0;

 srand (1);      # explicitly static seed, for reproducibility

 open (my $in, "<", $input) or die "cannot open $input: $!\n";
 while (my $line = <$in>) {
   # int(rand()*$numbuckets) ranges over 0 .. $numbuckets-1, which are
   # exactly the keys of %buckets.
   my $fh = $buckets{int(rand()*$numbuckets)};
   print {$fh} $line or die "write to bucket failed: $!\n";
 }
 close ($in);

 foreach my $i (0 .. $numbuckets - 1) {
   close ($buckets{$i}) or die "error closing bucket $i: $!\n";
 }

 # Move the finished temporary files into place.  Only files that actually
 # had a bucket opened are renamed, so a zero-bucket output is skipped.
 foreach my $final (sort keys %tmp_for) {
   rename ($tmp_for{$final}, $final)
     or die "cannot rename $tmp_for{$final} to $final: $!\n";
 }
	#!/usr/bin/perl
	#
	# split-log-into-buckets-cached input x:output [y:output2 ...]
	#
	# Split a mass-check log into several identically-sized buckets, evenly
	# taking messages from all checked corpora and preserving comments,
	# writing output to the files listed on the command line.
	# It does this evenly by running through all buckets sequentially
	# as each line is read.
	#
	# Each output file must be listed, and the count of the "n" buckets
	# for that file specified; so for example 1 file can contain
	# 9 buckets.
	#
	# This variant operates randomly (as per -random) but caches results.

	# Parse the command line.  An argument of the form "N:file" requests an
	# output file holding N buckets; any other argument names the input log
	# (if several are given, the last one wins).
	my $input;
	my @outs = ();
	foreach my $arg (@ARGV) {
	  if ($arg =~ /^(\d+):(.*)$/) {
	    my ($c, $out) = ($1, $2);
	    print "Creating $out with $c buckets\n";
	    push (@outs, { c => $c, out => $out });
	  }
	  else {
	    print "Reading from $arg\n";
	    $input = $arg;
	  }
	}

	# Test definedness/length rather than truth so that an input file literally
	# named "0" is accepted, and insist on at least one output specification:
	# without one there is nothing to split into.
	die "usage: split-log-into-buckets-cached input x:output [y:output2 ...]\n"
	  unless defined $input && length $input && @outs;

	# Decide whether the cached outputs can be reused.  An output counts as
	# fresh only if it can be stat()ed and its modification time is strictly
	# newer than the input's; a single stale or missing output forces a rebuild.
	my @instat = stat($input);
	my $rebuild = (grep {
	    my $out_mtime = (stat($_->{out}))[9];
	    !($out_mtime && $instat[9] && $instat[9] < $out_mtime);
	  } @outs) ? 1 : 0;

	unless ($rebuild) {
	  print "Existing outputs are up-to-date\n";
	  exit;
	}

	# Open the bucket handles.  Buckets are numbered 0 .. $numbuckets-1; an
	# output file asked to hold N buckets gets N handles, all appending to the
	# same temporary file.
	my %buckets = ();
	my $numbuckets = 0;
	my %tmp_for = ();       # final output name => temporary file being filled

	# The temporary files are opened for append, so remove leftovers from an
	# interrupted run first.  This is a separate pass so that a file listed
	# twice is never unlinked while a handle on it is already open.
	foreach my $out (@outs) {
	  my $tmp = $out->{out}.".tmp";
	  if (-e $tmp) {
	    unlink ($tmp) or die "cannot remove stale $tmp: $!\n";
	  }
	}

	foreach my $out (@outs) {
	  my $tmp = $out->{out}.".tmp";
	  my $last = $numbuckets + $out->{c};
	  for ( ; $numbuckets < $last; $numbuckets++) {
	    # exploit the auto-syncing semantics of >>
	    open ($buckets{$numbuckets}, ">>", $tmp)
	      or die "cannot open $tmp for appending: $!\n";
	    # Unbuffer the handle so each line is written out as soon as it is
	    # printed; several handles may share one file.  The previously
	    # selected handle is restored immediately.
	    select ((select ($buckets{$numbuckets}), $| = 1)[0]);
	    $tmp_for{$out->{out}} = $tmp;
	  }
	}
	die "no buckets requested\n" unless $numbuckets > 0;

	srand (1); # explicitly static seed, for reproducibility

	open (my $in, "<", $input) or die "cannot open $input: $!\n";
	while (my $line = <$in>) {
	  # int(rand()*$numbuckets) ranges over 0 .. $numbuckets-1, which are
	  # exactly the keys of %buckets.
	  my $fh = $buckets{int(rand()*$numbuckets)};
	  print {$fh} $line or die "write to bucket failed: $!\n";
	}
	close ($in);

	foreach my $i (0 .. $numbuckets - 1) {
	  close ($buckets{$i}) or die "error closing bucket $i: $!\n";
	}

	# Move the finished temporary files into place.  Only files that actually
	# had a bucket opened are renamed, so a zero-bucket output is skipped.
	foreach my $final (sort keys %tmp_for) {
	  rename ($tmp_for{$final}, $final)
	    or die "cannot rename $tmp_for{$final} to $final: $!\n";
	}