scripts/support/filter_grammar_parallel.pl - joshua - Git at Google

 #!/usr/bin/perl -w
 #$ -S /usr/bin/perl

 # Author: Damianos Karakos <damianos@jhu.edu>

 # This script parallelizes filtering of a grammar to a test set.  The
 # usage can be seen below.  If you do not specify --lines, a pass over
 # the grammar is made to count the number of lines, which is used to
 # split the file into chunks for filtering.  If you have already
 # counted the lines, you can pass it as this argument.

 use strict;
 use warnings;
 use Getopt::Long;
 use File::Basename;
 use Cwd;
 # use POSIX qw[ceil];
 # use List::Util qw[max min sum];
 # use File::Temp qw/ :mktemp /;

 # ---------------------------------------------------------------------
 # Command-line handling and basic configuration.
 #
 # Reads the JOSHUA environment variable, parses the options, validates
 # the ones the rest of the script cannot work without, and picks a
 # default temporary directory when --tmpdir is not supplied.
 # ---------------------------------------------------------------------

 my $usage = "Usage: $0 --corpus=<corpus> --grammar=<grammar file> --n=<number of pieces> --output_grammar=<output grammar file> --lines=<number of lines of grammar> --fast --ngrams=<maximum n n-gram to compare to> --tmpdir=<tmpdir>\n";

 if(@ARGV == 0)
 {
     die $usage;
 }

 # Root of the Joshua installation; it is also forwarded to every grid job.
 my $JOSHUA = $ENV{JOSHUA};
 die "ERROR: the JOSHUA environment variable is not set\n"
     unless defined $JOSHUA && $JOSHUA ne "";

 # Script that each grid job runs on one piece of the grammar.
 my $script = "$JOSHUA/scripts/support/filtering_script.sh";

 # Defaults: no files, 0 pieces, 0 lines (= count them), not fast, 12-grams,
 # and an empty tmpdir (= generate one below).
 my ($corpus, $grammar_file, $num_pieces, $output_grammar, $num_lines, $fast, $ngrams, $tmp_dir) = ("","",0,"",0,0,12,"");

 my $retval = GetOptions(
     "grammar=s"         => \$grammar_file,
     "n=s"               => \$num_pieces,
     "output_grammar=s"  => \$output_grammar,
     "corpus=s"          => \$corpus,
     "lines=s"           => \$num_lines,
     "fast!"             => \$fast,
     "ngrams=i"          => \$ngrams,
     "tmpdir=s"          => \$tmp_dir,
 );

 if (! $retval) {
     print STDERR "Invalid usage, quitting\n";
     exit 1;
 }

 # Fail early on values that would otherwise surface later as a division
 # by zero or as a malformed shell command.
 die "ERROR: --grammar is required\n$usage"        if $grammar_file eq "";
 die "ERROR: --output_grammar is required\n$usage" if $output_grammar eq "";
 die "ERROR: --corpus is required\n$usage"         if $corpus eq "";
 die "ERROR: --n must be a positive integer (got '$num_pieces')\n"
     unless $num_pieces =~ /\A[0-9]+\z/ && $num_pieces > 0;
 die "ERROR: --lines must be a non-negative integer (got '$num_lines')\n"
     unless $num_lines =~ /\A[0-9]+\z/;

 # $tmp_dir is initialised to "" above, so test for emptiness rather than
 # definedness; otherwise the default directory would never be chosen.
 if ($tmp_dir eq "") {
     my $proc_id = $$;
     my $hostname = `hostname`;
     chomp($hostname);
     $tmp_dir = "tmpdir.$hostname.$proc_id";
 }

 # Echo the effective configuration on STDERR.
 print STDERR "Grammar file: $grammar_file\n";
 print STDERR "Number of pieces: $num_pieces\n";
 print STDERR "Output grammar: $output_grammar\n";
 print STDERR "Corpus: $corpus\n";
 print STDERR "Number of lines in grammar: $num_lines\n" if($num_lines > 0);
 print STDERR "Temporary work directory: $tmp_dir\n";

 # When --lines was not given, make one pass over the grammar to count
 # its lines ("gzip -cdf" also passes uncompressed input through).
 if($num_lines == 0)
 {
     # List-form pipe open: the file name goes straight to gzip and is
     # never interpreted by a shell, so spaces or metacharacters are safe.
     open(my $count_fh, '-|', 'gzip', '-cdf', $grammar_file)
         or die "ERROR: Cannot open $grammar_file\n";

     while(<$count_fh>)
     {
         $num_lines++;
     }

     # close() on a pipe reports the child's exit status; a failing gzip
     # (e.g. missing file) must not be mistaken for an empty grammar.
     close($count_fh)
         or die "ERROR: Cannot open $grammar_file\n";

     print STDERR "Found $num_lines in $grammar_file\n";
 }


 # ---------------------------------------------------------------------
 # Split the grammar into at most $num_pieces gzipped pieces inside
 # $tmp_dir and submit one filtering job per piece.
 # ---------------------------------------------------------------------

 # Guard the division below.
 die "ERROR: the number of pieces (--n) must be a positive integer\n"
     unless $num_pieces =~ /\A[0-9]+\z/ && $num_pieces > 0;

 # Lines per piece is the ceiling of num_lines / num_pieces.
 my $num_lines_per_piece = int($num_lines / $num_pieces);
 $num_lines_per_piece++ if $num_lines_per_piece * $num_pieces < $num_lines;

 unless (-d $tmp_dir)
 {
     mkdir $tmp_dir or die "ERROR: Cannot create directory '$tmp_dir': $!\n";
 }

 my $grammar_basename = basename($grammar_file);

 # List-form pipe open: the grammar file name is not passed through a shell.
 open(my $grammar_fh, '-|', 'gzip', '-cdf', $grammar_file)
     or die "ERROR: Cannot open $grammar_file\n";

 my $lines_remaining = $num_lines;

 my $actual_num_pieces = 0;      # This is needed because we are using the ceiling of num_lines/num_pieces, so the true number of pieces may be less

 foreach my $i (1..$num_pieces)
 {
     $actual_num_pieces++;
     my $grammar_piece = "$tmp_dir/$grammar_basename.$i";
     my $filtered_grammar_piece = "$grammar_piece.filtered.gz";

     open(my $piece_fh, '|-', "gzip -c > $grammar_piece.gz")
         or die "ERROR: Cannot write to $grammar_piece.gz\n";

     foreach my $line_no (1..$num_lines_per_piece)
     {
         my $grammar_line = <$grammar_fh>;

         # The grammar ended before the expected number of lines was read
         # (e.g. --lines overstated it): stop instead of printing undef.
         if (! defined $grammar_line)
         {
             $lines_remaining = 0;
             last;
         }

         print {$piece_fh} $grammar_line;
         $lines_remaining--;
         last if($lines_remaining == 0);
     }

     # Closing waits for the gzip writer, so the piece is complete on disk
     # before the job that reads it is submitted.
     close($piece_fh)
         or die "ERROR: Cannot write to $grammar_piece.gz\n";

     my $logfile = "$tmp_dir/log.filtering.$i";

     submit_job($script, $logfile, "$grammar_piece.gz", $corpus, $filtered_grammar_piece);

     last if ($lines_remaining == 0);
 }

 # Status deliberately ignored: gzip may exit on SIGPIPE if we stopped early.
 close($grammar_fh);

 $num_pieces = $actual_num_pieces;

 #### Monitor the jobs and merge the resulting files if all jobs are done
 #
 # A job counts as finished when the last line of its log file
 # ($tmp_dir/log.filtering.<i>) contains "skipped" (case-insensitive).
 # The logs are polled once per second until every piece is finished.
 # NOTE(review): a job that dies without writing such a line keeps this
 # loop waiting forever.

 my $num_finished = 0;

 print STDERR "Waiting for grid jobs to finish\n";

 while($num_finished < $num_pieces)
 {
     $num_finished = 0;
     foreach my $i (1..$num_pieces)
     {
         # A log that does not exist yet simply means "not finished".
         open(my $log_fh, '<', "$tmp_dir/log.filtering.$i") or next;

         # Only the final line matters, so keep just that one in memory.
         my $last_line;
         while (my $log_line = <$log_fh>)
         {
             $last_line = $log_line;
         }
         close($log_fh);

         if(defined($last_line) && ($last_line =~ m/skipped/i))
         {
             $num_finished++;
         }
     }

     # No need to wait once every job has reported completion.
     last if($num_finished >= $num_pieces);
     sleep(1);
 }

 print STDERR "Grid jobs are done -- merging the filtered files\n";


 #### We will merge the resulting filtered files and save them in the designated location
 #
 # The filtered pieces are concatenated in piece order into one gzipped
 # output grammar, after which the temporary directory is removed.

 open(my $merged_fh, '|-', "gzip -c > $output_grammar")
     or die "ERROR: Cannot write to $output_grammar\n";

 foreach my $i (1..$num_pieces)
 {
     my $grammar_piece = "$tmp_dir/$grammar_basename.$i";

     # The jobs are submitted with "<piece>.filtered.gz" as their output
     # name, so read that file; fall back to the un-suffixed name that
     # was used here previously if it is not present.
     my $filtered_grammar_piece = "$grammar_piece.filtered.gz";
     $filtered_grammar_piece = "$grammar_piece.filtered"
         unless -e $filtered_grammar_piece;

     # List-form pipe open: the piece name is not passed through a shell.
     open(my $piece_fh, '-|', 'gzip', '-cdf', $filtered_grammar_piece)
         or die "ERROR: Cannot open $filtered_grammar_piece\n";
     while(my $rule = <$piece_fh>)
     {
         print {$merged_fh} $rule;
     }

     # A failing gzip (missing or corrupt piece) must abort the merge
     # rather than silently produce an incomplete grammar.
     close($piece_fh)
         or die "ERROR: Cannot open $filtered_grammar_piece\n";
 }

 # Closing waits for the gzip writer and surfaces write errors.
 close($merged_fh)
     or die "ERROR: Cannot write to $output_grammar\n";

 # remove the temporary directory (list form: no shell, and never run
 # with an empty path)
 system('rm', '-rf', '--', $tmp_dir) if($tmp_dir ne "");


 # Submit one filtering job to the grid with qsub.
 #
 # Arguments:
 #   $script                 - script the job runs
 #   $logfile                - file that receives the job's combined output
 #   $grammar_piece          - gzipped grammar piece to filter
 #   $corpus                 - corpus to filter against
 #   $filtered_grammar_piece - output path handed to the script
 #
 # Also reads the file-level $JOSHUA, $fast and $ngrams. The command is
 # appended to a "cmd.*" file next to the log file. Dies if qsub cannot
 # be run or exits with a non-zero status.
 sub submit_job
 {
     my ($script, $logfile, $grammar_piece, $corpus, $filtered_grammar_piece) = @_;

     # Remove any log left over from a previous run.
     unlink($logfile);

     # Derive the command file name by renaming only the last path
     # component (log.* -> cmd.*); a "log" inside a directory name, such
     # as a host called "login1", must be left untouched.
     my $cmdfile = $logfile;
     $cmdfile =~ s{log([^/]*)\z}{cmd$1};

     # Argument list rather than a shell string: paths with spaces or
     # metacharacters are passed through intact.
     my @cmd = ('qsub', '-cwd', '-j', 'y', '-o', $logfile,
                '-v', "JOSHUA=$JOSHUA",
                $script, $grammar_piece, $corpus, $filtered_grammar_piece,
                $fast, $ngrams);

     # Recording the command is best-effort and must not block submission.
     if (open(my $cmd_fh, '>>', $cmdfile))
     {
         print {$cmd_fh} join(' ', @cmd), "\n";
         close($cmd_fh);
     }
     else
     {
         warn "WARNING: Cannot append to $cmdfile: $!\n";
     }

     # A failed submission would otherwise go unnoticed by the caller.
     system(@cmd) == 0
         or die "ERROR: qsub failed for $grammar_piece (status $?)\n";

     return;
 }
	#!/usr/bin/perl -w
	#$ -S /usr/bin/perl

	# Author: Damianos Karakos <damianos@jhu.edu>

	# This script parallelizes filtering of a grammar to a test set. The
	# usage can be seen below. If you do not specify --lines, a pass over
	# the grammar is made to count the number of lines, which is used to
	# split the file into chunks for filtering. If you have already
	# counted the lines, you can pass it as this argument.

	use strict;
	use warnings;
	use Getopt::Long;
	use File::Basename;
	use Cwd;
	# use POSIX qw[ceil];
	# use List::Util qw[max min sum];
	# use File::Temp qw/ :mktemp /;

	# ---------------------------------------------------------------------
	# Command-line handling and basic configuration.
	#
	# Reads the JOSHUA environment variable, parses the options, validates
	# the ones the rest of the script cannot work without, and picks a
	# default temporary directory when --tmpdir is not supplied.
	# ---------------------------------------------------------------------

	my $usage = "Usage: $0 --corpus=<corpus> --grammar=<grammar file> --n=<number of pieces> --output_grammar=<output grammar file> --lines=<number of lines of grammar> --fast --ngrams=<maximum n n-gram to compare to> --tmpdir=<tmpdir>\n";

	if(@ARGV == 0)
	{
	    die $usage;
	}

	# Root of the Joshua installation; it is also forwarded to every grid job.
	my $JOSHUA = $ENV{JOSHUA};
	die "ERROR: the JOSHUA environment variable is not set\n"
	    unless defined $JOSHUA && $JOSHUA ne "";

	# Script that each grid job runs on one piece of the grammar.
	my $script = "$JOSHUA/scripts/support/filtering_script.sh";

	# Defaults: no files, 0 pieces, 0 lines (= count them), not fast, 12-grams,
	# and an empty tmpdir (= generate one below).
	my ($corpus, $grammar_file, $num_pieces, $output_grammar, $num_lines, $fast, $ngrams, $tmp_dir) = ("","",0,"",0,0,12,"");

	my $retval = GetOptions(
	    "grammar=s"         => \$grammar_file,
	    "n=s"               => \$num_pieces,
	    "output_grammar=s"  => \$output_grammar,
	    "corpus=s"          => \$corpus,
	    "lines=s"           => \$num_lines,
	    "fast!"             => \$fast,
	    "ngrams=i"          => \$ngrams,
	    "tmpdir=s"          => \$tmp_dir,
	);

	if (! $retval) {
	    print STDERR "Invalid usage, quitting\n";
	    exit 1;
	}

	# Fail early on values that would otherwise surface later as a division
	# by zero or as a malformed shell command.
	die "ERROR: --grammar is required\n$usage"        if $grammar_file eq "";
	die "ERROR: --output_grammar is required\n$usage" if $output_grammar eq "";
	die "ERROR: --corpus is required\n$usage"         if $corpus eq "";
	die "ERROR: --n must be a positive integer (got '$num_pieces')\n"
	    unless $num_pieces =~ /\A[0-9]+\z/ && $num_pieces > 0;
	die "ERROR: --lines must be a non-negative integer (got '$num_lines')\n"
	    unless $num_lines =~ /\A[0-9]+\z/;

	# $tmp_dir is initialised to "" above, so test for emptiness rather than
	# definedness; otherwise the default directory would never be chosen.
	if ($tmp_dir eq "") {
	    my $proc_id = $$;
	    my $hostname = `hostname`;
	    chomp($hostname);
	    $tmp_dir = "tmpdir.$hostname.$proc_id";
	}

	# Echo the effective configuration on STDERR.
	print STDERR "Grammar file: $grammar_file\n";
	print STDERR "Number of pieces: $num_pieces\n";
	print STDERR "Output grammar: $output_grammar\n";
	print STDERR "Corpus: $corpus\n";
	print STDERR "Number of lines in grammar: $num_lines\n" if($num_lines > 0);
	print STDERR "Temporary work directory: $tmp_dir\n";

	# When --lines was not given, make one pass over the grammar to count
	# its lines ("gzip -cdf" also passes uncompressed input through).
	if($num_lines == 0)
	{
	    # List-form pipe open: the file name goes straight to gzip and is
	    # never interpreted by a shell, so spaces or metacharacters are safe.
	    open(my $count_fh, '-|', 'gzip', '-cdf', $grammar_file)
	        or die "ERROR: Cannot open $grammar_file\n";

	    while(<$count_fh>)
	    {
	        $num_lines++;
	    }

	    # close() on a pipe reports the child's exit status; a failing gzip
	    # (e.g. missing file) must not be mistaken for an empty grammar.
	    close($count_fh)
	        or die "ERROR: Cannot open $grammar_file\n";

	    print STDERR "Found $num_lines in $grammar_file\n";
	}


	# ---------------------------------------------------------------------
	# Split the grammar into at most $num_pieces gzipped pieces inside
	# $tmp_dir and submit one filtering job per piece.
	# ---------------------------------------------------------------------

	# Guard the division below.
	die "ERROR: the number of pieces (--n) must be a positive integer\n"
	    unless $num_pieces =~ /\A[0-9]+\z/ && $num_pieces > 0;

	# Lines per piece is the ceiling of num_lines / num_pieces.
	my $num_lines_per_piece = int($num_lines / $num_pieces);
	$num_lines_per_piece++ if $num_lines_per_piece * $num_pieces < $num_lines;

	unless (-d $tmp_dir)
	{
	    mkdir $tmp_dir or die "ERROR: Cannot create directory '$tmp_dir': $!\n";
	}

	my $grammar_basename = basename($grammar_file);

	# List-form pipe open: the grammar file name is not passed through a shell.
	open(my $grammar_fh, '-|', 'gzip', '-cdf', $grammar_file)
	    or die "ERROR: Cannot open $grammar_file\n";

	my $lines_remaining = $num_lines;

	my $actual_num_pieces = 0; # This is needed because we are using the ceiling of num_lines/num_pieces, so the true number of pieces may be less

	foreach my $i (1..$num_pieces)
	{
	    $actual_num_pieces++;
	    my $grammar_piece = "$tmp_dir/$grammar_basename.$i";
	    my $filtered_grammar_piece = "$grammar_piece.filtered.gz";

	    open(my $piece_fh, '|-', "gzip -c > $grammar_piece.gz")
	        or die "ERROR: Cannot write to $grammar_piece.gz\n";

	    foreach my $line_no (1..$num_lines_per_piece)
	    {
	        my $grammar_line = <$grammar_fh>;

	        # The grammar ended before the expected number of lines was read
	        # (e.g. --lines overstated it): stop instead of printing undef.
	        if (! defined $grammar_line)
	        {
	            $lines_remaining = 0;
	            last;
	        }

	        print {$piece_fh} $grammar_line;
	        $lines_remaining--;
	        last if($lines_remaining == 0);
	    }

	    # Closing waits for the gzip writer, so the piece is complete on disk
	    # before the job that reads it is submitted.
	    close($piece_fh)
	        or die "ERROR: Cannot write to $grammar_piece.gz\n";

	    my $logfile = "$tmp_dir/log.filtering.$i";

	    submit_job($script, $logfile, "$grammar_piece.gz", $corpus, $filtered_grammar_piece);

	    last if ($lines_remaining == 0);
	}

	# Status deliberately ignored: gzip may exit on SIGPIPE if we stopped early.
	close($grammar_fh);

	$num_pieces = $actual_num_pieces;

	#### Monitor the jobs and merge the resulting files if all jobs are done
	#
	# A job counts as finished when the last line of its log file
	# ($tmp_dir/log.filtering.<i>) contains "skipped" (case-insensitive).
	# The logs are polled once per second until every piece is finished.
	# NOTE(review): a job that dies without writing such a line keeps this
	# loop waiting forever.

	my $num_finished = 0;

	print STDERR "Waiting for grid jobs to finish\n";

	while($num_finished < $num_pieces)
	{
	    $num_finished = 0;
	    foreach my $i (1..$num_pieces)
	    {
	        # A log that does not exist yet simply means "not finished".
	        open(my $log_fh, '<', "$tmp_dir/log.filtering.$i") or next;

	        # Only the final line matters, so keep just that one in memory.
	        my $last_line;
	        while (my $log_line = <$log_fh>)
	        {
	            $last_line = $log_line;
	        }
	        close($log_fh);

	        if(defined($last_line) && ($last_line =~ m/skipped/i))
	        {
	            $num_finished++;
	        }
	    }

	    # No need to wait once every job has reported completion.
	    last if($num_finished >= $num_pieces);
	    sleep(1);
	}

	print STDERR "Grid jobs are done -- merging the filtered files\n";


	#### We will merge the resulting filtered files and save them in the designated location
	#
	# The filtered pieces are concatenated in piece order into one gzipped
	# output grammar, after which the temporary directory is removed.

	open(my $merged_fh, '|-', "gzip -c > $output_grammar")
	    or die "ERROR: Cannot write to $output_grammar\n";

	foreach my $i (1..$num_pieces)
	{
	    my $grammar_piece = "$tmp_dir/$grammar_basename.$i";

	    # The jobs are submitted with "<piece>.filtered.gz" as their output
	    # name, so read that file; fall back to the un-suffixed name that
	    # was used here previously if it is not present.
	    my $filtered_grammar_piece = "$grammar_piece.filtered.gz";
	    $filtered_grammar_piece = "$grammar_piece.filtered"
	        unless -e $filtered_grammar_piece;

	    # List-form pipe open: the piece name is not passed through a shell.
	    open(my $piece_fh, '-|', 'gzip', '-cdf', $filtered_grammar_piece)
	        or die "ERROR: Cannot open $filtered_grammar_piece\n";
	    while(my $rule = <$piece_fh>)
	    {
	        print {$merged_fh} $rule;
	    }

	    # A failing gzip (missing or corrupt piece) must abort the merge
	    # rather than silently produce an incomplete grammar.
	    close($piece_fh)
	        or die "ERROR: Cannot open $filtered_grammar_piece\n";
	}

	# Closing waits for the gzip writer and surfaces write errors.
	close($merged_fh)
	    or die "ERROR: Cannot write to $output_grammar\n";

	# remove the temporary directory (list form: no shell, and never run
	# with an empty path)
	system('rm', '-rf', '--', $tmp_dir) if($tmp_dir ne "");


	# Submit one filtering job to the grid with qsub.
	#
	# Arguments:
	#   $script                 - script the job runs
	#   $logfile                - file that receives the job's combined output
	#   $grammar_piece          - gzipped grammar piece to filter
	#   $corpus                 - corpus to filter against
	#   $filtered_grammar_piece - output path handed to the script
	#
	# Also reads the file-level $JOSHUA, $fast and $ngrams. The command is
	# appended to a "cmd.*" file next to the log file. Dies if qsub cannot
	# be run or exits with a non-zero status.
	sub submit_job
	{
	    my ($script, $logfile, $grammar_piece, $corpus, $filtered_grammar_piece) = @_;

	    # Remove any log left over from a previous run.
	    unlink($logfile);

	    # Derive the command file name by renaming only the last path
	    # component (log.* -> cmd.*); a "log" inside a directory name, such
	    # as a host called "login1", must be left untouched.
	    my $cmdfile = $logfile;
	    $cmdfile =~ s{log([^/]*)\z}{cmd$1};

	    # Argument list rather than a shell string: paths with spaces or
	    # metacharacters are passed through intact.
	    my @cmd = ('qsub', '-cwd', '-j', 'y', '-o', $logfile,
	               '-v', "JOSHUA=$JOSHUA",
	               $script, $grammar_piece, $corpus, $filtered_grammar_piece,
	               $fast, $ngrams);

	    # Recording the command is best-effort and must not block submission.
	    if (open(my $cmd_fh, '>>', $cmdfile))
	    {
	        print {$cmd_fh} join(' ', @cmd), "\n";
	        close($cmd_fh);
	    }
	    else
	    {
	        warn "WARNING: Cannot append to $cmdfile: $!\n";
	    }

	    # A failed submission would otherwise go unnoticed by the caller.
	    system(@cmd) == 0
	        or die "ERROR: qsub failed for $grammar_piece (status $?)\n";

	    return;
	}