#!/usr/bin/perl -w
#$ -S /usr/bin/perl

# Author: Damianos Karakos <damianos@jhu.edu>

# This script parallelizes the filtering of a grammar to a test set. The
# usage message is shown below. If you do not specify --lines, a pass over
# the grammar is made to count its lines; that count is used to split the
# file into chunks for filtering. If you have already counted the lines,
# you can pass the count with --lines to skip that pass.

use strict;
use warnings;

use Cwd;
use File::Basename;
use File::Path qw(rmtree);
use Getopt::Long;
# use POSIX qw[ceil];
# use List::Util qw[max min sum];
# use File::Temp qw/ :mktemp /;

# Root of the Joshua installation, taken from the environment. It is used
# to locate the per-piece filtering script and is forwarded to the grid
# jobs by submit_job().
my $JOSHUA = $ENV{JOSHUA};

# Show the usage message first, so that running the script without any
# arguments works even when the environment has not been set up.
if (@ARGV == 0) {
    die "Usage: $0 --corpus=<corpus> --grammar=<grammar file> --n=<number of pieces> --output_grammar=<output grammar file> --lines=<number of lines of grammar> --fast --ngrams=<maximum n n-gram to compare to> --tmpdir=<tmpdir>\n";
}

# An undefined JOSHUA would interpolate as an empty string (with an
# "uninitialized value" warning) and give a script path rooted at "/".
if (!defined $JOSHUA || $JOSHUA eq '') {
    die "ERROR: the JOSHUA environment variable is not set\n";
}

# Shell script that each grid job runs on one piece of the grammar.
my $script = "$JOSHUA/scripts/support/filtering_script.sh";

# Command-line settings and their defaults:
#   $corpus          --corpus          corpus handed to the filtering script
#   $grammar_file    --grammar         grammar to split (read through "gzip -cdf")
#   $num_pieces      --n               number of pieces to split the grammar into
#   $output_grammar  --output_grammar  where the merged, filtered grammar is written
#   $num_lines       --lines           line count of the grammar (0 = count it here)
#   $fast            --fast/--nofast   flag handed to the filtering script
#   $ngrams          --ngrams          n-gram limit handed to the filtering script
#   $tmp_dir         --tmpdir          work directory ("" = generate a name)
my ($corpus, $grammar_file, $num_pieces, $output_grammar, $num_lines, $fast, $ngrams, $tmp_dir) = ("","",0,"",0,0,12,"");

# --n and --lines are declared as integers so that GetOptions itself
# rejects non-numeric values instead of letting them reach the arithmetic.
my $retval = GetOptions(
    "grammar=s"        => \$grammar_file,
    "n=i"              => \$num_pieces,
    "output_grammar=s" => \$output_grammar,
    "corpus=s"         => \$corpus,
    "lines=i"          => \$num_lines,
    "fast!"            => \$fast,
    "ngrams=i"         => \$ngrams,
    "tmpdir=s"         => \$tmp_dir,
);

if (!$retval) {
    print "Invalid usage, quitting\n";
    exit 1;
}

# These three have no usable default: an empty grammar name would make
# gzip read standard input, and an empty corpus or output name would
# shift the arguments handed to the grid jobs.
if ($grammar_file eq "" || $corpus eq "" || $output_grammar eq "") {
    die "ERROR: --grammar, --corpus and --output_grammar are all required\n";
}

# $tmp_dir is initialised to "" above, so test for emptiness rather than
# definedness when deciding whether to generate a per-host, per-process name.
if (!defined $tmp_dir || $tmp_dir eq "") {
    my $hostname = `hostname`;
    chomp($hostname);
    $tmp_dir = "tmpdir.$hostname.$$";
}

# Report the effective settings on STDERR.
print STDERR "Grammar file: $grammar_file\n";
print STDERR "Number of pieces: $num_pieces\n";
print STDERR "Output grammar: $output_grammar\n";
print STDERR "Corpus: $corpus\n";
print STDERR "Number of lines in grammar: $num_lines\n" if ($num_lines > 0);
print STDERR "Temporary work directory: $tmp_dir\n";

# The piece size below is computed by dividing by the number of pieces, so
# reject a missing or non-positive --n here instead of failing later with
# "Illegal division by zero".
if ($num_pieces !~ /\A\d+\z/ || $num_pieces < 1) {
    die "ERROR: --n must be a positive integer (got '$num_pieces')\n";
}

# Count the grammar lines unless the caller supplied the count via --lines.
if ($num_lines == 0) {
    # List-form pipe open: the file name is passed to gzip as a single
    # argument and is never interpreted by a shell.
    open(my $count_fh, '-|', 'gzip', '-cdf', '--', $grammar_file)
        or die "ERROR: Cannot open $grammar_file\n";

    while (my $line = <$count_fh>) {
        $num_lines++;
    }

    # A failing gzip (e.g. missing file) only shows up when the pipe is
    # closed. Exit status 2 is a gzip warning and is tolerated.
    if (!close($count_fh)) {
        my $killed    = $? & 127;
        my $exit_code = $? >> 8;
        die "ERROR: gzip failed while reading $grammar_file\n"
            if $killed || $exit_code != 2;
    }

    print STDERR "Found $num_lines in $grammar_file\n";
}

# Lines per piece is the ceiling of num_lines / num_pieces.
my $num_lines_per_piece = int($num_lines / $num_pieces);
$num_lines_per_piece++ if $num_lines_per_piece * $num_pieces < $num_lines;

# Create the work directory that holds the pieces, logs and command files.
if (!-d $tmp_dir) {
    mkdir $tmp_dir or die "ERROR: Cannot create directory $tmp_dir: $!\n";
}

my $grammar_basename = basename($grammar_file);

# List-form pipe open: the grammar name reaches gzip without going
# through a shell.
open(my $grammar_in, '-|', 'gzip', '-cdf', '--', $grammar_file)
    or die "ERROR: Cannot open $grammar_file\n";

# Expected number of lines still to be read; goes negative if the grammar
# turns out to be longer than $num_lines.
my $lines_remaining = $num_lines;

# This is needed because we are using the ceiling of num_lines/num_pieces,
# so the true number of pieces may be less than the number requested.
my $actual_num_pieces = 0;

foreach my $i (1 .. $num_pieces) {
    # Read the first line before creating anything, so that no empty piece
    # is written (and no job submitted) once the grammar is exhausted.
    my $grammar_line = <$grammar_in>;
    last unless defined $grammar_line;

    $actual_num_pieces++;
    my $grammar_piece          = "$tmp_dir/$grammar_basename.$i";
    my $filtered_grammar_piece = "$grammar_piece.filtered.gz";

    open(my $piece_out, '|-', "gzip -c > $grammar_piece.gz")
        or die "ERROR: Cannot write to $grammar_piece.gz\n";

    # The final piece takes every remaining line, so nothing is dropped
    # when the real grammar is longer than the --lines value.
    my $is_last_piece  = ($i == $num_pieces);
    my $lines_in_piece = 0;
    while (defined $grammar_line) {
        print {$piece_out} $grammar_line;
        $lines_in_piece++;
        $lines_remaining--;
        last if !$is_last_piece && $lines_in_piece >= $num_lines_per_piece;
        $grammar_line = <$grammar_in>;
    }

    # gzip write errors (e.g. a full disk) only surface at close.
    close($piece_out) or die "ERROR: Cannot write to $grammar_piece.gz\n";

    my $logfile = "$tmp_dir/log.filtering.$i";

    submit_job($script, $logfile, "$grammar_piece.gz", $corpus, $filtered_grammar_piece);
}

# A failing gzip only shows up when the pipe is closed. Exit status 2 is a
# gzip warning and is tolerated.
if (!close($grammar_in)) {
    my $killed    = $? & 127;
    my $exit_code = $? >> 8;
    die "ERROR: gzip failed while reading $grammar_file\n"
        if $killed || $exit_code != 2;
}

if ($lines_remaining != 0) {
    my $lines_read = $num_lines - $lines_remaining;
    print STDERR "WARNING: expected $num_lines lines in $grammar_file but read $lines_read\n";
}

# Later stages only look at the pieces that were really created.
$num_pieces = $actual_num_pieces;

#### Monitor the jobs; the script moves on only once every job is done

my $num_finished = 0;

print STDERR "Waiting for grid jobs to finish\n";

# Poll the per-piece logs once a second. A job is counted as finished when
# the last line of its log matches /skipped/i (presumably the final line
# the filtering script prints -- confirm against filtering_script.sh).
while ($num_finished < $num_pieces) {
    $num_finished = 0;

    foreach my $i (1 .. $num_pieces) {
        my $logfile = "$tmp_dir/log.filtering.$i";

        # The log may not exist (or be readable) yet; try again next round.
        open(my $log_fh, '<', $logfile) or next;

        # Only the last line matters, so do not keep the whole log in memory.
        my $last_line;
        while (my $log_line = <$log_fh>) {
            $last_line = $log_line;
        }
        close($log_fh);

        $num_finished++ if defined $last_line && $last_line =~ m/skipped/i;
    }

    # No need to wait again once every job has reported completion.
    sleep(1) if $num_finished < $num_pieces;
}

print STDERR "Grid jobs are done -- merging the filtered files\n";


#### Merge the filtered pieces, in order, and save them in the designated location

open(my $merged_out, '|-', "gzip -c > $output_grammar")
    or die "ERROR: Cannot write to $output_grammar\n";

foreach my $i (1 .. $num_pieces) {
    my $grammar_piece = "$tmp_dir/$grammar_basename.$i";

    # Each job is given "<piece>.filtered.gz" as its output name. The bare
    # "<piece>.filtered" name is tried first and the ".gz" name second,
    # which is the order gzip itself uses when a named file is missing.
    my ($filtered_grammar_piece) =
        grep { -e $_ } ("$grammar_piece.filtered", "$grammar_piece.filtered.gz");

    # Stop before the cleanup below, so that a missing piece neither yields
    # a silently incomplete grammar nor loses the job logs.
    die "ERROR: Cannot open $grammar_piece.filtered.gz\n"
        unless defined $filtered_grammar_piece;

    # List-form pipe open: the piece name reaches gzip without a shell.
    open(my $piece_in, '-|', 'gzip', '-cdf', '--', $filtered_grammar_piece)
        or die "ERROR: Cannot open $filtered_grammar_piece\n";
    while (my $rule = <$piece_in>) {
        print {$merged_out} $rule;
    }

    # Exit status 2 is a gzip warning; anything else is a real failure.
    if (!close($piece_in)) {
        my $killed    = $? & 127;
        my $exit_code = $? >> 8;
        die "ERROR: gzip failed while reading $filtered_grammar_piece\n"
            if $killed || $exit_code != 2;
    }
}

# Write errors from the compressing gzip only surface when the pipe closes.
close($merged_out) or die "ERROR: Cannot write to $output_grammar\n";

# remove the temporary directory (without handing its name to a shell)
rmtree($tmp_dir) if $tmp_dir ne "" && -d $tmp_dir;


# Submit one grid job (via qsub) that filters a single grammar piece.
#
# Arguments:
#   $script                 - script that qsub is asked to run
#   $logfile                - file that receives the job's output (qsub -o, with -j y)
#   $grammar_piece          - gzipped grammar piece, 1st argument to $script
#   $corpus                 - corpus, 2nd argument to $script
#   $filtered_grammar_piece - output name, 3rd argument to $script
#
# The file-level $fast and $ngrams are appended as the 4th and 5th script
# arguments, and the file-level $JOSHUA is exported to the job with -v.
#
# The submitted command line is appended to a sibling of $logfile whose
# file name has its first "log" replaced by "cmd".
#
# Returns the status of qsub (0). Dies if qsub cannot be run or exits with
# a non-zero status, because the log of a job that was never submitted
# would never appear and the caller's monitoring loop would wait forever.
sub submit_job {
    my ($script, $logfile, $grammar_piece, $corpus, $filtered_grammar_piece) = @_;

    # A leftover log from an earlier run would otherwise be read by the
    # monitoring loop as if it belonged to this job.
    unlink($logfile);

    # Rewrite only the file name, so a directory that happens to contain
    # "log" (e.g. /home/logan/...) is left untouched.
    my ($log_name, $log_dir) = fileparse($logfile);
    (my $cmd_name = $log_name) =~ s/log/cmd/;
    my $cmdfile = $log_dir . $cmd_name;

    my @cmd = (
        'qsub', '-cwd', '-j', 'y', '-o', $logfile, '-v', "JOSHUA=$JOSHUA",
        $script, $grammar_piece, $corpus, $filtered_grammar_piece, $fast, $ngrams,
    );

    # Recording the command is best effort; failing to do so is not fatal.
    if (open(my $cmd_fh, '>>', $cmdfile)) {
        print {$cmd_fh} join(' ', @cmd), "\n";
        close($cmd_fh);
    }
    else {
        warn "WARNING: Cannot append to $cmdfile: $!\n";
    }

    # List form: the arguments go to qsub directly, without a shell.
    my $status = system(@cmd);
    die "ERROR: qsub failed for $grammar_piece (status $status)\n" if $status != 0;

    return $status;
}
