scripts/training/filter-rules.pl - joshua - Git at Google

 #!/usr/bin/env perl

 # Filters grammars according to various criteria.

 use strict;
 use warnings;
 use List::Util qw/max/;
 use Getopt::Std;

 my %opts;
 my $ret = getopts("bps:uv", \%opts);

 if (!$ret) {
   print "Usage: filter-rules.pl [-u] [-s SCOPE] [-v]\n";
   print "   -b: skip blank source and target sides\n";
   print "   -p: just print the rule's scope\n";
   print "   -s SCOPE: remove rules with scope > SCOPE (Hopkins & Langmead, 2010)\n";
   print "   -u: remove abstract unary rules\n";
   print "   -v: be verbose\n";
   exit;
 }

 my ($total, $skipped) = (0, 0);
 my %skipped = (
   unary => 0,
   lex_scope => 0,
   unlex_scope => 0
 );
 while (my $line = <>) {
   my ($source, $target);
   if ($line =~ /^\[/) {
     # hierarchical grammars
     (undef, $source, $target) = split(/ \|\|\| /, $line);
   } else {
     # phrase table
     ($source, $target) = split(/ \|\|\| /, $line);
   }
   $total++;

   if ($opts{b}) {
     if ($source =~ /^\s*$/ or $target =~ /^\s*$/) {
       $skipped{blanks}++;
       $skipped++;
       next;
     }
   }

   if ($opts{u}) {
     my @symbols = split(' ', $source);

      # rule passes the filter if (a) it has more than one symbol or (b)
     # it has one symbol and that symbol is not a nonterminal
     if (@symbols == 1 and $symbols[0] =~ /^\[.*,1\]$/) {
       print STDERR "SKIPPING unary abstract rule $line" if $opts{v};
       $skipped{unary}++;
       $skipped++;
       next;
     }
   }

   if ($opts{s}) {
     my $scope = get_scope($source);
     if ($opts{p}) {
       chomp($source);
       print "SCOPE($source) = $scope\n";
       next;
     }
     if ($scope > $opts{s}) {
       print STDERR "SKIPPING out-of-scope rule $line" if $opts{v};
       $skipped{scope}++;
       $skipped++;

       if (is_lex($source)) {
         $skipped{"lex"}++;
       } else {
         $skipped{"unlex"}++;
       }
       next;
     }
   }

   print $line;
 }

 print STDERR "filter-rules.pl: skipped $skipped of $total rules\n";
 foreach my $key (keys %skipped) {
   print STDERR "  skipped $key: $skipped{$key}\n";
 }

 sub get_scope {
   my ($source) = @_;

   my @tokens = split(' ', $source);

   my $scope = 0;
   for (my $i = 0; $i < @tokens; $i++) {
     my $tok = $tokens[$i];
     if (is_nt($tok) && ($i == 0 || is_nt($tokens[$i-1]))) {
       $scope++;
     }
   }
   $scope++ if (is_nt($tokens[-1]));

   return $scope;
 }

 sub is_nt {
   my ($word) = @_;

   return 1 if $word =~ /^\[.*,\d+\]$/;
   return 0;
 }

 sub is_lex {
   my ($side) = @_;
   return grep { ! is_nt($_) } split(' ', $side);
 }
	#!/usr/bin/env perl

	# Filters grammars according to various criteria.

	use strict;
	use warnings;
	use List::Util qw/max/;
	use Getopt::Std;

	my %opts;
	my $ret = getopts("bps:uv", \%opts);

	if (!$ret) {
	print "Usage: filter-rules.pl [-u] [-s SCOPE] [-v]\n";
	print " -b: skip blank source and target sides\n";
	print " -p: just print the rule's scope\n";
	print " -s SCOPE: remove rules with scope > SCOPE (Hopkins & Langmead, 2010)\n";
	print " -u: remove abstract unary rules\n";
	print " -v: be verbose\n";
	exit;
	}

	my ($total, $skipped) = (0, 0);
	my %skipped = (
	unary => 0,
	lex_scope => 0,
	unlex_scope => 0
	);
	while (my $line = <>) {
	my ($source, $target);
	if ($line =~ /^\[/) {
	# hierarchical grammars
	(undef, $source, $target) = split(/ \\|\\|\\| /, $line);
	} else {
	# phrase table
	($source, $target) = split(/ \\|\\|\\| /, $line);
	}
	$total++;

	if ($opts{b}) {
	if ($source =~ /^\s$/ or $target =~ /^\s$/) {
	$skipped{blanks}++;
	$skipped++;
	next;
	}
	}

	if ($opts{u}) {
	my @symbols = split(' ', $source);

	# rule passes the filter if (a) it has more than one symbol or (b)
	# it has one symbol and that symbol is not a nonterminal
	if (@symbols == 1 and $symbols[0] =~ /^\[.*,1\]$/) {
	print STDERR "SKIPPING unary abstract rule $line" if $opts{v};
	$skipped{unary}++;
	$skipped++;
	next;
	}
	}

	if ($opts{s}) {
	my $scope = get_scope($source);
	if ($opts{p}) {
	chomp($source);
	print "SCOPE($source) = $scope\n";
	next;
	}
	if ($scope > $opts{s}) {
	print STDERR "SKIPPING out-of-scope rule $line" if $opts{v};
	$skipped{scope}++;
	$skipped++;

	if (is_lex($source)) {
	$skipped{"lex"}++;
	} else {
	$skipped{"unlex"}++;
	}
	next;
	}
	}

	print $line;
	}

	print STDERR "filter-rules.pl: skipped $skipped of $total rules\n";
	foreach my $key (keys %skipped) {
	print STDERR " skipped $key: $skipped{$key}\n";
	}

	sub get_scope {
	my ($source) = @_;

	my @tokens = split(' ', $source);

	my $scope = 0;
	for (my $i = 0; $i < @tokens; $i++) {
	my $tok = $tokens[$i];
	if (is_nt($tok) && ($i == 0 \|\| is_nt($tokens[$i-1]))) {
	$scope++;
	}
	}
	$scope++ if (is_nt($tokens[-1]));

	return $scope;
	}

	sub is_nt {
	my ($word) = @_;

	return 1 if $word =~ /^\[.*,\d+\]$/;
	return 0;
	}

	sub is_lex {
	my ($side) = @_;
	return grep { ! is_nt($_) } split(' ', $side);
	}