scripts/training/filter-rules.pl - joshua - Git at Google

 #!/usr/bin/env perl
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

 # Removes rules from phrase tables and grammars according to various criteria.
 #
 # October 2016. It turns out that Thrax keeps *all* rules it finds for each source
 # side. In big bitexts, this can mean that common words collect tens or even hundreds
 # of thousands of translation options, due to garbage collection (Moore, 2004), all of
 # which are then retained. These can be filtered out by this script, which will reduce
 # the grammar to contain only the top 100 translation options (by count) for each source
 # side. This only works for grammars that have a 6th |||-delimited field containing
 # rule counts:
 #
 #   gzip -cd grammar.gz | filter-rules.pl -t 100 | gzip -9n > grammar-filtered.gz
 #
 # If you don't have that field, but do have a tuned model, you can also filter by using that,
 # say after tuning:
 #
 #   gzip -cd grammar.gz | filter-rules.pl -t 100 -c /path/to/joshua.config -o pt ...
 #
 # Really this should all just be built into thrax.

 use strict;
 use warnings;
 use List::Util qw/max sum/;
 use Getopt::Std;

 my %opts = ();
 my $ret = getopts("bps:uvc:t:o:f:", \%opts);

 if (!$ret) {
   print "Usage: filter-rules.pl [-u] [-s SCOPE] [-v]\n";
   print "   -b: skip blank source and target sides\n";
   print "   -p: just print the rule's scope\n";
   print "   -s SCOPE: remove rules with scope > SCOPE (Hopkins & Langmead, 2010)\n";
   print "   -u: remove abstract unary rules\n";
   print "   -v: be verbose\n";
   print "   -t: only include top N candidates\n";
   print "   -c: path to joshua config file\n";
   print "   -o: grammar owner (required for -t)\n";
   exit;
 }

 my ($total) = (0, 0);
 my %SKIPPED = (
   unary => 0,
   lex_scope => 0,
   unlex_scope => 0,
   redundant => 0,
 );


 my %WEIGHTS = ($opts{c}) ? read_weights($opts{c}) : ();

 my $prev_source = "";
 my @lines;
 while (my $line = <>) {
   my ($source, $target);
   if ($line =~ /^\[/) {
     # hierarchical grammars
     (undef, $source, $target) = split(/ \|\|\| /, $line);
   } else {
     # phrase table
     ($source, $target) = split(/ \|\|\| /, $line);
   }
   $total++;

   if ($opts{b}) {
     if ($source =~ /^\s*$/ or $target =~ /^\s*$/) {
       $SKIPPED{blanks}++;
       next;
     }
   }

   if ($opts{u}) {
     my @symbols = split(' ', $source);

      # rule passes the filter if (a) it has more than one symbol or (b)
     # it has one symbol and that symbol is not a nonterminal
     if (@symbols == 1 and $symbols[0] =~ /^\[.*,1\]$/) {
       print STDERR "SKIPPING unary abstract rule $line" if $opts{v};
       $SKIPPED{unary}++;
       next;
     }
   }

   if ($opts{s}) {
     my $scope = get_scope($source);
     if ($opts{p}) {
       chomp($source);
       print "SCOPE($source) = $scope\n";
       next;
     }
     if ($scope > $opts{s}) {
       print STDERR "SKIPPING out-of-scope rule $line" if $opts{v};
       $SKIPPED{scope}++;

       if (is_lex($source)) {
         $SKIPPED{"lex"}++;
       } else {
         $SKIPPED{"unlex"}++;
       }
       next;
     }
   }

   if ($prev_source ne "" and $source ne $prev_source) {
     filter_and_print_rules(\@lines);
     @lines = ();
   }

   $prev_source = $source;
   push(@lines, $line);
 }

 filter_and_print_rules(\@lines);

 my $skipped = sum(values(%SKIPPED));
 print STDERR "filter-rules.pl: skipped $skipped of $total rules\n";
 foreach my $key (keys %SKIPPED) {
   print STDERR "  skipped $key: $SKIPPED{$key}\n";
 }

 # If a config file was passed in, use the weights to weight all the rules for a source side, and print only the top $opts{t}
 sub filter_and_print_rules {
   my ($rulelist) = @_;
   my @rules = @$rulelist;

   my @filtered_rules = ();
   if ($opts{t} and $opts{c}) {
     my %scores;
     foreach my $rule (@rules) {
       my @tokens = split(/ \|\|\| /, $rule);
       my $features = $tokens[3];
       my @features = split(" ", $features);
       my $score = 0.0;
       for (my $i = 0; $i < @features; $i++) {
         my $feature = $features[$i];
         if ($feature =~ /=/) {
           my ($key,$value) = split("=", $feature);
           $score += $WEIGHTS{$key} * $value;
         } else {
           $score += -1 * $WEIGHTS{"tm_$opts{o}_$i"} * $feature;
         }
       }
       $scores{$rule} = $score;
     }

     my @sorted_rules = sort {$scores{$b} <=> $scores{$a}} keys(%scores);
     @filtered_rules = splice(@sorted_rules, 0, $opts{t});
     $SKIPPED{redundant} += scalar(@sorted_rules) - scalar(@filtered_rules);

   } elsif ($opts{t}) {
     # Filter using field f (0-indexed), which is assumed to be the rarity penalty field
     my %counts;
     foreach my $rule (@rules) {
       my @tokens = split(/ \|\|\| /, $rule);
       if (@tokens < 6) {
         print STDERR "* WARNING, no counts present in field, not filtering\n";
         delete $opts{t};
         last;
       }
       my $countstr = $tokens[5];
       my @counts = split(" ", $countstr);
       my $count = $counts[0];
       $counts{$rule} = $count;
     }

     if (scalar(keys(%counts)) > 0) {
       my @sorted_rules = sort { $counts{$b} <=> $counts{$a} } keys(%counts);
       @filtered_rules = splice(@sorted_rules, 0, $opts{t});
       $SKIPPED{redundant} += scalar(@sorted_rules) - scalar(@filtered_rules);
     } else {
       @filtered_rules = @rules;
     }

   } else {
     @filtered_rules = @rules;
   }

   foreach my $rule (@filtered_rules) {
     print $rule;
   }
 }

 sub get_scope {
   my ($source) = @_;

   my @tokens = split(' ', $source);

   my $scope = 0;
   for (my $i = 0; $i < @tokens; $i++) {
     my $tok = $tokens[$i];
     if (is_nt($tok) && ($i == 0 || is_nt($tokens[$i-1]))) {
       $scope++;
     }
   }
   $scope++ if (is_nt($tokens[-1]));

   return $scope;
 }

 sub is_nt {
   my ($word) = @_;

   return 1 if $word =~ /^\[.*,\d+\]$/;
   return 0;
 }

 sub is_lex {
   my ($side) = @_;
   return grep { ! is_nt($_) } split(' ', $side);
 }

 sub read_weights {
   my ($config) = @_;

   my %weights;

   open READ, $config or die;
   while (my $line = <READ>) {
     chomp($line);
     next if ($line =~ /^\s*$/ or $line =~ /^#/ or $line =~ /=/);

     my ($key, $value) = split(" ", $line);
     $weights{$key} = $value;
   }
   close READ;

   return %weights;
 }
	#!/usr/bin/env perl
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	# Removes rules from phrase tables and grammars according to various criteria.
	#
	# October 2016. It turns out that Thrax keeps all rules it finds for each source
	# side. In big bitexts, this can mean that common words collect tens or even hundreds
	# of thousands of translation options, due to garbage collection (Moore, 2004), all of
	# which are then retained. These can be filtered out by this script, which will reduce
	# the grammar to contain only the top 100 translation options (by count) for each source
	# side. This only works for grammars that have a 6th \|\|\|-delimited field containing
	# rule counts:
	#
	# gzip -cd grammar.gz \| filter-rules.pl -t 100 \| gzip -9n > grammar-filtered.gz
	#
	# If you don't have that field, but do have a tuned model, you can also filter by using that,
	# say after tuning:
	#
	# gzip -cd grammar.gz \| filter-rules.pl -t 100 -c /path/to/joshua.config -o pt ...
	#
	# Really this should all just be built into thrax.

	use strict;
	use warnings;
	use List::Util qw/max sum/;
	use Getopt::Std;

	my %opts = ();
	my $ret = getopts("bps:uvc:t:o:f:", \%opts);

	if (!$ret) {
	print "Usage: filter-rules.pl [-u] [-s SCOPE] [-v]\n";
	print " -b: skip blank source and target sides\n";
	print " -p: just print the rule's scope\n";
	print " -s SCOPE: remove rules with scope > SCOPE (Hopkins & Langmead, 2010)\n";
	print " -u: remove abstract unary rules\n";
	print " -v: be verbose\n";
	print " -t: only include top N candidates\n";
	print " -c: path to joshua config file\n";
	print " -o: grammar owner (required for -t)\n";
	exit;
	}

	my ($total) = (0, 0);
	my %SKIPPED = (
	unary => 0,
	lex_scope => 0,
	unlex_scope => 0,
	redundant => 0,
	);


	my %WEIGHTS = ($opts{c}) ? read_weights($opts{c}) : ();

	my $prev_source = "";
	my @lines;
	while (my $line = <>) {
	my ($source, $target);
	if ($line =~ /^\[/) {
	# hierarchical grammars
	(undef, $source, $target) = split(/ \\|\\|\\| /, $line);
	} else {
	# phrase table
	($source, $target) = split(/ \\|\\|\\| /, $line);
	}
	$total++;

	if ($opts{b}) {
	if ($source =~ /^\s$/ or $target =~ /^\s$/) {
	$SKIPPED{blanks}++;
	next;
	}
	}

	if ($opts{u}) {
	my @symbols = split(' ', $source);

	# rule passes the filter if (a) it has more than one symbol or (b)
	# it has one symbol and that symbol is not a nonterminal
	if (@symbols == 1 and $symbols[0] =~ /^\[.*,1\]$/) {
	print STDERR "SKIPPING unary abstract rule $line" if $opts{v};
	$SKIPPED{unary}++;
	next;
	}
	}

	if ($opts{s}) {
	my $scope = get_scope($source);
	if ($opts{p}) {
	chomp($source);
	print "SCOPE($source) = $scope\n";
	next;
	}
	if ($scope > $opts{s}) {
	print STDERR "SKIPPING out-of-scope rule $line" if $opts{v};
	$SKIPPED{scope}++;

	if (is_lex($source)) {
	$SKIPPED{"lex"}++;
	} else {
	$SKIPPED{"unlex"}++;
	}
	next;
	}
	}

	if ($prev_source ne "" and $source ne $prev_source) {
	filter_and_print_rules(\@lines);
	@lines = ();
	}

	$prev_source = $source;
	push(@lines, $line);
	}

	filter_and_print_rules(\@lines);

	my $skipped = sum(values(%SKIPPED));
	print STDERR "filter-rules.pl: skipped $skipped of $total rules\n";
	foreach my $key (keys %SKIPPED) {
	print STDERR " skipped $key: $SKIPPED{$key}\n";
	}

	# If a config file was passed in, use the weights to weight all the rules for a source side, and print only the top $opts{t}
	sub filter_and_print_rules {
	my ($rulelist) = @_;
	my @rules = @$rulelist;

	my @filtered_rules = ();
	if ($opts{t} and $opts{c}) {
	my %scores;
	foreach my $rule (@rules) {
	my @tokens = split(/ \\|\\|\\| /, $rule);
	my $features = $tokens[3];
	my @features = split(" ", $features);
	my $score = 0.0;
	for (my $i = 0; $i < @features; $i++) {
	my $feature = $features[$i];
	if ($feature =~ /=/) {
	my ($key,$value) = split("=", $feature);
	$score += $WEIGHTS{$key} * $value;
	} else {
	$score += -1 * $WEIGHTS{"tm_$opts{o}_$i"} * $feature;
	}
	}
	$scores{$rule} = $score;
	}

	my @sorted_rules = sort {$scores{$b} <=> $scores{$a}} keys(%scores);
	@filtered_rules = splice(@sorted_rules, 0, $opts{t});
	$SKIPPED{redundant} += scalar(@sorted_rules) - scalar(@filtered_rules);

	} elsif ($opts{t}) {
	# Filter using field f (0-indexed), which is assumed to be the rarity penalty field
	my %counts;
	foreach my $rule (@rules) {
	my @tokens = split(/ \\|\\|\\| /, $rule);
	if (@tokens < 6) {
	print STDERR "* WARNING, no counts present in field, not filtering\n";
	delete $opts{t};
	last;
	}
	my $countstr = $tokens[5];
	my @counts = split(" ", $countstr);
	my $count = $counts[0];
	$counts{$rule} = $count;
	}

	if (scalar(keys(%counts)) > 0) {
	my @sorted_rules = sort { $counts{$b} <=> $counts{$a} } keys(%counts);
	@filtered_rules = splice(@sorted_rules, 0, $opts{t});
	$SKIPPED{redundant} += scalar(@sorted_rules) - scalar(@filtered_rules);
	} else {
	@filtered_rules = @rules;
	}

	} else {
	@filtered_rules = @rules;
	}

	foreach my $rule (@filtered_rules) {
	print $rule;
	}
	}

	sub get_scope {
	my ($source) = @_;

	my @tokens = split(' ', $source);

	my $scope = 0;
	for (my $i = 0; $i < @tokens; $i++) {
	my $tok = $tokens[$i];
	if (is_nt($tok) && ($i == 0 \|\| is_nt($tokens[$i-1]))) {
	$scope++;
	}
	}
	$scope++ if (is_nt($tokens[-1]));

	return $scope;
	}

	sub is_nt {
	my ($word) = @_;

	return 1 if $word =~ /^\[.*,\d+\]$/;
	return 0;
	}

	sub is_lex {
	my ($side) = @_;
	return grep { ! is_nt($_) } split(' ', $side);
	}

	sub read_weights {
	my ($config) = @_;

	my %weights;

	open READ, $config or die;
	while (my $line = <READ>) {
	chomp($line);
	next if ($line =~ /^\s*$/ or $line =~ /^#/ or $line =~ /=/);

	my ($key, $value) = split(" ", $line);
	$weights{$key} = $value;
	}
	close READ;

	return %weights;
	}