blob: 33cf21ebd31d33641d5038d009415a2309c620c4 [file] [log] [blame]
#!/usr/bin/env perl
# Filters grammars according to various criteria.
use strict;
use warnings;
use List::Util qw/max/;
use Getopt::Std;
my %opts;
my $ret = getopts("bps:uv", \%opts);
if (!$ret) {
print "Usage: filter-rules.pl [-u] [-s SCOPE] [-v]\n";
print " -b: skip blank source and target sides\n";
print " -p: just print the rule's scope\n";
print " -s SCOPE: remove rules with scope > SCOPE (Hopkins & Langmead, 2010)\n";
print " -u: remove abstract unary rules\n";
print " -v: be verbose\n";
exit;
}
my ($total, $skipped) = (0, 0);
my %skipped = (
unary => 0,
lex_scope => 0,
unlex_scope => 0
);
while (my $line = <>) {
my ($source, $target);
if ($line =~ /^\[/) {
# hierarchical grammars
(undef, $source, $target) = split(/ \|\|\| /, $line);
} else {
# phrase table
($source, $target) = split(/ \|\|\| /, $line);
}
$total++;
if ($opts{b}) {
if ($source =~ /^\s*$/ or $target =~ /^\s*$/) {
$skipped{blanks}++;
$skipped++;
next;
}
}
if ($opts{u}) {
my @symbols = split(' ', $source);
# rule passes the filter if (a) it has more than one symbol or (b)
# it has one symbol and that symbol is not a nonterminal
if (@symbols == 1 and $symbols[0] =~ /^\[.*,1\]$/) {
print STDERR "SKIPPING unary abstract rule $line" if $opts{v};
$skipped{unary}++;
$skipped++;
next;
}
}
if ($opts{s}) {
my $scope = get_scope($source);
if ($opts{p}) {
chomp($source);
print "SCOPE($source) = $scope\n";
next;
}
if ($scope > $opts{s}) {
print STDERR "SKIPPING out-of-scope rule $line" if $opts{v};
$skipped{scope}++;
$skipped++;
if (is_lex($source)) {
$skipped{"lex"}++;
} else {
$skipped{"unlex"}++;
}
next;
}
}
print $line;
}
print STDERR "filter-rules.pl: skipped $skipped of $total rules\n";
foreach my $key (keys %skipped) {
print STDERR " skipped $key: $skipped{$key}\n";
}
sub get_scope {
my ($source) = @_;
my @tokens = split(' ', $source);
my $scope = 0;
for (my $i = 0; $i < @tokens; $i++) {
my $tok = $tokens[$i];
if (is_nt($tok) && ($i == 0 || is_nt($tokens[$i-1]))) {
$scope++;
}
}
$scope++ if (is_nt($tokens[-1]));
return $scope;
}
sub is_nt {
my ($word) = @_;
return 1 if $word =~ /^\[.*,\d+\]$/;
return 0;
}
sub is_lex {
my ($side) = @_;
return grep { ! is_nt($_) } split(' ', $side);
}