#!/usr/bin/env perl
use warnings;
use strict;
# This script retrieves the names of all the features in the grammar. Dense features
# are named with consecutive integers starting at 0, while sparse features can have any name.
# To get the feature names from an unpacked grammar, we have to read through the whole grammar,
# since sparse features can be anywhere. For packed grammars, this can be read directly from
# the encoding.
# Exactly one argument is required: the grammar to inspect. The usage text goes
# to STDERR because STDOUT carries the feature list that callers capture.
if (@ARGV != 1) {
    print STDERR "Usage: get_grammar_features.pl GRAMMAR\n";
    exit 1;
}
# The helper script and the Java classes used below are located relative to
# $JOSHUA, so refuse to run unless it names an existing directory. Report the
# problem on STDERR and exit non-zero so that callers can detect the failure.
if ((! exists $ENV{JOSHUA}) || (! -d $ENV{JOSHUA})) {
    print STDERR "* FATAL: Environment variable \$JOSHUA not set properly\n";
    exit 1;
}
# Helper command used to stream the contents of an unpacked grammar file.
my $CAT = "$ENV{JOSHUA}/scripts/training/scat";
my ($grammar) = @ARGV;

if (-d $grammar) {
    # Packed grammar (a directory): ask the encoder configuration for the
    # feature names. The command is run in list form so that no shell is
    # involved and paths containing spaces or metacharacters are passed intact.
    # NOTE(review): the -d64 launcher flag is reportedly rejected by JVMs from
    # Java 10 onwards -- confirm against the JVM this is deployed with.
    my @java_cmd = (
        'java', '-d64', '-Xmx256m',
        '-cp', "$ENV{JOSHUA}/class",
        'joshua.util.encoding.EncoderConfiguration',
        $grammar,
    );
    open(my $encoder_fh, '-|', @java_cmd)
        or die "FATAL: can't run java to read the encoding of $grammar: $!";

    my @features;
    while (my $line = <$encoder_fh>) {
        # Only lines starting with "feature:" describe a feature; the name is
        # the last whitespace-separated field on the line.
        next unless $line =~ /^feature:/;
        my @fields = split(' ', $line);
        push @features, $fields[-1];
    }
    close($encoder_fh)
        or warn "WARNING: reading the encoding of $grammar did not finish cleanly (status $?)\n";

    print join("\n", @features) . $/;
} elsif (-e $grammar) {
    # Unpacked grammar (a file): every rule has to be scanned, because a named
    # (sparse) feature can appear on any line.
    my %features;

    open(my $grammar_fh, '-|', $CAT, $grammar)
        or die "FATAL: can't read $grammar: $!";
    while (my $line = <$grammar_fh>) {
        chomp($line);
        my @tokens = split(/ \|\|\| /, $line);

        # field 4 for regular grammars, field 3 for phrase tables
        my $feature_str = ($line =~ /^\[/) ? $tokens[3] : $tokens[2];

        # Blank or malformed lines have no feature field; skip them.
        next unless defined $feature_str;

        # Unlabeled features are numbered by position among the unlabeled
        # features of the rule; "name=value" features are recorded by name.
        my $feature_no = 0;
        foreach my $feature (split(' ', $feature_str)) {
            if ($feature =~ /=/) {
                my ($name) = split(/=/, $feature, 2);
                $features{$name} = 1;
            } else {
                $features{$feature_no++} = 1;
            }
        }
    }
    close($grammar_fh)
        or warn "WARNING: '$CAT $grammar' did not finish cleanly (status $?); the feature list may be incomplete\n";

    # Emit the names in a stable order: numeric (dense) names in ascending
    # order, followed by the remaining names in string order.
    my @names  = keys %features;
    my @dense  = sort { $a <=> $b }   grep { /\A[0-9]+\z/ }  @names;
    my @sparse = sort { $a cmp $b }   grep { !/\A[0-9]+\z/ } @names;
    print join("\n", @dense, @sparse) . $/;
}