scripts/training/get_grammar_features.pl - joshua - Git at Google

 #!/usr/bin/env perl

 use warnings;
 use strict;

 # This script retrieves the names of all the features in the grammar. Dense features
 # are named with consecutive integers starting at 0, while sparse features can have any name.
 # To get the feature names from an unpacked grammar, we have to read through the whole grammar,
 # since sparse features can be anywhere. For packed grammars, this can be read directly from
 # the encoding.
 #
 # Usage: get_grammar_features.pl GRAMMAR
 #
 # GRAMMAR is either a directory (handled as a packed grammar) or a file (read through
 # $JOSHUA/scripts/training/scat). Feature names are written to STDOUT, one per line.
 # Diagnostics are written to STDERR so they never mix with the feature list, and every
 # failure path exits with a non-zero status.

 if (@ARGV != 1) {
   print STDERR "Usage: get_grammar_features.pl GRAMMAR\n";
   exit 1;
 }

 if ((! exists $ENV{JOSHUA}) || (! -d $ENV{JOSHUA})) {
   print STDERR "* FATAL: Environment variable \$JOSHUA not set properly\n";
   # A bare "exit" would report success (status 0) to the caller.
   exit 1;
 }

 my $CAT = "$ENV{JOSHUA}/scripts/training/scat";

 my ($grammar) = @ARGV;

 if (-d $grammar) {
   # Packed grammar: the encoder configuration prints lines starting with "feature:";
   # the feature name is the last whitespace-separated field of each such line.
   # The command is run in list form so that no shell ever parses $grammar or $JOSHUA.
   # NOTE(review): -d64 is kept from the original command line; newer JVMs may reject
   # this flag -- confirm against the supported Java version.
   my @command = ('java', '-d64', '-Xmx256m', '-cp', "$ENV{JOSHUA}/class",
                  'joshua.util.encoding.EncoderConfiguration', $grammar);
   open(my $config, '-|', @command)
     or die "FATAL: can't run java to read $grammar: $!";

   my @features;
   while (my $line = <$config>) {
     next unless $line =~ /^feature:/;
     my @fields = split(' ', $line);
     push(@features, $fields[-1]);
   }
   close($config)
     or warn "* WARNING: java exited abnormally (status $?) while reading $grammar\n";

   print join("\n", @features) . "\n";

 } elsif (-e $grammar) {
   # Unpacked grammar: scan every rule, since sparse features can appear on any line.
   my %features;
   open(my $rules, '-|', $CAT, $grammar)
     or die "FATAL: can't read $grammar: $!";

   while (my $line = <$rules>) {
     chomp($line);
     my @tokens = split(/ \|\|\| /, $line);
     # field 4 for regular grammars, field 3 for phrase tables
     my $feature_str = ($line =~ /^\[/) ? $tokens[3] : $tokens[2];
     # Blank or truncated lines have no feature field at all; skip them.
     next unless defined $feature_str;

     # Unlabeled (dense) features are numbered by their position among the
     # unlabeled values of the current rule, starting at 0.
     my $feature_no = 0;
     foreach my $feature (split(' ', $feature_str)) {
       if ($feature =~ /=/) {
         # The limit of 2 keeps the name defined even for a degenerate "=" token.
         my ($name) = split(/=/, $feature, 2);
         $features{$name} = 1;
       } else {
         $features{$feature_no++} = 1;
       }
     }
   }
   close($rules)
     or warn "* WARNING: $CAT exited abnormally (status $?) while reading $grammar\n";

   # Sorted so that the output is stable across runs (hash order is not).
   print join("\n", sort(keys(%features))) . "\n";

 } else {
   print STDERR "* FATAL: grammar '$grammar' does not exist\n";
   exit 1;
 }
	#!/usr/bin/env perl

	use warnings;
	use strict;

	# This script retrieves the names of all the features in the grammar. Dense features
	# are named with consecutive integers starting at 0, while sparse features can have any name.
	# To get the feature names from an unpacked grammar, we have to read through the whole grammar,
	# since sparse features can be anywhere. For packed grammars, this can be read directly from
	# the encoding.
	#
	# Usage: get_grammar_features.pl GRAMMAR
	#
	# GRAMMAR is either a directory (handled as a packed grammar) or a file (read through
	# $JOSHUA/scripts/training/scat). Feature names are written to STDOUT, one per line.
	# Diagnostics are written to STDERR so they never mix with the feature list, and every
	# failure path exits with a non-zero status.

	if (@ARGV != 1) {
	  print STDERR "Usage: get_grammar_features.pl GRAMMAR\n";
	  exit 1;
	}

	if ((! exists $ENV{JOSHUA}) || (! -d $ENV{JOSHUA})) {
	  print STDERR "* FATAL: Environment variable \$JOSHUA not set properly\n";
	  # A bare "exit" would report success (status 0) to the caller.
	  exit 1;
	}

	my $CAT = "$ENV{JOSHUA}/scripts/training/scat";

	my ($grammar) = @ARGV;

	if (-d $grammar) {
	  # Packed grammar: the encoder configuration prints lines starting with "feature:";
	  # the feature name is the last whitespace-separated field of each such line.
	  # The command is run in list form so that no shell ever parses $grammar or $JOSHUA.
	  # NOTE(review): -d64 is kept from the original command line; newer JVMs may reject
	  # this flag -- confirm against the supported Java version.
	  my @command = ('java', '-d64', '-Xmx256m', '-cp', "$ENV{JOSHUA}/class",
	                 'joshua.util.encoding.EncoderConfiguration', $grammar);
	  open(my $config, '-|', @command)
	    or die "FATAL: can't run java to read $grammar: $!";

	  my @features;
	  while (my $line = <$config>) {
	    next unless $line =~ /^feature:/;
	    my @fields = split(' ', $line);
	    push(@features, $fields[-1]);
	  }
	  close($config)
	    or warn "* WARNING: java exited abnormally (status $?) while reading $grammar\n";

	  print join("\n", @features) . "\n";

	} elsif (-e $grammar) {
	  # Unpacked grammar: scan every rule, since sparse features can appear on any line.
	  my %features;
	  open(my $rules, '-|', $CAT, $grammar)
	    or die "FATAL: can't read $grammar: $!";

	  while (my $line = <$rules>) {
	    chomp($line);
	    # Rule fields are separated by a literal " ||| ".
	    my @tokens = split(/ \|\|\| /, $line);
	    # field 4 for regular grammars, field 3 for phrase tables
	    my $feature_str = ($line =~ /^\[/) ? $tokens[3] : $tokens[2];
	    # Blank or truncated lines have no feature field at all; skip them.
	    next unless defined $feature_str;

	    # Unlabeled (dense) features are numbered by their position among the
	    # unlabeled values of the current rule, starting at 0.
	    my $feature_no = 0;
	    foreach my $feature (split(' ', $feature_str)) {
	      if ($feature =~ /=/) {
	        # The limit of 2 keeps the name defined even for a degenerate "=" token.
	        my ($name) = split(/=/, $feature, 2);
	        $features{$name} = 1;
	      } else {
	        $features{$feature_no++} = 1;
	      }
	    }
	  }
	  close($rules)
	    or warn "* WARNING: $CAT exited abnormally (status $?) while reading $grammar\n";

	  # Sorted so that the output is stable across runs (hash order is not).
	  print join("\n", sort(keys(%features))) . "\n";

	} else {
	  print STDERR "* FATAL: grammar '$grammar' does not exist\n";
	  exit 1;
	}