blob: 66b4d0f81340f393a7cde00a6eb4d795c50d04bb [file] [log] [blame]
#!/usr/bin/env perl
# This script packs a dense, unlabeled grammar, where we don't care what labels we give to
# the feature functions.
#
# Usage:
#
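# grammar-packer.pl [-m MEM] [-T /path/to/tmp] [-v] input-grammar [output-dir [packer-config]]
#
# where
#
# -m MEM is the amount of memory given to sort and to the packer (default: 8g)
# -T /path/to/tmp is the location of temporary space (default: /tmp)
# -v prints progress messages to STDERR
# input-grammar is the input grammar to be packed
# output-dir is the packed grammar directory to write to (default: grammar.packed)
# packer-config is the packer config file (default: all floats)
# NOTE: packer-config is documented here, but the script below only reads
# input-grammar and output-dir; a third argument is currently ignored.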
use strict;
use warnings;
use Getopt::Std;
use File::Temp qw/tempfile/;
use File::Basename qw/basename/;
# Command-line option defaults; getopts() overwrites whichever ones are given.
my %opts = (
  m => '8g',   # amount of memory to give the packer (also used as sort's buffer size)
  T => '/tmp', # location of temporary space
  v => 0,      # verbose
);

# getopts() returns false on an unknown switch or a missing switch argument.
# Stop with the usage message rather than continuing with half-parsed options.
getopts("m:T:v", \%opts) or usage();

# The helper scripts and the packer classes are all located relative to $JOSHUA.
my $JOSHUA = $ENV{JOSHUA} or die "you must define \$JOSHUA";

# Script used to stream the grammar file into the shell pipelines below.
my $CAT = "$JOSHUA/scripts/training/scat";
# Print a one-line synopsis to STDERR and terminate with exit status 1.
# Takes no arguments and never returns.  Only reached on an error path, so
# the text goes to STDERR like the script's other diagnostics.
sub usage {
  print STDERR "Usage: grammar-packer.pl [-m MEM] [-T /path/to/tmp] [-v] input-grammar [output-dir=grammar.packed]\n";
  exit 1;
}
# Quote a string so that the shell treats it as one literal word, even when
# it contains spaces or metacharacters.
sub _shell_quote {
  my ($word) = @_;
  $word =~ s/'/'\\''/g;
  return "'$word'";
}

# Positional arguments: the grammar to pack (required) and the output directory.
my $grammar = shift or usage();
my $output_dir = shift || "grammar.packed";

if (! -e $grammar) {
  print STDERR "* FATAL: Can't find grammar '$grammar'\n";
  exit 1;
}

# Sort the grammar or phrase table into a gzipped temporary file.
# UNLINK => 1 asks File::Temp to remove the file when the program exits,
# which covers the error exits below.
my $name = basename($grammar);
my (undef, $sorted_grammar) = tempfile("${name}XXXX", DIR => $opts{T}, UNLINK => 1);
print STDERR "Sorting grammar to $sorted_grammar...\n" if $opts{v};

my $cat_grammar = _shell_quote($CAT) . ' ' . _shell_quote($grammar);

# Peek at the first line to tell the two input formats apart: a regular
# grammar starts with a bracketed left-hand side, anything else is treated as
# a phrase table.  Backticks yield undef if the command could not be started.
my $first_line = `$cat_grammar | head -n1`;
$first_line = '' unless defined $first_line;
chomp $first_line;

# Build the sort pipeline stage by stage.  Both formats are sorted on
# whitespace-delimited field 3 (for a "[LHS] ||| source ..." line that is the
# first source-side token).
my @stages = ($cat_grammar);

# phrase-based grammar -- prepend nonterminal symbol
push @stages, _shell_quote("$JOSHUA/scripts/support/moses_phrase_to_joshua.pl")
  unless $first_line =~ /^\[/;

push @stages,
  'sort -k3,3 --buffer-size=' . _shell_quote($opts{m}) . ' -T ' . _shell_quote($opts{T}),
  'gzip -9n > ' . _shell_quote($sorted_grammar);

my $sort_cmd = join(' | ', @stages);

# A plain shell pipeline reports only the exit status of its last command
# (gzip), which would hide a failing sort.  Run it under bash with pipefail so
# that a failure in any stage is detected.
if (system('bash', '-c', "set -o pipefail; $sort_cmd") != 0) {
  print STDERR "* FATAL: Couldn't sort the grammar (not enough memory? short on tmp space?)\n";
  exit 2;
}

$grammar = $sorted_grammar;

# Do the packing using the config.  The list form of system() bypasses the
# shell, so paths with spaces or metacharacters reach java unchanged.
my @pack_cmd = ('java', "-Xmx$opts{m}", '-cp', "$JOSHUA/class",
                'joshua.tools.GrammarPacker', '-p', $output_dir, '-g', $grammar);
print STDERR "Packing with @pack_cmd...\n" if $opts{v};

if (system(@pack_cmd) != 0) {
  print STDERR "* FATAL: Couldn't pack the grammar.\n";
  exit 1;
}

# Packing succeeded; the sorted intermediate file is no longer needed.
unlink($sorted_grammar);