blob: f4a4a4d011768c0d4cde216fc1569235b19250a0 [file] [log] [blame]
#!/usr/bin/env perl
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script packs a dense, unlabeled grammar, where we don't care what labels we give to
# the feature functions.
#
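# Usage:
#
# grammar-packer.pl [-a] [-s] [-v] [-m MEM] [-T /path/to/tmp] -g 'grammar [grammar2 ...]' -o 'grammar.packed [grammar2.packed ...]'
#
# where
#
# -g is a space-separated list of input grammars to be packed
# -o is a space-separated list of packed grammar directories to write to, one per grammar
# -a indicates that alignments are included in the grammar(s)
# -s indicates that the grammars are already sorted, so sorting is skipped
# -v turns on verbose output
# -m is the amount of memory to give the sort and the packer (default: 8g)
# -T is the location of temporary space (default: /tmp)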
#
# This script *requires* the grammar to be in Hiero format. If you are working with a
# phrase-based model (either extracted from Thrax or from Moses) you must first convert it
# to Hiero format using the script $JOSHUA/scripts/support/phrase2hiero.py.
use strict;
use warnings;
use Getopt::Std;
use File::Temp qw/tempfile/;
use File::Basename qw/basename/;
# Command-line options and their defaults; getopts() overwrites the entry for
# every switch that is actually supplied.
my %opts = (
  a => 0,      # whether alignments are included in the grammar(s)
  g => '',     # space-separated list of grammars to pack
  o => '',     # space-separated list of grammar output directories
  m => '8g',   # amount of memory to give the sort and the packer
  T => '/tmp', # location of temporary space
  v => 0,      # verbose
  s => 0,      # grammars have already been UNIX sorted, skip sorting
);

# usage() prints the synopsis and exits by itself, so it is called directly
# both for an unknown switch and for stray positional arguments.
getopts("am:T:vg:o:s", \%opts) or usage();
usage() if @ARGV;

# The helper scripts and the packer jar are located relative to $JOSHUA.
my $JOSHUA = $ENV{JOSHUA} or die "you must define \$JOSHUA";
my $CAT = "$JOSHUA/scripts/training/scat";
# Prints a one-line synopsis of every switch accepted by getopts() and
# terminates the script with exit status 1. Never returns.
sub usage {
  print "Usage: grammar-packer.pl [-a] [-s] [-v] [-m MEM] [-T /path/to/tmp] -g 'grammar [grammar2 ...]' -o 'grammar.packed [grammar2.packed ...]'\n";
  exit 1;
}
# Both option values are whitespace-separated lists; an empty option value
# yields an empty list.
my @grammars = split(' ', $opts{g});
my @outputs = split(' ', $opts{o});

# At least one grammar is required, and the outputs must either be omitted
# entirely or match the grammars one-for-one.
usage() if @grammars == 0 || (@outputs != 0 && @outputs != @grammars);

# If no outputs were given, generate default names: "grammar.packed" for the
# first grammar, then "grammar2.packed", "grammar3.packed", and so on.
if (@outputs == 0) {
  for my $n (1 .. scalar(@grammars)) {
    push(@outputs, $n == 1 ? "grammar.packed" : "grammar$n.packed");
  }
}
# Sort every input grammar (unless -s says they are already sorted) and
# collect in @sorted_grammars the paths that are later handed to the packer.
my $grammar_no = 0;
my @sorted_grammars;

# Wraps a string in single quotes so that /bin/sh treats it as one literal
# word, even if it contains spaces or shell metacharacters.
my $shell_quote = sub {
  my ($text) = @_;
  $text =~ s/'/'\\''/g;
  return "'$text'";
};

foreach my $grammar (@grammars) {
  $grammar_no++;

  if (! -e $grammar) {
    # Fatal diagnostics go to STDERR, like the other FATAL messages below.
    print STDERR "* FATAL: Can't find grammar '$grammar'\n";
    exit 1;
  }

  if ($opts{s}) {
    # The user *swears* the grammars are already sorted! Trust her.
    print STDERR "You claim the grammars are already sorted. Okay, I'll believe you!\n";
    push(@sorted_grammars, $grammar);
    next;
  }

  # Sort the grammar or phrase table into a temporary file under -T. Only the
  # file name is kept; the shell pipeline below writes the contents.
  my $name = basename($grammar);
  my (undef, $sorted_grammar) = tempfile("${name}XXXX", DIR => $opts{T}, UNLINK => 1);
  print STDERR "Sorting grammar to $sorted_grammar...\n" if $opts{v};

  # The " ||| " field delimiter is turned into a tab so that sort(1) can key
  # on fields 2 and 3, then restored before the result is gzipped.
  # NOTE(review): under a plain sh pipeline the status seen by system() is
  # that of the last command (gzip), so a failing sort may go unnoticed.
  my $pipeline = join(' | ',
    $shell_quote->($CAT) . ' ' . $shell_quote->($grammar),
    "sed 's/ ||| /\t/g'",
    "LC_ALL=C sort -t'\t' -k2,2 -k3,3 --buffer-size=$opts{m} -T " . $shell_quote->($opts{T}),
    "sed 's/\t/ ||| /g'",
    "gzip -9n > " . $shell_quote->($sorted_grammar),
  );

  if (system($pipeline)) {
    print STDERR "* FATAL: Couldn't sort the grammar (not enough memory? short on tmp space?)\n";
    exit 2;
  }

  push(@sorted_grammars, $sorted_grammar);
}
# Pack all (sorted) grammars with a single GrammarPackerCli invocation.
my $grammars = join(" ", @sorted_grammars);
my $outputs = join(" ", @outputs);
my $alignments = $opts{a} ? "--ga" : "";

# The command is passed to system() as a single string, so it runs through
# the shell, which expands the "*" in the jar file name.
my $cmd = "java -Xmx$opts{m} -cp $JOSHUA/target/joshua-*-jar-with-dependencies.jar org.apache.joshua.tools.GrammarPackerCli -g $grammars --outputs $outputs $alignments";
print STDERR "Packing with $cmd...\n" if $opts{v};

my $retval = system($cmd);
if ($retval == 0) {
  # With -s, @sorted_grammars holds the user's own input files, so only the
  # temporary sorted copies created by this script are removed.
  unlink(@sorted_grammars) unless $opts{s};
} else {
  print STDERR "* FATAL: Couldn't pack the grammar.\n";
  # Keep the sorted temporary files for inspection; with -s there is nothing
  # to copy because the inputs were used as-is.
  unless ($opts{s}) {
    print STDERR "* Copying sorted grammars ($grammars) to current directory.\n";
    system("cp $grammars .");
  }
  exit 1;
}