blob: f8f36317561ce5fa0324183db082cf0b9da41b58 [file] [log] [blame]
#!/usr/bin/env perl
# Matt Post <post@cs.jhu.edu>
# Converts a Moses grammar to a Joshua grammar.
#
# Usage: cat grammar.moses | moses2joshua_grammar.pl > grammar.joshua
#
# The notable differences between the formats are as follows:
#
# (1) The rule syntax. Moses' rules look like this:
#
# der [X][NN] [X] ||| of the [X][NN] [PP] ||| 0-0 0-1 1-2 ||| 1 ||| |||
#
# Whereas the corresponding Joshua rule looks like this:
#
# [PP] ||| der [NN] ||| of the [NN] ||| 1
#
# (2) Phrase table values. Joshua negates the phrase table feature values upon reading them in,
# whereas Moses (more sensibly) does not.
use strict;
use warnings;
use File::Basename;
use Getopt::Std;
my %opts;
getopts("i:d:x",\%opts);
sub usage {
print "Usage: cat moses.grammar | moses2joshua_grammar.pl > joshua.grammar\n";
exit;
}
# Rules look like these:
# <s> [X] ||| <s> [S] ||| 1 ||| ||| 0
# [X][S] </s> [X] ||| [X][S] </s> [S] ||| 1 ||| 0-0 ||| 0
# [X][S] [X][X] [X] ||| [X][S] [X][X] [S] ||| 2.718 ||| 0-0 1-1 ||| 0
while (my $rule = <>) {
chomp($rule);
my $orig_rule = $rule;
# ! es [X][VP] , [X] ||| , we [X][VP] , [PRN] ||| 0.0102041 0.00950695 1 0.000537615 2.718 ||| 2-2 ||| 98 1
# ! es [X][VP] [X][,] [X] ||| , we [X][VP] [X][,] [PRN] ||| 0.0120482 0.0206611 1 0.000867691 2.718 ||| 2-2 3-3 ||| 83 1
# Joshua doesn't support tree-to-string (currently), so we get rid of the source-side
# nonterminal. This also simplifies later processing.
$rule =~ s/ \[\S+?\](\[\S+?\])/ $1/g;
my ($l1, $l2, $probs, $alignment, $counts) = split(/\|\|\|/, $rule);
# The source-side nonterminals (of each pair) have been removed. Here we push all the for the
# source and target sides into arrays. We grab the LHS from the target side list.
my (@l1tokens) = split(' ', $l1);
pop(@l1tokens);
my (@l2tokens) = split(' ', $l2);
my $lhs = pop(@l2tokens);
# Now build a list of just the nonterminals. Then for the target side (L2), we map positions in
# the string to its index in the list of nonterminals, used to discover the permutation later on.
my (@l1nts);
for (my $i = 0; $i < @l1tokens; $i++) {
my $token = $l1tokens[$i];
if ($token =~ /\[(\S+?)\]/g) {
push(@l1nts, $1);
}
}
my (@l2nts,@l2orders);
my $num_lhs_seen = 0;
for (my $i = 0; $i < @l2tokens; $i++) {
my $token = $l2tokens[$i];
if ($token =~ /\[(\S+?)\]/g) {
push(@l2nts, $1);
$l2orders[$i] = ++$num_lhs_seen;
}
}
if (scalar @l1nts != scalar @l2nts) {
print STDERR "* WARNING: nonterminal count mismatch on RHS\n";
print STDERR "* " . (scalar @l1nts) . $/;
print STDERR "* " . (scalar @l2nts) . $/;
next;
}
# Build the permutation by first sorting L1 and then using the order index.
my @permutation;
my @alignments = sort by_first split(' ', $alignment);
foreach my $pair (@alignments) {
my ($l1,$l2) = split(/-/, $pair);
push(@permutation, $l2orders[$l2]) if defined $l2orders[$l2];
}
if (scalar(@permutation) != scalar(@l2nts)) {
my $a = scalar(@l2nts);
my $b = scalar(@permutation);
print STDERR "* [line $.] WARNING: permutation length is too short (l2nts $a, perm $b)\n";
next;
}
# Now go on and print the rule.
my $new_rule = "$lhs |||";
$num_lhs_seen = 0;
foreach my $token (@l1tokens) {
if ($token =~ /\[(\S+?)\]/g) {
$new_rule .= " [$1," . (++$num_lhs_seen) . "]";
} else {
$new_rule .= " $token";
}
}
$new_rule .= " |||";
$num_lhs_seen = 0;
foreach my $token (@l2tokens) {
if ($token =~ /\[(\S+?)\]/g) {
$new_rule .= " [$1," . $permutation[$num_lhs_seen++] . "]";
} else {
$new_rule .= " $token";
}
}
my @probs = map { transform($_) } (split(' ',$probs));
my $scores = join(" ", map { sprintf("%.5f", $_) } @probs);
$new_rule .= " ||| $scores";
# print STDERR " NEW_RULE: $new_rule$/";
print "$new_rule\n";
}
sub transform {
my ($weight) = @_;
# Moses defines the log_e() of non-positive weights as -100
return "-100" if ($weight <= 0.0);
# if ($weight eq "2.718") {
# return $weight;
# } else {
# return -log($weight);
# }
return -log($weight);
}
# Reads the moses config file to look for the span limit for grammar i (0-indexed).
sub get_span_limit {
my ($config_file, $index) = @_;
open READ, $config_file or die "can't find config file '$config_file'";
while (my $line = <READ>) {
if ($line =~ /max-chart-span/) {
# Burn through a number of lines until we get to the one we need. This assumes there are no
# intervening comments or blank lines.
for (my $i = 0; $i < $index; $i++) {
my $t = <READ>;
}
last;
}
}
chomp(my $max_span = <READ>);
close(READ);
return $max_span;
}
sub by_first {
my $a1 = $a;
$a1 =~ s/-.*//;
my $b1 = $b;
$b1 =~ s/-.*//;
return $a1 <=> $b1;
}