scripts/support/moses2joshua_grammar.pl - joshua - Git at Google

 #!/usr/bin/env perl
 # Matt Post <post@cs.jhu.edu>

 # Converts a Moses grammars and phrase tables to a Joshua grammar.
 #
 # Usage: cat grammar.moses | moses2joshua_grammar.pl > grammar.joshua
 #
 # The notable differences between the formats are as follows:
 #
 # (1) The rule syntax. Moses' rules look like this:
 #
 #     der [X][NN] [X] ||| of the [X][NN] [PP] ||| 0-0 0-1 1-2 ||| 1 ||| |||
 #
 # Whereas the corresponding Joshua rule looks like this:
 #
 #     [PP] ||| der [NN] ||| of the [NN] ||| 1
 #
 # (This doesn't apply to phrase tables, of course).
 #
 # (2) Phrase table values. Moses takes the log of each feature, whereas Joshua takes just
 #     negates the values when it reads them in. To make the conversion correct, this script
 #     computes the negative log of each of the feature values.

 use strict;
 use warnings;
 use File::Basename;
 use Getopt::Std;

 my %opts;
 getopts("m:p",\%opts);

 sub usage {
   print "Usage: cat moses.grammar | moses2joshua_grammar.pl [-p] [-m TREE_MAP_FILE] > grammar\n";
   print "where TREE_MAP_FILE maps rule target-sides to internal trees\n";
   print "where -p means to add a phrase penalty feature\n";

   exit;
 }

   # Rules look like the following, with many of the fields optional
   #
   # <s> [X] ||| <s> [S] ||| [weights] ||| [alignments] ||| [counts] ||| [tree]
   # [X][S] </s> [X] ||| [X][S] </s> [S] ||| 1 ||| 0-0 ||| 0
   # [X][S] [X][X] [X] ||| [X][S] [X][X] [S] ||| 2.718 ||| 0-0 1-1 ||| 0

 if ($opts{m}) {
   open MAP, ">$opts{m}" or die "can't write map to file '$opts{m}'";
 }

 while (my $rule = <>) {
   chomp($rule);

   my $orig_rule = $rule;

 # ! es [X][VP] , [X] ||| , we [X][VP] , [PRN] ||| 0.0102041 0.00950695 1 0.000537615 2.718 ||| 2-2 ||| 98 1
 # ! es [X][VP] [X][,] [X] ||| , we [X][VP] [X][,] [PRN] ||| 0.0120482 0.0206611 1 0.000867691 2.718 ||| 2-2 3-3 ||| 83 1

   # Get rid of the source-side nonterminal.
   $rule =~ s/( ?)\[\S+?\](\[\S+?\])/$1$2/g;

   my ($l1, $l2, $probs, $alignment, $counts, undef, $tree) = split(/\s*\|\|\|\s*/, $rule);

   # The source-side nonterminals (of each pair) have been removed. Here we push all the for the
   # source and target sides into arrays. We grab the LHS from the target side list.
   my (@l1tokens) = split(' ', $l1);
   my (@l2tokens) = split(' ', $l2);

   # Support regular phrase tables, too (with no LHS).
   my $lhs = undef;
   if ($l2tokens[-1] =~ /^\[.*\]$/) {
     pop(@l1tokens);
     $lhs = pop(@l2tokens);
   }

   # Now build a list of just the nonterminals. Then for the target side (L2), we map positions in
   # the string to its index in the list of nonterminals, used to discover the permutation later on.
   my (@l1nts);
   for (my $i = 0; $i < @l1tokens; $i++) {
     my $token = $l1tokens[$i];
     if ($token =~ /\[(\S+?)\]/g) {
       push(@l1nts, $1);
     }
   }
   my (@l2nts,@l2orders);
   my $num_lhs_seen = 0;
   for (my $i = 0; $i < @l2tokens; $i++) {
     my $token = $l2tokens[$i];
     if ($token =~ /\[(\S+?)\]/g) {
       push(@l2nts, $1);
       $l2orders[$i] = ++$num_lhs_seen;
     }
   }

   if (scalar @l1nts != scalar @l2nts) {
     print STDERR "* WARNING: nonterminal count mismatch on RHS\n";
     print STDERR "*  " . (scalar @l1nts) . $/;
     print STDERR "*  " . (scalar @l2nts) . $/;
     next;
   }

   # Build the permutation by first sorting L1 and then using the order index.
   my @permutation;
   my @alignments = sort by_first split(' ', $alignment);
   foreach my $pair (@alignments) {
     my ($l1,$l2) = split(/-/, $pair);
     push(@permutation, $l2orders[$l2]) if defined $l2orders[$l2];
   }

   if (scalar(@permutation) != scalar(@l2nts)) {
     my $a = scalar(@l2nts);
     my $b = scalar(@permutation);
     print STDERR "* [line $.] WARNING: permutation length is too short (l2nts $a, perm $b)\n";
     next;
   }

   # Now go on and print the rule.
   my $new_rule;
   if ($lhs) {
     $new_rule = "$lhs ||| ";
   } else {
     $new_rule = "";
   }
   $num_lhs_seen = 0;
   foreach my $token (@l1tokens) {
     if ($token =~ /\[(\S+?)\]/g) {
       $new_rule .= "[$1," . (++$num_lhs_seen) . "] ";
     } else {
       $new_rule .= "$token ";
     }
   }

   $new_rule .= "||| ";

   $num_lhs_seen = 0;
   my $target = "";
   foreach my $token (@l2tokens) {
     if ($token =~ /\[(\S+?)\]/g) {
       $target .= "[$1," . $permutation[$num_lhs_seen++] . "] ";
     } else {
       $target .= "$token ";
     }
   }
   $target =~ s/^\s+//;
   $new_rule .= $target;

   my @probs = map { transform($_) } (split(' ',$probs));
   # Moses no longer uses exp(1) as its phrase penalty feature, so we can't
   # rely on the uniform transform above...
   push(@probs, -1) if ($opts{p});
   my $scores = join(" ", map { sprintf("%.5f", $_) } @probs);

 #  $new_rule .= " ||| $scores ||| $alignment ||| $counts";
   $new_rule .= "||| $scores ||| $alignment";

   # print STDERR "  NEW_RULE: $rule => $new_rule$/" if $new_rule =~ /^\|/;
   print "$new_rule\n";

   if ($opts{m} and defined $tree) {
     $tree =~ s/.*{{Tree\s+(.*)}}.*/$1/;
     # Remove brackets around substitution points
     $tree =~ s/\[([^\[\]\s]+)\]/$1/g;
     # Add quotes around terminals
     $tree =~ s/\[([^\[\]]+) ([^\[\]]+)\]/[$1 "$2"]/g;
     $tree =~ s/\[/(/g;
     $tree =~ s/\]/)/g;

     print MAP "$tree ||| $target\n";
   }
 }

 close(MAP) if ($opts{m});

 sub transform {
   my ($weight) = @_;

   # Moses defines the log_e() of non-positive weights as -100
   # Return -1 times this, since Joshua negates all feature weights from the grammar
   return "100" if ($weight <= 0.0);

   # if ($weight eq "2.718") {
   # 	return $weight;
   # } else {
   # 	return -log($weight);
   # }

   return -log($weight);
 }

 # Reads the moses config file to look for the span limit for grammar i (0-indexed).
 sub get_span_limit {
   my ($config_file, $index) = @_;

   open READ, $config_file or die "can't find config file '$config_file'";
   while (my $line = <READ>) {
     if ($line =~ /max-chart-span/) {
       # Burn through a number of lines until we get to the one we need.  This assumes there are no
       # intervening comments or blank lines.
       for (my $i = 0; $i < $index; $i++) {
         my $t = <READ>;
       }
       last;
     }
   }
   chomp(my $max_span = <READ>);

   close(READ);
   return $max_span;
 }

 sub by_first {
   my $a1 = $a;
   $a1 =~ s/-.*//;
   my $b1 = $b;
   $b1 =~ s/-.*//;
   return $a1 <=> $b1;
 }
	#!/usr/bin/env perl
	# Matt Post <post@cs.jhu.edu>

	# Converts a Moses grammars and phrase tables to a Joshua grammar.
	#
	# Usage: cat grammar.moses \| moses2joshua_grammar.pl > grammar.joshua
	#
	# The notable differences between the formats are as follows:
	#
	# (1) The rule syntax. Moses' rules look like this:
	#
	# der [X][NN] [X] \|\|\| of the [X][NN] [PP] \|\|\| 0-0 0-1 1-2 \|\|\| 1 \|\|\| \|\|\|
	#
	# Whereas the corresponding Joshua rule looks like this:
	#
	# [PP] \|\|\| der [NN] \|\|\| of the [NN] \|\|\| 1
	#
	# (This doesn't apply to phrase tables, of course).
	#
	# (2) Phrase table values. Moses takes the log of each feature, whereas Joshua takes just
	# negates the values when it reads them in. To make the conversion correct, this script
	# computes the negative log of each of the feature values.

	use strict;
	use warnings;
	use File::Basename;
	use Getopt::Std;

	my %opts;
	getopts("m:p",\%opts);

	sub usage {
	print "Usage: cat moses.grammar \| moses2joshua_grammar.pl [-p] [-m TREE_MAP_FILE] > grammar\n";
	print "where TREE_MAP_FILE maps rule target-sides to internal trees\n";
	print "where -p means to add a phrase penalty feature\n";

	exit;
	}

	# Rules look like the following, with many of the fields optional
	#
	# <s> [X] \|\|\| <s> [S] \|\|\| [weights] \|\|\| [alignments] \|\|\| [counts] \|\|\| [tree]
	# [X][S] </s> [X] \|\|\| [X][S] </s> [S] \|\|\| 1 \|\|\| 0-0 \|\|\| 0
	# [X][S] [X][X] [X] \|\|\| [X][S] [X][X] [S] \|\|\| 2.718 \|\|\| 0-0 1-1 \|\|\| 0

	if ($opts{m}) {
	open MAP, ">$opts{m}" or die "can't write map to file '$opts{m}'";
	}

	while (my $rule = <>) {
	chomp($rule);

	my $orig_rule = $rule;

	# ! es [X][VP] , [X] \|\|\| , we [X][VP] , [PRN] \|\|\| 0.0102041 0.00950695 1 0.000537615 2.718 \|\|\| 2-2 \|\|\| 98 1
	# ! es [X][VP] [X][,] [X] \|\|\| , we [X][VP] [X][,] [PRN] \|\|\| 0.0120482 0.0206611 1 0.000867691 2.718 \|\|\| 2-2 3-3 \|\|\| 83 1

	# Get rid of the source-side nonterminal.
	$rule =~ s/( ?)\[\S+?\](\[\S+?\])/$1$2/g;

	my ($l1, $l2, $probs, $alignment, $counts, undef, $tree) = split(/\s\\|\\|\\|\s/, $rule);

	# The source-side nonterminals (of each pair) have been removed. Here we push all the for the
	# source and target sides into arrays. We grab the LHS from the target side list.
	my (@l1tokens) = split(' ', $l1);
	my (@l2tokens) = split(' ', $l2);

	# Support regular phrase tables, too (with no LHS).
	my $lhs = undef;
	if ($l2tokens[-1] =~ /^\[.*\]$/) {
	pop(@l1tokens);
	$lhs = pop(@l2tokens);
	}

	# Now build a list of just the nonterminals. Then for the target side (L2), we map positions in
	# the string to its index in the list of nonterminals, used to discover the permutation later on.
	my (@l1nts);
	for (my $i = 0; $i < @l1tokens; $i++) {
	my $token = $l1tokens[$i];
	if ($token =~ /\[(\S+?)\]/g) {
	push(@l1nts, $1);
	}
	}
	my (@l2nts,@l2orders);
	my $num_lhs_seen = 0;
	for (my $i = 0; $i < @l2tokens; $i++) {
	my $token = $l2tokens[$i];
	if ($token =~ /\[(\S+?)\]/g) {
	push(@l2nts, $1);
	$l2orders[$i] = ++$num_lhs_seen;
	}
	}

	if (scalar @l1nts != scalar @l2nts) {
	print STDERR "* WARNING: nonterminal count mismatch on RHS\n";
	print STDERR "* " . (scalar @l1nts) . $/;
	print STDERR "* " . (scalar @l2nts) . $/;
	next;
	}

	# Build the permutation by first sorting L1 and then using the order index.
	my @permutation;
	my @alignments = sort by_first split(' ', $alignment);
	foreach my $pair (@alignments) {
	my ($l1,$l2) = split(/-/, $pair);
	push(@permutation, $l2orders[$l2]) if defined $l2orders[$l2];
	}

	if (scalar(@permutation) != scalar(@l2nts)) {
	my $a = scalar(@l2nts);
	my $b = scalar(@permutation);
	print STDERR "* [line $.] WARNING: permutation length is too short (l2nts $a, perm $b)\n";
	next;
	}

	# Now go on and print the rule.
	my $new_rule;
	if ($lhs) {
	$new_rule = "$lhs \|\|\| ";
	} else {
	$new_rule = "";
	}
	$num_lhs_seen = 0;
	foreach my $token (@l1tokens) {
	if ($token =~ /\[(\S+?)\]/g) {
	$new_rule .= "[$1," . (++$num_lhs_seen) . "] ";
	} else {
	$new_rule .= "$token ";
	}
	}

	$new_rule .= "\|\|\| ";

	$num_lhs_seen = 0;
	my $target = "";
	foreach my $token (@l2tokens) {
	if ($token =~ /\[(\S+?)\]/g) {
	$target .= "[$1," . $permutation[$num_lhs_seen++] . "] ";
	} else {
	$target .= "$token ";
	}
	}
	$target =~ s/^\s+//;
	$new_rule .= $target;

	my @probs = map { transform($_) } (split(' ',$probs));
	# Moses no longer uses exp(1) as its phrase penalty feature, so we can't
	# rely on the uniform transform above...
	push(@probs, -1) if ($opts{p});
	my $scores = join(" ", map { sprintf("%.5f", $_) } @probs);

	# $new_rule .= " \|\|\| $scores \|\|\| $alignment \|\|\| $counts";
	$new_rule .= "\|\|\| $scores \|\|\| $alignment";

	# print STDERR " NEW_RULE: $rule => $new_rule$/" if $new_rule =~ /^\\|/;
	print "$new_rule\n";

	if ($opts{m} and defined $tree) {
	$tree =~ s/.{{Tree\s+(.)}}.*/$1/;
	# Remove brackets around substitution points
	$tree =~ s/\[([^\[\]\s]+)\]/$1/g;
	# Add quotes around terminals
	$tree =~ s/\[([^\[\]]+) ([^\[\]]+)\]/[$1 "$2"]/g;
	$tree =~ s/\[/(/g;
	$tree =~ s/\]/)/g;

	print MAP "$tree \|\|\| $target\n";
	}
	}

	close(MAP) if ($opts{m});

	sub transform {
	my ($weight) = @_;

	# Moses defines the log_e() of non-positive weights as -100
	# Return -1 times this, since Joshua negates all feature weights from the grammar
	return "100" if ($weight <= 0.0);

	# if ($weight eq "2.718") {
	# return $weight;
	# } else {
	# return -log($weight);
	# }

	return -log($weight);
	}

	# Reads the moses config file to look for the span limit for grammar i (0-indexed).
	sub get_span_limit {
	my ($config_file, $index) = @_;

	open READ, $config_file or die "can't find config file '$config_file'";
	while (my $line = <READ>) {
	if ($line =~ /max-chart-span/) {
	# Burn through a number of lines until we get to the one we need. This assumes there are no
	# intervening comments or blank lines.
	for (my $i = 0; $i < $index; $i++) {
	my $t = <READ>;
	}
	last;
	}
	}
	chomp(my $max_span = <READ>);

	close(READ);
	return $max_span;
	}

	sub by_first {
	my $a1 = $a;
	$a1 =~ s/-.*//;
	my $b1 = $b;
	$b1 =~ s/-.*//;
	return $a1 <=> $b1;
	}