scripts/support/moses2joshua_grammar.pl - joshua - Git at Google

 #!/usr/bin/env perl
 # Matt Post <post@cs.jhu.edu>

 # Converts a Moses grammar to a Joshua grammar.
 #
 # Usage: cat grammar.moses | moses2joshua_grammar.pl > grammar.joshua
 #
 # The notable differences between the formats are as follows:
 #
 # (1) The rule syntax. Moses' rules look like this:
 #
 #     der [X][NN] [X] ||| of the [X][NN] [PP] ||| 0-0 0-1 1-2 ||| 1 ||| |||
 #
 # Whereas the corresponding Joshua rule looks like this:
 #
 #     [PP] ||| der [NN] ||| of the [NN] ||| 1
 #
 # (2) Phrase table values. Joshua negates the phrase table feature values upon reading them in,
 # whereas Moses (more sensibly) does not.

 use strict;
 use warnings;
 use File::Basename;
 use Getopt::Std;

 my %opts;
 getopts("i:d:x",\%opts);

 sub usage {
   print "Usage: cat moses.grammar | moses2joshua_grammar.pl > joshua.grammar\n";
   exit;
 }

   # Rules look like these:
   # <s> [X] ||| <s> [S] ||| 1 ||| ||| 0
   # [X][S] </s> [X] ||| [X][S] </s> [S] ||| 1 ||| 0-0 ||| 0
   # [X][S] [X][X] [X] ||| [X][S] [X][X] [S] ||| 2.718 ||| 0-0 1-1 ||| 0

 while (my $rule = <>) {
   chomp($rule);

   my $orig_rule = $rule;

 # ! es [X][VP] , [X] ||| , we [X][VP] , [PRN] ||| 0.0102041 0.00950695 1 0.000537615 2.718 ||| 2-2 ||| 98 1
 # ! es [X][VP] [X][,] [X] ||| , we [X][VP] [X][,] [PRN] ||| 0.0120482 0.0206611 1 0.000867691 2.718 ||| 2-2 3-3 ||| 83 1

   # Joshua doesn't support tree-to-string (currently), so we get rid of the source-side
   # nonterminal. This also simplifies later processing.
   $rule =~ s/ \[\S+?\](\[\S+?\])/ $1/g;

   my ($l1, $l2, $probs, $alignment, $counts) = split(/\|\|\|/, $rule);

   # The source-side nonterminals (of each pair) have been removed. Here we push all the for the
   # source and target sides into arrays. We grab the LHS from the target side list.
   my (@l1tokens) = split(' ', $l1);
   pop(@l1tokens);
   my (@l2tokens) = split(' ', $l2);
   my $lhs = pop(@l2tokens);

   # Now build a list of just the nonterminals. Then for the target side (L2), we map positions in
   # the string to its index in the list of nonterminals, used to discover the permutation later on.
   my (@l1nts);
   for (my $i = 0; $i < @l1tokens; $i++) {
     my $token = $l1tokens[$i];
     if ($token =~ /\[(\S+?)\]/g) {
       push(@l1nts, $1);
     }
   }
   my (@l2nts,@l2orders);
   my $num_lhs_seen = 0;
   for (my $i = 0; $i < @l2tokens; $i++) {
     my $token = $l2tokens[$i];
     if ($token =~ /\[(\S+?)\]/g) {
       push(@l2nts, $1);
       $l2orders[$i] = ++$num_lhs_seen;
     }
   }

   if (scalar @l1nts != scalar @l2nts) {
     print STDERR "* WARNING: nonterminal count mismatch on RHS\n";
     print STDERR "*  " . (scalar @l1nts) . $/;
     print STDERR "*  " . (scalar @l2nts) . $/;
     next;
   }

   # Build the permutation by first sorting L1 and then using the order index.
   my @permutation;
   my @alignments = sort by_first split(' ', $alignment);
   foreach my $pair (@alignments) {
     my ($l1,$l2) = split(/-/, $pair);
     push(@permutation, $l2orders[$l2]) if defined $l2orders[$l2];
   }

   if (scalar(@permutation) != scalar(@l2nts)) {
     my $a = scalar(@l2nts);
     my $b = scalar(@permutation);
     print STDERR "* [line $.] WARNING: permutation length is too short (l2nts $a, perm $b)\n";
     next;
   }

   # Now go on and print the rule.
   my $new_rule = "$lhs |||";
   $num_lhs_seen = 0;
   foreach my $token (@l1tokens) {
     if ($token =~ /\[(\S+?)\]/g) {
       $new_rule .= " [$1," . (++$num_lhs_seen) . "]";
     } else {
       $new_rule .= " $token";
     }
   }

   $new_rule .= " |||";

   $num_lhs_seen = 0;
   foreach my $token (@l2tokens) {
     if ($token =~ /\[(\S+?)\]/g) {
       $new_rule .= " [$1," . $permutation[$num_lhs_seen++] . "]";
     } else {
       $new_rule .= " $token";
     }
   }

   my @probs = map { transform($_) } (split(' ',$probs));
   my $scores = join(" ", map { sprintf("%.5f", $_) } @probs);

   $new_rule .= " ||| $scores";

   # print STDERR "  NEW_RULE: $new_rule$/";
   print "$new_rule\n";
 }

 sub transform {
   my ($weight) = @_;

   # Moses defines the log_e() of non-positive weights as -100
   return "-100" if ($weight <= 0.0);

   # if ($weight eq "2.718") {
   # 	return $weight;
   # } else {
   # 	return -log($weight);
   # }

   return -log($weight);
 }

 # Reads the moses config file to look for the span limit for grammar i (0-indexed).
 sub get_span_limit {
   my ($config_file, $index) = @_;

   open READ, $config_file or die "can't find config file '$config_file'";
   while (my $line = <READ>) {
     if ($line =~ /max-chart-span/) {
       # Burn through a number of lines until we get to the one we need.  This assumes there are no
       # intervening comments or blank lines.
       for (my $i = 0; $i < $index; $i++) {
         my $t = <READ>;
       }
       last;
     }
   }
   chomp(my $max_span = <READ>);

   close(READ);
   return $max_span;
 }

 sub by_first {
   my $a1 = $a;
   $a1 =~ s/-.*//;
   my $b1 = $b;
   $b1 =~ s/-.*//;
   return $a1 <=> $b1;
 }
	#!/usr/bin/env perl
	# Matt Post <post@cs.jhu.edu>

	# Converts a Moses grammar to a Joshua grammar.
	#
	# Usage: cat grammar.moses \| moses2joshua_grammar.pl > grammar.joshua
	#
	# The notable differences between the formats are as follows:
	#
	# (1) The rule syntax. Moses' rules look like this:
	#
	# der [X][NN] [X] \|\|\| of the [X][NN] [PP] \|\|\| 0-0 0-1 1-2 \|\|\| 1 \|\|\| \|\|\|
	#
	# Whereas the corresponding Joshua rule looks like this:
	#
	# [PP] \|\|\| der [NN] \|\|\| of the [NN] \|\|\| 1
	#
	# (2) Phrase table values. Joshua negates the phrase table feature values upon reading them in,
	# whereas Moses (more sensibly) does not.

	use strict;
	use warnings;
	use File::Basename;
	use Getopt::Std;

	my %opts;
	getopts("i:d:x",\%opts);

	sub usage {
	print "Usage: cat moses.grammar \| moses2joshua_grammar.pl > joshua.grammar\n";
	exit;
	}

	# Rules look like these:
	# <s> [X] \|\|\| <s> [S] \|\|\| 1 \|\|\| \|\|\| 0
	# [X][S] </s> [X] \|\|\| [X][S] </s> [S] \|\|\| 1 \|\|\| 0-0 \|\|\| 0
	# [X][S] [X][X] [X] \|\|\| [X][S] [X][X] [S] \|\|\| 2.718 \|\|\| 0-0 1-1 \|\|\| 0

	while (my $rule = <>) {
	chomp($rule);

	my $orig_rule = $rule;

	# ! es [X][VP] , [X] \|\|\| , we [X][VP] , [PRN] \|\|\| 0.0102041 0.00950695 1 0.000537615 2.718 \|\|\| 2-2 \|\|\| 98 1
	# ! es [X][VP] [X][,] [X] \|\|\| , we [X][VP] [X][,] [PRN] \|\|\| 0.0120482 0.0206611 1 0.000867691 2.718 \|\|\| 2-2 3-3 \|\|\| 83 1

	# Joshua doesn't support tree-to-string (currently), so we get rid of the source-side
	# nonterminal. This also simplifies later processing.
	$rule =~ s/ \[\S+?\](\[\S+?\])/ $1/g;

	my ($l1, $l2, $probs, $alignment, $counts) = split(/\\|\\|\\|/, $rule);

	# The source-side nonterminals (of each pair) have been removed. Here we push all the for the
	# source and target sides into arrays. We grab the LHS from the target side list.
	my (@l1tokens) = split(' ', $l1);
	pop(@l1tokens);
	my (@l2tokens) = split(' ', $l2);
	my $lhs = pop(@l2tokens);

	# Now build a list of just the nonterminals. Then for the target side (L2), we map positions in
	# the string to its index in the list of nonterminals, used to discover the permutation later on.
	my (@l1nts);
	for (my $i = 0; $i < @l1tokens; $i++) {
	my $token = $l1tokens[$i];
	if ($token =~ /\[(\S+?)\]/g) {
	push(@l1nts, $1);
	}
	}
	my (@l2nts,@l2orders);
	my $num_lhs_seen = 0;
	for (my $i = 0; $i < @l2tokens; $i++) {
	my $token = $l2tokens[$i];
	if ($token =~ /\[(\S+?)\]/g) {
	push(@l2nts, $1);
	$l2orders[$i] = ++$num_lhs_seen;
	}
	}

	if (scalar @l1nts != scalar @l2nts) {
	print STDERR "* WARNING: nonterminal count mismatch on RHS\n";
	print STDERR "* " . (scalar @l1nts) . $/;
	print STDERR "* " . (scalar @l2nts) . $/;
	next;
	}

	# Build the permutation by first sorting L1 and then using the order index.
	my @permutation;
	my @alignments = sort by_first split(' ', $alignment);
	foreach my $pair (@alignments) {
	my ($l1,$l2) = split(/-/, $pair);
	push(@permutation, $l2orders[$l2]) if defined $l2orders[$l2];
	}

	if (scalar(@permutation) != scalar(@l2nts)) {
	my $a = scalar(@l2nts);
	my $b = scalar(@permutation);
	print STDERR "* [line $.] WARNING: permutation length is too short (l2nts $a, perm $b)\n";
	next;
	}

	# Now go on and print the rule.
	my $new_rule = "$lhs \|\|\|";
	$num_lhs_seen = 0;
	foreach my $token (@l1tokens) {
	if ($token =~ /\[(\S+?)\]/g) {
	$new_rule .= " [$1," . (++$num_lhs_seen) . "]";
	} else {
	$new_rule .= " $token";
	}
	}

	$new_rule .= " \|\|\|";

	$num_lhs_seen = 0;
	foreach my $token (@l2tokens) {
	if ($token =~ /\[(\S+?)\]/g) {
	$new_rule .= " [$1," . $permutation[$num_lhs_seen++] . "]";
	} else {
	$new_rule .= " $token";
	}
	}

	my @probs = map { transform($_) } (split(' ',$probs));
	my $scores = join(" ", map { sprintf("%.5f", $_) } @probs);

	$new_rule .= " \|\|\| $scores";

	# print STDERR " NEW_RULE: $new_rule$/";
	print "$new_rule\n";
	}

	sub transform {
	my ($weight) = @_;

	# Moses defines the log_e() of non-positive weights as -100
	return "-100" if ($weight <= 0.0);

	# if ($weight eq "2.718") {
	# return $weight;
	# } else {
	# return -log($weight);
	# }

	return -log($weight);
	}

	# Reads the moses config file to look for the span limit for grammar i (0-indexed).
	sub get_span_limit {
	my ($config_file, $index) = @_;

	open READ, $config_file or die "can't find config file '$config_file'";
	while (my $line = <READ>) {
	if ($line =~ /max-chart-span/) {
	# Burn through a number of lines until we get to the one we need. This assumes there are no
	# intervening comments or blank lines.
	for (my $i = 0; $i < $index; $i++) {
	my $t = <READ>;
	}
	last;
	}
	}
	chomp(my $max_span = <READ>);

	close(READ);
	return $max_span;
	}

	sub by_first {
	my $a1 = $a;
	$a1 =~ s/-.*//;
	my $b1 = $b;
	$b1 =~ s/-.*//;
	return $a1 <=> $b1;
	}