#!/usr/bin/env perl
# This program converts a 1-best translation file and a collection
# of reference files which are all sentence/line aligned into
# sufficient SGML to run IBM's BLEU script. That is, we make up the
# document and segment ids and anything else we need to.
#
# Note that, since BLEU is document-based, this is a bad idea if it means
# lumping many separate documents together into the one made-up document.
use warnings;
use strict;
# The single command-line argument is the common file-name stem shared by
# the input files (stem.1best, stem.ref.0, ...) and the output files.
# Definedness and length are tested rather than plain truth, so that a stem
# of "0" (files named 0.1best, 0.ref.0, ...) is accepted instead of being
# rejected with the usage message.
my ($stem) = @ARGV;
die "Usage: $0 file_stem\n" .
    "\nWe assume that files are named stem.1best, stem.ref.0, stem.ref.1,...\n"
    unless 1 == @ARGV and defined $stem and length $stem;
# safe_open($mode, $filename)
#
# Three-argument open() that never returns on failure.
#   $mode     - open mode as accepted by open(), e.g. '<' or '>'
#   $filename - path of the file to open
# Returns the freshly opened lexical filehandle.
# Dies with "Couldn't open file <name>: <OS error>" when open() fails.
sub safe_open {
    my ($mode, $filename) = @_;

    my $handle;
    unless (open $handle, $mode, $filename) {
        die "Couldn't open file $filename: $!\n";
    }

    return $handle;
}
# _write_segments($in, $out)
#
# Copies every line read from $in to $out wrapped in a <seg> element.
# Segment ids start at 1 and count up by one per input line; the trailing
# newline of each input line is removed before wrapping.
# Returns the number of segments written.
#
# NOTE(review): the line text is written verbatim -- characters such as
# '&' and '<' are not entity-escaped. Confirm the downstream scoring
# script tolerates that before changing it.
sub _write_segments {
    my ($in, $out) = @_;

    my $segid = 0;
    while (my $line = <$in>) {
        chomp $line;
        $segid++;
        print {$out} "<seg id=\"$segid\">$line</seg>\n";
    }

    return $segid;
}

# Part 1: wrap the 1-best translations (stem.1best) in a <srcset> holding a
# single <doc>, written to stem.1best.sgm.
{
    my $hyp_file = "$stem.1best";
    my $sgm_file = "$stem.1best.sgm";

    my $in  = safe_open('<', $hyp_file);
    my $out = safe_open('>', $sgm_file);

    print {$out} "<srcset setid=\"$stem\">\n",
                 "<doc docid=\"$stem\" sysid=\"Joshua\">\n";
    _write_segments($in, $out);
    print {$out} "</doc>\n",
                 "</srcset>\n";

    close $in
        or die "Couldn't close file $hyp_file: $!\n";
    # Buffered write errors (e.g. a full disk) only surface at close time,
    # so the close of an output file is checked.
    close $out
        or die "Couldn't close file $sgm_file: $!\n";
}

# Part 2: wrap every reference file in its own <doc> inside one <refset>,
# written to stem.refs.sgm. Reference files are picked up in order
# (stem.ref.0, stem.ref.1, ...) until the first number with no file.
{
    my $refs_file = "$stem.refs.sgm";
    my $out = safe_open('>', $refs_file);

    print {$out} "<refset setid=\"$stem\">\n";

    my $refid = 0;
    while (-e "$stem.ref.$refid") {
        my $ref_file = "$stem.ref.$refid";
        my $in = safe_open('<', $ref_file);

        # The counter is advanced before the header is printed, so sysid is
        # the 1-based reference number (file suffix + 1).
        $refid++;
        print {$out} "<doc docid=\"$stem\" sysid=\"$refid\">\n";
        _write_segments($in, $out);
        print {$out} "</doc>\n";

        close $in
            or die "Couldn't close file $ref_file: $!\n";
    }

    print {$out} "</refset>\n";

    close $out
        or die "Couldn't close file $refs_file: $!\n";
}
__END__