| #!/usr/bin/env perl |
| |
| use strict; |
| |
| # Sample De-Tokenizer |
| # written by Josh Schroeder, based on code by Philipp Koehn |
| # modified later by ByungGyu Ahn, bahn@cs.jhu.edu, Luke Orland |
| |
| binmode(STDIN, ":utf8"); |
| binmode(STDOUT, ":utf8"); |
| use strict; |
| |
| my $language = "en"; |
| my $QUIET = 1; |
| my $HELP = 0; |
| |
| while (@ARGV) { |
| $_ = shift; |
| /^-l$/ && ($language = shift, next); |
| /^-v$/ && ($QUIET = 0, next); |
| /^-h$/ && ($HELP = 1, next); |
| } |
| |
| if ($HELP) { |
| print "Usage ./detokenizer.perl (-l [en|de|...]) < tokenizedfile > detokenizedfile\n"; |
| exit; |
| } |
| if (!$QUIET) { |
| print STDERR "Detokenizer Version 1.1\n"; |
| print STDERR "Language: $language\n"; |
| } |
| |
| while(<STDIN>) { |
| if (/^<.+>$/ || /^\s*$/) { |
| #don't try to detokenize XML/HTML tag lines |
| print $_; |
| } |
| else { |
| print &detokenize($_); |
| } |
| } |
| |
| sub detokenize { |
| my($text) = @_; |
| chomp($text); |
| $text = " $text "; |
| |
| # convert curly quotes to ASCII e.g. ‘“”’ |
| $text =~ s/\x{2018}/'/gs; |
| $text =~ s/\x{2019}/'/gs; |
| $text =~ s/\x{201c}/"/gs; |
| $text =~ s/\x{201d}/"/gs; |
| $text =~ s/\x{e2}\x{80}\x{98}/'/gs; |
| $text =~ s/\x{e2}\x{80}\x{99}/'/gs; |
| $text =~ s/\x{e2}\x{80}\x{9c}/"/gs; |
| $text =~ s/\x{e2}\x{80}\x{9d}/"/gs; |
| |
| $text =~ s/ '\s+' / " /g; |
| $text =~ s/ ` / ' /g; |
| $text =~ s/ ' / ' /g; |
| $text =~ s/ `` / " /g; |
| $text =~ s/ '' / " /g; |
| |
| # replace the pipe character, which is |
| # a special reserved character in Moses |
| $text =~ s/ -PIPE- / \| /g; |
| |
| $text =~ s/ -LRB- / \( /g; |
| $text =~ s/ -RRB- / \) /g; |
| $text =~ s/ -LSB- / \[ /g; |
| $text =~ s/ -RSB- / \] /g; |
| $text =~ s/ -LCB- / \{ /g; |
| $text =~ s/ -RCB- / \} /g; |
| $text =~ s/ -lrb- / \( /g; |
| $text =~ s/ -rrb- / \) /g; |
| $text =~ s/ -lsb- / \[ /g; |
| $text =~ s/ -rsb- / \] /g; |
| $text =~ s/ -lcb- / \{ /g; |
| $text =~ s/ -rcb- / \} /g; |
| |
| $text =~ s/ 'll /'ll /g; |
| $text =~ s/ 're /'re /g; |
| $text =~ s/ 've /'ve /g; |
| $text =~ s/ n't /n't /g; |
| $text =~ s/ 'LL /'LL /g; |
| $text =~ s/ 'RE /'RE /g; |
| $text =~ s/ 'VE /'VE /g; |
| $text =~ s/ N'T /N'T /g; |
| $text =~ s/ can not / cannot /g; |
| $text =~ s/ Can not / Cannot /g; |
| |
| # just in case the contraction was not properly treated |
| $text =~ s/ ' ll /'ll /g; |
| $text =~ s/ ' re /'re /g; |
| $text =~ s/ ' ve /'ve /g; |
| $text =~ s/n ' t /n't /g; |
| $text =~ s/ ' LL /'LL /g; |
| $text =~ s/ ' RE /'RE /g; |
| $text =~ s/ ' VE /'VE /g; |
| $text =~ s/N ' T /N'T /g; |
| |
| my $word; |
| my $i; |
| my @words = split(/ /,$text); |
| $text = ""; |
| my %quoteCount = ("\'"=>0,"\""=>0); |
| my $prependSpace = " "; |
| for ($i=0;$i<(scalar(@words));$i++) { |
| if ($words[$i] =~ /^[\p{IsSc}]+$/) { |
| #perform shift on currency |
| if (($i<(scalar(@words)-1)) && ($words[$i+1] =~ /^[0-9]/)) { |
| $text = $text.$prependSpace.$words[$i]; |
| $prependSpace = ""; |
| } else { |
| $text=$text.$words[$i]; |
| $prependSpace = " "; |
| } |
| } elsif ($words[$i] =~ /^[\(\[\{\¿\¡]+$/) { |
| #perform right shift on random punctuation items |
| $text = $text.$prependSpace.$words[$i]; |
| $prependSpace = ""; |
| } elsif ($words[$i] =~ /^[\,\.\?\!\:\;\\\%\}\]\)]+$/){ |
| #perform left shift on punctuation items |
| $text=$text.$words[$i]; |
| $prependSpace = " "; |
| } elsif (($language eq "en") && ($i>0) && ($words[$i] =~ /^[\'][\p{IsAlpha}]/) && ($words[$i-1] =~ /[\p{IsAlnum}]$/)) { |
| #left-shift the contraction for English |
| $text=$text.$words[$i]; |
| $prependSpace = " "; |
| } elsif (($language eq "en") && ($i>0) && ($i<(scalar(@words)-1)) && ($words[$i] eq "&") && ($words[$i-1] =~ /^[A-Z]$/) && ($words[$i+1] =~ /^[A-Z]$/)) { |
| #some contraction with an ampersand e.g. "R&D" |
| $text .= $words[$i]; |
| $prependSpace = ""; |
| } elsif (($language eq "fr") && ($i<(scalar(@words)-1)) && ($words[$i] =~ /[\p{IsAlpha}][\']$/) && ($words[$i+1] =~ /^[\p{IsAlpha}]/)) { |
| #right-shift the contraction for French |
| $text = $text.$prependSpace.$words[$i]; |
| $prependSpace = ""; |
| } elsif ($words[$i] =~ /^[\'\"]+$/) { |
| #combine punctuation smartly |
| if (($quoteCount{$words[$i]} % 2) eq 0) { |
| if(($language eq "en") && ($words[$i] eq "'") && ($i > 0) && ($words[$i-1] =~ /[s]$/)) { |
| #single quote for posesssives ending in s... "The Jones' house" |
| #left shift |
| $text=$text.$words[$i]; |
| $prependSpace = " "; |
| } elsif (($language eq "en") && ($words[$i] eq "'") && ($i < (scalar(@words)-1)) && ($words[$i+1] eq "s")) { |
| #single quote for possessive construction. "John's" |
| $text .= $words[$i]; |
| $prependSpace = ""; |
| } elsif (($quoteCount{$words[$i]} == 0) && |
| ($language eq "en") && ($words[$i] eq '"') && ($i>1) && ($words[$i-1] =~ /^[,.]$/) && ($words[$i-2] ne "said")) { |
| #emergency case in which the opening quote is missing |
| #ending double quote for direct quotes. e.g. Blah," he said. but not like he said, "Blah. |
| $text .= $words[$i]; |
| $prependSpace = " "; |
| } elsif (($language eq "en") && ($words[$i] eq '"') && ($i < (scalar(@words)-1)) && ($words[$i+1] =~ /^[,.]$/)) { |
| $text .= $words[$i]; |
| $prependSpace = " "; |
| } else { |
| #right shift |
| $text = $text.$prependSpace.$words[$i]; |
| $prependSpace = ""; |
| $quoteCount{$words[$i]} = $quoteCount{$words[$i]} + 1; |
| |
| } |
| } else { |
| #left shift |
| $text=$text.$words[$i]; |
| $prependSpace = " "; |
| $quoteCount{$words[$i]} = $quoteCount{$words[$i]} + 1; |
| |
| } |
| |
| } else { |
| $text=$text.$prependSpace.$words[$i]; |
| $prependSpace = " "; |
| } |
| } |
| |
| #clean continuing spaces |
| $text =~ s/ +/ /g; |
| |
| #delete spaces around double angle brackets «» |
| # Uh-oh. not a good idea. it is not consistent. |
| $text =~ s/(\x{c2}\x{ab}|\x{ab}) /$1/g; |
| $text =~ s/ (\x{c2}\x{bb}|\x{bb})/$1/g; |
| |
| # delete spaces around all other special characters |
| # Uh-oh. not a good idea. "Men&Women" |
| #$text =~ s/ ([^\p{IsAlnum}\s\.\'\`\,\-\"\|]) /$1/g; |
| $text =~ s/ \/ /\//g; |
| |
| # clean up spaces at head and tail of each line as well as any double-spacing |
| $text =~ s/\n /\n/g; |
| $text =~ s/ \n/\n/g; |
| $text =~ s/^ //g; |
| $text =~ s/ $//g; |
| |
| #add trailing break |
| $text .= "\n" unless $text =~ /\n$/; |
| |
| return $text; |
| } |