#!/usr/bin/env perl
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script is distributed along with the datasets for the
# shared translation task of the NAACL 2012 Workshop on Statistical Machine
# Translation. We include it here for convenience.
use strict;
use warnings;
use utf8;
use v5.12;
# Punctuation normaliser: reads text from STDIN, rewrites each line, and
# prints the result to STDOUT.
#
# Usage: <script> [language] < input > output
#   language - optional language code (defaults to "en"); it selects the
#              quotation/comma ordering and the digit-group separator.

# Treat all three standard streams as UTF-8 text.
binmode(STDIN,  ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");

my $language = shift(@ARGV) || "en";

# HTML::Entities is optional: when it cannot be loaded the eval yields a
# false value and entity decoding is skipped.
my $have_html_entities = eval {
    require HTML::Entities;
    HTML::Entities->import();
    1;
};

while (my $line = <STDIN>) {
    print _normalize_line($line, $language, $have_html_entities);
}

# Normalise the punctuation of a single line and return the rewritten copy.
#
#   $line        - the input line (the caller's copy is not modified)
#   $lang        - language code controlling the language-specific rules
#   $decode_html - true if decode_entities() from HTML::Entities is available
#
# The substitutions are order-dependent; do not reorder them casually.
sub _normalize_line {
    my ($line, $lang, $decode_html) = @_;

    # Alias $_ to $line so the substitutions below stay compact.
    for ($line) {
        s/\r//g;

        # remove extra spaces
        s/\(/ \(/g;
        s/\)/\) /g;
        s/ +/ /g;
        s/\) ([\.\!\:\?\;\,])/\)$1/g;
        s/\( /\(/g;
        s/ \)/\)/g;
        s/(\d) \%/$1\%/g;
        s/ :/:/g;
        s/ ;/;/g;

        # normalize unicode punctuation
        s/„/\"/g;
        s/“/\"/g;
        s/”/\"/g;
        s/–/-/g;
        s/−/-/g;
        s/—/ - /g;
        s/ +/ /g;
        s/´/\'/g;
        s/′/\'/g;
        s/’/\'/g;
        s/([a-z])‘([a-z])/$1\'$2/gi;
        # Every ’ and ´ has already been rewritten above, so the former
        # follow-up rules for ’ (between letters / stand-alone) and for ´´
        # could never match and have been dropped.
        s/‘/\"/g;
        s/‚/\"/g;
        s/''/\"/g;
        s/…/.../g;
        s/°/º/g;

        # Non-breaking spaces (U+00A0) are surprisingly prevalent; they are
        # handled case by case below and the remainder is turned into plain
        # spaces at the end.

        # French quotes
        s/\xA0«\xA0/ \"/g;
        s/«\xA0/\"/g;    # opening quote followed by a non-breaking space
        s/«/\"/g;
        s/\xA0»\xA0/\" /g;
        s/\xA0»/\"/g;
        s/»/\"/g;

        # handle pseudo-spaces
        s/\xA0\%/\%/g;
        s/nº\xA0/nº /g;
        s/\xA0:/:/g;
        s/\xA0ºC/ ºC/g;
        s/\xA0cm/ cm/g;
        s/\xA0\?/\?/g;
        s/\xA0\!/\!/g;
        s/\xA0;/;/g;
        s/,\xA0/, /g;
        s/ +/ /g;

        if ($lang eq "en") {
            # English "quotation," followed by comma, style
            s/\"([,\.]+)/$1\"/g;
        }
        elsif ($lang eq "cs" || $lang eq "cz") {
            # Czech is confused: leave the quote/comma order untouched.
        }
        else {
            # German/Spanish/French "quotation", followed by comma, style
            s/,\"/\",/g;
            s/(\.+)\"(\s*[^<])/\"$1$2/g;    # don't fix period at end of sentence
        }

        # print STDERR "BAD LINE (with <feff>): $_" if //;

        # A non-breaking space between two digits becomes a comma for
        # de/es/cs/cz/fr and a period for every other language.
        if (   $lang eq "de"
            || $lang eq "es"
            || $lang eq "cz"
            || $lang eq "cs"
            || $lang eq "fr")
        {
            s/(\d)\xA0(\d)/$1,$2/g;
        }
        else {
            s/(\d)\xA0(\d)/$1.$2/g;
        }

        # Replace the rest of the nonbreaking spaces with a regular space.
        s/[\xA0]+/ /g;

        if ($decode_html) {
            $_ = decode_entities($_);
        }

        # Get rid of unicode directional indicators (all of them, not just
        # the first one on the line).
        s/[\x{200E}\x{200F}\x{202B}]//g;
    }

    return $line;
}