# scripts/preparation/tokenize.pl - joshua - Git at Google

 #!/usr/bin/env perl
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

 # Sample Tokenizer
 # written by Josh Schroeder, based on code by Philipp Koehn

 binmode(STDIN, ":encoding(utf8)");
 binmode(STDOUT, ":encoding(utf8)");

 use warnings;
 use strict;
 use File::Basename qw/dirname/;

 # Base directory used when searching for the prefix files: the JOSHUA
 # environment variable when it holds a true value, otherwise ".".
 # The assignment sits in a BEGIN block, so it happens at compile time.
 my $JOSHUA;
 BEGIN {
   $JOSHUA = $ENV{JOSHUA} ? $ENV{JOSHUA} : ".";
 }

 # Abbreviation table filled in by load_prefixes().
 my %NONBREAKING_PREFIX;

 # Option defaults; the command-line loop further down may override them.
 my ($language, $QUIET, $HELP) = ("en", 1, 0);

 # Directory holding the nonbreaking_prefix.* files ("" when none was found).
 my $PREFIX_DIR = find_nonbreaking_prefixes();

 # Locate the directory that holds the nonbreaking-prefix files.
 #
 # Two base locations are tried in order: the directory of the running
 # script (dirname($0)) and "$JOSHUA/scripts/preparation".  The first
 # "<base>/nonbreaking_prefixes" path that exists is returned; when neither
 # exists the empty string is returned.
 sub find_nonbreaking_prefixes {
   my @candidates = map { "$_/nonbreaking_prefixes" }
                    ( dirname($0), "$JOSHUA/scripts/preparation" );
   my ($found) = grep { -e $_ } @candidates;
   return defined($found) ? $found : "";
 }

 # When true, tokenize() rewrites brackets as -LRB-/-RRB- style tokens and
 # applies its treebank-style quote and contraction handling.
 my $use_penn_treebank_tokenization = 1;

 #my $start = [ Time::HiRes::gettimeofday( ) ];

 # Parse the command line.  Unrecognised arguments are ignored, as before.
 # Options that take a value now fail loudly when the value is missing
 # instead of leaving $language / $PREFIX_DIR undefined.
 while (@ARGV) {
   my $arg = shift @ARGV;
   if ($arg =~ /^-l$/) {
     die "ERROR: option -l requires a language code\n" unless @ARGV;
     $language = shift @ARGV;
   }
   elsif ($arg =~ /^-p$/) {
     die "ERROR: option -p requires a directory\n" unless @ARGV;
     $PREFIX_DIR = shift @ARGV;
   }
   elsif ($arg =~ /^-v$/) {
     $QUIET = 0;
   }
   elsif ($arg =~ /^-h$/) {
     $HELP = 1;
   }
 }

 if ($HELP) {
   print "Usage ./tokenize.pl (-l [en|de|...]) < textfile > tokenizedfile\n";
   print "  -l LANG  language code used to pick the abbreviation file\n";
   print "  -p DIR   directory containing the nonbreaking_prefix.* files\n";
   print "  -v       verbose: print version, language and warnings to STDERR\n";
   print "  -h       print this help and exit\n";
   exit;
 }
 if (!$QUIET) {
   print STDERR "Tokenizer v3\n";
   print STDERR "Language: $language\n";
 }

 load_prefixes($language,\%NONBREAKING_PREFIX);

 if (scalar(keys(%NONBREAKING_PREFIX)) == 0 && ! $QUIET){
   print STDERR "Warning: No known abbreviations for language '$language'\n";
 }

 # Main loop: lines that consist of a single <...> tag, and blank lines,
 # are copied through unchanged; every other line is tokenized.
 while (my $line = <STDIN>) {
   if ($line =~ /^<.+>$/ || $line =~ /^\s*$/) {
     #don't try to tokenize XML/HTML tag lines
     print $line;
   }
   else {
     print tokenize($line);
   }
 }

 #my $duration = Time::HiRes::tv_interval( $start );
 #print STDERR ("EXECUTION TIME: ".$duration."\n");


 # Tokenize a single line of text.
 #
 # Parameters:
 #   $text - the line to tokenize; one trailing newline is chomped first.
 # Returns:
 #   the tokenized line, tokens separated by single spaces, ending in "\n".
 #
 # Behaviour depends on three file-level variables: $language selects the
 # apostrophe rules, $use_penn_treebank_tokenization enables the -LRB- style
 # bracket tokens and the treebank quote/contraction handling, and
 # %NONBREAKING_PREFIX lists abbreviations whose trailing period is kept.
 sub tokenize {
   my($text) = @_;
   chomp($text);
   $text = " $text ";

   # convert curly quotes to ASCII.
   # The script puts an :encoding(utf8) layer on STDIN, so a curly quote
   # arrives as ONE character (U+2018/2019/201C/201D).  The three-character
   # alternatives are the undecoded UTF-8 byte values, which is all the
   # earlier version of these patterns matched; they are kept for
   # compatibility with such input.
   $text =~ s/\x{2019}|\xe2\x80\x99/\'/gs;
   $text =~ s/\x{2018}|\xe2\x80\x98/\'/gs;
   $text =~ s/\x{201c}|\xe2\x80\x9c/\"/gs;
   $text =~ s/\x{201d}|\xe2\x80\x9d/\"/gs;

   # separate out all "other" special characters
   $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-\"\|])/ $1 /g;

   $text =~ s/\;/ \; /g;

   $text =~ s/\:/ \: /g;

   # Directional handling of curly quotes.  After the normalisation at the
   # top of this sub no curly quote is left, so these are no-ops; they are
   # retained in case that step is ever disabled.
   $text =~ s/\x{2018}/ \` /g;
   $text =~ s/\x{2019}/ \' /g;
   $text =~ s/\x{201c}/ \`\` /g;
   $text =~ s/\x{201d}/ \'\' /g;

   # replace the pipe character, which is
   # a special reserved character in Moses
   $text =~ s/\|/ -PIPE- /g;

   if($use_penn_treebank_tokenization) {
     $text =~ s/\(/ -LRB- /g;
     $text =~ s/\)/ -RRB- /g;
     $text =~ s/\[/ -LSB- /g;
     $text =~ s/\]/ -RSB- /g;
     $text =~ s/\{/ -LCB- /g;
     $text =~ s/\}/ -RCB- /g;

     # double quotes: `` for opening, '' for closing
     $text =~ s/\"\s*$/ \'\' /g;
     $text =~ s/^\s*\"/ \`\` /g;
     $text =~ s/(\S)\"\s/$1 \'\' /g;
     $text =~ s/\s\"(\S)/ \`\` $1/g;
     $text =~ s/(\S)\"/$1 \'\' /g;
     $text =~ s/\"(\S)/ \`\` $1/g;

     # single quotes: ` for opening, ' for closing.  The first pattern is
     # the line-final case; it used to require a literal backslash before
     # the quote, which can never occur after the separation step above.
     $text =~ s/\'\s*$/ \' /g;
     $text =~ s/^\s*\'/ \` /g;
     $text =~ s/(\S)\'\s/$1 ' /g;
     $text =~ s/\s\'(\S)/ \` $1/g;

     # protect contractions with a placeholder; it is turned back into
     # an apostrophe near the end of this sub
     $text =~ s/\'ll/ -CONTRACT-ll/g;
     $text =~ s/\'re/ -CONTRACT-re/g;
     $text =~ s/\'ve/ -CONTRACT-ve/g;
     $text =~ s/n\'t/ n-CONTRACT-t/g;
     $text =~ s/\'LL/ -CONTRACT-LL/g;
     $text =~ s/\'RE/ -CONTRACT-RE/g;
     $text =~ s/\'VE/ -CONTRACT-VE/g;
     $text =~ s/N\'T/ N-CONTRACT-T/g;
     $text =~ s/cannot/can not/g;
     $text =~ s/Cannot/Can not/g;
   }

   #multi-dots stay together
   $text =~ s/\.([\.]+)/ DOTMULTI$1/g;
   while($text =~ /DOTMULTI\./) {
     $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g;
     $text =~ s/DOTMULTI\./DOTDOTMULTI/g;
   }

   #multi-dashes stay together
   $text =~ s/\-([\-]+)/ DASHMULTI$1/g;
   while($text =~ /DASHMULTI\-/) {
     $text =~ s/DASHMULTI\-([^\-])/DASHDASHMULTI $1/g;
     $text =~ s/DASHMULTI\-/DASHDASHMULTI/g;
   }

   # separate out "," except if within numbers (5,300)
   $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
   # separate , pre and post number
   $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
   $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;

   if(!$use_penn_treebank_tokenization) {
     $text =~ s/\"/ " /g;
     # turn ` into '
     $text =~ s/\`/\'/g;

     #turn '' into "
     $text =~ s/\'\'/ \" /g;
   }

   if ($language eq "en") {
     #split contractions right
     $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
     $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g;
     $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
     $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g;
     #special case for "1990's"
     $text =~ s/([\p{IsN}])[']([s])/$1 '$2/g;
     # run twice: a match consumes its trailing space, so an immediately
     # following occurrence is only caught by the second pass
     $text =~ s/ '\s+s / 's /g;
     $text =~ s/ '\s+s / 's /g;
     $text =~ s/ `\s+s / 's /g;
   } elsif (($language eq "fr") or ($language eq "it")) {
     #split contractions left
     $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
     $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
     $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
     $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g;
   } else {
     $text =~ s/\'/ \' /g;
   }
   $text =~ s/\' \'/\'\'/g;

   #word token method: decide for every word ending in "." whether the
   #period is split off as its own token
   my @words = split(/\s/,$text);
   $text = "";
   for my $i (0 .. $#words) {
     my $word = $words[$i];
     if ( $word =~ /^(\S+)\.$/) {
       my $pre = $1;
       my $has_next = $i < $#words;
       if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/)
           || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1)
           || ($has_next && ($words[$i+1] =~ /^[\p{IsLower}]/))) {
         #no change: dotted token containing a letter, known abbreviation,
         #or the next word starts in lower case
       } elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2)
                && ($has_next && ($words[$i+1] =~ /^[0-9]+/))) {
         #no change: numeric-only abbreviation followed by a number
       } else {
         $word = $pre." .";
       }
     }
     $text .= $word." ";
   }

   $text =~ s/'\s+'/''/g;

   # clean up extraneous spaces
   $text =~ s/ +/ /g;
   $text =~ s/^ //g;
   $text =~ s/ $//g;

   #restore multi-dots
   while($text =~ /DOTDOTMULTI/) {
     $text =~ s/DOTDOTMULTI/DOTMULTI./g;
   }
   $text =~ s/DOTMULTI/./g;

   #restore multi-dashes
   while($text =~ /DASHDASHMULTI/) {
     $text =~ s/DASHDASHMULTI/DASHMULTI-/g;
   }
   $text =~ s/DASHMULTI/-/g;

   # trim down multi-dashes and multi dots and
   # multi underscores
   $text =~ s/\.{4,}/\./g;
   $text =~ s/\-+/\-/g;
   $text =~ s/\_ \_/\_/g;
   $text =~ s/\_+/\_/g;

   $text =~ s/-CONTRACT-/'/g;

   #ensure final line break
   $text .= "\n" unless $text =~ /\n$/;

   # ccb - debugging - June 26, changing quote tokens
   $text =~ s/\'\'/\"/g;
   $text =~ s/\`\`/\"/g;
   $text =~ s/\`/\'/g;

   # repair this change if it's a time sequence like 12:01 or a date like 01/01/01.
   # Look-around keeps the digits out of the match, so a digit shared by two
   # separators (1/2/2003) is available to both.
   $text =~ s/(?<=\d) \: (?=\d)/\:/g;
   $text =~ s/(?<=\d) \/ (?=\d)/\//g;

   return $text;
 }

 # Load the non-breaking prefix (abbreviation) list for a language.
 #
 # Parameters:
 #   $language   - language code; selects
 #                 "$PREFIX_DIR/nonbreaking_prefix.$language"
 #   $PREFIX_REF - hash reference that receives the entries: value 2 for a
 #                 line tagged "#NUMERIC_ONLY#", value 1 for any other entry
 #
 # Falls back to the English file when the language-specific one does not
 # exist.  Dies when neither file exists or when the chosen file cannot be
 # opened.  Empty lines and lines starting with "#" are skipped.
 sub load_prefixes {
   my ($language, $PREFIX_REF) = @_;

   my $prefixfile = "$PREFIX_DIR/nonbreaking_prefix.$language";

   #default back to English if we don't have a language-specific prefix file
   if (!(-e $prefixfile)) {
     $prefixfile = "$PREFIX_DIR/nonbreaking_prefix.en";
     print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n" unless $QUIET;
     die ("ERROR: No abbreviations files found in $PREFIX_DIR\n") unless (-e $prefixfile);
   }

   # An existing but unreadable file used to be skipped silently, leaving
   # the table empty; report it instead.
   open(my $prefix_fh, "<:utf8", $prefixfile)
     or die "ERROR: Cannot open abbreviations file $prefixfile: $!\n";
   while (my $item = <$prefix_fh>) {
     chomp($item);
     next if !$item || substr($item,0,1) eq "#";
     if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/) {
       $PREFIX_REF->{$1} = 2;
     } else {
       $PREFIX_REF->{$item} = 1;
     }
   }
   close($prefix_fh);

 }
	#!/usr/bin/env perl
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	# Sample Tokenizer
	# written by Josh Schroeder, based on code by Philipp Koehn

	binmode(STDIN, ":encoding(utf8)");
	binmode(STDOUT, ":encoding(utf8)");

	use warnings;
	use strict;
	use File::Basename qw/dirname/;

	# Base directory used when searching for the prefix files: the JOSHUA
	# environment variable when it holds a true value, otherwise ".".
	# The assignment sits in a BEGIN block, so it happens at compile time.
	my $JOSHUA;
	BEGIN {
	  $JOSHUA = $ENV{JOSHUA} || ".";
	}

	# Abbreviation table filled in by load_prefixes().
	my %NONBREAKING_PREFIX = ();
	# Option defaults; the command-line loop further down may override them.
	my $language = "en";
	my $QUIET = 1;
	my $HELP = 0;
	# Directory holding the nonbreaking_prefix.* files ("" when none was found).
	my $PREFIX_DIR = find_nonbreaking_prefixes();

	# Locate the directory that holds the nonbreaking-prefix files.
	#
	# Two base locations are tried in order: the directory of the running
	# script (dirname($0)) and "$JOSHUA/scripts/preparation".  The first
	# "<base>/nonbreaking_prefixes" path that exists is returned; when neither
	# exists the empty string is returned.
	sub find_nonbreaking_prefixes {
	  my @candidates = map { "$_/nonbreaking_prefixes" }
	                   ( dirname($0), "$JOSHUA/scripts/preparation" );
	  my ($found) = grep { -e $_ } @candidates;
	  return defined($found) ? $found : "";
	}

	# When true, tokenize() rewrites brackets as -LRB-/-RRB- style tokens and
	# applies its treebank-style quote and contraction handling.
	my $use_penn_treebank_tokenization = 1;

	#my $start = [ Time::HiRes::gettimeofday( ) ];

	# Parse the command line.  Unrecognised arguments are ignored.
	# Options that take a value fail loudly when the value is missing
	# instead of leaving $language / $PREFIX_DIR undefined.
	while (@ARGV) {
	  my $arg = shift @ARGV;
	  if ($arg =~ /^-l$/) {
	    die "ERROR: option -l requires a language code\n" unless @ARGV;
	    $language = shift @ARGV;
	  }
	  elsif ($arg =~ /^-p$/) {
	    die "ERROR: option -p requires a directory\n" unless @ARGV;
	    $PREFIX_DIR = shift @ARGV;
	  }
	  elsif ($arg =~ /^-v$/) {
	    $QUIET = 0;
	  }
	  elsif ($arg =~ /^-h$/) {
	    $HELP = 1;
	  }
	}

	if ($HELP) {
	  print "Usage ./tokenize.pl (-l [en|de|...]) < textfile > tokenizedfile\n";
	  print "  -l LANG  language code used to pick the abbreviation file\n";
	  print "  -p DIR   directory containing the nonbreaking_prefix.* files\n";
	  print "  -v       verbose: print version, language and warnings to STDERR\n";
	  print "  -h       print this help and exit\n";
	  exit;
	}
	if (!$QUIET) {
	  print STDERR "Tokenizer v3\n";
	  print STDERR "Language: $language\n";
	}

	load_prefixes($language,\%NONBREAKING_PREFIX);

	if (scalar(keys(%NONBREAKING_PREFIX)) == 0 && ! $QUIET){
	  print STDERR "Warning: No known abbreviations for language '$language'\n";
	}

	# Main loop: lines that consist of a single <...> tag, and blank lines,
	# are copied through unchanged; every other line is tokenized.
	while (my $line = <STDIN>) {
	  if ($line =~ /^<.+>$/ || $line =~ /^\s*$/) {
	    #don't try to tokenize XML/HTML tag lines
	    print $line;
	  }
	  else {
	    print tokenize($line);
	  }
	}

	#my $duration = Time::HiRes::tv_interval( $start );
	#print STDERR ("EXECUTION TIME: ".$duration."\n");


	# Tokenize a single line of text.
	#
	# Parameters:
	#   $text - the line to tokenize; one trailing newline is chomped first.
	# Returns:
	#   the tokenized line, tokens separated by single spaces, ending in "\n".
	#
	# Behaviour depends on three file-level variables: $language selects the
	# apostrophe rules, $use_penn_treebank_tokenization enables the -LRB- style
	# bracket tokens and the treebank quote/contraction handling, and
	# %NONBREAKING_PREFIX lists abbreviations whose trailing period is kept.
	sub tokenize {
	  my($text) = @_;
	  chomp($text);
	  $text = " $text ";

	  # convert curly quotes to ASCII.
	  # The script puts an :encoding(utf8) layer on STDIN, so a curly quote
	  # arrives as ONE character (U+2018/2019/201C/201D).  The three-character
	  # alternatives are the undecoded UTF-8 byte values, which is all the
	  # earlier version of these patterns matched; they are kept for
	  # compatibility with such input.
	  $text =~ s/\x{2019}|\xe2\x80\x99/\'/gs;
	  $text =~ s/\x{2018}|\xe2\x80\x98/\'/gs;
	  $text =~ s/\x{201c}|\xe2\x80\x9c/\"/gs;
	  $text =~ s/\x{201d}|\xe2\x80\x9d/\"/gs;

	  # separate out all "other" special characters
	  # (the pipe is excluded here because it gets its own token below)
	  $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-\"\|])/ $1 /g;

	  $text =~ s/\;/ \; /g;

	  $text =~ s/\:/ \: /g;

	  # Directional handling of curly quotes.  After the normalisation at the
	  # top of this sub no curly quote is left, so these are no-ops; they are
	  # retained in case that step is ever disabled.
	  $text =~ s/\x{2018}/ \` /g;
	  $text =~ s/\x{2019}/ \' /g;
	  $text =~ s/\x{201c}/ \`\` /g;
	  $text =~ s/\x{201d}/ \'\' /g;

	  # replace the pipe character, which is
	  # a special reserved character in Moses
	  # (the pattern must be an escaped pipe: an unescaped "|" is an
	  # alternation that also matches the empty string)
	  $text =~ s/\|/ -PIPE- /g;

	  if($use_penn_treebank_tokenization) {
	    $text =~ s/\(/ -LRB- /g;
	    $text =~ s/\)/ -RRB- /g;
	    $text =~ s/\[/ -LSB- /g;
	    $text =~ s/\]/ -RSB- /g;
	    $text =~ s/\{/ -LCB- /g;
	    $text =~ s/\}/ -RCB- /g;

	    # double quotes: `` for opening, '' for closing
	    $text =~ s/\"\s*$/ \'\' /g;
	    $text =~ s/^\s*\"/ \`\` /g;
	    $text =~ s/(\S)\"\s/$1 \'\' /g;
	    $text =~ s/\s\"(\S)/ \`\` $1/g;
	    $text =~ s/(\S)\"/$1 \'\' /g;
	    $text =~ s/\"(\S)/ \`\` $1/g;

	    # single quotes: ` for opening, ' for closing.  The first pattern is
	    # the line-final case; it used to require a literal backslash before
	    # the quote, which can never occur after the separation step above.
	    $text =~ s/\'\s*$/ \' /g;
	    $text =~ s/^\s*\'/ \` /g;
	    $text =~ s/(\S)\'\s/$1 ' /g;
	    $text =~ s/\s\'(\S)/ \` $1/g;

	    # protect contractions with a placeholder; it is turned back into
	    # an apostrophe near the end of this sub
	    $text =~ s/\'ll/ -CONTRACT-ll/g;
	    $text =~ s/\'re/ -CONTRACT-re/g;
	    $text =~ s/\'ve/ -CONTRACT-ve/g;
	    $text =~ s/n\'t/ n-CONTRACT-t/g;
	    $text =~ s/\'LL/ -CONTRACT-LL/g;
	    $text =~ s/\'RE/ -CONTRACT-RE/g;
	    $text =~ s/\'VE/ -CONTRACT-VE/g;
	    $text =~ s/N\'T/ N-CONTRACT-T/g;
	    $text =~ s/cannot/can not/g;
	    $text =~ s/Cannot/Can not/g;
	  }

	  #multi-dots stay together
	  $text =~ s/\.([\.]+)/ DOTMULTI$1/g;
	  while($text =~ /DOTMULTI\./) {
	    $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g;
	    $text =~ s/DOTMULTI\./DOTDOTMULTI/g;
	  }

	  #multi-dashes stay together
	  $text =~ s/\-([\-]+)/ DASHMULTI$1/g;
	  while($text =~ /DASHMULTI\-/) {
	    $text =~ s/DASHMULTI\-([^\-])/DASHDASHMULTI $1/g;
	    $text =~ s/DASHMULTI\-/DASHDASHMULTI/g;
	  }

	  # separate out "," except if within numbers (5,300)
	  $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
	  # separate , pre and post number
	  $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
	  $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;

	  if(!$use_penn_treebank_tokenization) {
	    $text =~ s/\"/ " /g;
	    # turn ` into '
	    $text =~ s/\`/\'/g;

	    #turn '' into "
	    $text =~ s/\'\'/ \" /g;
	  }

	  if ($language eq "en") {
	    #split contractions right
	    $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
	    $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g;
	    $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
	    $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g;
	    #special case for "1990's"
	    $text =~ s/([\p{IsN}])[']([s])/$1 '$2/g;
	    # run twice: a match consumes its trailing space, so an immediately
	    # following occurrence is only caught by the second pass
	    $text =~ s/ '\s+s / 's /g;
	    $text =~ s/ '\s+s / 's /g;
	    $text =~ s/ `\s+s / 's /g;
	  } elsif (($language eq "fr") or ($language eq "it")) {
	    #split contractions left
	    $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
	    $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
	    $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
	    $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g;
	  } else {
	    $text =~ s/\'/ \' /g;
	  }
	  $text =~ s/\' \'/\'\'/g;

	  #word token method: decide for every word ending in "." whether the
	  #period is split off as its own token
	  my @words = split(/\s/,$text);
	  $text = "";
	  for my $i (0 .. $#words) {
	    my $word = $words[$i];
	    if ( $word =~ /^(\S+)\.$/) {
	      my $pre = $1;
	      my $has_next = $i < $#words;
	      if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/)
	          || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1)
	          || ($has_next && ($words[$i+1] =~ /^[\p{IsLower}]/))) {
	        #no change: dotted token containing a letter, known abbreviation,
	        #or the next word starts in lower case
	      } elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2)
	               && ($has_next && ($words[$i+1] =~ /^[0-9]+/))) {
	        #no change: numeric-only abbreviation followed by a number
	      } else {
	        $word = $pre." .";
	      }
	    }
	    $text .= $word." ";
	  }

	  $text =~ s/'\s+'/''/g;

	  # clean up extraneous spaces
	  $text =~ s/ +/ /g;
	  $text =~ s/^ //g;
	  $text =~ s/ $//g;

	  #restore multi-dots
	  while($text =~ /DOTDOTMULTI/) {
	    $text =~ s/DOTDOTMULTI/DOTMULTI./g;
	  }
	  $text =~ s/DOTMULTI/./g;

	  #restore multi-dashes
	  while($text =~ /DASHDASHMULTI/) {
	    $text =~ s/DASHDASHMULTI/DASHMULTI-/g;
	  }
	  $text =~ s/DASHMULTI/-/g;

	  # trim down multi-dashes and multi dots and
	  # multi underscores
	  $text =~ s/\.{4,}/\./g;
	  $text =~ s/\-+/\-/g;
	  $text =~ s/\_ \_/\_/g;
	  $text =~ s/\_+/\_/g;

	  $text =~ s/-CONTRACT-/'/g;

	  #ensure final line break
	  $text .= "\n" unless $text =~ /\n$/;

	  # ccb - debugging - June 26, changing quote tokens
	  $text =~ s/\'\'/\"/g;
	  $text =~ s/\`\`/\"/g;
	  $text =~ s/\`/\'/g;

	  # repair this change if it's a time sequence like 12:01 or a date like 01/01/01.
	  # Look-around keeps the digits out of the match, so a digit shared by two
	  # separators (1/2/2003) is available to both.
	  $text =~ s/(?<=\d) \: (?=\d)/\:/g;
	  $text =~ s/(?<=\d) \/ (?=\d)/\//g;

	  return $text;
	}

	# Load the non-breaking prefix (abbreviation) list for a language.
	#
	# Parameters:
	#   $language   - language code; selects
	#                 "$PREFIX_DIR/nonbreaking_prefix.$language"
	#   $PREFIX_REF - hash reference that receives the entries: value 2 for a
	#                 line tagged "#NUMERIC_ONLY#", value 1 for any other entry
	#
	# Falls back to the English file when the language-specific one does not
	# exist.  Dies when neither file exists or when the chosen file cannot be
	# opened.  Empty lines and lines starting with "#" are skipped.
	sub load_prefixes {
	  my ($language, $PREFIX_REF) = @_;

	  my $prefixfile = "$PREFIX_DIR/nonbreaking_prefix.$language";

	  #default back to English if we don't have a language-specific prefix file
	  if (!(-e $prefixfile)) {
	    $prefixfile = "$PREFIX_DIR/nonbreaking_prefix.en";
	    print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n" unless $QUIET;
	    die ("ERROR: No abbreviations files found in $PREFIX_DIR\n") unless (-e $prefixfile);
	  }

	  # An existing but unreadable file used to be skipped silently, leaving
	  # the table empty; report it instead.
	  open(my $prefix_fh, "<:utf8", $prefixfile)
	    or die "ERROR: Cannot open abbreviations file $prefixfile: $!\n";
	  while (my $item = <$prefix_fh>) {
	    chomp($item);
	    next if !$item || substr($item,0,1) eq "#";
	    if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/) {
	      $PREFIX_REF->{$1} = 2;
	    } else {
	      $PREFIX_REF->{$item} = 1;
	    }
	  }
	  close($prefix_fh);

	}