scripts/distributedLM/get_grammar_eng_voc_from_cn_voc.pl - joshua - Git at Google

 #!/usr/bin/perl -w
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 use Encode;
 #### get the eng voc of the phrase tbl based on the input chinese

 my $fcn_text=$ARGV[0]; #test cn sentences
 my $fphrase_tbl=$ARGV[1];
 my $feng_voc=$ARGV[2];
 my $ftbl_out=$ARGV[3];


 my %eng_voc=();

 #add basic lm vocabulary here
 $eng_voc{"<unk>"}=1;
 $eng_voc{"<s>"}=1;
 $eng_voc{"</s>"}=1;

 @g_cn_sents=();
 get_cn_sents($fcn_text,\@g_cn_sents);

 #filter phrase tbl and get the english voc
 open(FTM, $fphrase_tbl) or die "cannot open file $fphrase_tbl\n";
 open(FOUT, ">$ftbl_out") or die "cannot open file $ftbl_out\n";
 while(my $line=<FTM>){
     chomp($line);
     my @fds=split(/\s+\|{3}\s+/,$line);
     my $cn=$fds[1];
     my $eng=$fds[2];

     next if(filter_rule($cn,\@g_cn_sents)==1);
     print FOUT "$line\n";

     my @eng_wrds=split(/\s+/, $eng);
     foreach my $wrd (@eng_wrds){
 	$wrd =~ s/^\s+//g;
 	$wrd =~ s/\s+$//g;
 	next if($wrd =~ m/^\[PHRASE,\d+\]$/);
 	$eng_voc{$wrd}=1;
     }
 }
 close(FTM);
 close(FOUT);

 ### print the voc
 open(FVOC, ">$feng_voc") or die "cannot open file $feng_voc\n";
 foreach my $wrd (keys %eng_voc){
     #$wrd = encode("utf8", $wrd);
     print FVOC "$wrd\n";
 }
 close(FVOC);

 #filter the rule through a voc tbl
 sub filter_rule {
     my ($rule, $p_sents)=@_;
     #print STDERR "r1: $rule ||| ";

     $rule=add_escape($rule);
     #print STDERR "r2: $rule ||| ";

     $rule =~ s/\\\[PHRASE\\,\d+\\\]/\.\+/g; #ignore phrase tag
     #print STDERR "r3: $rule ||| ";

     my $res=1;
     foreach my $src_sent (@{$p_sents}){
     	if($src_sent =~ m/$rule/ ){ #if any src sent contains this rule, then retain the rule
 		$res=0;
                 last;
 	}
     }
     #print STDERR "$res\n";
     return $res;
 }


 #input text may contain the special chars directly, we need to add \
 sub add_escape {
     my ($text)=@_;
     $text =~ s/(\@|\.|\^|\$|\*|\+|\?|\[|\]|\{|\}|\(|\)|\<|\>|\/|\\|\||\`|\'|\"|\=|\-|\+|\,)/\\$1/g;
     return $text;
 }

 ### get unigram voc tbl
 sub get_cn_sents {
     my ($file, $p_sents)=@_;
     open(FILE, $file) or die "cannot open file $file\n";
     while(my $line=<FILE>){
 	$line =~ s/<seg\s+id=\d+>//g;
 	$line =~ s/<\/seg>//g;

         next if($line =~ m/^\s+$/); #blank line
         chomp($line);
         push(@{$p_sents}, $line);
     }
     close(FILE);
 }


 ############################# not used

 #filter the rule through a voc tbl
 sub filter_rule_old {
     my ($rule, $p_tbl)=@_;
     my @wrds=split(/\s+/,$rule);
     foreach my $wrd ( @wrds ){
 	next if($wrd =~ m/^\[PHRASE,\d\]$/); #ignore phrase tag
 	if(not exists $p_tbl->{$wrd}){
 		return 1;
 	}
     }
     return 0;
 }


 ### get unigram voc tbl
 sub get_cn_voc_tbl {
     my ($file, $p_tbl)=@_;
     open(FILE, $file) or die "cannot open file $file\n";
     while(my $line=<FILE>){
         next if($line =~ m/^\s+$/); #blank line
         chomp($line);
         my @wrds=split(/\s+/,$line);
         foreach my $wrd ( @wrds ){
 		$p_tbl->{$wrd}=1;
 	}
     }
     close(FILE);
 }


 ######## not used
 sub read_cn_voc_tbl {
     my ($file, $p_tbl)=@_;
     open(FILE, $file) or die "cannot open file $file\n";
     while(my $line=<FILE>){
         next if($line =~ m/^\s+$/); #blank line
         chomp($line);
         $p_tbl->{$line}=1;
     }
     close(FILE);
 }
	#!/usr/bin/perl -w
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	use Encode;
	#### get the eng voc of the phrase tbl based on the input chinese

	my $fcn_text=$ARGV[0]; #test cn sentences
	my $fphrase_tbl=$ARGV[1];
	my $feng_voc=$ARGV[2];
	my $ftbl_out=$ARGV[3];


	my %eng_voc=();

	#add basic lm vocabulary here
	$eng_voc{"<unk>"}=1;
	$eng_voc{"<s>"}=1;
	$eng_voc{"</s>"}=1;

	@g_cn_sents=();
	get_cn_sents($fcn_text,\@g_cn_sents);

	#filter phrase tbl and get the english voc
	open(FTM, $fphrase_tbl) or die "cannot open file $fphrase_tbl\n";
	open(FOUT, ">$ftbl_out") or die "cannot open file $ftbl_out\n";
	while(my $line=<FTM>){
	chomp($line);
	my @fds=split(/\s+\\|{3}\s+/,$line);
	my $cn=$fds[1];
	my $eng=$fds[2];

	next if(filter_rule($cn,\@g_cn_sents)==1);
	print FOUT "$line\n";

	my @eng_wrds=split(/\s+/, $eng);
	foreach my $wrd (@eng_wrds){
	$wrd =~ s/^\s+//g;
	$wrd =~ s/\s+$//g;
	next if($wrd =~ m/^\[PHRASE,\d+\]$/);
	$eng_voc{$wrd}=1;
	}
	}
	close(FTM);
	close(FOUT);

	### print the voc
	open(FVOC, ">$feng_voc") or die "cannot open file $feng_voc\n";
	foreach my $wrd (keys %eng_voc){
	#$wrd = encode("utf8", $wrd);
	print FVOC "$wrd\n";
	}
	close(FVOC);

	#filter the rule through a voc tbl
	sub filter_rule {
	my ($rule, $p_sents)=@_;
	#print STDERR "r1: $rule \|\|\| ";

	$rule=add_escape($rule);
	#print STDERR "r2: $rule \|\|\| ";

	$rule =~ s/\\\[PHRASE\\,\d+\\\]/\.\+/g; #ignore phrase tag
	#print STDERR "r3: $rule \|\|\| ";

	my $res=1;
	foreach my $src_sent (@{$p_sents}){
	if($src_sent =~ m/$rule/ ){ #if any src sent contains this rule, then retain the rule
	$res=0;
	last;
	}
	}
	#print STDERR "$res\n";
	return $res;
	}


	#input text may contain the special chars directly, we need to add \
	sub add_escape {
	my ($text)=@_;
	$text =~ s/(\@\|\.\|\^\|\$\|\*\|\+\|\?\|\[\|\]\|\{\|\}\|\(\|\)\|\<\|\>\|\/\|\\\|\\|\|\`\|\'\|\"\|\=\|\-\|\+\|\,)/\\$1/g;
	return $text;
	}

	### get unigram voc tbl
	sub get_cn_sents {
	my ($file, $p_sents)=@_;
	open(FILE, $file) or die "cannot open file $file\n";
	while(my $line=<FILE>){
	$line =~ s/<seg\s+id=\d+>//g;
	$line =~ s/<\/seg>//g;

	next if($line =~ m/^\s+$/); #blank line
	chomp($line);
	push(@{$p_sents}, $line);
	}
	close(FILE);
	}



	############################# not used

	#filter the rule through a voc tbl
	sub filter_rule_old {
	my ($rule, $p_tbl)=@_;
	my @wrds=split(/\s+/,$rule);
	foreach my $wrd ( @wrds ){
	next if($wrd =~ m/^\[PHRASE,\d\]$/); #ignore phrase tag
	if(not exists $p_tbl->{$wrd}){
	return 1;
	}
	}
	return 0;
	}


	### get unigram voc tbl
	sub get_cn_voc_tbl {
	my ($file, $p_tbl)=@_;
	open(FILE, $file) or die "cannot open file $file\n";
	while(my $line=<FILE>){
	next if($line =~ m/^\s+$/); #blank line
	chomp($line);
	my @wrds=split(/\s+/,$line);
	foreach my $wrd ( @wrds ){
	$p_tbl->{$wrd}=1;
	}
	}
	close(FILE);
	}


	######## not used
	sub read_cn_voc_tbl {
	my ($file, $p_tbl)=@_;
	open(FILE, $file) or die "cannot open file $file\n";
	while(my $line=<FILE>){
	next if($line =~ m/^\s+$/); #blank line
	chomp($line);
	$p_tbl->{$line}=1;
	}
	close(FILE);
	}