# lucene/test-framework/src/java/org/apache/lucene/analysis/standard/generateJavaUnicodeWordBreakTest.pl - lucene-solr - Git at Google

 #!/usr/bin/perl

 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

 use warnings;
 use strict;
 use File::Spec;
 use Getopt::Long;
 use LWP::UserAgent;

 # Split the path this script was invoked with.  $script_name appears in the
 # usage message and in the generated Javadoc; $volume and $directory are used
 # to build the output path.
 my ($volume, $directory, $script_name) = File::Spec->splitpath($0);

 # The Unicode version is mandatory.  The pattern is anchored at both ends
 # because the value is interpolated into the download URLs and into the
 # generated Java class name: anything other than exactly X.Y.Z would produce
 # a bad URL or an invalid Java identifier.
 my $version = '';
 unless (GetOptions("version=s" => \$version)
         && $version =~ /\A\d+\.\d+\.\d+\z/) {
   print STDERR "Usage: $script_name -v <version>\n";
   # Only explain the expected format when a (malformed) version was supplied
   print STDERR "\tversion must be of the form X.Y.Z, e.g. 5.2.0\n"
       if ($version);
   exit 1;
 }

 # Locations of the Unicode data files for the requested version
 my $url_prefix = "http://www.unicode.org/Public/${version}/ucd";
 my $scripts_url = "${url_prefix}/Scripts.txt";
 my $line_break_url = "${url_prefix}/LineBreak.txt";
 my $word_break_url = "${url_prefix}/auxiliary/WordBreakProperty.txt";
 my $word_break_test_url = "${url_prefix}/auxiliary/WordBreakTest.txt";

 # The class name carries the version with dots replaced by underscores,
 # e.g. 5.2.0 => WordBreakTestUnicode_5_2_0
 my $underscore_version = $version;
 $underscore_version =~ s/\./_/g;
 my $class_name = "WordBreakTestUnicode_${underscore_version}";
 my $output_filename = "${class_name}.java";
 # Template for the beginning of the generated Java source file: license
 # header, package and imports, class Javadoc, and the opening of the class
 # and of its test(Analyzer) method.  The heredoc is double-quoted, so
 # ${script_name}, ${url_prefix}, $scripts_url, $line_break_url,
 # $word_break_url and ${class_name} are interpolated, while "\\" and "\@"
 # produce a literal backslash and at-sign.  The class and method are left
 # open here; their closing braces are printed after the assertions.
 # NOTE(review): the Javadoc list below does not mention the E_Base,
 # E_Modifier, Glue_After_Zwj and E_Base_GAZ WordBreak values or the paired
 # regional indicators that this script also treats as wanted characters.
 my $header =<<"__HEADER__";
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.lucene.analysis.standard;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.junit.Ignore;

 /**
  * This class was automatically generated by ${script_name}
  * from: ${url_prefix}/auxiliary/WordBreakTest.txt
  *
  * WordBreakTest.txt indicates the points in the provided character sequences
  * at which conforming implementations must and must not break words.  This
  * class tests for expected token extraction from each of the test sequences
  * in WordBreakTest.txt, where the expected tokens are those character
  * sequences bounded by word breaks and containing at least one character
  * from one of the following character sets:
  *
  *    \\p{Script = Han}                (From $scripts_url)
  *    \\p{Script = Hiragana}
  *    \\p{LineBreak = Complex_Context} (From $line_break_url)
  *    \\p{WordBreak = ALetter}         (From $word_break_url)
  *    \\p{WordBreak = Hebrew_Letter}
  *    \\p{WordBreak = Katakana}
  *    \\p{WordBreak = Numeric}         (Excludes full-width Arabic digits)
  *    [\\uFF10-\\uFF19]                 (Full-width Arabic digits)
  */
 \@Ignore
 public class ${class_name} extends BaseTokenStreamTestCase {

   public void test(Analyzer analyzer) throws Exception {
 __HEADER__

 # Table indexed by code point: a defined slot marks a character whose
 # presence turns a break-delimited sequence into an expected token.  The
 # full-width Arabic digits are seeded directly.
 my $codepoints = [];
 $codepoints->[$_] = 1 for (0xFF10..0xFF19);

 # Regional indicators are collected into a table of their own.
 my $regional_indicator_codepoints = [];

 # Complex_Context is an alias for 'SA', which is used in LineBreak.txt
 # Property value names are spelled in lowercase to allow for case-
 # insensitive comparison with the names in the Unicode data files.
 parse_Unicode_data_file($line_break_url, $codepoints, +{ sa => 1 });
 parse_Unicode_data_file($scripts_url, $codepoints,
                         +{ map { ($_ => 1) } qw(han hiragana) });
 parse_Unicode_data_file($word_break_url, $codepoints,
                         +{ map { ($_ => 1) }
                            qw(aletter hebrew_letter katakana numeric e_base
                               e_modifier glue_after_zwj e_base_gaz) });
 parse_Unicode_data_file($word_break_url, $regional_indicator_codepoints,
                         +{ regional_indicator => 1 });

 # The test data itself, one element per line
 my @tests = split /\r?\n/, get_URL_content($word_break_test_url);

 # The generated Java file is written to the directory this script was
 # invoked from.
 my $output_path = File::Spec->catpath($volume, $directory, $output_filename);

 # Three-argument open with the low-precedence "or".  With "||" the die()
 # was bound to the (always true) filename string, so a failed open was never
 # reported.
 open(OUT, '>', $output_path)
   or die "Error opening '$output_path' for writing: $!";

 print STDERR "Writing '$output_path'...";

 print OUT $header;

 # Turn every data line of WordBreakTest.txt into a Java comment echoing the
 # line, followed by an assertAnalyzesTo() call listing the expected tokens.
 for my $line (@tests) {
   next if ($line =~ /^\s*(?:|\#.*)$/); # Skip blank or comment-only lines
   # Example line: ÷ 0001 × 0300 ÷  #  ÷ [0.2] <START OF HEADING> (Other) × [4.0] COMBINING GRAVE ACCENT (Extend_FE) ÷ [0.3]
   # The part before '#' is the test sequence: hex code points separated by
   # ÷ (break) or × (no break).
   my ($sequence) = $line =~ /^(.*?)\s*\#/;
   $line =~ s/\t/  /g; # Convert tabs to two spaces (no tabs allowed in Lucene source)
   print OUT "    // $line\n";
   $sequence =~ s/\s*÷\s*$//; # Trim trailing break character

   # Build the Java string literal for the whole input.  Each separator,
   # including the still-present leading ÷, becomes the "\u" prefix of the
   # code point that follows it.
   my $test_string = $sequence;
   $test_string =~ s/\s*÷\s*/\\u/g;
   $test_string =~ s/\s*×\s*/\\u/g;
   # Code points written with five or more hex digits become a surrogate pair
   $test_string =~ s/\\u([0-9A-F]{5,})/join('', map { "\\u$_" } above_BMP_char_to_surrogates($1))/ge;
   # Line feed, carriage return and double quote are written as \n, \r and \"
   # rather than as \u escapes (NOTE: presumably because javac expands
   # \uXXXX escapes before it parses string literals -- confirm).
   $test_string =~ s/\\u000A/\\n/g;
   $test_string =~ s/\\u000D/\\r/g;
   $test_string =~ s/\\u0022/\\\"/g;
   $sequence =~ s/^\s*÷\s*//; # Trim leading break character

   # TODO: When upgrading JFlex to a version that supports Unicode 11.0+: remove the special case below for a Unicode 9.0 test data line that conflicts with TR#51 11.0 test data
   # ÷ 200D ÷ 261D ÷  #  ÷ [0.2] ZERO WIDTH JOINER (ZWJ_FE) ÷ [999.0] WHITE UP POINTING INDEX (E_Base) ÷ [0.3]
   if ($sequence =~ /^200D\s*÷\s*261D$/) {
     print OUT "    // Skipping this test because it conflicts with TR#51 v11.0 rules.\n\n";
     next;
   }

   # Each piece between two breaks is a candidate token; it is expected in
   # the analyzer output only if it contains a wanted character.
   my @tokens = ();
   for my $candidate (split /\s*÷\s*/, $sequence) {
     my @chars = ();
     my $has_wanted_chars = 0;
     my $prev_char_regional_indicator = 0;
     while ($candidate =~ /([0-9A-F]+)/gi) {
       my $hexchar = $1;
       # Collect the UTF-16 code units of the candidate as 4-digit hex strings
       if (4 == length($hexchar)) {
         push @chars, $hexchar;
       } else {
         push @chars, above_BMP_char_to_surrogates($hexchar);
       }
       unless ($has_wanted_chars) {
         my $codepoint = hex($hexchar);
         if (defined($codepoints->[$codepoint])) {
           $has_wanted_chars = 1;
         } elsif (defined($regional_indicator_codepoints->[$codepoint])) {
           # NOTE: the flag is not cleared by other characters in between, so
           # this accepts any second regional indicator in the same candidate.
           if (1 == $prev_char_regional_indicator) {
             $has_wanted_chars = 1; # must be 2 regional indicators in a row
           } else {
             $prev_char_regional_indicator = 1;
           }
         }
       }
     }
     if ($has_wanted_chars) {
       # Render as a Java string literal; the double quote is backslash-escaped
       push @tokens, '"'.join('', map { $_ eq "0022" ? "\\\"" : "\\u$_" } @chars).'"';
     }
   }
   print OUT "    assertAnalyzesTo(analyzer, \"${test_string}\",\n";
   print OUT "                     new String[] { ";
   print OUT join(", ", @tokens), " });\n\n";
 }

 # Close the test method and the class that the header template left open.
 print OUT "  }\n}\n";
 # Buffered write errors only surface when the handle is closed, so the
 # result of close() is checked before success is reported.
 close(OUT) or die "Error closing '$output_path': $!";
 print STDERR "done.\n";


 # sub above_BMP_char_to_surrogates
 #
 # Given the hex spelling of a code point above the BMP (i.e., greater than
 # 0xFFFF), returns the two UTF-16 code units of its surrogate pair (high
 # surrogate first), each formatted as four uppercase hex digits.
 #
 # Assumption: the argument is a string of more than four hex digits
 #
 sub above_BMP_char_to_surrogates {
   my ($hex_digits) = @_;
   my $code_point = hex($hex_digits);
   my @code_units = (
     0xD800 + (($code_point - 0x10000) >> 10), # high (leading) surrogate
     0xDC00 + ($code_point & 0x3FF),           # low (trailing) surrogate
   );
   return map { sprintf("%04X", $_) } @code_units;
 }


 # sub parse_Unicode_data_file
 #
 # Downloads the specified Unicode data file, parses it, and extracts the
 # code points assigned any of the given property values, defining the
 # corresponding array position in the passed-in target array.
 #
 # Takes in the following parameters:
 #
 #  - URL of the Unicode data file to download and parse
 #  - Reference to target array
 #  - Reference to hash of property values to get code points for; the keys
 #    must be lowercase, because values read from the file are lowercased
 #    before the lookup
 #
 # Lines that are neither "XXXX ; value" nor "XXXX..YYYY ; value" are ignored.
 #
 sub parse_Unicode_data_file {
   my $url = shift;
   my $target = shift;
   my $wanted_property_values = shift;
   my $content = get_URL_content($url);
   print STDERR "Parsing '$url'...";
   my @lines = split /\r?\n/, $content;
   for (@lines) {
     s/\s*#.*//;         # Strip trailing comments
     s/\s+$//;           # Strip trailing space
     next unless (/\S/); # Skip empty lines
     my ($start, $end, $property_value);
     # Code points are written with four to six hex digits (up to 10FFFF)
     if (/^([0-9A-F]{4,6})\s*;\s*(.+)/i) {
       # 00AA       ; LATIN
       $start = $end = hex $1;
       $property_value = lc $2; # Property value names are case-insensitive
     } elsif (/^([0-9A-F]{4,6})\.\.([0-9A-F]{4,6})\s*;\s*(.+)/i) {
       # 0AE6..0AEF ; Gujarati
       # The range separator is matched as two literal dots
       $start = hex $1;
       $end = hex $2;
       $property_value = lc $3; # Property value names are case-insensitive
     } else {
       next;
     }
     if (defined($wanted_property_values->{$property_value})) {
       for my $code_point ($start..$end) {
         $target->[$code_point] = 1;
       }
     }
   }
   print STDERR "done.\n";
 }

 # sub get_URL_content
 #
 # Issues an HTTP GET for the given URL and returns the content of the
 # response.  Progress is reported on STDERR.  If the response is not
 # successful, the status line is printed and the script exits with status 1.
 #
 sub get_URL_content {
   my ($url) = @_;
   print STDERR "Retrieving '$url'...";
   my $response
     = LWP::UserAgent->new->request(HTTP::Request->new(GET => $url));
   if (!$response->is_success) {
     print STDERR "Failed to download '$url':\n\t",$response->status_line,"\n";
     exit 1;
   }
   print STDERR "done.\n";
   return $response->content;
 }
	#!/usr/bin/perl

	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	use warnings;
	use strict;
	use File::Spec;
	use Getopt::Long;
	use LWP::UserAgent;

	# Split the path this script was invoked with.  $script_name appears in the
	# usage message and in the generated Javadoc; $volume and $directory are used
	# to build the output path.
	my ($volume, $directory, $script_name) = File::Spec->splitpath($0);

	# The Unicode version is mandatory.  The pattern is anchored at both ends
	# because the value is interpolated into the download URLs and into the
	# generated Java class name: anything other than exactly X.Y.Z would produce
	# a bad URL or an invalid Java identifier.
	my $version = '';
	unless (GetOptions("version=s" => \$version)
	        && $version =~ /\A\d+\.\d+\.\d+\z/) {
	  print STDERR "Usage: $script_name -v <version>\n";
	  # Only explain the expected format when a (malformed) version was supplied
	  print STDERR "\tversion must be of the form X.Y.Z, e.g. 5.2.0\n"
	      if ($version);
	  exit 1;
	}

	# Locations of the Unicode data files for the requested version
	my $url_prefix = "http://www.unicode.org/Public/${version}/ucd";
	my $scripts_url = "${url_prefix}/Scripts.txt";
	my $line_break_url = "${url_prefix}/LineBreak.txt";
	my $word_break_url = "${url_prefix}/auxiliary/WordBreakProperty.txt";
	my $word_break_test_url = "${url_prefix}/auxiliary/WordBreakTest.txt";

	# The class name carries the version with dots replaced by underscores,
	# e.g. 5.2.0 => WordBreakTestUnicode_5_2_0
	my $underscore_version = $version;
	$underscore_version =~ s/\./_/g;
	my $class_name = "WordBreakTestUnicode_${underscore_version}";
	my $output_filename = "${class_name}.java";
	# Template for the beginning of the generated Java source file: license
	# header, package and imports, class Javadoc, and the opening of the class
	# and of its test(Analyzer) method.  The heredoc is double-quoted, so
	# ${script_name}, ${url_prefix}, $scripts_url, $line_break_url,
	# $word_break_url and ${class_name} are interpolated, while "\\" and "\@"
	# produce a literal backslash and at-sign.  The class and method are left
	# open here; their closing braces are printed after the assertions.
	# NOTE(review): the template text below carries no indentation of its own,
	# so the generated Java comment stars and the method header are emitted
	# without the usual leading spaces -- confirm whether that is intended.
	my $header =<<"__HEADER__";
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.lucene.analysis.standard;

	import org.apache.lucene.analysis.Analyzer;
	import org.apache.lucene.analysis.BaseTokenStreamTestCase;
	import org.junit.Ignore;

	/**
	* This class was automatically generated by ${script_name}
	* from: ${url_prefix}/auxiliary/WordBreakTest.txt
	*
	* WordBreakTest.txt indicates the points in the provided character sequences
	* at which conforming implementations must and must not break words. This
	* class tests for expected token extraction from each of the test sequences
	* in WordBreakTest.txt, where the expected tokens are those character
	* sequences bounded by word breaks and containing at least one character
	* from one of the following character sets:
	*
	* \\p{Script = Han} (From $scripts_url)
	* \\p{Script = Hiragana}
	* \\p{LineBreak = Complex_Context} (From $line_break_url)
	* \\p{WordBreak = ALetter} (From $word_break_url)
	* \\p{WordBreak = Hebrew_Letter}
	* \\p{WordBreak = Katakana}
	* \\p{WordBreak = Numeric} (Excludes full-width Arabic digits)
	* [\\uFF10-\\uFF19] (Full-width Arabic digits)
	*/
	\@Ignore
	public class ${class_name} extends BaseTokenStreamTestCase {

	public void test(Analyzer analyzer) throws Exception {
	__HEADER__

	# Table indexed by code point: a defined slot marks a character whose
	# presence turns a break-delimited sequence into an expected token.  The
	# full-width Arabic digits are seeded directly.
	my $codepoints = [];
	$codepoints->[$_] = 1 for (0xFF10..0xFF19);

	# Regional indicators are collected into a table of their own.
	my $regional_indicator_codepoints = [];

	# Complex_Context is an alias for 'SA', which is used in LineBreak.txt
	# Property value names are spelled in lowercase to allow for case-
	# insensitive comparison with the names in the Unicode data files.
	parse_Unicode_data_file($line_break_url, $codepoints, +{ sa => 1 });
	parse_Unicode_data_file($scripts_url, $codepoints,
	                        +{ map { ($_ => 1) } qw(han hiragana) });
	parse_Unicode_data_file($word_break_url, $codepoints,
	                        +{ map { ($_ => 1) }
	                           qw(aletter hebrew_letter katakana numeric e_base
	                              e_modifier glue_after_zwj e_base_gaz) });
	parse_Unicode_data_file($word_break_url, $regional_indicator_codepoints,
	                        +{ regional_indicator => 1 });

	# The test data itself, one element per line
	my @tests = split /\r?\n/, get_URL_content($word_break_test_url);

	# The generated Java file is written to the directory this script was
	# invoked from.
	my $output_path = File::Spec->catpath($volume, $directory, $output_filename);

	# Three-argument open with the low-precedence "or", so that die() runs
	# when open() itself fails rather than being bound to the filename string.
	open(OUT, '>', $output_path)
	  or die "Error opening '$output_path' for writing: $!";

	print STDERR "Writing '$output_path'...";

	print OUT $header;

	# Turn every data line of WordBreakTest.txt into a Java comment echoing the
	# line, followed by an assertAnalyzesTo() call listing the expected tokens.
	for my $line (@tests) {
	  next if ($line =~ /^\s*(?:|\#.*)$/); # Skip blank or comment-only lines
	  # Example line: ÷ 0001 × 0300 ÷ # ÷ [0.2] <START OF HEADING> (Other) × [4.0] COMBINING GRAVE ACCENT (Extend_FE) ÷ [0.3]
	  # The part before '#' is the test sequence: hex code points separated by
	  # ÷ (break) or × (no break).  The capture has to span the whole sequence,
	  # hence the non-greedy ".*?" and the optional whitespace before '#'.
	  my ($sequence) = $line =~ /^(.*?)\s*\#/;
	  $line =~ s/\t/  /g; # Convert tabs to two spaces (no tabs allowed in Lucene source)
	  print OUT " // $line\n";
	  $sequence =~ s/\s*÷\s*$//; # Trim trailing break character

	  # Build the Java string literal for the whole input.  Each separator,
	  # including the still-present leading ÷, becomes the "\u" prefix of the
	  # code point that follows it.
	  my $test_string = $sequence;
	  $test_string =~ s/\s*÷\s*/\\u/g;
	  $test_string =~ s/\s*×\s*/\\u/g;
	  # Code points written with five or more hex digits become a surrogate pair
	  $test_string =~ s/\\u([0-9A-F]{5,})/join('', map { "\\u$_" } above_BMP_char_to_surrogates($1))/ge;
	  # Line feed, carriage return and double quote are written as \n, \r and \"
	  # rather than as \u escapes (NOTE: presumably because javac expands
	  # \uXXXX escapes before it parses string literals -- confirm).
	  $test_string =~ s/\\u000A/\\n/g;
	  $test_string =~ s/\\u000D/\\r/g;
	  $test_string =~ s/\\u0022/\\\"/g;
	  $sequence =~ s/^\s*÷\s*//; # Trim leading break character

	  # TODO: When upgrading JFlex to a version that supports Unicode 11.0+: remove the special case below for a Unicode 9.0 test data line that conflicts with TR#51 11.0 test data
	  # ÷ 200D ÷ 261D ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ_FE) ÷ [999.0] WHITE UP POINTING INDEX (E_Base) ÷ [0.3]
	  if ($sequence =~ /^200D\s*÷\s*261D$/) {
	    print OUT " // Skipping this test because it conflicts with TR#51 v11.0 rules.\n\n";
	    next;
	  }

	  # Each piece between two breaks is a candidate token; it is expected in
	  # the analyzer output only if it contains a wanted character.
	  my @tokens = ();
	  for my $candidate (split /\s*÷\s*/, $sequence) {
	    my @chars = ();
	    my $has_wanted_chars = 0;
	    my $prev_char_regional_indicator = 0;
	    while ($candidate =~ /([0-9A-F]+)/gi) {
	      my $hexchar = $1;
	      # Collect the UTF-16 code units of the candidate as 4-digit hex strings
	      if (4 == length($hexchar)) {
	        push @chars, $hexchar;
	      } else {
	        push @chars, above_BMP_char_to_surrogates($hexchar);
	      }
	      unless ($has_wanted_chars) {
	        my $codepoint = hex($hexchar);
	        if (defined($codepoints->[$codepoint])) {
	          $has_wanted_chars = 1;
	        } elsif (defined($regional_indicator_codepoints->[$codepoint])) {
	          # NOTE: the flag is not cleared by other characters in between, so
	          # this accepts any second regional indicator in the same candidate.
	          if (1 == $prev_char_regional_indicator) {
	            $has_wanted_chars = 1; # must be 2 regional indicators in a row
	          } else {
	            $prev_char_regional_indicator = 1;
	          }
	        }
	      }
	    }
	    if ($has_wanted_chars) {
	      # Render as a Java string literal; the double quote is backslash-escaped
	      push @tokens, '"'.join('', map { $_ eq "0022" ? "\\\"" : "\\u$_" } @chars).'"';
	    }
	  }
	  print OUT " assertAnalyzesTo(analyzer, \"${test_string}\",\n";
	  print OUT " new String[] { ";
	  print OUT join(", ", @tokens), " });\n\n";
	}

	# Close the test method and the class that the header template left open.
	print OUT " }\n}\n";
	# Buffered write errors only surface when the handle is closed, so the
	# result of close() is checked before success is reported.
	close(OUT) or die "Error closing '$output_path': $!";
	print STDERR "done.\n";


	# sub above_BMP_char_to_surrogates
	#
	# Given the hex spelling of a code point above the BMP (i.e., greater than
	# 0xFFFF), returns the two UTF-16 code units of its surrogate pair (high
	# surrogate first), each formatted as four uppercase hex digits.
	#
	# Assumption: the argument is a string of more than four hex digits
	#
	sub above_BMP_char_to_surrogates {
	  my ($hex_digits) = @_;
	  my $code_point = hex($hex_digits);
	  my @code_units = (
	    0xD800 + (($code_point - 0x10000) >> 10), # high (leading) surrogate
	    0xDC00 + ($code_point & 0x3FF),           # low (trailing) surrogate
	  );
	  return map { sprintf("%04X", $_) } @code_units;
	}


	# sub parse_Unicode_data_file
	#
	# Downloads the specified Unicode data file, parses it, and extracts the
	# code points assigned any of the given property values, defining the
	# corresponding array position in the passed-in target array.
	#
	# Takes in the following parameters:
	#
	# - URL of the Unicode data file to download and parse
	# - Reference to target array
	# - Reference to hash of property values to get code points for; the keys
	#   must be lowercase, because values read from the file are lowercased
	#   before the lookup
	#
	# Lines that are neither "XXXX ; value" nor "XXXX..YYYY ; value" are ignored.
	#
	sub parse_Unicode_data_file {
	  my $url = shift;
	  my $target = shift;
	  my $wanted_property_values = shift;
	  my $content = get_URL_content($url);
	  print STDERR "Parsing '$url'...";
	  my @lines = split /\r?\n/, $content;
	  for (@lines) {
	    s/\s*#.*//;         # Strip trailing comments
	    s/\s+$//;           # Strip trailing space
	    next unless (/\S/); # Skip empty lines
	    my ($start, $end, $property_value);
	    # Code points are written with four to six hex digits (up to 10FFFF);
	    # any amount of whitespace may surround the ';' field separator.
	    if (/^([0-9A-F]{4,6})\s*;\s*(.+)/i) {
	      # 00AA ; LATIN
	      $start = $end = hex $1;
	      $property_value = lc $2; # Property value names are case-insensitive
	    } elsif (/^([0-9A-F]{4,6})\.\.([0-9A-F]{4,6})\s*;\s*(.+)/i) {
	      # 0AE6..0AEF ; Gujarati
	      # The range separator is matched as two literal dots
	      $start = hex $1;
	      $end = hex $2;
	      $property_value = lc $3; # Property value names are case-insensitive
	    } else {
	      next;
	    }
	    if (defined($wanted_property_values->{$property_value})) {
	      for my $code_point ($start..$end) {
	        $target->[$code_point] = 1;
	      }
	    }
	  }
	  print STDERR "done.\n";
	}

	# sub get_URL_content
	#
	# Issues an HTTP GET for the given URL and returns the content of the
	# response.  Progress is reported on STDERR.  If the response is not
	# successful, the status line is printed and the script exits with status 1.
	#
	sub get_URL_content {
	  my ($url) = @_;
	  print STDERR "Retrieving '$url'...";
	  my $response
	    = LWP::UserAgent->new->request(HTTP::Request->new(GET => $url));
	  if (!$response->is_success) {
	    print STDERR "Failed to download '$url':\n\t",$response->status_line,"\n";
	    exit 1;
	  }
	  print STDERR "done.\n";
	  return $response->content;
	}