blob: dd16cb6d797d716423a2f8762edd57a5692bd97c [file] [log] [blame]
#!/usr/bin/perl
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
use warnings;
use strict;
use File::Spec;
use Getopt::Long;
use LWP::UserAgent;
# Split this script's own path into its parts: $script_name is used in the
# usage message and the generated Javadoc; $volume and $directory are used
# further down to place the generated Java file.
my ($volume, $directory, $script_name) = File::Spec->splitpath($0);

# The Unicode version is mandatory and must be exactly X.Y.Z. The pattern is
# anchored at both ends because the value is interpolated into the download
# URLs, the generated Java class name and the output filename; a value that
# merely *contains* X.Y.Z must not be accepted.
my $version = '';
unless (GetOptions("version=s" => \$version) && $version =~ /\A\d+\.\d+\.\d+\z/) {
    print STDERR "Usage: $script_name -v <version>\n";
    # Only explain the expected format when a (malformed) version was given.
    print STDERR "\tversion must be of the form X.Y.Z, e.g. 5.2.0\n"
        if ($version);
    exit 1;
}
# Locations of the Unicode Character Database files for the requested version.
my $url_prefix          = "http://www.unicode.org/Public/${version}/ucd";
my $scripts_url         = "${url_prefix}/Scripts.txt";
my $line_break_url      = "${url_prefix}/LineBreak.txt";
my $word_break_url      = "${url_prefix}/auxiliary/WordBreakProperty.txt";
my $word_break_test_url = "${url_prefix}/auxiliary/WordBreakTest.txt";

# The generated class is named after the version with dots replaced by
# underscores, and is written to a .java file of the same name.
(my $underscore_version = $version) =~ tr/./_/;
my $class_name      = "WordBreakTestUnicode_${underscore_version}";
my $output_filename = "${class_name}.java";
# Fixed preamble of the generated Java file: license, package and imports,
# class Javadoc, and the opening lines of the class and its test(Analyzer)
# method (the matching closing braces are printed after all test lines).
# The heredoc is double-quoted, so ${script_name}, ${url_prefix},
# $scripts_url, $line_break_url, $word_break_url and ${class_name} are
# interpolated; backslashes and the '@' of the annotation are escaped so
# that they reach the output literally.
my $header =<<"__HEADER__";
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.standard;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.junit.Ignore;
/**
* This class was automatically generated by ${script_name}
* from: ${url_prefix}/auxiliary/WordBreakTest.txt
*
* WordBreakTest.txt indicates the points in the provided character sequences
* at which conforming implementations must and must not break words. This
* class tests for expected token extraction from each of the test sequences
* in WordBreakTest.txt, where the expected tokens are those character
* sequences bounded by word breaks and containing at least one character
* from one of the following character sets:
*
* \\p{Script = Han} (From $scripts_url)
* \\p{Script = Hiragana}
* \\p{LineBreak = Complex_Context} (From $line_break_url)
* \\p{WordBreak = ALetter} (From $word_break_url)
* \\p{WordBreak = Hebrew_Letter}
* \\p{WordBreak = Katakana}
* \\p{WordBreak = Numeric} (Excludes full-width Arabic digits)
* [\\uFF10-\\uFF19] (Full-width Arabic digits)
*/
\@Ignore
public class ${class_name} extends BaseTokenStreamTestCase {
public void test(Analyzer analyzer) throws Exception {
__HEADER__
# $codepoints is a lookup table indexed by code point: a defined entry marks
# a character that makes the token containing it "wanted". It is seeded with
# the full-width Arabic digits U+FF10..U+FF19.
my $codepoints = [];
$codepoints->[$_] = 1 for 0xFF10 .. 0xFF19;

# Regional indicators are tracked in their own table because a single one
# is not enough to make a token wanted.
my $regional_indicator_codepoints = [];

# Property value names are given in lowercase so that they can be compared
# case-insensitively with the names used in the Unicode data files.
# 'sa' is the alias under which LineBreak.txt lists Complex_Context.
my %line_break_values = map { $_ => 1 } qw(sa);
my %script_values     = map { $_ => 1 } qw(han hiragana);
my %word_break_values = map { $_ => 1 } qw(
    aletter hebrew_letter katakana numeric
    e_base e_modifier glue_after_zwj e_base_gaz
);
my %regional_indicator_values = map { $_ => 1 } qw(regional_indicator);

parse_Unicode_data_file($line_break_url, $codepoints, \%line_break_values);
parse_Unicode_data_file($scripts_url,    $codepoints, \%script_values);
parse_Unicode_data_file($word_break_url, $codepoints, \%word_break_values);
parse_Unicode_data_file($word_break_url, $regional_indicator_codepoints,
                        \%regional_indicator_values);
# Download the word break test data and split it into lines.
my @tests = split /\r?\n/, get_URL_content($word_break_test_url);

# The generated Java file goes into the same directory as this script.
my $output_path = File::Spec->catpath($volume, $directory, $output_filename);

# Three-argument open keeps the mode separate from the path, and the
# low-precedence 'or' applies to the result of open() itself. (With '||' the
# die was attached to the always-true filename string and could never run.)
# The bareword handle OUT is kept because the rest of the script prints to it.
open(OUT, '>', $output_path)
    or die "Error opening '$output_path' for writing: $!";
print STDERR "Writing '$output_path'...";
print OUT $header;
# Emit one assertAnalyzesTo(...) call per test line of WordBreakTest.txt.
# In the test data, '÷' marks a position where a word break is required and
# '×' marks a position where a break is forbidden.
for my $line (@tests) {
    next if ($line =~ /^\s*(?:|\#.*)$/); # Skip blank or comment-only lines
    # Example line: ÷ 0001 × 0300 ÷ # ÷ [0.2] <START OF HEADING> (Other) × [4.0] COMBINING GRAVE ACCENT (Extend_FE) ÷ [0.3]
    my ($sequence) = $line =~ /^(.*?)\s*\#/; # Everything before the trailing comment
    $line =~ s/\t/ /g; # Replace each tab with a space (no tabs allowed in Lucene source)
    print OUT " // $line\n"; # Echo the original test line as a Java comment
    $sequence =~ s/\s*÷\s*$//; # Trim trailing break character

    # Build the Java string literal for the analyzer input: every break or
    # no-break marker becomes the '\u' prefix of the code point following it.
    my $test_string = $sequence;
    $test_string =~ s/\s*÷\s*/\\u/g;
    $test_string =~ s/\s*×\s*/\\u/g;
    # Code points written with five or more hex digits are replaced by their
    # UTF-16 surrogate pair.
    $test_string =~ s/\\u([0-9A-F]{5,})/join('', map { "\\u$_" } above_BMP_char_to_surrogates($1))/ge;
    # Line feed, carriage return and the double quote are written with
    # ordinary Java escapes rather than Unicode escapes.
    $test_string =~ s/\\u000A/\\n/g;
    $test_string =~ s/\\u000D/\\r/g;
    $test_string =~ s/\\u0022/\\\"/g;
    $sequence =~ s/^\s*÷\s*//; # Trim leading break character

    # TODO: When upgrading JFlex to a version that supports Unicode 11.0+: remove the special case below for a Unicode 9.0 test data line that conflicts with TR#51 11.0 test data
    # ÷ 200D ÷ 261D ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ_FE) ÷ [999.0] WHITE UP POINTING INDEX (E_Base) ÷ [0.3]
    if ($sequence =~ /^200D\s*÷\s*261D$/) {
        print OUT " // Skipping this test because it conflicts with TR#51 v11.0 rules.\n\n";
        next;
    }

    # Collect the expected tokens. A candidate is the run of code points
    # between two word breaks, so the sequence is split on the break marker
    # '÷' only; code points joined by '×' stay together in one candidate.
    my @tokens = ();
    for my $candidate (split /\s*÷\s*/, $sequence) {
        my @chars = ();
        my $has_wanted_chars = 0;
        my $prev_char_regional_indicator = 0;
        while ($candidate =~ /([0-9A-F]+)/gi) {
            my $hexchar = $1;
            if (4 == length($hexchar)) {
                push @chars, $hexchar;
            } else {
                push @chars, above_BMP_char_to_surrogates($hexchar);
            }
            unless ($has_wanted_chars) {
                my $codepoint = hex($hexchar);
                if (defined($codepoints->[$codepoint])) {
                    $has_wanted_chars = 1;
                } elsif (defined($regional_indicator_codepoints->[$codepoint])) {
                    if (1 == $prev_char_regional_indicator) {
                        $has_wanted_chars = 1; # must be 2 regional indicators in a row
                    } else {
                        $prev_char_regional_indicator = 1;
                    }
                }
            }
        }
        # Only candidates containing at least one wanted character are
        # expected as tokens; the double quote is backslash-escaped.
        if ($has_wanted_chars) {
            push @tokens, '"'.join('', map { $_ eq "0022" ? "\\\"" : "\\u$_" } @chars).'"';
        }
    }
    print OUT " assertAnalyzesTo(analyzer, \"${test_string}\",\n";
    print OUT " new String[] { ";
    print OUT join(", ", @tokens), " });\n\n";
}
# Close the test method and the class that were opened by the header.
print OUT " }\n}\n";
# Buffered write errors (e.g. a full disk) are only reported when the handle
# is closed, so a failed close must not be reported as success.
close(OUT) or die "Error closing the generated Java file: $!";
print STDERR "done.\n";
# sub above_BMP_char_to_surrogates
#
# Takes the hexadecimal notation of a code point above the BMP (i.e.,
# greater than 0xFFFF) and returns the matching UTF-16 surrogate pair as a
# two-element list (high surrogate, low surrogate) of four-digit uppercase
# hex strings.
#
# Assumption: the input string is a sequence of more than four hex digits
#
sub above_BMP_char_to_surrogates {
    my ($hex_digits) = @_;
    my $code_point = hex($hex_digits);

    # The upper ten bits of the offset from 0x10000 select the high
    # surrogate; the lowest ten bits of the code point select the low one.
    my @surrogates = (
        0xD800 + (($code_point - 0x10000) >> 10),
        0xDC00 + ($code_point & 0x3FF),
    );
    return map { sprintf("%04X", $_) } @surrogates;
}
# sub parse_Unicode_data_file
#
# Downloads the specified Unicode data file, parses it, and extracts the
# code points assigned any of the given property values, defining the
# corresponding array position in the passed-in target array.
#
# Takes in the following parameters:
#
# - URL of the Unicode data file to download and parse
# - Reference to target array
# - Reference to hash of (lowercase) property values to get code points for
#
# Progress messages are written to STDERR. Lines that are neither a single
# code point nor a code point range are ignored.
#
sub parse_Unicode_data_file {
    my ($url, $target, $wanted_property_values) = @_;
    my $content = get_URL_content($url);
    print STDERR "Parsing '$url'...";
    for my $line (split /\r?\n/, $content) {
        $line =~ s/\s*#.*//; # Strip trailing comments
        $line =~ s/\s+$//;   # Strip trailing space
        next unless ($line =~ /\S/); # Skip empty lines
        my ($start, $end, $property_value);
        # Code points are written with four to six hex digits (up to 10FFFF),
        # and the range separator is a literal '..'.
        if ($line =~ /^([0-9A-F]{4,6})\s*;\s*(.+)/i) {
            # 00AA ; LATIN
            $start = $end = hex $1;
            $property_value = lc $2; # Property value names are case-insensitive
        } elsif ($line =~ /^([0-9A-F]{4,6})\.\.([0-9A-F]{4,6})\s*;\s*(.+)/i) {
            # 0AE6..0AEF ; Gujarati
            $start = hex $1;
            $end = hex $2;
            $property_value = lc $3; # Property value names are case-insensitive
        } else {
            next;
        }
        if (defined($wanted_property_values->{$property_value})) {
            for my $code_point ($start..$end) {
                $target->[$code_point] = 1;
            }
        }
    }
    print STDERR "done.\n";
}
# sub get_URL_content
#
# Issues an HTTP GET for the given URL and returns the response content.
# Progress is reported on STDERR; if the request does not succeed, the
# status line is printed and the script exits with status 1.
#
sub get_URL_content {
    my ($url) = @_;
    print STDERR "Retrieving '$url'...";
    my $response
        = LWP::UserAgent->new->request(HTTP::Request->new(GET => $url));
    if (!$response->is_success) {
        print STDERR "Failed to download '$url':\n\t",$response->status_line,"\n";
        exit 1;
    }
    print STDERR "done.\n";
    return $response->content;
}