| #!/usr/bin/perl |
| |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| |
| use warnings; |
| use strict; |
| use File::Spec; |
| use Getopt::Long; |
| use LWP::UserAgent; |
| |
| my ($volume, $directory, $script_name) = File::Spec->splitpath($0); |
| |
| my $version = ''; |
| unless (GetOptions("version=s" => \$version) && $version =~ /\d+\.\d+/) { |
| print STDERR "Usage: $script_name -v <version>\n"; |
| print STDERR "\tversion must be of the form X.Y, e.g. 11.0\n" |
| if ($version); |
| exit 1; |
| } |
| my $url = "http://www.unicode.org/Public/emoji/${version}/emoji-test.txt"; |
| my $underscore_version = $version; |
| $underscore_version =~ s/\./_/g; |
| my $class_name = "EmojiTokenizationTestUnicode_${underscore_version}"; |
| my $output_filename = "${class_name}.java"; |
| my $header =<<"__HEADER__"; |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.lucene.analysis.standard; |
| |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| import org.junit.Ignore; |
| |
| /** |
| * This class was automatically generated by ${script_name} |
| * from: ${url} |
| * |
| * emoji-test.txt contains emoji char sequences, which are represented as |
| * tokenization tests in this class. |
| * |
| */ |
| \@Ignore |
| public class ${class_name} extends BaseTokenStreamTestCase { |
| |
| public void test(Analyzer analyzer) throws Exception { |
| for (int i = 0 ; i < tests.length ; i += 2) { |
| String test = tests[i + 1]; |
| try { |
| assertAnalyzesTo(analyzer, test, new String[] { test }, new String[] { "<EMOJI>" }); |
| } catch (Throwable t) { |
| throw new Exception("Failed to tokenize \\"" + tests[i] + "\\":", t); |
| } |
| } |
| } |
| |
| private String[] tests = new String[] { |
| __HEADER__ |
| |
| my @tests = split /\r?\n/, get_URL_content($url); |
| |
| my $output_path = File::Spec->catpath($volume, $directory, $output_filename); |
| open OUT, ">$output_path" |
| || die "Error opening '$output_path' for writing: $!"; |
| |
| print STDERR "Writing '$output_path'..."; |
| |
| print OUT $header; |
| |
| my $isFirst = 1; |
| for my $line (@tests) { |
| next if ($line =~ /^\s*(?:|\#.*)$/); # Skip blank or comment-only lines |
| |
| print OUT ",\n\n" unless $isFirst; |
| $isFirst = 0; |
| |
| # Example line: 1F46E 1F3FB 200D 2642 FE0F ; fully-qualified # 👮🏻♂️ man police officer: light skin tone |
| $line =~ s/\s+$//; # Trim trailing whitespace |
| $line =~ s/\t/ /g; # Convert tabs to two spaces (no tabs allowed in Lucene source) |
| print OUT " \"$line\",\n"; |
| my ($test_string) = $line =~ /^(.*?)\s*;/; |
| $test_string =~ s/([0-9A-F]+)/\\u$1/g; |
| $test_string =~ s/\\u([0-9A-F]{5,})/join('', map { "\\u$_" } above_BMP_char_to_surrogates($1))/ge; |
| $test_string =~ s/\s//g; |
| print OUT " \"${test_string}\""; |
| } |
| print OUT " };\n}\n"; |
| close OUT; |
| print STDERR "done.\n"; |
| |
| |
| # sub above_BMP_char_to_surrogates |
| # |
| # Converts hex references to chars above the BMP (i.e., greater than 0xFFFF) |
| # to the corresponding UTF-16 surrogate pair |
| # |
| # Assumption: input string is a sequence more than four hex digits |
| # |
| sub above_BMP_char_to_surrogates { |
| my $ch = hex(shift); |
| my $high_surrogate = 0xD800 + (($ch - 0x10000) >> 10); |
| my $low_surrogate = 0xDC00 + ($ch & 0x3FF); |
| return map { sprintf("%04X", $_) } ($high_surrogate, $low_surrogate); |
| } |
| |
| |
| # sub get_URL_content |
| # |
| # Retrieves and returns the content of the given URL. |
| # |
| sub get_URL_content { |
| my $url = shift; |
| print STDERR "Retrieving '$url'..."; |
| my $user_agent = LWP::UserAgent->new; |
| my $request = HTTP::Request->new(GET => $url); |
| my $response = $user_agent->request($request); |
| unless ($response->is_success) { |
| print STDERR "Failed to download '$url':\n\t",$response->status_line,"\n"; |
| exit 1; |
| } |
| print STDERR "done.\n"; |
| return $response->content; |
| } |