| #!/usr/bin/perl |
| |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| |
| =head1 NAME |
| |
| gen_word_break_data.pl - Generate word break table and tests |
| |
| =head1 SYNOPSIS |
| |
| perl gen_word_break_data.pl [-c] UCD_SRC_DIR |
| |
| =head1 DESCRIPTION |
| |
This script generates the tables to look up Unicode word break properties
| for the StandardTokenizer. It also converts the word break test suite in |
| the UCD to JSON. |
| |
| UCD_SRC_DIR should point to a directory containing the files |
| WordBreakProperty.txt, WordBreakTest.txt, and DerivedCoreProperties.txt from |
| the Unicode Character Database available at |
| L<http://www.unicode.org/Public/6.3.0/ucd/>. |
| |
| =head1 OUTPUT FILES |
| |
| modules/unicode/ucd/WordBreak.tab |
| modules/unicode/ucd/WordBreakTest.json |
| |
| =head1 OPTIONS |
| |
| =head2 -c |
| |
| Show total table size for different shift values |
| |
| =cut |
| |
| use strict; |
| |
| use Getopt::Std; |
| use JSON; |
| use UnicodeTable; |
| |
# Locations of the two generated files. The paths are relative, so they
# resolve against the current working directory.
my $output_dir     = '../../modules/unicode/ucd';
my $table_filename = $output_dir . '/WordBreak.tab';
my $tests_filename = $output_dir . '/WordBreakTest.json';
| |
# Translates Word_Break property names to the numeric values stored in the
# generated table. Several names deliberately share a value. The value 1
# does not appear here: it is used for the custom "ASingle" property that
# is assigned separately.
my %wb_map = (
    ( map { ( $_ => 0 ) } qw(CR LF Newline) ),
    Regional_Indicator => 0,    # These are symbols, so ignore them.
    ALetter            => 2,
    Hebrew_Letter      => 3,
    Numeric            => 4,
    Katakana           => 5,
    ExtendNumLet       => 6,
    ( map { ( $_ => 7 ) } qw(Extend Format) ),
    Single_Quote       => 8,
    Double_Quote       => 9,
    MidNumLet          => 10,
    MidLetter          => 11,
    MidNum             => 12,
);
| |
# Parse the command line: an optional -c switch followed by exactly one
# argument, the directory containing the UCD source files.
my %opts;
my $options_ok = getopts( 'c', \%opts );
if ( !$options_ok || @ARGV != 1 ) {
    print STDERR ("Usage: $0 [-c] UCD_SRC_DIR\n");

    # A bare "exit" would report success (status 0) to the calling shell
    # or build script even though nothing was generated.
    exit(1);
}

my $src_dir = $ARGV[0];
| |
# Read WordBreakProperty.txt as an enumerated property table; property names
# are converted to numbers through %wb_map.
my $wb = UnicodeTable->read(
    filename => join( '/', $src_dir, 'WordBreakProperty.txt' ),
    type     => 'Enumerated',
    map      => \%wb_map,
);

# Read the boolean Alphabetic property from DerivedCoreProperties.txt.
my $alpha = UnicodeTable->read(
    filename => join( '/', $src_dir, 'DerivedCoreProperties.txt' ),
    type     => 'Boolean',
    map      => { Alphabetic => 1 },
);
| |
# Many characters have no Word_Break property and form a single word on
# their own. To include them in the tokenizing process, they are given the
# custom property "ASingle", which has the value 1.
#
# For now, "ASingle" is assigned to every Alphabetic character without a
# Word_Break property: Ideographic, Hiragana, and Complex_Context.
#
# There are also non-alphabetic, numeric characters without a Word_Break
# property that possibly should be included:
#
# - Decimal numbers (General_Category Nd) with East_Asian_Width F (Fullwidth)
# - Other numbers (General_Category No)
#
# These are ignored for now.

for my $code_point ( 0 .. 0x30000 - 1 ) {

    # Leave code points alone that already have a non-zero table value.
    next if $wb->lookup($code_point);
    next unless $alpha->lookup($code_point);

    $wb->set( $code_point, 1 );
}
| |
# With -c, only show the total table sizes for different shift values and
# stop. The script exits before any output file is opened.
if ( $opts{c} ) {
    my @size_args = ( [ 2, 6 ], [ 3, 9 ] );
    $wb->calc_sizes(@size_args);
    exit;
}
| |
# Optimize for UTF-8: both levels use a shift of 6 (presumably because a
# UTF-8 continuation byte carries 6 payload bits -- confirm against the
# consumer of the tables).
my ( $row_shift, $plane_shift ) = ( 6, 6 );

# Separate table holding the values of the first 0x80 (ASCII) code points.
my $wb_ascii = UnicodeTable->new( table => [], max => 0 );

for my $ascii_code ( 0 .. 0x7F ) {
    $wb_ascii->set( $ascii_code, $wb->lookup($ascii_code) );
}

# Compress the full table in two stages and fetch the map table of the
# second stage.
my $wb_rows      = $wb->compress($row_shift);
my $wb_planes    = $wb_rows->compress_map($plane_shift);
my $wb_plane_map = $wb_planes->map_table;
| |
# Self check: for every code point up to U+10FFFF, a lookup through the
# compressed table must return the same value as the uncompressed table.

for my $code_point ( 0 .. 0x110000 - 1 ) {
    my $want = $wb->lookup($code_point);
    my $got  = $wb_rows->lookup($code_point);
    next if $want == $got;

    die("test for code point $code_point failed, want $want, got $got");
}
| |
# Write the generated table file: first the header stored in the DATA
# section of this script, then the four tables separated by blank lines.

open( my $out_file, '>', $table_filename )
    or die("$table_filename: $!\n");

# Copy the DATA section verbatim.
print $out_file (<DATA>);

$wb_ascii->dump( $out_file, 'wb_ascii' );
print $out_file ("\n");
$wb_plane_map->dump( $out_file, 'wb_plane_map' );
print $out_file ("\n");
$wb_planes->dump( $out_file, 'wb_planes' );
print $out_file ("\n");
$wb_rows->dump( $out_file, 'wb_rows' );

# Output is buffered, so a failed write (e.g. disk full) may only surface
# when the handle is closed. Do not leave a truncated table unnoticed.
close($out_file)
    or die("$table_filename: $!\n");
| |
# Convert the UCD word break test suite into a list of test cases.
#
# Each data line of WordBreakTest.txt alternates break markers and
# hexadecimal code points and may end with a "#" comment. The marker U+00F7
# (division sign) starts a new segment, the marker U+00D7 (multiplication
# sign) continues the current one.
#
# Every test case is a hash with two entries:
#   text  - the string formed by all code points of the line
#   words - the words expected in that string
#
# A segment only counts as a word if the table value of its first character
# is between 1 and 6.

# Decode with the strict UTF-8 layer. The lax ":utf8" layer does not
# validate its input.
open( my $in_file, '<:encoding(UTF-8)', "$src_dir/WordBreakTest.txt" )
    or die("$src_dir/WordBreakTest.txt: $!\n");

my @tests;

while ( my $line = <$in_file> ) {

    # Remove a trailing comment and trailing whitespace, including the
    # line terminator. Lines that are empty afterwards carry no test.
    $line =~ s/\s*(#.*)?\z//s;
    next if $line eq '';

    my @items = split( /\s+/, $line );
    my $word  = '';
    my $text  = '';
    my @words;

    # Items come in (marker, code point) pairs. The final marker of a line
    # has no code point after it and is skipped by the loop condition.
    for ( my $i = 0; $i + 1 < @items; $i += 2 ) {
        my $break = $items[$i];
        my $code  = hex( $items[ $i + 1 ] );
        my $chr   = chr($code);
        $text .= $chr;

        if ( $break eq "\xF7" ) {    # division sign
            if ( $word ne '' ) {
                push( @words, $word );
                $word = '';
            }

            # Named differently from the table object $wb to avoid
            # shadowing it inside the loop.
            my $wb_value = $wb->lookup($code);
            $word = $chr if $wb_value >= 1 && $wb_value <= 6;
        }
        elsif ( $break eq "\xD7" ) {    # multiplication sign
            # Only extend a word that is in progress.
            $word .= $chr if $word ne '';
        }
        else {
            die("invalid break character '$break'");
        }
    }

    push( @words, $word ) if $word ne '';

    push(
        @tests,
        {   text  => $text,
            words => \@words,
        }
    );
}

close($in_file);
| |
# Write the converted test cases as pretty-printed JSON. The encoder is
# created with the utf8 option, so the handle needs no encoding layer.
open( $out_file, '>', $tests_filename )
    or die("$tests_filename: $!\n");
print $out_file ( JSON->new->utf8->pretty->encode( \@tests ) );

# Buffered write errors may only be reported by close, so check it.
close($out_file)
    or die("$tests_filename: $!\n");
| |
| __DATA__ |
| /* |
| |
| This file is generated with devel/bin/gen_word_break_data.pl. DO NOT EDIT! |
| The contents of this file are derived from the Unicode Character Database, |
| version 6.3.0, available from http://www.unicode.org/Public/6.3.0/ucd/. |
| The Unicode copyright and permission notice follows. |
| |
| Copyright (c) 1991-2011 Unicode, Inc. All rights reserved. Distributed under |
| the Terms of Use in http://www.unicode.org/copyright.html. |
| |
| Permission is hereby granted, free of charge, to any person obtaining a copy of |
| the Unicode data files and any associated documentation (the "Data Files") or |
| Unicode software and any associated documentation (the "Software") to deal in |
| the Data Files or Software without restriction, including without limitation |
| the rights to use, copy, modify, merge, publish, distribute, and/or sell copies |
| of the Data Files or Software, and to permit persons to whom the Data Files or |
| Software are furnished to do so, provided that (a) the above copyright |
| notice(s) and this permission notice appear with all copies of the Data Files |
| or Software, (b) both the above copyright notice(s) and this permission notice |
| appear in associated documentation, and (c) there is clear notice in each |
| modified Data File or in the Software as well as in the documentation |
| associated with the Data File(s) or Software that the data or software has been |
| modified. |
| |
| THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
| EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
| MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD |
| PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN |
| THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL |
| DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, |
| WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING |
| OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA FILES OR |
| SOFTWARE. |
| |
| Except as contained in this notice, the name of a copyright holder shall not be |
| used in advertising or otherwise to promote the sale, use or other dealings in |
| these Data Files or Software without prior written authorization of the |
| copyright holder. |
| |
| */ |
| |