| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| |
| package UnicodeTable; |
| use strict; |
| |
| use IO::File; |
| |
| =head1 NAME |
| |
| UnicodeTable - Create compressed Unicode tables for C programs |
| |
| =head1 SYNOPSIS |
| |
| my $table = UnicodeTable->read( |
| filename => $filename, |
| type => 'Enumerated', |
| map => \%map, |
| ); |
| |
| my $comp = $table->compress($shift); |
| |
| $comp->dump; |
| |
| =head1 DESCRIPTION |
| |
| This module creates compressed tables used to lookup Unicode properties |
| in C programs. To compress a table, it's split into blocks of a fixed |
| size. Identical blocks are discovered and only unique blocks are written to |
| the compressed table. An additional map table is created to map original |
| block indices to block ids. |
| |
| The map tables can then be compressed again using the same algorithm. |
| |
| Powers of two are used as block sizes, so the table indices to lookup values |
| can be computed using bit operations. |
| |
| =head1 METHODS |
| |
| =head2 new |
| |
| my $table = UnicodeTable->new( |
| values => \@values, |
| default => $default, |
| max => $max, |
| shift => $shift, |
| map_table => $map_table, |
| ); |
| |
| \@values is an arrayref with the table values, $max is the maximum value. |
| The default value for undefined table entries is $default or 0. |
| $shift and $map_table are used for compressed tables. |
| |
| =cut |
| |
| sub new { |
| my $class = shift; |
| |
| my $opts = @_ == 1 ? $_[0] : {@_}; |
| my $self = bless( {}, $class ); |
| |
| for my $name (qw(values default max shift map_table)) { |
| $self->{$name} = $opts->{$name}; |
| } |
| |
| $self->{default} = 0 |
| if !defined( $self->{default} ); |
| $self->{mask} = ( 1 << $self->{shift} ) - 1 |
| if defined( $self->{shift} ); |
| |
| return $self; |
| } |
| |
| =head2 read |
| |
| my $table = UnicodeTable->table( |
| filename => $filename, |
| type => $type, |
| map => \%map, |
| default => $default, |
| ); |
| |
| Reads a table from a Unicode data text file. $type is either 'Enumerated' |
| or 'Boolean'. \%map is a hashref that maps property values to integers. |
| For booleans, these integers are ORed. $default is the default value passed |
| to L<new>. |
| |
| =cut |
| |
| sub read { |
| my $class = shift; |
| |
| my $opts = @_ == 1 ? $_[0] : {@_}; |
| my $max = 0; |
| my @values; |
| |
| my $filename = $opts->{filename}; |
| die('filename missing') if !defined($filename); |
| my $type = $opts->{type} or die('type missing'); |
| my $map = $opts->{map} or die('map missing'); |
| $type = lc($type); |
| |
| my $file = IO::File->new; |
| $file->open( $filename, '<' ) |
| or die("$filename: $!\n"); |
| |
| while ( my $line = $file->getline ) { |
| $line =~ s/\s*(#.*)?\z//s; |
| next if $line eq ''; |
| my ( $chars, $prop ) = split( /\s*;\s*/, $line ); |
| my $val = $map->{$prop}; |
| |
| if ( !defined($val) ) { |
| if ( $type eq 'boolean' ) { |
| next; |
| } |
| else { |
| die("unknown property '$prop'"); |
| } |
| } |
| |
| $max = $val if $val > $max; |
| |
| if ( $chars =~ /^[0-9A-Fa-f]+\z/ ) { |
| my $i = hex($chars); |
| if ( $type eq 'boolean' ) { |
| $values[$i] |= $val; |
| } |
| else { |
| $values[$i] = $val; |
| } |
| } |
| elsif ( $chars =~ /^(\w+)\.\.(\w+)\z/ ) { |
| my ( $l, $r ) = ( hex($1), hex($2) ); |
| die("invalid range '$chars'") if $l > $r; |
| |
| for ( my $i = $l; $i <= $r; ++$i ) { |
| if ( $type eq 'boolean' ) { |
| $values[$i] |= $val; |
| } |
| else { |
| $values[$i] = $val; |
| } |
| } |
| } |
| else { |
| die("invalid range '$chars'"); |
| } |
| } |
| |
| close($file); |
| |
| return $class->new( |
| values => \@values, |
| default => $opts->{default}, |
| max => $max, |
| ); |
| } |
| |
| =head2 shift |
| |
| =head2 mask |
| |
| =head2 max |
| |
| =head2 map_table |
| |
| Accessors |
| |
| =cut |
| |
| sub shift { |
| return $_[0]->{shift}; |
| } |
| |
| sub mask { |
| return $_[0]->{mask}; |
| } |
| |
| sub max { |
| return $_[0]->{max}; |
| } |
| |
| sub map_table { |
| return $_[0]->{map_table}; |
| } |
| |
| =head2 set |
| |
| $table->set($i, $value); |
| |
| Set entry at index $i to $value. Don't use with compressed tables. |
| |
| =cut |
| |
| sub set { |
| my ( $self, $i, $value ) = @_; |
| $self->{values}[$i] = $value; |
| $self->{max} = $value if $value > $self->{max}; |
| } |
| |
| =head2 size |
| |
| my $size = $table->size; |
| |
| Storage size of the table in bytes. |
| |
| =cut |
| |
| sub size { |
| my $self = CORE::shift; |
| |
| my $max = $self->{max}; |
| my $bytes = $max < 0x100 ? 1 : $max < 0x10000 ? 2 : 4; |
| |
| return @{ $self->{values} } * $bytes; |
| } |
| |
| =head2 lookup |
| |
| my $value = $table->lookup($i); |
| |
| Lookup value at index $i. Also works with compressed tables. |
| |
| =cut |
| |
| sub lookup { |
| my ( $self, $i ) = @_; |
| |
| my $map_table = $self->{map_table}; |
| |
| if ($map_table) { |
| my $shift = $self->{shift}; |
| my $id = $map_table->lookup( $i >> $shift ); |
| my $j = ( $id << $shift ) | ( $i & $self->{mask} ); |
| return $self->{values}->[$j]; |
| } |
| else { |
| my $val = $self->{values}->[$i]; |
| return $self->{default} if !defined($val); |
| return $val; |
| } |
| } |
| |
| =head2 compress |
| |
| my $compressed_table = $table->compress($shift); |
| |
| Returns a compressed version of this table which is linked to a second |
| map table. Blocks of size (1 << $shift) are used. |
| |
| =cut |
| |
| sub compress { |
| my ( $self, $shift ) = @_; |
| |
| my $values = $self->{values}; |
| my $default = $self->{default}; |
| my $block_size = 1 << $shift; |
| my $block_count = 0; |
| my ( @compressed, @map_values, %block_ids ); |
| |
| for ( my $start = 0; $start < @$values; $start += $block_size ) { |
| my @block; |
| |
| for ( my $i = $start; $i < $start + $block_size; ++$i ) { |
| my $val = $values->[$i]; |
| $val = $default if !defined($val); |
| push( @block, $val ); |
| } |
| |
| my $str = join( '|', @block ); |
| my $block_id = $block_ids{$str}; |
| |
| if ( !defined($block_id) ) { |
| $block_id = $block_count++; |
| $block_ids{$str} = $block_id; |
| push( @compressed, @block ); |
| } |
| |
| push( @map_values, $block_id ); |
| } |
| |
| # find default for map table |
| |
| my @default_block; |
| |
| for ( my $i = 0; $i < $block_size; ++$i ) { |
| push( @default_block, $default ); |
| } |
| |
| my $str = join( '|', @default_block ); |
| my $default_block_id = $block_ids{$str}; |
| |
| if ( !defined($default_block_id) ) { |
| $default_block_id = $block_count++; |
| push( @compressed, @default_block ); |
| } |
| |
| my $map_table = UnicodeTable->new( |
| values => \@map_values, |
| default => $default_block_id, |
| max => $block_count - 1, |
| ); |
| |
| return UnicodeTable->new( |
| values => \@compressed, |
| default => $default, |
| max => $self->{max}, |
| shift => $shift, |
| map_table => $map_table, |
| ); |
| } |
| |
| =head2 compress_map |
| |
| my $map_table = $table->compress_map($shift); |
| |
| Compress the map table of a table for multi stage lookup. Returns the |
| compressed map table. |
| |
| =cut |
| |
| sub compress_map { |
| my ( $self, $shift ) = @_; |
| |
| my $comp = $self->{map_table}->compress($shift); |
| $self->{map_table} = $comp; |
| |
| return $comp; |
| } |
| |
| =head2 dump |
| |
| $table->dump($file, $name); |
| |
| Dump the table as C code to filehandle $file. The table name is $name. |
| |
| =cut |
| |
| sub dump { |
| my ( $self, $file, $name ) = @_; |
| |
| my $values = $self->{values}; |
| my $size = @$values; |
| my $uc_name = uc($name); |
| |
| print $file (<<"EOF") if $self->{shift}; |
| #define ${uc_name}_SHIFT $self->{shift} |
| #define ${uc_name}_MASK $self->{mask} |
| EOF |
| print $file (<<"EOF"); |
| #define ${uc_name}_SIZE $size |
| |
| EOF |
| |
| my $max = $self->{max}; |
| my $bits = $max < 0x100 ? 8 : $max < 0x10000 ? 16 : 32; |
| my $pad = length($max); |
| my $vals_per_line = int( 76 / ( $pad + 2 ) ); |
| |
| print $file ("static const uint${bits}_t $name\[$size] = {\n"); |
| |
| my $i = 0; |
| |
| while ( $i < $size ) { |
| printf $file ( " \%${pad}d", $values->[$i] ); |
| |
| my $max = $i + $vals_per_line; |
| $max = $size if $max > $size; |
| |
| while ( ++$i < $max ) { |
| printf $file ( ", \%${pad}d", $values->[$i] ); |
| } |
| |
| print $file (',') if $i < $size; |
| print $file ("\n"); |
| } |
| |
| print $file ("};\n"); |
| } |
| |
| sub calc_sizes { |
| my ( $self, $range2, $range1 ) = @_; |
| |
| for ( my $shift2 = $range2->[0]; $shift2 <= $range2->[1]; ++$shift2 ) { |
| my $comp = $self->compress($shift2); |
| my $map_table = $comp->map_table; |
| my $size3 = $comp->size; |
| |
| for ( my $shift1 = $range1->[0]; $shift1 <= $range1->[1]; ++$shift1 ) |
| { |
| my $comp_map_table = $map_table->compress($shift1); |
| |
| my $size1 = $comp_map_table->map_table->size; |
| my $size2 = $comp_map_table->size; |
| |
| printf( |
| "shift %2d %2d: %6d + %6d + %6d = %7d bytes, %4d %4d\n", |
| $shift1, $shift2, |
| $size1, $size2, |
| $size3, $size1 + $size2 + $size3, |
| $comp_map_table->map_table->max, $comp_map_table->max, |
| ); |
| } |
| |
| print("\n"); |
| } |
| } |
| |
| =head1 AUTHOR |
| |
| Nick Wellnhofer <wellnhofer@aevum.de> |
| |
| =cut |
| |
| 1; |