| #!/usr/bin/perl |
| # |
| # Generate a composition table and its lookup utilities, using Unicode data |
| # files as input. |
| # |
| # Input: UnicodeData.txt and CompositionExclusions.txt |
| # Output: unicode_norm_table.h and unicode_norm_hashfunc.h |
| # |
| # Copyright (c) 2000-2021, PostgreSQL Global Development Group |
| |
| use strict; |
| use warnings; |
| |
| use FindBin; |
| use lib "$FindBin::RealBin/../../tools/"; |
| use PerfectHash; |
| |
| my $output_table_file = "unicode_norm_table.h"; |
| my $output_func_file = "unicode_norm_hashfunc.h"; |
| |
| my $FH; |
| |
| # Read list of codes that should be excluded from re-composition. |
| my @composition_exclusion_codes = (); |
| open($FH, '<', "CompositionExclusions.txt") |
| or die "Could not open CompositionExclusions.txt: $!."; |
| while (my $line = <$FH>) |
| { |
| if ($line =~ /^([[:xdigit:]]+)/) |
| { |
| push @composition_exclusion_codes, $1; |
| } |
| } |
| close $FH; |
| |
| # Read entries from UnicodeData.txt into a list, and a hash table. We need |
| # three fields from each row: the codepoint, canonical combining class, |
| # and character decomposition mapping |
| my @characters = (); |
| my %character_hash = (); |
| open($FH, '<', "UnicodeData.txt") |
| or die "Could not open UnicodeData.txt: $!."; |
| while (my $line = <$FH>) |
| { |
| |
| # Split the line wanted and get the fields needed: |
| # - Unicode code value |
| # - Canonical Combining Class |
| # - Character Decomposition Mapping |
| my @elts = split(';', $line); |
| my $code = $elts[0]; |
| my $class = $elts[3]; |
| my $decomp = $elts[5]; |
| |
| # Skip codepoints above U+10FFFF. They cannot be represented in 4 bytes |
| # in UTF-8, and PostgreSQL doesn't support UTF-8 characters longer than |
| # 4 bytes. (This is just pro forma, as there aren't any such entries in |
| # the data file, currently.) |
| next if hex($code) > 0x10FFFF; |
| |
| # Skip characters with no decompositions and a class of 0, to reduce the |
| # table size. |
| next if $class eq '0' && $decomp eq ''; |
| |
| my %char_entry = (code => $code, class => $class, decomp => $decomp); |
| push(@characters, \%char_entry); |
| $character_hash{$code} = \%char_entry; |
| } |
| close $FH; |
| |
| my $num_characters = scalar @characters; |
| |
| # Start writing out the output files |
| open my $OT, '>', $output_table_file |
| or die "Could not open output file $output_table_file: $!\n"; |
| open my $OF, '>', $output_func_file |
| or die "Could not open output file $output_func_file: $!\n"; |
| |
| print $OT <<HEADER; |
| /*------------------------------------------------------------------------- |
| * |
| * unicode_norm_table.h |
| * Composition table used for Unicode normalization |
| * |
| * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * src/include/common/unicode_norm_table.h |
| * |
| *------------------------------------------------------------------------- |
| */ |
| |
| /* |
| * File auto-generated by src/common/unicode/generate-unicode_norm_table.pl, |
| * do not edit. There is deliberately not an #ifndef PG_UNICODE_NORM_TABLE_H |
| * here. |
| */ |
| typedef struct |
| { |
| uint32 codepoint; /* Unicode codepoint */ |
| uint8 comb_class; /* combining class of character */ |
| uint8 dec_size_flags; /* size and flags of decomposition code list */ |
| uint16 dec_index; /* index into UnicodeDecomp_codepoints, or the |
| * decomposition itself if DECOMP_INLINE */ |
| } pg_unicode_decomposition; |
| |
| #define DECOMP_NO_COMPOSE 0x80 /* don't use for re-composition */ |
| #define DECOMP_INLINE 0x40 /* decomposition is stored inline in |
| * dec_index */ |
| #define DECOMP_COMPAT 0x20 /* compatibility mapping */ |
| |
| #define DECOMPOSITION_SIZE(x) ((x)->dec_size_flags & 0x1F) |
| #define DECOMPOSITION_NO_COMPOSE(x) (((x)->dec_size_flags & (DECOMP_NO_COMPOSE | DECOMP_COMPAT)) != 0) |
| #define DECOMPOSITION_IS_INLINE(x) (((x)->dec_size_flags & DECOMP_INLINE) != 0) |
| #define DECOMPOSITION_IS_COMPAT(x) (((x)->dec_size_flags & DECOMP_COMPAT) != 0) |
| |
| /* Table of Unicode codepoints and their decompositions */ |
| static const pg_unicode_decomposition UnicodeDecompMain[$num_characters] = |
| { |
| HEADER |
| |
| print $OF <<HEADER; |
| /*------------------------------------------------------------------------- |
| * |
| * unicode_norm_hashfunc.h |
| * Perfect hash functions used for Unicode normalization |
| * |
| * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * src/include/common/unicode_norm_hashfunc.h |
| * |
| *------------------------------------------------------------------------- |
| */ |
| |
| /* |
| * File auto-generated by src/common/unicode/generate-unicode_norm_table.pl, |
| * do not edit. There is deliberately not an #ifndef PG_UNICODE_NORM_HASHFUNC_H |
| * here. |
| */ |
| |
| #include "common/unicode_norm_table.h" |
| |
| /* Typedef for perfect hash functions */ |
| typedef int (*cp_hash_func) (const void *key); |
| |
| /* Information for lookups with perfect hash functions */ |
| typedef struct |
| { |
| const pg_unicode_decomposition *decomps; |
| cp_hash_func hash; |
| int num_decomps; |
| } pg_unicode_decompinfo; |
| |
| typedef struct |
| { |
| const uint16 *inverse_lookup; |
| cp_hash_func hash; |
| int num_recomps; |
| } pg_unicode_recompinfo; |
| |
| HEADER |
| |
| my $decomp_index = 0; |
| my $decomp_string = ""; |
| my @dec_cp_packed; |
| my $main_index = 0; |
| my @rec_info; |
| |
| my $last_code = $characters[-1]->{code}; |
| foreach my $char (@characters) |
| { |
| my $code = $char->{code}; |
| my $class = $char->{class}; |
| my $decomp = $char->{decomp}; |
| |
| # Save the code point bytes as a string in network order. |
| push @dec_cp_packed, pack('N', hex($char->{code})); |
| |
| # The character decomposition mapping field in UnicodeData.txt is a list |
| # of unicode codepoints, separated by space. But it can be prefixed with |
| # so-called compatibility formatting tag, like "<compat>", or "<font>". |
| # The entries with compatibility formatting tags should not be used for |
| # re-composing characters during normalization, so flag them in the table. |
| # (The tag doesn't matter, only whether there is a tag or not) |
| my $compat = 0; |
| if ($decomp =~ /\<.*\>/) |
| { |
| $compat = 1; |
| $decomp =~ s/\<[^][]*\>//g; |
| } |
| my @decomp_elts = split(" ", $decomp); |
| |
| # Decomposition size |
| # Print size of decomposition |
| my $decomp_size = scalar(@decomp_elts); |
| die if $decomp_size > 0x1F; # to not overrun bitmask |
| |
| my $first_decomp = shift @decomp_elts; |
| |
| my $flags = ""; |
| my $comment = ""; |
| |
| if ($compat) |
| { |
| $flags .= " | DECOMP_COMPAT"; |
| } |
| |
| if ($decomp_size == 2) |
| { |
| # Should this be used for recomposition? |
| if ( $character_hash{$first_decomp} |
| && $character_hash{$first_decomp}->{class} != 0) |
| { |
| $flags .= " | DECOMP_NO_COMPOSE"; |
| $comment = "non-starter decomposition"; |
| } |
| else |
| { |
| foreach my $lcode (@composition_exclusion_codes) |
| { |
| if ($lcode eq $code) |
| { |
| $flags .= " | DECOMP_NO_COMPOSE"; |
| $comment = "in exclusion list"; |
| last; |
| } |
| } |
| } |
| |
| # Save info for recomposeable codepoints. |
| # Note that this MUST match the macro DECOMPOSITION_NO_COMPOSE in C |
| # above! See also the inverse lookup in recompose_code() found in |
| # src/common/unicode_norm.c. |
| if (!($flags =~ /DECOMP_COMPAT/ || $flags =~ /DECOMP_NO_COMPOSE/)) |
| { |
| push @rec_info, |
| { |
| code => $code, |
| main_index => $main_index, |
| first => $first_decomp, |
| second => $decomp_elts[0] |
| }; |
| } |
| } |
| |
| if ($decomp_size == 0) |
| { |
| print $OT "\t{0x$code, $class, 0$flags, 0}"; |
| } |
| elsif ($decomp_size == 1 && length($first_decomp) <= 4) |
| { |
| |
| # The decomposition consists of a single codepoint, and it fits |
| # in a uint16, so we can store it "inline" in the main table. |
| $flags .= " | DECOMP_INLINE"; |
| print $OT "\t{0x$code, $class, 1$flags, 0x$first_decomp}"; |
| } |
| else |
| { |
| print $OT "\t{0x$code, $class, $decomp_size$flags, $decomp_index}"; |
| |
| # Now save the decompositions into a dedicated area that will |
| # be written afterwards. First build the entry dedicated to |
| # a sub-table with the code and decomposition. |
| $decomp_string .= ",\n" if ($decomp_string ne ""); |
| |
| $decomp_string .= "\t /* $decomp_index */ 0x$first_decomp"; |
| foreach (@decomp_elts) |
| { |
| $decomp_string .= ", 0x$_"; |
| } |
| |
| $decomp_index = $decomp_index + $decomp_size; |
| } |
| |
| # Print a comma after all items except the last one. |
| print $OT "," unless ($code eq $last_code); |
| |
| print $OT "\t/* $comment */" if ($comment ne ""); |
| print $OT "\n"; |
| |
| $main_index++; |
| } |
| print $OT "\n};\n\n"; |
| |
| # Print the array of decomposed codes. |
| print $OT <<HEADER; |
| /* codepoints array */ |
| static const uint32 UnicodeDecomp_codepoints[$decomp_index] = |
| { |
| $decomp_string |
| }; |
| HEADER |
| |
| # Emit the definition of the decomp hash function. |
| my $dec_funcname = 'Decomp_hash_func'; |
| my $dec_func = PerfectHash::generate_hash_function(\@dec_cp_packed, |
| $dec_funcname, fixed_key_length => 4); |
| print $OF "/* Perfect hash function for decomposition */\n"; |
| print $OF "static $dec_func\n"; |
| |
| # Emit the structure that wraps the hash lookup information into |
| # one variable. |
| print $OF <<HEADER; |
| /* Hash lookup information for decomposition */ |
| static const pg_unicode_decompinfo UnicodeDecompInfo = |
| { |
| UnicodeDecompMain, |
| $dec_funcname, |
| $num_characters |
| }; |
| |
| HEADER |
| |
| # Find the lowest codepoint that decomposes to each recomposeable |
| # code pair and create a mapping to it. |
| my $recomp_string = ""; |
| my @rec_cp_packed; |
| my %seenit; |
| my $firstentry = 1; |
| foreach my $rec (sort recomp_sort @rec_info) |
| { |
| # The hash key is formed by concatenating the bytes of the two |
| # codepoints. See also recompose_code() in common/unicode_norm.c. |
| my $hashkey = (hex($rec->{first}) << 32) | hex($rec->{second}); |
| |
| # We are only interested in the lowest code point that decomposes |
| # to the given code pair. |
| next if $seenit{$hashkey}; |
| |
| # Save the hash key bytes in network order |
| push @rec_cp_packed, pack('Q>', $hashkey); |
| |
| # Append inverse lookup element |
| $recomp_string .= ",\n" if !$firstentry; |
| $recomp_string .= sprintf "\t/* U+%s+%s -> U+%s */ %s", |
| $rec->{first}, |
| $rec->{second}, |
| $rec->{code}, |
| $rec->{main_index}; |
| |
| $seenit{$hashkey} = 1; |
| $firstentry = 0; |
| } |
| |
| # Emit the inverse lookup array containing indexes into UnicodeDecompMain. |
| my $num_recomps = scalar @rec_cp_packed; |
| print $OF <<HEADER; |
| /* Inverse lookup array -- contains indexes into UnicodeDecompMain[] */ |
| static const uint16 RecompInverseLookup[$num_recomps] = |
| { |
| $recomp_string |
| }; |
| |
| HEADER |
| |
| # Emit the definition of the recomposition hash function. |
| my $rec_funcname = 'Recomp_hash_func'; |
| my $rec_func = |
| PerfectHash::generate_hash_function(\@rec_cp_packed, $rec_funcname, |
| fixed_key_length => 8); |
| print $OF "/* Perfect hash function for recomposition */\n"; |
| print $OF "static $rec_func\n"; |
| |
| # Emit the structure that wraps the hash lookup information into |
| # one variable. |
| print $OF <<HEADER; |
| /* Hash lookup information for recomposition */ |
| static const pg_unicode_recompinfo UnicodeRecompInfo = |
| { |
| RecompInverseLookup, |
| $rec_funcname, |
| $num_recomps |
| }; |
| HEADER |
| |
| close $OT; |
| close $OF; |
| |
| sub recomp_sort |
| { |
| my $a1 = hex($a->{first}); |
| my $b1 = hex($b->{first}); |
| |
| my $a2 = hex($a->{second}); |
| my $b2 = hex($b->{second}); |
| |
| # First sort by the first code point |
| return -1 if $a1 < $b1; |
| return 1 if $a1 > $b1; |
| |
| # Then sort by the second code point |
| return -1 if $a2 < $b2; |
| return 1 if $a2 > $b2; |
| |
| # Finally sort by the code point that decomposes into first and |
| # second ones. |
| my $acode = hex($a->{code}); |
| my $bcode = hex($b->{code}); |
| |
| return -1 if $acode < $bcode; |
| return 1 if $acode > $bcode; |
| |
| die "found duplicate entries of recomposeable code pairs"; |
| } |