src/common/unicode/generate-unicode_norm_table.pl - cloudberry - Git at Google

 #!/usr/bin/perl
 #
 # Generate a composition table and its lookup utilities, using Unicode data
 # files as input.
 #
 # Input: UnicodeData.txt and CompositionExclusions.txt
 # Output: unicode_norm_table.h and unicode_norm_hashfunc.h
 #
 # Copyright (c) 2000-2021, PostgreSQL Global Development Group

 use strict;
 use warnings;

 use FindBin;
 use lib "$FindBin::RealBin/../../tools/";
 use PerfectHash;

 my $output_table_file = "unicode_norm_table.h";
 my $output_func_file  = "unicode_norm_hashfunc.h";

 my $FH;

 # Collect the codepoints named in CompositionExclusions.txt; they are
 # consulted later to decide which entries may not be used for
 # re-composition.  Only lines that start with hexadecimal digits contribute
 # an entry (that leading run of digits); every other line is skipped.
 my @composition_exclusion_codes = ();
 open($FH, '<', "CompositionExclusions.txt")
   or die "Could not open CompositionExclusions.txt: $!.";
 while (my $entry = <$FH>)
 {
 	next unless $entry =~ /^([[:xdigit:]]+)/;
 	push @composition_exclusion_codes, $1;
 }
 close $FH;

 # Load UnicodeData.txt.  Each retained row becomes one record holding the
 # codepoint, its canonical combining class and its decomposition mapping.
 # The same record is reachable both from @characters (in file order) and
 # from %character_hash (keyed by the codepoint string).
 my @characters     = ();
 my %character_hash = ();
 open($FH, '<', "UnicodeData.txt")
   or die "Could not open UnicodeData.txt: $!.";
 while (my $row = <$FH>)
 {

 	# Rows are ';'-separated.  Pick out the three fields we need:
 	# - field 0: Unicode code value
 	# - field 3: Canonical Combining Class
 	# - field 5: Character Decomposition Mapping
 	my ($code, $class, $decomp) = (split(';', $row))[ 0, 3, 5 ];

 	# Anything above U+10FFFF cannot be encoded in 4 bytes of UTF-8, which
 	# is the longest form PostgreSQL supports.  (Pro forma only: the data
 	# file does not currently contain such entries.)
 	next if hex($code) > 0x10FFFF;

 	# Characters of class 0 that have no decomposition would only bloat the
 	# table, so leave them out.
 	next if $class eq '0' && $decomp eq '';

 	my $char_entry = { code => $code, class => $class, decomp => $decomp };
 	push(@characters, $char_entry);
 	$character_hash{$code} = $char_entry;
 }
 close $FH;

 my $num_characters = scalar @characters;

 # Start writing out the output files
 open my $OT, '>', $output_table_file
   or die "Could not open output file $output_table_file: $!\n";
 open my $OF, '>', $output_func_file
   or die "Could not open output file $output_func_file: $!\n";

 print $OT <<HEADER;
 /*-------------------------------------------------------------------------
  *
  * unicode_norm_table.h
  *	  Composition table used for Unicode normalization
  *
  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * src/include/common/unicode_norm_table.h
  *
  *-------------------------------------------------------------------------
  */

 /*
  * File auto-generated by src/common/unicode/generate-unicode_norm_table.pl,
  * do not edit. There is deliberately not an #ifndef PG_UNICODE_NORM_TABLE_H
  * here.
  */
 typedef struct
 {
 	uint32		codepoint;		/* Unicode codepoint */
 	uint8		comb_class;		/* combining class of character */
 	uint8		dec_size_flags; /* size and flags of decomposition code list */
 	uint16		dec_index;		/* index into UnicodeDecomp_codepoints, or the
 								 * decomposition itself if DECOMP_INLINE */
 } pg_unicode_decomposition;

 #define DECOMP_NO_COMPOSE	0x80	/* don't use for re-composition */
 #define DECOMP_INLINE		0x40	/* decomposition is stored inline in
 									 * dec_index */
 #define DECOMP_COMPAT		0x20	/* compatibility mapping */

 #define DECOMPOSITION_SIZE(x) ((x)->dec_size_flags & 0x1F)
 #define DECOMPOSITION_NO_COMPOSE(x) (((x)->dec_size_flags & (DECOMP_NO_COMPOSE | DECOMP_COMPAT)) != 0)
 #define DECOMPOSITION_IS_INLINE(x) (((x)->dec_size_flags & DECOMP_INLINE) != 0)
 #define DECOMPOSITION_IS_COMPAT(x) (((x)->dec_size_flags & DECOMP_COMPAT) != 0)

 /* Table of Unicode codepoints and their decompositions */
 static const pg_unicode_decomposition UnicodeDecompMain[$num_characters] =
 {
 HEADER

 print $OF <<HEADER;
 /*-------------------------------------------------------------------------
  *
  * unicode_norm_hashfunc.h
  *	  Perfect hash functions used for Unicode normalization
  *
  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * src/include/common/unicode_norm_hashfunc.h
  *
  *-------------------------------------------------------------------------
  */

 /*
  * File auto-generated by src/common/unicode/generate-unicode_norm_table.pl,
  * do not edit. There is deliberately not an #ifndef PG_UNICODE_NORM_HASHFUNC_H
  * here.
  */

 #include "common/unicode_norm_table.h"

 /* Typedef for perfect hash functions */
 typedef int (*cp_hash_func) (const void *key);

 /* Information for lookups with perfect hash functions */
 typedef struct
 {
 	const pg_unicode_decomposition *decomps;
 	cp_hash_func	hash;
 	int		num_decomps;
 } pg_unicode_decompinfo;

 typedef struct
 {
 	const uint16	*inverse_lookup;
 	cp_hash_func	hash;
 	int		num_recomps;
 } pg_unicode_recompinfo;

 HEADER

 # Emit one UnicodeDecompMain[] row per entry of @characters, in order.
 # While doing so, accumulate what the later sections of this script need:
 #   @dec_cp_packed - each codepoint packed as a 4-byte big-endian string
 #   $decomp_string - text of the out-of-line decomposition array
 #   $decomp_index  - number of codepoints placed in that array so far
 #   @rec_info      - entries that may be used for re-composition
 my $decomp_index  = 0;
 my $decomp_string = "";
 my @dec_cp_packed;
 my $main_index = 0;
 my @rec_info;

 # Index the exclusion list once, so that the per-character check below is
 # a hash lookup instead of a scan over the whole list.
 my %is_excluded = map { $_ => 1 } @composition_exclusion_codes;

 my $last_code = $characters[-1]->{code};
 foreach my $char (@characters)
 {
 	my $code   = $char->{code};
 	my $class  = $char->{class};
 	my $decomp = $char->{decomp};

 	# Save the code point bytes as a string in network order.
 	push @dec_cp_packed, pack('N', hex($code));

 	# The character decomposition mapping field in UnicodeData.txt is a list
 	# of unicode codepoints, separated by space. But it can be prefixed with
 	# so-called compatibility formatting tag, like "<compat>", or "<font>".
 	# The entries with compatibility formatting tags should not be used for
 	# re-composing characters during normalization, so flag them in the table.
 	# (The tag doesn't matter, only whether there is a tag or not)
 	my $compat = 0;
 	if ($decomp =~ /\<.*\>/)
 	{
 		$compat = 1;
 		$decomp =~ s/\<[^][]*\>//g;
 	}
 	my @decomp_elts = split(" ", $decomp);

 	# The size shares a byte with the flag bits and only has the low five
 	# bits (mask 0x1F) to itself, so anything larger cannot be encoded.
 	my $decomp_size = scalar(@decomp_elts);
 	die "decomposition of U+$code has $decomp_size codepoints, "
 	  . "more than the size field can hold"
 	  if $decomp_size > 0x1F;

 	my $first_decomp = shift @decomp_elts;

 	my $flags      = "";
 	my $comment    = "";
 	my $no_compose = 0;

 	if ($compat)
 	{
 		$flags .= " | DECOMP_COMPAT";
 	}

 	if ($decomp_size == 2)
 	{
 		# Should this be used for recomposition?  Not if the first
 		# codepoint of the pair is a known character with a non-zero
 		# combining class, and not if the character is on the exclusion
 		# list.
 		if (   $character_hash{$first_decomp}
 			&& $character_hash{$first_decomp}->{class} != 0)
 		{
 			$no_compose = 1;
 			$comment    = "non-starter decomposition";
 		}
 		elsif ($is_excluded{$code})
 		{
 			$no_compose = 1;
 			$comment    = "in exclusion list";
 		}
 		$flags .= " | DECOMP_NO_COMPOSE" if $no_compose;

 		# Save info for recomposeable codepoints.
 		# Note that this MUST match the macro DECOMPOSITION_NO_COMPOSE in C
 		# above!  See also the inverse lookup in recompose_code() found in
 		# src/common/unicode_norm.c.
 		if (!$compat && !$no_compose)
 		{
 			push @rec_info,
 			  {
 				code       => $code,
 				main_index => $main_index,
 				first      => $first_decomp,
 				second     => $decomp_elts[0]
 			  };
 		}
 	}

 	if ($decomp_size == 0)
 	{
 		print $OT "\t{0x$code, $class, 0$flags, 0}";
 	}
 	elsif ($decomp_size == 1 && length($first_decomp) <= 4)
 	{

 		# The decomposition consists of a single codepoint, and it fits
 		# in a uint16, so we can store it "inline" in the main table.
 		$flags .= " | DECOMP_INLINE";
 		print $OT "\t{0x$code, $class, 1$flags, 0x$first_decomp}";
 	}
 	else
 	{
 		# The offset is emitted into the uint16 dec_index field; refuse to
 		# generate a table whose offsets would not fit there.
 		die "decomposition index $decomp_index for U+$code "
 		  . "does not fit in uint16"
 		  if $decomp_index > 0xFFFF;

 		print $OT "\t{0x$code, $class, $decomp_size$flags, $decomp_index}";

 		# Now save the decompositions into a dedicated area that will
 		# be written afterwards.  Each entry starts with a comment giving
 		# its offset, followed by all of its codepoints.
 		$decomp_string .= ",\n" if ($decomp_string ne "");

 		$decomp_string .= "\t /* $decomp_index */ 0x$first_decomp";
 		$decomp_string .= join("", map { ", 0x$_" } @decomp_elts);

 		$decomp_index = $decomp_index + $decomp_size;
 	}

 	# Print a comma after all items except the last one.
 	print $OT "," unless ($code eq $last_code);

 	print $OT "\t/* $comment */" if ($comment ne "");
 	print $OT "\n";

 	$main_index++;
 }
 print $OT "\n};\n\n";

 # Print the array of decomposed codes.
 print $OT <<HEADER;
 /* codepoints array  */
 static const uint32 UnicodeDecomp_codepoints[$decomp_index] =
 {
 $decomp_string
 };
 HEADER

 # Emit the definition of the decomp hash function.
 my $dec_funcname = 'Decomp_hash_func';
 my $dec_func     = PerfectHash::generate_hash_function(\@dec_cp_packed,
 	$dec_funcname, fixed_key_length => 4);
 print $OF "/* Perfect hash function for decomposition */\n";
 print $OF "static $dec_func\n";

 # Emit the structure that wraps the hash lookup information into
 # one variable.
 print $OF <<HEADER;
 /* Hash lookup information for decomposition */
 static const pg_unicode_decompinfo UnicodeDecompInfo =
 {
 	UnicodeDecompMain,
 	$dec_funcname,
 	$num_characters
 };

 HEADER

 # Build the inverse lookup: for every recomposeable code pair, keep only
 # the lowest codepoint that decomposes to it.  The candidates are visited
 # in recomp_sort order, so the first entry seen for a pair is the one kept.
 my @rec_cp_packed;
 my %seenit;
 my @lookup_rows;
 foreach my $rec (sort recomp_sort @rec_info)
 {
 	# The hash key is the two codepoints side by side in one integer:
 	# "first" shifted left by 32 bits, OR-ed with "second".  See also
 	# recompose_code() in common/unicode_norm.c.
 	my $hashkey = (hex($rec->{first}) << 32) | hex($rec->{second});

 	# A pair that was already recorded belongs to a lower codepoint.
 	next if $seenit{$hashkey}++;

 	# Save the hash key bytes in network order
 	push @rec_cp_packed, pack('Q>', $hashkey);

 	# One inverse lookup element: an explanatory comment followed by the
 	# index of the composed character in UnicodeDecompMain.
 	push @lookup_rows,
 	  sprintf("\t/* U+%s+%s -> U+%s */ %s",
 		$rec->{first}, $rec->{second}, $rec->{code}, $rec->{main_index});
 }
 my $recomp_string = join(",\n", @lookup_rows);

 # Emit the inverse lookup array containing indexes into UnicodeDecompMain.
 my $num_recomps = scalar @rec_cp_packed;
 print $OF <<HEADER;
 /* Inverse lookup array -- contains indexes into UnicodeDecompMain[] */
 static const uint16 RecompInverseLookup[$num_recomps] =
 {
 $recomp_string
 };

 HEADER

 # Emit the definition of the recomposition hash function.
 my $rec_funcname = 'Recomp_hash_func';
 my $rec_func =
   PerfectHash::generate_hash_function(\@rec_cp_packed, $rec_funcname,
 	fixed_key_length => 8);
 print $OF "/* Perfect hash function for recomposition */\n";
 print $OF "static $rec_func\n";

 # Emit the structure that wraps the hash lookup information into
 # one variable.
 print $OF <<HEADER;
 /* Hash lookup information for recomposition */
 static const pg_unicode_recompinfo UnicodeRecompInfo =
 {
 	RecompInverseLookup,
 	$rec_funcname,
 	$num_recomps
 };
 HEADER

 close $OT;
 close $OF;

 # Comparator for "sort": orders two re-composition records ($a and $b, as
 # set by sort) numerically by their hexadecimal "first" field, then by
 # "second", then by "code".  Two records that agree on all three fields are
 # treated as a fatal error rather than as equal.
 sub recomp_sort
 {
 	foreach my $field (qw(first second code))
 	{
 		my $order = hex($a->{$field}) <=> hex($b->{$field});
 		return $order if $order != 0;
 	}

 	die "found duplicate entries of recomposeable code pairs";
 }
	#!/usr/bin/perl
	#
	# Generate a composition table and its lookup utilities, using Unicode data
	# files as input.
	#
	# Input: UnicodeData.txt and CompositionExclusions.txt
	# Output: unicode_norm_table.h and unicode_norm_hashfunc.h
	#
	# Copyright (c) 2000-2021, PostgreSQL Global Development Group

	use strict;
	use warnings;

	use FindBin;
	use lib "$FindBin::RealBin/../../tools/";
	use PerfectHash;

	my $output_table_file = "unicode_norm_table.h";
	my $output_func_file = "unicode_norm_hashfunc.h";

	my $FH;

	# Collect the codepoints named in CompositionExclusions.txt; they are
	# consulted later to decide which entries may not be used for
	# re-composition.  Only lines that start with hexadecimal digits
	# contribute an entry (that leading run of digits); every other line is
	# skipped.
	my @composition_exclusion_codes = ();
	open($FH, '<', "CompositionExclusions.txt")
	  or die "Could not open CompositionExclusions.txt: $!.";
	while (my $entry = <$FH>)
	{
		next unless $entry =~ /^([[:xdigit:]]+)/;
		push @composition_exclusion_codes, $1;
	}
	close $FH;

	# Load UnicodeData.txt.  Each retained row becomes one record holding the
	# codepoint, its canonical combining class and its decomposition mapping.
	# The same record is reachable both from @characters (in file order) and
	# from %character_hash (keyed by the codepoint string).
	my @characters = ();
	my %character_hash = ();
	open($FH, '<', "UnicodeData.txt")
	  or die "Could not open UnicodeData.txt: $!.";
	while (my $row = <$FH>)
	{

		# Rows are ';'-separated.  Pick out the three fields we need:
		# - field 0: Unicode code value
		# - field 3: Canonical Combining Class
		# - field 5: Character Decomposition Mapping
		my ($code, $class, $decomp) = (split(';', $row))[ 0, 3, 5 ];

		# Anything above U+10FFFF cannot be encoded in 4 bytes of UTF-8,
		# which is the longest form PostgreSQL supports.  (Pro forma only:
		# the data file does not currently contain such entries.)
		next if hex($code) > 0x10FFFF;

		# Characters of class 0 that have no decomposition would only bloat
		# the table, so leave them out.
		next if $class eq '0' && $decomp eq '';

		my $char_entry = { code => $code, class => $class, decomp => $decomp };
		push(@characters, $char_entry);
		$character_hash{$code} = $char_entry;
	}
	close $FH;

	my $num_characters = scalar @characters;

	# Start writing out the output files
	open my $OT, '>', $output_table_file
	or die "Could not open output file $output_table_file: $!\n";
	open my $OF, '>', $output_func_file
	or die "Could not open output file $output_func_file: $!\n";

	print $OT <<HEADER;
	/*-------------------------------------------------------------------------
	*
	* unicode_norm_table.h
	* Composition table used for Unicode normalization
	*
	* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
	* Portions Copyright (c) 1994, Regents of the University of California
	*
	* src/include/common/unicode_norm_table.h
	*
	*-------------------------------------------------------------------------
	*/

	/*
	* File auto-generated by src/common/unicode/generate-unicode_norm_table.pl,
	* do not edit. There is deliberately not an #ifndef PG_UNICODE_NORM_TABLE_H
	* here.
	*/
	typedef struct
	{
	uint32 codepoint; /* Unicode codepoint */
	uint8 comb_class; /* combining class of character */
	uint8 dec_size_flags; /* size and flags of decomposition code list */
	uint16 dec_index; /* index into UnicodeDecomp_codepoints, or the
	* decomposition itself if DECOMP_INLINE */
	} pg_unicode_decomposition;

	#define DECOMP_NO_COMPOSE 0x80 /* don't use for re-composition */
	#define DECOMP_INLINE 0x40 /* decomposition is stored inline in
	* dec_index */
	#define DECOMP_COMPAT 0x20 /* compatibility mapping */

	#define DECOMPOSITION_SIZE(x) ((x)->dec_size_flags & 0x1F)
	#define DECOMPOSITION_NO_COMPOSE(x) (((x)->dec_size_flags & (DECOMP_NO_COMPOSE \| DECOMP_COMPAT)) != 0)
	#define DECOMPOSITION_IS_INLINE(x) (((x)->dec_size_flags & DECOMP_INLINE) != 0)
	#define DECOMPOSITION_IS_COMPAT(x) (((x)->dec_size_flags & DECOMP_COMPAT) != 0)

	/* Table of Unicode codepoints and their decompositions */
	static const pg_unicode_decomposition UnicodeDecompMain[$num_characters] =
	{
	HEADER

	# Write the fixed preamble of unicode_norm_hashfunc.h to $OF: the file
	# banner, the include of the table header, the function-pointer typedef
	# for the perfect hash functions, and the two lookup-info structs.
	# cp_hash_func must be a pointer-to-function type taking a pointer to the
	# key, since the structs below store it in their "hash" member.
	print $OF <<HEADER;
	/*-------------------------------------------------------------------------
	*
	* unicode_norm_hashfunc.h
	* Perfect hash functions used for Unicode normalization
	*
	* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
	* Portions Copyright (c) 1994, Regents of the University of California
	*
	* src/include/common/unicode_norm_hashfunc.h
	*
	*-------------------------------------------------------------------------
	*/

	/*
	* File auto-generated by src/common/unicode/generate-unicode_norm_table.pl,
	* do not edit. There is deliberately not an #ifndef PG_UNICODE_NORM_HASHFUNC_H
	* here.
	*/

	#include "common/unicode_norm_table.h"

	/* Typedef for perfect hash functions */
	typedef int (*cp_hash_func) (const void *key);

	/* Information for lookups with perfect hash functions */
	typedef struct
	{
	const pg_unicode_decomposition *decomps;
	cp_hash_func hash;
	int num_decomps;
	} pg_unicode_decompinfo;

	typedef struct
	{
	const uint16 *inverse_lookup;
	cp_hash_func hash;
	int num_recomps;
	} pg_unicode_recompinfo;

	HEADER

	# Emit one UnicodeDecompMain[] row per entry of @characters, in order.
	# While doing so, accumulate what the later sections of this script need:
	#   @dec_cp_packed - each codepoint packed as a 4-byte big-endian string
	#   $decomp_string - text of the out-of-line decomposition array
	#   $decomp_index  - number of codepoints placed in that array so far
	#   @rec_info      - entries that may be used for re-composition
	my $decomp_index = 0;
	my $decomp_string = "";
	my @dec_cp_packed;
	my $main_index = 0;
	my @rec_info;

	# Index the exclusion list once, so that the per-character check below is
	# a hash lookup instead of a scan over the whole list.
	my %is_excluded = map { $_ => 1 } @composition_exclusion_codes;

	my $last_code = $characters[-1]->{code};
	foreach my $char (@characters)
	{
		my $code = $char->{code};
		my $class = $char->{class};
		my $decomp = $char->{decomp};

		# Save the code point bytes as a string in network order.
		push @dec_cp_packed, pack('N', hex($code));

		# The character decomposition mapping field in UnicodeData.txt is a
		# list of unicode codepoints, separated by space. But it can be
		# prefixed with so-called compatibility formatting tag, like
		# "<compat>", or "<font>". The entries with compatibility formatting
		# tags should not be used for re-composing characters during
		# normalization, so flag them in the table. (The tag doesn't matter,
		# only whether there is a tag or not)
		my $compat = 0;
		if ($decomp =~ /\<.*\>/)
		{
			$compat = 1;
			$decomp =~ s/\<[^][]*\>//g;
		}
		my @decomp_elts = split(" ", $decomp);

		# The size shares a byte with the flag bits and only has the low
		# five bits (mask 0x1F) to itself, so anything larger cannot be
		# encoded.
		my $decomp_size = scalar(@decomp_elts);
		die "decomposition of U+$code has $decomp_size codepoints, "
		  . "more than the size field can hold"
		  if $decomp_size > 0x1F;

		my $first_decomp = shift @decomp_elts;

		my $flags = "";
		my $comment = "";
		my $no_compose = 0;

		if ($compat)
		{
			$flags .= " | DECOMP_COMPAT";
		}

		if ($decomp_size == 2)
		{
			# Should this be used for recomposition?  Not if the first
			# codepoint of the pair is a known character with a non-zero
			# combining class, and not if the character is on the exclusion
			# list.
			if (   $character_hash{$first_decomp}
				&& $character_hash{$first_decomp}->{class} != 0)
			{
				$no_compose = 1;
				$comment = "non-starter decomposition";
			}
			elsif ($is_excluded{$code})
			{
				$no_compose = 1;
				$comment = "in exclusion list";
			}
			$flags .= " | DECOMP_NO_COMPOSE" if $no_compose;

			# Save info for recomposeable codepoints.
			# Note that this MUST match the macro DECOMPOSITION_NO_COMPOSE
			# in C above! See also the inverse lookup in recompose_code()
			# found in src/common/unicode_norm.c.
			if (!$compat && !$no_compose)
			{
				push @rec_info,
				  {
					code => $code,
					main_index => $main_index,
					first => $first_decomp,
					second => $decomp_elts[0]
				  };
			}
		}

		if ($decomp_size == 0)
		{
			print $OT "\t{0x$code, $class, 0$flags, 0}";
		}
		elsif ($decomp_size == 1 && length($first_decomp) <= 4)
		{

			# The decomposition consists of a single codepoint, and it fits
			# in a uint16, so we can store it "inline" in the main table.
			$flags .= " | DECOMP_INLINE";
			print $OT "\t{0x$code, $class, 1$flags, 0x$first_decomp}";
		}
		else
		{
			# The offset is emitted into the uint16 dec_index field; refuse
			# to generate a table whose offsets would not fit there.
			die "decomposition index $decomp_index for U+$code "
			  . "does not fit in uint16"
			  if $decomp_index > 0xFFFF;

			print $OT "\t{0x$code, $class, $decomp_size$flags, $decomp_index}";

			# Now save the decompositions into a dedicated area that will
			# be written afterwards. Each entry starts with a comment giving
			# its offset, followed by all of its codepoints.
			$decomp_string .= ",\n" if ($decomp_string ne "");

			$decomp_string .= "\t /* $decomp_index */ 0x$first_decomp";
			$decomp_string .= join("", map { ", 0x$_" } @decomp_elts);

			$decomp_index = $decomp_index + $decomp_size;
		}

		# Print a comma after all items except the last one.
		print $OT "," unless ($code eq $last_code);

		print $OT "\t/* $comment */" if ($comment ne "");
		print $OT "\n";

		$main_index++;
	}
	print $OT "\n};\n\n";

	# Print the array of decomposed codes.
	print $OT <<HEADER;
	/* codepoints array */
	static const uint32 UnicodeDecomp_codepoints[$decomp_index] =
	{
	$decomp_string
	};
	HEADER

	# Emit the definition of the decomp hash function.
	my $dec_funcname = 'Decomp_hash_func';
	my $dec_func = PerfectHash::generate_hash_function(\@dec_cp_packed,
	$dec_funcname, fixed_key_length => 4);
	print $OF "/* Perfect hash function for decomposition */\n";
	print $OF "static $dec_func\n";

	# Emit the structure that wraps the hash lookup information into
	# one variable.
	print $OF <<HEADER;
	/* Hash lookup information for decomposition */
	static const pg_unicode_decompinfo UnicodeDecompInfo =
	{
	UnicodeDecompMain,
	$dec_funcname,
	$num_characters
	};

	HEADER

	# Find the lowest codepoint that decomposes to each recomposeable
	# code pair and create a mapping to it.  The candidates are visited in
	# recomp_sort order, so the first entry seen for a pair is the one kept.
	my $recomp_string = "";
	my @rec_cp_packed;
	my %seenit;
	my $firstentry = 1;
	foreach my $rec (sort recomp_sort @rec_info)
	{
		# The hash key is the two codepoints side by side in one integer:
		# "first" shifted left by 32 bits, bitwise-OR-ed with "second".
		# See also recompose_code() in common/unicode_norm.c.
		my $hashkey = (hex($rec->{first}) << 32) | hex($rec->{second});

		# We are only interested in the lowest code point that decomposes
		# to the given code pair.
		next if $seenit{$hashkey};

		# Save the hash key bytes in network order
		push @rec_cp_packed, pack('Q>', $hashkey);

		# Append inverse lookup element: an explanatory comment followed by
		# the index of the composed character in UnicodeDecompMain.
		$recomp_string .= ",\n" if !$firstentry;
		$recomp_string .= sprintf "\t/* U+%s+%s -> U+%s */ %s",
		  $rec->{first},
		  $rec->{second},
		  $rec->{code},
		  $rec->{main_index};

		$seenit{$hashkey} = 1;
		$firstentry = 0;
	}

	# Emit the inverse lookup array containing indexes into UnicodeDecompMain.
	my $num_recomps = scalar @rec_cp_packed;
	print $OF <<HEADER;
	/* Inverse lookup array -- contains indexes into UnicodeDecompMain[] */
	static const uint16 RecompInverseLookup[$num_recomps] =
	{
	$recomp_string
	};

	HEADER

	# Emit the definition of the recomposition hash function.
	my $rec_funcname = 'Recomp_hash_func';
	my $rec_func =
	PerfectHash::generate_hash_function(\@rec_cp_packed, $rec_funcname,
	fixed_key_length => 8);
	print $OF "/* Perfect hash function for recomposition */\n";
	print $OF "static $rec_func\n";

	# Emit the structure that wraps the hash lookup information into
	# one variable.
	print $OF <<HEADER;
	/* Hash lookup information for recomposition */
	static const pg_unicode_recompinfo UnicodeRecompInfo =
	{
	RecompInverseLookup,
	$rec_funcname,
	$num_recomps
	};
	HEADER

	close $OT;
	close $OF;

	# Comparator for "sort": orders two re-composition records ($a and $b, as
	# set by sort) numerically by their hexadecimal "first" field, then by
	# "second", then by "code".  Two records that agree on all three fields
	# are treated as a fatal error rather than as equal.
	sub recomp_sort
	{
		foreach my $field (qw(first second code))
		{
			my $order = hex($a->{$field}) <=> hex($b->{$field});
			return $order if $order != 0;
		}

		die "found duplicate entries of recomposeable code pairs";
	}