src/backend/utils/mb/Unicode/convutils.pm - cloudberry - Git at Google

 #
 # Copyright (c) 2001-2023, PostgreSQL Global Development Group
 #
 # src/backend/utils/mb/Unicode/convutils.pm

 package convutils;

 use strict;
 use warnings;

 use Carp;
 use Exporter 'import';

 our @EXPORT =
   qw( NONE TO_UNICODE FROM_UNICODE BOTH read_source print_conversion_tables);

 # Constants used in the 'direction' field of the character maps
 use constant {
 	NONE => 0,
 	TO_UNICODE => 1,
 	FROM_UNICODE => 2,
 	BOTH => 3
 };

 #######################################################################
 # read_source - common routine to read source file
 #
 # fname ; input file name
 #
 sub read_source
 {
 	my ($fname) = @_;
 	my @r;

 	open(my $in, '<', $fname) || die("cannot open $fname");

 	while (<$in>)
 	{
 		next if (/^#/);
 		chop;

 		next if (/^$/);    # Ignore empty lines

 		next if (/^0x([0-9A-F]+)\s+(#.*)$/);

 		# The Unicode source files have three columns
 		# 1: The "foreign" code (in hex)
 		# 2: Unicode code point (in hex)
 		# 3: Unicode name
 		if (!/^0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+(#.*)$/)
 		{
 			print STDERR "READ ERROR at line $. in $fname: $_\n";
 			exit;
 		}
 		my $out = {
 			code => hex($1),
 			ucs => hex($2),
 			comment => $4,
 			direction => BOTH,
 			f => $fname,
 			l => $.
 		};

 		# Ignore pure ASCII mappings. PostgreSQL character conversion code
 		# never even passes these to the conversion code.
 		next if ($out->{code} < 0x80 || $out->{ucs} < 0x80);

 		push(@r, $out);
 	}
 	close($in);

 	return \@r;
 }

 ##################################################################
 # print_conversion_tables - output mapping tables
 #
 # print_conversion_tables($this_script, $csname, \%charset)
 #
 # this_script - the name of the *caller script* of this feature
 # csname      - character set name other than ucs
 # charset     - ref to character set array
 #
 # Input character set array format:
 #
 # Each element in the character set array is a hash. Each hash has the following fields:
 #   direction  - BOTH, TO_UNICODE, or FROM_UNICODE (or NONE, to ignore the entry altogether)
 #   ucs        - Unicode code point
 #   ucs_second - Second Unicode code point, if this is a "combined" character.
 #   code       - Byte sequence in the "other" character set, as an integer
 #   comment    - Text representation of the character
 #   f          - Source filename
 #   l          - Line number in source file
 #
 sub print_conversion_tables
 {
 	my ($this_script, $csname, $charset) = @_;

 	print_conversion_tables_direction($this_script, $csname, FROM_UNICODE,
 		$charset);
 	print_conversion_tables_direction($this_script, $csname, TO_UNICODE,
 		$charset);
 	return;
 }

 #############################################################################
 # INTERNAL ROUTINES

 #######################################################################
 # print_conversion_tables_direction - write the whole content of C source of radix tree
 #
 # print_conversion_tables_direction($this_script, $csname, $direction, \%charset, $tblwidth)
 #
 # this_script - the name of the *caller script* of this feature
 # csname      - character set name other than ucs
 # direction   - desired direction, TO_UNICODE or FROM_UNICODE
 # charset     - ref to character set array
 #
 sub print_conversion_tables_direction
 {
 	my ($this_script, $csname, $direction, $charset) = @_;

 	my $fname;
 	my $tblname;
 	if ($direction == TO_UNICODE)
 	{
 		$fname = lc("${csname}_to_utf8.map");
 		$tblname = lc("${csname}_to_unicode_tree");

 		print "- Writing ${csname}=>UTF8 conversion table: $fname\n";
 	}
 	else
 	{
 		$fname = lc("utf8_to_${csname}.map");
 		$tblname = lc("${csname}_from_unicode_tree");

 		print "- Writing UTF8=>${csname} conversion table: $fname\n";
 	}

 	open(my $out, '>', $fname) || die("cannot open $fname");

 	print $out "/* src/backend/utils/mb/Unicode/$fname */\n";
 	print $out "/* This file is generated by $this_script */\n\n";

 	# Collect regular, non-combined, mappings, and create the radix tree from them.
 	my $charmap = &make_charmap($out, $charset, $direction, 0);
 	print_radix_table($out, $tblname, $charmap);

 	# Collect combined characters, and create combined character table (if any)
 	my $charmap_combined = &make_charmap_combined($charset, $direction);

 	if (scalar @{$charmap_combined} > 0)
 	{
 		if ($direction == TO_UNICODE)
 		{
 			print_to_utf8_combined_map($out, $csname, $charmap_combined, 1);
 		}
 		else
 		{
 			print_from_utf8_combined_map($out, $csname, $charmap_combined, 1);
 		}
 	}

 	close($out);
 	return;
 }

 sub print_from_utf8_combined_map
 {
 	my ($out, $charset, $table, $verbose) = @_;

 	my $last_comment = "";

 	printf $out "\n/* Combined character map */\n";
 	printf $out
 	  "static const pg_utf_to_local_combined ULmap${charset}_combined[%d] = {",
 	  scalar(@$table);
 	my $first = 1;
 	foreach my $i (sort { $a->{utf8} <=> $b->{utf8} } @$table)
 	{
 		print($out ",") if (!$first);
 		$first = 0;
 		print $out "\t/* $last_comment */"
 		  if ($verbose && $last_comment ne "");

 		printf $out "\n  {0x%08x, 0x%08x, 0x%04x}",
 		  $i->{utf8}, $i->{utf8_second}, $i->{code};
 		if ($verbose >= 2)
 		{
 			$last_comment =
 			  sprintf("%s:%d %s", $i->{f}, $i->{l}, $i->{comment});
 		}
 		elsif ($verbose >= 1)
 		{
 			$last_comment = $i->{comment};
 		}
 	}
 	print $out "\t/* $last_comment */" if ($verbose && $last_comment ne "");
 	print $out "\n};\n";
 	return;
 }

 sub print_to_utf8_combined_map
 {
 	my ($out, $charset, $table, $verbose) = @_;

 	my $last_comment = "";

 	printf $out "\n/* Combined character map */\n";
 	printf $out
 	  "static const pg_local_to_utf_combined LUmap${charset}_combined[%d] = {",
 	  scalar(@$table);

 	my $first = 1;
 	foreach my $i (sort { $a->{code} <=> $b->{code} } @$table)
 	{
 		print($out ",") if (!$first);
 		$first = 0;
 		print $out "\t/* $last_comment */"
 		  if ($verbose && $last_comment ne "");

 		printf $out "\n  {0x%04x, 0x%08x, 0x%08x}",
 		  $i->{code}, $i->{utf8}, $i->{utf8_second};

 		if ($verbose >= 2)
 		{
 			$last_comment =
 			  sprintf("%s:%d %s", $i->{f}, $i->{l}, $i->{comment});
 		}
 		elsif ($verbose >= 1)
 		{
 			$last_comment = $i->{comment};
 		}
 	}
 	print $out "\t/* $last_comment */" if ($verbose && $last_comment ne "");
 	print $out "\n};\n";
 	return;
 }

 #######################################################################
 # print_radix_table(<output handle>, <table name>, <charmap hash ref>)
 #
 # Input: A hash, mapping an input character to an output character.
 #
 # Constructs a radix tree from the hash, and prints it out as a C-struct.
 #
 sub print_radix_table
 {
 	my ($out, $tblname, $c) = @_;

 	###
 	### Build radix trees in memory, for 1-, 2-, 3- and 4-byte inputs. Each
 	### radix tree is represented as a nested hash, each hash indexed by
 	### input byte
 	###
 	my %b1map;
 	my %b2map;
 	my %b3map;
 	my %b4map;
 	foreach my $in (keys %$c)
 	{
 		my $out = $c->{$in};

 		if ($in <= 0xff)
 		{
 			$b1map{$in} = $out;
 		}
 		elsif ($in <= 0xffff)
 		{
 			my $b1 = $in >> 8;
 			my $b2 = $in & 0xff;

 			$b2map{$b1}{$b2} = $out;
 		}
 		elsif ($in <= 0xffffff)
 		{
 			my $b1 = $in >> 16;
 			my $b2 = ($in >> 8) & 0xff;
 			my $b3 = $in & 0xff;

 			$b3map{$b1}{$b2}{$b3} = $out;
 		}
 		elsif ($in <= 0xffffffff)
 		{
 			my $b1 = $in >> 24;
 			my $b2 = ($in >> 16) & 0xff;
 			my $b3 = ($in >> 8) & 0xff;
 			my $b4 = $in & 0xff;

 			$b4map{$b1}{$b2}{$b3}{$b4} = $out;
 		}
 		else
 		{
 			die sprintf("up to 4 byte code is supported: %x", $in);
 		}
 	}

 	my @segments;

 	###
 	### Build a linear list of "segments", from the nested hashes.
 	###
 	### Each segment is a lookup table, keyed by the next byte in the input.
 	### The segments are written out physically to one big array in the final
 	### step, but logically, they form a radix tree. Or rather, four radix
 	### trees: one for 1-byte inputs, another for 2-byte inputs, 3-byte
 	### inputs, and 4-byte inputs.
 	###
 	### Each segment is represented by a hash with following fields:
 	###
 	### comment => <string to output as a comment>
 	### label => <label that can be used to refer to this segment from elsewhere>
 	### values => <a hash, keyed by byte, 0-0xff>
 	###
 	### Entries in 'values' can be integers (for leaf-level segments), or
 	### string labels, pointing to a segment with that label. Any missing
 	### values are treated as zeros. If 'values' hash is missing altogether,
 	### it's treated as all-zeros.
 	###
 	### Subsequent steps will enrich the segments with more fields.
 	###

 	# Add the segments for the radix trees themselves.
 	push @segments,
 	  build_segments_from_tree("Single byte table", "1-byte", 1, \%b1map);
 	push @segments,
 	  build_segments_from_tree("Two byte table", "2-byte", 2, \%b2map);
 	push @segments,
 	  build_segments_from_tree("Three byte table", "3-byte", 3, \%b3map);
 	push @segments,
 	  build_segments_from_tree("Four byte table", "4-byte", 4, \%b4map);

 	###
 	### Find min and max index used in each level of each tree.
 	###
 	### These are stored separately, and we can then leave out the unused
 	### parts of every segment. (When using the resulting tree, you must
 	### check each input byte against the min and max.)
 	###
 	my %min_idx;
 	my %max_idx;
 	foreach my $seg (@segments)
 	{
 		my $this_min = $min_idx{ $seg->{depth} }->{ $seg->{level} };
 		my $this_max = $max_idx{ $seg->{depth} }->{ $seg->{level} };

 		foreach my $i (keys %{ $seg->{values} })
 		{
 			$this_min = $i if (!defined $this_min || $i < $this_min);
 			$this_max = $i if (!defined $this_max || $i > $this_max);
 		}

 		$min_idx{ $seg->{depth} }{ $seg->{level} } = $this_min;
 		$max_idx{ $seg->{depth} }{ $seg->{level} } = $this_max;
 	}

 	# Copy the mins and max's back to every segment, for convenience.
 	foreach my $seg (@segments)
 	{
 		$seg->{min_idx} = $min_idx{ $seg->{depth} }{ $seg->{level} };
 		$seg->{max_idx} = $max_idx{ $seg->{depth} }{ $seg->{level} };
 	}

 	###
 	### Prepend a dummy all-zeros map to the beginning.
 	###
 	### A 0 is an invalid value anywhere in the table, and this allows us to
 	### point to 0 offset from any table, to get a 0 result.
 	###

 	# Find the max range between min and max indexes in any of the segments.
 	my $widest_range = 0;
 	foreach my $seg (@segments)
 	{
 		my $this_range = $seg->{max_idx} - $seg->{min_idx};
 		$widest_range = $this_range if ($this_range > $widest_range);
 	}

 	unshift @segments,
 	  {
 		header => "Dummy map, for invalid values",
 		min_idx => 0,
 		max_idx => $widest_range,
 		label => "dummy map"
 	  };

 	###
 	### Eliminate overlapping zeros
 	###
 	### For each segment, if there are zero values at the end of, and there
 	### are also zero values at the beginning of the next segment, we can
 	### overlay the tail of this segment with the head of next segment, to
 	### save space.
 	###
 	### To achieve that, we subtract the 'max_idx' of each segment with the
 	### amount of zeros that can be overlaid.
 	###
 	for (my $j = 0; $j < $#segments - 1; $j++)
 	{
 		my $seg = $segments[$j];
 		my $nextseg = $segments[ $j + 1 ];

 		# Count the number of zero values at the end of this segment.
 		my $this_trail_zeros = 0;
 		for (
 			my $i = $seg->{max_idx};
 			$i >= $seg->{min_idx} && !$seg->{values}->{$i};
 			$i--)
 		{
 			$this_trail_zeros++;
 		}

 		# Count the number of zeros at the beginning of next segment.
 		my $next_lead_zeros = 0;
 		for (
 			my $i = $nextseg->{min_idx};
 			$i <= $nextseg->{max_idx} && !$nextseg->{values}->{$i};
 			$i++)
 		{
 			$next_lead_zeros++;
 		}

 		# How many zeros in common?
 		my $overlaid_trail_zeros =
 		  ($this_trail_zeros > $next_lead_zeros)
 		  ? $next_lead_zeros
 		  : $this_trail_zeros;

 		$seg->{overlaid_trail_zeros} = $overlaid_trail_zeros;
 		$seg->{max_idx} = $seg->{max_idx} - $overlaid_trail_zeros;
 	}

 	###
 	### Replace label references with real offsets.
 	###
 	### So far, the non-leaf segments have referred to other segments by
 	### their labels. Replace them with numerical offsets from the beginning
 	### of the final array. You cannot move, add, or remove segments after
 	### this step, as that would invalidate the offsets calculated here!
 	###
 	my $flatoff = 0;
 	my %segmap;

 	# First pass: assign offsets to each segment, and build hash
 	# of label => offset.
 	foreach my $seg (@segments)
 	{
 		$seg->{offset} = $flatoff;
 		$segmap{ $seg->{label} } = $flatoff;
 		$flatoff += $seg->{max_idx} - $seg->{min_idx} + 1;
 	}
 	my $tblsize = $flatoff;

 	# Second pass: look up the offset of each label reference in the hash.
 	foreach my $seg (@segments)
 	{
 		while (my ($i, $val) = each %{ $seg->{values} })
 		{
 			if (!($val =~ /^[0-9,.E]+$/))
 			{
 				my $segoff = $segmap{$val};
 				if ($segoff)
 				{
 					$seg->{values}->{$i} = $segoff;
 				}
 				else
 				{
 					die "no segment with label $val";
 				}
 			}
 		}
 	}

 	# Also look up the positions of the roots in the table.
 	# Missing map represents dummy mapping.
 	my $b1root = $segmap{"1-byte"} || 0;
 	my $b2root = $segmap{"2-byte"} || 0;
 	my $b3root = $segmap{"3-byte"} || 0;
 	my $b4root = $segmap{"4-byte"} || 0;

 	# And the lower-upper values of each level in each radix tree.
 	# Missing values represent zero.
 	my $b1_lower = $min_idx{1}{1} || 0;
 	my $b1_upper = $max_idx{1}{1} || 0;

 	my $b2_1_lower = $min_idx{2}{1} || 0;
 	my $b2_1_upper = $max_idx{2}{1} || 0;
 	my $b2_2_lower = $min_idx{2}{2} || 0;
 	my $b2_2_upper = $max_idx{2}{2} || 0;

 	my $b3_1_lower = $min_idx{3}{1} || 0;
 	my $b3_1_upper = $max_idx{3}{1} || 0;
 	my $b3_2_lower = $min_idx{3}{2} || 0;
 	my $b3_2_upper = $max_idx{3}{2} || 0;
 	my $b3_3_lower = $min_idx{3}{3} || 0;
 	my $b3_3_upper = $max_idx{3}{3} || 0;

 	my $b4_1_lower = $min_idx{4}{1} || 0;
 	my $b4_1_upper = $max_idx{4}{1} || 0;
 	my $b4_2_lower = $min_idx{4}{2} || 0;
 	my $b4_2_upper = $max_idx{4}{2} || 0;
 	my $b4_3_lower = $min_idx{4}{3} || 0;
 	my $b4_3_upper = $max_idx{4}{3} || 0;
 	my $b4_4_lower = $min_idx{4}{4} || 0;
 	my $b4_4_upper = $max_idx{4}{4} || 0;

 	###
 	### Find the maximum value in the whole table, to determine if we can
 	### use uint16 or if we need to use uint32.
 	###
 	my $max_val = 0;
 	foreach my $seg (@segments)
 	{
 		foreach my $val (values %{ $seg->{values} })
 		{
 			$max_val = $val if ($val > $max_val);
 		}
 	}

 	my $datatype = ($max_val <= 0xffff) ? "uint16" : "uint32";

 	# For formatting, determine how many values we can fit on a single
 	# line, and how wide each value needs to be to align nicely.
 	my $vals_per_line;
 	my $colwidth;

 	if ($max_val <= 0xffff)
 	{
 		$vals_per_line = 8;
 		$colwidth = 4;
 	}
 	elsif ($max_val <= 0xffffff)
 	{
 		$vals_per_line = 4;
 		$colwidth = 6;
 	}
 	else
 	{
 		$vals_per_line = 4;
 		$colwidth = 8;
 	}

 	###
 	### Print the struct and array.
 	###
 	printf $out "static const $datatype ${tblname}_table[$tblsize];\n";
 	printf $out "\n";
 	printf $out "static const pg_mb_radix_tree $tblname =\n";
 	printf $out "{\n";
 	if ($datatype eq "uint16")
 	{
 		print $out "  ${tblname}_table,\n";
 		print $out "  NULL, /* 32-bit table not used */\n";
 	}
 	if ($datatype eq "uint32")
 	{
 		print $out "  NULL, /* 16-bit table not used */\n";
 		print $out "  ${tblname}_table,\n";
 	}
 	printf $out "\n";
 	printf $out "  0x%04x, /* offset of table for 1-byte inputs */\n",
 	  $b1root;
 	printf $out "  0x%02x, /* b1_lower */\n", $b1_lower;
 	printf $out "  0x%02x, /* b1_upper */\n", $b1_upper;
 	printf $out "\n";
 	printf $out "  0x%04x, /* offset of table for 2-byte inputs */\n",
 	  $b2root;
 	printf $out "  0x%02x, /* b2_1_lower */\n", $b2_1_lower;
 	printf $out "  0x%02x, /* b2_1_upper */\n", $b2_1_upper;
 	printf $out "  0x%02x, /* b2_2_lower */\n", $b2_2_lower;
 	printf $out "  0x%02x, /* b2_2_upper */\n", $b2_2_upper;
 	printf $out "\n";
 	printf $out "  0x%04x, /* offset of table for 3-byte inputs */\n",
 	  $b3root;
 	printf $out "  0x%02x, /* b3_1_lower */\n", $b3_1_lower;
 	printf $out "  0x%02x, /* b3_1_upper */\n", $b3_1_upper;
 	printf $out "  0x%02x, /* b3_2_lower */\n", $b3_2_lower;
 	printf $out "  0x%02x, /* b3_2_upper */\n", $b3_2_upper;
 	printf $out "  0x%02x, /* b3_3_lower */\n", $b3_3_lower;
 	printf $out "  0x%02x, /* b3_3_upper */\n", $b3_3_upper;
 	printf $out "\n";
 	printf $out "  0x%04x, /* offset of table for 4-byte inputs */\n",
 	  $b4root;
 	printf $out "  0x%02x, /* b4_1_lower */\n", $b4_1_lower;
 	printf $out "  0x%02x, /* b4_1_upper */\n", $b4_1_upper;
 	printf $out "  0x%02x, /* b4_2_lower */\n", $b4_2_lower;
 	printf $out "  0x%02x, /* b4_2_upper */\n", $b4_2_upper;
 	printf $out "  0x%02x, /* b4_3_lower */\n", $b4_3_lower;
 	printf $out "  0x%02x, /* b4_3_upper */\n", $b4_3_upper;
 	printf $out "  0x%02x, /* b4_4_lower */\n", $b4_4_lower;
 	printf $out "  0x%02x  /* b4_4_upper */\n", $b4_4_upper;
 	print $out "};\n";
 	print $out "\n";
 	print $out "static const $datatype ${tblname}_table[$tblsize] =\n";
 	print $out "{";
 	my $off = 0;

 	foreach my $seg (@segments)
 	{
 		printf $out "\n";
 		printf $out "  /*** %s - offset 0x%05x ***/\n", $seg->{header}, $off;
 		printf $out "\n";

 		for (my $i = $seg->{min_idx}; $i <= $seg->{max_idx};)
 		{

 			# Print the next line's worth of values.
 			# XXX pad to begin at a nice boundary
 			printf $out "  /* %02x */ ", $i;
 			for (
 				my $j = 0;
 				$j < $vals_per_line && $i <= $seg->{max_idx};
 				$j++)
 			{
 				# missing values represent zero.
 				my $val = $seg->{values}->{$i} || 0;

 				printf $out " 0x%0*x", $colwidth, $val;
 				$off++;
 				if ($off != $tblsize)
 				{
 					print $out ",";
 				}
 				$i++;
 			}
 			print $out "\n";
 		}
 		if ($seg->{overlaid_trail_zeros})
 		{
 			printf $out
 			  "    /* $seg->{overlaid_trail_zeros} trailing zero values shared with next segment */\n";
 		}
 	}

 	# Sanity check.
 	if ($off != $tblsize) { die "table size didn't match!"; }

 	print $out "};\n";
 	return;
 }

 ###
 sub build_segments_from_tree
 {
 	my ($header, $rootlabel, $depth, $map) = @_;

 	my @segments;

 	if (%{$map})
 	{
 		@segments =
 		  build_segments_recurse($header, $rootlabel, "", 1, $depth, $map);

 		# Sort the segments into "breadth-first" order. Not strictly required,
 		# but makes the maps nicer to read.
 		@segments =
 		  sort { $a->{level} cmp $b->{level} or $a->{path} cmp $b->{path} }
 		  @segments;
 	}

 	return @segments;
 }

 ###
 sub build_segments_recurse
 {
 	my ($header, $label, $path, $level, $depth, $map) = @_;

 	my @segments;

 	if ($level == $depth)
 	{
 		push @segments,
 		  {
 			header => $header . ", leaf: ${path}xx",
 			label => $label,
 			level => $level,
 			depth => $depth,
 			path => $path,
 			values => $map
 		  };
 	}
 	else
 	{
 		my %children;

 		while (my ($i, $val) = each %$map)
 		{
 			my $childpath = $path . sprintf("%02x", $i);
 			my $childlabel = "$depth-level-$level-$childpath";

 			push @segments,
 			  build_segments_recurse($header, $childlabel, $childpath,
 				$level + 1, $depth, $val);
 			$children{$i} = $childlabel;
 		}

 		push @segments,
 		  {
 			header => $header . ", byte #$level: ${path}xx",
 			label => $label,
 			level => $level,
 			depth => $depth,
 			path => $path,
 			values => \%children
 		  };
 	}
 	return @segments;
 }

 #######################################################################
 # make_charmap - convert charset table to charmap hash
 #
 # make_charmap(\@charset, $direction)
 # charset     - ref to charset table : see print_conversion_tables
 # direction   - conversion direction
 #
 sub make_charmap
 {
 	my ($out, $charset, $direction, $verbose) = @_;

 	croak "unacceptable direction : $direction"
 	  if ($direction != TO_UNICODE && $direction != FROM_UNICODE);

 	# In verbose mode, print a large comment with the source and comment of
 	# each character
 	if ($verbose)
 	{
 		print $out "/*\n";
 		print $out "<src>  <dst>    <file>:<lineno> <comment>\n";
 	}

 	my %charmap;
 	foreach my $c (@$charset)
 	{

 		# combined characters are handled elsewhere
 		next if (defined $c->{ucs_second});

 		next if ($c->{direction} != $direction && $c->{direction} != BOTH);

 		my ($src, $dst) =
 		  $direction == TO_UNICODE
 		  ? ($c->{code}, ucs2utf($c->{ucs}))
 		  : (ucs2utf($c->{ucs}), $c->{code});

 		# check for duplicate source codes
 		if (defined $charmap{$src})
 		{
 			printf STDERR
 			  "Error: duplicate source code on %s:%d: 0x%04x => 0x%04x, 0x%04x\n",
 			  $c->{f}, $c->{l}, $src, $charmap{$src}, $dst;
 			exit;
 		}
 		$charmap{$src} = $dst;

 		if ($verbose)
 		{
 			printf $out "0x%04x 0x%04x %s:%d %s\n", $src, $dst, $c->{f},
 			  $c->{l}, $c->{comment};
 		}
 	}
 	if ($verbose)
 	{
 		print $out "*/\n\n";
 	}

 	return \%charmap;
 }

 #######################################################################
 # make_charmap_combined - convert charset table to charmap hash
 #     with checking duplicate source code
 #
 # make_charmap_combined(\@charset, $direction)
 # charset     - ref to charset table : see print_conversion_tables
 # direction   - conversion direction
 #
 sub make_charmap_combined
 {
 	my ($charset, $direction) = @_;

 	croak "unacceptable direction : $direction"
 	  if ($direction != TO_UNICODE && $direction != FROM_UNICODE);

 	my @combined;
 	foreach my $c (@$charset)
 	{
 		next if ($c->{direction} != $direction && $c->{direction} != BOTH);

 		if (defined $c->{ucs_second})
 		{
 			my $entry = {
 				utf8 => ucs2utf($c->{ucs}),
 				utf8_second => ucs2utf($c->{ucs_second}),
 				code => $c->{code},
 				comment => $c->{comment},
 				f => $c->{f},
 				l => $c->{l}
 			};
 			push @combined, $entry;
 		}
 	}

 	return \@combined;
 }

 #######################################################################
 # convert UCS-4 to UTF-8
 #
 sub ucs2utf
 {
 	my ($ucs) = @_;
 	my $utf;

 	if ($ucs <= 0x007f)
 	{
 		$utf = $ucs;
 	}
 	elsif ($ucs > 0x007f && $ucs <= 0x07ff)
 	{
 		$utf = (($ucs & 0x003f) | 0x80) | ((($ucs >> 6) | 0xc0) << 8);
 	}
 	elsif ($ucs > 0x07ff && $ucs <= 0xffff)
 	{
 		$utf =
 		  ((($ucs >> 12) | 0xe0) << 16) |
 		  (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80);
 	}
 	else
 	{
 		$utf =
 		  ((($ucs >> 18) | 0xf0) << 24) |
 		  (((($ucs & 0x3ffff) >> 12) | 0x80) << 16) |
 		  (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80);
 	}
 	return $utf;
 }

 1;
	#
	# Copyright (c) 2001-2023, PostgreSQL Global Development Group
	#
	# src/backend/utils/mb/Unicode/convutils.pm

	package convutils;

	use strict;
	use warnings;

	use Carp;
	use Exporter 'import';

	our @EXPORT =
	qw( NONE TO_UNICODE FROM_UNICODE BOTH read_source print_conversion_tables);

	# Constants used in the 'direction' field of the character maps
	use constant {
	NONE => 0,
	TO_UNICODE => 1,
	FROM_UNICODE => 2,
	BOTH => 3
	};

	#######################################################################
	# read_source - common routine to read source file
	#
	# fname ; input file name
	#
	sub read_source
	{
	my ($fname) = @_;
	my @r;

	open(my $in, '<', $fname) \|\| die("cannot open $fname");

	while (<$in>)
	{
	next if (/^#/);
	chop;

	next if (/^$/); # Ignore empty lines

	next if (/^0x([0-9A-F]+)\s+(#.*)$/);

	# The Unicode source files have three columns
	# 1: The "foreign" code (in hex)
	# 2: Unicode code point (in hex)
	# 3: Unicode name
	if (!/^0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+(#.*)$/)
	{
	print STDERR "READ ERROR at line $. in $fname: $_\n";
	exit;
	}
	my $out = {
	code => hex($1),
	ucs => hex($2),
	comment => $4,
	direction => BOTH,
	f => $fname,
	l => $.
	};

	# Ignore pure ASCII mappings. PostgreSQL character conversion code
	# never even passes these to the conversion code.
	next if ($out->{code} < 0x80 \|\| $out->{ucs} < 0x80);

	push(@r, $out);
	}
	close($in);

	return \@r;
	}

	##################################################################
	# print_conversion_tables - output mapping tables
	#
	# print_conversion_tables($this_script, $csname, \%charset)
	#
	# this_script - the name of the caller script of this feature
	# csname - character set name other than ucs
	# charset - ref to character set array
	#
	# Input character set array format:
	#
	# Each element in the character set array is a hash. Each hash has the following fields:
	# direction - BOTH, TO_UNICODE, or FROM_UNICODE (or NONE, to ignore the entry altogether)
	# ucs - Unicode code point
	# ucs_second - Second Unicode code point, if this is a "combined" character.
	# code - Byte sequence in the "other" character set, as an integer
	# comment - Text representation of the character
	# f - Source filename
	# l - Line number in source file
	#
	sub print_conversion_tables
	{
	my ($this_script, $csname, $charset) = @_;

	print_conversion_tables_direction($this_script, $csname, FROM_UNICODE,
	$charset);
	print_conversion_tables_direction($this_script, $csname, TO_UNICODE,
	$charset);
	return;
	}

	#############################################################################
	# INTERNAL ROUTINES

	#######################################################################
	# print_conversion_tables_direction - write the whole content of C source of radix tree
	#
	# print_conversion_tables_direction($this_script, $csname, $direction, \%charset, $tblwidth)
	#
	# this_script - the name of the caller script of this feature
	# csname - character set name other than ucs
	# direction - desired direction, TO_UNICODE or FROM_UNICODE
	# charset - ref to character set array
	#
	sub print_conversion_tables_direction
	{
	my ($this_script, $csname, $direction, $charset) = @_;

	my $fname;
	my $tblname;
	if ($direction == TO_UNICODE)
	{
	$fname = lc("${csname}_to_utf8.map");
	$tblname = lc("${csname}_to_unicode_tree");

	print "- Writing ${csname}=>UTF8 conversion table: $fname\n";
	}
	else
	{
	$fname = lc("utf8_to_${csname}.map");
	$tblname = lc("${csname}_from_unicode_tree");

	print "- Writing UTF8=>${csname} conversion table: $fname\n";
	}

	open(my $out, '>', $fname) \|\| die("cannot open $fname");

	print $out "/* src/backend/utils/mb/Unicode/$fname */\n";
	print $out "/* This file is generated by $this_script */\n\n";

	# Collect regular, non-combined, mappings, and create the radix tree from them.
	my $charmap = &make_charmap($out, $charset, $direction, 0);
	print_radix_table($out, $tblname, $charmap);

	# Collect combined characters, and create combined character table (if any)
	my $charmap_combined = &make_charmap_combined($charset, $direction);

	if (scalar @{$charmap_combined} > 0)
	{
	if ($direction == TO_UNICODE)
	{
	print_to_utf8_combined_map($out, $csname, $charmap_combined, 1);
	}
	else
	{
	print_from_utf8_combined_map($out, $csname, $charmap_combined, 1);
	}
	}

	close($out);
	return;
	}

	sub print_from_utf8_combined_map
	{
	my ($out, $charset, $table, $verbose) = @_;

	my $last_comment = "";

	printf $out "\n/* Combined character map */\n";
	printf $out
	"static const pg_utf_to_local_combined ULmap${charset}_combined[%d] = {",
	scalar(@$table);
	my $first = 1;
	foreach my $i (sort { $a->{utf8} <=> $b->{utf8} } @$table)
	{
	print($out ",") if (!$first);
	$first = 0;
	print $out "\t/* $last_comment */"
	if ($verbose && $last_comment ne "");

	printf $out "\n {0x%08x, 0x%08x, 0x%04x}",
	$i->{utf8}, $i->{utf8_second}, $i->{code};
	if ($verbose >= 2)
	{
	$last_comment =
	sprintf("%s:%d %s", $i->{f}, $i->{l}, $i->{comment});
	}
	elsif ($verbose >= 1)
	{
	$last_comment = $i->{comment};
	}
	}
	print $out "\t/* $last_comment */" if ($verbose && $last_comment ne "");
	print $out "\n};\n";
	return;
	}

	sub print_to_utf8_combined_map
	{
	my ($out, $charset, $table, $verbose) = @_;

	my $last_comment = "";

	printf $out "\n/* Combined character map */\n";
	printf $out
	"static const pg_local_to_utf_combined LUmap${charset}_combined[%d] = {",
	scalar(@$table);

	my $first = 1;
	foreach my $i (sort { $a->{code} <=> $b->{code} } @$table)
	{
	print($out ",") if (!$first);
	$first = 0;
	print $out "\t/* $last_comment */"
	if ($verbose && $last_comment ne "");

	printf $out "\n {0x%04x, 0x%08x, 0x%08x}",
	$i->{code}, $i->{utf8}, $i->{utf8_second};

	if ($verbose >= 2)
	{
	$last_comment =
	sprintf("%s:%d %s", $i->{f}, $i->{l}, $i->{comment});
	}
	elsif ($verbose >= 1)
	{
	$last_comment = $i->{comment};
	}
	}
	print $out "\t/* $last_comment */" if ($verbose && $last_comment ne "");
	print $out "\n};\n";
	return;
	}

	#######################################################################
	# print_radix_table(<output handle>, <table name>, <charmap hash ref>)
	#
	# Input: A hash, mapping an input character to an output character.
	#
	# Constructs a radix tree from the hash, and prints it out as a C-struct.
	#
	sub print_radix_table
	{
	my ($out, $tblname, $c) = @_;

	###
	### Build radix trees in memory, for 1-, 2-, 3- and 4-byte inputs. Each
	### radix tree is represented as a nested hash, each hash indexed by
	### input byte
	###
	my %b1map;
	my %b2map;
	my %b3map;
	my %b4map;
	foreach my $in (keys %$c)
	{
	my $out = $c->{$in};

	if ($in <= 0xff)
	{
	$b1map{$in} = $out;
	}
	elsif ($in <= 0xffff)
	{
	my $b1 = $in >> 8;
	my $b2 = $in & 0xff;

	$b2map{$b1}{$b2} = $out;
	}
	elsif ($in <= 0xffffff)
	{
	my $b1 = $in >> 16;
	my $b2 = ($in >> 8) & 0xff;
	my $b3 = $in & 0xff;

	$b3map{$b1}{$b2}{$b3} = $out;
	}
	elsif ($in <= 0xffffffff)
	{
	my $b1 = $in >> 24;
	my $b2 = ($in >> 16) & 0xff;
	my $b3 = ($in >> 8) & 0xff;
	my $b4 = $in & 0xff;

	$b4map{$b1}{$b2}{$b3}{$b4} = $out;
	}
	else
	{
	die sprintf("up to 4 byte code is supported: %x", $in);
	}
	}

	my @segments;

	###
	### Build a linear list of "segments", from the nested hashes.
	###
	### Each segment is a lookup table, keyed by the next byte in the input.
	### The segments are written out physically to one big array in the final
	### step, but logically, they form a radix tree. Or rather, four radix
	### trees: one for 1-byte inputs, another for 2-byte inputs, 3-byte
	### inputs, and 4-byte inputs.
	###
	### Each segment is represented by a hash with following fields:
	###
	### comment => <string to output as a comment>
	### label => <label that can be used to refer to this segment from elsewhere>
	### values => <a hash, keyed by byte, 0-0xff>
	###
	### Entries in 'values' can be integers (for leaf-level segments), or
	### string labels, pointing to a segment with that label. Any missing
	### values are treated as zeros. If 'values' hash is missing altogether,
	### it's treated as all-zeros.
	###
	### Subsequent steps will enrich the segments with more fields.
	###

	# Add the segments for the radix trees themselves.
	push @segments,
	build_segments_from_tree("Single byte table", "1-byte", 1, \%b1map);
	push @segments,
	build_segments_from_tree("Two byte table", "2-byte", 2, \%b2map);
	push @segments,
	build_segments_from_tree("Three byte table", "3-byte", 3, \%b3map);
	push @segments,
	build_segments_from_tree("Four byte table", "4-byte", 4, \%b4map);

	###
	### Find min and max index used in each level of each tree.
	###
	### These are stored separately, and we can then leave out the unused
	### parts of every segment. (When using the resulting tree, you must
	### check each input byte against the min and max.)
	###
	my %min_idx;
	my %max_idx;
	foreach my $seg (@segments)
	{
	my $this_min = $min_idx{ $seg->{depth} }->{ $seg->{level} };
	my $this_max = $max_idx{ $seg->{depth} }->{ $seg->{level} };

	foreach my $i (keys %{ $seg->{values} })
	{
	$this_min = $i if (!defined $this_min \|\| $i < $this_min);
	$this_max = $i if (!defined $this_max \|\| $i > $this_max);
	}

	$min_idx{ $seg->{depth} }{ $seg->{level} } = $this_min;
	$max_idx{ $seg->{depth} }{ $seg->{level} } = $this_max;
	}

	# Copy the mins and max's back to every segment, for convenience.
	foreach my $seg (@segments)
	{
	$seg->{min_idx} = $min_idx{ $seg->{depth} }{ $seg->{level} };
	$seg->{max_idx} = $max_idx{ $seg->{depth} }{ $seg->{level} };
	}

	###
	### Prepend a dummy all-zeros map to the beginning.
	###
	### A 0 is an invalid value anywhere in the table, and this allows us to
	### point to 0 offset from any table, to get a 0 result.
	###

	# Find the max range between min and max indexes in any of the segments.
	my $widest_range = 0;
	foreach my $seg (@segments)
	{
	my $this_range = $seg->{max_idx} - $seg->{min_idx};
	$widest_range = $this_range if ($this_range > $widest_range);
	}

	unshift @segments,
	{
	header => "Dummy map, for invalid values",
	min_idx => 0,
	max_idx => $widest_range,
	label => "dummy map"
	};

	###
	### Eliminate overlapping zeros
	###
	### For each segment, if there are zero values at the end of, and there
	### are also zero values at the beginning of the next segment, we can
	### overlay the tail of this segment with the head of next segment, to
	### save space.
	###
	### To achieve that, we subtract the 'max_idx' of each segment with the
	### amount of zeros that can be overlaid.
	###
	for (my $j = 0; $j < $#segments - 1; $j++)
	{
	my $seg = $segments[$j];
	my $nextseg = $segments[ $j + 1 ];

	# Count the number of zero values at the end of this segment.
	my $this_trail_zeros = 0;
	for (
	my $i = $seg->{max_idx};
	$i >= $seg->{min_idx} && !$seg->{values}->{$i};
	$i--)
	{
	$this_trail_zeros++;
	}

	# Count the number of zeros at the beginning of next segment.
	my $next_lead_zeros = 0;
	for (
	my $i = $nextseg->{min_idx};
	$i <= $nextseg->{max_idx} && !$nextseg->{values}->{$i};
	$i++)
	{
	$next_lead_zeros++;
	}

	# How many zeros in common?
	my $overlaid_trail_zeros =
	($this_trail_zeros > $next_lead_zeros)
	? $next_lead_zeros
	: $this_trail_zeros;

	$seg->{overlaid_trail_zeros} = $overlaid_trail_zeros;
	$seg->{max_idx} = $seg->{max_idx} - $overlaid_trail_zeros;
	}

	###
	### Replace label references with real offsets.
	###
	### So far, the non-leaf segments have referred to other segments by
	### their labels. Replace them with numerical offsets from the beginning
	### of the final array. You cannot move, add, or remove segments after
	### this step, as that would invalidate the offsets calculated here!
	###
	my $flatoff = 0;
	my %segmap;

	# First pass: assign offsets to each segment, and build hash
	# of label => offset.
	foreach my $seg (@segments)
	{
	$seg->{offset} = $flatoff;
	$segmap{ $seg->{label} } = $flatoff;
	$flatoff += $seg->{max_idx} - $seg->{min_idx} + 1;
	}
	my $tblsize = $flatoff;

	# Second pass: look up the offset of each label reference in the hash.
	foreach my $seg (@segments)
	{
	while (my ($i, $val) = each %{ $seg->{values} })
	{
	if (!($val =~ /^[0-9,.E]+$/))
	{
	my $segoff = $segmap{$val};
	if ($segoff)
	{
	$seg->{values}->{$i} = $segoff;
	}
	else
	{
	die "no segment with label $val";
	}
	}
	}
	}

	# Also look up the positions of the roots in the table.
	# Missing map represents dummy mapping.
	my $b1root = $segmap{"1-byte"} \|\| 0;
	my $b2root = $segmap{"2-byte"} \|\| 0;
	my $b3root = $segmap{"3-byte"} \|\| 0;
	my $b4root = $segmap{"4-byte"} \|\| 0;

	# And the lower-upper values of each level in each radix tree.
	# Missing values represent zero.
	my $b1_lower = $min_idx{1}{1} \|\| 0;
	my $b1_upper = $max_idx{1}{1} \|\| 0;

	my $b2_1_lower = $min_idx{2}{1} \|\| 0;
	my $b2_1_upper = $max_idx{2}{1} \|\| 0;
	my $b2_2_lower = $min_idx{2}{2} \|\| 0;
	my $b2_2_upper = $max_idx{2}{2} \|\| 0;

	my $b3_1_lower = $min_idx{3}{1} \|\| 0;
	my $b3_1_upper = $max_idx{3}{1} \|\| 0;
	my $b3_2_lower = $min_idx{3}{2} \|\| 0;
	my $b3_2_upper = $max_idx{3}{2} \|\| 0;
	my $b3_3_lower = $min_idx{3}{3} \|\| 0;
	my $b3_3_upper = $max_idx{3}{3} \|\| 0;

	my $b4_1_lower = $min_idx{4}{1} \|\| 0;
	my $b4_1_upper = $max_idx{4}{1} \|\| 0;
	my $b4_2_lower = $min_idx{4}{2} \|\| 0;
	my $b4_2_upper = $max_idx{4}{2} \|\| 0;
	my $b4_3_lower = $min_idx{4}{3} \|\| 0;
	my $b4_3_upper = $max_idx{4}{3} \|\| 0;
	my $b4_4_lower = $min_idx{4}{4} \|\| 0;
	my $b4_4_upper = $max_idx{4}{4} \|\| 0;

	###
	### Find the maximum value in the whole table, to determine if we can
	### use uint16 or if we need to use uint32.
	###
	my $max_val = 0;
	foreach my $seg (@segments)
	{
	foreach my $val (values %{ $seg->{values} })
	{
	$max_val = $val if ($val > $max_val);
	}
	}

	my $datatype = ($max_val <= 0xffff) ? "uint16" : "uint32";

	# For formatting, determine how many values we can fit on a single
	# line, and how wide each value needs to be to align nicely.
	my $vals_per_line;
	my $colwidth;

	if ($max_val <= 0xffff)
	{
	$vals_per_line = 8;
	$colwidth = 4;
	}
	elsif ($max_val <= 0xffffff)
	{
	$vals_per_line = 4;
	$colwidth = 6;
	}
	else
	{
	$vals_per_line = 4;
	$colwidth = 8;
	}

	###
	### Print the struct and array.
	###
	printf $out "static const $datatype ${tblname}_table[$tblsize];\n";
	printf $out "\n";
	printf $out "static const pg_mb_radix_tree $tblname =\n";
	printf $out "{\n";
	if ($datatype eq "uint16")
	{
	print $out " ${tblname}_table,\n";
	print $out " NULL, /* 32-bit table not used */\n";
	}
	if ($datatype eq "uint32")
	{
	print $out " NULL, /* 16-bit table not used */\n";
	print $out " ${tblname}_table,\n";
	}
	printf $out "\n";
	printf $out " 0x%04x, /* offset of table for 1-byte inputs */\n",
	$b1root;
	printf $out " 0x%02x, /* b1_lower */\n", $b1_lower;
	printf $out " 0x%02x, /* b1_upper */\n", $b1_upper;
	printf $out "\n";
	printf $out " 0x%04x, /* offset of table for 2-byte inputs */\n",
	$b2root;
	printf $out " 0x%02x, /* b2_1_lower */\n", $b2_1_lower;
	printf $out " 0x%02x, /* b2_1_upper */\n", $b2_1_upper;
	printf $out " 0x%02x, /* b2_2_lower */\n", $b2_2_lower;
	printf $out " 0x%02x, /* b2_2_upper */\n", $b2_2_upper;
	printf $out "\n";
	printf $out " 0x%04x, /* offset of table for 3-byte inputs */\n",
	$b3root;
	printf $out " 0x%02x, /* b3_1_lower */\n", $b3_1_lower;
	printf $out " 0x%02x, /* b3_1_upper */\n", $b3_1_upper;
	printf $out " 0x%02x, /* b3_2_lower */\n", $b3_2_lower;
	printf $out " 0x%02x, /* b3_2_upper */\n", $b3_2_upper;
	printf $out " 0x%02x, /* b3_3_lower */\n", $b3_3_lower;
	printf $out " 0x%02x, /* b3_3_upper */\n", $b3_3_upper;
	printf $out "\n";
	printf $out " 0x%04x, /* offset of table for 4-byte inputs */\n",
	$b4root;
	printf $out " 0x%02x, /* b4_1_lower */\n", $b4_1_lower;
	printf $out " 0x%02x, /* b4_1_upper */\n", $b4_1_upper;
	printf $out " 0x%02x, /* b4_2_lower */\n", $b4_2_lower;
	printf $out " 0x%02x, /* b4_2_upper */\n", $b4_2_upper;
	printf $out " 0x%02x, /* b4_3_lower */\n", $b4_3_lower;
	printf $out " 0x%02x, /* b4_3_upper */\n", $b4_3_upper;
	printf $out " 0x%02x, /* b4_4_lower */\n", $b4_4_lower;
	printf $out " 0x%02x /* b4_4_upper */\n", $b4_4_upper;
	print $out "};\n";
	print $out "\n";
	print $out "static const $datatype ${tblname}_table[$tblsize] =\n";
	print $out "{";
	my $off = 0;

	foreach my $seg (@segments)
	{
	printf $out "\n";
	printf $out " /* %s - offset 0x%05x */\n", $seg->{header}, $off;
	printf $out "\n";

	for (my $i = $seg->{min_idx}; $i <= $seg->{max_idx};)
	{

	# Print the next line's worth of values.
	# XXX pad to begin at a nice boundary
	printf $out " /* %02x */ ", $i;
	for (
	my $j = 0;
	$j < $vals_per_line && $i <= $seg->{max_idx};
	$j++)
	{
	# missing values represent zero.
	my $val = $seg->{values}->{$i} \|\| 0;

	printf $out " 0x%0*x", $colwidth, $val;
	$off++;
	if ($off != $tblsize)
	{
	print $out ",";
	}
	$i++;
	}
	print $out "\n";
	}
	if ($seg->{overlaid_trail_zeros})
	{
	printf $out
	" /* $seg->{overlaid_trail_zeros} trailing zero values shared with next segment */\n";
	}
	}

	# Sanity check.
	if ($off != $tblsize) { die "table size didn't match!"; }

	print $out "};\n";
	return;
	}

	###
	sub build_segments_from_tree
	{
	my ($header, $rootlabel, $depth, $map) = @_;

	my @segments;

	if (%{$map})
	{
	@segments =
	build_segments_recurse($header, $rootlabel, "", 1, $depth, $map);

	# Sort the segments into "breadth-first" order. Not strictly required,
	# but makes the maps nicer to read.
	@segments =
	sort { $a->{level} cmp $b->{level} or $a->{path} cmp $b->{path} }
	@segments;
	}

	return @segments;
	}

	###
	sub build_segments_recurse
	{
	my ($header, $label, $path, $level, $depth, $map) = @_;

	my @segments;

	if ($level == $depth)
	{
	push @segments,
	{
	header => $header . ", leaf: ${path}xx",
	label => $label,
	level => $level,
	depth => $depth,
	path => $path,
	values => $map
	};
	}
	else
	{
	my %children;

	while (my ($i, $val) = each %$map)
	{
	my $childpath = $path . sprintf("%02x", $i);
	my $childlabel = "$depth-level-$level-$childpath";

	push @segments,
	build_segments_recurse($header, $childlabel, $childpath,
	$level + 1, $depth, $val);
	$children{$i} = $childlabel;
	}

	push @segments,
	{
	header => $header . ", byte #$level: ${path}xx",
	label => $label,
	level => $level,
	depth => $depth,
	path => $path,
	values => \%children
	};
	}
	return @segments;
	}

	#######################################################################
	# make_charmap - convert charset table to charmap hash
	#
	# make_charmap(\@charset, $direction)
	# charset - ref to charset table : see print_conversion_tables
	# direction - conversion direction
	#
	sub make_charmap
	{
	my ($out, $charset, $direction, $verbose) = @_;

	croak "unacceptable direction : $direction"
	if ($direction != TO_UNICODE && $direction != FROM_UNICODE);

	# In verbose mode, print a large comment with the source and comment of
	# each character
	if ($verbose)
	{
	print $out "/*\n";
	print $out "<src> <dst> <file>:<lineno> <comment>\n";
	}

	my %charmap;
	foreach my $c (@$charset)
	{

	# combined characters are handled elsewhere
	next if (defined $c->{ucs_second});

	next if ($c->{direction} != $direction && $c->{direction} != BOTH);

	my ($src, $dst) =
	$direction == TO_UNICODE
	? ($c->{code}, ucs2utf($c->{ucs}))
	: (ucs2utf($c->{ucs}), $c->{code});

	# check for duplicate source codes
	if (defined $charmap{$src})
	{
	printf STDERR
	"Error: duplicate source code on %s:%d: 0x%04x => 0x%04x, 0x%04x\n",
	$c->{f}, $c->{l}, $src, $charmap{$src}, $dst;
	exit;
	}
	$charmap{$src} = $dst;

	if ($verbose)
	{
	printf $out "0x%04x 0x%04x %s:%d %s\n", $src, $dst, $c->{f},
	$c->{l}, $c->{comment};
	}
	}
	if ($verbose)
	{
	print $out "*/\n\n";
	}

	return \%charmap;
	}

	#######################################################################
	# make_charmap_combined - convert charset table to charmap hash
	# with checking duplicate source code
	#
	# make_charmap_combined(\@charset, $direction)
	# charset - ref to charset table : see print_conversion_tables
	# direction - conversion direction
	#
	sub make_charmap_combined
	{
	my ($charset, $direction) = @_;

	croak "unacceptable direction : $direction"
	if ($direction != TO_UNICODE && $direction != FROM_UNICODE);

	my @combined;
	foreach my $c (@$charset)
	{
	next if ($c->{direction} != $direction && $c->{direction} != BOTH);

	if (defined $c->{ucs_second})
	{
	my $entry = {
	utf8 => ucs2utf($c->{ucs}),
	utf8_second => ucs2utf($c->{ucs_second}),
	code => $c->{code},
	comment => $c->{comment},
	f => $c->{f},
	l => $c->{l}
	};
	push @combined, $entry;
	}
	}

	return \@combined;
	}

	#######################################################################
	# convert UCS-4 to UTF-8
	#
	sub ucs2utf
	{
	my ($ucs) = @_;
	my $utf;

	if ($ucs <= 0x007f)
	{
	$utf = $ucs;
	}
	elsif ($ucs > 0x007f && $ucs <= 0x07ff)
	{
	$utf = (($ucs & 0x003f) \| 0x80) \| ((($ucs >> 6) \| 0xc0) << 8);
	}
	elsif ($ucs > 0x07ff && $ucs <= 0xffff)
	{
	$utf =
	((($ucs >> 12) \| 0xe0) << 16) \|
	(((($ucs & 0x0fc0) >> 6) \| 0x80) << 8) \| (($ucs & 0x003f) \| 0x80);
	}
	else
	{
	$utf =
	((($ucs >> 18) \| 0xf0) << 24) \|
	(((($ucs & 0x3ffff) >> 12) \| 0x80) << 16) \|
	(((($ucs & 0x0fc0) >> 6) \| 0x80) << 8) \| (($ucs & 0x003f) \| 0x80);
	}
	return $utf;
	}

	1;