src/include/catalog/reformat_dat_file.pl - cloudberry - Git at Google

 #!/usr/bin/perl
 #----------------------------------------------------------------------
 #
 # reformat_dat_file.pl
 #    Perl script that reads in catalog data file(s) and writes out
 #    functionally equivalent file(s) in a standard format.
 #
 #    In each entry of a reformatted file, metadata fields (if present)
 #    come first, with normal attributes starting on the following line,
 #    in the same order as the columns of the corresponding catalog.
 #    Comments and blank lines are preserved.
 #
 # Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 # Portions Copyright (c) 1994, Regents of the University of California
 #
 # src/include/catalog/reformat_dat_file.pl
 #
 #----------------------------------------------------------------------

 use strict;
 use warnings;

 use FindBin;
 use Getopt::Long;

 # If you copy this script to somewhere other than src/include/catalog,
 # you'll need to modify this "use lib" or provide a suitable -I switch.
 use lib "$FindBin::RealBin/../../backend/catalog/";
 use Catalog;

 # Names of the metadata fields of a catalog entry.
 # Note: oid is a normal column from a storage perspective, but it's more
 # important than the rest, so it's listed first among the metadata fields.
 # Note: line_number is also a metadata field, but we never write it out,
 # so it's not listed here.
 my @METADATA =
   ('oid', 'oid_symbol', 'array_type_oid', 'descr', 'autogenerated');

 # Process command line switches.
 my $output_path = '';
 my $full_tuples = 0;

 GetOptions(
 	'output=s'    => \$output_path,
 	'full-tuples' => \$full_tuples) || usage();

 # Sanity check arguments.
 die "No input files.\n" unless @ARGV;

 # Make sure output_path ends in a slash.
 if ($output_path ne '' && substr($output_path, -1) ne '/')
 {
 	$output_path .= '/';
 }

 # Read all the input files into internal data structures.
 # We pass data file names as arguments and then look for matching
 # headers to parse the schema from.
 my %catalogs;
 my %catalog_data;
 my @catnames;
 foreach my $datfile (@ARGV)
 {
 	$datfile =~ /(.+)\.dat$/
 	  or die "Input files need to be data (.dat) files.\n";

 	my $header = "$1.h";
 	die "There in no header file corresponding to $datfile"
 	  if !-e $header;

 	my $catalog = Catalog::ParseHeader($header);
 	my $catname = $catalog->{catname};
 	my $schema  = $catalog->{columns};

 	push @catnames, $catname;
 	$catalogs{$catname} = $catalog;

 	$catalog_data{$catname} = Catalog::ParseData($datfile, $schema, 1);
 }

 ########################################################################
 # At this point, we have read all the data. If you are modifying this
 # script for bulk editing, this is a good place to build lookup tables,
 # if you need to. In the following example, the "next if !ref $row"
 # check below is a hack to filter out non-hash objects. This is because
 # we build the lookup tables from data that we read using the
 # "preserve_formatting" parameter.
 #
 ##Index access method lookup.
 #my %amnames;
 #foreach my $row (@{ $catalog_data{pg_am} })
 #{
 #	next if !ref $row;
 #	$amnames{$row->{oid}} = $row->{amname};
 #}
 ########################################################################

 # Write the data.
 foreach my $catname (@catnames)
 {
 	my $catalog = $catalogs{$catname};
 	my @attnames;
 	my $schema = $catalog->{columns};

 	foreach my $column (@$schema)
 	{
 		my $attname = $column->{name};

 		# We may have ordinary columns at the storage level that we still
 		# want to format as a special value. Exclude these from the column
 		# list so they are not written twice.
 		push @attnames, $attname
 		  if !(grep { $_ eq $attname } @METADATA);
 	}

 	# Write output files to specified directory.
 	my $datfile = "$output_path$catname.dat";
 	open my $dat, '>', $datfile
 	  or die "can't open $datfile: $!";

 	foreach my $data (@{ $catalog_data{$catname} })
 	{

 		# Hash ref representing a data entry.
 		if (ref $data eq 'HASH')
 		{
 			my %values = %$data;

 			############################################################
 			# At this point we have the full tuple in memory as a hash
 			# and can do any operations we want. As written, it only
 			# removes default values, but this script can be adapted to
 			# do one-off bulk-editing.
 			############################################################

 			if (!$full_tuples)
 			{
 				# If it's an autogenerated entry, drop it completely.
 				next if $values{autogenerated};
 				# Else, just drop any default/computed fields.
 				strip_default_values(\%values, $schema, $catname);
 			}

 			print $dat "{";

 			# Separate out metadata fields for readability.
 			my $metadata_str = format_hash(\%values, @METADATA);
 			if ($metadata_str)
 			{
 				print $dat $metadata_str;

 				# User attributes start on next line.
 				print $dat ",\n ";
 			}

 			my $data_str = format_hash(\%values, @attnames);
 			print $dat $data_str;
 			print $dat " },\n";
 		}

 		# Preserve blank lines.
 		elsif ($data =~ /^\s*$/)
 		{
 			print $dat "\n";
 		}

 		# Preserve comments or brackets that are on their own line.
 		elsif ($data =~ /^\s*(\[|\]|#.*?)\s*$/)
 		{
 			print $dat "$1\n";
 		}
 	}
 	close $dat;
 }

 # Remove column values for which there is a matching default,
 # or if the value can be computed from other columns.
 sub strip_default_values
 {
 	my ($row, $schema, $catname) = @_;

 	# Delete values that match defaults.
 	foreach my $column (@$schema)
 	{
 		my $attname = $column->{name};

 		# It's okay if we have no oid value, since it will be assigned
 		# automatically before bootstrap.
 		die "strip_default_values: $catname.$attname undefined\n"
 		  if !defined $row->{$attname} and $attname ne 'oid';

 		if (defined $column->{default}
 			and ($row->{$attname} eq $column->{default}))
 		{
 			delete $row->{$attname};
 		}
 	}

 	# Delete computed values.  See AddDefaultValues() in Catalog.pm.
 	# Note: This must be done after deleting values matching defaults.
 	if ($catname eq 'pg_proc')
 	{
 		delete $row->{pronargs} if defined $row->{proargtypes};
 	}

 	# If a pg_type entry has an auto-generated array type, then its
 	# typarray field is a computed value too (see GenerateArrayTypes).
 	if ($catname eq 'pg_type')
 	{
 		delete $row->{typarray} if defined $row->{array_type_oid};
 	}

 	return;
 }

 # Format the individual elements of a Perl hash into a valid string
 # representation. We do this ourselves, rather than use native Perl
 # facilities, so we can keep control over the exact formatting of the
 # data files.
 sub format_hash
 {
 	my $data          = shift;
 	my @orig_attnames = @_;

 	# Copy attname to new array if it has a value, so we can determine
 	# the last populated element. We do this because we may have default
 	# values or empty metadata fields.
 	my @attnames;
 	foreach my $orig_attname (@orig_attnames)
 	{
 		push @attnames, $orig_attname
 		  if defined $data->{$orig_attname};
 	}

 	# When calling this function, we ether have an open-bracket or a
 	# leading space already.
 	my $char_count = 1;

 	my $threshold;
 	my $hash_str      = '';
 	my $element_count = 0;

 	foreach my $attname (@attnames)
 	{
 		$element_count++;

 		# To limit the line to 80 chars, we need to account for the
 		# trailing characters.
 		if ($element_count == $#attnames + 1)
 		{
 			# Last element, so allow space for ' },'
 			$threshold = 77;
 		}
 		else
 		{
 			# Just need space for trailing comma
 			$threshold = 79;
 		}

 		if ($element_count > 1)
 		{
 			$hash_str .= ',';
 			$char_count++;
 		}

 		my $value = $data->{$attname};

 		# Escape single quotes.
 		$value =~ s/'/\\'/g;

 		# Include a leading space in the key-value pair, since this will
 		# always go after either a comma or an additional padding space on
 		# the next line.
 		my $element        = " $attname => '$value'";
 		my $element_length = length($element);

 		# If adding the element to the current line would expand the line
 		# beyond 80 chars, put it on the next line. We don't do this for
 		# the first element, since that would create a blank line.
 		if ($element_count > 1 and $char_count + $element_length > $threshold)
 		{

 			# Put on next line with an additional space preceding. There
 			# are now two spaces in front of the key-value pair, lining
 			# it up with the line above it.
 			$hash_str .= "\n $element";
 			$char_count = $element_length + 1;
 		}
 		else
 		{
 			$hash_str .= $element;
 			$char_count += $element_length;
 		}
 	}
 	return $hash_str;
 }

 sub usage
 {
 	die <<EOM;
 Usage: reformat_dat_file.pl [options] datafile...

 Options:
     --output PATH    output directory (default '.')
     --full-tuples    write out full tuples, including default values

 Non-option arguments are the names of input .dat files.
 Updated files are written to the output directory,
 possibly overwriting the input files.

 EOM
 }
	#!/usr/bin/perl
	#----------------------------------------------------------------------
	#
	# reformat_dat_file.pl
	# Perl script that reads in catalog data file(s) and writes out
	# functionally equivalent file(s) in a standard format.
	#
	# In each entry of a reformatted file, metadata fields (if present)
	# come first, with normal attributes starting on the following line,
	# in the same order as the columns of the corresponding catalog.
	# Comments and blank lines are preserved.
	#
	# Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
	# Portions Copyright (c) 1994, Regents of the University of California
	#
	# src/include/catalog/reformat_dat_file.pl
	#
	#----------------------------------------------------------------------

	use strict;
	use warnings;

	use FindBin;
	use Getopt::Long;

	# If you copy this script to somewhere other than src/include/catalog,
	# you'll need to modify this "use lib" or provide a suitable -I switch.
	use lib "$FindBin::RealBin/../../backend/catalog/";
	use Catalog;

	# Names of the metadata fields of a catalog entry.
	# Note: oid is a normal column from a storage perspective, but it's more
	# important than the rest, so it's listed first among the metadata fields.
	# Note: line_number is also a metadata field, but we never write it out,
	# so it's not listed here.
	my @METADATA =
	('oid', 'oid_symbol', 'array_type_oid', 'descr', 'autogenerated');

	# Process command line switches.
	my $output_path = '';
	my $full_tuples = 0;

	GetOptions(
	'output=s' => \$output_path,
	'full-tuples' => \$full_tuples) \|\| usage();

	# Sanity check arguments.
	die "No input files.\n" unless @ARGV;

	# Make sure output_path ends in a slash.
	if ($output_path ne '' && substr($output_path, -1) ne '/')
	{
	$output_path .= '/';
	}

	# Read all the input files into internal data structures.
	# We pass data file names as arguments and then look for matching
	# headers to parse the schema from.
	my %catalogs;
	my %catalog_data;
	my @catnames;
	foreach my $datfile (@ARGV)
	{
	$datfile =~ /(.+)\.dat$/
	or die "Input files need to be data (.dat) files.\n";

	my $header = "$1.h";
	die "There in no header file corresponding to $datfile"
	if !-e $header;

	my $catalog = Catalog::ParseHeader($header);
	my $catname = $catalog->{catname};
	my $schema = $catalog->{columns};

	push @catnames, $catname;
	$catalogs{$catname} = $catalog;

	$catalog_data{$catname} = Catalog::ParseData($datfile, $schema, 1);
	}

	########################################################################
	# At this point, we have read all the data. If you are modifying this
	# script for bulk editing, this is a good place to build lookup tables,
	# if you need to. In the following example, the "next if !ref $row"
	# check below is a hack to filter out non-hash objects. This is because
	# we build the lookup tables from data that we read using the
	# "preserve_formatting" parameter.
	#
	##Index access method lookup.
	#my %amnames;
	#foreach my $row (@{ $catalog_data{pg_am} })
	#{
	# next if !ref $row;
	# $amnames{$row->{oid}} = $row->{amname};
	#}
	########################################################################

	# Write the data.
	foreach my $catname (@catnames)
	{
	my $catalog = $catalogs{$catname};
	my @attnames;
	my $schema = $catalog->{columns};

	foreach my $column (@$schema)
	{
	my $attname = $column->{name};

	# We may have ordinary columns at the storage level that we still
	# want to format as a special value. Exclude these from the column
	# list so they are not written twice.
	push @attnames, $attname
	if !(grep { $_ eq $attname } @METADATA);
	}

	# Write output files to specified directory.
	my $datfile = "$output_path$catname.dat";
	open my $dat, '>', $datfile
	or die "can't open $datfile: $!";

	foreach my $data (@{ $catalog_data{$catname} })
	{

	# Hash ref representing a data entry.
	if (ref $data eq 'HASH')
	{
	my %values = %$data;

	############################################################
	# At this point we have the full tuple in memory as a hash
	# and can do any operations we want. As written, it only
	# removes default values, but this script can be adapted to
	# do one-off bulk-editing.
	############################################################

	if (!$full_tuples)
	{
	# If it's an autogenerated entry, drop it completely.
	next if $values{autogenerated};
	# Else, just drop any default/computed fields.
	strip_default_values(\%values, $schema, $catname);
	}

	print $dat "{";

	# Separate out metadata fields for readability.
	my $metadata_str = format_hash(\%values, @METADATA);
	if ($metadata_str)
	{
	print $dat $metadata_str;

	# User attributes start on next line.
	print $dat ",\n ";
	}

	my $data_str = format_hash(\%values, @attnames);
	print $dat $data_str;
	print $dat " },\n";
	}

	# Preserve blank lines.
	elsif ($data =~ /^\s*$/)
	{
	print $dat "\n";
	}

	# Preserve comments or brackets that are on their own line.
	elsif ($data =~ /^\s(\[\|\]\|#.?)\s*$/)
	{
	print $dat "$1\n";
	}
	}
	close $dat;
	}

	# Remove column values for which there is a matching default,
	# or if the value can be computed from other columns.
	sub strip_default_values
	{
	my ($row, $schema, $catname) = @_;

	# Delete values that match defaults.
	foreach my $column (@$schema)
	{
	my $attname = $column->{name};

	# It's okay if we have no oid value, since it will be assigned
	# automatically before bootstrap.
	die "strip_default_values: $catname.$attname undefined\n"
	if !defined $row->{$attname} and $attname ne 'oid';

	if (defined $column->{default}
	and ($row->{$attname} eq $column->{default}))
	{
	delete $row->{$attname};
	}
	}

	# Delete computed values. See AddDefaultValues() in Catalog.pm.
	# Note: This must be done after deleting values matching defaults.
	if ($catname eq 'pg_proc')
	{
	delete $row->{pronargs} if defined $row->{proargtypes};
	}

	# If a pg_type entry has an auto-generated array type, then its
	# typarray field is a computed value too (see GenerateArrayTypes).
	if ($catname eq 'pg_type')
	{
	delete $row->{typarray} if defined $row->{array_type_oid};
	}

	return;
	}

	# Format the individual elements of a Perl hash into a valid string
	# representation. We do this ourselves, rather than use native Perl
	# facilities, so we can keep control over the exact formatting of the
	# data files.
	sub format_hash
	{
	my $data = shift;
	my @orig_attnames = @_;

	# Copy attname to new array if it has a value, so we can determine
	# the last populated element. We do this because we may have default
	# values or empty metadata fields.
	my @attnames;
	foreach my $orig_attname (@orig_attnames)
	{
	push @attnames, $orig_attname
	if defined $data->{$orig_attname};
	}

	# When calling this function, we ether have an open-bracket or a
	# leading space already.
	my $char_count = 1;

	my $threshold;
	my $hash_str = '';
	my $element_count = 0;

	foreach my $attname (@attnames)
	{
	$element_count++;

	# To limit the line to 80 chars, we need to account for the
	# trailing characters.
	if ($element_count == $#attnames + 1)
	{
	# Last element, so allow space for ' },'
	$threshold = 77;
	}
	else
	{
	# Just need space for trailing comma
	$threshold = 79;
	}

	if ($element_count > 1)
	{
	$hash_str .= ',';
	$char_count++;
	}

	my $value = $data->{$attname};

	# Escape single quotes.
	$value =~ s/'/\\'/g;

	# Include a leading space in the key-value pair, since this will
	# always go after either a comma or an additional padding space on
	# the next line.
	my $element = " $attname => '$value'";
	my $element_length = length($element);

	# If adding the element to the current line would expand the line
	# beyond 80 chars, put it on the next line. We don't do this for
	# the first element, since that would create a blank line.
	if ($element_count > 1 and $char_count + $element_length > $threshold)
	{

	# Put on next line with an additional space preceding. There
	# are now two spaces in front of the key-value pair, lining
	# it up with the line above it.
	$hash_str .= "\n $element";
	$char_count = $element_length + 1;
	}
	else
	{
	$hash_str .= $element;
	$char_count += $element_length;
	}
	}
	return $hash_str;
	}

	sub usage
	{
	die <<EOM;
	Usage: reformat_dat_file.pl [options] datafile...

	Options:
	--output PATH output directory (default '.')
	--full-tuples write out full tuples, including default values

	Non-option arguments are the names of input .dat files.
	Updated files are written to the output directory,
	possibly overwriting the input files.

	EOM
	}