blob: 330e0a5dae3e9bcb14e016ae04e59f8eb3b2a83a [file] [log] [blame]
#!/usr/bin/env perl
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
use Getopt::Long;
use Pod::Usage;
use strict;
use warnings;
=head1 NAME
B<gpsd> - Greenplum Statistics Dumper
B<gpsd> [-?] [-h hostname] [-p port] [-U username] dbname
gpsd creates a schema and statistics dump of a given Greenplum database instance which can be used to create a database environment in which all optimizer issues can be reproduced without having to copy any user data. This facilitates troubleshooting as well as verification of changes to the optimizer logic without costly data transfer.
The unit of a dump is a complete database instance. The resulting output is an SQL script.
=head1 OPTIONS
gpsd supports the same connection options as pg_dump and psql:
=over 6
=item -?
Print usage information.
=item -h hostname
Hostname of target server. Default: localhost.
=item -p port
Port of target server. Default: 5432.
=item -U username
Database user name. Default: current userid.
=head1 USAGE
The dump file contains all information necessary to re-create the database
schema and populate the statistics contained in pg_statistic and pg_class.
Since gpsd writes to STDOUT the standard usage pattern is like this
gpsd -h myserver mydb > mydb.clone.sql
createdb mydb.clone
psql -h myserver -f mydb.clone.sql
gpsd uses the standard utilities pg_dump and psql. Consequently it inherits their limitations. Currently known issues include:
=over 6
=item MPP-1890
pg_dump does not script constraints on inherited tables correctly, in particular, constraints on the parent table which have been explicitly removed from the children. The removal of the constraint on the children is ignored which can result in contradicting constraints on the children. For example, range partitioning implemented with inherited tables may place a contradiction as constraint on the parent table to exclude it from DML and query operations.
=head1 AUTHORS
Apache HAWQ
Address bug reports and comments to:
my $glob_id = "";
my $GPSD = "gpsd";
my $copyright = "Copyright 2015, The Apache Software Foundation";
my $psqlClient = "psql";
my $pgdump = "pg_dump";
my $pgdumpall = "pg_dumpall";
my $commentPrefix = "-- ";
my $sysnslist ="('pg_toast', 'pg_bitmapindex', 'pg_temp_1', ".
"'pg_catalog', 'information_schema')";
# connection variables set from options
my $user = "";
my $port = "";
my $host = "";
# format text as SQL comment
sub comment
my $text = shift;
my @lines = split '\n', $text."\<";
my $comment = join "\n".$commentPrefix, @lines;
$comment =~ s/\<$//g;
return $commentPrefix.$comment. "\n";
sub getGlobals
my $pid = open (PGDUMPALL, "$pgdumpall $host $port $user -g --gp-syntax |");
die unless defined($pid);
my @globals = <PGDUMPALL>;
die "$GPSD: reading from $pgdumpall ($pid) failed. \n" unless @globals;
return @globals;
# extract all schemas from a given database using $pgdump
sub getSchema
my $db = shift;
my $pid = open(PGDUMP, "$pgdump $host $port $user -s -x --gp-syntax -O $db|");
die unless defined($pid);
my @schema = <PGDUMP>;
close PGDUMP;
die "$GPSD: reading from $pgdump ($pid) failed.\n" unless @schema;
return @schema;
# simple SQL executor using psql
# not using DBI as it doesn't come with the standard distribution
sub executeSQL
my $db = shift;
my $stmt = shift;
# print "EXECUTING: $host $port $user $db -c \"$stmt\"\n";
my $pid = open(PSQL, "$psqlClient $host $port $user $db -c \"$stmt\" |");
die unless defined($pid);
my @result = <PSQL>;
close PSQL;
my $len = $#result;
@result = splice(@result, 0, $len - 1);
die "$GPSD: reading from $psqlClient ($pid) failed.\n" unless @result;
return @result;
# get server-side date,time,version
sub getDateTimeVersion
my $db = shift;
(&executeSQL($db, "select current_date, current_time, version()"))[2];
# strip off leading and trailing blanks
sub mstrip
$_ = shift @_;
return $_;
# basic escaping of special chars
sub escValue
$_ = shift;
return $_;
# break up a row as it is returned from psql
# use the +'s in the formatting to decide on the width of the fields
sub splituprow
my $line = shift;
my $data = shift;
#print "$line$data";
my @parts = split '\+', $line;
my $pos = 0;
my @res = ();
foreach my $part (@parts)
my $len = length($part);
push @res, &mstrip(substr($data, $pos, $len));
$pos += $len + 1;
return @res;
# construct SET part of UPDATE statement
sub setStmt
my($names, $values, $types, $incl) = @_;
my $first = 1;
my @res = ();
foreach my $rec (@$incl)
my $type = @$types[$rec];
my $name = @$names[$rec];
my $value = @$values[$rec];
$value = mkValue($value, $type);
# print "r = $rec, t = $type, n = $name, v = '$value'\n";
push @res , "$name = $value"."::$type";
return "\t".join (",\n\t", @res);
# determine page and tuple counts
# build UPDATE statement
sub setPagesTuples
my $db = shift;
my $stmt =
"SELECT pgc.relname, pgn.nspname, pgc.relpages, pgc.reltuples ".
"FROM pg_class pgc, pg_namespace pgn ".
"WHERE pgc.relnamespace = pgn.oid ".
"and pgn.nspname NOT IN $sysnslist";
my ($header, $line, @pgc) = &executeSQL($db, $stmt);
my $res = "-- set pages and tuple counts\n\n";
my @names = &splituprow($line, $header);
# print "names = '$header', '@names'\n";
my @types = ("ignore","ignore","int","real");
my @incls = (2..3);
foreach my $row (@pgc)
my @values = &splituprow($line, $row);
my ($table, $namespace) = @values;
$res .=
&comment("\nTable: $table\n").
"UPDATE pg_class\nSET\n".
&setStmt(\@names, \@values, \@types, \@incls).
"\nWHERE relname = '$table' ".
"AND relnamespace = ".
"(SELECT oid FROM pg_namespace WHERE nspname = '$namespace');\n\n";
return $res."\n\n";
# extract all data from pg_statistic
sub getStats
my $db = shift;
my $stmt =
"SELECT pgc.relname, pgn.nspname, pga.attname, pgt.typname, pgs.* ".
"FROM pg_class pgc, pg_statistic pgs, ".
"pg_namespace pgn, ".
"pg_attribute pga, pg_type pgt ".
"pgc.relnamespace = pgn.oid ".
"and pgn.nspname NOT IN $sysnslist ".
"and pgc.oid = pgs.starelid ".
"and pga.attrelid = pgc.oid ".
"and pga.attnum = pgs.staattnum ".
"and pga.atttypid = pgt.oid ".
"ORDER BY pgc.relname, pgs.staattnum";
return &executeSQL($db, $stmt);
# quote a value based on its original value and its type
sub mkValue
my $value = shift;
my $type = shift;
my $lquote = "";
# quoting of array types
my $isArray = $type =~ /^\_|\]$/;
$lquote= "'" if ($isArray);
# escape value
$value = &escValue($value);
if ($value eq "NaN")
# patch up NaN's for real/float
if ($type eq "real" || $type eq "float")
$value = "'NaN'"
# empty columns are NULL values
$value = "NULL" if $value eq "";
# BUG: this shouldn't happen -- scanner problem!
$value = "NULL" if ($isArray && not $value =~/^\{/);
# do not quote a NULL in an array
$lquote= "" if ($isArray && $value eq "NULL");
return $lquote.$value.$lquote;
# format value and type, quote accordingly
sub combineValuesTypes
my $values = shift;
my $types = shift;
my @res = ();
foreach my $v (@$values)
my $type = shift @$types;
$v = mkValue($v, $type);
push @res, $v."::".$type;
return @res;
sub setStats
my ($header, $line, @qry) = @_;
my $res = "";
my @names = &splituprow($line, $header);
# print "names = '$header', '@names'\n";
my @types = (
"ignore", # relname
"ignore", # ns name
"ignore", # attname
"ignore", # typname
#columms of pg_statistic
"oid", # starelid
"smallint", # staattnum
# incomplete -- types for stavalues1-4 are added
# on a per row basis
foreach my $row (@qry)
my @values = &splituprow($line, $row);
my ($table, $namespace, $attname, $specType, $relid, $column)
= @values;
$specType .= "[]" unless $specType =~ /^_/;
# add row specific types to the type array
my @typesTmp = @types;
push @typesTmp, ($specType, $specType, $specType, $specType);
my @combvals = &combineValuesTypes(\@values, \@typesTmp);
my @newvals = ("'$namespace.$table'::regclass", splice(@combvals, 5, $#combvals - 4));
$res .=
&comment ("\nTable: $table, Attribute: $attname\n").
"INSERT INTO pg_statistic VALUES (\n".
"\t". join(",\n\t", @newvals).
return $res;
# dumper routine
sub dumper
my $db = shift;
my $dateTimeVersion =
"SELECT current_date, current_time, version()";
my ($header, $line, $row, @rem) = &executeSQL($db, $dateTimeVersion);
my ($date, $time, $version) = &splituprow($line, $row);
my @output = ();
push @output,
"\nGreenplum Database Statistics Dump\n".
"Database: $db\n".
"Date: $date\n".
"Time: $time\n".
"CmdLine: $host $port $user\n".
"Version: $version\n");
push @output, &getGlobals;
push @output, &comment("\nConnecting to target database\n");
push @output, "\\connect $db\n";
push @output, &getSchema($db);
push @output, &comment("\nAllow system table modifications\n");
push @output, "set allow_system_table_mods=\"DML\";\n";
push @output, &setPagesTuples($db);
push @output, &setStats(&getStats($db));
push @output, &comment("\nGreenplum Database Statistics Dump complete\n");
return @output;
# parse options
my $man =0;
'help|?' => \$man,
"h:s" => \$host,
"p:i" => \$port,
"U:s" => \$user
pod2usage(-msg => $glob_id, -verbose => 2);
# display man page
pod2usage(-msg => $glob_id, -verbose => 2) if ($man || $#ARGV == -1);
$host = "-h $host" if $host;
$port = "-p $port" if $port;
$user = "-U $user" if $user;
# call actual dumper
print &dumper($ARGV[0]);