blob: ab70e95c48913889ed5a6923429b8e8909409da3 [file] [log] [blame]
#!/usr/bin/perl
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# SLZY_HDR_END
use strict;
use warnings;
use POSIX;
use Pod::Usage;
use Getopt::Long;
use Data::Dumper;
use File::Spec;
#use JSON;
# SLZY_POD_HDR_BEGIN
# WARNING: DO NOT MODIFY THE FOLLOWING POD DOCUMENT:
# Generated by sleazy.pl version 6 (release Mon Aug 20 12:30:03 2012)
# Make any changes under SLZY_TOP_BEGIN/SLZY_LONG_BEGIN
=head1 NAME
B<caqltrack.pl> - track caql usage in csv files
=head1 VERSION
This document describes version 5 of caqltrack.pl, released
Wed Nov 7 19:51:35 2012.
=head1 SYNOPSIS
B<caqltrack.pl>
Options:
-help brief help message
-man full documentation
-connect psql connect parameters
-csv process csv (log) files and extract/pre-aggregate caql usage statistics
-missing process aggregated output and list missing entries
-aggregate process pre-agg output from caqltrack to produce final statistics
=head1 OPTIONS
=over 8
=item B<-help>
Print a brief help message and exits.
=item B<-man>
Prints the manual page and exits.
=item B<-connect>
psql connect string,
e.g: -connect '-p 11000 template1'
If this option is supplied, connect to the database to determine the
location of the data directories, and then find the CSV log files and
perform phase 1.
=item B<-csv>
In the first phase, consume the contents of the CSV log files and
extract the caql references to construct pre-aggregated output with
reference count statistics.
=item B<-missing>
In the third phase, consume the aggregated output of the second
phase and list the "missing" functions.
=item B<-aggregate>
In the second phase, consume the pre-aggregated output of the first
phase and produce a final set of totals.
=back
=head1 DESCRIPTION
caqltrack processes CSV log files and extracts references to caql
basic functions. The "csv" phase is designed to iterate over sets of
CSV files (as supplied by xargs) and produce pre-aggregated output.
Since xargs may invoke caqltrack.pl multiple times, we cannot
guarantee that a single invocation of caqltrack will process all the
csv files. Thus, the first phase uses caqltrack to pre-aggregate the
caql basic function references, and the second phase invokes caqltrack
to consume the pre-aggregated output and produce the final totals.
=head2 USAGE
find [data-directory] -name '*.csv' | xargs perl caqltrack.pl -csv | perl caqltrack.pl -agg
=head2 OUTPUT
The output of the first phase is sorted by basic query function,
filename, lineno, unique query code, and the "first arg" (if it is an
oid) followed by a reference count, with vertical bar separators, eg:
caql_basic_fn_94|aclchk.c|3141|33|101|6
The output of the first phase is "pre-aggregated", so it may have
multiple rows for a single (query function, filename, lineno) key.
The second phase consumes this output, and produces a final set of
fully-aggregated output. Also, this phase sums all references to a
basic function into a #TOTAL# row:
caql_basic_fn_85|ruleutils.c|7137|33|101|306
caql_basic_fn_85|tablecmds.c|14643|33|101|6
caql_basic_fn_85|#TOTAL#|0|0|0|312
Finally, the second phase output is terminated with grand total and
total unique query counts:
#GRANDTOTAL#|#TOTAL#|0|0|0|1676343
#NUMQUERY#|#TOTAL#|0|0|0|105
=head1 CAVEATS/Future Work
=head1 AUTHORS
Apache HAWQ
Address bug reports and comments to: dev@hawq.apache.org
=cut
# SLZY_POD_HDR_END
# SLZY_GLOB_BEGIN
my $glob_id = "";
#my $glob_tabstr = "\t";
#my $glob_tabstr = " " x $glob_tabwidth;
my $glob_glob2 = {tabwidth => 4, spacedtab => 1, tabstr => " " x 4};
my $glob_glob;
# SLZY_GLOB_END
sub glob_validate
{
unless ((exists($glob_glob->{csv}) && $glob_glob->{csv}) ||
(exists($glob_glob->{connect}) && $glob_glob->{connect}) ||
(exists($glob_glob->{missing}) && $glob_glob->{missing}) ||
(exists($glob_glob->{aggregate}) && $glob_glob->{aggregate}))
{
warn("ERROR: Must specify either CSV (process csv) or Aggregate options");
pod2usage(-msg => $glob_id, -exitstatus => 1) ;
}
}
# SLZY_CMDLINE_BEGIN
# WARNING: DO NOT MODIFY THE FOLLOWING SECTION:
# Generated by sleazy.pl version 6 (release Mon Aug 20 12:30:03 2012)
# Make any changes under SLZY_TOP_BEGIN/SLZY_LONG_BEGIN
# Any additional validation logic belongs in glob_validate()
BEGIN {
my $s_help = 0; # brief help message
my $s_man = 0; # full documentation
my $s_connect; # psql connect parameters
my $s_csv = 0; # process csv (log) files and extract/pre-aggregate caql usage statistics
my $s_missing = 0; # process aggregated output and list missing entries
my $s_aggregate = 0; # process pre-agg output from caqltrack to produce final statistics
my $slzy_argv_str;
$slzy_argv_str = quotemeta(join(" ", @ARGV))
if (scalar(@ARGV));
GetOptions(
'help|?' => \$s_help,
'man' => \$s_man,
'connect:s' => \$s_connect,
'csv|log' => \$s_csv,
'missing' => \$s_missing,
'aggregate' => \$s_aggregate,
)
or pod2usage(2);
pod2usage(-msg => $glob_id, -exitstatus => 1) if $s_help;
pod2usage(-msg => $glob_id, -exitstatus => 0, -verbose => 2) if $s_man;
$glob_glob = {};
# version and properties from json definition
$glob_glob->{_sleazy_properties} = {};
$glob_glob->{_sleazy_properties}->{version} = '5';
$glob_glob->{_sleazy_properties}->{slzy_date} = '1352346695';
$glob_glob->{_sleazy_properties}->{slzy_argv_str} = $slzy_argv_str;
$glob_glob->{connect} = $s_connect if (defined($s_connect));
$glob_glob->{csv} = $s_csv if (defined($s_csv));
$glob_glob->{missing} = $s_missing if (defined($s_missing));
$glob_glob->{aggregate} = $s_aggregate if (defined($s_aggregate));
glob_validate();
}
# SLZY_CMDLINE_END
# convert a postgresql psql formatted table into an array of hashes
sub tablelizer
{
my ($ini, $got_line1) = @_;
# first, split into separate lines, the find all the column headings
my @lines = split(/\n/, $ini);
return undef
unless (scalar(@lines));
# if the first line is supplied, then it has the column headers,
# so don't try to find them (or the ---+---- separator) in
# "lines"
my $line1 = $got_line1;
$line1 = shift @lines
unless (defined($got_line1));
# look for <space>|<space>
my @colheads = split(/\s+\|\s+/, $line1);
# fixup first, last column head (remove leading,trailing spaces)
$colheads[0] =~ s/^\s+//;
$colheads[0] =~ s/\s+$//;
$colheads[-1] =~ s/^\s+//;
$colheads[-1] =~ s/\s+$//;
return undef
unless (scalar(@lines));
shift @lines # skip dashed separator (unless it was skipped already)
unless (defined($got_line1));
my @rows;
for my $lin (@lines)
{
my @cols = split(/\|/, $lin, scalar(@colheads));
last
unless (scalar(@cols) == scalar(@colheads));
my $rowh = {};
for my $colhdcnt (0..(scalar(@colheads)-1))
{
my $rawcol = shift @cols;
$rawcol =~ s/^\s+//;
$rawcol =~ s/\s+$//;
my $colhd = $colheads[$colhdcnt];
# $rowh->{($colhdcnt+1)} = $rawcol;
$rowh->{$colhd} = $rawcol;
}
push @rows, $rowh;
}
return \@rows;
}
sub do_conn
{
my $bigstr = <<'EOF_bigstr';
select gscp.dbid,
gscp.content,
gscp.hostname as hostname, gscp.address as address,
fep.fselocation as loc,
pfs.oid fsoid,
pfs.fsname,
gscp.mode,
gscp.status,
gscp.preferred_role
from
gp_segment_configuration gscp, pg_filespace_entry fep,
pg_filespace pfs
where
fsname = $q$pg_system$q$
and
fep.fsedbid=gscp.dbid
and pfs.oid = fep.fsefsoid
order by 1,2
EOF_bigstr
$bigstr .= ' ; ';
my $psql_str = "psql ";
$psql_str .= $glob_glob->{connect};
$psql_str .= " -c \' $bigstr \'";
my $tabdef = `$psql_str`;
my $seg_config_table = tablelizer($tabdef);
# print Data::Dumper->Dump([$seg_config_table]);
my %h1;
for my $rowh (@{$seg_config_table})
{
my $dir = $rowh->{loc};
my $hostname = $rowh->{hostname};
my @foo = File::Spec->splitdir($dir);
pop @foo;
$dir = File::Spec->catdir(@foo);
$h1{$hostname . ":" . $dir} = {hostname => $hostname,
dir => $dir};
}
for my $hh (sort(keys(%h1)))
{
my $dir = $h1{$hh}->{dir};
my $prog = $0;
system("find $dir -name '*.csv' | xargs perl $prog -csv ");
# system("find $dir -name '*.csv' ");
# print `find $dir -name '*.csv' `;
}
} # end do_conn
sub do_csv
{
my %bigh;
while (<>)
{
my $ini = $_;
next unless ($ini =~ m/catquery.*caql\_basic/);
# make sure is actual message, not sql string for view:
# 'catquery: caql_basic_fn_', ''), 'caller: ', ''), ' ') \
# as caql_mess_arr
next unless ($ini =~ m/caql\_basic\_fn\_\d+\s+caller/);
# print $ini;
my @ggg = split(/catquery: /, $ini);
my @fff = split(/\,/, $ggg[1]);
my $lin = $fff[0];
die "bad line: $ini" unless (defined($lin) && length($lin));
# print $lin;
$lin =~ s/\"//g; # remove quotes;
# change to vertical bar separated list
$lin =~ s/\s*caller\:\s*/\|/;
$lin =~ s/\s+/\|/g;
$bigh{$lin} = 0 unless (exists($bigh{$lin}));
$bigh{$lin} += 1;
} # end while
# output in the form of
# <func name>|<filename>|<lineno>|<uniqqno>|<firstarg>|<reference count>,
# eg:
#
# caql_basic_fn_118|dbcommands.c|2270|33|101|3
for my $kk (sort(keys(%bigh)))
{
print $kk, "|", $bigh{$kk}, "\n";
}
} # end do_csv
sub do_agg
{
my %bigh;
# input in the form of
# <func name>|<filename>|<lineno>|<uniqqno>|<firstarg>|<reference count>,
# eg:
#
# caql_basic_fn_118|dbcommands.c|2270|33|101|3
# NOTE: since caqltrack -csv is called via xargs, may have
# multiple sets of pre-aggregated lines,
# so split by "key" portion (func, file, lineno, uniqqno, firstarg) and
# sum the reference counts
while (<>)
{
my $ini = $_;
my @ggg = split(/\|/, $ini);
die "bad line: $ini" unless (1 < scalar(@ggg));
# extract the reference count from the end of the line
my $num = pop @ggg;
die "bad line count: $ini" unless ($num =~ m/\d+/);
# convert the "firstarg" to zero to fix aggregation
$ggg[-1] = 0;
my $lin = join("|", @ggg);
$bigh{$lin} = 0 unless (exists($bigh{$lin}));
$bigh{$lin} += $num; # NOTE: not 1, sum the reference counts
}
my $currqry;
my $numcurr = 0;
my $numqry = 0;
my $grandtot = 0;
for my $kk (sort(keys(%bigh)))
{
my @ggg = split(/\|/, $kk);
if (defined($currqry))
{
if ($currqry eq $ggg[0])
{
$numcurr += $bigh{$kk};
}
else
{
print $currqry, "|#TOTAL#|0|0|0|", $numcurr, "\n";
$currqry = $ggg[0];
$numcurr = $bigh{$kk};
$numqry += 1;
}
}
else
{
$currqry = $ggg[0];
$numcurr = $bigh{$kk};
$numqry += 1;
}
print $kk, "|", $bigh{$kk}, "\n";
$grandtot += $bigh{$kk};
} # end for $kk
if (defined($currqry))
{
print $currqry, "|#TOTAL#|0|0|0|", $numcurr, "\n";
}
if ($grandtot)
{
print "#GRANDTOTAL#|#TOTAL#|0|0|0|", $grandtot, "\n";
print "#NUMQUERY#|#TOTAL#|0|0|0|", $numqry, "\n";
}
} # end do_agg
sub do_missing
{
my $biga = [];
$biga->[0] = undef;
# input in the form of
# <func name>|#TOTAL#|0|0|0|<reference count>, eg:
#
# caql_basic_fn_118|#TOTAL#|0|0|0|3
while (<>)
{
my $ini = $_;
next unless ($ini =~ m/^caql\_basic\_fn/);
next unless ($ini =~ m/\#TOTAL\#/);
my @ggg = ($ini =~ m/^caql\_basic\_fn\_(\d+)/);
die "bad line: $ini" unless (1 == scalar(@ggg));
# extract the function number
my $num = pop @ggg;
$num--; # convert from 1 to zero based for array
die "bad fn: $ini" unless ($num >= 0);
$biga->[$num] = $ini;
}
my $numMiss = 0;
for my $qnum (0..(scalar(@{$biga})-1))
{
unless (exists($biga->[$qnum]) &&
defined($biga->[$qnum]))
{
$numMiss += 1;
print "caql_basic_fn_" . ($qnum+1) . "\n";
}
}
print "#MISSING: ", $numMiss, "\n";
print "#MAXNUM: ", scalar(@{$biga}), "\n";
} # end do_missing
if (1)
{
if (exists($glob_glob->{connect}) && $glob_glob->{connect})
{
do_conn();
}
elsif (exists($glob_glob->{csv}) && $glob_glob->{csv})
{
do_csv();
}
elsif (exists($glob_glob->{aggregate}) && $glob_glob->{aggregate})
{
do_agg();
}
elsif (exists($glob_glob->{missing}) && $glob_glob->{missing})
{
do_missing();
}
else
{
die "invalid options!";
}
}
# SLZY_TOP_BEGIN
if (0)
{
my $bigstr = <<'EOF_bigstr';
{
"args" : [
{
"alias" : "?",
"long" : "Print a brief help message and exits.",
"name" : "help",
"required" : "0",
"short" : "brief help message",
"type" : "untyped"
},
{
"long" : "Prints the manual page and exits.",
"name" : "man",
"required" : "0",
"short" : "full documentation",
"type" : "untyped"
},
{
"long" : "$connlong",
"name" : "connect",
"short" : "psql connect parameters",
"type" : "string"
},
{
"alias" : "log",
"long" : "$csvlong",
"name" : "csv",
"short" : "process csv (log) files and extract/pre-aggregate caql usage statistics",
"type" : "u"
},
{
"long" : "$missinglong",
"name" : "missing",
"short" : "process aggregated output and list missing entries",
"type" : "u"
},
{
"long" : "$agglong",
"name" : "aggregate",
"short" : "process pre-agg output from caqltrack to produce final statistics",
"type" : "u"
}
],
"long" : "$toplong",
"properties" : {
"slzy_date" : 1352346695
},
"short" : "track caql usage in csv files",
"version" : "5"
}
EOF_bigstr
}
# SLZY_TOP_END
# SLZY_LONG_BEGIN
if (0)
{
my $toplong = <<'EOF_longstr';
caqltrack processes CSV log files and extracts references to caql
basic functions. The "csv" phase is designed to iterate over sets of
CSV files (as supplied by xargs) and produce pre-aggregated output.
Since xargs may invoke caqltrack.pl multiple times, we cannot
guarantee that a single invocation of caqltrack will process all the
csv files. Thus, the first phase uses caqltrack to pre-aggregate the
caql basic function references, and the second phase invokes caqltrack
to consume the pre-aggregated output and produce the final totals.
{HEAD2} USAGE
find [data-directory] -name '*.csv' | xargs perl caqltrack.pl -csv | perl caqltrack.pl -agg
{HEAD2} OUTPUT
The output of the first phase is sorted by basic query function,
filename, lineno, unique query code, and the "first arg" (if it is an
oid) followed by a reference count, with vertical bar separators, eg:
caql_basic_fn_94|aclchk.c|3141|33|101|6
The output of the first phase is "pre-aggregated", so it may have
multiple rows for a single (query function, filename, lineno) key.
The second phase consumes this output, and produces a final of
fully-aggregated output. Also, this phase sums all references to a
basic function into a #TOTAL# row:
caql_basic_fn_85|ruleutils.c|7137|33|101306
caql_basic_fn_85|tablecmds.c|1464333|101|6
caql_basic_fn_85|#TOTAL#|0|0|0312
Finally, the second phase output is terminated with grand total and
total unique query counts:
#GRANDTOTAL#|#TOTAL#|0|0|0|1676343
#NUMQUERY#|#TOTAL#|0|0|0|105
{HEAD1} CAVEATS/Future Work
EOF_longstr
my $missinglong = <<'EOF_missinglong';
In the third phase, consume the aggregated output of the second
phase and list the "missing" functions.
EOF_missinglong
my $agglong = <<'EOF_agglong';
In the second phase, consume the pre-aggregated output of the first
phase and produce a final set of totals.
EOF_agglong
my $csvlong = <<'EOF_csvlong';
In the first phase, consume the contents of the CSV log files and
extract the caql references to construct pre-aggregated output with
reference count statistics.
EOF_csvlong
my $connlong = <<'EOF_connlong';
psql connect string,
e.g: -connect '-p 11000 template1'
If this option is supplied, connect to the database to determine the
location of the data directories, and then find the CSV log files and
perform phase 1.
EOF_connlong
}
# SLZY_LONG_END