blob: ab70e95c48913889ed5a6923429b8e8909409da3 [file] [log] [blame]
#!/usr/bin/perl
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# SLZY_HDR_END
use strict;
use warnings;
use POSIX;
use Pod::Usage;
use Getopt::Long;
use Data::Dumper;
use File::Spec;
#use JSON;
# SLZY_POD_HDR_BEGIN
# WARNING: DO NOT MODIFY THE FOLLOWING POD DOCUMENT:
# Generated by sleazy.pl version 6 (release Mon Aug 20 12:30:03 2012)
# Make any changes under SLZY_TOP_BEGIN/SLZY_LONG_BEGIN
=head1 NAME
B<caqltrack.pl> - track caql usage in csv files
=head1 VERSION
This document describes version 5 of caqltrack.pl, released
Wed Nov 7 19:51:35 2012.
=head1 SYNOPSIS
B<caqltrack.pl>
Options:
-help brief help message
-man full documentation
-connect psql connect parameters
-csv process csv (log) files and extract/pre-aggregate caql usage statistics
-missing process aggregated output and list missing entries
-aggregate process pre-agg output from caqltrack to produce final statistics
=head1 OPTIONS
=over 8
=item B<-help>
Print a brief help message and exits.
=item B<-man>
Prints the manual page and exits.
=item B<-connect>
psql connect string,
e.g: -connect '-p 11000 template1'
If this option is supplied, connect to the database to determine the
location of the data directories, and then find the CSV log files and
perform phase 1.
=item B<-csv>
In the first phase, consume the contents of the CSV log files and
extract the caql references to construct pre-aggregated output with
reference count statistics.
=item B<-missing>
In the third phase, consume the aggregated output of the second
phase and list the "missing" functions.
=item B<-aggregate>
In the second phase, consume the pre-aggregated output of the first
phase and produce a final set of totals.
=back
=head1 DESCRIPTION
caqltrack processes CSV log files and extracts references to caql
basic functions. The "csv" phase is designed to iterate over sets of
CSV files (as supplied by xargs) and produce pre-aggregated output.
Since xargs may invoke caqltrack.pl multiple times, we cannot
guarantee that a single invocation of caqltrack will process all the
csv files. Thus, the first phase uses caqltrack to pre-aggregate the
caql basic function references, and the second phase invokes caqltrack
to consume the pre-aggregated output and produce the final totals.
=head2 USAGE
find [data-directory] -name '*.csv' | xargs perl caqltrack.pl -csv | perl caqltrack.pl -agg
=head2 OUTPUT
The output of the first phase is sorted by basic query function,
filename, lineno, unique query code, and the "first arg" (if it is an
oid) followed by a reference count, with vertical bar separators, eg:
caql_basic_fn_94|aclchk.c|3141|33|101|6
The output of the first phase is "pre-aggregated", so it may have
multiple rows for a single (query function, filename, lineno) key.
The second phase consumes this output, and produces a final set of
fully-aggregated output. Also, this phase sums all references to a
basic function into a #TOTAL# row:
caql_basic_fn_85|ruleutils.c|7137|33|101|306
caql_basic_fn_85|tablecmds.c|14643|33|101|6
caql_basic_fn_85|#TOTAL#|0|0|0|312
Finally, the second phase output is terminated with grand total and
total unique query counts:
#GRANDTOTAL#|#TOTAL#|0|0|0|1676343
#NUMQUERY#|#TOTAL#|0|0|0|105
=head1 CAVEATS/Future Work
=head1 AUTHORS
Apache HAWQ
Address bug reports and comments to: dev@hawq.apache.org
=cut
# SLZY_POD_HDR_END
# SLZY_GLOB_BEGIN
my $glob_id = "";
#my $glob_tabstr = "\t";
#my $glob_tabstr = " " x $glob_tabwidth;
my $glob_glob2 = {tabwidth => 4, spacedtab => 1, tabstr => " " x 4};
my $glob_glob;
# SLZY_GLOB_END
sub glob_validate
{
unless ((exists($glob_glob->{csv}) && $glob_glob->{csv}) ||
(exists($glob_glob->{connect}) && $glob_glob->{connect}) ||
(exists($glob_glob->{missing}) && $glob_glob->{missing}) ||
(exists($glob_glob->{aggregate}) && $glob_glob->{aggregate}))
{
warn("ERROR: Must specify either CSV (process csv) or Aggregate options");
pod2usage(-msg => $glob_id, -exitstatus => 1) ;
}
}
# SLZY_CMDLINE_BEGIN
# WARNING: DO NOT MODIFY THE FOLLOWING SECTION:
# Generated by sleazy.pl version 6 (release Mon Aug 20 12:30:03 2012)
# Make any changes under SLZY_TOP_BEGIN/SLZY_LONG_BEGIN
# Any additional validation logic belongs in glob_validate()
BEGIN {
my $s_help = 0; # brief help message
my $s_man = 0; # full documentation
my $s_connect; # psql connect parameters
my $s_csv = 0; # process csv (log) files and extract/pre-aggregate caql usage statistics
my $s_missing = 0; # process aggregated output and list missing entries
my $s_aggregate = 0; # process pre-agg output from caqltrack to produce final statistics
my $slzy_argv_str;
$slzy_argv_str = quotemeta(join(" ", @ARGV))
if (scalar(@ARGV));
GetOptions(
'help|?' => \$s_help,
'man' => \$s_man,
'connect:s' => \$s_connect,
'csv|log' => \$s_csv,
'missing' => \$s_missing,
'aggregate' => \$s_aggregate,
)
or pod2usage(2);
pod2usage(-msg => $glob_id, -exitstatus => 1) if $s_help;
pod2usage(-msg => $glob_id, -exitstatus => 0, -verbose => 2) if $s_man;
$glob_glob = {};
# version and properties from json definition
$glob_glob->{_sleazy_properties} = {};
$glob_glob->{_sleazy_properties}->{version} = '5';
$glob_glob->{_sleazy_properties}->{slzy_date} = '1352346695';
$glob_glob->{_sleazy_properties}->{slzy_argv_str} = $slzy_argv_str;
$glob_glob->{connect} = $s_connect if (defined($s_connect));
$glob_glob->{csv} = $s_csv if (defined($s_csv));
$glob_glob->{missing} = $s_missing if (defined($s_missing));
$glob_glob->{aggregate} = $s_aggregate if (defined($s_aggregate));
glob_validate();
}
# SLZY_CMDLINE_END
# convert a postgresql psql formatted table into an array of hashes
sub tablelizer
{
my ($ini, $got_line1) = @_;
# first, split into separate lines, the find all the column headings
my @lines = split(/\n/, $ini);
return undef
unless (scalar(@lines));
# if the first line is supplied, then it has the column headers,
# so don't try to find them (or the ---+---- separator) in
# "lines"
my $line1 = $got_line1;
$line1 = shift @lines
unless (defined($got_line1));
# look for <space>|<space>
my @colheads = split(/\s+\|\s+/, $line1);
# fixup first, last column head (remove leading,trailing spaces)
$colheads[0] =~ s/^\s+//;
$colheads[0] =~ s/\s+$//;
$colheads[-1] =~ s/^\s+//;
$colheads[-1] =~ s/\s+$//;
return undef
unless (scalar(@lines));
shift @lines # skip dashed separator (unless it was skipped already)
unless (defined($got_line1));
my @rows;
for my $lin (@lines)
{
my @cols = split(/\|/, $lin, scalar(@colheads));
last
unless (scalar(@cols) == scalar(@colheads));
my $rowh = {};
for my $colhdcnt (0..(scalar(@colheads)-1))
{
my $rawcol = shift @cols;
$rawcol =~ s/^\s+//;
$rawcol =~ s/\s+$//;
my $colhd = $colheads[$colhdcnt];
# $rowh->{($colhdcnt+1)} = $rawcol;
$rowh->{$colhd} = $rawcol;
}
push @rows, $rowh;
}
return \@rows;
}
sub do_conn
{
my $bigstr = <<'EOF_bigstr';
select gscp.dbid,
gscp.content,
gscp.hostname as hostname, gscp.address as address,
fep.fselocation as loc,
pfs.oid fsoid,
pfs.fsname,
gscp.mode,
gscp.status,
gscp.preferred_role
from
gp_segment_configuration gscp, pg_filespace_entry fep,
pg_filespace pfs
where
fsname = $q$pg_system$q$
and
fep.fsedbid=gscp.dbid
and pfs.oid = fep.fsefsoid
order by 1,2
EOF_bigstr
$bigstr .= ' ; ';
my $psql_str = "psql ";
$psql_str .= $glob_glob->{connect};
$psql_str .= " -c \' $bigstr \'";
my $tabdef = `$psql_str`;
my $seg_config_table = tablelizer($tabdef);
# print Data::Dumper->Dump([$seg_config_table]);
my %h1;
for my $rowh (@{$seg_config_table})
{
my $dir = $rowh->{loc};
my $hostname = $rowh->{hostname};
my @foo = File::Spec->splitdir($dir);
pop @foo;
$dir = File::Spec->catdir(@foo);
$h1{$hostname . ":" . $dir} = {hostname => $hostname,
dir => $dir};
}
for my $hh (sort(keys(%h1)))
{
my $dir = $h1{$hh}->{dir};
my $prog = $0;
system("find $dir -name '*.csv' | xargs perl $prog -csv ");
# system("find $dir -name '*.csv' ");
# print `find $dir -name '*.csv' `;
}
} # end do_conn
sub do_csv
{
my %bigh;
while (<>)
{
my $ini = $_;
next unless ($ini =~ m/catquery.*caql\_basic/);
# make sure is actual message, not sql string for view:
# 'catquery: caql_basic_fn_', ''), 'caller: ', ''), ' ') \
# as caql_mess_arr
next unless ($ini =~ m/caql\_basic\_fn\_\d+\s+caller/);
# print $ini;
my @ggg = split(/catquery: /, $ini);
my @fff = split(/\,/, $ggg[1]);
my $lin = $fff[0];
die "bad line: $ini" unless (defined($lin) && length($lin));
# print $lin;
$lin =~ s/\"//g; # remove quotes;
# change to vertical bar separated list
$lin =~ s/\s*caller\:\s*/\|/;
$lin =~ s/\s+/\|/g;
$bigh{$lin} = 0 unless (exists($bigh{$lin}));
$bigh{$lin} += 1;
} # end while
# output in the form of
# <func name>|<filename>|<lineno>|<uniqqno>|<firstarg>|<reference count>,
# eg:
#
# caql_basic_fn_118|dbcommands.c|2270|33|101|3
for my $kk (sort(keys(%bigh)))
{
print $kk, "|", $bigh{$kk}, "\n";
}
} # end do_csv
sub do_agg
{
my %bigh;
# input in the form of
# <func name>|<filename>|<lineno>|<uniqqno>|<firstarg>|<reference count>,
# eg:
#
# caql_basic_fn_118|dbcommands.c|2270|33|101|3
# NOTE: since caqltrack -csv is called via xargs, may have
# multiple sets of pre-aggregated lines,
# so split by "key" portion (func, file, lineno, uniqqno, firstarg) and
# sum the reference counts
while (<>)
{
my $ini = $_;
my @ggg = split(/\|/, $ini);
die "bad line: $ini" unless (1 < scalar(@ggg));
# extract the reference count from the end of the line
my $num = pop @ggg;
die "bad line count: $ini" unless ($num =~ m/\d+/);
# convert the "firstarg" to zero to fix aggregation
$ggg[-1] = 0;
my $lin = join("|", @ggg);
$bigh{$lin} = 0 unless (exists($bigh{$lin}));
$bigh{$lin} += $num; # NOTE: not 1, sum the reference counts
}
my $currqry;
my $numcurr = 0;
my $numqry = 0;
my $grandtot = 0;
for my $kk (sort(keys(%bigh)))
{
my @ggg = split(/\|/, $kk);
if (defined($currqry))
{
if ($currqry eq $ggg[0])
{
$numcurr += $bigh{$kk};
}
else
{
print $currqry, "|#TOTAL#|0|0|0|", $numcurr, "\n";
$currqry = $ggg[0];
$numcurr = $bigh{$kk};
$numqry += 1;
}
}
else
{
$currqry = $ggg[0];
$numcurr = $bigh{$kk};
$numqry += 1;
}
print $kk, "|", $bigh{$kk}, "\n";
$grandtot += $bigh{$kk};
} # end for $kk
if (defined($currqry))
{
print $currqry, "|#TOTAL#|0|0|0|", $numcurr, "\n";
}
if ($grandtot)
{
print "#GRANDTOTAL#|#TOTAL#|0|0|0|", $grandtot, "\n";
print "#NUMQUERY#|#TOTAL#|0|0|0|", $numqry, "\n";
}
} # end do_agg
sub do_missing
{
my $biga = [];
$biga->[0] = undef;
# input in the form of
# <func name>|#TOTAL#|0|0|0|<reference count>, eg:
#
# caql_basic_fn_118|#TOTAL#|0|0|0|3
while (<>)
{
my $ini = $_;
next unless ($ini =~ m/^caql\_basic\_fn/);
next unless ($ini =~ m/\#TOTAL\#/);
my @ggg = ($ini =~ m/^caql\_basic\_fn\_(\d+)/);
die "bad line: $ini" unless (1 == scalar(@ggg));
# extract the function number
my $num = pop @ggg;
$num--; # convert from 1 to zero based for array
die "bad fn: $ini" unless ($num >= 0);
$biga->[$num] = $ini;
}
my $numMiss = 0;
for my $qnum (0..(scalar(@{$biga})-1))
{
unless (exists($biga->[$qnum]) &&
defined($biga->[$qnum]))
{
$numMiss += 1;
print "caql_basic_fn_" . ($qnum+1) . "\n";
}
}
print "#MISSING: ", $numMiss, "\n";
print "#MAXNUM: ", scalar(@{$biga}), "\n";
} # end do_missing
if (1)
{
if (exists($glob_glob->{connect}) && $glob_glob->{connect})
{
do_conn();
}
elsif (exists($glob_glob->{csv}) && $glob_glob->{csv})
{
do_csv();
}
elsif (exists($glob_glob->{aggregate}) && $glob_glob->{aggregate})
{
do_agg();
}
elsif (exists($glob_glob->{missing}) && $glob_glob->{missing})
{
do_missing();
}
else
{
die "invalid options!";
}
}
# SLZY_TOP_BEGIN
if (0)
{
my $bigstr = <<'EOF_bigstr';
{
"args" : [
{
"alias" : "?",
"long" : "Print a brief help message and exits.",
"name" : "help",
"required" : "0",
"short" : "brief help message",
"type" : "untyped"
},
{
"long" : "Prints the manual page and exits.",
"name" : "man",
"required" : "0",
"short" : "full documentation",
"type" : "untyped"
},
{
"long" : "$connlong",
"name" : "connect",
"short" : "psql connect parameters",
"type" : "string"
},
{
"alias" : "log",
"long" : "$csvlong",
"name" : "csv",
"short" : "process csv (log) files and extract/pre-aggregate caql usage statistics",
"type" : "u"
},
{
"long" : "$missinglong",
"name" : "missing",
"short" : "process aggregated output and list missing entries",
"type" : "u"
},
{
"long" : "$agglong",
"name" : "aggregate",
"short" : "process pre-agg output from caqltrack to produce final statistics",
"type" : "u"
}
],
"long" : "$toplong",
"properties" : {
"slzy_date" : 1352346695
},
"short" : "track caql usage in csv files",
"version" : "5"
}
EOF_bigstr
}
# SLZY_TOP_END
# SLZY_LONG_BEGIN
if (0)
{
my $toplong = <<'EOF_longstr';
caqltrack processes CSV log files and extracts references to caql
basic functions. The "csv" phase is designed to iterate over sets of
CSV files (as supplied by xargs) and produce pre-aggregated output.
Since xargs may invoke caqltrack.pl multiple times, we cannot
guarantee that a single invocation of caqltrack will process all the
csv files. Thus, the first phase uses caqltrack to pre-aggregate the
caql basic function references, and the second phase invokes caqltrack
to consume the pre-aggregated output and produce the final totals.
{HEAD2} USAGE
find [data-directory] -name '*.csv' | xargs perl caqltrack.pl -csv | perl caqltrack.pl -agg
{HEAD2} OUTPUT
The output of the first phase is sorted by basic query function,
filename, lineno, unique query code, and the "first arg" (if it is an
oid) followed by a reference count, with vertical bar separators, eg:
caql_basic_fn_94|aclchk.c|3141|33|101|6
The output of the first phase is "pre-aggregated", so it may have
multiple rows for a single (query function, filename, lineno) key.
The second phase consumes this output, and produces a final of
fully-aggregated output. Also, this phase sums all references to a
basic function into a #TOTAL# row:
caql_basic_fn_85|ruleutils.c|7137|33|101306
caql_basic_fn_85|tablecmds.c|1464333|101|6
caql_basic_fn_85|#TOTAL#|0|0|0312
Finally, the second phase output is terminated with grand total and
total unique query counts:
#GRANDTOTAL#|#TOTAL#|0|0|0|1676343
#NUMQUERY#|#TOTAL#|0|0|0|105
{HEAD1} CAVEATS/Future Work
EOF_longstr
my $missinglong = <<'EOF_missinglong';
In the third phase, consume the aggregated output of the second
phase and list the "missing" functions.
EOF_missinglong
my $agglong = <<'EOF_agglong';
In the second phase, consume the pre-aggregated output of the first
phase and produce a final set of totals.
EOF_agglong
my $csvlong = <<'EOF_csvlong';
In the first phase, consume the contents of the CSV log files and
extract the caql references to construct pre-aggregated output with
reference count statistics.
EOF_csvlong
my $connlong = <<'EOF_connlong';
psql connect string,
e.g: -connect '-p 11000 template1'
If this option is supplied, connect to the database to determine the
location of the data directories, and then find the CSV log files and
perform phase 1.
EOF_connlong
}
# SLZY_LONG_END