| #!/usr/bin/perl |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| # SLZY_HDR_END |
| |
| use POSIX; |
| use Pod::Usage; |
| use Getopt::Long; |
| use Data::Dumper; |
| #use JSON; |
| use strict; |
| use warnings; |
| |
| # SLZY_POD_HDR_BEGIN |
| # WARNING: DO NOT MODIFY THE FOLLOWING POD DOCUMENT: |
| # Generated by sleazy.pl version 6 (release Mon Aug 20 12:30:03 2012) |
| # Make any changes under SLZY_TOP_BEGIN/SLZY_LONG_BEGIN |
| |
| =head1 NAME |
| |
| B<caqltrack.pl> - track caql usage in csv files |
| |
| =head1 VERSION |
| |
| This document describes version 5 of caqltrack.pl, released |
| Wed Nov 7 19:51:35 2012. |
| |
| =head1 SYNOPSIS |
| |
| B<caqltrack.pl> |
| |
| Options: |
| |
| -help brief help message |
| -man full documentation |
| -connect psql connect parameters |
| -csv process csv (log) files and extract/pre-aggregate caql usage statistics |
| -missing process aggregated output and list missing entries |
| -aggregate process pre-agg output from caqltrack to produce final statistics |
| |
| =head1 OPTIONS |
| |
| =over 8 |
| |
| =item B<-help> |
| |
| Print a brief help message and exits. |
| |
| =item B<-man> |
| |
| Prints the manual page and exits. |
| |
| =item B<-connect> |
| |
| psql connect string, |
| |
| e.g: -connect '-p 11000 template1' |
| |
| If this option is supplied, connect to the database to determine the |
| location of the data directories, and then find the CSV log files and |
| perform phase 1. |
| |
| =item B<-csv> |
| |
| In the first phase, consume the contents of the CSV log files and |
| extract the caql references to construct pre-aggregated output with |
| reference count statistics. |
| |
| =item B<-missing> |
| |
| |
| In the third phase, consume the aggregated output of the second |
| phase and list the "missing" functions. |
| |
| =item B<-aggregate> |
| |
| In the second phase, consume the pre-aggregated output of the first |
| phase and produce a final set of totals. |
| |
| |
| =back |
| |
| =head1 DESCRIPTION |
| |
| caqltrack processes CSV log files and extracts references to caql |
| basic functions. The "csv" phase is designed to iterate over sets of |
| CSV files (as supplied by xargs) and produce pre-aggregated output. |
| Since xargs may invoke caqltrack.pl multiple times, we cannot |
| guarantee that a single invocation of caqltrack will process all the |
| csv files. Thus, the first phase uses caqltrack to pre-aggregate the |
| caql basic function references, and the second phase invokes caqltrack |
| to consume the pre-aggregated output and produce the final totals. |
| |
| =head2 USAGE |
| |
| find [data-directory] -name '*.csv' | xargs perl caqltrack.pl -csv | perl caqltrack.pl -agg |
| |
| =head2 OUTPUT |
| |
| The output of the first phase is sorted by basic query function, |
| filename, lineno, unique query code, and the "first arg" (if it is an |
| oid) followed by a reference count, with vertical bar separators, eg: |
| |
| caql_basic_fn_94|aclchk.c|3141|33|101|6 |
| |
| The output of the first phase is "pre-aggregated", so it may have |
| multiple rows for a single (query function, filename, lineno) key. |
| |
The second phase consumes this output, and produces a final set of
fully-aggregated output.  Also, this phase sums all references to a
| basic function into a #TOTAL# row: |
| |
| caql_basic_fn_85|ruleutils.c|7137|33|101306 |
| caql_basic_fn_85|tablecmds.c|1464333|101|6 |
| caql_basic_fn_85|#TOTAL#|0|0|0312 |
| |
| Finally, the second phase output is terminated with grand total and |
| total unique query counts: |
| |
| #GRANDTOTAL#|#TOTAL#|0|0|0|1676343 |
| #NUMQUERY#|#TOTAL#|0|0|0|105 |
| |
| |
| |
| =head1 CAVEATS/Future Work |
| |
| |
| |
| =head1 AUTHORS |
| |
| Apache HAWQ |
| |
| Address bug reports and comments to: dev@hawq.apache.org |
| |
| =cut |
| # SLZY_POD_HDR_END |
| |
# SLZY_GLOB_BEGIN
my $glob_id = ""; # message prefix handed to pod2usage() below
#my $glob_tabstr = "\t";
#my $glob_tabstr = " " x $glob_tabwidth;
# NOTE(review): $glob_glob2 appears unused in this file -- likely
# leftover sleazy.pl scaffolding; confirm before removing.
my $glob_glob2 = {tabwidth => 4, spacedtab => 1, tabstr => " " x 4};
my $glob_glob; # global option hash, populated in the BEGIN block below
# SLZY_GLOB_END
| |
# Verify that at least one mode of operation (csv, connect, missing,
# or aggregate) was selected on the command line; otherwise emit an
# error and exit via pod2usage.
sub glob_validate
{
    my $have_mode = 0;

    for my $mode (qw(csv connect missing aggregate))
    {
        next unless (exists($glob_glob->{$mode}) && $glob_glob->{$mode});
        $have_mode = 1;
        last;
    }

    unless ($have_mode)
    {
        warn("ERROR: Must specify either CSV (process csv) or Aggregate options");
        pod2usage(-msg => $glob_id, -exitstatus => 1) ;
    }
}
| |
| # SLZY_CMDLINE_BEGIN |
| # WARNING: DO NOT MODIFY THE FOLLOWING SECTION: |
| # Generated by sleazy.pl version 6 (release Mon Aug 20 12:30:03 2012) |
| # Make any changes under SLZY_TOP_BEGIN/SLZY_LONG_BEGIN |
| # Any additional validation logic belongs in glob_validate() |
| |
BEGIN {
    # Command-line parsing (sleazy.pl-generated section; see warning above).
    my $s_help = 0; # brief help message
    my $s_man = 0; # full documentation
    my $s_connect; # psql connect parameters
    my $s_csv = 0; # process csv (log) files and extract/pre-aggregate caql usage statistics
    my $s_missing = 0; # process aggregated output and list missing entries
    my $s_aggregate = 0; # process pre-agg output from caqltrack to produce final statistics

    # preserve the (quoted) original command line before GetOptions
    # consumes @ARGV
    my $slzy_argv_str;
    $slzy_argv_str = quotemeta(join(" ", @ARGV))
        if (scalar(@ARGV));

    GetOptions(
        'help|?' => \$s_help,
        'man' => \$s_man,
        'connect:s' => \$s_connect,
        'csv|log' => \$s_csv,
        'missing' => \$s_missing,
        'aggregate' => \$s_aggregate,
        )
        or pod2usage(2);

    pod2usage(-msg => $glob_id, -exitstatus => 1) if $s_help;
    pod2usage(-msg => $glob_id, -exitstatus => 0, -verbose => 2) if $s_man;


    $glob_glob = {};


    # version and properties from json definition
    $glob_glob->{_sleazy_properties} = {};
    $glob_glob->{_sleazy_properties}->{version} = '5';
    $glob_glob->{_sleazy_properties}->{slzy_date} = '1352346695';
    $glob_glob->{_sleazy_properties}->{slzy_argv_str} = $slzy_argv_str;

    # copy the parsed options into the global option hash; the boolean
    # flags default to 0, so these keys always exist afterwards
    $glob_glob->{connect} = $s_connect if (defined($s_connect));
    $glob_glob->{csv} = $s_csv if (defined($s_csv));
    $glob_glob->{missing} = $s_missing if (defined($s_missing));
    $glob_glob->{aggregate} = $s_aggregate if (defined($s_aggregate));

    # extra validation lives in glob_validate(), not here
    glob_validate();


}
| # SLZY_CMDLINE_END |
| |
# Convert a psql-formatted ASCII table (header row, dashed separator,
# data rows with "|" column separators) into a reference to an array
# of hashes, one hash per row, keyed by column heading.
#
# Arguments:
#   $raw        - the table text, as a single string
#   $got_line1  - optional: the header line, if it was already removed
#                 from $raw (in that case no header or dashed
#                 separator is stripped from $raw)
#
# Returns an arrayref of row hashrefs, or undef when $raw has no
# usable lines.
sub tablelizer
{
    my ($raw, $got_line1) = @_;

    my @lines = split(/\n/, $raw);

    return undef
        unless (scalar(@lines));

    # use the supplied header if given, else take the first line
    my $header = $got_line1;
    $header = shift @lines
        unless (defined($got_line1));

    # column headings are separated by <space>|<space>
    my @headings = split(/\s+\|\s+/, $header);

    # trim leading/trailing whitespace from the first and last heading
    # (interior headings are already trimmed by the split pattern)
    for my $pos (0, -1)
    {
        $headings[$pos] =~ s/^\s+//;
        $headings[$pos] =~ s/\s+$//;
    }

    return undef
        unless (scalar(@lines));

    # drop the ----+---- separator unless the caller already did
    shift @lines
        unless (defined($got_line1));

    my @rows;

    for my $row_txt (@lines)
    {
        # limit the split so a cell may itself contain "|"
        my @cells = split(/\|/, $row_txt, scalar(@headings));

        # stop at the first malformed row (e.g. the "(N rows)" footer)
        last
            unless (scalar(@cells) == scalar(@headings));

        my %row;

        for my $pos (0 .. $#headings)
        {
            my $cell = $cells[$pos];

            $cell =~ s/^\s+//;
            $cell =~ s/\s+$//;

            $row{$headings[$pos]} = $cell;
        }
        push @rows, \%row;
    }

    return \@rows;
}
| |
# Phase 0: connect via psql, locate the pg_system filespace data
# directory for every segment, then invoke phase 1
# ("find ... | xargs perl caqltrack.pl -csv") over each unique
# host:directory pair.
sub do_conn
{
    # NOTE(review): File::Spec was used below (splitdir/catdir) but is
    # never loaded anywhere in this file; load it explicitly so the
    # method calls resolve regardless of what other modules happen to
    # pull in transitively.
    use File::Spec;

    # query the segment configuration joined against the filespace
    # tables to find each segment's pg_system filespace location
    my $bigstr = <<'EOF_bigstr';
select gscp.dbid, 
gscp.content, 
gscp.hostname as hostname, gscp.address as address, 
fep.fselocation as loc,  
pfs.oid fsoid, 
pfs.fsname, 
gscp.mode, 
gscp.status, 
gscp.preferred_role 
from 
gp_segment_configuration gscp, pg_filespace_entry fep, 
pg_filespace pfs 
where 
fsname = $q$pg_system$q$ 
and 
fep.fsedbid=gscp.dbid 
and pfs.oid = fep.fsefsoid 
order by 1,2 
EOF_bigstr

    $bigstr .= ' ; ';

    # build "psql <connect opts> -c '<query>'"
    my $psql_str = "psql ";

    $psql_str .= $glob_glob->{connect};

    $psql_str .= " -c \' $bigstr \'";

    my $tabdef = `$psql_str`;

    # parse the psql output into an array of row hashes
    my $seg_config_table = tablelizer($tabdef);

    # print Data::Dumper->Dump([$seg_config_table]);

    # dedupe by hostname:directory -- multiple segments may share a
    # parent data directory
    my %h1;

    for my $rowh (@{$seg_config_table})
    {
        my $dir = $rowh->{loc};
        my $hostname = $rowh->{hostname};

        # use the parent of the filespace location as the search root
        my @foo = File::Spec->splitdir($dir);

        pop @foo;

        $dir = File::Spec->catdir(@foo);

        $h1{$hostname . ":" . $dir} = {hostname => $hostname,
                                       dir => $dir};
    }

    # NOTE(review): the find/xargs pipeline runs on the local host even
    # though rows may name remote hostnames -- presumably a single-host
    # installation is assumed; confirm before using on a cluster.
    for my $hh (sort(keys(%h1)))
    {
        my $dir = $h1{$hh}->{dir};
        my $prog = $0;

        # re-invoke this script in -csv mode over every CSV log file
        system("find $dir -name '*.csv' | xargs perl $prog -csv ");
#        system("find $dir -name '*.csv' ");
#        print `find $dir -name '*.csv' `;

    }


} # end do_conn
| |
# Phase 1: scan CSV log lines from <> for caql basic-function
# references and print pre-aggregated reference counts.
sub do_csv
{
    # reference count keyed by "<func>|<file>|<lineno>|<uniqqno>|<firstarg>"
    my %count_for;

    while (my $line = <>)
    {
        # only lines mentioning a catquery caql basic function
        next unless ($line =~ m/catquery.*caql\_basic/);

        # make sure is actual message, not sql string for view:
        #   'catquery: caql_basic_fn_', ''), 'caller: ', ''), ' ') \
        #   as caql_mess_arr
        next unless ($line =~ m/caql\_basic\_fn\_\d+\s+caller/);

        # take everything after "catquery: " up to the first comma
        my @after_marker = split(/catquery: /, $line);
        my @fields = split(/\,/, $after_marker[1]);
        my $key = $fields[0];
        die "bad line: $line" unless (defined($key) && length($key));

        $key =~ s/\"//g;                # remove quotes
        # change to vertical bar separated list
        $key =~ s/\s*caller\:\s*/\|/;
        $key =~ s/\s+/\|/g;

        $count_for{$key}++;
    } # end while

    # output in the form of
    # <func name>|<filename>|<lineno>|<uniqqno>|<firstarg>|<reference count>,
    # eg:
    #
    # caql_basic_fn_118|dbcommands.c|2270|33|101|3

    for my $key (sort(keys(%count_for)))
    {
        print $key, "|", $count_for{$key}, "\n";
    }

} # end do_csv
| |
# Phase 2: sum the pre-aggregated phase-1 rows into final totals,
# emitting a per-function #TOTAL# row plus #GRANDTOTAL#/#NUMQUERY#
# footer rows.
sub do_agg
{
    # total reference count keyed by "<func>|<file>|<lineno>|<uniqqno>|0"
    my %bigh;

    # input in the form of
    # <func name>|<filename>|<lineno>|<uniqqno>|<firstarg>|<reference count>,
    # eg:
    #
    # caql_basic_fn_118|dbcommands.c|2270|33|101|3

    # NOTE: since caqltrack -csv is called via xargs, may have
    # multiple sets of pre-aggregated lines,
    # so split by "key" portion (func, file, lineno, uniqqno, firstarg) and
    # sum the reference counts

    while (<>)
    {
        my $ini = $_;
        my @ggg = split(/\|/, $ini);
        die "bad line: $ini" unless (1 < scalar(@ggg));

        # extract the reference count from the end of the line
        # (it still carries the trailing newline; numeric += ignores it)
        my $num = pop @ggg;
        # NOTE(review): unanchored match -- any string containing a
        # digit passes this check
        die "bad line count: $ini" unless ($num =~ m/\d+/);

        # convert the "firstarg" to zero to fix aggregation
        $ggg[-1] = 0;

        my $lin = join("|", @ggg);
        $bigh{$lin} = 0 unless (exists($bigh{$lin}));
        $bigh{$lin} += $num; # NOTE: not 1, sum the reference counts

    }

    my $currqry;      # function name of the group currently being summed
    my $numcurr = 0;  # running reference count for $currqry
    my $numqry = 0;   # number of distinct query functions seen
    my $grandtot = 0; # sum of all reference counts

    # sorted keys group rows by function-name prefix, so a #TOTAL# row
    # is flushed whenever the function name changes
    for my $kk (sort(keys(%bigh)))
    {
        my @ggg = split(/\|/, $kk);

        if (defined($currqry))
        {
            if ($currqry eq $ggg[0])
            {
                # same function: keep accumulating
                $numcurr += $bigh{$kk};
            }
            else
            {
                # function changed: flush the previous group's total
                print $currqry, "|#TOTAL#|0|0|0|", $numcurr, "\n";
                $currqry = $ggg[0];
                $numcurr = $bigh{$kk};
                $numqry += 1;
            }
        }
        else
        {
            # very first row: start the first group
            $currqry = $ggg[0];
            $numcurr = $bigh{$kk};
            $numqry += 1;
        }
        print $kk, "|", $bigh{$kk}, "\n";
        $grandtot += $bigh{$kk};

    } # end for $kk
    # flush the total for the last group (if there was any input)
    if (defined($currqry))
    {
        print $currqry, "|#TOTAL#|0|0|0|", $numcurr, "\n";
    }
    if ($grandtot)
    {
        print "#GRANDTOTAL#|#TOTAL#|0|0|0|", $grandtot, "\n";
        print "#NUMQUERY#|#TOTAL#|0|0|0|", $numqry, "\n";
    }

} # end do_agg
| |
# Phase 3: read the aggregated #TOTAL# rows from <> and report which
# basic-function numbers (up to the highest one seen) never appeared.
sub do_missing
{
    # slot N-1 holds the #TOTAL# line for caql_basic_fn_N (1-based
    # function numbers, 0-based array)
    my $seen = [];

    $seen->[0] = undef;

    # input in the form of
    # <func name>|#TOTAL#|0|0|0|<reference count>, eg:
    #
    # caql_basic_fn_118|#TOTAL#|0|0|0|3

    while (my $row = <>)
    {
        next unless ($row =~ m/^caql\_basic\_fn/);
        next unless ($row =~ m/\#TOTAL\#/);

        my @captured = ($row =~ m/^caql\_basic\_fn\_(\d+)/);
        die "bad line: $row" unless (1 == scalar(@captured));

        # extract the function number
        my $fn_no = pop @captured;
        $fn_no--; # convert from 1 to zero based for array
        die "bad fn: $row" unless ($fn_no >= 0);

        $seen->[$fn_no] = $row;
    }

    # any hole in the array is a function with zero references
    my $numMiss = 0;
    for my $slot (0..(scalar(@{$seen})-1))
    {
        next if (exists($seen->[$slot]) &&
                 defined($seen->[$slot]));

        $numMiss += 1;
        print "caql_basic_fn_" . ($slot+1) . "\n";
    }
    print "#MISSING: ", $numMiss, "\n";
    print "#MAXNUM: ", scalar(@{$seen}), "\n";

} # end do_missing
| |
if (1)
{
    # Dispatch to the selected phase; option checks are ordered, and
    # the first truthy option wins (connect, then csv, agg, missing).
    my @dispatch = (
        [ connect   => \&do_conn    ],
        [ csv       => \&do_csv     ],
        [ aggregate => \&do_agg     ],
        [ missing   => \&do_missing ],
        );

    my $handled = 0;

    for my $entry (@dispatch)
    {
        my ($opt, $fn) = @{$entry};

        next unless (exists($glob_glob->{$opt}) && $glob_glob->{$opt});

        $fn->();
        $handled = 1;
        last;
    }

    # glob_validate() should have caught this already
    die "invalid options!" unless ($handled);


}
| |
| # SLZY_TOP_BEGIN |
if (0)
{
    # Never executed at runtime: this heredoc is the JSON
    # option-definition template consumed by the sleazy.pl code
    # generator to (re)build the POD and GetOptions sections above.
    my $bigstr = <<'EOF_bigstr';
{
   "args" : [
      {
         "alias" : "?",
         "long" : "Print a brief help message and exits.",
         "name" : "help",
         "required" : "0",
         "short" : "brief help message",
         "type" : "untyped"
      },
      {
         "long" : "Prints the manual page and exits.",
         "name" : "man",
         "required" : "0",
         "short" : "full documentation",
         "type" : "untyped"
      },
      {
         "long" : "$connlong",
         "name" : "connect",
         "short" : "psql connect parameters",
         "type" : "string"
      },
      {
         "alias" : "log",
         "long" : "$csvlong",
         "name" : "csv",
         "short" : "process csv (log) files and extract/pre-aggregate caql usage statistics",
         "type" : "u"
      },
      {
         "long" : "$missinglong",
         "name" : "missing",
         "short" : "process aggregated output and list missing entries",
         "type" : "u"
      },
      {
         "long" : "$agglong",
         "name" : "aggregate",
         "short" : "process pre-agg output from caqltrack to produce final statistics",
         "type" : "u"
      }
   ],
   "long" : "$toplong",
   "properties" : {
      "slzy_date" : 1352346695
   },
   "short" : "track caql usage in csv files",
   "version" : "5"
}

EOF_bigstr
}
| # SLZY_TOP_END |
| |
| # SLZY_LONG_BEGIN |
if (0)
{
    # Never executed at runtime: these heredocs hold the long-form help
    # text ($toplong plus the per-option *long strings referenced by the
    # JSON template above) that the sleazy.pl generator expands into the
    # POD DESCRIPTION/OPTIONS sections.

    my $toplong = <<'EOF_longstr';
caqltrack processes CSV log files and extracts references to caql
basic functions.  The "csv" phase is designed to iterate over sets of
CSV files (as supplied by xargs) and produce pre-aggregated output.
Since xargs may invoke caqltrack.pl multiple times, we cannot
guarantee that a single invocation of caqltrack will process all the
csv files.  Thus, the first phase uses caqltrack to pre-aggregate the
caql basic function references, and the second phase invokes caqltrack
to consume the pre-aggregated output and produce the final totals.

{HEAD2} USAGE

find [data-directory] -name '*.csv' |  xargs perl caqltrack.pl -csv | perl caqltrack.pl -agg

{HEAD2} OUTPUT

The output of the first phase is sorted by basic query function,
filename, lineno, unique query code, and the "first arg" (if it is an
oid) followed by a reference count, with vertical bar separators, eg:

 caql_basic_fn_94|aclchk.c|3141|33|101|6

The output of the first phase is "pre-aggregated", so it may have
multiple rows for a single (query function, filename, lineno) key.

The second phase consumes this output, and produces a final of
fully-aggregated output.  Also, this phase sums all references to a
basic function into a #TOTAL# row:

 caql_basic_fn_85|ruleutils.c|7137|33|101306
 caql_basic_fn_85|tablecmds.c|1464333|101|6
 caql_basic_fn_85|#TOTAL#|0|0|0312

Finally, the second phase output is terminated with grand total and
total unique query counts:

 #GRANDTOTAL#|#TOTAL#|0|0|0|1676343
 #NUMQUERY#|#TOTAL#|0|0|0|105



{HEAD1} CAVEATS/Future Work


EOF_longstr

    my $missinglong = <<'EOF_missinglong';

In the third phase, consume the aggregated output of the second
phase and list the "missing" functions.
EOF_missinglong

    my $agglong = <<'EOF_agglong';
In the second phase, consume the pre-aggregated output of the first
phase and produce a final set of totals.
EOF_agglong

    my $csvlong = <<'EOF_csvlong';
In the first phase, consume the contents of the CSV log files and
extract the caql references to construct pre-aggregated output with
reference count statistics.
EOF_csvlong

}
| # SLZY_LONG_END |
| |