# blob: ac9b8ecc69be0168edcbb0800e54332e2e6fc508 [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
package TestDriverHadoop;
# Test driver for pig nightly tests.
use TestDriver;
use IPC::Run; # don't do qw(run), it screws up TestDriver which also has a run method
use Digest::MD5 qw(md5_hex);
use Util;
use File::Path;
use Cwd;
use English;
# Parent class this driver inherits from.
our $className = "TestDriver";
our @ISA       = ($className);

# Root of the harness installation; loading the module fails without it.
our $ROOT = $ENV{'HARNESS_ROOT'};
die "ERROR: You must set environment variable HARNESS_ROOT\n" unless defined $ROOT;

# Directory holding the helper scripts used during post-processing.
our $toolpath = "$ROOT/libexec/HCatTest";

# Status strings for test outcomes.
my ($passedStr, $failedStr, $abortedStr, $skippedStr, $dependStr) =
    ('passed', 'failed', 'aborted', 'skipped', 'failed_dependency');
# Constructor. Delegates object creation to the parent class and blesses the
# result into the requested class.
#
#   $proto - class name, or an existing instance whose class is reused.
#
# Returns the new driver object.
sub new
{
    my ($proto) = @_;
    my $class = ref($proto) || $proto;

    # Call our parent
    my $self = $class->SUPER::new;

    bless($self, $class);
    return $self;
}
# One-time setup for a harness run.
#
#   $self       - driver object; 'ignore' is set on it when requested.
#   $globalHash - hash ref of global configuration. Reads 'ignore',
#                 'outpathbase', 'localpathbase', 'scriptPath',
#                 'benchmarkPath' and 'tmpPath'; writes 'runid', 'outpath'
#                 and 'localpath'.
#   $log        - log filehandle, also used for the child processes' output.
#
# Dies if any of the local directories cannot be created.
sub globalSetup
{
    my ($self, $globalHash, $log) = @_;

    # Setup the output path
    my $me = `whoami`;
    chomp $me;
    $globalHash->{'runid'} = $me . "." . time;

    # if "-ignore false" was provided on the command line,
    # it means do run tests even when marked as 'ignore'
    if (defined($globalHash->{'ignore'}) && $globalHash->{'ignore'} eq 'false') {
        $self->{'ignore'} = 'false';
    }

    $globalHash->{'outpath'}   = $globalHash->{'outpathbase'} . "/" . $globalHash->{'runid'} . "/";
    $globalHash->{'localpath'} = $globalHash->{'localpathbase'} . "/" . $globalHash->{'runid'} . "/";

    # add libexec location to the path
    if (defined($ENV{'PATH'})) {
        $ENV{'PATH'} = $globalHash->{'scriptPath'} . ":" . $ENV{'PATH'};
    }
    else {
        $ENV{'PATH'} = $globalHash->{'scriptPath'};
    }

    Util::runHadoopCmd($globalHash, $log, "fs -mkdir $globalHash->{'outpath'}");

    # Create the local working, benchmark and temporary directories.
    # Each entry is [ word used in the error message, key in $globalHash ].
    foreach my $spec (['localpath', 'localpath'],
                      ['benchmark', 'benchmarkPath'],
                      ['temporary', 'tmpPath']) {
        my ($label, $key) = @{$spec};
        IPC::Run::run(['mkdir', '-p', $globalHash->{$key}], \undef, $log, $log) or
            die "Cannot create $label directory " . $globalHash->{$key} .
                " " . "$ERRNO\n";
    }

    Util::runHadoopCmd($globalHash, $log, "fs -mkdir tmp/$globalHash->{'runid'}");
}
# Cleanup counterpart of globalSetup. No cleanup statements are present for
# this driver, so the body is intentionally empty.
sub globalCleanup
{
}
# Runs a single test described by $testCmd.
#
#   $self    - driver object.
#   $testCmd - hash ref describing the test; the visible code reads
#              'hcat_prep', 'hadoop', 'result_table', 'localpath', 'outpath',
#              'thisResultsPath', 'group' and 'num'.
#   $log     - log filehandle passed on to the helpers.
#
# Returns the result of the hadoop run; dies when $testCmd has no command
# this driver knows how to handle.
#
# NOTE(review): the block delimiters of this sub are not visible and at least
# one expression is cut short, so the nesting described in the comments below
# is inferred from the statement order. Confirm against the upstream file
# before relying on it.
sub runTest
my ($self, $testCmd, $log) = @_;
my $subName = (caller(0))[3];
# Handle the various methods of running used in
# the original TestDrivers
if ( $testCmd->{'hcat_prep'} ) {
Util::prepareHCat($self, $testCmd, $log);
if ( $testCmd->{'hadoop'} ) {
my $result;
if (defined($testCmd->{'result_table'})) {
$result = $self->runHadoop( $testCmd, $log );
my @results = ();
my @outputs = ();
# 'result_table' may be a single value or an array ref; normalize to a list.
if (ref($testCmd->{'result_table'}) ne 'ARRAY') {
$results[0] = $testCmd->{'result_table'};
} else {
@results = @{$testCmd->{'result_table'}};
my $id = 0; # regular output count
for (my $i = 0; $i < @results; $i++) {
# An entry other than '?' names a table, which is dumped through a generated
# pig script; a '?' entry is read from the job's own output file instead.
if ($results[$i] ne '?') {
my %modifiedTestCmd = %{$testCmd};
# NOTE(review): @pigfiles, @outfiles and $tableName are not declared with
# 'my' anywhere in the visible code.
$pigfiles[$i] = $testCmd->{'localpath'} .
$testCmd->{'group'} . "_" . $testCmd->{'num'} .
# NOTE(review): the concatenation above has no final operand before the next
# assignment begins; a line appears to be missing here.
$outfiles[$i] = $testCmd->{'thisResultsPath'} . "/" .
$testCmd->{'group'} . "_" . $testCmd->{'num'} . ".$i.out";
$tableName = $results[$i];
$modifiedTestCmd{'num'} = $testCmd->{'num'} . "_" . $i . "_benchmark";
$modifiedTestCmd{'pig'} = "a = load '$tableName' using org.apache.hcatalog.pig.HCatLoader(); store a into ':OUTPATH:';";
# Third argument asks runPig to copy the results to the local file system.
my $r = $self->runPig(\%modifiedTestCmd, $log, 1);
$outputs[$i] = $r->{'output'};
} else {
# NOTE(review): $localdir is not declared with 'my' in the visible code, and
# $id is never incremented in the visible code.
$localdir = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".out/$id";
my $outfile = $testCmd->{'outpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".out";
# Copy result file out of hadoop
# NOTE(review): @baseCmd is not used afterwards in the visible code.
my @baseCmd = Util::getPigCmd($testCmd, $log);
my $testOut = $self->postProcessSingleOutputFile($outfile, $localdir, $testCmd, $log);
$outputs[$i] = $testOut;
# NOTE(review): the body of the following branch is not visible.
if ($self->countStores($testCmd)==1) {
else {
$result = $self->runHadoop( $testCmd, $log );
return $result;
} else {
die "$subName FATAL Did not find a testCmd that I know how to handle";
# Dumps an HCatalog table to a file by generating and running a small pig
# script, then copies the output locally and post-processes it.
#
#   $self    - driver object.
#   $testCmd - hash ref; reads 'localpath', 'outpath', 'group' and 'num'.
#   $table   - name of the table to load with HCatLoader.
#   $log     - log filehandle.
#   $id      - suffix appended to the generated file names.
#
# Returns the path produced by postProcessSingleOutputFile.
# Dies if the script cannot be written or the pig command fails.
sub dumpPigTable
{
    my ($self, $testCmd, $table, $log, $id) = @_;
    my $subName = (caller(0))[3];

    # Write the pig script to a file.
    my $pigfile = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . $id . ".dump.pig";
    my $outfile = $testCmd->{'outpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . $id . "dump.out";

    open(my $scriptFh, '>', $pigfile) or die "Unable to open file $pigfile to write pig script, $ERRNO\n";
    print $scriptFh "a = load '$table' using org.apache.hcatalog.pig.HCatLoader(); store a into '$outfile';\n";
    # The script must be flushed to disk before pig reads it.
    close($scriptFh) or die "Unable to close file $pigfile, $ERRNO\n";

    # Build the command
    my @cmd = Util::getPigCmd($testCmd, $log);
    push(@cmd, $pigfile);

    # Run the command
    print $log "$0::$className::$subName INFO: Going to run pig command: @cmd\n";
    IPC::Run::run(\@cmd, \undef, $log, $log) or die "Failed running $pigfile\n";

    # Get results from the command locally
    my $localdir = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . $id . ".dump.out";
    $outfile = $self->postProcessSingleOutputFile($outfile, $localdir, $testCmd, $log);
    return $outfile;
}
# Copies one job output directory out of hadoop and prepares it for
# comparison against a benchmark.
#
#   $self     - driver object.
#   $outfile  - path of the output in hadoop.
#   $localdir - local directory the output is copied to.
#   $testCmd  - hash ref; reads 'floatpostprocess' and 'delimiter', and is
#               handed to Util::runHadoopCmd as the configuration.
#   $log      - log filehandle.
#
# Writes "$localdir/out_original" (the combined output) and
# "$localdir/out_sorted" (the same, sorted). Returns the path of the sorted
# file.
sub postProcessSingleOutputFile
{
    my ($self, $outfile, $localdir, $testCmd, $log) = @_;

    # $testCmd is the only configuration hash in scope here.
    Util::runHadoopCmd($testCmd, $log, "fs -copyToLocal $outfile $localdir");

    # Sort the result if necessary. Keep the original output in one large file.
    # Use system not IPC run so that the '*' gets interpolated by the shell.
    # Build command to:
    # 1. Combine part files
    my $fppCmd = "cat $localdir/map* $localdir/part* 2>/dev/null";

    # 2. Standardize float precision
    if (defined $testCmd->{'floatpostprocess'} &&
            defined $testCmd->{'delimiter'}) {
        # NOTE(review): the script name was missing after "$toolpath/";
        # floatpostprocessor.pl is assumed -- confirm it exists under $toolpath.
        $fppCmd .= " | $toolpath/floatpostprocessor.pl '" .
            $testCmd->{'delimiter'} . "'";
    }

    $fppCmd .= " > $localdir/out_original";

    # run command
    print $log "$fppCmd\n";
    # The exit status is deliberately not checked: cat reports failure
    # whenever one of the two glob patterns matches nothing.
    system($fppCmd);

    # Sort the results for the benchmark compare.
    my @sortCmd = ('sort', "$localdir/out_original");
    print $log join(" ", @sortCmd) . "\n";
    IPC::Run::run(\@sortCmd, '>', "$localdir/out_sorted");

    return "$localdir/out_sorted";
}
# Runs the hadoop command of a test and, when the test has exactly one
# output, copies that output locally.
#
#   $self    - driver object.
#   $testCmd - hash ref; reads 'localpath', 'outpath', 'group', 'num',
#              'hadoop', 'hadoop_cmdline_args', 'hcatalog.jar', 'hiveconf',
#              'hbaseconf' and 'run_as'; writes 'libjars'.
#   $log     - log filehandle.
#
# Side effects: writes "<localpath><group>_<num>.hadoop" and a matching ".sh"
# wrapper script, and sets $ENV{'HADOOP_CLASSPATH'}.
#
# Returns a hash ref; 'outputs' (array ref holding one sorted output file) is
# present only when countStores reports exactly one output.
# Dies if a file cannot be written or the script fails.
sub runHadoop
{
    my ($self, $testCmd, $log) = @_;
    my $subName = (caller(0))[3];
    my %result;

    # Write the hadoop command to a file.
    my $hadoopfile = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".hadoop";
    my $outfile = $testCmd->{'outpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".out";

    # Get all of the additional jars we'll need.
    my $additionalJars = Util::getHBaseLibs($testCmd, $log); #hbase before hive for precedence over bundled hbase
    $additionalJars .= Util::getHCatLibs($testCmd, $log);
    $additionalJars .= Util::getHiveLibs($testCmd, $log);
    $testCmd->{'libjars'} = $additionalJars;
    $testCmd->{'libjars'} =~ s/:/,/g;

    my $hadoopcmd = Util::replaceParameters( $testCmd->{'hadoop'}, $outfile, $testCmd, $log );

    # adjust for the leading and trailing new line often seen in the conf file's command directives
    $hadoopcmd =~ s/^\s*(.*?)\s*$/$1/s;

    open(my $cmdFh, '>', $hadoopfile) or die "Unable to open file $hadoopfile to write hadoop command file, $ERRNO\n";
    print $cmdFh $hadoopcmd . "\n";
    close($cmdFh) or die "Unable to close file $hadoopfile, $ERRNO\n";

    # Build the command
    my @cmd = Util::getHadoopCmd($testCmd);

    # Add command line arguments if they're provided
    if (defined($testCmd->{'hadoop_cmdline_args'})) {
        push(@cmd, @{$testCmd->{'hadoop_cmdline_args'}});
    }

    # Add the test command elements
    push(@cmd, split(/ +/,$hadoopcmd));

    # Set HADOOP_CLASSPATH environment variable
    my $cp = $testCmd->{'hcatalog.jar'};
    $cp =~ s/,/:/g;
    # Add in the hcat config file
    $cp .= ":" . $testCmd->{'hiveconf'};
    $cp .= ":" . $additionalJars;
    # Without this assignment the classpath built above is never used.
    $ENV{'HADOOP_CLASSPATH'} = $cp;

    if (defined($testCmd->{'hbaseconf'})) {
        $ENV{'HADOOP_CLASSPATH'} = "$ENV{'HADOOP_CLASSPATH'}:$testCmd->{'hbaseconf'}";
    }

    # Add su user if provided
    if (defined($testCmd->{'run_as'})) {
        my $cmd = '"' . join (" ", @cmd) . '"';
        @cmd = ("echo", $cmd, "|", "su", $testCmd->{'run_as'});
    }

    # Wrap the command line in a shell script so that it runs through a shell.
    my $script = $hadoopfile . ".sh";
    open(my $scriptFh, '>', $script) or die "Unable to open file $script to write script, $ERRNO\n";
    print $scriptFh join (" ", @cmd) . "\n";
    # The script must be complete on disk before it is executed.
    close($scriptFh) or die "Unable to close file $script, $ERRNO\n";
    # List form: the script path is not re-parsed by a shell.
    system('chmod', '+x', $script);

    # Run the command
    print $log "$0::$className::$subName INFO: Going to run hadoop command in shell script: $script\n";
    print $log "$0::$className::$subName INFO: Going to run hadoop command: " . join(" ", @cmd) . "\n";
    print $log "With HADOOP_CLASSPATH set to " . $ENV{'HADOOP_CLASSPATH'} . " and HADOOP_OPTS set to " . $ENV{'HADOOP_OPTS'} . "\n";

    my @runhadoop = ("$script");
    IPC::Run::run(\@runhadoop, \undef, $log, $log) or
        die "Failed running $script\n";

    my $localdir = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".dump.out";

    # NOTE(review): the return value is unused; the call is kept only in case
    # the helper has side effects -- confirm and remove if it has none.
    Util::getPigCmd($testCmd, $log);

    if ($self->countStores($testCmd)==1) {
        my @outputs = ();
        $outputs[0] = $self->postProcessSingleOutputFile($outfile, $localdir, $testCmd, $log);
        $result{'outputs'} = \@outputs;
    }

    return \%result;
} # end sub runHadoop
# Compares the result of a test run against its benchmark.
#
#   $self            - driver object.
#   $testResult      - hash ref with either 'outputs' (array ref of files) or
#                      'output' (single file).
#   $benchmarkResult - hash ref with the matching 'outputs' / 'output'.
#   $log             - log filehandle.
#   $testCmd         - test description (not used here).
#
# Returns 1 when every output matches its benchmark, 0 otherwise. For the
# single-output form the value of compareSingleOutput is returned as is.
sub compare
{
    my ($self, $testResult, $benchmarkResult, $log, $testCmd) = @_;

    # Single output: one direct comparison.
    if (!defined($testResult->{'outputs'})) {
        return $self->compareSingleOutput($testResult, $testResult->{'output'},
            $benchmarkResult->{'output'}, $log);
    }

    # Dereference the list so that $count is the real number of outputs;
    # assigning the reference itself would always give a count of one.
    my @outputs = @{$testResult->{'outputs'}};
    my $count   = @outputs;

    my $res = 0;
    for (my $id = 0; $id < $count; $id++) {
        my $testOutput      = $outputs[$id];
        my $benchmarkOutput = ($benchmarkResult->{'outputs'})->[$id];
        # "|| 0" keeps the sum numeric when the helper returns undef.
        $res += $self->compareSingleOutput($testResult, $testOutput,
            $benchmarkOutput, $log) || 0;
    }

    return ($res == $count) ? 1 : 0;
}
# Generates the benchmark output for every SQL query of a test.
#
#   $self    - driver object.
#   $testCmd - hash ref; 'sql' is either a single query string or an array
#              ref of query strings.
#   $log     - log filehandle.
#
# Returns a hash ref whose 'outputs' entry is an array ref with one benchmark
# file per query, in query order. Query ids passed to the helper start at 1.
sub generateBenchmark
{
    my ($self, $testCmd, $log) = @_;
    my %result;

    # Normalize 'sql' to a list. It is dereferenced only once it is known to
    # be an array ref.
    my @SQLQuery = ();
    if (ref($testCmd->{'sql'}) ne 'ARRAY') {
        $SQLQuery[0] = $testCmd->{'sql'};
    } else {
        @SQLQuery = @{$testCmd->{'sql'}};
    }

    my @outfiles = ();
    for my $id (0 .. $#SQLQuery) {
        my $outfile = $self->generateSingleSQLBenchmark($testCmd, $SQLQuery[$id], ($id+1), $log);
        push(@outfiles, $outfile);
    }

    $result{'outputs'} = \@outfiles;
    return \%result;
}
# Runs one SQL query through the database and stores its post-processed
# result as a benchmark file.
#
#   $self    - driver object.
#   $testCmd - hash ref; reads 'localpath', 'benchmarkPath', 'group', 'num'.
#   $sql     - text of the query.
#   $id      - query number, used in the generated file names.
#   $log     - log filehandle.
#
# Returns the path returned by postProcessSingleSQLOutputFile.
# Dies if the SQL script or the output file cannot be written.
sub generateSingleSQLBenchmark
{
    my ($self, $testCmd, $sql, $id, $log) = @_;

    my $sqlfile = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".benchmark.$id.sql";
    my $outfile = $testCmd->{'benchmarkPath'} . "/" . $testCmd->{'group'} . "_" . $testCmd->{'num'};
    $outfile .= defined($id) ? ".$id" . ".out" : ".out";

    open(my $outfp, '>', $outfile) or
        die "Unable to open output file $outfile, $!\n";

    open(my $sqlFh, '>', $sqlfile) or
        die "Unable to open file $sqlfile to write SQL script, $ERRNO\n";
    print $sqlFh $sql;
    # The script must be on disk before the database client reads it.
    close($sqlFh) or die "Unable to close file $sqlfile, $ERRNO\n";

    Util::runDbCmd($testCmd, $log, $sqlfile, $outfp);

    # Flush the query output before it is post-processed.
    close($outfp) or die "Unable to close output file $outfile, $!\n";

    $outfile =
        $self->postProcessSingleSQLOutputFile($outfile, $testCmd, $log);
    return $outfile;
}
# Helper: rewrites $file in place by streaming it through an external filter.
#
#   $file - file to rewrite.
#   $cmd  - array ref with the filter command and its arguments.
#   $log  - log filehandle, also receives the filter's stderr.
#   $what - name of the filter, used in the error message.
#
# The original content is kept under "$file.tmp" while the filter runs and
# removed afterwards, so that several filters can be applied in sequence.
sub _filterFileInPlace
{
    my ($file, $cmd, $log, $what) = @_;

    # Move the file to a temp file and run through the pre-processor.
    my $tmpfile = "$file.tmp";
    link($file, $tmpfile) or
        die "Unable to create temporary file $tmpfile, $!\n";
    unlink($file) or
        die "Unable to unlink file $file, $!\n";
    open(my $in, '<', $tmpfile) or
        die "Unable to open file $tmpfile, $!\n";
    open(my $out, '>', $file) or
        die "Unable to open file $file, $!\n";

    print $log "Going to run [" . join(" ", @{$cmd}) . "]\n";
    IPC::Run::run($cmd, $in, $out, $log) or
        die "Failed to run $what, $!\n";

    close($in);
    close($out) or
        die "Unable to close file $file, $!\n";
    unlink($tmpfile) or
        die "Unable to unlink file $tmpfile, $!\n";
    return;
}

# Post-processes the raw output of a SQL benchmark query.
#
#   $self        - driver object.
#   $outfile     - file holding the query output; may be rewritten in place.
#   $testCmd     - hash ref; reads 'floatpostprocess', 'delimiter' and
#                  'nullpostprocess'.
#   $log         - log filehandle.
#   $isBenchmark - when true and 'nullpostprocess' is defined, the text NULL
#                  is stripped from the output.
#
# Returns the path of the sorted copy, "$outfile.sorted".
sub postProcessSingleSQLOutputFile
{
    my ($self, $outfile, $testCmd, $log, $isBenchmark) = @_;

    # If requested, process the data to smooth over floating point
    # differences.
    if (defined $testCmd->{'floatpostprocess'} &&
            defined $testCmd->{'delimiter'}) {
        # NOTE(review): the script name and its argument were missing after
        # "$toolpath/"; floatpostprocessor.pl with the delimiter is assumed --
        # confirm it exists under $toolpath.
        my @cmd = ("$toolpath/floatpostprocessor.pl",
            $testCmd->{'delimiter'});
        _filterFileInPlace($outfile, \@cmd, $log, "float postprocessor");
    }

    if ($isBenchmark && defined $testCmd->{'nullpostprocess'}) {
        my @cmd = ("sed", "s/NULL//g");
        _filterFileInPlace($outfile, \@cmd, $log, "null postprocessor");
    }

    # Sort the results for the benchmark compare.
    my $sortfile = "$outfile.sorted";
    my @cmd = ("sort", $outfile);
    print $log "Going to run [" . join(" ", @cmd) . "]\n";
    IPC::Run::run(\@cmd, '>', "$sortfile");

    return $sortfile;
}
# Runs the pig script of a test and optionally copies its results locally.
#
#   $self        - driver object.
#   $testCmd     - hash ref; reads 'localpath', 'outpath', 'group', 'num',
#                  'pig', 'pig_params', 'paramPath' and 'sortArgs'.
#   $log         - log filehandle.
#   $copyResults - when true, outputs are copied out of hadoop and
#                  post-processed; otherwise the string "NO_COPY" stands in
#                  for each output.
#
# Returns a hash ref with 'rc', and either 'output' (plus 'originalOutput'
# when copied) for a single store, or 'outputs' (array ref) for several
# stores. 'sortArgs' is copied over when it is set to a true value.
# Dies if the script cannot be written or pig fails.
sub runPig
{
    my ($self, $testCmd, $log, $copyResults) = @_;
    my $subName = (caller(0))[3];
    my %result;

    # Write the pig script to a file.
    my $pigfile = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".pig";
    my $outfile = $testCmd->{'outpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".out";

    my $pigcmd = Util::replaceParameters( $testCmd->{'pig'}, $outfile, $testCmd, $log );

    open(my $scriptFh, '>', $pigfile) or die "Unable to open file $pigfile to write pig script, $ERRNO\n";
    print $scriptFh $pigcmd . "\n";
    # The script must be flushed to disk before pig reads it.
    close($scriptFh) or die "Unable to close file $pigfile, $ERRNO\n";

    # Build the command
    my @cmd = Util::getPigCmd($testCmd, $log);

    # Add option -l giving location for secondary logs
    my $locallog = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".log";
    push(@cmd, "-logfile");
    push(@cmd, $locallog);

    # Add pig parameters if they're provided
    if (defined($testCmd->{'pig_params'})) {
        # Processing :PARAMPATH: in parameters. The loop variable aliases
        # the stored values, so the substitution updates $testCmd as well.
        foreach my $param (@{$testCmd->{'pig_params'}}) {
            $param =~ s/:PARAMPATH:/$testCmd->{'paramPath'}/g;
        }
        push(@cmd, @{$testCmd->{'pig_params'}});
    }

    push(@cmd, $pigfile);

    # Run the command
    print $log "Setting PIG_CLASSPATH to $ENV{'PIG_CLASSPATH'}\n";
    print $log "$0::$className::$subName INFO: Going to run pig command: @cmd\n";

    IPC::Run::run(\@cmd, \undef, $log, $log) or
        die "Failed running $pigfile\n";
    $result{'rc'} = $? >> 8;

    # Get results from the command locally
    my $localdir = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".out";
    my $stores = $self->countStores($testCmd);

    # single query
    if ($stores == 1) {
        if ($copyResults) {
            $result{'output'} = $self->postProcessSingleOutputFile($outfile, $localdir, $testCmd, $log);
            $result{'originalOutput'} = "$localdir/out_original"; # populated by postProcessSingleOutputFile
        } else {
            $result{'output'} = "NO_COPY";
        }
    }
    # multi query
    else {
        my @outfiles = ();
        for (my $id = 1; $id <= ($stores); $id++) {
            $localdir = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".out/$id";
            my $localoutfile = $outfile . ".$id";

            # Copy result file out of hadoop
            my $testOut;
            if ($copyResults) {
                $testOut = $self->postProcessSingleOutputFile($localoutfile, $localdir, $testCmd, $log);
            } else {
                $testOut = "NO_COPY";
            }
            push(@outfiles, $testOut);
        }
        ##!!! originalOutputs not set! Needed?
        $result{'outputs'} = \@outfiles;
    }

    # Compare doesn't get the testCmd hash, so I need to stuff the necessary
    # info about sorting into the result.
    if (defined $testCmd->{'sortArgs'} && $testCmd->{'sortArgs'}) {
        $result{'sortArgs'} = $testCmd->{'sortArgs'};
    }

    return \%result;
}
# Compares one test output file against its benchmark file by checksum and,
# when sort arguments were recorded, verifies the order of the original
# output.
#
#   $self            - driver object.
#   $testResult      - hash ref; reads 'sortArgs' (array ref) and
#                      'originalOutput'.
#   $testOutput      - path of the test output file.
#   $benchmarkOutput - path of the benchmark file.
#   $log             - log filehandle.
#
# Returns 1 when the checksums match and the sort check (if any) succeeds,
# 0 otherwise. Dies if cksum cannot be run.
sub compareSingleOutput
{
    my ($self, $testResult, $testOutput, $benchmarkOutput, $log) = @_;

    print $log "testResult: $testResult testOutput: $testOutput benchmarkOutput: $benchmarkOutput\n";

    # cksum the two files to see if they are the same
    my ($testChksm, $benchmarkChksm);
    IPC::Run::run((['cat', $testOutput], '|', ['cksum']), \$testChksm,
        $log) or die "$0: error: cannot run cksum on test results\n";
    IPC::Run::run((['cat', $benchmarkOutput], '|', ['cksum']),
        \$benchmarkChksm, $log) or die "$0: error: cannot run cksum on benchmark\n";

    chomp $testChksm;
    chomp $benchmarkChksm;
    print $log "test cksum: $testChksm\nbenchmark cksum: $benchmarkChksm\n";

    # Start from an explicit failure so that a mismatch yields 0, not undef.
    my $result = 0;
    if ($testChksm ne $benchmarkChksm) {
        print $log "Test output checksum does not match benchmark checksum\n";
        print $log "Test checksum = <$testChksm>\n";
        print $log "Expected checksum = <$benchmarkChksm>\n";
        print $log "RESULTS DIFFER: vimdiff " . cwd . "/$testOutput " . cwd . "/$benchmarkOutput\n";
    } else {
        $result = 1;
    }

    # Now, check if the sort order is specified
    if (defined($testResult->{'sortArgs'})) {
        my @sortChk = ('sort', '-cs');
        push(@sortChk, @{$testResult->{'sortArgs'}});
        push(@sortChk, $testResult->{'originalOutput'});
        print $log "Going to run sort check command: " . join(" ", @sortChk) . "\n";
        IPC::Run::run(\@sortChk, \undef, $log, $log);
        my $sortrc = $?;
        if ($sortrc) {
            print $log "Sort check failed\n";
            $result = 0;
        }
    }

    return $result;
}
# Count the number of stores in a Pig Latin script, so we know how many files
# we need to compare.
#
#   $self    - driver object (not used).
#   $testCmd - hash ref; 'pig' holds a Pig Latin script, otherwise 'hadoop'
#              holds a hadoop command.
#
# For a pig script, returns the number of lines containing a
# "store <alias> into" statement; otherwise returns the number of lines of
# the hadoop command that mention OUTPATH. Matching is case-insensitive and
# each line counts at most once. Always returns a number (0 when nothing
# matches or the text is empty or missing).
sub countStores
{
    my ($self, $testCmd) = @_;

    if (defined $testCmd->{'pig'}) {
        # hope they don't have more than store per line
        # also note that this won't work if you comment out a store
        my @lines = split(/\n/, $testCmd->{'pig'});
        my $count = grep { /store\s+[a-zA-Z][a-zA-Z0-9_]*\s+into/i } @lines;
        return $count;
    }

    # Otherwise $testCmd->{'hadoop'} is expected to be defined.
    my $hadoop = defined $testCmd->{'hadoop'} ? $testCmd->{'hadoop'} : '';
    my @lines = split(/\n/, $hadoop);
    my $count = grep { /OUTPATH/i } @lines;
    return $count;
}