############################################################################
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
package HCatExistingClusterDeployer;
use IPC::Run qw(run);
use TestDeployer;
use Util;
use strict;
use English;
our @ISA = "TestDeployer";
###########################################################################
# Class: HCatExistingClusterDeployer
# Deploy the test harness to a cluster and database that already exist.
##############################################################################
# Sub: new
# Constructor
#
# Parameters:
# None
#
# Returns:
# None.
sub new
{
    my $proto = shift;
    my $class = ref($proto) || $proto;
    my $self = {};
    bless($self, $class);
    return $self;
}
##############################################################################
# Sub: checkPrerequisites
# Check any prerequisites before a deployment is begun. For example, if a
# particular deployment required the use of a database system, it could
# check here that the database was installed and accessible.
#
# Parameters:
# globalHash - hash from config file, including deployment config
# log - log file handle
#
# Returns:
# None
#
sub checkPrerequisites
{
    my ($self, $cfg, $log) = @_;
    if (! defined $ENV{'HADOOP_HOME'} || $ENV{'HADOOP_HOME'} eq "") {
        print $log "You must set the environment variable HADOOP_HOME\n";
        die "HADOOP_HOME not defined";
    }
    if (! defined $ENV{'HCAT_HOME'} || $ENV{'HCAT_HOME'} eq "") {
        print $log "You must set the environment variable HCAT_HOME\n";
        die "HCAT_HOME not defined";
    }
    if (! defined $ENV{'HIVE_HOME'} || $ENV{'HIVE_HOME'} eq "") {
        print $log "You must set the environment variable HIVE_HOME\n";
        die "HIVE_HOME not defined";
    }
    # Run a quick and easy Hadoop command to make sure we can talk to the cluster
    Util::runHadoopCmd($cfg, $log, "fs -ls /");
}
##############################################################################
# Sub: deploy
# Deploy any required packages
# This is a no-op in this case because we're assuming both the cluster and the
# database already exist
#
# Parameters:
# globalHash - hash from config file, including deployment config
# log - log file handle
#
# Returns:
# None
#
sub deploy
{
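    # No-op: the cluster and database already exist, so there is nothing to deploy.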
}
##############################################################################
# Sub: start
# Start any software modules that are needed.
# This is a no-op in this case because we're assuming both the cluster and the
# database already exist
#
# Parameters:
# globalHash - hash from config file, including deployment config
# log - log file handle
#
# Returns:
# None
#
sub start
{
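    # No-op: the cluster and database already exist, so there is nothing to start.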
}
##############################################################################
# Sub: generateData
# Generate any data needed for this test run.
#
# Parameters:
# globalHash - hash from config file, including deployment config
# log - log file handle
#
# Returns:
# None
#
sub generateData
{
    my ($self, $cfg, $log) = @_;
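    # Each entry below describes one test table: 'name' is the table and local
    # file name, 'filetype' selects the generator's data type, 'rows' is the
    # row count to generate (per partition when 'partitions' is given, hence
    # the 30k table below with three 10,000-row partitions), 'hdfs' is the
    # target directory under hcat_data_dir, 'partitions' lists partition
    # values, and 'format' optionally selects a storage format (e.g. "rc"
    # for RCFile).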
    my @tables = (
        {
            'name' => "studenttab10k",
            'filetype' => "studenttab",
            'rows' => 10000,
            'hdfs' => "studenttab10k",
        }, {
            'name' => "votertab10k",
            'filetype' => "votertab",
            'rows' => 10000,
            'hdfs' => "votertab10k",
        }, {
            'name' => "studentparttab30k",
            'filetype' => "studentparttab",
            'rows' => 10000,
            'hdfs' => "studentparttab30k",
            'partitions' => ['20110924', '20110925', '20110926'],
        }, {
            'name' => "studentnull10k",
            'filetype' => "studentnull",
            'rows' => 10000,
            'hdfs' => "studentnull10k",
        }, {
            'name' => "all100k",
            'filetype' => "allscalars",
            'rows' => 100000,
            'hdfs' => "all100k",
        }, {
            'name' => "all100kjson",
            'filetype' => "json",
            'rows' => 100000,
            'hdfs' => "all100kjson",
        }, {
            'name' => "all100krc",
            'filetype' => "studenttab",
            'rows' => 100000,
            'hdfs' => "all100krc",
            'format' => "rc",
        }, {
            'name' => "studentcomplextab10k",
            'filetype' => "studentcomplextab",
            'rows' => 10000,
            'hdfs' => "studentcomplextab10k",
        }
    );
#    if (defined($cfg->{'load_hive_only'}) && $cfg->{'load_hive_only'} == 1) {
#        return $self->hiveMetaOnly($cfg, $log, \@tables);
#    }
    # Create the HDFS directories
    my $mkdirCmd = "fs -mkdir";
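    # On Hadoop 0.23 "fs -mkdir" no longer creates parent directories by
    # default, so pass -p there.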
    if ($ENV{'HCAT_HADOOPVERSION'} eq "23") {
        $mkdirCmd = "fs -mkdir -p";
    }
    Util::runHadoopCmd($cfg, $log, "$mkdirCmd $cfg->{'hcat_data_dir'}");
    foreach my $table (@tables) {
        print "Generating data for $table->{'name'}\n";
        # Generate the data
        my @cmd;
        if (defined($table->{'format'})) {
            @cmd = ($cfg->{'gentool'}, $table->{'filetype'}, $table->{'rows'},
                $table->{'name'}, $cfg->{'hcat_data_dir'}, $table->{'format'});
        } else {
            @cmd = ($cfg->{'gentool'}, $table->{'filetype'}, $table->{'rows'},
                $table->{'name'}, $cfg->{'hcat_data_dir'});
        }
        $self->runCmd($log, \@cmd);
        # Copy the data to HDFS
        my $hadoop = "$mkdirCmd $cfg->{'hcat_data_dir'}/$table->{'hdfs'}";
        Util::runHadoopCmd($cfg, $log, $hadoop);
        if (defined($table->{'partitions'})) {
            foreach my $part (@{$table->{'partitions'}}) {
                $hadoop = "$mkdirCmd " .
                    "$cfg->{'hcat_data_dir'}/$table->{'hdfs'}/$table->{'name'}.$part";
                Util::runHadoopCmd($cfg, $log, $hadoop);
                $hadoop = "fs -copyFromLocal $table->{'name'}.$part " .
                    "$cfg->{'hcat_data_dir'}/$table->{'hdfs'}/$table->{'name'}.$part/$table->{'name'}.$part";
                Util::runHadoopCmd($cfg, $log, $hadoop);
            }
        } else {
            $hadoop = "fs -copyFromLocal $table->{'name'} " .
                "$cfg->{'hcat_data_dir'}/$table->{'hdfs'}/$table->{'name'}";
            Util::runHadoopCmd($cfg, $log, $hadoop);
        }
        print "Loading data into Hive for $table->{'name'}\n";
        Util::runHCatCmdFromFile($cfg, $log,
            "./" . $table->{'name'} . ".hcat.sql");
        print "Loading data into MySQL for $table->{'name'}\n";
        Util::runDbCmd($cfg, $log, $table->{'name'} . ".mysql.sql");
    }
}
###########################################################################
# Sub: hiveMetaOnly
# Load metadata into Hive, but don't load MySQL or HDFS, as we assume
# these have already been loaded.
#
# Parameters:
# cfg - hash from config file, including deployment config
# log - log file handle
#
# Returns:
# None
#
sub hiveMetaOnly
{
    my ($self, $cfg, $log, $tables) = @_;
    foreach my $table (@{$tables}) {
        print "Generating data for $table->{'name'}\n";
        # Generate the data
        my @cmd = ($cfg->{'gentool'}, $table->{'filetype'}, $table->{'rows'},
            $table->{'name'}, $cfg->{'hcat_data_dir'});
        $self->runCmd($log, \@cmd);
        print "Loading data into Hive for $table->{'name'}\n";
        Util::runHCatCmdFromFile($cfg, $log, "./" . $table->{'name'} .
            ".hive.sql");
    }
}
##############################################################################
# Sub: confirmDeployment
# Run checks to confirm that the deployment was successful. When this is
# done the testing environment should be ready to run.
#
# Parameters:
# globalHash - hash from config file, including deployment config
# log - log file handle
#
# Returns:
# Nothing
# This method should die with an appropriate error message if there is
# an issue.
#
sub confirmDeployment
{
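    # No-op: checkPrerequisites has already verified that the existing
    # cluster is reachable.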
}
##############################################################################
# Sub: deleteData
# Remove any data created that will not be removed by undeploying.
#
# Parameters:
# globalHash - hash from config file, including deployment config
# log - log file handle
#
# Returns:
# None
#
sub deleteData
{
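    # No-op: data generated on the existing cluster is left in place.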
}
##############################################################################
# Sub: stop
# Stop any servers or systems that are no longer needed once testing is
# completed.
#
# Parameters:
# globalHash - hash from config file, including deployment config
# log - log file handle
#
# Returns:
# None
#
sub stop
{
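    # No-op: nothing was started by this deployer, so there is nothing to stop.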
}
##############################################################################
# Sub: undeploy
# Remove any packages that were installed as part of the deployment.
#
# Parameters:
# globalHash - hash from config file, including deployment config
# log - log file handle
#
# Returns:
# None
#
sub undeploy
{
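    # No-op: nothing was deployed by this deployer, so there is nothing to remove.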
}
##############################################################################
# Sub: confirmUndeployment
# Run checks to confirm that the undeployment was successful. When this is
# done anything that must be turned off or removed should be turned off or
# removed.
#
# Parameters:
# globalHash - hash from config file, including deployment config
# log - log file handle
#
# Returns:
# Nothing
# This method should die with an appropriate error message if there is
# an issue.
#
sub confirmUndeployment
{
die "$0 INFO : confirmUndeployment is a virtual function!";
}
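##############################################################################
# Sub: runCmd
# Run a local command via IPC::Run, sending its stdout and stderr to the log.
#
# Parameters:
# log - log file handle
# cmd - reference to an array containing the command and its arguments
#
# Returns:
# None
# Dies if the command exits with a failure status.
#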
sub runCmd($$$)
{
    my ($self, $log, $cmd) = @_;
    print $log "Going to run [" . join(" ", @$cmd) . "]\n";
    run($cmd, \undef, $log, $log) or
        die "Failed running " . join(" ", @$cmd) . "\n";
}
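# A minimal usage sketch (hypothetical driver code; the real harness builds
# deployers from its configuration and supplies $cfg and $log itself):
#
#   my $deployer = HCatExistingClusterDeployer->new();
#   $deployer->checkPrerequisites($cfg, $log);
#   $deployer->generateData($cfg, $log);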
1;