/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.falcon.hive;

import org.apache.commons.cli.ParseException;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.falcon.hive.exception.HiveReplicationException;
import org.apache.falcon.hive.mapreduce.CopyMapper;
import org.apache.falcon.hive.mapreduce.CopyReducer;
import org.apache.falcon.hive.util.DRStatusStore;
import org.apache.falcon.hive.util.DelimiterUtils;
import org.apache.falcon.hive.util.EventSourcerUtils;
import org.apache.falcon.hive.util.FileUtils;
import org.apache.falcon.hive.util.HiveDRStatusStore;
import org.apache.falcon.hive.util.HiveDRUtils;
import org.apache.falcon.hive.util.HiveMetastoreUtils;
import org.apache.falcon.job.JobCounters;
import org.apache.falcon.job.JobCountersHandler;
import org.apache.falcon.job.JobType;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.JobStatus;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hive.hcatalog.api.HCatClient;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.util.HashMap;
import java.util.Map;

/**
 * Driver for the Hive DR (disaster recovery) replication tool. Runs as a Hadoop
 * {@link org.apache.hadoop.util.Tool} in one of three stages: LASTEVENTS, EXPORT or IMPORT.
 */
public class HiveDRTool extends Configured implements Tool {
private static final String META_PATH_FILE_SUFFIX = ".metapath";
private FileSystem jobFS;
private FileSystem sourceClusterFS;
private FileSystem targetClusterFS;
private HiveDROptions inputOptions;
private DRStatusStore drStore;
private String eventsMetaFile;
private EventSourcerUtils eventSourcerUtil;
private Configuration jobConf;
private String executionStage;
public static final FsPermission STAGING_DIR_PERMISSION =
new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL);
private static final Logger LOG = LoggerFactory.getLogger(HiveDRTool.class);
public HiveDRTool() {
}
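
/**
 * Tool entry point: parses and validates the arguments, runs the stage-specific work via
 * {@link #execute()}, records replication counters for a successful EXPORT stage job, and
 * cleans up staging directories after IMPORT or on failure.
 *
 * @return 0 on success, -1 on invalid arguments or execution failure
 */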
@Override
public int run(String[] args) throws Exception {
if (args.length < 1) {
usage();
return -1;
}
try {
init(args);
} catch (Throwable e) {
LOG.error("Invalid arguments: ", e);
System.err.println("Invalid arguments: " + e.getMessage());
usage();
return -1;
}
try {
Job job = execute();
if ((job != null) && (inputOptions.getExecutionStage().equalsIgnoreCase(
HiveDRUtils.ExecutionStage.EXPORT.name()))) {
if ((job.getStatus().getState() == JobStatus.State.SUCCEEDED)
&& (job.getConfiguration().get("counterLogDir") != null)) {
LOG.info("Obtaining job replication counters for Hive DR job");
Path counterFile = new Path(job.getConfiguration().get("counterLogDir"), "counter.txt");
JobCounters hiveReplicationCounters = JobCountersHandler.getCountersType(
JobType.HIVEREPLICATION.name());
hiveReplicationCounters.obtainJobCounters(job.getConfiguration(), job, false);
hiveReplicationCounters.storeJobCounters(job.getConfiguration(), counterFile);
}
}
} catch (Exception e) {
System.err.println("Exception encountered " + e.getMessage());
e.printStackTrace();
LOG.error("Exception encountered, cleaning up staging dirs", e);
cleanup();
return -1;
}
if (inputOptions.getExecutionStage().equalsIgnoreCase(HiveDRUtils.ExecutionStage.IMPORT.name())) {
cleanup();
}
return 0;
}
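
/**
 * Parses the command-line options, resolves the source and target staging paths, and
 * initializes the source, target and job-cluster file systems, the DR status store
 * (backed by the target cluster) and the event sourcer utility.
 */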
private void init(String[] args) throws Exception {
LOG.info("Initializing HiveDR");
inputOptions = parseOptions(args);
LOG.info("Input Options: {}", inputOptions);
// Update the source staging path
inputOptions.setSourceStagingPath();
inputOptions.setTargetStagingPath();
LOG.info("srcStaginPath: {}", inputOptions.getSourceStagingPath());
LOG.info("tgtStaginPath: {}", inputOptions.getTargetStagingPath());
Configuration sourceConf = FileUtils.getConfiguration(getConf(), inputOptions.getSourceWriteEP(),
inputOptions.getSourceNNKerberosPrincipal());
sourceClusterFS = FileSystem.get(sourceConf);
Configuration targetConf = FileUtils.getConfiguration(getConf(), inputOptions.getTargetWriteEP(),
inputOptions.getTargetNNKerberosPrincipal());
targetClusterFS = FileSystem.get(targetConf);
jobConf = FileUtils.getConfiguration(getConf(), inputOptions.getJobClusterWriteEP(),
inputOptions.getJobClusterNNPrincipal());
jobFS = FileSystem.get(jobConf);
// init DR status store
drStore = new HiveDRStatusStore(targetClusterFS);
eventSourcerUtil = new EventSourcerUtils(jobConf, inputOptions.shouldKeepHistory(), inputOptions.getJobName());
}
private HiveDROptions parseOptions(String[] args) throws ParseException {
return HiveDROptions.create(args);
}
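
/**
 * Runs the workflow stage selected via the options: LASTEVENTS fetches the last successfully
 * replicated event ids, EXPORT sources new events and persists the events meta file location,
 * and IMPORT reads that location back from HDFS. For EXPORT and IMPORT a map-reduce copy job
 * is submitted; LASTEVENTS and the no-events cases return null without submitting a job.
 */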
public Job execute() throws Exception {
assert inputOptions != null;
assert getConf() != null;
executionStage = inputOptions.getExecutionStage();
LOG.info("Executing Workflow stage : {}", executionStage);
if (executionStage.equalsIgnoreCase(HiveDRUtils.ExecutionStage.LASTEVENTS.name())) {
String lastEventsIdFile = getLastEvents(jobConf);
LOG.info("Last successfully replicated Event file : {}", lastEventsIdFile);
return null;
} else if (executionStage.equalsIgnoreCase(HiveDRUtils.ExecutionStage.EXPORT.name())) {
createStagingDirectory();
eventsMetaFile = sourceEvents();
LOG.info("Sourced Events meta file : {}", eventsMetaFile);
if (StringUtils.isEmpty(eventsMetaFile)) {
LOG.info("No events to process");
return null;
} else {
/*
* eventsMetaFile contains the events to be processed by HiveDr. This file should be available
* for the import action as well. Persist the file at a location common to both export and import.
*/
persistEventsMetafileLocation(eventsMetaFile);
}
} else if (executionStage.equalsIgnoreCase(HiveDRUtils.ExecutionStage.IMPORT.name())) {
// read the location of eventsMetaFile from hdfs
eventsMetaFile = getEventsMetaFileLocation();
if (StringUtils.isEmpty(eventsMetaFile)) {
LOG.info("No events to process");
return null;
}
} else {
throw new HiveReplicationException("Invalid Execution stage : " + inputOptions.getExecutionStage());
}
Job job = createJob();
job.submit();
String jobID = job.getJobID().toString();
job.getConfiguration().set("HIVEDR_JOB_ID", jobID);
LOG.info("HiveDR job-id: {}", jobID);
if (inputOptions.shouldBlock() && !job.waitForCompletion(true)) {
throw new IOException("HiveDR failure: Job " + jobID + " has failed: "
+ job.getStatus().getFailureInfo());
}
return job;
}
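
/**
 * Builds the replication map-reduce job: NLineInputFormat over the events meta file,
 * CopyMapper/CopyReducer with a null output format, speculative execution for maps disabled,
 * and all HiveDR arguments copied into the job configuration.
 */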
private Job createJob() throws Exception {
String jobName = "hive-dr" + executionStage;
String userChosenName = getConf().get(JobContext.JOB_NAME);
if (userChosenName != null) {
jobName += ": " + userChosenName;
}
Job job = Job.getInstance(getConf());
job.setJobName(jobName);
job.setJarByClass(CopyMapper.class);
job.setMapperClass(CopyMapper.class);
job.setReducerClass(CopyReducer.class);
job.setInputFormatClass(NLineInputFormat.class);
job.setOutputFormatClass(NullOutputFormat.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.getConfiguration().set(JobContext.MAP_SPECULATIVE, "false");
job.getConfiguration().set(JobContext.NUM_MAPS,
String.valueOf(inputOptions.getReplicationMaxMaps()));
for (HiveDRArgs args : HiveDRArgs.values()) {
if (inputOptions.getValue(args) != null) {
job.getConfiguration().set(args.getName(), inputOptions.getValue(args));
} else {
job.getConfiguration().set(args.getName(), "null");
}
}
job.getConfiguration().set(FileInputFormat.INPUT_DIR, eventsMetaFile);
return job;
}
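
/**
 * Creates the source and target staging directories on their respective clusters with
 * fully open (777) permissions.
 */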
private void createStagingDirectory() throws IOException, HiveReplicationException {
Path sourceStagingPath = new Path(inputOptions.getSourceStagingPath());
Path targetStagingPath = new Path(inputOptions.getTargetStagingPath());
LOG.info("Source staging path: {}", sourceStagingPath);
if (!FileSystem.mkdirs(sourceClusterFS, sourceStagingPath, STAGING_DIR_PERMISSION)) {
throw new IOException("mkdir failed for " + sourceStagingPath);
}
LOG.info("Target staging path: {}", targetStagingPath);
if (!FileSystem.mkdirs(targetClusterFS, targetStagingPath, STAGING_DIR_PERMISSION)) {
throw new IOException("mkdir failed for " + targetStagingPath);
}
}
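
/** Deletes the source and target staging directories, logging (rather than failing) on IO errors. */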
private void cleanStagingDirectory() throws HiveReplicationException {
LOG.info("Cleaning staging directories");
Path sourceStagingPath = new Path(inputOptions.getSourceStagingPath());
Path targetStagingPath = new Path(inputOptions.getTargetStagingPath());
try {
if (sourceClusterFS.exists(sourceStagingPath)) {
sourceClusterFS.delete(sourceStagingPath, true);
}
if (targetClusterFS.exists(targetStagingPath)) {
targetClusterFS.delete(targetStagingPath, true);
}
} catch (IOException e) {
LOG.error("Unable to cleanup staging dir:", e);
}
}
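
/**
 * Reads the last replicated event ids per DB/table from HDFS and uses a MetaStoreEventSourcer
 * against the source metastore to write the new events to an input file; returns that file's
 * path, or an empty/null value when there are no new events.
 */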
private String sourceEvents() throws Exception {
MetaStoreEventSourcer defaultSourcer = null;
String inputFilename = null;
String lastEventsIdFile = FileUtils.DEFAULT_EVENT_STORE_PATH + File.separator
+ inputOptions.getJobName() + File.separator + inputOptions.getJobName() + ".id";
Map<String, Long> lastEventsIdMap = getLastDBTableEvents(new Path(lastEventsIdFile));
try {
HCatClient sourceMetastoreClient = HiveMetastoreUtils.initializeHiveMetaStoreClient(
inputOptions.getSourceMetastoreUri(),
inputOptions.getSourceMetastoreKerberosPrincipal(),
inputOptions.getSourceHive2KerberosPrincipal());
defaultSourcer = new MetaStoreEventSourcer(sourceMetastoreClient,
new DefaultPartitioner(drStore, eventSourcerUtil), eventSourcerUtil, lastEventsIdMap);
inputFilename = defaultSourcer.sourceEvents(inputOptions);
} finally {
if (defaultSourcer != null) {
defaultSourcer.cleanUp();
}
}
return inputFilename;
}
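
/**
 * For the LASTEVENTS stage: queries the target metastore and DR status store for the last
 * replicated events and writes them to an event-id file, returning that file's path.
 */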
private String getLastEvents(Configuration conf) throws Exception {
LastReplicatedEvents lastEvents = new LastReplicatedEvents(conf,
inputOptions.getTargetMetastoreUri(),
inputOptions.getTargetMetastoreKerberosPrincipal(),
inputOptions.getTargetHive2KerberosPrincipal(),
drStore, inputOptions);
String eventIdFile = lastEvents.getLastEvents(inputOptions);
lastEvents.cleanUp();
return eventIdFile;
}
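
/**
 * Parses the tab-delimited event-id file into a map of DB/table name to the id of the last
 * event replicated for it.
 */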
private Map<String, Long> getLastDBTableEvents(Path lastEventIdFile) throws Exception {
Map<String, Long> lastEventsIdMap = new HashMap<String, Long>();
BufferedReader in = new BufferedReader(new InputStreamReader(jobFS.open(lastEventIdFile)));
try {
String line;
while ((line = in.readLine()) != null) {
String[] field = line.trim().split(DelimiterUtils.TAB_DELIM, -1);
lastEventsIdMap.put(field[0], Long.parseLong(field[1]));
}
} catch (Exception e) {
throw new IOException(e);
} finally {
IOUtils.closeQuietly(in);
}
return lastEventsIdMap;
}
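
/** Command-line entry point; exits with the ToolRunner return code, or -1 on error. */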
public static void main(String[] args) {
int exitCode;
try {
HiveDRTool hiveDRTool = new HiveDRTool();
exitCode = ToolRunner.run(HiveDRUtils.getDefaultConf(), hiveDRTool, args);
} catch (Exception e) {
LOG.error("Couldn't complete HiveDR operation: ", e);
exitCode = -1;
}
System.exit(exitCode);
}
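
/** Deletes the event input directory created while sourcing events. */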
private void cleanInputDir() {
eventSourcerUtil.cleanUpEventInputDir();
}
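
/** Removes the staging directories, the event input directory and temporary meta/id files. */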
private synchronized void cleanup() throws HiveReplicationException {
cleanStagingDirectory();
cleanInputDir();
cleanTempFiles();
}
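
/** Deletes the job's ".metapath" and ".id" files from the event store directory. */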
private void cleanTempFiles() {
Path eventsDirPath = new Path(FileUtils.DEFAULT_EVENT_STORE_PATH, inputOptions.getJobName());
Path metaFilePath = new Path(eventsDirPath.toString(), inputOptions.getJobName() + META_PATH_FILE_SUFFIX);
Path eventsFilePath = new Path(eventsDirPath.toString(), inputOptions.getJobName() + ".id");
try {
if (jobFS.exists(metaFilePath)) {
jobFS.delete(metaFilePath, true);
}
if (jobFS.exists(eventsFilePath)) {
jobFS.delete(eventsFilePath, true);
}
} catch (IOException e) {
LOG.error("Deleting Temp files failed", e);
}
}
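
/**
 * Persists the events meta file location to a well-known path on the job cluster
 * (created with 700 permissions) so the IMPORT stage can find the events sourced during EXPORT.
 */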
public void persistEventsMetafileLocation(final String eventMetaFilePath) throws IOException {
Path eventsDirPath = new Path(FileUtils.DEFAULT_EVENT_STORE_PATH, inputOptions.getJobName());
Path metaFilePath = new Path(eventsDirPath.toString(), inputOptions.getJobName() + META_PATH_FILE_SUFFIX);
OutputStream out = null;
try {
out = FileSystem.create(jobFS, metaFilePath, FileUtils.FS_PERMISSION_700);
out.write(eventMetaFilePath.getBytes());
out.flush();
} finally {
IOUtils.closeQuietly(out);
}
}
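
/** Reads back the events meta file location persisted by the EXPORT stage, or null if absent. */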
private String getEventsMetaFileLocation() throws IOException {
Path eventsDirPath = new Path(FileUtils.DEFAULT_EVENT_STORE_PATH, inputOptions.getJobName());
Path metaFilePath = new Path(eventsDirPath.toString(), inputOptions.getJobName() + META_PATH_FILE_SUFFIX);
String line = null;
if (jobFS.exists(metaFilePath)) {
BufferedReader in = new BufferedReader(new InputStreamReader(jobFS.open(metaFilePath)));
line = in.readLine();
in.close();
}
return line;
}
public static void usage() {
System.out.println("Usage: hivedrtool -option value ....");
}
}