/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.beam.sdk.io.hadoop.format;

import java.io.IOException;
import java.io.Serializable;
import java.util.Random;
import org.apache.beam.vendor.guava.v20_0.com.google.common.base.Preconditions;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.protocol.AlreadyBeingCreatedException;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.hadoop.mapred.FileAlreadyExistsException;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TaskID;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* Implementation of {@link ExternalSynchronization} which registers locks in HDFS.
*
* <p>Requires {@code locksDir} to be specified. This directory MUST be different from the
* directory that may be stored under the {@code "mapreduce.output.fileoutputformat.outputdir"}
* key. Otherwise the job setup will fail because the lock directory will already exist before
* the job is set up.
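*
* <p>A minimal wiring sketch (assuming {@code data} is a {@code PCollection} of key/value pairs;
* the configuration, key/value types, and lock path are illustrative):
*
* <pre>{@code
* Configuration hadoopConf = new Configuration(false);
* // ... set the OutputFormat class, key/value classes and output directory ...
* data.apply(
*     HadoopFormatIO.<Text, LongWritable>write()
*         .withConfiguration(hadoopConf)
*         .withPartitioning()
*         .withExternalSynchronization(new HDFSSynchronization("/tmp/beam-locks")));
* }</pre>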
*/
public class HDFSSynchronization implements ExternalSynchronization {
private static final Logger LOGGER = LoggerFactory.getLogger(HDFSSynchronization.class);
private static final String LOCKS_DIR_PATTERN = "%s/";
private static final String LOCKS_DIR_TASK_PATTERN = LOCKS_DIR_PATTERN + "%s";
private static final String LOCKS_DIR_TASK_ATTEMPT_PATTERN = LOCKS_DIR_TASK_PATTERN + "_%s";
private static final String LOCKS_DIR_JOB_FILENAME = LOCKS_DIR_PATTERN + "_job";
private static final Random RANDOM_GEN = new Random();
private final String locksDir;
private final ThrowingFunction<Configuration, FileSystem, IOException> fileSystemFactory;

/**
* Creates an instance of {@link HDFSSynchronization}.
*
* @param locksDir directory where locks will be stored. This directory MUST be different from
* the directory that may be stored under the {@code
* "mapreduce.output.fileoutputformat.outputdir"} key. Otherwise the job setup will fail
* because the lock directory will already exist before the job is set up.
*/
public HDFSSynchronization(String locksDir) {
this(locksDir, FileSystem::newInstance);
}

/**
* Creates an instance of {@link HDFSSynchronization}. Exists only to make testing easier.
*
* @param locksDir directory where locks will be stored. This directory MUST be different from
* the directory that may be stored under the {@code
* "mapreduce.output.fileoutputformat.outputdir"} key. Otherwise the job setup will fail
* because the lock directory will already exist before the job is set up.
* @param fileSystemFactory factory that supplies the {@link FileSystem} used for lock files
*/
HDFSSynchronization(
String locksDir, ThrowingFunction<Configuration, FileSystem, IOException> fileSystemFactory) {
this.locksDir = locksDir;
this.fileSystemFactory = fileSystemFactory;
}

@Override
public boolean tryAcquireJobLock(Configuration conf) {
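// the job lock is a "<jobJtId>/_job" marker file; atomic create-if-absent ensures that
// only one worker succeeds in acquiring it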
Path path = new Path(locksDir, String.format(LOCKS_DIR_JOB_FILENAME, getJobJtIdentifier(conf)));
return tryCreateFile(conf, path);
}

@Override
public void releaseJobIdLock(Configuration conf) {
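// deleting the per-job lock directory recursively also releases every task and task
// attempt lock registered under it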
Path path = new Path(locksDir, String.format(LOCKS_DIR_PATTERN, getJobJtIdentifier(conf)));
try (FileSystem fileSystem = fileSystemFactory.apply(conf)) {
if (fileSystem.delete(path, true)) {
LOGGER.info("Delete of lock directory {} was successful", path);
} else {
LOGGER.warn("Delete of lock directory {} was unsuccessful", path);
}
} catch (IOException e) {
String formattedExceptionMessage =
String.format("Delete of lock directory %s was unsuccessful", path);
LOGGER.warn(formattedExceptionMessage, e);
throw new IllegalStateException(formattedExceptionMessage, e);
}
}

@Override
public TaskID acquireTaskIdLock(Configuration conf) {
JobID jobId = HadoopFormats.getJobId(conf);
boolean lockAcquired = false;
int taskIdCandidate = 0;
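// probe random task ids until a "<jobJtId>/<taskId>" lock file can be created; atomic
// file creation guarantees that every worker ends up with a unique task id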
while (!lockAcquired) {
taskIdCandidate = RANDOM_GEN.nextInt(Integer.MAX_VALUE);
Path path =
new Path(
locksDir,
String.format(LOCKS_DIR_TASK_PATTERN, getJobJtIdentifier(conf), taskIdCandidate));
lockAcquired = tryCreateFile(conf, path);
}
return HadoopFormats.createTaskID(jobId, taskIdCandidate);
}

@Override
public TaskAttemptID acquireTaskAttemptIdLock(Configuration conf, int taskId) {
String jobJtIdentifier = getJobJtIdentifier(conf);
JobID jobId = HadoopFormats.getJobId(conf);
int taskAttemptCandidate = 0;
boolean taskAttemptAcquired = false;
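// attempt ids are claimed sequentially from 1 via "<jobJtId>/<taskId>_<attempt>" lock
// files; the first free attempt number wins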
while (!taskAttemptAcquired) {
taskAttemptCandidate++;
Path path =
new Path(
locksDir,
String.format(
LOCKS_DIR_TASK_ATTEMPT_PATTERN, jobJtIdentifier, taskId, taskAttemptCandidate));
taskAttemptAcquired = tryCreateFile(conf, path);
}
return HadoopFormats.createTaskAttemptID(jobId, taskId, taskAttemptCandidate);
}

private boolean tryCreateFile(Configuration conf, Path path) {
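// atomic create-if-absent: true when this call created the lock file, false when the
// lock is already held by someone else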
try (FileSystem fileSystem = fileSystemFactory.apply(conf)) {
try {
return fileSystem.createNewFile(path);
} catch (FileAlreadyExistsException | org.apache.hadoop.fs.FileAlreadyExistsException e) {
return false;
} catch (RemoteException e) {
// HDFS wraps server-side exceptions in RemoteException; unwrap by class name
if (e.getClassName().equals(AlreadyBeingCreatedException.class.getName())) {
return false;
}
throw e;
}
} catch (IOException e) {
throw new IllegalStateException(String.format("Creation of file on path %s failed", path), e);
}
}

private String getJobJtIdentifier(Configuration conf) {
JobID job =
Preconditions.checkNotNull(
HadoopFormats.getJobId(conf),
"Configuration must contain jobID under key %s.",
HadoopFormatIO.JOB_ID);
return job.getJtIdentifier();
}

/**
* Function that can throw a checked exception.
*
* @param <T1> parameter type
* @param <T2> result type
* @param <X> exception type
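*
* <p>For example, {@code FileSystem::newInstance} conforms to {@code
* ThrowingFunction<Configuration, FileSystem, IOException>}.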
*/
@FunctionalInterface
interface ThrowingFunction<T1, T2, X extends Exception> extends Serializable {
T2 apply(T1 value) throws X;
}
}