[REEF-1708] use proper defaults and better logging in YarnContainerManager
* Use better (default) parameters when registering and unregistering the AM with YARN;
* Implement better logging for the AM registration/unregistration process
This is required for debugging the Unmanaged AM mode, and otherwise has
no impact on REEF functionality.
JIRA: [REEF-1708](https://issues.apache.org/jira/browse/REEF-1708)
Closes #1222
diff --git a/lang/java/reef-runtime-yarn/src/main/java/org/apache/reef/runtime/yarn/driver/YarnContainerManager.java b/lang/java/reef-runtime-yarn/src/main/java/org/apache/reef/runtime/yarn/driver/YarnContainerManager.java
index 41be1e8..e9e189b 100644
--- a/lang/java/reef-runtime-yarn/src/main/java/org/apache/reef/runtime/yarn/driver/YarnContainerManager.java
+++ b/lang/java/reef-runtime-yarn/src/main/java/org/apache/reef/runtime/yarn/driver/YarnContainerManager.java
@@ -62,6 +62,12 @@
private static final String RUNTIME_NAME = "YARN";
+ /** Default hostname to provide in the Application Master registration. */
+ private static final String AM_REGISTRATION_HOST = "";
+
+ /** Default port number to provide in the Application Master registration. */
+ private static final int AM_REGISTRATION_PORT = -1;
+
private final Queue<AMRMClient.ContainerRequest> requestsBeforeSentToRM = new ConcurrentLinkedQueue<>();
private final Queue<AMRMClient.ContainerRequest> requestsAfterSentToRM = new ConcurrentLinkedQueue<>();
private final Map<String, String> nodeIdToRackName = new ConcurrentHashMap<>();
@@ -112,7 +118,7 @@
this.reefFileNames = reefFileNames;
this.progressProvider = progressProvider;
- LOG.log(Level.FINEST, "Instantiated YarnContainerManager");
+ LOG.log(Level.FINEST, "Instantiated YarnContainerManager: {0}", this.registration);
}
/**
@@ -183,9 +189,8 @@
return Math.max(Math.min(1, progressProvider.get().getProgress()), 0);
} catch (final Exception e) {
// An Exception must be caught and logged here because YARN swallows the Exception and fails the job.
- LOG.log(Level.WARNING, "An exception occurred in ProgressProvider.getProgress(), with message : " +
- e.getMessage() + ". Returning 0 as progress.");
- return 0f;
+ LOG.log(Level.WARNING, "Cannot get the application progress. Will return 0.", e);
+ return 0;
}
}
@@ -299,17 +304,23 @@
* This method is called from DriverRuntimeStartHandler via YARNRuntimeStartHandler.
*/
void onStart() {
+
+ LOG.log(Level.FINEST, "YARN registration: begin");
+
this.resourceManager.init(this.yarnConf);
this.resourceManager.start();
this.nodeManager.init(this.yarnConf);
this.nodeManager.start();
- try {
- this.registration.setRegistration(
- this.resourceManager.registerApplicationMaster("", 0, this.trackingURLProvider.getTrackingUrl()));
+ LOG.log(Level.FINEST, "YARN registration: registered with RM and NM");
- LOG.log(Level.FINE, "YARN registration: {0}", this.registration);
+ try {
+
+ this.registration.setRegistration(this.resourceManager.registerApplicationMaster(
+ AM_REGISTRATION_HOST, AM_REGISTRATION_PORT, this.trackingURLProvider.getTrackingUrl()));
+
+ LOG.log(Level.FINE, "YARN registration: AM registered: {0}", this.registration);
final FileSystem fs = FileSystem.get(this.yarnConf);
final Path outputFileName = new Path(this.jobSubmissionDirectory, this.reefFileNames.getDriverHttpEndpoint());
@@ -321,6 +332,8 @@
LOG.log(Level.WARNING, "Unable to register application master.", e);
onRuntimeError(e);
}
+
+ LOG.log(Level.FINEST, "YARN registration: done: {0}", this.registration);
}
/**
@@ -340,7 +353,8 @@
this.reefEventHandlers.close();
if (exception == null) {
- this.resourceManager.unregisterApplicationMaster(FinalApplicationStatus.SUCCEEDED, null, null);
+ this.resourceManager.unregisterApplicationMaster(
+ FinalApplicationStatus.SUCCEEDED, "Success!", this.trackingURLProvider.getTrackingUrl());
} else {
// Note: We don't allow RM to restart our applications if it's an application level failure.
@@ -351,10 +365,13 @@
final String failureMsg = String.format("Application failed due to:%n%s%n" +
"With stack trace:%n%s", exception.getMessage(), ExceptionUtils.getStackTrace(exception));
- this.resourceManager.unregisterApplicationMaster(FinalApplicationStatus.FAILED, failureMsg, null);
+ this.resourceManager.unregisterApplicationMaster(
+ FinalApplicationStatus.FAILED, failureMsg, this.trackingURLProvider.getTrackingUrl());
}
this.resourceManager.close();
+ LOG.log(Level.FINEST, "Container ResourceManager stopped successfully");
+
} catch (final Exception e) {
LOG.log(Level.WARNING, "Error shutting down YARN application", e);
}
@@ -363,6 +380,7 @@
if (this.nodeManager.getServiceState() == Service.STATE.STARTED) {
try {
this.nodeManager.close();
+ LOG.log(Level.FINEST, "Container NodeManager stopped successfully");
} catch (final IOException e) {
LOG.log(Level.WARNING, "Error closing YARN Node Manager", e);
}