SUBMARINE-62. PS_LAUNCH_CMD CLI description is wrong in RunJobCli. Contributed by Adam Antal
(cherry picked from commit be784de2d4c8d7ae2724cf348925a0fbdbe0c503)
diff --git a/hadoop-submarine/hadoop-submarine-core/src/main/java/org/apache/hadoop/yarn/submarine/client/cli/runjob/RunJobCli.java b/hadoop-submarine/hadoop-submarine-core/src/main/java/org/apache/hadoop/yarn/submarine/client/cli/runjob/RunJobCli.java
index 7b544c1..dfd951f 100644
--- a/hadoop-submarine/hadoop-submarine-core/src/main/java/org/apache/hadoop/yarn/submarine/client/cli/runjob/RunJobCli.java
+++ b/hadoop-submarine/hadoop-submarine-core/src/main/java/org/apache/hadoop/yarn/submarine/client/cli/runjob/RunJobCli.java
@@ -71,13 +71,22 @@
public class RunJobCli extends AbstractCli {
private static final Logger LOG =
LoggerFactory.getLogger(RunJobCli.class);
+
+ private static final String TENSORFLOW = "TensorFlow";
+ private static final String PYTORCH = "PyTorch";
+ private static final String PS = "PS";
+ private static final String WORKER = "worker";
+ private static final String TENSORBOARD = "TensorBoard";
+
private static final String CAN_BE_USED_WITH_TF_PYTORCH =
- "Can be used with TensorFlow or PyTorch frameworks.";
- private static final String CAN_BE_USED_WITH_TF_ONLY =
- "Can only be used with TensorFlow framework.";
+ String.format("Can be used with %s or %s frameworks.",
+ TENSORFLOW, PYTORCH);
+ private static final String TENSORFLOW_ONLY =
+ String.format("Can only be used with %s framework.", TENSORFLOW);
public static final String YAML_PARSE_FAILED = "Failed to parse " +
"YAML config";
-
+ private static final String LOCAL_OR_ANY_FS_DIRECTORY = "Could be a local " +
+ "directory or any other directory on the file system. ";
private Options options;
private JobSubmitter jobSubmitter;
@@ -112,50 +121,55 @@
Framework.getValues()));
options.addOption(CliConstants.NAME, true, "Name of the job");
options.addOption(CliConstants.INPUT_PATH, true,
- "Input of the job, could be local or other FS directory");
+ "Input of the job. " + LOCAL_OR_ANY_FS_DIRECTORY);
options.addOption(CliConstants.CHECKPOINT_PATH, true,
- "Training output directory of the job, "
- + "could be local or other FS directory. This typically includes "
- + "checkpoint files and exported model ");
+ "Training output directory of the job. " + LOCAL_OR_ANY_FS_DIRECTORY +
+ "This typically includes checkpoint files and exported model");
options.addOption(CliConstants.SAVED_MODEL_PATH, true,
- "Model exported path (savedmodel) of the job, which is needed when "
- + "exported model is not placed under ${checkpoint_path}"
- + "could be local or other FS directory. " +
- "This will be used to serve.");
+ "Model exported path (saved model) of the job, which is needed when " +
+ "exported model is not placed under ${checkpoint_path}. " +
+ LOCAL_OR_ANY_FS_DIRECTORY + "This will be used to serve");
options.addOption(CliConstants.DOCKER_IMAGE, true, "Docker image name/tag");
+ options.addOption(CliConstants.PS_DOCKER_IMAGE, true,
+ getDockerImageMessage(PS));
+ options.addOption(CliConstants.WORKER_DOCKER_IMAGE, true,
+ getDockerImageMessage(WORKER));
options.addOption(CliConstants.QUEUE, true,
- "Name of queue to run the job, by default it uses default queue");
+ "Name of queue to run the job. By default, the default queue is used");
addWorkerOptions(options);
addPSOptions(options);
addTensorboardOptions(options);
options.addOption(CliConstants.ENV, true,
- "Common environment variable of worker/ps");
+ "Common environment variable passed to worker / PS");
options.addOption(CliConstants.VERBOSE, false,
"Print verbose log for troubleshooting");
options.addOption(CliConstants.WAIT_JOB_FINISH, false,
- "Specified when user want to wait the job finish");
- options.addOption(CliConstants.QUICKLINK, true, "Specify quicklink so YARN"
- + "web UI shows link to given role instance and port. When "
- + "--tensorboard is specified, quicklink to tensorboard instance will "
- + "be added automatically. The format of quick link is: "
- + "Quick_link_label=http(or https)://role-name:port. For example, "
- + "if want to link to first worker's 7070 port, and text of quicklink "
- + "is Notebook_UI, user need to specify --quicklink "
- + "Notebook_UI=https://master-0:7070");
+ "Specified when user wants to wait for jobs to finish");
+ options.addOption(CliConstants.QUICKLINK, true, "Specify quicklink so YARN "
+ + "web UI shows link to the given role instance and port. " +
+ "When --tensorboard is specified, quicklink to the " +
+ TENSORBOARD + " instance will be added automatically. " +
+ "The format of quick link is: "
+ + "Quick_link_label=http(or https)://role-name:port. " +
+ "For example, if users want to link to the first worker's 7070 port, " +
+ "and text of quicklink is Notebook_UI, " +
+ "users need to specify --quicklink Notebook_UI=https://master-0:7070");
options.addOption(CliConstants.LOCALIZATION, true, "Specify"
+ " localization to make remote/local file/directory available to"
+ " all container(Docker)."
- + " Argument format is \"RemoteUri:LocalFilePath[:rw] \" (ro"
- + " permission is not supported yet)"
- + " The RemoteUri can be a file or directory in local or"
- + " HDFS or s3 or abfs or http .etc."
+ + " Argument format is: \"RemoteUri:LocalFilePath[:rw] \" "
+ + "(ro permission is not supported yet)."
+ + " The RemoteUri can be a local file or directory on the filesystem."
+ + " Alternatively, the following remote file systems / "
+ + "transmit mechanisms can be used: "
+ + " HDFS, S3 or abfs, HTTP, etc."
+ " The LocalFilePath can be absolute or relative."
- + " If it's a relative path, it'll be"
+ + " If it is a relative path, it will be"
+ " under container's implied working directory"
- + " but sub directory is not supported yet."
- + " This option can be set mutiple times."
+ + " but sub-directory is not supported yet."
+ + " This option can be set multiple times."
+ " Examples are \n"
+ "-localization \"hdfs:///user/yarn/mydir2:/opt/data\"\n"
+ "-localization \"s3a:///a/b/myfile1:./\"\n"
@@ -163,12 +177,12 @@
+ "-localization \"/user/yarn/mydir3:/opt/mydir3\"\n"
+ "-localization \"./mydir1:.\"\n");
options.addOption(CliConstants.KEYTAB, true, "Specify keytab used by the " +
- "job under security environment");
+ "job under a secured environment");
options.addOption(CliConstants.PRINCIPAL, true, "Specify principal used " +
- "by the job under security environment");
+ "by the job under a secured environment");
options.addOption(CliConstants.DISTRIBUTE_KEYTAB, false, "Distribute " +
- "local keytab to cluster machines for service authentication. If not " +
- "specified, pre-distributed keytab of which path specified by" +
+ "local keytab to cluster machines for service authentication. " +
+ "If not specified, pre-distributed keytab of which path specified by" +
" parameter" + CliConstants.KEYTAB + " on cluster machines will be " +
"used");
options.addOption("h", "help", false, "Print help");
@@ -180,54 +194,67 @@
private void addWorkerOptions(Options options) {
options.addOption(CliConstants.N_WORKERS, true,
- "Number of worker tasks of the job, by default it's 1." +
+ getNumberOfServiceMessage(WORKER, 1) + " " +
CAN_BE_USED_WITH_TF_PYTORCH);
options.addOption(CliConstants.WORKER_DOCKER_IMAGE, true,
- "Specify docker image for WORKER, when this is not specified, WORKER "
- + "uses --" + CliConstants.DOCKER_IMAGE + " as default." +
+ getDockerImageMessage(WORKER) + " " +
CAN_BE_USED_WITH_TF_PYTORCH);
options.addOption(CliConstants.WORKER_LAUNCH_CMD, true,
- "Commandline of worker, arguments will be "
- + "directly used to launch the worker" +
+ getLaunchCommandMessage(WORKER) + ". " +
CAN_BE_USED_WITH_TF_PYTORCH);
options.addOption(CliConstants.WORKER_RES, true,
- "Resource of each worker, for example "
- + "memory-mb=2048,vcores=2,yarn.io/gpu=2" +
+ getServiceResourceMessage(WORKER) + ". " +
CAN_BE_USED_WITH_TF_PYTORCH);
}
private void addPSOptions(Options options) {
options.addOption(CliConstants.N_PS, true,
- "Number of PS tasks of the job, by default it's 0. " +
- CAN_BE_USED_WITH_TF_ONLY);
+ getNumberOfServiceMessage(PS, 0) + " " +
+ TENSORFLOW_ONLY);
options.addOption(CliConstants.PS_DOCKER_IMAGE, true,
- "Specify docker image for PS, when this is not specified, PS uses --"
- + CliConstants.DOCKER_IMAGE + " as default." +
- CAN_BE_USED_WITH_TF_ONLY);
+ getDockerImageMessage(PS) + " " +
+ TENSORFLOW_ONLY);
options.addOption(CliConstants.PS_LAUNCH_CMD, true,
- "Commandline of worker, arguments will be "
- + "directly used to launch the PS" +
- CAN_BE_USED_WITH_TF_ONLY);
+ getLaunchCommandMessage(PS) + ". " +
+ TENSORFLOW_ONLY);
options.addOption(CliConstants.PS_RES, true,
- "Resource of each PS, for example "
- + "memory-mb=2048,vcores=2,yarn.io/gpu=2" +
- CAN_BE_USED_WITH_TF_ONLY);
+ getServiceResourceMessage(PS) + ". " +
+ TENSORFLOW_ONLY);
}
private void addTensorboardOptions(Options options) {
options.addOption(CliConstants.TENSORBOARD, false,
- "Should we run TensorBoard"
- + " for this job? By default it's disabled." +
- CAN_BE_USED_WITH_TF_ONLY);
+ "Should we run TensorBoard for this job? " +
+ "By default, TensorBoard is disabled. " +
+ TENSORFLOW_ONLY);
options.addOption(CliConstants.TENSORBOARD_RESOURCES, true,
- "Specify resources of Tensorboard, by default it is "
+ "Specifies resources of TensorBoard. The default resource is: "
+ CliConstants.TENSORBOARD_DEFAULT_RESOURCES + "." +
- CAN_BE_USED_WITH_TF_ONLY);
+ " " + TENSORFLOW_ONLY);
options.addOption(CliConstants.TENSORBOARD_DOCKER_IMAGE, true,
- "Specify Tensorboard docker image. when this is not "
- + "specified, Tensorboard " + "uses --" + CliConstants.DOCKER_IMAGE
- + " as default." +
- CAN_BE_USED_WITH_TF_ONLY);
+ getDockerImageMessage(TENSORBOARD) + " " + TENSORFLOW_ONLY);
+ }
+
+ private String getLaunchCommandMessage(String service) {
+ return "Launch command of the " + service + ", arguments will be "
+ + "directly used to launch the " + service;
+ }
+
+ private String getServiceResourceMessage(String serviceType) {
+ return "Resource of each " + serviceType + " process, for example: "
+ + "memory-mb=2048,vcores=2,yarn.io/gpu=2";
+ }
+
+ private String getNumberOfServiceMessage(String serviceType,
+ int defaultValue) {
+ return "Number of " + serviceType + " processes for the job. "
+ + String.format("The default value is %d.", defaultValue);
+ }
+
+ private String getDockerImageMessage(String serviceType) {
+ return "Specifies docker image for the " + serviceType + " process. "
+ + "When not specified, " + serviceType + " uses --"
+ + CliConstants.DOCKER_IMAGE + " as a default value.";
}
private void parseCommandLineAndGetRunJobParameters(String[] args)