MAPREDUCE-2137. Provide mapping between jobs of trace file and the corresponding simulated cluster's jobs in Gridmix.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/mapreduce/trunk@1128147 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/Gridmix.java b/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/Gridmix.java
index 48ae52c..b64a036 100644
--- a/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/Gridmix.java
+++ b/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/Gridmix.java
@@ -92,6 +92,20 @@
    */
   public static final String GRIDMIX_USR_RSV = "gridmix.user.resolve.class";
 
+  /**
+   * Configuration property that Gridmix sets in each simulated job's
+   * configuration; its value is the name of the corresponding original job.
+   * Gridmix users cannot configure this property.
+   */
+  public static final String ORIGINAL_JOB_NAME =
+      "gridmix.job.original-job-name";
+  /**
+   * Configuration property that Gridmix sets in each simulated job's
+   * configuration; its value is the id of the corresponding original job.
+   * Gridmix users cannot configure this property.
+   */
+  public static final String ORIGINAL_JOB_ID = "gridmix.job.original-job-id";
+
   private DistributedCacheEmulator distCacheEmulator;
 
   // Submit data structures
diff --git a/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/GridmixJob.java b/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/GridmixJob.java
index d94e8fa..0ce5e76 100644
--- a/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/GridmixJob.java
+++ b/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/GridmixJob.java
@@ -54,16 +54,17 @@
  */
 abstract class GridmixJob implements Callable<Job>, Delayed {
 
-  public static final String JOBNAME = "GRIDMIX";
-  public static final String ORIGNAME = "gridmix.job.name.original";
+  // Gridmix job name format is GRIDMIX<6 digit sequence number>
+  public static final String JOB_NAME_PREFIX = "GRIDMIX";
   public static final Log LOG = LogFactory.getLog(GridmixJob.class);
 
   private static final ThreadLocal<Formatter> nameFormat =
     new ThreadLocal<Formatter>() {
       @Override
       protected Formatter initialValue() {
-        final StringBuilder sb = new StringBuilder(JOBNAME.length() + 5);
-        sb.append(JOBNAME);
+        final StringBuilder sb =
+            new StringBuilder(JOB_NAME_PREFIX.length() + 6);
+        sb.append(JOB_NAME_PREFIX);
         return new Formatter(sb);
       }
     };
@@ -95,18 +96,21 @@
     this.jobdesc = jobdesc;
     this.seq = seq;
 
-    ((StringBuilder)nameFormat.get().out()).setLength(JOBNAME.length());
+    ((StringBuilder)nameFormat.get().out()).setLength(JOB_NAME_PREFIX.length());
     try {
       job = this.ugi.doAs(new PrivilegedExceptionAction<Job>() {
         public Job run() throws IOException {
-          Job ret = 
-            new Job(conf, 
-                    nameFormat.get().format("%05d", seq).toString());
-          ret.getConfiguration().setInt(GRIDMIX_JOB_SEQ, seq);
+
           String jobId = null == jobdesc.getJobID() 
                          ? "<unknown>" 
                          : jobdesc.getJobID().toString();
-          ret.getConfiguration().set(ORIGNAME, jobId);
+          Job ret = new Job(conf,
+                            nameFormat.get().format("%06d", seq).toString());
+          ret.getConfiguration().setInt(GRIDMIX_JOB_SEQ, seq);
+
+          ret.getConfiguration().set(Gridmix.ORIGINAL_JOB_ID, jobId);
+          ret.getConfiguration().set(Gridmix.ORIGINAL_JOB_NAME,
+                                     jobdesc.getName());
           if (conf.getBoolean(GRIDMIX_USE_QUEUE_IN_TRACE, false)) {
             setJobQueue(ret, jobdesc.getQueueName());
           } else {
diff --git a/src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/DebugJobProducer.java b/src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/DebugJobProducer.java
index c1e433c..fca29af 100644
--- a/src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/DebugJobProducer.java
+++ b/src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/DebugJobProducer.java
@@ -146,7 +146,7 @@
       final long seed = r.nextLong();
       r.setSeed(seed);
       id = seq.getAndIncrement();
-      name = String.format("MOCKJOB%05d", id);
+      name = String.format("MOCKJOB%06d", id);
 
       this.conf = conf;
       LOG.info(name + " (" + seed + ")");
diff --git a/src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/TestGridmixSubmission.java b/src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/TestGridmixSubmission.java
index 17c6738..2815f24 100644
--- a/src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/TestGridmixSubmission.java
+++ b/src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/TestGridmixSubmission.java
@@ -17,9 +17,9 @@
  */
 package org.apache.hadoop.mapred.gridmix;
 
-import java.io.FileInputStream;
 import java.io.InputStream;
 import java.io.IOException;
+import java.text.DecimalFormat;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
@@ -41,6 +41,7 @@
 import org.apache.hadoop.mapred.JobID;
 import org.apache.hadoop.mapred.TaskReport;
 import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.MRJobConfig;
 import org.apache.hadoop.mapreduce.TaskType;
 import org.apache.hadoop.tools.rumen.JobStory;
 import org.apache.hadoop.tools.rumen.JobStoryProducer;
@@ -110,18 +111,10 @@
       final JobClient client = new JobClient(
         GridmixTestUtils.mrCluster.createJobConf());
       for (Job job : succeeded) {
-        final String jobname = job.getJobName();
-        if (GenerateData.JOB_NAME.equals(jobname)) {
-          if (!job.getConfiguration().getBoolean(
-            GridmixJob.GRIDMIX_USE_QUEUE_IN_TRACE, true)) {
-            assertEquals(" Improper queue for " + job.getJobName(),
-                         job.getConfiguration().get("mapred.job.queue.name"), 
-                         "q1");
-          } else {
-            assertEquals(" Improper queue for " + job.getJobName(),
-                         job.getConfiguration().get("mapred.job.queue.name"), 
-                         "default");
-          }
+        final String jobName = job.getJobName();
+        Configuration conf = job.getConfiguration();
+        if (GenerateData.JOB_NAME.equals(jobName)) {
+          verifyQueue(conf, jobName);
           final Path in = new Path("foo").makeQualified(GridmixTestUtils.dfs);
           final Path out = new Path("/gridmix").makeQualified(GridmixTestUtils.dfs);
           final ContentSummary generated = GridmixTestUtils.dfs.getContentSummary(in);
@@ -131,29 +124,48 @@
           FileStatus[] outstat = GridmixTestUtils.dfs.listStatus(out);
           assertEquals("Mismatched job count", NJOBS, outstat.length);
           continue;
+        } else if (GenerateDistCacheData.JOB_NAME.equals(jobName)) {
+          verifyQueue(conf, jobName);
+          continue;
         }
         
-        if (!job.getConfiguration().getBoolean(
+        if (!conf.getBoolean(
           GridmixJob.GRIDMIX_USE_QUEUE_IN_TRACE, true)) {
-          assertEquals(" Improper queue for  " + job.getJobName() + " " ,
-          job.getConfiguration().get("mapred.job.queue.name"),"q1" );
+          assertEquals(" Improper queue for  " + jobName + " " ,
+              conf.get(MRJobConfig.QUEUE_NAME), "q1" );
         } else {
-          assertEquals(" Improper queue for  " + job.getJobName() + " ",
-                       job.getConfiguration().get("mapred.job.queue.name"), 
-                       sub.get(job.getConfiguration().get(GridmixJob.ORIGNAME))
-                          .getQueueName());
+          assertEquals(" Improper queue for  " + jobName + " ",
+              conf.get(MRJobConfig.QUEUE_NAME),
+              sub.get(conf.get(Gridmix.ORIGINAL_JOB_ID)).getQueueName());
         }
 
-        final JobStory spec =
-          sub.get(job.getConfiguration().get(GridmixJob.ORIGNAME));
-        assertNotNull("No spec for " + job.getJobName(), spec);
-        assertNotNull("No counters for " + job.getJobName(), job.getCounters());
-        final String specname = spec.getName();
+        final String originalJobId = conf.get(Gridmix.ORIGINAL_JOB_ID);
+        final JobStory spec = sub.get(originalJobId);
+        assertNotNull("No spec for " + jobName, spec);
+        assertNotNull("No counters for " + jobName, job.getCounters());
+        final String originalJobName = spec.getName();
+        System.out.println("originalJobName=" + originalJobName
+            + ";GridmixJobName=" + jobName + ";originalJobID=" + originalJobId);
+        assertTrue("Original job name is wrong.", originalJobName.equals(
+            conf.get(Gridmix.ORIGINAL_JOB_NAME)));
+
+        // Gridmix job seqNum contains 6 digits
+        int seqNumLength = 6;
+        String jobSeqNum = new DecimalFormat("000000").format(
+            conf.getInt(GridmixJob.GRIDMIX_JOB_SEQ, -1));
+        // Original job name is of the format MOCKJOB<6 digit sequence number>
+        // because MockJob jobNames are of this format.
+        assertTrue(originalJobName.substring(
+            originalJobName.length() - seqNumLength).equals(jobSeqNum));
+
+        assertTrue("Gridmix job name is not in the expected format.",
+            jobName.equals(
+            GridmixJob.JOB_NAME_PREFIX + jobSeqNum));
+
         final FileStatus stat = 
           GridmixTestUtils.dfs.getFileStatus(
-            new Path(GridmixTestUtils.DEST, 
-            "" + Integer.valueOf(specname.substring(specname.length() - 5))));
-        assertEquals("Wrong owner for " + job.getJobName(), spec.getUser(),
+            new Path(GridmixTestUtils.DEST, "" + Integer.valueOf(jobSeqNum)));
+        assertEquals("Wrong owner for " + jobName, spec.getUser(),
                      stat.getOwner());
 
         final int nMaps = spec.getNumberMaps();
@@ -162,7 +174,7 @@
         // TODO Blocked by MAPREDUCE-118
         if (true) return;
         // TODO
-        System.out.println(jobname + ": " + nMaps + "/" + nReds);
+        System.out.println(jobName + ": " + nMaps + "/" + nReds);
         final TaskReport[] mReports =
           client.getMapTaskReports(JobID.downgrade(job.getJobID()));
         assertEquals("Mismatched map count", nMaps, mReports.length);
@@ -177,6 +189,18 @@
       }
     }
 
+    // Asserts that the job's configured queue is the one this test expects.
+    private void verifyQueue(Configuration conf, String jobName) {
+      // When GRIDMIX_USE_QUEUE_IN_TRACE is false the job must be in "q1";
+      // otherwise (the flag is read with a default of true) in "default".
+      final String expectedQueue =
+          conf.getBoolean(GridmixJob.GRIDMIX_USE_QUEUE_IN_TRACE, true)
+              ? "default" : "q1";
+      // Expected value goes first so a failure reports the values correctly.
+      assertEquals(" Improper queue for " + jobName,
+          expectedQueue, conf.get(MRJobConfig.QUEUE_NAME));
+    }
+
     public void check(final TaskType type, Job job, JobStory spec,
           final TaskReport[] runTasks,
           long extraInputBytes, int extraInputRecords,
diff --git a/src/docs/src/documentation/content/xdocs/gridmix.xml b/src/docs/src/documentation/content/xdocs/gridmix.xml
index 2ce5b1e..3c72713 100644
--- a/src/docs/src/documentation/content/xdocs/gridmix.xml
+++ b/src/docs/src/documentation/content/xdocs/gridmix.xml
@@ -56,8 +56,8 @@
 	<li>Use GridMix with the job trace on the benchmark cluster.</li>
       </ol>
       <p>Jobs submitted by GridMix have names of the form
-      &quot;<code>GRIDMIXnnnnn</code>&quot;, where
-      &quot;<code>nnnnn</code>&quot; is a sequence number padded with leading
+      &quot;<code>GRIDMIXnnnnnn</code>&quot;, where
+      &quot;<code>nnnnnn</code>&quot; is a sequence number padded with leading
       zeroes.</p>
     </section>
     <section id="usage">
@@ -540,6 +540,36 @@
     </ul>
   </section>
 
+    <section id="simulatedjobconf">
+      <title>Configuration of Simulated Jobs</title>
+      <p> GridMix sets some configuration properties in each simulated job
+      that it submits so that the job can be mapped back to the corresponding
+      job in the input job trace. These configuration parameters include:
+      </p>
+      <table>
+        <tr>
+          <th>Parameter</th>
+          <th>Description</th>
+        </tr>
+        <tr>
+          <td>
+            <code>gridmix.job.original-job-id</code>
+          </td>
+          <td> The job id of the original cluster's job corresponding to this
+          simulated job.
+          </td>
+        </tr>
+        <tr>
+          <td>
+            <code>gridmix.job.original-job-name</code>
+          </td>
+          <td> The job name of the original cluster's job corresponding to this
+          simulated job.
+          </td>
+        </tr>
+      </table>
+    </section>
+
     <section id="assumptions">
       <title>Simplifying Assumptions</title>
       <p>GridMix will be developed in stages, incorporating feedback and