MAPREDUCE-1548. Hadoop archives preserve times and other properties from 
original files. (Rodrigo Schmidt via dhruba)



git-svn-id: https://svn.apache.org/repos/asf/hadoop/mapreduce/trunk@1000310 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/CHANGES.txt b/CHANGES.txt
index 6462a65..3f7ba0b 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -127,6 +127,9 @@
     MAPREDUCE-1881. Improve TaskTrackerInstrumentation to enable collection of
     advanced metrics. (Matei Zaharia via acmurthy)
 
+    MAPREDUCE-1548. Hadoop archives preserve times and other properties from 
+    original files. (Rodrigo Schmidt via dhruba)
+
   OPTIMIZATIONS
 
     MAPREDUCE-1354. Enhancements to JobTracker for better performance and
diff --git a/src/test/mapred/org/apache/hadoop/cli/testMRConf.xml b/src/test/mapred/org/apache/hadoop/cli/testMRConf.xml
index dd08d95..46ddc3d 100644
--- a/src/test/mapred/org/apache/hadoop/cli/testMRConf.xml
+++ b/src/test/mapred/org/apache/hadoop/cli/testMRConf.xml
@@ -104,7 +104,7 @@
       <comparators>
         <comparator>
           <type>SubstringComparator</type>
-          <expected-output>The resolved paths is empty.</expected-output>
+          <expected-output>The resolved paths set is empty.</expected-output>
         </comparator>
       </comparators>
     </test>
@@ -407,7 +407,7 @@
       <comparators>
         <comparator>
           <type>SubstringComparator</type>
-          <expected-output>The resolved paths is empty.</expected-output>
+          <expected-output>The resolved paths set is empty.</expected-output>
          </comparator>
        </comparators>
     </test>
diff --git a/src/test/mapred/org/apache/hadoop/tools/TestHarFileSystem.java b/src/test/mapred/org/apache/hadoop/tools/TestHarFileSystem.java
index ec4cd8b..e7e6dc8 100644
--- a/src/test/mapred/org/apache/hadoop/tools/TestHarFileSystem.java
+++ b/src/test/mapred/org/apache/hadoop/tools/TestHarFileSystem.java
@@ -135,6 +135,30 @@
     assertTrue("strings are equal ", (b[0] == "c".getBytes()[0]));
   }
 
+  private void checkProperties(Path harPath, Configuration conf) throws IOException {
+    Path harFilea = new Path(harPath, "a");
+    Path harFileb = new Path(harPath, "b");
+    Path harFilec = new Path(harPath, "c c");
+    FileSystem harFs = harFilea.getFileSystem(conf);
+
+    Path nonharFilea = new Path(inputPath, "a");
+    Path nonharFileb = new Path(inputPath, "b");
+    Path nonharFilec = new Path(inputPath, "c c");
+    FileSystem nonharFs = nonharFilea.getFileSystem(conf);
+
+    assertEquals("Modification times do not match for a",
+        harFs.getFileStatus(harFilea).getModificationTime(),
+        nonharFs.getFileStatus(nonharFilea).getModificationTime());
+
+    assertEquals("Modification times do not match for b",
+        harFs.getFileStatus(harFileb).getModificationTime(),
+        nonharFs.getFileStatus(nonharFileb).getModificationTime());
+
+    assertEquals("Modification times do not match for c",
+        harFs.getFileStatus(harFilec).getModificationTime(),
+        nonharFs.getFileStatus(nonharFilec).getModificationTime());
+  }
+
   /**
    * check if the block size of the part files is what we had specified
    */
@@ -182,6 +206,7 @@
       // fileb and filec
       assertTrue(ret == 0);
       checkBytes(harPath, conf);
+      checkProperties(harPath, conf);
       /* check block size for path files */
       checkBlockSize(fs, finalPath, 512 * 1024 * 1024l);
     }
@@ -220,6 +245,7 @@
       // fileb and filec
       assertTrue(ret == 0);
       checkBytes(harPath, conf);
+      checkProperties(harPath, conf);
       checkBlockSize(fs, finalPath, 512);
     }
   }
diff --git a/src/tools/org/apache/hadoop/fs/HarFileSystem.java b/src/tools/org/apache/hadoop/fs/HarFileSystem.java
index e9e7dc7..d07c8f7 100644
--- a/src/tools/org/apache/hadoop/fs/HarFileSystem.java
+++ b/src/tools/org/apache/hadoop/fs/HarFileSystem.java
@@ -49,7 +49,7 @@
  */
 
 public class HarFileSystem extends FilterFileSystem {
-  public static final int VERSION = 2;
+  public static final int VERSION = 3;
   // uri representation of this Har filesystem
   private URI uri;
   // the version of this har filesystem
@@ -218,11 +218,16 @@
     return tmp;
   }
   
+  private static String decodeString(String str)
+    throws UnsupportedEncodingException {
+    return URLDecoder.decode(str, "UTF-8");
+  }
+
   private String decodeFileName(String fname) 
     throws UnsupportedEncodingException {
     
-    if (version == 2){
-      return URLDecoder.decode(fname, "UTF-8");
+    if (version == 2 || version == 3){
+      return decodeString(fname);
     }
     return fname;
   }
@@ -515,14 +520,21 @@
       }
     }
 
+    long modTime = 0;
+    if (version < 3) {
+      modTime = underlying.getModificationTime();
+    } else if (version == 3) {
+      modTime = h.getModificationTime();
+    }
+
     return new FileStatus(
         h.isDir()? 0L: h.getLength(),
         h.isDir(),
         underlying.getReplication(),
         underlying.getBlockSize(),
-        underlying.getModificationTime(),
+        modTime,
         underlying.getAccessTime(),
-        new FsPermission(underlying.getPermission()),
+        underlying.getPermission(),
         underlying.getOwner(),
         underlying.getGroup(),
         makeRelative(this.uri.toString(), new Path(h.name)));
@@ -540,6 +552,7 @@
     String partName;
     long startIndex;
     long length;
+    long modificationTime = 0;
     public HarStatus(String harString) throws UnsupportedEncodingException {
       String[] splits = harString.split(" ");
       this.name = decodeFileName(splits[0]);
@@ -548,11 +561,36 @@
       this.partName = splits[2];
       this.startIndex = Long.parseLong(splits[3]);
       this.length = Long.parseLong(splits[4]);
+
+      String[] propSplits = null;
+      // propSplits is used to retrieve the metainformation that Har versions
+      // 1 & 2 missed (modification time, permission, owner, group).
+      // These fields are stored in an encoded string placed in different
+      // locations depending on whether it's a file or directory entry.
+      // If it's a directory, the string will be placed at the partName
+      // location (directories have no partName because they don't have data
+      // to be stored). This is done because the number of fields in a
+      // directory entry is unbounded (all children are listed at the end).
+      // If it's a file, the string will be the last field.
       if (isDir) {
+        if (version == 3){
+          propSplits = decodeString(this.partName).split(" ");
+        }
         children = new ArrayList<String>();
         for (int i = 5; i < splits.length; i++) {
           children.add(decodeFileName(splits[i]));
         }
+      } else if (version == 3) {
+        propSplits = decodeString(splits[5]).split(" ");
+      }
+
+      if (propSplits != null && propSplits.length >= 4) {
+        modificationTime = Long.parseLong(propSplits[0]);
+        // the fields below are stored in the file but are currently not used
+        // by HarFileSystem
+        // permission = new FsPermission(Short.parseShort(propSplits[1]));
+        // owner = decodeString(propSplits[2]);
+        // group = decodeString(propSplits[3]);
       }
     }
     public boolean isDir() {
@@ -578,6 +616,9 @@
     public long getLength() {
       return length;
     }
+    public long getModificationTime() {
+      return modificationTime;
+    }
   }
   
   /**
diff --git a/src/tools/org/apache/hadoop/tools/HadoopArchives.java b/src/tools/org/apache/hadoop/tools/HadoopArchives.java
index 3f757c7..a3bb25c 100644
--- a/src/tools/org/apache/hadoop/tools/HadoopArchives.java
+++ b/src/tools/org/apache/hadoop/tools/HadoopArchives.java
@@ -23,7 +23,6 @@
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.UnsupportedEncodingException;
-import java.net.URLDecoder;
 import java.net.URLEncoder;
 import java.util.ArrayList;
 import java.util.HashSet;
@@ -77,7 +76,7 @@
  * Hadoop archives look at {@link HarFileSystem}.
  */
 public class HadoopArchives implements Tool {
-  public static final int VERSION = 2;
+  public static final int VERSION = 3;
   private static final Log LOG = LogFactory.getLog(HadoopArchives.class);
   
   private static final String NAME = "har"; 
@@ -643,6 +642,16 @@
       return URLEncoder.encode(s,"UTF-8");
     }
 
+    private static String encodeProperties( FileStatus fStatus )
+      throws UnsupportedEncodingException {
+      String propStr = encodeName(
+          fStatus.getModificationTime() + " "
+        + fStatus.getPermission().toShort() + " "
+        + encodeName(fStatus.getOwner()) + " "
+        + encodeName(fStatus.getGroup()));
+      return propStr;
+    }
+
     // read files from the split input 
     // and write it onto the part files.
     // also output hash(name) and string 
@@ -653,11 +662,15 @@
         Reporter reporter) throws IOException {
       Path relPath = new Path(value.path);
       int hash = HarFileSystem.getHarHash(relPath);
-      String towrite = encodeName(relPath.toString());
+      String towrite = null;
       Path srcPath = realPath(relPath, rootPath);
       long startPos = partStream.getPos();
+      FileSystem srcFs = srcPath.getFileSystem(conf);
+      FileStatus srcStatus = srcFs.getFileStatus(srcPath);
+      String propStr = encodeProperties(srcStatus);
       if (value.isDir()) { 
-        towrite += " dir none 0 0 ";
+        towrite = encodeName(relPath.toString())
+                  + " dir " + propStr + " 0 0 ";
         StringBuffer sbuff = new StringBuffer();
         sbuff.append(towrite);
         for (String child: value.children) {
@@ -668,14 +681,13 @@
         reporter.progress();
       }
       else {
-        FileSystem srcFs = srcPath.getFileSystem(conf);
-        FileStatus srcStatus = srcFs.getFileStatus(srcPath);
         FSDataInputStream input = srcFs.open(srcStatus.getPath());
         reporter.setStatus("Copying file " + srcStatus.getPath() + 
             " to archive.");
         copyData(srcStatus.getPath(), input, partStream, reporter);
-        towrite += " file " + partname + " " + startPos
-        + " " + srcStatus.getLen() + " ";
+        towrite = encodeName(relPath.toString())
+                  + " file " + partname + " " + startPos
+                  + " " + srcStatus.getLen() + " " + propStr + " ";
       }
       out.collect(new IntWritable(hash), new Text(towrite));
     }
@@ -842,7 +854,7 @@
         }
       }
       if (globPaths.isEmpty()) {
-        throw new IOException("The resolved paths is empty."
+        throw new IOException("The resolved paths set is empty."
             + "  Please check whether the srcPaths exist, where srcPaths = "
             + srcPaths);
       }