MAPREDUCE-1548. Hadoop archives preserve times and other properties from
original files. (Rodrigo Schmidt via dhruba)
git-svn-id: https://svn.apache.org/repos/asf/hadoop/mapreduce/trunk@1000310 13f79535-47bb-0310-9956-ffa450edef68
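A minimal usage sketch of the new behavior (the archive path and the file name "a" below are hypothetical, used only for illustration and not part of this patch): with a version 3 archive, HarFileSystem reports the modification time recorded in the archive index rather than the part file's own time.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HarModTimeExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // "archive.har" and the file "a" inside it are assumed for illustration.
    Path inHar = new Path("har:///user/foo/archive.har/a");
    FileSystem harFs = inHar.getFileSystem(conf);
    FileStatus status = harFs.getFileStatus(inHar);
    // With VERSION >= 3 this is the original file's modification time,
    // preserved when the archive was created.
    System.out.println(status.getModificationTime());
  }
}
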
diff --git a/CHANGES.txt b/CHANGES.txt
index 6462a65..3f7ba0b 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -127,6 +127,9 @@
MAPREDUCE-1881. Improve TaskTrackerInstrumentation to enable collection of
advanced metrics. (Matei Zaharia via acmurthy)
+ MAPREDUCE-1548. Hadoop archives preserve times and other properties from
+ original files. (Rodrigo Schmidt via dhruba)
+
OPTIMIZATIONS
MAPREDUCE-1354. Enhancements to JobTracker for better performance and
diff --git a/src/test/mapred/org/apache/hadoop/cli/testMRConf.xml b/src/test/mapred/org/apache/hadoop/cli/testMRConf.xml
index dd08d95..46ddc3d 100644
--- a/src/test/mapred/org/apache/hadoop/cli/testMRConf.xml
+++ b/src/test/mapred/org/apache/hadoop/cli/testMRConf.xml
@@ -104,7 +104,7 @@
<comparators>
<comparator>
<type>SubstringComparator</type>
- <expected-output>The resolved paths is empty.</expected-output>
+ <expected-output>The resolved paths set is empty.</expected-output>
</comparator>
</comparators>
</test>
@@ -407,7 +407,7 @@
<comparators>
<comparator>
<type>SubstringComparator</type>
- <expected-output>The resolved paths is empty.</expected-output>
+ <expected-output>The resolved paths set is empty.</expected-output>
</comparator>
</comparators>
</test>
diff --git a/src/test/mapred/org/apache/hadoop/tools/TestHarFileSystem.java b/src/test/mapred/org/apache/hadoop/tools/TestHarFileSystem.java
index ec4cd8b..e7e6dc8 100644
--- a/src/test/mapred/org/apache/hadoop/tools/TestHarFileSystem.java
+++ b/src/test/mapred/org/apache/hadoop/tools/TestHarFileSystem.java
@@ -135,6 +135,30 @@
assertTrue("strings are equal ", (b[0] == "c".getBytes()[0]));
}
+ private void checkProperties(Path harPath, Configuration conf) throws IOException {
+ Path harFilea = new Path(harPath, "a");
+ Path harFileb = new Path(harPath, "b");
+ Path harFilec = new Path(harPath, "c c");
+ FileSystem harFs = harFilea.getFileSystem(conf);
+
+ Path nonharFilea = new Path(inputPath, "a");
+ Path nonharFileb = new Path(inputPath, "b");
+ Path nonharFilec = new Path(inputPath, "c c");
+ FileSystem nonharFs = nonharFilea.getFileSystem(conf);
+
+ assertEquals("Modification times do not match for a",
+ harFs.getFileStatus(harFilea).getModificationTime(),
+ nonharFs.getFileStatus(nonharFilea).getModificationTime());
+
+ assertEquals("Modification times do not match for b",
+ harFs.getFileStatus(harFileb).getModificationTime(),
+ nonharFs.getFileStatus(nonharFileb).getModificationTime());
+
+ assertEquals("Modification times do not match for c",
+ harFs.getFileStatus(harFilec).getModificationTime(),
+ nonharFs.getFileStatus(nonharFilec).getModificationTime());
+ }
+
/**
* check if the block size of the part files is what we had specified
*/
@@ -182,6 +206,7 @@
// fileb and filec
assertTrue(ret == 0);
checkBytes(harPath, conf);
+ checkProperties(harPath, conf);
/* check block size for path files */
checkBlockSize(fs, finalPath, 512 * 1024 * 1024l);
}
@@ -220,6 +245,7 @@
// fileb and filec
assertTrue(ret == 0);
checkBytes(harPath, conf);
+ checkProperties(harPath, conf);
checkBlockSize(fs, finalPath, 512);
}
}
diff --git a/src/tools/org/apache/hadoop/fs/HarFileSystem.java b/src/tools/org/apache/hadoop/fs/HarFileSystem.java
index e9e7dc7..d07c8f7 100644
--- a/src/tools/org/apache/hadoop/fs/HarFileSystem.java
+++ b/src/tools/org/apache/hadoop/fs/HarFileSystem.java
@@ -49,7 +49,7 @@
*/
public class HarFileSystem extends FilterFileSystem {
- public static final int VERSION = 2;
+ public static final int VERSION = 3;
// uri representation of this Har filesystem
private URI uri;
// the version of this har filesystem
@@ -218,11 +218,16 @@
return tmp;
}
+ private static String decodeString(String str)
+ throws UnsupportedEncodingException {
+ return URLDecoder.decode(str, "UTF-8");
+ }
+
private String decodeFileName(String fname)
throws UnsupportedEncodingException {
- if (version == 2){
- return URLDecoder.decode(fname, "UTF-8");
+ if (version == 2 || version == 3){
+ return decodeString(fname);
}
return fname;
}
@@ -515,14 +520,21 @@
}
}
+ long modTime = 0;
+ if (version < 3) {
+ modTime = underlying.getModificationTime();
+ } else if (version == 3) {
+ modTime = h.getModificationTime();
+ }
+
return new FileStatus(
h.isDir()? 0L: h.getLength(),
h.isDir(),
underlying.getReplication(),
underlying.getBlockSize(),
- underlying.getModificationTime(),
+ modTime,
underlying.getAccessTime(),
- new FsPermission(underlying.getPermission()),
+ underlying.getPermission(),
underlying.getOwner(),
underlying.getGroup(),
makeRelative(this.uri.toString(), new Path(h.name)));
@@ -540,6 +552,7 @@
String partName;
long startIndex;
long length;
+ long modificationTime = 0;
public HarStatus(String harString) throws UnsupportedEncodingException {
String[] splits = harString.split(" ");
this.name = decodeFileName(splits[0]);
@@ -548,11 +561,36 @@
this.partName = splits[2];
this.startIndex = Long.parseLong(splits[3]);
this.length = Long.parseLong(splits[4]);
+
+ String[] propSplits = null;
+ // propSplits is used to retrieve the metadata (modification time,
+ // permission, owner, group) that Har versions 1 & 2 did not record.
+ // These fields are stored in an encoded string placed in different
+ // locations depending on whether it's a file or a directory entry.
+ // If it's a directory, the string is placed in the partName
+ // location (directories have no partName because they have no data
+ // to store). This is done because the number of fields in a
+ // directory entry is unbounded (all children are listed at the end).
+ // If it's a file, the string is the last field.
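+ // For example (illustrative values only), a version 3 file entry
+ // looks like
+ //   <encoded name> file part-0 0 1024 <encoded "modTime perm owner group">
+ // while a directory entry looks like
+ //   <encoded name> dir <encoded "modTime perm owner group"> 0 0 <children...>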
if (isDir) {
+ if (version == 3){
+ propSplits = decodeString(this.partName).split(" ");
+ }
children = new ArrayList<String>();
for (int i = 5; i < splits.length; i++) {
children.add(decodeFileName(splits[i]));
}
+ } else if (version == 3) {
+ propSplits = decodeString(splits[5]).split(" ");
+ }
+
+ if (propSplits != null && propSplits.length >= 4) {
+ modificationTime = Long.parseLong(propSplits[0]);
+ // the fields below are stored in the file but are currently not used
+ // by HarFileSystem
+ // permission = new FsPermission(Short.parseShort(propSplits[1]));
+ // owner = decodeString(propSplits[2]);
+ // group = decodeString(propSplits[3]);
}
}
public boolean isDir() {
@@ -578,6 +616,9 @@
public long getLength() {
return length;
}
+ public long getModificationTime() {
+ return modificationTime;
+ }
}
/**
diff --git a/src/tools/org/apache/hadoop/tools/HadoopArchives.java b/src/tools/org/apache/hadoop/tools/HadoopArchives.java
index 3f757c7..a3bb25c 100644
--- a/src/tools/org/apache/hadoop/tools/HadoopArchives.java
+++ b/src/tools/org/apache/hadoop/tools/HadoopArchives.java
@@ -23,7 +23,6 @@
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
-import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashSet;
@@ -77,7 +76,7 @@
* Hadoop archives look at {@link HarFileSystem}.
*/
public class HadoopArchives implements Tool {
- public static final int VERSION = 2;
+ public static final int VERSION = 3;
private static final Log LOG = LogFactory.getLog(HadoopArchives.class);
private static final String NAME = "har";
@@ -643,6 +642,16 @@
return URLEncoder.encode(s,"UTF-8");
}
+ private static String encodeProperties(FileStatus fStatus)
+ throws UnsupportedEncodingException {
+ String propStr = encodeName(
+ fStatus.getModificationTime() + " "
+ + fStatus.getPermission().toShort() + " "
+ + encodeName(fStatus.getOwner()) + " "
+ + encodeName(fStatus.getGroup()));
+ return propStr;
+ }
+
// read files from the split input
// and write it onto the part files.
// also output hash(name) and string
@@ -653,11 +662,15 @@
Reporter reporter) throws IOException {
Path relPath = new Path(value.path);
int hash = HarFileSystem.getHarHash(relPath);
- String towrite = encodeName(relPath.toString());
+ String towrite = null;
Path srcPath = realPath(relPath, rootPath);
long startPos = partStream.getPos();
+ FileSystem srcFs = srcPath.getFileSystem(conf);
+ FileStatus srcStatus = srcFs.getFileStatus(srcPath);
+ String propStr = encodeProperties(srcStatus);
if (value.isDir()) {
- towrite += " dir none 0 0 ";
+ towrite = encodeName(relPath.toString())
+ + " dir " + propStr + " 0 0 ";
StringBuffer sbuff = new StringBuffer();
sbuff.append(towrite);
for (String child: value.children) {
@@ -668,14 +681,13 @@
reporter.progress();
}
else {
- FileSystem srcFs = srcPath.getFileSystem(conf);
- FileStatus srcStatus = srcFs.getFileStatus(srcPath);
FSDataInputStream input = srcFs.open(srcStatus.getPath());
reporter.setStatus("Copying file " + srcStatus.getPath() +
" to archive.");
copyData(srcStatus.getPath(), input, partStream, reporter);
- towrite += " file " + partname + " " + startPos
- + " " + srcStatus.getLen() + " ";
+ towrite = encodeName(relPath.toString())
+ + " file " + partname + " " + startPos
+ + " " + srcStatus.getLen() + " " + propStr + " ";
}
out.collect(new IntWritable(hash), new Text(towrite));
}
@@ -842,7 +854,7 @@
}
}
if (globPaths.isEmpty()) {
- throw new IOException("The resolved paths is empty."
+ throw new IOException("The resolved paths set is empty."
+ " Please check whether the srcPaths exist, where srcPaths = "
+ srcPaths);
}