blob: c2393c6981075f245bd85ecbc6ce94f049bdb185 [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.fs;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;
import org.apache.hadoop.util.Progressable;
/**
* This is an implementation of the Hadoop Archive
* Filesystem. This archive Filesystem has index files
* of the form _index* and has contents of the form
* part-*. The index files store the indexes of the
* real files. The index files are of the form _masterindex
* and _index. The master index is a level of indirection
* in to the index file to make the look ups faster. the index
* file is sorted with hash code of the paths that it contains
* and the master index contains pointers to the positions in
* index for ranges of hashcodes.
*/
public class HarFileSystem extends FilterFileSystem {
public static final int VERSION = 1;
// uri representation of this Har filesystem
private URI uri;
// the version of this har filesystem
private int version;
// underlying uri
private URI underLyingURI;
// the top level path of the archive
// in the underlying file system
private Path archivePath;
// the masterIndex of the archive
private Path masterIndex;
// the index file
private Path archiveIndex;
// the har auth
private String harAuth;
/**
* public construction of harfilesystem
*
*/
public HarFileSystem() {
}
/**
* Constructor to create a HarFileSystem with an
* underlying filesystem.
* @param fs
*/
public HarFileSystem(FileSystem fs) {
super(fs);
}
/**
* Initialize a Har filesystem per har archive. The
* archive home directory is the top level directory
* in the filesystem that contains the HAR archive.
* Be careful with this method, you do not want to go
* on creating new Filesystem instances per call to
* path.getFileSystem().
* the uri of Har is
* har://underlyingfsscheme-host:port/archivepath.
* or
* har:///archivepath. This assumes the underlying filesystem
* to be used in case not specified.
*/
public void initialize(URI name, Configuration conf) throws IOException {
//decode the name
underLyingURI = decodeHarURI(name, conf);
// we got the right har Path- now check if this is
//truly a har filesystem
Path harPath = archivePath(new Path(name.toString()));
if (harPath == null) {
throw new IOException("Invalid path for the Har Filesystem. " +
name.toString());
}
if (fs == null) {
fs = FileSystem.get(underLyingURI, conf);
}
this.uri = harPath.toUri();
this.archivePath = new Path(this.uri.getPath());
this.harAuth = getHarAuth(this.underLyingURI);
//check for the underlying fs containing
// the index file
this.masterIndex = new Path(archivePath, "_masterindex");
this.archiveIndex = new Path(archivePath, "_index");
if (!fs.exists(masterIndex) || !fs.exists(archiveIndex)) {
throw new IOException("Invalid path for the Har Filesystem. " +
"No index file in " + harPath);
}
try{
this.version = getHarVersion();
} catch(IOException io) {
throw new IOException("Unable to " +
"read the version of the Har file system: " + this.archivePath);
}
if (this.version != HarFileSystem.VERSION) {
throw new IOException("Invalid version " +
this.version + " expected " + HarFileSystem.VERSION);
}
}
// get the version of the filesystem from the masterindex file
// the version is currently not useful since its the first version
// of archives
public int getHarVersion() throws IOException {
FSDataInputStream masterIn = fs.open(masterIndex);
LineReader lmaster = new LineReader(masterIn, getConf());
Text line = new Text();
lmaster.readLine(line);
try {
masterIn.close();
} catch(IOException e){
//disregard it.
// its a read.
}
String versionLine = line.toString();
String[] arr = versionLine.split(" ");
int version = Integer.parseInt(arr[0]);
return version;
}
/*
* find the parent path that is the
* archive path in the path. The last
* path segment that ends with .har is
* the path that will be returned.
*/
private Path archivePath(Path p) {
Path retPath = null;
Path tmp = p;
for (int i=0; i< p.depth(); i++) {
if (tmp.toString().endsWith(".har")) {
retPath = tmp;
break;
}
tmp = tmp.getParent();
}
return retPath;
}
/**
* decode the raw URI to get the underlying URI
* @param rawURI raw Har URI
* @return filtered URI of the underlying fileSystem
*/
private URI decodeHarURI(URI rawURI, Configuration conf) throws IOException {
String tmpAuth = rawURI.getAuthority();
//we are using the default file
//system in the config
//so create a underlying uri and
//return it
if (tmpAuth == null) {
//create a path
return FileSystem.getDefaultUri(conf);
}
String host = rawURI.getHost();
String[] str = host.split("-", 2);
if (str[0] == null) {
throw new IOException("URI: " + rawURI + " is an invalid Har URI.");
}
String underLyingScheme = str[0];
String underLyingHost = (str.length > 1)? str[1]:null;
int underLyingPort = rawURI.getPort();
String auth = (underLyingHost == null && underLyingPort == -1)?
null:(underLyingHost+":"+underLyingPort);
URI tmp = null;
if (rawURI.getQuery() != null) {
// query component not allowed
throw new IOException("query component in Path not supported " + rawURI);
}
try {
tmp = new URI(underLyingScheme, auth, rawURI.getPath(),
rawURI.getQuery(), rawURI.getFragment());
} catch (URISyntaxException e) {
// do nothing should not happen
}
return tmp;
}
/**
* return the top level archive.
*/
public Path getWorkingDirectory() {
return new Path(uri.toString());
}
/**
* Create a har specific auth
* har-underlyingfs:port
* @param underLyingURI the uri of underlying
* filesystem
* @return har specific auth
*/
private String getHarAuth(URI underLyingUri) {
String auth = underLyingUri.getScheme() + "-";
if (underLyingUri.getHost() != null) {
auth += underLyingUri.getHost() + ":";
if (underLyingUri.getPort() != -1) {
auth += underLyingUri.getPort();
}
}
else {
auth += ":";
}
return auth;
}
/**
* Returns the uri of this filesystem.
* The uri is of the form
* har://underlyingfsschema-host:port/pathintheunderlyingfs
*/
@Override
public URI getUri() {
return this.uri;
}
/**
* this method returns the path
* inside the har filesystem.
* this is relative path inside
* the har filesystem.
* @param path the fully qualified path in the har filesystem.
* @return relative path in the filesystem.
*/
private Path getPathInHar(Path path) {
Path harPath = new Path(path.toUri().getPath());
if (archivePath.compareTo(harPath) == 0)
return new Path(Path.SEPARATOR);
Path tmp = new Path(harPath.getName());
Path parent = harPath.getParent();
while (!(parent.compareTo(archivePath) == 0)) {
if (parent.toString().equals(Path.SEPARATOR)) {
tmp = null;
break;
}
tmp = new Path(parent.getName(), tmp);
parent = parent.getParent();
}
if (tmp != null)
tmp = new Path(Path.SEPARATOR, tmp);
return tmp;
}
//the relative path of p. basically
// getting rid of /. Parsing and doing
// string manipulation is not good - so
// just use the path api to do it.
private Path makeRelative(String initial, Path p) {
Path root = new Path(Path.SEPARATOR);
if (root.compareTo(p) == 0)
return new Path(initial);
Path retPath = new Path(p.getName());
Path parent = p.getParent();
for (int i=0; i < p.depth()-1; i++) {
retPath = new Path(parent.getName(), retPath);
parent = parent.getParent();
}
return new Path(initial, retPath.toString());
}
/* this makes a path qualified in the har filesystem
* (non-Javadoc)
* @see org.apache.hadoop.fs.FilterFileSystem#makeQualified(
* org.apache.hadoop.fs.Path)
*/
@Override
public Path makeQualified(Path path) {
// make sure that we just get the
// path component
Path fsPath = path;
if (!path.isAbsolute()) {
fsPath = new Path(archivePath, path);
}
URI tmpURI = fsPath.toUri();
//change this to Har uri
return new Path(uri.getScheme(), harAuth, tmpURI.getPath());
}
/**
* get block locations from the underlying fs
* @param file the input filestatus to get block locations
* @param start the start in the file
* @param len the length in the file
* @return block locations for this segment of file
* @throws IOException
*/
@Override
public BlockLocation[] getFileBlockLocations(FileStatus file, long start,
long len) throws IOException {
// need to look up the file in the underlying fs
// look up the index
// make sure this is a prt of this har filesystem
Path p = makeQualified(file.getPath());
Path harPath = getPathInHar(p);
String line = fileStatusInIndex(harPath);
if (line == null) {
throw new FileNotFoundException("File " + file.getPath() + " not found");
}
HarStatus harStatus = new HarStatus(line);
if (harStatus.isDir()) {
return new BlockLocation[0];
}
FileStatus fsFile = fs.getFileStatus(new Path(archivePath,
harStatus.getPartName()));
BlockLocation[] rawBlocks = fs.getFileBlockLocations(fsFile,
harStatus.getStartIndex() + start, len);
return fakeBlockLocations(rawBlocks, harStatus.getStartIndex());
}
/**
* fake the rawblocks since map reduce uses the block offsets to
* fo some computations regarding the blocks
* @param rawBlocks the raw blocks returned by the filesystem
* @return faked blocks with changed offsets.
*/
private BlockLocation[] fakeBlockLocations(BlockLocation[] rawBlocks,
long startIndex) {
for (BlockLocation block : rawBlocks) {
long rawOffset = block.getOffset();
block.setOffset(rawOffset - startIndex);
}
return rawBlocks;
}
/**
* the hash of the path p inside iniside
* the filesystem
* @param p the path in the harfilesystem
* @return the hash code of the path.
*/
public static int getHarHash(Path p) {
return (p.toString().hashCode() & 0x7fffffff);
}
static class Store {
public Store() {
begin = end = startHash = endHash = 0;
}
public Store(long begin, long end, int startHash, int endHash) {
this.begin = begin;
this.end = end;
this.startHash = startHash;
this.endHash = endHash;
}
public long begin;
public long end;
public int startHash;
public int endHash;
}
// make sure that this harPath is relative to the har filesystem
// this only works for relative paths. This returns the line matching
// the file in the index. Returns a null if there is not matching
// filename in the index file.
private String fileStatusInIndex(Path harPath) throws IOException {
// read the index file
int hashCode = getHarHash(harPath);
// get the master index to find the pos
// in the index file
FSDataInputStream in = fs.open(masterIndex);
FileStatus masterStat = fs.getFileStatus(masterIndex);
LineReader lin = new LineReader(in, getConf());
Text line = new Text();
long read = lin.readLine(line);
//ignore the first line. this is the header of the index files
String[] readStr = null;
List<Store> stores = new ArrayList<Store>();
while(read < masterStat.getLen()) {
int b = lin.readLine(line);
read += b;
readStr = line.toString().split(" ");
int startHash = Integer.parseInt(readStr[0]);
int endHash = Integer.parseInt(readStr[1]);
if (startHash <= hashCode && hashCode <= endHash) {
stores.add(new Store(Long.parseLong(readStr[2]),
Long.parseLong(readStr[3]), startHash,
endHash));
}
line.clear();
}
try {
lin.close();
} catch(IOException io){
// do nothing just a read.
}
FSDataInputStream aIn = fs.open(archiveIndex);
LineReader aLin;
String retStr = null;
// now start reading the real index file
for (Store s: stores) {
read = 0;
aIn.seek(s.begin);
aLin = new LineReader(aIn, getConf());
while (read + s.begin < s.end) {
int tmp = aLin.readLine(line);
read += tmp;
String lineFeed = line.toString();
String[] parsed = lineFeed.split(" ");
if (harPath.compareTo(new Path(parsed[0])) == 0) {
// bingo!
retStr = lineFeed;
break;
}
line.clear();
}
if (retStr != null)
break;
}
try {
aIn.close();
} catch(IOException io) {
//do nothing
}
return retStr;
}
// a single line parser for hadoop archives status
// stored in a single line in the index files
// the format is of the form
// filename "dir"/"file" partFileName startIndex length
// <space seperated children>
private static class HarStatus {
boolean isDir;
String name;
List<String> children;
String partName;
long startIndex;
long length;
public HarStatus(String harString) {
String[] splits = harString.split(" ");
this.name = splits[0];
this.isDir = "dir".equals(splits[1]) ? true: false;
// this is equal to "none" if its a directory
this.partName = splits[2];
this.startIndex = Long.parseLong(splits[3]);
this.length = Long.parseLong(splits[4]);
if (isDir) {
children = new ArrayList<String>();
for (int i = 5; i < splits.length; i++) {
children.add(splits[i]);
}
}
}
public boolean isDir() {
return isDir;
}
public String getName() {
return name;
}
public List<String> getChildren() {
return children;
}
public String getFileName() {
return name;
}
public String getPartName() {
return partName;
}
public long getStartIndex() {
return startIndex;
}
public long getLength() {
return length;
}
}
/**
* return the filestatus of files in har archive.
* The permission returned are that of the archive
* index files. The permissions are not persisted
* while creating a hadoop archive.
* @param f the path in har filesystem
* @return filestatus.
* @throws IOException
*/
@Override
public FileStatus getFileStatus(Path f) throws IOException {
FileStatus archiveStatus = fs.getFileStatus(archiveIndex);
// get the fs DataInputStream for the underlying file
// look up the index.
Path p = makeQualified(f);
Path harPath = getPathInHar(p);
if (harPath == null) {
throw new IOException("Invalid file name: " + f + " in " + uri);
}
String readStr = fileStatusInIndex(harPath);
if (readStr == null) {
throw new FileNotFoundException("File: " + f + " does not exist in " + uri);
}
HarStatus hstatus = null;
hstatus = new HarStatus(readStr);
return new FileStatus(hstatus.isDir()?0:hstatus.getLength(), hstatus.isDir(),
(int)archiveStatus.getReplication(), archiveStatus.getBlockSize(),
archiveStatus.getModificationTime(), archiveStatus.getAccessTime(),
new FsPermission(
archiveStatus.getPermission()), archiveStatus.getOwner(),
archiveStatus.getGroup(),
makeRelative(this.uri.toString(), new Path(hstatus.name)));
}
/**
* Returns a har input stream which fakes end of
* file. It reads the index files to get the part
* file name and the size and start of the file.
*/
@Override
public FSDataInputStream open(Path f, int bufferSize) throws IOException {
// get the fs DataInputStream for the underlying file
// look up the index.
Path p = makeQualified(f);
Path harPath = getPathInHar(p);
if (harPath == null) {
throw new IOException("Invalid file name: " + f + " in " + uri);
}
String readStr = fileStatusInIndex(harPath);
if (readStr == null) {
throw new FileNotFoundException(f + ": not found in " + archivePath);
}
HarStatus hstatus = new HarStatus(readStr);
// we got it.. woo hooo!!!
if (hstatus.isDir()) {
throw new FileNotFoundException(f + " : not a file in " +
archivePath);
}
return new HarFSDataInputStream(fs, new Path(archivePath,
hstatus.getPartName()),
hstatus.getStartIndex(), hstatus.getLength(), bufferSize);
}
/*
* create throws an exception in Har filesystem.
* The archive once created cannot be changed.
*/
public FSDataOutputStream create(Path f, int bufferSize)
throws IOException {
throw new IOException("Har: Create not allowed");
}
public FSDataOutputStream create(Path f,
FsPermission permission,
boolean overwrite,
int bufferSize,
short replication,
long blockSize,
Progressable progress) throws IOException {
throw new IOException("Har: create not allowed.");
}
@Override
public void close() throws IOException {
if (fs != null) {
try {
fs.close();
} catch(IOException ie) {
//this might already be closed
// ignore
}
}
}
/**
* Not implemented.
*/
@Override
public boolean setReplication(Path src, short replication) throws IOException{
throw new IOException("Har: setreplication not allowed");
}
/**
* Not implemented.
*/
@Override
public boolean delete(Path f, boolean recursive) throws IOException {
throw new IOException("Har: delete not allowed");
}
/**
* liststatus returns the children of a directory
* after looking up the index files.
*/
@Override
public FileStatus[] listStatus(Path f) throws IOException {
//need to see if the file is an index in file
//get the filestatus of the archive directory
// we will create fake filestatuses to return
// to the client
List<FileStatus> statuses = new ArrayList<FileStatus>();
FileStatus archiveStatus = fs.getFileStatus(archiveIndex);
Path tmpPath = makeQualified(f);
Path harPath = getPathInHar(tmpPath);
String readStr = fileStatusInIndex(harPath);
if (readStr == null) {
throw new FileNotFoundException("File " + f + " not found in " + archivePath);
}
HarStatus hstatus = new HarStatus(readStr);
if (!hstatus.isDir())
statuses.add(new FileStatus(hstatus.getLength(),
hstatus.isDir(),
archiveStatus.getReplication(), archiveStatus.getBlockSize(),
archiveStatus.getModificationTime(), archiveStatus.getAccessTime(),
new FsPermission(archiveStatus.getPermission()),
archiveStatus.getOwner(), archiveStatus.getGroup(),
makeRelative(this.uri.toString(), new Path(hstatus.name))));
else
for (String child: hstatus.children) {
FileStatus tmp = getFileStatus(new Path(tmpPath, child));
statuses.add(tmp);
}
return statuses.toArray(new FileStatus[statuses.size()]);
}
/**
* return the top level archive path.
*/
public Path getHomeDirectory() {
return new Path(uri.toString());
}
public void setWorkingDirectory(Path newDir) {
//does nothing.
}
/**
* not implemented.
*/
public boolean mkdirs(Path f, FsPermission permission) throws IOException {
throw new IOException("Har: mkdirs not allowed");
}
/**
* not implemented.
*/
public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws
IOException {
throw new IOException("Har: copyfromlocalfile not allowed");
}
/**
* copies the file in the har filesystem to a local file.
*/
public void copyToLocalFile(boolean delSrc, Path src, Path dst)
throws IOException {
FileUtil.copy(this, src, getLocal(getConf()), dst, false, getConf());
}
/**
* not implemented.
*/
public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile)
throws IOException {
throw new IOException("Har: startLocalOutput not allowed");
}
/**
* not implemented.
*/
public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile)
throws IOException {
throw new IOException("Har: completeLocalOutput not allowed");
}
/**
* not implemented.
*/
public void setOwner(Path p, String username, String groupname)
throws IOException {
throw new IOException("Har: setowner not allowed");
}
/**
* Not implemented.
*/
public void setPermission(Path p, FsPermission permisssion)
throws IOException {
throw new IOException("Har: setPermission not allowed");
}
/**
* Hadoop archives input stream. This input stream fakes EOF
* since archive files are part of bigger part files.
*/
private static class HarFSDataInputStream extends FSDataInputStream {
/**
* Create an input stream that fakes all the reads/positions/seeking.
*/
private static class HarFsInputStream extends FSInputStream {
private long position, start, end;
//The underlying data input stream that the
// underlying filesystem will return.
private FSDataInputStream underLyingStream;
//one byte buffer
private byte[] oneBytebuff = new byte[1];
HarFsInputStream(FileSystem fs, Path path, long start,
long length, int bufferSize) throws IOException {
underLyingStream = fs.open(path, bufferSize);
underLyingStream.seek(start);
// the start of this file in the part file
this.start = start;
// the position pointer in the part file
this.position = start;
// the end pointer in the part file
this.end = start + length;
}
public synchronized int available() throws IOException {
long remaining = end - underLyingStream.getPos();
if (remaining > (long)Integer.MAX_VALUE) {
return Integer.MAX_VALUE;
}
return (int) remaining;
}
public synchronized void close() throws IOException {
underLyingStream.close();
super.close();
}
//not implemented
@Override
public void mark(int readLimit) {
// do nothing
}
/**
* reset is not implemented
*/
public void reset() throws IOException {
throw new IOException("reset not implemented.");
}
public synchronized int read() throws IOException {
int ret = read(oneBytebuff, 0, 1);
return (ret <= 0) ? -1: (oneBytebuff[0] & 0xff);
}
public synchronized int read(byte[] b) throws IOException {
int ret = read(b, 0, b.length);
if (ret != -1) {
position += ret;
}
return ret;
}
/**
*
*/
public synchronized int read(byte[] b, int offset, int len)
throws IOException {
int newlen = len;
int ret = -1;
if (position + len > end) {
newlen = (int) (end - position);
}
// end case
if (newlen == 0)
return ret;
ret = underLyingStream.read(b, offset, newlen);
position += ret;
return ret;
}
public synchronized long skip(long n) throws IOException {
long tmpN = n;
if (tmpN > 0) {
if (position + tmpN > end) {
tmpN = end - position;
}
underLyingStream.seek(tmpN + position);
position += tmpN;
return tmpN;
}
return (tmpN < 0)? -1 : 0;
}
public synchronized long getPos() throws IOException {
return (position - start);
}
public synchronized void seek(long pos) throws IOException {
if (pos < 0 || (start + pos > end)) {
throw new IOException("Failed to seek: EOF");
}
position = start + pos;
underLyingStream.seek(position);
}
public boolean seekToNewSource(long targetPos) throws IOException {
//do not need to implement this
// hdfs in itself does seektonewsource
// while reading.
return false;
}
/**
* implementing position readable.
*/
public int read(long pos, byte[] b, int offset, int length)
throws IOException {
int nlength = length;
if (start + nlength + pos > end) {
nlength = (int) (end - (start + pos));
}
return underLyingStream.read(pos + start , b, offset, nlength);
}
/**
* position readable again.
*/
public void readFully(long pos, byte[] b, int offset, int length)
throws IOException {
if (start + length + pos > end) {
throw new IOException("Not enough bytes to read.");
}
underLyingStream.readFully(pos + start, b, offset, length);
}
public void readFully(long pos, byte[] b) throws IOException {
readFully(pos, b, 0, b.length);
}
}
/**
* constructors for har input stream.
* @param fs the underlying filesystem
* @param p The path in the underlying filesystem
* @param start the start position in the part file
* @param length the length of valid data in the part file
* @param bufsize the buffer size
* @throws IOException
*/
public HarFSDataInputStream(FileSystem fs, Path p, long start,
long length, int bufsize) throws IOException {
super(new HarFsInputStream(fs, p, start, length, bufsize));
}
/**
* constructor for har input stream.
* @param fs the underlying filesystem
* @param p the path in the underlying file system
* @param start the start position in the part file
* @param length the length of valid data in the part file.
* @throws IOException
*/
public HarFSDataInputStream(FileSystem fs, Path p, long start, long length)
throws IOException {
super(new HarFsInputStream(fs, p, start, length, 0));
}
}
}