blob: b241a949b15333fc1842bb5c7ab59ea4840de126 [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.fs;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.htrace.core.TraceScope;
import org.apache.htrace.core.Tracer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@InterfaceAudience.Private
@InterfaceStability.Unstable
class Globber {
public static final Logger LOG =
LoggerFactory.getLogger(Globber.class.getName());
private final FileSystem fs;
private final FileContext fc;
private final Path pathPattern;
private final PathFilter filter;
private final Tracer tracer;
public Globber(FileSystem fs, Path pathPattern, PathFilter filter) {
this.fs = fs;
this.fc = null;
this.pathPattern = pathPattern;
this.filter = filter;
this.tracer = FsTracer.get(fs.getConf());
}
public Globber(FileContext fc, Path pathPattern, PathFilter filter) {
this.fs = null;
this.fc = fc;
this.pathPattern = pathPattern;
this.filter = filter;
this.tracer = fc.getTracer();
}
private FileStatus getFileStatus(Path path) throws IOException {
try {
if (fs != null) {
return fs.getFileStatus(path);
} else {
return fc.getFileStatus(path);
}
} catch (FileNotFoundException e) {
return null;
}
}
private FileStatus[] listStatus(Path path) throws IOException {
try {
if (fs != null) {
return fs.listStatus(path);
} else {
return fc.util().listStatus(path);
}
} catch (FileNotFoundException e) {
return new FileStatus[0];
}
}
private Path fixRelativePart(Path path) {
if (fs != null) {
return fs.fixRelativePart(path);
} else {
return fc.fixRelativePart(path);
}
}
/**
* Convert a path component that contains backslash ecape sequences to a
* literal string. This is necessary when you want to explicitly refer to a
* path that contains globber metacharacters.
*/
private static String unescapePathComponent(String name) {
return name.replaceAll("\\\\(.)", "$1");
}
/**
* Translate an absolute path into a list of path components.
* We merge double slashes into a single slash here.
* POSIX root path, i.e. '/', does not get an entry in the list.
*/
private static List<String> getPathComponents(String path)
throws IOException {
ArrayList<String> ret = new ArrayList<String>();
for (String component : path.split(Path.SEPARATOR)) {
if (!component.isEmpty()) {
ret.add(component);
}
}
return ret;
}
private String schemeFromPath(Path path) throws IOException {
String scheme = path.toUri().getScheme();
if (scheme == null) {
if (fs != null) {
scheme = fs.getUri().getScheme();
} else {
scheme = fc.getFSofPath(fc.fixRelativePart(path)).
getUri().getScheme();
}
}
return scheme;
}
private String authorityFromPath(Path path) throws IOException {
String authority = path.toUri().getAuthority();
if (authority == null) {
if (fs != null) {
authority = fs.getUri().getAuthority();
} else {
authority = fc.getFSofPath(fc.fixRelativePart(path)).
getUri().getAuthority();
}
}
return authority ;
}
public FileStatus[] glob() throws IOException {
TraceScope scope = tracer.newScope("Globber#glob");
scope.addKVAnnotation("pattern", pathPattern.toUri().getPath());
try {
return doGlob();
} finally {
scope.close();
}
}
private FileStatus[] doGlob() throws IOException {
// First we get the scheme and authority of the pattern that was passed
// in.
String scheme = schemeFromPath(pathPattern);
String authority = authorityFromPath(pathPattern);
// Next we strip off everything except the pathname itself, and expand all
// globs. Expansion is a process which turns "grouping" clauses,
// expressed as brackets, into separate path patterns.
String pathPatternString = pathPattern.toUri().getPath();
List<String> flattenedPatterns = GlobExpander.expand(pathPatternString);
// Now loop over all flattened patterns. In every case, we'll be trying to
// match them to entries in the filesystem.
ArrayList<FileStatus> results =
new ArrayList<FileStatus>(flattenedPatterns.size());
boolean sawWildcard = false;
for (String flatPattern : flattenedPatterns) {
// Get the absolute path for this flattened pattern. We couldn't do
// this prior to flattening because of patterns like {/,a}, where which
// path you go down influences how the path must be made absolute.
Path absPattern = fixRelativePart(new Path(
flatPattern.isEmpty() ? Path.CUR_DIR : flatPattern));
// Now we break the flattened, absolute pattern into path components.
// For example, /a/*/c would be broken into the list [a, *, c]
List<String> components =
getPathComponents(absPattern.toUri().getPath());
// Starting out at the root of the filesystem, we try to match
// filesystem entries against pattern components.
ArrayList<FileStatus> candidates = new ArrayList<FileStatus>(1);
// To get the "real" FileStatus of root, we'd have to do an expensive
// RPC to the NameNode. So we create a placeholder FileStatus which has
// the correct path, but defaults for the rest of the information.
// Later, if it turns out we actually want the FileStatus of root, we'll
// replace the placeholder with a real FileStatus obtained from the
// NameNode.
FileStatus rootPlaceholder;
if (Path.WINDOWS && !components.isEmpty()
&& Path.isWindowsAbsolutePath(absPattern.toUri().getPath(), true)) {
// On Windows the path could begin with a drive letter, e.g. /E:/foo.
// We will skip matching the drive letter and start from listing the
// root of the filesystem on that drive.
String driveLetter = components.remove(0);
rootPlaceholder = new FileStatus(0, true, 0, 0, 0, new Path(scheme,
authority, Path.SEPARATOR + driveLetter + Path.SEPARATOR));
} else {
rootPlaceholder = new FileStatus(0, true, 0, 0, 0,
new Path(scheme, authority, Path.SEPARATOR));
}
candidates.add(rootPlaceholder);
for (int componentIdx = 0; componentIdx < components.size();
componentIdx++) {
ArrayList<FileStatus> newCandidates =
new ArrayList<FileStatus>(candidates.size());
GlobFilter globFilter = new GlobFilter(components.get(componentIdx));
String component = unescapePathComponent(components.get(componentIdx));
if (globFilter.hasPattern()) {
sawWildcard = true;
}
if (candidates.isEmpty() && sawWildcard) {
// Optimization: if there are no more candidates left, stop examining
// the path components. We can only do this if we've already seen
// a wildcard component-- otherwise, we still need to visit all path
// components in case one of them is a wildcard.
break;
}
if ((componentIdx < components.size() - 1) &&
(!globFilter.hasPattern())) {
// Optimization: if this is not the terminal path component, and we
// are not matching against a glob, assume that it exists. If it
// doesn't exist, we'll find out later when resolving a later glob
// or the terminal path component.
for (FileStatus candidate : candidates) {
candidate.setPath(new Path(candidate.getPath(), component));
}
continue;
}
for (FileStatus candidate : candidates) {
if (globFilter.hasPattern()) {
FileStatus[] children = listStatus(candidate.getPath());
if (children.length == 1) {
// If we get back only one result, this could be either a listing
// of a directory with one entry, or it could reflect the fact
// that what we listed resolved to a file.
//
// Unfortunately, we can't just compare the returned paths to
// figure this out. Consider the case where you have /a/b, where
// b is a symlink to "..". In that case, listing /a/b will give
// back "/a/b" again. If we just went by returned pathname, we'd
// incorrectly conclude that /a/b was a file and should not match
// /a/*/*. So we use getFileStatus of the path we just listed to
// disambiguate.
Path path = candidate.getPath();
FileStatus status = getFileStatus(path);
if (status == null) {
// null means the file was not found
LOG.warn("File/directory {} not found:"
+ " it may have been deleted."
+ " If this is an object store, this can be a sign of"
+ " eventual consistency problems.",
path);
continue;
}
if (!status.isDirectory()) {
continue;
}
}
for (FileStatus child : children) {
if (componentIdx < components.size() - 1) {
// Don't try to recurse into non-directories. See HADOOP-10957.
if (!child.isDirectory()) continue;
}
// Set the child path based on the parent path.
child.setPath(new Path(candidate.getPath(),
child.getPath().getName()));
if (globFilter.accept(child.getPath())) {
newCandidates.add(child);
}
}
} else {
// When dealing with non-glob components, use getFileStatus
// instead of listStatus. This is an optimization, but it also
// is necessary for correctness in HDFS, since there are some
// special HDFS directories like .reserved and .snapshot that are
// not visible to listStatus, but which do exist. (See HADOOP-9877)
FileStatus childStatus = getFileStatus(
new Path(candidate.getPath(), component));
if (childStatus != null) {
newCandidates.add(childStatus);
}
}
}
candidates = newCandidates;
}
for (FileStatus status : candidates) {
// Use object equality to see if this status is the root placeholder.
// See the explanation for rootPlaceholder above for more information.
if (status == rootPlaceholder) {
status = getFileStatus(rootPlaceholder.getPath());
if (status == null) continue;
}
// HADOOP-3497 semantics: the user-defined filter is applied at the
// end, once the full path is built up.
if (filter.accept(status.getPath())) {
results.add(status);
}
}
}
/*
* When the input pattern "looks" like just a simple filename, and we
* can't find it, we return null rather than an empty array.
* This is a special case which the shell relies on.
*
* To be more precise: if there were no results, AND there were no
* groupings (aka brackets), and no wildcards in the input (aka stars),
* we return null.
*/
if ((!sawWildcard) && results.isEmpty() &&
(flattenedPatterns.size() <= 1)) {
return null;
}
/*
* In general, the results list will already be sorted, since listStatus
* returns results in sorted order for many Hadoop filesystems. However,
* not all Hadoop filesystems have this property. So we sort here in order
* to get consistent results. See HADOOP-10798 for details.
*/
FileStatus ret[] = results.toArray(new FileStatus[0]);
Arrays.sort(ret);
return ret;
}
}