hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Globber.java - hadoop - Git at Google

 /**
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.hadoop.fs;

 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;

 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;

 import org.apache.htrace.core.TraceScope;
 import org.apache.htrace.core.Tracer;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 @InterfaceAudience.Private
 @InterfaceStability.Unstable
 class Globber {
   public static final Logger LOG =
       LoggerFactory.getLogger(Globber.class.getName());

   private final FileSystem fs;
   private final FileContext fc;
   private final Path pathPattern;
   private final PathFilter filter;
   private final Tracer tracer;

   public Globber(FileSystem fs, Path pathPattern, PathFilter filter) {
     this.fs = fs;
     this.fc = null;
     this.pathPattern = pathPattern;
     this.filter = filter;
     this.tracer = FsTracer.get(fs.getConf());
   }

   public Globber(FileContext fc, Path pathPattern, PathFilter filter) {
     this.fs = null;
     this.fc = fc;
     this.pathPattern = pathPattern;
     this.filter = filter;
     this.tracer = fc.getTracer();
   }

   private FileStatus getFileStatus(Path path) throws IOException {
     try {
       if (fs != null) {
         return fs.getFileStatus(path);
       } else {
         return fc.getFileStatus(path);
       }
     } catch (FileNotFoundException e) {
       return null;
     }
   }

   private FileStatus[] listStatus(Path path) throws IOException {
     try {
       if (fs != null) {
         return fs.listStatus(path);
       } else {
         return fc.util().listStatus(path);
       }
     } catch (FileNotFoundException e) {
       return new FileStatus[0];
     }
   }

   private Path fixRelativePart(Path path) {
     if (fs != null) {
       return fs.fixRelativePart(path);
     } else {
       return fc.fixRelativePart(path);
     }
   }

   /**
    * Convert a path component that contains backslash ecape sequences to a
    * literal string.  This is necessary when you want to explicitly refer to a
    * path that contains globber metacharacters.
    */
   private static String unescapePathComponent(String name) {
     return name.replaceAll("\\\\(.)", "$1");
   }

   /**
    * Translate an absolute path into a list of path components.
    * We merge double slashes into a single slash here.
    * POSIX root path, i.e. '/', does not get an entry in the list.
    */
   private static List<String> getPathComponents(String path)
       throws IOException {
     ArrayList<String> ret = new ArrayList<String>();
     for (String component : path.split(Path.SEPARATOR)) {
       if (!component.isEmpty()) {
         ret.add(component);
       }
     }
     return ret;
   }

   private String schemeFromPath(Path path) throws IOException {
     String scheme = path.toUri().getScheme();
     if (scheme == null) {
       if (fs != null) {
         scheme = fs.getUri().getScheme();
       } else {
         scheme = fc.getFSofPath(fc.fixRelativePart(path)).
                     getUri().getScheme();
       }
     }
     return scheme;
   }

   private String authorityFromPath(Path path) throws IOException {
     String authority = path.toUri().getAuthority();
     if (authority == null) {
       if (fs != null) {
         authority = fs.getUri().getAuthority();
       } else {
         authority = fc.getFSofPath(fc.fixRelativePart(path)).
                       getUri().getAuthority();
       }
     }
     return authority ;
   }

   public FileStatus[] glob() throws IOException {
     TraceScope scope = tracer.newScope("Globber#glob");
     scope.addKVAnnotation("pattern", pathPattern.toUri().getPath());
     try {
       return doGlob();
     } finally {
       scope.close();
     }
   }

   private FileStatus[] doGlob() throws IOException {
     // First we get the scheme and authority of the pattern that was passed
     // in.
     String scheme = schemeFromPath(pathPattern);
     String authority = authorityFromPath(pathPattern);

     // Next we strip off everything except the pathname itself, and expand all
     // globs.  Expansion is a process which turns "grouping" clauses,
     // expressed as brackets, into separate path patterns.
     String pathPatternString = pathPattern.toUri().getPath();
     List<String> flattenedPatterns = GlobExpander.expand(pathPatternString);

     // Now loop over all flattened patterns.  In every case, we'll be trying to
     // match them to entries in the filesystem.
     ArrayList<FileStatus> results =
         new ArrayList<FileStatus>(flattenedPatterns.size());
     boolean sawWildcard = false;
     for (String flatPattern : flattenedPatterns) {
       // Get the absolute path for this flattened pattern.  We couldn't do
       // this prior to flattening because of patterns like {/,a}, where which
       // path you go down influences how the path must be made absolute.
       Path absPattern = fixRelativePart(new Path(
           flatPattern.isEmpty() ? Path.CUR_DIR : flatPattern));
       // Now we break the flattened, absolute pattern into path components.
       // For example, /a/*/c would be broken into the list [a, *, c]
       List<String> components =
           getPathComponents(absPattern.toUri().getPath());
       // Starting out at the root of the filesystem, we try to match
       // filesystem entries against pattern components.
       ArrayList<FileStatus> candidates = new ArrayList<FileStatus>(1);
       // To get the "real" FileStatus of root, we'd have to do an expensive
       // RPC to the NameNode.  So we create a placeholder FileStatus which has
       // the correct path, but defaults for the rest of the information.
       // Later, if it turns out we actually want the FileStatus of root, we'll
       // replace the placeholder with a real FileStatus obtained from the
       // NameNode.
       FileStatus rootPlaceholder;
       if (Path.WINDOWS && !components.isEmpty()
           && Path.isWindowsAbsolutePath(absPattern.toUri().getPath(), true)) {
         // On Windows the path could begin with a drive letter, e.g. /E:/foo.
         // We will skip matching the drive letter and start from listing the
         // root of the filesystem on that drive.
         String driveLetter = components.remove(0);
         rootPlaceholder = new FileStatus(0, true, 0, 0, 0, new Path(scheme,
             authority, Path.SEPARATOR + driveLetter + Path.SEPARATOR));
       } else {
         rootPlaceholder = new FileStatus(0, true, 0, 0, 0,
             new Path(scheme, authority, Path.SEPARATOR));
       }
       candidates.add(rootPlaceholder);

       for (int componentIdx = 0; componentIdx < components.size();
           componentIdx++) {
         ArrayList<FileStatus> newCandidates =
             new ArrayList<FileStatus>(candidates.size());
         GlobFilter globFilter = new GlobFilter(components.get(componentIdx));
         String component = unescapePathComponent(components.get(componentIdx));
         if (globFilter.hasPattern()) {
           sawWildcard = true;
         }
         if (candidates.isEmpty() && sawWildcard) {
           // Optimization: if there are no more candidates left, stop examining
           // the path components.  We can only do this if we've already seen
           // a wildcard component-- otherwise, we still need to visit all path
           // components in case one of them is a wildcard.
           break;
         }
         if ((componentIdx < components.size() - 1) &&
             (!globFilter.hasPattern())) {
           // Optimization: if this is not the terminal path component, and we
           // are not matching against a glob, assume that it exists.  If it
           // doesn't exist, we'll find out later when resolving a later glob
           // or the terminal path component.
           for (FileStatus candidate : candidates) {
             candidate.setPath(new Path(candidate.getPath(), component));
           }
           continue;
         }
         for (FileStatus candidate : candidates) {
           if (globFilter.hasPattern()) {
             FileStatus[] children = listStatus(candidate.getPath());
             if (children.length == 1) {
               // If we get back only one result, this could be either a listing
               // of a directory with one entry, or it could reflect the fact
               // that what we listed resolved to a file.
               //
               // Unfortunately, we can't just compare the returned paths to
               // figure this out.  Consider the case where you have /a/b, where
               // b is a symlink to "..".  In that case, listing /a/b will give
               // back "/a/b" again.  If we just went by returned pathname, we'd
               // incorrectly conclude that /a/b was a file and should not match
               // /a/*/*.  So we use getFileStatus of the path we just listed to
               // disambiguate.
               Path path = candidate.getPath();
               FileStatus status = getFileStatus(path);
               if (status == null) {
                 // null means the file was not found
                 LOG.warn("File/directory {} not found:"
                     + " it may have been deleted."
                     + " If this is an object store, this can be a sign of"
                     + " eventual consistency problems.",
                     path);
                 continue;
               }
               if (!status.isDirectory()) {
                 continue;
               }
             }
             for (FileStatus child : children) {
               if (componentIdx < components.size() - 1) {
                 // Don't try to recurse into non-directories.  See HADOOP-10957.
                 if (!child.isDirectory()) continue;
               }
               // Set the child path based on the parent path.
               child.setPath(new Path(candidate.getPath(),
                       child.getPath().getName()));
               if (globFilter.accept(child.getPath())) {
                 newCandidates.add(child);
               }
             }
           } else {
             // When dealing with non-glob components, use getFileStatus
             // instead of listStatus.  This is an optimization, but it also
             // is necessary for correctness in HDFS, since there are some
             // special HDFS directories like .reserved and .snapshot that are
             // not visible to listStatus, but which do exist.  (See HADOOP-9877)
             FileStatus childStatus = getFileStatus(
                 new Path(candidate.getPath(), component));
             if (childStatus != null) {
               newCandidates.add(childStatus);
             }
           }
         }
         candidates = newCandidates;
       }
       for (FileStatus status : candidates) {
         // Use object equality to see if this status is the root placeholder.
         // See the explanation for rootPlaceholder above for more information.
         if (status == rootPlaceholder) {
           status = getFileStatus(rootPlaceholder.getPath());
           if (status == null) continue;
         }
         // HADOOP-3497 semantics: the user-defined filter is applied at the
         // end, once the full path is built up.
         if (filter.accept(status.getPath())) {
           results.add(status);
         }
       }
     }
     /*
      * When the input pattern "looks" like just a simple filename, and we
      * can't find it, we return null rather than an empty array.
      * This is a special case which the shell relies on.
      *
      * To be more precise: if there were no results, AND there were no
      * groupings (aka brackets), and no wildcards in the input (aka stars),
      * we return null.
      */
     if ((!sawWildcard) && results.isEmpty() &&
         (flattenedPatterns.size() <= 1)) {
       return null;
     }
     /*
      * In general, the results list will already be sorted, since listStatus
      * returns results in sorted order for many Hadoop filesystems.  However,
      * not all Hadoop filesystems have this property.  So we sort here in order
      * to get consistent results.  See HADOOP-10798 for details.
      */
     FileStatus ret[] = results.toArray(new FileStatus[0]);
     Arrays.sort(ret);
     return ret;
   }
 }
	/**
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.hadoop.fs;

	import java.io.FileNotFoundException;
	import java.io.IOException;
	import java.util.ArrayList;
	import java.util.Arrays;
	import java.util.List;

	import org.apache.hadoop.classification.InterfaceAudience;
	import org.apache.hadoop.classification.InterfaceStability;

	import org.apache.htrace.core.TraceScope;
	import org.apache.htrace.core.Tracer;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;

	@InterfaceAudience.Private
	@InterfaceStability.Unstable
	class Globber {
	public static final Logger LOG =
	LoggerFactory.getLogger(Globber.class.getName());

	private final FileSystem fs;
	private final FileContext fc;
	private final Path pathPattern;
	private final PathFilter filter;
	private final Tracer tracer;

	public Globber(FileSystem fs, Path pathPattern, PathFilter filter) {
	this.fs = fs;
	this.fc = null;
	this.pathPattern = pathPattern;
	this.filter = filter;
	this.tracer = FsTracer.get(fs.getConf());
	}

	public Globber(FileContext fc, Path pathPattern, PathFilter filter) {
	this.fs = null;
	this.fc = fc;
	this.pathPattern = pathPattern;
	this.filter = filter;
	this.tracer = fc.getTracer();
	}

	private FileStatus getFileStatus(Path path) throws IOException {
	try {
	if (fs != null) {
	return fs.getFileStatus(path);
	} else {
	return fc.getFileStatus(path);
	}
	} catch (FileNotFoundException e) {
	return null;
	}
	}

	private FileStatus[] listStatus(Path path) throws IOException {
	try {
	if (fs != null) {
	return fs.listStatus(path);
	} else {
	return fc.util().listStatus(path);
	}
	} catch (FileNotFoundException e) {
	return new FileStatus[0];
	}
	}

	private Path fixRelativePart(Path path) {
	if (fs != null) {
	return fs.fixRelativePart(path);
	} else {
	return fc.fixRelativePart(path);
	}
	}

	/**
	* Convert a path component that contains backslash ecape sequences to a
	* literal string. This is necessary when you want to explicitly refer to a
	* path that contains globber metacharacters.
	*/
	private static String unescapePathComponent(String name) {
	return name.replaceAll("\\\\(.)", "$1");
	}

	/**
	* Translate an absolute path into a list of path components.
	* We merge double slashes into a single slash here.
	* POSIX root path, i.e. '/', does not get an entry in the list.
	*/
	private static List<String> getPathComponents(String path)
	throws IOException {
	ArrayList<String> ret = new ArrayList<String>();
	for (String component : path.split(Path.SEPARATOR)) {
	if (!component.isEmpty()) {
	ret.add(component);
	}
	}
	return ret;
	}

	private String schemeFromPath(Path path) throws IOException {
	String scheme = path.toUri().getScheme();
	if (scheme == null) {
	if (fs != null) {
	scheme = fs.getUri().getScheme();
	} else {
	scheme = fc.getFSofPath(fc.fixRelativePart(path)).
	getUri().getScheme();
	}
	}
	return scheme;
	}

	private String authorityFromPath(Path path) throws IOException {
	String authority = path.toUri().getAuthority();
	if (authority == null) {
	if (fs != null) {
	authority = fs.getUri().getAuthority();
	} else {
	authority = fc.getFSofPath(fc.fixRelativePart(path)).
	getUri().getAuthority();
	}
	}
	return authority ;
	}

	public FileStatus[] glob() throws IOException {
	TraceScope scope = tracer.newScope("Globber#glob");
	scope.addKVAnnotation("pattern", pathPattern.toUri().getPath());
	try {
	return doGlob();
	} finally {
	scope.close();
	}
	}

	private FileStatus[] doGlob() throws IOException {
	// First we get the scheme and authority of the pattern that was passed
	// in.
	String scheme = schemeFromPath(pathPattern);
	String authority = authorityFromPath(pathPattern);

	// Next we strip off everything except the pathname itself, and expand all
	// globs. Expansion is a process which turns "grouping" clauses,
	// expressed as brackets, into separate path patterns.
	String pathPatternString = pathPattern.toUri().getPath();
	List<String> flattenedPatterns = GlobExpander.expand(pathPatternString);

	// Now loop over all flattened patterns. In every case, we'll be trying to
	// match them to entries in the filesystem.
	ArrayList<FileStatus> results =
	new ArrayList<FileStatus>(flattenedPatterns.size());
	boolean sawWildcard = false;
	for (String flatPattern : flattenedPatterns) {
	// Get the absolute path for this flattened pattern. We couldn't do
	// this prior to flattening because of patterns like {/,a}, where which
	// path you go down influences how the path must be made absolute.
	Path absPattern = fixRelativePart(new Path(
	flatPattern.isEmpty() ? Path.CUR_DIR : flatPattern));
	// Now we break the flattened, absolute pattern into path components.
	// For example, /a//c would be broken into the list [a, , c]
	List<String> components =
	getPathComponents(absPattern.toUri().getPath());
	// Starting out at the root of the filesystem, we try to match
	// filesystem entries against pattern components.
	ArrayList<FileStatus> candidates = new ArrayList<FileStatus>(1);
	// To get the "real" FileStatus of root, we'd have to do an expensive
	// RPC to the NameNode. So we create a placeholder FileStatus which has
	// the correct path, but defaults for the rest of the information.
	// Later, if it turns out we actually want the FileStatus of root, we'll
	// replace the placeholder with a real FileStatus obtained from the
	// NameNode.
	FileStatus rootPlaceholder;
	if (Path.WINDOWS && !components.isEmpty()
	&& Path.isWindowsAbsolutePath(absPattern.toUri().getPath(), true)) {
	// On Windows the path could begin with a drive letter, e.g. /E:/foo.
	// We will skip matching the drive letter and start from listing the
	// root of the filesystem on that drive.
	String driveLetter = components.remove(0);
	rootPlaceholder = new FileStatus(0, true, 0, 0, 0, new Path(scheme,
	authority, Path.SEPARATOR + driveLetter + Path.SEPARATOR));
	} else {
	rootPlaceholder = new FileStatus(0, true, 0, 0, 0,
	new Path(scheme, authority, Path.SEPARATOR));
	}
	candidates.add(rootPlaceholder);

	for (int componentIdx = 0; componentIdx < components.size();
	componentIdx++) {
	ArrayList<FileStatus> newCandidates =
	new ArrayList<FileStatus>(candidates.size());
	GlobFilter globFilter = new GlobFilter(components.get(componentIdx));
	String component = unescapePathComponent(components.get(componentIdx));
	if (globFilter.hasPattern()) {
	sawWildcard = true;
	}
	if (candidates.isEmpty() && sawWildcard) {
	// Optimization: if there are no more candidates left, stop examining
	// the path components. We can only do this if we've already seen
	// a wildcard component-- otherwise, we still need to visit all path
	// components in case one of them is a wildcard.
	break;
	}
	if ((componentIdx < components.size() - 1) &&
	(!globFilter.hasPattern())) {
	// Optimization: if this is not the terminal path component, and we
	// are not matching against a glob, assume that it exists. If it
	// doesn't exist, we'll find out later when resolving a later glob
	// or the terminal path component.
	for (FileStatus candidate : candidates) {
	candidate.setPath(new Path(candidate.getPath(), component));
	}
	continue;
	}
	for (FileStatus candidate : candidates) {
	if (globFilter.hasPattern()) {
	FileStatus[] children = listStatus(candidate.getPath());
	if (children.length == 1) {
	// If we get back only one result, this could be either a listing
	// of a directory with one entry, or it could reflect the fact
	// that what we listed resolved to a file.
	//
	// Unfortunately, we can't just compare the returned paths to
	// figure this out. Consider the case where you have /a/b, where
	// b is a symlink to "..". In that case, listing /a/b will give
	// back "/a/b" again. If we just went by returned pathname, we'd
	// incorrectly conclude that /a/b was a file and should not match
	// /a//. So we use getFileStatus of the path we just listed to
	// disambiguate.
	Path path = candidate.getPath();
	FileStatus status = getFileStatus(path);
	if (status == null) {
	// null means the file was not found
	LOG.warn("File/directory {} not found:"
	+ " it may have been deleted."
	+ " If this is an object store, this can be a sign of"
	+ " eventual consistency problems.",
	path);
	continue;
	}
	if (!status.isDirectory()) {
	continue;
	}
	}
	for (FileStatus child : children) {
	if (componentIdx < components.size() - 1) {
	// Don't try to recurse into non-directories. See HADOOP-10957.
	if (!child.isDirectory()) continue;
	}
	// Set the child path based on the parent path.
	child.setPath(new Path(candidate.getPath(),
	child.getPath().getName()));
	if (globFilter.accept(child.getPath())) {
	newCandidates.add(child);
	}
	}
	} else {
	// When dealing with non-glob components, use getFileStatus
	// instead of listStatus. This is an optimization, but it also
	// is necessary for correctness in HDFS, since there are some
	// special HDFS directories like .reserved and .snapshot that are
	// not visible to listStatus, but which do exist. (See HADOOP-9877)
	FileStatus childStatus = getFileStatus(
	new Path(candidate.getPath(), component));
	if (childStatus != null) {
	newCandidates.add(childStatus);
	}
	}
	}
	candidates = newCandidates;
	}
	for (FileStatus status : candidates) {
	// Use object equality to see if this status is the root placeholder.
	// See the explanation for rootPlaceholder above for more information.
	if (status == rootPlaceholder) {
	status = getFileStatus(rootPlaceholder.getPath());
	if (status == null) continue;
	}
	// HADOOP-3497 semantics: the user-defined filter is applied at the
	// end, once the full path is built up.
	if (filter.accept(status.getPath())) {
	results.add(status);
	}
	}
	}
	/*
	* When the input pattern "looks" like just a simple filename, and we
	* can't find it, we return null rather than an empty array.
	* This is a special case which the shell relies on.
	*
	* To be more precise: if there were no results, AND there were no
	* groupings (aka brackets), and no wildcards in the input (aka stars),
	* we return null.
	*/
	if ((!sawWildcard) && results.isEmpty() &&
	(flattenedPatterns.size() <= 1)) {
	return null;
	}
	/*
	* In general, the results list will already be sorted, since listStatus
	* returns results in sorted order for many Hadoop filesystems. However,
	* not all Hadoop filesystems have this property. So we sort here in order
	* to get consistent results. See HADOOP-10798 for details.
	*/
	FileStatus ret[] = results.toArray(new FileStatus[0]);
	Arrays.sort(ret);
	return ret;
	}
	}