// blob: a0a4a52e53a42e3fe6c1332bac32ddd987e1ea31 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.beam.sdk.extensions.gcp.util.gcsfs;
import static org.apache.beam.vendor.guava.v20_0.com.google.common.base.Preconditions.checkArgument;
import static org.apache.beam.vendor.guava.v20_0.com.google.common.base.Strings.isNullOrEmpty;
import com.google.api.services.storage.model.StorageObject;
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.file.FileSystem;
import java.nio.file.LinkOption;
import java.nio.file.Path;
import java.nio.file.WatchEvent;
import java.nio.file.WatchKey;
import java.nio.file.WatchService;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
/**
* Implements the Java NIO {@link Path} API for Google Cloud Storage paths.
*
* <p>GcsPath uses a slash ('/') as a directory separator. Below is a summary of how slashes are
* treated:
*
* <ul>
* <li>A GCS bucket may not contain a slash. An object may contain zero or more slashes.
* <li>A trailing slash always indicates a directory, which is compliant with POSIX.1-2008.
* <li>Slashes separate components of a path. Empty components are allowed, these are represented
* as repeated slashes. An empty component always refers to a directory, and always ends in a
* slash.
* <li>{@link #getParent()} always returns a path ending in a slash, as the parent of a GcsPath
* is always a directory.
* <li>Use {@link #resolve(String)} to append elements to a GcsPath -- this applies the rules
* consistently and is highly recommended over any custom string concatenation.
* </ul>
*
* <p>GcsPath treats all GCS objects and buckets as belonging to the same filesystem, so the root of
* a GcsPath is the GcsPath bucket="", object="".
*
* <p>Relative paths are not associated with any bucket. This matches common treatment of Path in
* which relative paths can be constructed from one filesystem and appended to another filesystem.
*
* @see <a href= "http://docs.oracle.com/javase/tutorial/essential/io/pathOps.html" >Java Tutorials:
* Path Operations</a>
*/
public class GcsPath implements Path, Serializable {

  /** The URI scheme used by GCS paths. */
  public static final String SCHEME = "gs";

  /**
   * Creates a GcsPath from a URI.
   *
   * <p>The URI must be in the form {@code gs://[bucket]/[path]}, and may not contain a port, user
   * info, a query, or a fragment.
   *
   * @param uri the URI to convert
   * @return the GcsPath parsed from the string form of {@code uri}
   * @throws IllegalArgumentException if the URI has no {@code gs} scheme, or specifies a port,
   *     user info, a query, or a fragment
   */
  public static GcsPath fromUri(URI uri) {
    // Constant-first comparison: URI#getScheme() returns null for scheme-less URIs, and such a URI
    // should be rejected as "not a GCS URI" rather than fail with a NullPointerException.
    checkArgument(SCHEME.equalsIgnoreCase(uri.getScheme()), "URI: %s is not a GCS URI", uri);
    // Only %s is substituted by checkArgument; any other placeholder is left verbatim.
    checkArgument(uri.getPort() == -1, "GCS URI may not specify port: %s (%s)", uri, uri.getPort());
    checkArgument(
        isNullOrEmpty(uri.getUserInfo()),
        "GCS URI may not specify userInfo: %s (%s)",
        uri,
        uri.getUserInfo());
    checkArgument(
        isNullOrEmpty(uri.getQuery()),
        "GCS URI may not specify query: %s (%s)",
        uri,
        uri.getQuery());
    checkArgument(
        isNullOrEmpty(uri.getFragment()),
        "GCS URI may not specify fragment: %s (%s)",
        uri,
        uri.getFragment());
    return fromUri(uri.toString());
  }

  /**
   * Pattern that is used to parse a GCS URL.
   *
   * <p>This is used to separate the components. Verification is handled separately.
   */
  public static final Pattern GCS_URI =
      Pattern.compile("(?<SCHEME>[^:]+)://(?<BUCKET>[^/]+)(/(?<OBJECT>.*))?");

  /**
   * Creates a GcsPath from a URI in string form.
   *
   * <p>This does not use URI parsing, which means it may accept patterns that the URI parser would
   * not accept.
   *
   * @param uri the string to parse, of the form {@code gs://bucket[/object]}
   * @return the GcsPath for the bucket and object named by {@code uri}
   * @throws IllegalArgumentException if {@code uri} does not match {@link #GCS_URI}, its scheme is
   *     not {@code gs}, or the bucket or object names are invalid
   */
  public static GcsPath fromUri(String uri) {
    Matcher m = GCS_URI.matcher(uri);
    checkArgument(m.matches(), "Invalid GCS URI: %s", uri);
    checkArgument(m.group("SCHEME").equalsIgnoreCase(SCHEME), "URI: %s is not a GCS URI", uri);
    return new GcsPath(null, m.group("BUCKET"), m.group("OBJECT"));
  }

  /** Pattern that is used to parse a GCS resource name. */
  private static final Pattern GCS_RESOURCE_NAME =
      Pattern.compile("storage.googleapis.com/(?<BUCKET>[^/]+)(/(?<OBJECT>.*))?");

  /**
   * Pattern that a non-empty bucket name must match in full. Compiled once here so that the
   * constructor does not recompile the expression for every path created.
   */
  private static final Pattern BUCKET_NAME = Pattern.compile("[a-z0-9][-_a-z0-9.]+[a-z0-9]");

  /**
   * Creates a GcsPath from a OnePlatform resource name in string form.
   *
   * @param name the resource name, of the form {@code storage.googleapis.com/bucket[/object]}
   * @return the GcsPath for the bucket and object named by {@code name}
   * @throws IllegalArgumentException if {@code name} is not a valid GCS resource name
   */
  public static GcsPath fromResourceName(String name) {
    Matcher m = GCS_RESOURCE_NAME.matcher(name);
    checkArgument(m.matches(), "Invalid GCS resource name: %s", name);
    return new GcsPath(null, m.group("BUCKET"), m.group("OBJECT"));
  }

  /** Creates a GcsPath from a {@linkplain StorageObject}, using its bucket and name. */
  public static GcsPath fromObject(StorageObject object) {
    return new GcsPath(null, object.getBucket(), object.getName());
  }

  /**
   * Creates a GcsPath from bucket and object components.
   *
   * <p>A GcsPath without a bucket name is treated as a relative path, which is a path component
   * with no linkage to the root element. This is similar to a Unix path that does not begin with
   * the root marker (a slash). GCS has different naming constraints and APIs for working with
   * buckets and objects, so these two concepts are kept separate to avoid accidental attempts to
   * treat objects as buckets, or vice versa, as much as possible.
   *
   * <p>A GcsPath without an object name is a bucket reference. A bucket is always a directory,
   * which could be used to lookup or add files to a bucket, but could not be opened as a file.
   *
   * <p>A GcsPath containing neither bucket nor object names is treated as the root of the GCS
   * filesystem. A listing on the root element would return the buckets available to the user.
   *
   * <p>If {@code null} is passed as either parameter, it is converted to an empty string internally
   * for consistency. There is no distinction between an empty string and a {@code null}, as neither
   * are allowed by GCS.
   *
   * @param bucket a GCS bucket name, or none ({@code null} or an empty string) if the object is not
   *     associated with a bucket (e.g. relative paths or the root node).
   * @param object a GCS object path, or none ({@code null} or an empty string) for no object.
   */
  public static GcsPath fromComponents(@Nullable String bucket, @Nullable String object) {
    return new GcsPath(null, bucket, object);
  }

  // Transient: the FileSystem is not serialized with the path.
  @Nullable private transient FileSystem fs;
  @Nonnull private final String bucket;
  @Nonnull private final String object;

  /**
   * Constructs a GcsPath.
   *
   * @param fs the associated FileSystem, if any
   * @param bucket the associated bucket, or none ({@code null} or an empty string) for a relative
   *     path component
   * @param object the object, which is a fully-qualified object name if bucket was also provided,
   *     or none ({@code null} or an empty string) for no object
   * @throws java.lang.IllegalArgumentException if the bucket or object names are invalid.
   */
  public GcsPath(@Nullable FileSystem fs, @Nullable String bucket, @Nullable String object) {
    if (bucket == null) {
      bucket = "";
    }
    checkArgument(!bucket.contains("/"), "GCS bucket may not contain a slash");
    // The message is passed as a template so that it is only assembled when the check fails.
    checkArgument(
        bucket.isEmpty() || BUCKET_NAME.matcher(bucket).matches(),
        "GCS bucket names must contain only lowercase letters, numbers, "
            + "dashes (-), underscores (_), and dots (.). Bucket names "
            + "must start and end with a number or letter. "
            + "See https://developers.google.com/storage/docs/bucketnaming "
            + "for more details. Bucket name: %s",
        bucket);
    if (object == null) {
      object = "";
    }
    checkArgument(
        object.indexOf('\n') < 0 && object.indexOf('\r') < 0,
        "GCS object names must not contain Carriage Return or Line Feed characters.");
    this.fs = fs;
    this.bucket = bucket;
    this.object = object;
  }

  /**
   * Returns the bucket name associated with this GCS path, or an empty string if this is a relative
   * path component.
   */
  public String getBucket() {
    return bucket;
  }

  /**
   * Returns the object name associated with this GCS path, or an empty string if no object is
   * specified.
   */
  public String getObject() {
    return object;
  }

  /** Sets the {@link FileSystem} associated with this path. */
  public void setFileSystem(FileSystem fs) {
    this.fs = fs;
  }

  /** Returns the {@link FileSystem} associated with this path, which may be {@code null}. */
  @Override
  public FileSystem getFileSystem() {
    return fs;
  }

  /**
   * Returns whether this path is absolute. Absolute paths are those that have a bucket, plus the
   * root path (which has neither a bucket nor an object).
   */
  @Override
  public boolean isAbsolute() {
    return !bucket.isEmpty() || object.isEmpty();
  }

  /** Returns the root path (empty bucket and empty object) on this path's file system. */
  @Override
  public GcsPath getRoot() {
    return new GcsPath(fs, "", "");
  }

  /**
   * Returns the last name component of this path.
   *
   * @throws UnsupportedOperationException if this path has fewer than two name components
   */
  @Override
  public GcsPath getFileName() {
    int nameCount = getNameCount();
    if (nameCount < 2) {
      throw new UnsupportedOperationException(
          "Can't get filename from root path in the bucket: " + this);
    }
    return getName(nameCount - 1);
  }

  /**
   * Returns the <em>parent path</em>, or {@code null} if this path does not have a parent.
   *
   * <p>Returns a path that ends in '/', as the parent path always refers to a directory.
   */
  @Override
  public GcsPath getParent() {
    if (bucket.isEmpty() && object.isEmpty()) {
      // The root path has no parent, by definition.
      return null;
    }
    if (object.isEmpty()) {
      // A GCS bucket. All buckets come from a common root.
      return getRoot();
    }
    // Skip last character, in case it is a trailing slash.
    int lastSeparator = object.lastIndexOf('/', object.length() - 2);
    if (lastSeparator <= 0) {
      if (bucket.isEmpty()) {
        // Relative paths are not attached to the root node.
        return null;
      }
      return new GcsPath(fs, bucket, "");
    }
    // Retain trailing slash.
    return new GcsPath(fs, bucket, object.substring(0, lastSeparator + 1));
  }

  /**
   * Returns the number of name components in this path. A non-empty bucket counts as one
   * component; the object contributes one component per slash-separated element.
   */
  @Override
  public int getNameCount() {
    int count = bucket.isEmpty() ? 0 : 1;
    if (object.isEmpty()) {
      return count;
    }
    // Add another for each separator found.
    int index = -1;
    while ((index = object.indexOf('/', index + 1)) != -1) {
      count++;
    }
    // A trailing slash terminates the last component rather than starting a new one.
    return object.endsWith("/") ? count : count + 1;
  }

  /**
   * Returns the name component at the given zero-based position, in iteration order.
   *
   * @param count the index of the component to return
   * @throws IllegalArgumentException if {@code count} is negative or not less than the number of
   *     components produced by {@link #iterator()}
   */
  @Override
  public GcsPath getName(int count) {
    checkArgument(count >= 0);
    Iterator<Path> iterator = iterator();
    for (int i = 0; i < count; ++i) {
      checkArgument(iterator.hasNext());
      iterator.next();
    }
    checkArgument(iterator.hasNext());
    return (GcsPath) iterator.next();
  }

  /**
   * Returns the path made of the name components from {@code beginIndex} (inclusive) to {@code
   * endIndex} (exclusive), resolved against each other in order.
   *
   * @throws IllegalArgumentException if {@code beginIndex} is negative, {@code endIndex} is not
   *     greater than {@code beginIndex}, or the range extends past the last component
   */
  @Override
  public GcsPath subpath(int beginIndex, int endIndex) {
    checkArgument(beginIndex >= 0);
    checkArgument(endIndex > beginIndex);
    Iterator<Path> iterator = iterator();
    // Discard the components that precede the requested range.
    for (int i = 0; i < beginIndex; ++i) {
      checkArgument(iterator.hasNext());
      iterator.next();
    }
    GcsPath path = null;
    for (int i = beginIndex; i < endIndex; ++i) {
      checkArgument(iterator.hasNext());
      if (path == null) {
        path = (GcsPath) iterator.next();
      } else {
        path = path.resolve(iterator.next());
      }
    }
    return path;
  }

  /**
   * Tests whether this path starts with {@code other}. The comparison is a string-prefix test on
   * the "bucket/object" form of this path, not a component-wise test.
   */
  @Override
  public boolean startsWith(Path other) {
    if (other instanceof GcsPath) {
      GcsPath gcsPath = (GcsPath) other;
      return startsWith(gcsPath.bucketAndObject());
    } else {
      return startsWith(other.toString());
    }
  }

  /** Tests whether the "bucket/object" form of this path starts with {@code prefix}. */
  @Override
  public boolean startsWith(String prefix) {
    return bucketAndObject().startsWith(prefix);
  }

  /**
   * Tests whether this path ends with {@code other}. The comparison is a string-suffix test on the
   * "bucket/object" form of this path, not a component-wise test.
   */
  @Override
  public boolean endsWith(Path other) {
    if (other instanceof GcsPath) {
      GcsPath gcsPath = (GcsPath) other;
      return endsWith(gcsPath.bucketAndObject());
    } else {
      return endsWith(other.toString());
    }
  }

  /** Tests whether the "bucket/object" form of this path ends with {@code suffix}. */
  @Override
  public boolean endsWith(String suffix) {
    return bucketAndObject().endsWith(suffix);
  }

  // TODO: support "." and ".." path components?
  /** Returns this path unchanged; no normalization is performed. */
  @Override
  public GcsPath normalize() {
    return this;
  }

  /**
   * Resolves {@code other} against this path. An absolute GcsPath is returned as is; a relative
   * GcsPath contributes its object name; any other {@link Path} contributes its string form.
   */
  @Override
  public GcsPath resolve(Path other) {
    if (other instanceof GcsPath) {
      GcsPath path = (GcsPath) other;
      if (path.isAbsolute()) {
        return path;
      } else {
        return resolve(path.getObject());
      }
    } else {
      return resolve(other.toString());
    }
  }

  /**
   * Resolves the string {@code other} against this path.
   *
   * <p>If this is the root path, or {@code other} begins with {@code gs://}, {@code other} is
   * parsed as a full GCS URI and the result inherits this path's file system. Otherwise {@code
   * other} is appended to this path's object name, separated by exactly one slash. An empty
   * {@code other} is treated as {@code "/"}.
   */
  @Override
  public GcsPath resolve(String other) {
    if (bucket.isEmpty() && object.isEmpty()) {
      // Resolve on a root path is equivalent to looking up a bucket and object.
      other = SCHEME + "://" + other;
    }
    if (other.startsWith(SCHEME + "://")) {
      GcsPath path = GcsPath.fromUri(other);
      path.setFileSystem(getFileSystem());
      return path;
    }
    if (other.isEmpty()) {
      // An empty component MUST refer to a directory.
      other = "/";
    }
    if (object.isEmpty()) {
      return new GcsPath(fs, bucket, other);
    } else if (object.endsWith("/")) {
      return new GcsPath(fs, bucket, object + other);
    } else {
      return new GcsPath(fs, bucket, object + "/" + other);
    }
  }

  /** Not supported; always throws {@link UnsupportedOperationException}. */
  @Override
  public Path resolveSibling(Path other) {
    throw new UnsupportedOperationException();
  }

  /**
   * Resolves {@code other} against this path's parent. If this path has no parent, {@code other}
   * is parsed as a full GCS URI instead.
   *
   * @throws UnsupportedOperationException if this path has fewer than two name components
   */
  @Override
  public Path resolveSibling(String other) {
    if (getNameCount() < 2) {
      throw new UnsupportedOperationException("Can't resolve the sibling of a root path: " + this);
    }
    GcsPath parent = getParent();
    return (parent == null) ? fromUri(other) : parent.resolve(other);
  }

  /** Not supported; always throws {@link UnsupportedOperationException}. */
  @Override
  public Path relativize(Path other) {
    throw new UnsupportedOperationException();
  }

  /** Returns this path unchanged. */
  @Override
  public GcsPath toAbsolutePath() {
    return this;
  }

  /** Returns this path unchanged; the options are ignored. */
  @Override
  public GcsPath toRealPath(LinkOption... options) throws IOException {
    return this;
  }

  /** Not supported; always throws {@link UnsupportedOperationException}. */
  @Override
  public File toFile() {
    throw new UnsupportedOperationException();
  }

  /** Not supported; always throws {@link UnsupportedOperationException}. */
  @Override
  public WatchKey register(
      WatchService watcher, WatchEvent.Kind<?>[] events, WatchEvent.Modifier... modifiers)
      throws IOException {
    throw new UnsupportedOperationException();
  }

  /** Not supported; always throws {@link UnsupportedOperationException}. */
  @Override
  public WatchKey register(WatchService watcher, WatchEvent.Kind<?>... events) throws IOException {
    throw new UnsupportedOperationException();
  }

  /**
   * Returns an iterator over the name components of this path. When the path has a bucket, the
   * first element is a bucket-only path; every other element is a relative, object-only path.
   */
  @Override
  public Iterator<Path> iterator() {
    return new NameIterator(fs, !bucket.isEmpty(), bucketAndObject());
  }

  /** Iterates over the slash-separated components of a "bucket/object" string. */
  private static class NameIterator implements Iterator<Path> {
    private final FileSystem fs;
    // True until the first component has been returned, if that component is a bucket.
    private boolean fullPath;
    // The part of the path not yet consumed; null or empty once iteration is finished.
    private String name;

    NameIterator(FileSystem fs, boolean fullPath, String name) {
      this.fs = fs;
      this.fullPath = fullPath;
      this.name = name;
    }

    @Override
    public boolean hasNext() {
      return !isNullOrEmpty(name);
    }

    @Override
    public GcsPath next() {
      if (!hasNext()) {
        // Honor the Iterator contract instead of failing on the exhausted remainder.
        throw new java.util.NoSuchElementException();
      }
      int separator = name.indexOf('/');
      String component;
      if (separator >= 0) {
        component = name.substring(0, separator);
        name = name.substring(separator + 1);
      } else {
        component = name;
        name = null;
      }
      if (fullPath) {
        fullPath = false;
        return new GcsPath(fs, component, "");
      } else {
        // Relative paths have no bucket.
        return new GcsPath(fs, "", component);
      }
    }

    @Override
    public void remove() {
      throw new UnsupportedOperationException();
    }
  }

  /**
   * Compares this path to another GcsPath: first by bucket, then component by component.
   *
   * @throws ClassCastException if {@code other} is not a GcsPath
   */
  @Override
  public int compareTo(Path other) {
    if (!(other instanceof GcsPath)) {
      throw new ClassCastException();
    }
    GcsPath path = (GcsPath) other;
    int b = bucket.compareTo(path.bucket);
    if (b != 0) {
      return b;
    }
    // Compare a component at a time, so that the separator char doesn't
    // get compared against component contents. Eg, "a/b" < "a-1/b".
    Iterator<Path> left = iterator();
    Iterator<Path> right = path.iterator();
    while (left.hasNext() && right.hasNext()) {
      String leftStr = left.next().toString();
      String rightStr = right.next().toString();
      int c = leftStr.compareTo(rightStr);
      if (c != 0) {
        return c;
      }
    }
    // All shared components are equal: the path with components left over sorts last.
    if (!left.hasNext() && !right.hasNext()) {
      return 0;
    } else {
      return left.hasNext() ? 1 : -1;
    }
  }

  /** Two GcsPaths are equal when their bucket and object names are equal. */
  @Override
  public boolean equals(Object o) {
    if (this == o) {
      return true;
    }
    if (o == null || getClass() != o.getClass()) {
      return false;
    }
    GcsPath that = (GcsPath) o;
    return bucket.equals(that.bucket) && object.equals(that.object);
  }

  @Override
  public int hashCode() {
    int result = bucket.hashCode();
    result = 31 * result + object.hashCode();
    return result;
  }

  /**
   * Returns the object name for a relative path, and otherwise the {@code gs://} form of this
   * path: {@code gs://bucket/object}, or {@code gs://} for the root.
   */
  @Override
  public String toString() {
    if (!isAbsolute()) {
      return object;
    }
    StringBuilder sb = new StringBuilder();
    sb.append(SCHEME).append("://");
    if (!bucket.isEmpty()) {
      sb.append(bucket).append('/');
    }
    sb.append(object);
    return sb.toString();
  }

  // TODO: Consider using resource names for all GCS paths used by the SDK.
  /** Returns this path as a resource name of the form {@code storage.googleapis.com/...}. */
  public String toResourceName() {
    StringBuilder sb = new StringBuilder();
    sb.append("storage.googleapis.com/");
    if (!bucket.isEmpty()) {
      sb.append(bucket).append('/');
    }
    sb.append(object);
    return sb.toString();
  }

  /**
   * Returns this path as a {@code gs} URI.
   *
   * @throws RuntimeException if the path cannot be represented as a URI; the underlying {@link
   *     URISyntaxException} is attached as the cause
   */
  @Override
  public URI toUri() {
    try {
      return new URI(SCHEME, "//" + bucketAndObject(), null);
    } catch (URISyntaxException e) {
      throw new RuntimeException("Unable to create URI for GCS path " + this, e);
    }
  }

  /** Returns "bucket/object", or just the object name when there is no bucket. */
  private String bucketAndObject() {
    if (bucket.isEmpty()) {
      return object;
    } else {
      return bucket + "/" + object;
    }
  }
}