blob: 23488ef660c5fa8af9a7fa5fe7d0fd888565e09b [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//! Path abstraction for Object Storage
use itertools::Itertools;
use percent_encoding::percent_decode;
use snafu::{ensure, ResultExt, Snafu};
use std::fmt::Formatter;
use url::Url;
/// The delimiter that separates object namespaces, producing a directory-like structure.
pub const DELIMITER: &str = "/";

/// The path delimiter as a single byte, taken from the first byte of [`DELIMITER`]
pub const DELIMITER_BYTE: u8 = {
    let delimiter_bytes = DELIMITER.as_bytes();
    delimiter_bytes[0]
};
mod parts;
pub use parts::{InvalidPart, PathPart};
/// Error returned by [`Path::parse`] and by the filesystem path conversions in this module
#[derive(Debug, Snafu)]
#[allow(missing_docs)]
pub enum Error {
/// A segment between two delimiters was empty, e.g. `foo//bar`
#[snafu(display("Path \"{}\" contained empty path segment", path))]
EmptySegment { path: String },
/// A segment was rejected by [`PathPart::parse`]; `source` carries the reason
#[snafu(display("Error parsing Path \"{}\": {}", path, source))]
BadSegment { path: String, source: InvalidPart },
/// Canonicalizing the filesystem path failed; `source` is the underlying IO error
#[snafu(display("Failed to canonicalize path \"{}\": {}", path.display(), source))]
Canonicalize {
path: std::path::PathBuf,
source: std::io::Error,
},
/// The canonicalized filesystem path could not be converted to a [`Url`]
#[snafu(display("Unable to convert path \"{}\" to URL", path.display()))]
InvalidPath { path: std::path::PathBuf },
/// Percent-decoding the URL path did not yield valid UTF-8
#[snafu(display("Path \"{}\" contained non-unicode characters: {}", path, source))]
NonUnicode {
path: String,
source: std::str::Utf8Error,
},
/// The URL path of a filesystem location did not start with the path of the base URL
#[snafu(display("Path {} does not start with prefix {}", path, prefix))]
PrefixMismatch { path: String, prefix: String },
}
/// A parsed path representation that can be safely written to object storage
///
/// # Path Safety
///
/// In theory object stores support any UTF-8 character sequence, however, certain character
/// sequences cause compatibility problems with some applications and protocols. As such the
/// naming guidelines for [S3], [GCS] and [Azure Blob Storage] all recommend sticking to a
/// limited character subset.
///
/// [S3]: https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html
/// [GCS]: https://cloud.google.com/storage/docs/naming-objects
/// [Azure Blob Storage]: https://docs.microsoft.com/en-us/rest/api/storageservices/Naming-and-Referencing-Containers--Blobs--and-Metadata#blob-names
///
/// This presents libraries with two options for consistent path handling:
///
/// 1. Allow constructing unsafe paths, allowing for both reading and writing of data to paths
/// that may not be consistently understood or supported
/// 2. Disallow constructing unsafe paths, ensuring data written can be consistently handled by
/// all other systems, but preventing interaction with objects at unsafe paths
///
/// This library takes the second approach, in particular:
///
/// * Paths are delimited by `/`
/// * Paths do not start with a `/`
/// * Empty path segments are discarded (e.g. `//` is treated as though it were `/`)
/// * Relative path segments, i.e. `.` and `..` are percent encoded
/// * Unsafe characters are percent encoded, as described by [RFC 1738]
/// * All paths are relative to the root of the object store
///
/// In order to provide these guarantees there are two ways to safely construct a [`Path`]:
/// encoding and parsing.
///
/// # Encode
///
/// A string containing potentially illegal path segments can be encoded to a [`Path`]
/// using [`Path::from`] or [`Path::from_iter`].
///
/// ```
/// # use object_store::path::Path;
/// assert_eq!(Path::from("foo/bar").as_ref(), "foo/bar");
/// assert_eq!(Path::from("foo//bar").as_ref(), "foo/bar");
/// assert_eq!(Path::from("foo/../bar").as_ref(), "foo/%2E%2E/bar");
/// assert_eq!(Path::from_iter(["foo", "foo/bar"]).as_ref(), "foo/foo%2Fbar");
/// ```
///
/// Note: if provided with an already percent encoded string, this will encode it again
///
/// ```
/// # use object_store::path::Path;
/// assert_eq!(Path::from("foo/foo%2Fbar").as_ref(), "foo/foo%252Fbar");
/// ```
///
/// # Parse
///
/// Alternatively a [`Path`] can be created from an existing string, returning an
/// error if it is invalid. Unlike the encoding methods, this will permit
/// valid percent encoded sequences. Unlike encoding, empty segments are rejected
/// rather than discarded.
///
/// ```
/// # use object_store::path::Path;
///
/// assert_eq!(Path::parse("/foo/foo%2Fbar").unwrap().as_ref(), "foo/foo%2Fbar");
/// Path::parse("..").unwrap_err();
/// Path::parse("/foo//").unwrap_err();
/// Path::parse("😀").unwrap_err();
/// Path::parse("%Q").unwrap_err();
/// ```
///
/// # Comparison
///
/// Equality, ordering and hashing are derived, and so operate on the raw path string
///
/// [RFC 1738]: https://www.ietf.org/rfc/rfc1738.txt
#[derive(Debug, Clone, Default, PartialEq, Eq, Hash, Ord, PartialOrd)]
pub struct Path {
/// The raw path with no leading or trailing delimiters; empty for the default (root) path
raw: String,
}
impl Path {
/// Parse a string as a [`Path`], returning a [`Error`] if invalid,
/// as defined on the docstring for [`Path`]
///
/// Note: this will strip any leading `/` or trailing `/`
pub fn parse(path: impl AsRef<str>) -> Result<Self, Error> {
let path = path.as_ref();
let stripped = path.strip_prefix(DELIMITER).unwrap_or(path);
if stripped.is_empty() {
return Ok(Default::default());
}
let stripped = stripped.strip_suffix(DELIMITER).unwrap_or(stripped);
for segment in stripped.split(DELIMITER) {
ensure!(!segment.is_empty(), EmptySegmentSnafu { path });
PathPart::parse(segment).context(BadSegmentSnafu { path })?;
}
Ok(Self {
raw: stripped.to_string(),
})
}
/// Convert a filesystem path to a [`Path`] relative to the filesystem root
///
/// This will return an error if the path does not exist, or contains illegal
/// character sequences as defined by [`Path::parse`]
pub fn from_filesystem_path(
path: impl AsRef<std::path::Path>,
) -> Result<Self, Error> {
Self::from_filesystem_path_with_base(path, None)
}
/// Convert a filesystem path to a [`Path`] relative to the provided base
///
/// This will return an error if the path does not exist on the local filesystem,
/// contains illegal character sequences as defined by [`Path::parse`], or `base`
/// does not refer to a parent path of `path`
pub(crate) fn from_filesystem_path_with_base(
path: impl AsRef<std::path::Path>,
base: Option<&Url>,
) -> Result<Self, Error> {
let url = filesystem_path_to_url(path)?;
let path = match base {
Some(prefix) => url.path().strip_prefix(prefix.path()).ok_or_else(|| {
Error::PrefixMismatch {
path: url.path().to_string(),
prefix: prefix.to_string(),
}
})?,
None => url.path(),
};
// Reverse any percent encoding performed by conversion to URL
let decoded = percent_decode(path.as_bytes())
.decode_utf8()
.context(NonUnicodeSnafu { path })?;
Self::parse(decoded)
}
/// Returns the [`PathPart`] of this [`Path`]
pub fn parts(&self) -> impl Iterator<Item = PathPart<'_>> {
match self.raw.is_empty() {
true => itertools::Either::Left(std::iter::empty()),
false => itertools::Either::Right(
self.raw
.split(DELIMITER)
.map(|s| PathPart { raw: s.into() }),
),
}
}
/// Returns an iterator of the [`PathPart`] of this [`Path`] after `prefix`
///
/// Returns `None` if the prefix does not match
pub fn prefix_match(
&self,
prefix: &Self,
) -> Option<impl Iterator<Item = PathPart<'_>> + '_> {
let diff = itertools::diff_with(self.parts(), prefix.parts(), |a, b| a == b);
match diff {
// Both were equal
None => Some(itertools::Either::Left(std::iter::empty())),
// Mismatch or prefix was longer => None
Some(
itertools::Diff::FirstMismatch(_, _, _) | itertools::Diff::Longer(_, _),
) => None,
// Match with remaining
Some(itertools::Diff::Shorter(_, back)) => {
Some(itertools::Either::Right(back))
}
}
}
/// Returns true if this [`Path`] starts with `prefix`
pub fn prefix_matches(&self, prefix: &Self) -> bool {
self.prefix_match(prefix).is_some()
}
/// Creates a new child of this [`Path`]
pub fn child<'a>(&self, child: impl Into<PathPart<'a>>) -> Self {
let raw = match self.raw.is_empty() {
true => format!("{}", child.into().raw),
false => format!("{}{}{}", self.raw, DELIMITER, child.into().raw),
};
Self { raw }
}
}
impl AsRef<str> for Path {
    /// Borrows the raw path string held by this [`Path`]
    fn as_ref(&self) -> &str {
        self.raw.as_str()
    }
}
impl From<&str> for Path {
fn from(path: &str) -> Self {
Self::from_iter(path.split(DELIMITER))
}
}
impl From<String> for Path {
fn from(path: String) -> Self {
Self::from_iter(path.split(DELIMITER))
}
}
impl From<Path> for String {
fn from(path: Path) -> Self {
path.raw
}
}
impl std::fmt::Display for Path {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
self.raw.fmt(f)
}
}
impl<'a, I> FromIterator<I> for Path
where
I: Into<PathPart<'a>>,
{
fn from_iter<T: IntoIterator<Item = I>>(iter: T) -> Self {
let raw = T::into_iter(iter)
.map(|s| s.into())
.filter(|s| !s.raw.is_empty())
.map(|s| s.raw)
.join(DELIMITER);
Self { raw }
}
}
/// Given a filesystem path, convert it to its canonical URL representation,
/// returning an error if the file doesn't exist on the local filesystem
pub(crate) fn filesystem_path_to_url(
path: impl AsRef<std::path::Path>,
) -> Result<Url, Error> {
let path = path.as_ref().canonicalize().context(CanonicalizeSnafu {
path: path.as_ref(),
})?;
match path.is_dir() {
true => Url::from_directory_path(&path),
false => Url::from_file_path(&path),
}
.map_err(|_| Error::InvalidPath { path })
}
// Unit tests covering encoding, parsing and prefix matching of `Path`
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn cloud_prefix_with_trailing_delimiter() {
// Use case: files exist in object storage named `foo/bar.json` and
// `foo_test.json`. A search for the prefix `foo/` should return
// `foo/bar.json` but not `foo_test.json`.
//
// This test only checks that a single part is stored without a trailing delimiter
let prefix = Path::from_iter(["test"]);
assert_eq!(prefix.as_ref(), "test");
}
// Building from parts percent-encodes delimiters and existing `%` within a part
#[test]
fn push_encodes() {
let location = Path::from_iter(["foo/bar", "baz%2Ftest"]);
assert_eq!(location.as_ref(), "foo%2Fbar/baz%252Ftest");
}
// `Path::parse` strips one leading and one trailing delimiter, and rejects
// empty segments
#[test]
fn test_parse() {
assert_eq!(Path::parse("/").unwrap().as_ref(), "");
assert_eq!(Path::parse("").unwrap().as_ref(), "");
let err = Path::parse("//").unwrap_err();
assert!(matches!(err, Error::EmptySegment { .. }));
assert_eq!(Path::parse("/foo/bar/").unwrap().as_ref(), "foo/bar");
assert_eq!(Path::parse("foo/bar/").unwrap().as_ref(), "foo/bar");
assert_eq!(Path::parse("foo/bar").unwrap().as_ref(), "foo/bar");
let err = Path::parse("foo///bar").unwrap_err();
assert!(matches!(err, Error::EmptySegment { .. }));
}
// A path built from a delimited string equals the same path built from parts
#[test]
fn convert_raw_before_partial_eq() {
// dir and file_name
let cloud = Path::from("test_dir/test_file.json");
let built = Path::from_iter(["test_dir", "test_file.json"]);
assert_eq!(built, cloud);
// dir and file_name w/o dot
let cloud = Path::from("test_dir/test_file");
let built = Path::from_iter(["test_dir", "test_file"]);
assert_eq!(built, cloud);
// dir, no file
let cloud = Path::from("test_dir/");
let built = Path::from_iter(["test_dir"]);
assert_eq!(built, cloud);
// file_name, no dir
let cloud = Path::from("test_file.json");
let built = Path::from_iter(["test_file.json"]);
assert_eq!(built, cloud);
// empty
let cloud = Path::from("");
let built = Path::from_iter(["", ""]);
assert_eq!(built, cloud);
}
// `prefix_match` yields the parts remaining after the prefix, or `None` on mismatch
#[test]
fn parts_after_prefix_behavior() {
let existing_path = Path::from("apple/bear/cow/dog/egg.json");
// Prefix with one directory
let prefix = Path::from("apple");
let expected_parts: Vec<PathPart<'_>> = vec!["bear", "cow", "dog", "egg.json"]
.into_iter()
.map(Into::into)
.collect();
let parts: Vec<_> = existing_path.prefix_match(&prefix).unwrap().collect();
assert_eq!(parts, expected_parts);
// Prefix with two directories
let prefix = Path::from("apple/bear");
let expected_parts: Vec<PathPart<'_>> = vec!["cow", "dog", "egg.json"]
.into_iter()
.map(Into::into)
.collect();
let parts: Vec<_> = existing_path.prefix_match(&prefix).unwrap().collect();
assert_eq!(parts, expected_parts);
// Not a prefix
let prefix = Path::from("cow");
assert!(existing_path.prefix_match(&prefix).is_none());
// Prefix with a partial directory
let prefix = Path::from("ap");
assert!(existing_path.prefix_match(&prefix).is_none());
// Prefix matches but there aren't any parts after it
let existing_path = Path::from("apple/bear/cow/dog");
let prefix = existing_path.clone();
assert_eq!(existing_path.prefix_match(&prefix).unwrap().count(), 0);
}
// `prefix_matches` compares whole parts: partial parts and longer prefixes do not match
#[test]
fn prefix_matches() {
let haystack = Path::from_iter(["foo/bar", "baz%2Ftest", "something"]);
let needle = haystack.clone();
// self starts with self
assert!(
haystack.prefix_matches(&haystack),
"{:?} should have started with {:?}",
haystack,
haystack
);
// a longer prefix doesn't match
let needle = needle.child("longer now");
assert!(
!haystack.prefix_matches(&needle),
"{:?} shouldn't have started with {:?}",
haystack,
needle
);
// one dir prefix matches
let needle = Path::from_iter(["foo/bar"]);
assert!(
haystack.prefix_matches(&needle),
"{:?} should have started with {:?}",
haystack,
needle
);
// two dir prefix matches
let needle = needle.child("baz%2Ftest");
assert!(
haystack.prefix_matches(&needle),
"{:?} should have started with {:?}",
haystack,
needle
);
// partial dir prefix doesn't match
let needle = Path::from_iter(["f"]);
assert!(
!haystack.prefix_matches(&needle),
"{:?} should not have started with {:?}",
haystack,
needle
);
// one dir and one partial dir doesn't match
let needle = Path::from_iter(["foo/bar", "baz"]);
assert!(
!haystack.prefix_matches(&needle),
"{:?} should not have started with {:?}",
haystack,
needle
);
// empty prefix matches
let needle = Path::from("");
assert!(
haystack.prefix_matches(&needle),
"{:?} should have started with {:?}",
haystack,
needle
);
}
// A prefix whose final part only partially matches a file name or directory
// never matches
#[test]
fn prefix_matches_with_file_name() {
let haystack =
Path::from_iter(["foo/bar", "baz%2Ftest", "something", "foo.segment"]);
// All directories match and file name is a prefix
let needle = Path::from_iter(["foo/bar", "baz%2Ftest", "something", "foo"]);
assert!(
!haystack.prefix_matches(&needle),
"{:?} should not have started with {:?}",
haystack,
needle
);
// All directories match but file name is not a prefix
let needle = Path::from_iter(["foo/bar", "baz%2Ftest", "something", "e"]);
assert!(
!haystack.prefix_matches(&needle),
"{:?} should not have started with {:?}",
haystack,
needle
);
// Not all directories match; file name is a prefix of the next directory; this
// does not match
let needle = Path::from_iter(["foo/bar", "baz%2Ftest", "s"]);
assert!(
!haystack.prefix_matches(&needle),
"{:?} should not have started with {:?}",
haystack,
needle
);
// Not all directories match; file name is NOT a prefix of the next directory;
// no match
let needle = Path::from_iter(["foo/bar", "baz%2Ftest", "p"]);
assert!(
!haystack.prefix_matches(&needle),
"{:?} should not have started with {:?}",
haystack,
needle
);
}
}