blob: 452d2314edb717b2ed8509006835231e2a1fc337 [file]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use percent_encoding::{NON_ALPHANUMERIC, utf8_percent_encode};
use serde::Deserialize;
use serde::Serialize;
use super::HUGGINGFACE_SCHEME;
use opendal_core::raw::*;
/// Repository type of Huggingface. Supports `model`, `dataset`, `space`, and `bucket`.
/// [Reference](https://huggingface.co/docs/hub/repositories)
///
/// Serialized/deserialized by serde in lowercase form (e.g. `"model"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
#[serde(rename_all = "lowercase")]
pub enum HfRepoType {
    /// A model repository. The default when no repo-type prefix is given.
    #[default]
    Model,
    /// A dataset repository.
    Dataset,
    /// A space (application) repository.
    Space,
    /// A storage bucket (not a git-based repository).
    Bucket,
}
impl HfRepoType {
pub fn parse(s: &str) -> opendal_core::Result<Self> {
match s.to_lowercase().replace(' ', "").as_str() {
"model" | "models" => Ok(Self::Model),
"dataset" | "datasets" => Ok(Self::Dataset),
"space" | "spaces" => Ok(Self::Space),
"bucket" | "buckets" => Ok(Self::Bucket),
other => Err(opendal_core::Error::new(
opendal_core::ErrorKind::ConfigInvalid,
format!("unknown repo type: {other}"),
)
.with_context("service", HUGGINGFACE_SCHEME)),
}
}
pub fn as_str(&self) -> &'static str {
match self {
Self::Model => "model",
Self::Dataset => "dataset",
Self::Space => "space",
Self::Bucket => "bucket",
}
}
pub fn as_plural_str(&self) -> &'static str {
match self {
Self::Model => "models",
Self::Dataset => "datasets",
Self::Space => "spaces",
Self::Bucket => "buckets",
}
}
}
/// A Hugging Face repository reference: type + id + optional revision.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HfRepo {
    /// Kind of repository (model/dataset/space/bucket).
    pub repo_type: HfRepoType,
    /// Repository id, e.g. `"username/my_model"` or `"gpt2"`.
    pub repo_id: String,
    /// Optional revision (branch, tag, or ref); `None` defaults to `"main"`.
    pub revision: Option<String>,
}
impl HfRepo {
    /// Create a new repo reference.
    pub fn new(repo_type: HfRepoType, repo_id: String, revision: Option<String>) -> Self {
        Self {
            repo_type,
            repo_id,
            revision,
        }
    }

    /// Whether this repo is a bucket (as opposed to a git-based repo).
    pub fn is_bucket(&self) -> bool {
        self.repo_type == HfRepoType::Bucket
    }

    /// Return the revision, defaulting to "main" if unset.
    pub fn revision(&self) -> &str {
        self.revision.as_deref().unwrap_or("main")
    }

    /// Create an `HfUri` for the given root and path within this repo.
    ///
    /// The stored path is the absolute path under `root` with leading and
    /// trailing slashes removed (empty string means the repo root).
    pub fn uri(&self, root: &str, path: &str) -> HfUri {
        HfUri {
            repo: self.clone(),
            path: build_abs_path(root, path)
                .trim_start_matches('/')
                .trim_end_matches('/')
                .to_string(),
        }
    }

    /// Build the paths-info API URL for this repository.
    ///
    /// Buckets have no git revision, so their endpoint omits the revision
    /// segment.
    pub fn paths_info_url(&self, endpoint: &str) -> String {
        match self.repo_type {
            HfRepoType::Bucket => {
                format!("{}/api/buckets/{}/paths-info", endpoint, &self.repo_id)
            }
            _ => {
                format!(
                    "{}/api/{}/{}/paths-info/{}",
                    endpoint,
                    self.repo_type.as_plural_str(),
                    &self.repo_id,
                    percent_encode_revision(self.revision()),
                )
            }
        }
    }

    /// Build the XET token API URL for this repository.
    ///
    /// `token_type` is typically `"read"` or `"write"`.
    pub fn xet_token_url(&self, endpoint: &str, token_type: &str) -> String {
        match self.repo_type {
            HfRepoType::Bucket => {
                format!(
                    "{}/api/buckets/{}/xet-{}-token",
                    endpoint, &self.repo_id, token_type
                )
            }
            _ => {
                format!(
                    "{}/api/{}/{}/xet-{}-token/{}",
                    endpoint,
                    self.repo_type.as_plural_str(),
                    &self.repo_id,
                    token_type,
                    // Percent-encode the revision so refs containing slashes
                    // (e.g. "refs/pr/10") stay a single URL path segment,
                    // consistent with `paths_info_url` and `git_commit_url`.
                    percent_encode_revision(self.revision()),
                )
            }
        }
    }

    /// Build the bucket batch API URL for this repository.
    pub fn bucket_batch_url(&self, endpoint: &str) -> String {
        format!("{}/api/buckets/{}/batch", endpoint, &self.repo_id)
    }

    /// Build the git commit API URL for this repository.
    pub fn git_commit_url(&self, endpoint: &str) -> String {
        format!(
            "{}/api/{}/{}/commit/{}",
            endpoint,
            self.repo_type.as_plural_str(),
            &self.repo_id,
            percent_encode_revision(self.revision()),
        )
    }
}
/// Parsed Hugging Face URI following the official format:
/// `hf://[<repo_type_prefix>/]<repo_id>[@<revision>][/<path_in_repo>]`
///
/// Use this directly when you need access to `path_in_repo` separately
/// from the config (e.g. to resolve a specific file within the repo).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HfUri {
    /// The repository (type, id, optional revision) this URI points into.
    pub repo: HfRepo,
    /// Path within the repository, without leading/trailing slashes;
    /// empty when the URI addresses the repository root.
    pub path: String,
}
impl HfUri {
    /// Parse a Hugging Face path into its components.
    /// Path format: `[<repo_type>/]<repo_id>[@<revision>][/<path_in_repo>]`
    ///
    /// # Errors
    ///
    /// Returns `ConfigInvalid` when the path is empty, or when it consists
    /// only of a bare repo type (e.g. `"datasets"`) with no repository name.
    pub fn parse(path: &str) -> opendal_core::Result<Self> {
        if path.is_empty() {
            return Err(opendal_core::Error::new(
                opendal_core::ErrorKind::ConfigInvalid,
                "repo_id is required in uri path",
            )
            .with_context("service", HUGGINGFACE_SCHEME));
        }
        let mut path = path.to_string();
        // Strip repo_type prefix if present (e.g. "datasets/user/repo" → "user/repo")
        let repo_type = if let Some((first, rest)) = path.split_once('/') {
            if let Ok(rt) = HfRepoType::parse(first) {
                path = rest.to_string();
                rt
            } else {
                HfRepoType::Model
            }
        } else if HfRepoType::parse(&path).is_ok() {
            // A bare repo type such as "datasets" carries no repository name.
            return Err(opendal_core::Error::new(
                opendal_core::ErrorKind::ConfigInvalid,
                "repository name is required in uri path",
            )
            .with_context("service", HUGGINGFACE_SCHEME));
        } else {
            HfRepoType::Model
        };
        // Parse repo_id, revision, and path_in_repo.
        // Path is now: <repo_id>[@<revision>][/<path_in_repo>]
        let (repo_id, revision, path_in_repo) = if path.contains('/') {
            // Check if @ appears in the first two segments (the repo_id portion).
            // This distinguishes "user/repo@rev/file" from "user/repo/path/to/@file".
            let first_two: String = path.splitn(3, '/').take(2).collect::<Vec<_>>().join("/");
            if first_two.contains('@') {
                let (repo_id, rev_and_path) = path.split_once('@').unwrap();
                // Accept URL-encoded slashes in the revision (e.g. "refs%2Fpr%2F10").
                let rev_and_path = rev_and_path.replace("%2F", "/");
                let (revision, path_in_repo) = Self::parse_revision(&rev_and_path);
                (repo_id.to_string(), Some(revision), path_in_repo)
            } else {
                let segments: Vec<_> = path.splitn(3, '/').collect();
                let repo_id = format!("{}/{}", segments[0], segments[1]);
                let path_in_repo = segments.get(2).copied().unwrap_or("").to_string();
                (repo_id, None, path_in_repo)
            }
        } else if let Some((repo_id, rev)) = path.split_once('@') {
            let rev = rev.replace("%2F", "/");
            (
                repo_id.to_string(),
                // "repo@" with an empty revision falls back to the default.
                if rev.is_empty() { None } else { Some(rev) },
                String::new(),
            )
        } else {
            (path, None, String::new())
        };
        Ok(Self {
            repo: HfRepo::new(repo_type, repo_id, revision),
            path: path_in_repo,
        })
    }

    /// Split a string after `@` into (revision, path_in_repo).
    /// Handles special refs like `refs/convert/parquet` and `refs/pr/10`,
    /// whose revisions span multiple `/`-separated segments.
    fn parse_revision(rev_and_path: &str) -> (String, String) {
        if !rev_and_path.contains('/') {
            return (rev_and_path.to_string(), String::new());
        }
        // Match special refs: refs/(convert|pr)/<segment>
        if let Some(rest) = rev_and_path.strip_prefix("refs/convert/") {
            return if let Some(slash) = rest.find('/') {
                // Rebuild the revision from its segment instead of slicing the
                // original string: the previous `[..14 + slash]` slice was off
                // by one ("refs/convert/" is 13 bytes, not 14), which left a
                // trailing '/' on the revision when a path followed it.
                (
                    format!("refs/convert/{}", &rest[..slash]),
                    rest[slash + 1..].to_string(),
                )
            } else {
                (rev_and_path.to_string(), String::new())
            };
        }
        if let Some(rest) = rev_and_path.strip_prefix("refs/pr/") {
            return if let Some(slash) = rest.find('/') {
                let revision = format!("refs/pr/{}", &rest[..slash]);
                (revision, rest[slash + 1..].to_string())
            } else {
                (rev_and_path.to_string(), String::new())
            };
        }
        // Regular revision: split on first /
        let (rev, path) = rev_and_path.split_once('/').unwrap();
        (rev.to_string(), path.to_string())
    }

    /// Return the revision, defaulting to "main" if unset.
    pub fn revision(&self) -> &str {
        self.repo.revision()
    }

    /// Build the resolve (file download) URL for this URI.
    ///
    /// Buckets have no git revision, so their URL omits the revision segment.
    pub fn resolve_url(&self, endpoint: &str) -> String {
        let revision = percent_encode_revision(self.revision());
        let path = percent_encode_path(&self.path);
        match self.repo.repo_type {
            HfRepoType::Model => {
                format!(
                    "{}/{}/resolve/{}/{}",
                    endpoint, &self.repo.repo_id, revision, path
                )
            }
            HfRepoType::Dataset => {
                format!(
                    "{}/datasets/{}/resolve/{}/{}",
                    endpoint, &self.repo.repo_id, revision, path
                )
            }
            HfRepoType::Space => {
                format!(
                    "{}/spaces/{}/resolve/{}/{}",
                    endpoint, &self.repo.repo_id, revision, path
                )
            }
            HfRepoType::Bucket => {
                format!(
                    "{}/buckets/{}/resolve/{}",
                    endpoint, &self.repo.repo_id, path
                )
            }
        }
    }

    /// Build the paths-info API URL for this URI.
    pub fn paths_info_url(&self, endpoint: &str) -> String {
        self.repo.paths_info_url(endpoint)
    }

    /// Build the file tree API URL for this URI.
    ///
    /// `cursor` is appended verbatim; it is assumed to be URL-safe as
    /// returned by the Hub API — TODO confirm against callers.
    pub fn file_tree_url(&self, endpoint: &str, recursive: bool, cursor: Option<&str>) -> String {
        let mut url = if self.repo.is_bucket() {
            format!(
                "{}/api/buckets/{}/tree/{}?expand=True",
                endpoint,
                &self.repo.repo_id,
                percent_encode_path(&self.path),
            )
        } else {
            format!(
                "{}/api/{}/{}/tree/{}/{}?expand=True",
                endpoint,
                self.repo.repo_type.as_plural_str(),
                &self.repo.repo_id,
                percent_encode_revision(self.revision()),
                percent_encode_path(&self.path),
            )
        };
        if recursive {
            url.push_str("&recursive=True");
        } else if self.repo.is_bucket() {
            // Bucket tree API defaults to recursive; must opt out explicitly.
            url.push_str("&recursive=false");
        }
        if let Some(cursor_val) = cursor {
            url.push_str(&format!("&cursor={}", cursor_val));
        }
        url
    }
}
/// Percent-encode a revision so it embeds as a single URL path segment;
/// every non-alphanumeric byte (including `/`) is escaped.
pub(super) fn percent_encode_revision(revision: &str) -> String {
    let encoded = utf8_percent_encode(revision, NON_ALPHANUMERIC);
    encoded.collect()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `path`, panicking on failure.
    fn resolve(path: &str) -> HfUri {
        HfUri::parse(path).unwrap()
    }

    /// Parse `path` and check every component of the resulting `HfUri`.
    fn assert_uri(
        path: &str,
        repo_type: HfRepoType,
        repo_id: &str,
        revision: Option<&str>,
        path_in_repo: &str,
    ) {
        let uri = resolve(path);
        assert_eq!(uri.repo.repo_type, repo_type);
        assert_eq!(uri.repo.repo_id, repo_id);
        assert_eq!(uri.repo.revision.as_deref(), revision);
        assert_eq!(uri.path, path_in_repo);
    }

    #[test]
    fn test_repo_type_parse() {
        let cases = [
            ("models", HfRepoType::Model),
            ("Models", HfRepoType::Model),
            ("MODELS", HfRepoType::Model),
            ("datasets", HfRepoType::Dataset),
            ("Datasets", HfRepoType::Dataset),
            ("spaces", HfRepoType::Space),
            ("Spaces", HfRepoType::Space),
            ("model", HfRepoType::Model),
            ("dataset", HfRepoType::Dataset),
            ("space", HfRepoType::Space),
            ("data sets", HfRepoType::Dataset),
            ("Data Sets", HfRepoType::Dataset),
        ];
        for (input, want) in cases {
            assert_eq!(HfRepoType::parse(input).unwrap(), want);
        }
        assert!(HfRepoType::parse("unknown").is_err());
        assert!(HfRepoType::parse("foobar").is_err());
    }

    #[test]
    fn resolve_with_namespace() {
        assert_uri(
            "username/my_model",
            HfRepoType::Model,
            "username/my_model",
            None,
            "",
        );
    }

    #[test]
    fn resolve_with_revision() {
        assert_uri(
            "username/my_model@dev",
            HfRepoType::Model,
            "username/my_model",
            Some("dev"),
            "",
        );
    }

    #[test]
    fn resolve_datasets_prefix() {
        assert_uri(
            "datasets/username/my_dataset",
            HfRepoType::Dataset,
            "username/my_dataset",
            None,
            "",
        );
    }

    #[test]
    fn resolve_datasets_prefix_and_revision() {
        assert_uri(
            "datasets/username/my_dataset@dev",
            HfRepoType::Dataset,
            "username/my_dataset",
            Some("dev"),
            "",
        );
    }

    #[test]
    fn resolve_with_path_in_repo() {
        assert_uri(
            "username/my_model/config.json",
            HfRepoType::Model,
            "username/my_model",
            None,
            "config.json",
        );
    }

    #[test]
    fn resolve_with_revision_and_path() {
        assert_uri(
            "username/my_model@dev/path/to/file.txt",
            HfRepoType::Model,
            "username/my_model",
            Some("dev"),
            "path/to/file.txt",
        );
    }

    #[test]
    fn resolve_datasets_revision_and_path() {
        assert_uri(
            "datasets/username/my_dataset@dev/train/data.csv",
            HfRepoType::Dataset,
            "username/my_dataset",
            Some("dev"),
            "train/data.csv",
        );
    }

    #[test]
    fn resolve_refs_convert_revision() {
        assert_uri(
            "datasets/squad@refs/convert/parquet",
            HfRepoType::Dataset,
            "squad",
            Some("refs/convert/parquet"),
            "",
        );
    }

    #[test]
    fn resolve_refs_pr_revision() {
        assert_uri(
            "username/my_model@refs/pr/10",
            HfRepoType::Model,
            "username/my_model",
            Some("refs/pr/10"),
            "",
        );
    }

    #[test]
    fn resolve_encoded_revision() {
        // %2F in the revision is decoded to '/'.
        assert_uri(
            "username/my_model@refs%2Fpr%2F10",
            HfRepoType::Model,
            "username/my_model",
            Some("refs/pr/10"),
            "",
        );
    }

    #[test]
    fn resolve_at_in_path_not_revision() {
        // An '@' past the repo_id segments belongs to the file path.
        assert_uri(
            "username/my_model/path/to/@not-a-revision.txt",
            HfRepoType::Model,
            "username/my_model",
            None,
            "path/to/@not-a-revision.txt",
        );
    }

    #[test]
    fn resolve_bare_repo_type_fails() {
        assert!(HfUri::parse("datasets").is_err());
        assert!(HfUri::parse("").is_err());
    }

    #[test]
    fn resolve_bare_repo_no_namespace() {
        assert_uri("gpt2", HfRepoType::Model, "gpt2", None, "");
    }

    #[test]
    fn resolve_bare_repo_with_revision() {
        assert_uri("gpt2@dev", HfRepoType::Model, "gpt2", Some("dev"), "");
    }

    #[test]
    fn resolve_bare_dataset_no_namespace() {
        assert_uri("datasets/squad", HfRepoType::Dataset, "squad", None, "");
    }

    #[test]
    fn resolve_bare_dataset_with_revision() {
        assert_uri(
            "datasets/squad@dev",
            HfRepoType::Dataset,
            "squad",
            Some("dev"),
            "",
        );
    }

    #[test]
    fn resolve_models_prefix() {
        assert_uri(
            "models/username/my_model",
            HfRepoType::Model,
            "username/my_model",
            None,
            "",
        );
    }

    #[test]
    fn resolve_spaces_prefix() {
        assert_uri(
            "spaces/username/my_space",
            HfRepoType::Space,
            "username/my_space",
            None,
            "",
        );
    }

    #[test]
    fn resolve_buckets_prefix() {
        assert_uri(
            "buckets/username/my_bucket",
            HfRepoType::Bucket,
            "username/my_bucket",
            None,
            "",
        );
    }

    #[test]
    fn resolve_buckets_with_path() {
        assert_uri(
            "buckets/username/my_bucket/data/file.txt",
            HfRepoType::Bucket,
            "username/my_bucket",
            None,
            "data/file.txt",
        );
    }

    #[test]
    fn test_bucket_resolve_url() {
        let uri = resolve("buckets/user/bucket/file.txt");
        assert_eq!(
            uri.resolve_url("https://huggingface.co"),
            "https://huggingface.co/buckets/user/bucket/resolve/file.txt"
        );
    }

    #[test]
    fn test_bucket_xet_token_urls() {
        let uri = resolve("buckets/user/bucket");
        assert_eq!(
            uri.repo.xet_token_url("https://huggingface.co", "read"),
            "https://huggingface.co/api/buckets/user/bucket/xet-read-token"
        );
        assert_eq!(
            uri.repo.xet_token_url("https://huggingface.co", "write"),
            "https://huggingface.co/api/buckets/user/bucket/xet-write-token"
        );
    }

    #[test]
    fn test_bucket_batch_url() {
        let uri = resolve("buckets/user/bucket");
        assert_eq!(
            uri.repo.bucket_batch_url("https://huggingface.co"),
            "https://huggingface.co/api/buckets/user/bucket/batch"
        );
    }
}