| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| use percent_encoding::{NON_ALPHANUMERIC, utf8_percent_encode}; |
| use serde::Deserialize; |
| use serde::Serialize; |
| |
| use super::HUGGINGFACE_SCHEME; |
| use opendal_core::raw::*; |
| |
/// Repository type of Huggingface. Supports `model`, `dataset`, `space`, and `bucket`.
/// [Reference](https://huggingface.co/docs/hub/repositories)
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
#[serde(rename_all = "lowercase")]
pub enum HfRepoType {
    /// A model repository; the default when no type prefix is given.
    #[default]
    Model,
    /// A dataset repository.
    Dataset,
    /// A Space repository.
    Space,
    /// A storage bucket (not a git-based repo, so it carries no revision).
    Bucket,
}
| |
| impl HfRepoType { |
| pub fn parse(s: &str) -> opendal_core::Result<Self> { |
| match s.to_lowercase().replace(' ', "").as_str() { |
| "model" | "models" => Ok(Self::Model), |
| "dataset" | "datasets" => Ok(Self::Dataset), |
| "space" | "spaces" => Ok(Self::Space), |
| "bucket" | "buckets" => Ok(Self::Bucket), |
| other => Err(opendal_core::Error::new( |
| opendal_core::ErrorKind::ConfigInvalid, |
| format!("unknown repo type: {other}"), |
| ) |
| .with_context("service", HUGGINGFACE_SCHEME)), |
| } |
| } |
| |
| pub fn as_str(&self) -> &'static str { |
| match self { |
| Self::Model => "model", |
| Self::Dataset => "dataset", |
| Self::Space => "space", |
| Self::Bucket => "bucket", |
| } |
| } |
| |
| pub fn as_plural_str(&self) -> &'static str { |
| match self { |
| Self::Model => "models", |
| Self::Dataset => "datasets", |
| Self::Space => "spaces", |
| Self::Bucket => "buckets", |
| } |
| } |
| } |
| |
/// A Hugging Face repository reference: type, id, and optional revision.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HfRepo {
    // Kind of repository (model, dataset, space, or bucket).
    pub repo_type: HfRepoType,
    // Repository identifier, e.g. "username/my_model" or a bare name like "gpt2".
    pub repo_id: String,
    // Git revision (branch, tag, or ref); `None` falls back to "main".
    pub revision: Option<String>,
}
| |
| impl HfRepo { |
| pub fn new(repo_type: HfRepoType, repo_id: String, revision: Option<String>) -> Self { |
| Self { |
| repo_type, |
| repo_id, |
| revision, |
| } |
| } |
| |
| /// Whether this repo is a bucket (as opposed to a git-based repo). |
| pub fn is_bucket(&self) -> bool { |
| self.repo_type == HfRepoType::Bucket |
| } |
| |
| /// Return the revision, defaulting to "main" if unset. |
| pub fn revision(&self) -> &str { |
| self.revision.as_deref().unwrap_or("main") |
| } |
| |
| /// Create an `HfUri` for the given root and path within this repo. |
| pub fn uri(&self, root: &str, path: &str) -> HfUri { |
| HfUri { |
| repo: self.clone(), |
| path: build_abs_path(root, path) |
| .trim_start_matches('/') |
| .trim_end_matches('/') |
| .to_string(), |
| } |
| } |
| |
| /// Build the paths-info API URL for this repository. |
| pub fn paths_info_url(&self, endpoint: &str) -> String { |
| match self.repo_type { |
| HfRepoType::Bucket => { |
| format!("{}/api/buckets/{}/paths-info", endpoint, &self.repo_id) |
| } |
| _ => { |
| format!( |
| "{}/api/{}/{}/paths-info/{}", |
| endpoint, |
| self.repo_type.as_plural_str(), |
| &self.repo_id, |
| percent_encode_revision(self.revision()), |
| ) |
| } |
| } |
| } |
| |
| /// Build the XET token API URL for this repository. |
| pub fn xet_token_url(&self, endpoint: &str, token_type: &str) -> String { |
| match self.repo_type { |
| HfRepoType::Bucket => { |
| format!( |
| "{}/api/buckets/{}/xet-{}-token", |
| endpoint, &self.repo_id, token_type |
| ) |
| } |
| _ => { |
| format!( |
| "{}/api/{}/{}/xet-{}-token/{}", |
| endpoint, |
| self.repo_type.as_plural_str(), |
| &self.repo_id, |
| token_type, |
| self.revision(), |
| ) |
| } |
| } |
| } |
| |
| /// Build the bucket batch API URL for this repository. |
| pub fn bucket_batch_url(&self, endpoint: &str) -> String { |
| format!("{}/api/buckets/{}/batch", endpoint, &self.repo_id) |
| } |
| |
| /// Build the git commit API URL for this repository. |
| pub fn git_commit_url(&self, endpoint: &str) -> String { |
| format!( |
| "{}/api/{}/{}/commit/{}", |
| endpoint, |
| self.repo_type.as_plural_str(), |
| &self.repo_id, |
| percent_encode_revision(self.revision()), |
| ) |
| } |
| } |
| |
/// Parsed Hugging Face URI following the official format:
/// `hf://[<repo_type_prefix>/]<repo_id>[@<revision>][/<path_in_repo>]`
///
/// Use this directly when you need access to `path_in_repo` separately
/// from the config (e.g. to resolve a specific file within the repo).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HfUri {
    // The repository (type, id, optional revision) the URI points into.
    pub repo: HfRepo,
    // Path within the repo; empty string means the repo root.
    pub path: String,
}
| |
| impl HfUri { |
| /// Parse a Hugging Face path into its components. |
| /// Path format: `[<repo_type>/]<repo_id>[@<revision>][/<path_in_repo>]` |
| pub fn parse(path: &str) -> opendal_core::Result<Self> { |
| if path.is_empty() { |
| return Err(opendal_core::Error::new( |
| opendal_core::ErrorKind::ConfigInvalid, |
| "repo_id is required in uri path", |
| ) |
| .with_context("service", HUGGINGFACE_SCHEME)); |
| } |
| |
| let mut path = path.to_string(); |
| |
| // Strip repo_type prefix if present (e.g. "datasets/user/repo" → "user/repo") |
| let repo_type = if let Some((first, rest)) = path.split_once('/') { |
| if let Ok(rt) = HfRepoType::parse(first) { |
| path = rest.to_string(); |
| rt |
| } else { |
| HfRepoType::Model |
| } |
| } else if HfRepoType::parse(&path).is_ok() { |
| return Err(opendal_core::Error::new( |
| opendal_core::ErrorKind::ConfigInvalid, |
| "repository name is required in uri path", |
| ) |
| .with_context("service", HUGGINGFACE_SCHEME)); |
| } else { |
| HfRepoType::Model |
| }; |
| |
| // Parse repo_id, revision, and path_in_repo. |
| // Path is now: <repo_id>[@<revision>][/<path_in_repo>] |
| let (repo_id, revision, path_in_repo) = if path.contains('/') { |
| // Check if @ appears in the first two segments (the repo_id portion). |
| // This distinguishes "user/repo@rev/file" from "user/repo/path/to/@file". |
| let first_two: String = path.splitn(3, '/').take(2).collect::<Vec<_>>().join("/"); |
| |
| if first_two.contains('@') { |
| let (repo_id, rev_and_path) = path.split_once('@').unwrap(); |
| let rev_and_path = rev_and_path.replace("%2F", "/"); |
| let (revision, path_in_repo) = Self::parse_revision(&rev_and_path); |
| (repo_id.to_string(), Some(revision), path_in_repo) |
| } else { |
| let segments: Vec<_> = path.splitn(3, '/').collect(); |
| let repo_id = format!("{}/{}", segments[0], segments[1]); |
| let path_in_repo = segments.get(2).copied().unwrap_or("").to_string(); |
| (repo_id, None, path_in_repo) |
| } |
| } else if let Some((repo_id, rev)) = path.split_once('@') { |
| let rev = rev.replace("%2F", "/"); |
| ( |
| repo_id.to_string(), |
| if rev.is_empty() { None } else { Some(rev) }, |
| String::new(), |
| ) |
| } else { |
| (path, None, String::new()) |
| }; |
| |
| Ok(Self { |
| repo: HfRepo::new(repo_type, repo_id, revision), |
| path: path_in_repo, |
| }) |
| } |
| |
| /// Split a string after `@` into (revision, path_in_repo). |
| /// Handles special refs like `refs/convert/parquet` and `refs/pr/10`. |
| fn parse_revision(rev_and_path: &str) -> (String, String) { |
| if !rev_and_path.contains('/') { |
| return (rev_and_path.to_string(), String::new()); |
| } |
| |
| // Match special refs: refs/(convert|pr)/<segment> |
| if let Some(rest) = rev_and_path.strip_prefix("refs/convert/") { |
| return if let Some(slash) = rest.find('/') { |
| ( |
| rev_and_path[..14 + slash].to_string(), |
| rest[slash + 1..].to_string(), |
| ) |
| } else { |
| (rev_and_path.to_string(), String::new()) |
| }; |
| } |
| if let Some(rest) = rev_and_path.strip_prefix("refs/pr/") { |
| return if let Some(slash) = rest.find('/') { |
| let revision = format!("refs/pr/{}", &rest[..slash]); |
| (revision, rest[slash + 1..].to_string()) |
| } else { |
| (rev_and_path.to_string(), String::new()) |
| }; |
| } |
| |
| // Regular revision: split on first / |
| let (rev, path) = rev_and_path.split_once('/').unwrap(); |
| (rev.to_string(), path.to_string()) |
| } |
| |
| /// Return the revision, defaulting to "main" if unset. |
| pub fn revision(&self) -> &str { |
| self.repo.revision() |
| } |
| |
| /// Build the resolve URL for this URI. |
| pub fn resolve_url(&self, endpoint: &str) -> String { |
| let revision = percent_encode_revision(self.revision()); |
| let path = percent_encode_path(&self.path); |
| match self.repo.repo_type { |
| HfRepoType::Model => { |
| format!( |
| "{}/{}/resolve/{}/{}", |
| endpoint, &self.repo.repo_id, revision, path |
| ) |
| } |
| HfRepoType::Dataset => { |
| format!( |
| "{}/datasets/{}/resolve/{}/{}", |
| endpoint, &self.repo.repo_id, revision, path |
| ) |
| } |
| HfRepoType::Space => { |
| format!( |
| "{}/spaces/{}/resolve/{}/{}", |
| endpoint, &self.repo.repo_id, revision, path |
| ) |
| } |
| HfRepoType::Bucket => { |
| format!( |
| "{}/buckets/{}/resolve/{}", |
| endpoint, &self.repo.repo_id, path |
| ) |
| } |
| } |
| } |
| |
| /// Build the paths-info API URL for this URI. |
| pub fn paths_info_url(&self, endpoint: &str) -> String { |
| self.repo.paths_info_url(endpoint) |
| } |
| |
| /// Build the file tree API URL for this URI. |
| pub fn file_tree_url(&self, endpoint: &str, recursive: bool, cursor: Option<&str>) -> String { |
| let mut url = if self.repo.is_bucket() { |
| format!( |
| "{}/api/buckets/{}/tree/{}?expand=True", |
| endpoint, |
| &self.repo.repo_id, |
| percent_encode_path(&self.path), |
| ) |
| } else { |
| format!( |
| "{}/api/{}/{}/tree/{}/{}?expand=True", |
| endpoint, |
| self.repo.repo_type.as_plural_str(), |
| &self.repo.repo_id, |
| percent_encode_revision(self.revision()), |
| percent_encode_path(&self.path), |
| ) |
| }; |
| |
| if recursive { |
| url.push_str("&recursive=True"); |
| } else if self.repo.is_bucket() { |
| // Bucket tree API defaults to recursive; must opt out explicitly. |
| url.push_str("&recursive=false"); |
| } |
| |
| if let Some(cursor_val) = cursor { |
| url.push_str(&format!("&cursor={}", cursor_val)); |
| } |
| |
| url |
| } |
| } |
| |
| pub(super) fn percent_encode_revision(revision: &str) -> String { |
| utf8_percent_encode(revision, NON_ALPHANUMERIC).to_string() |
| } |
| |
#[cfg(test)]
mod tests {
    use super::*;

    // Parse `path` and unwrap, for tests that expect success.
    fn resolve(path: &str) -> HfUri {
        HfUri::parse(path).unwrap()
    }

    // Repo-type parsing is case-insensitive, space-insensitive, and accepts
    // singular and plural spellings; unknown names are rejected.
    #[test]
    fn test_repo_type_parse() {
        assert_eq!(HfRepoType::parse("models").unwrap(), HfRepoType::Model);
        assert_eq!(HfRepoType::parse("Models").unwrap(), HfRepoType::Model);
        assert_eq!(HfRepoType::parse("MODELS").unwrap(), HfRepoType::Model);
        assert_eq!(HfRepoType::parse("datasets").unwrap(), HfRepoType::Dataset);
        assert_eq!(HfRepoType::parse("Datasets").unwrap(), HfRepoType::Dataset);
        assert_eq!(HfRepoType::parse("spaces").unwrap(), HfRepoType::Space);
        assert_eq!(HfRepoType::parse("Spaces").unwrap(), HfRepoType::Space);
        assert_eq!(HfRepoType::parse("model").unwrap(), HfRepoType::Model);
        assert_eq!(HfRepoType::parse("dataset").unwrap(), HfRepoType::Dataset);
        assert_eq!(HfRepoType::parse("space").unwrap(), HfRepoType::Space);
        assert_eq!(HfRepoType::parse("data sets").unwrap(), HfRepoType::Dataset);
        assert_eq!(HfRepoType::parse("Data Sets").unwrap(), HfRepoType::Dataset);
        assert!(HfRepoType::parse("unknown").is_err());
        assert!(HfRepoType::parse("foobar").is_err());
    }

    // No type prefix: defaults to Model.
    #[test]
    fn resolve_with_namespace() {
        let p = resolve("username/my_model");
        assert_eq!(p.repo.repo_type, HfRepoType::Model);
        assert_eq!(p.repo.repo_id, "username/my_model");
        assert!(p.repo.revision.is_none());
        assert_eq!(p.path, "");
    }

    #[test]
    fn resolve_with_revision() {
        let p = resolve("username/my_model@dev");
        assert_eq!(p.repo.repo_type, HfRepoType::Model);
        assert_eq!(p.repo.repo_id, "username/my_model");
        assert_eq!(p.repo.revision.as_deref(), Some("dev"));
        assert_eq!(p.path, "");
    }

    // A leading type segment is consumed and not part of the repo id.
    #[test]
    fn resolve_datasets_prefix() {
        let p = resolve("datasets/username/my_dataset");
        assert_eq!(p.repo.repo_type, HfRepoType::Dataset);
        assert_eq!(p.repo.repo_id, "username/my_dataset");
        assert!(p.repo.revision.is_none());
        assert_eq!(p.path, "");
    }

    #[test]
    fn resolve_datasets_prefix_and_revision() {
        let p = resolve("datasets/username/my_dataset@dev");
        assert_eq!(p.repo.repo_type, HfRepoType::Dataset);
        assert_eq!(p.repo.repo_id, "username/my_dataset");
        assert_eq!(p.repo.revision.as_deref(), Some("dev"));
        assert_eq!(p.path, "");
    }

    #[test]
    fn resolve_with_path_in_repo() {
        let p = resolve("username/my_model/config.json");
        assert_eq!(p.repo.repo_type, HfRepoType::Model);
        assert_eq!(p.repo.repo_id, "username/my_model");
        assert!(p.repo.revision.is_none());
        assert_eq!(p.path, "config.json");
    }

    #[test]
    fn resolve_with_revision_and_path() {
        let p = resolve("username/my_model@dev/path/to/file.txt");
        assert_eq!(p.repo.repo_type, HfRepoType::Model);
        assert_eq!(p.repo.repo_id, "username/my_model");
        assert_eq!(p.repo.revision.as_deref(), Some("dev"));
        assert_eq!(p.path, "path/to/file.txt");
    }

    #[test]
    fn resolve_datasets_revision_and_path() {
        let p = resolve("datasets/username/my_dataset@dev/train/data.csv");
        assert_eq!(p.repo.repo_type, HfRepoType::Dataset);
        assert_eq!(p.repo.repo_id, "username/my_dataset");
        assert_eq!(p.repo.revision.as_deref(), Some("dev"));
        assert_eq!(p.path, "train/data.csv");
    }

    // Special multi-segment refs are kept whole as the revision.
    #[test]
    fn resolve_refs_convert_revision() {
        let p = resolve("datasets/squad@refs/convert/parquet");
        assert_eq!(p.repo.repo_type, HfRepoType::Dataset);
        assert_eq!(p.repo.repo_id, "squad");
        assert_eq!(p.repo.revision.as_deref(), Some("refs/convert/parquet"));
        assert_eq!(p.path, "");
    }

    #[test]
    fn resolve_refs_pr_revision() {
        let p = resolve("username/my_model@refs/pr/10");
        assert_eq!(p.repo.repo_type, HfRepoType::Model);
        assert_eq!(p.repo.repo_id, "username/my_model");
        assert_eq!(p.repo.revision.as_deref(), Some("refs/pr/10"));
        assert_eq!(p.path, "");
    }

    // "%2F" in the revision is decoded back to "/".
    #[test]
    fn resolve_encoded_revision() {
        let p = resolve("username/my_model@refs%2Fpr%2F10");
        assert_eq!(p.repo.repo_type, HfRepoType::Model);
        assert_eq!(p.repo.repo_id, "username/my_model");
        assert_eq!(p.repo.revision.as_deref(), Some("refs/pr/10"));
        assert_eq!(p.path, "");
    }

    // An '@' outside the first two segments belongs to the file path,
    // not a revision marker.
    #[test]
    fn resolve_at_in_path_not_revision() {
        let p = resolve("username/my_model/path/to/@not-a-revision.txt");
        assert_eq!(p.repo.repo_type, HfRepoType::Model);
        assert_eq!(p.repo.repo_id, "username/my_model");
        assert!(p.repo.revision.is_none());
        assert_eq!(p.path, "path/to/@not-a-revision.txt");
    }

    // A bare repo type or empty path names no repository at all.
    #[test]
    fn resolve_bare_repo_type_fails() {
        assert!(HfUri::parse("datasets").is_err());
        assert!(HfUri::parse("").is_err());
    }

    #[test]
    fn resolve_bare_repo_no_namespace() {
        let p = resolve("gpt2");
        assert_eq!(p.repo.repo_type, HfRepoType::Model);
        assert_eq!(p.repo.repo_id, "gpt2");
        assert!(p.repo.revision.is_none());
        assert_eq!(p.path, "");
    }

    #[test]
    fn resolve_bare_repo_with_revision() {
        let p = resolve("gpt2@dev");
        assert_eq!(p.repo.repo_type, HfRepoType::Model);
        assert_eq!(p.repo.repo_id, "gpt2");
        assert_eq!(p.repo.revision.as_deref(), Some("dev"));
        assert_eq!(p.path, "");
    }

    #[test]
    fn resolve_bare_dataset_no_namespace() {
        let p = resolve("datasets/squad");
        assert_eq!(p.repo.repo_type, HfRepoType::Dataset);
        assert_eq!(p.repo.repo_id, "squad");
        assert!(p.repo.revision.is_none());
        assert_eq!(p.path, "");
    }

    #[test]
    fn resolve_bare_dataset_with_revision() {
        let p = resolve("datasets/squad@dev");
        assert_eq!(p.repo.repo_type, HfRepoType::Dataset);
        assert_eq!(p.repo.repo_id, "squad");
        assert_eq!(p.repo.revision.as_deref(), Some("dev"));
        assert_eq!(p.path, "");
    }

    #[test]
    fn resolve_models_prefix() {
        let p = resolve("models/username/my_model");
        assert_eq!(p.repo.repo_type, HfRepoType::Model);
        assert_eq!(p.repo.repo_id, "username/my_model");
        assert!(p.repo.revision.is_none());
        assert_eq!(p.path, "");
    }

    #[test]
    fn resolve_spaces_prefix() {
        let p = resolve("spaces/username/my_space");
        assert_eq!(p.repo.repo_type, HfRepoType::Space);
        assert_eq!(p.repo.repo_id, "username/my_space");
        assert!(p.repo.revision.is_none());
        assert_eq!(p.path, "");
    }

    #[test]
    fn resolve_buckets_prefix() {
        let p = resolve("buckets/username/my_bucket");
        assert_eq!(p.repo.repo_type, HfRepoType::Bucket);
        assert_eq!(p.repo.repo_id, "username/my_bucket");
        assert!(p.repo.revision.is_none());
        assert_eq!(p.path, "");
    }

    #[test]
    fn resolve_buckets_with_path() {
        let p = resolve("buckets/username/my_bucket/data/file.txt");
        assert_eq!(p.repo.repo_type, HfRepoType::Bucket);
        assert_eq!(p.repo.repo_id, "username/my_bucket");
        assert!(p.repo.revision.is_none());
        assert_eq!(p.path, "data/file.txt");
    }

    // Bucket URLs never contain a revision segment.
    #[test]
    fn test_bucket_resolve_url() {
        let p = resolve("buckets/user/bucket/file.txt");
        let url = p.resolve_url("https://huggingface.co");
        assert_eq!(
            url,
            "https://huggingface.co/buckets/user/bucket/resolve/file.txt"
        );
    }

    #[test]
    fn test_bucket_xet_token_urls() {
        let p = resolve("buckets/user/bucket");
        let read_url = p.repo.xet_token_url("https://huggingface.co", "read");
        let write_url = p.repo.xet_token_url("https://huggingface.co", "write");
        assert_eq!(
            read_url,
            "https://huggingface.co/api/buckets/user/bucket/xet-read-token"
        );
        assert_eq!(
            write_url,
            "https://huggingface.co/api/buckets/user/bucket/xet-write-token"
        );
    }

    #[test]
    fn test_bucket_batch_url() {
        let p = resolve("buckets/user/bucket");
        let url = p.repo.bucket_batch_url("https://huggingface.co");
        assert_eq!(url, "https://huggingface.co/api/buckets/user/bucket/batch");
    }
}