| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| use crate::error::CoreError; |
| use crate::storage::file_metadata::FileMetadata; |
| use crate::Result; |
| use std::fmt::Display; |
| use std::str::FromStr; |
| |
| /// Hudi Base file, part of a [FileSlice]. |
| #[derive(Clone, Debug)] |
| pub struct BaseFile { |
| /// The id of the enclosing [FileGroup]. |
| pub file_id: String, |
| |
| /// Monotonically increasing token for every attempt to write the [BaseFile]. |
| pub write_token: String, |
| |
| /// The timestamp of the commit instant in the Timeline that created the [BaseFile]. |
| pub commit_timestamp: String, |
| |
| /// File extension that matches to [crate::config::table::HudiTableConfig::BaseFileFormat]. |
| /// |
| /// See also [crate::config::table::BaseFileFormatValue]. |
| pub extension: String, |
| |
| /// The metadata about the file. |
| pub file_metadata: Option<FileMetadata>, |
| } |
| |
| impl BaseFile { |
| /// Parse a base file's name into parts. |
| /// |
| /// File name format: |
| /// |
| /// ```text |
| /// [File Id]_[File Write Token]_[Commit timestamp].[File Extension] |
| /// ``` |
| fn parse_file_name(file_name: &str) -> Result<(String, String, String, String)> { |
| let err_msg = format!("Failed to parse file name '{file_name}' for base file."); |
| let (stem, extension) = file_name |
| .rsplit_once('.') |
| .ok_or_else(|| CoreError::FileGroup(err_msg.clone()))?; |
| let parts: Vec<&str> = stem.split('_').collect(); |
| let file_id = parts |
| .first() |
| .ok_or_else(|| CoreError::FileGroup(err_msg.clone()))? |
| .to_string(); |
| let write_token = parts |
| .get(1) |
| .ok_or_else(|| CoreError::FileGroup(err_msg.clone()))? |
| .to_string(); |
| let commit_timestamp = parts |
| .get(2) |
| .ok_or_else(|| CoreError::FileGroup(err_msg.clone()))? |
| .to_string(); |
| Ok(( |
| file_id, |
| write_token, |
| commit_timestamp, |
| extension.to_string(), |
| )) |
| } |
| |
| #[inline] |
| pub fn file_name(&self) -> String { |
| format!( |
| "{file_id}_{write_token}_{commit_timestamp}.{extension}", |
| file_id = self.file_id, |
| write_token = self.write_token, |
| commit_timestamp = self.commit_timestamp, |
| extension = self.extension, |
| ) |
| } |
| } |
| |
| impl Display for BaseFile { |
| fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { |
| write!(f, "BaseFile: {}", self.file_name()) |
| } |
| } |
| |
| impl PartialEq for BaseFile { |
| fn eq(&self, other: &Self) -> bool { |
| self.file_name() == other.file_name() |
| } |
| } |
| |
| impl Eq for BaseFile {} |
| |
| impl FromStr for BaseFile { |
| type Err = CoreError; |
| |
| fn from_str(file_name: &str) -> Result<Self, Self::Err> { |
| let (file_id, write_token, commit_timestamp, extension) = Self::parse_file_name(file_name)?; |
| Ok(Self { |
| file_id, |
| write_token, |
| commit_timestamp, |
| extension, |
| file_metadata: None, |
| }) |
| } |
| } |
| |
| impl TryFrom<FileMetadata> for BaseFile { |
| type Error = CoreError; |
| |
| fn try_from(metadata: FileMetadata) -> Result<Self> { |
| let file_name = metadata.name.as_str(); |
| let (file_id, write_token, commit_timestamp, extension) = Self::parse_file_name(file_name)?; |
| Ok(Self { |
| file_id, |
| write_token, |
| commit_timestamp, |
| extension, |
| file_metadata: Some(metadata), |
| }) |
| } |
| } |
| |
| #[cfg(test)] |
| mod tests { |
| use super::*; |
| |
| #[test] |
| fn test_create_base_file_from_file_name() { |
| let file_name = "5a226868-2934-4f84-a16f-55124630c68d-0_0-7-24_20240402144910683.parquet"; |
| let base_file = BaseFile::from_str(file_name).unwrap(); |
| assert_eq!(base_file.file_id, "5a226868-2934-4f84-a16f-55124630c68d-0"); |
| assert_eq!(base_file.commit_timestamp, "20240402144910683"); |
| assert!(base_file.file_metadata.is_none()); |
| } |
| |
| #[test] |
| fn test_create_base_file_from_metadata() { |
| let metadata = FileMetadata::new( |
| "5a226868-2934-4f84-a16f-55124630c68d-0_0-7-24_20240402144910683.parquet", |
| 1024, |
| ); |
| let base_file = BaseFile::try_from(metadata).unwrap(); |
| assert_eq!(base_file.file_id, "5a226868-2934-4f84-a16f-55124630c68d-0"); |
| assert_eq!(base_file.commit_timestamp, "20240402144910683"); |
| let file_metadata = base_file.file_metadata.unwrap(); |
| assert_eq!(file_metadata.size, 1024); |
| assert!(!file_metadata.fully_populated); |
| } |
| |
| #[test] |
| fn create_a_base_file_returns_error() { |
| let result = BaseFile::from_str("no_file_extension"); |
| assert!(matches!(result.unwrap_err(), CoreError::FileGroup(_))); |
| |
| let result = BaseFile::from_str(".parquet"); |
| assert!(matches!(result.unwrap_err(), CoreError::FileGroup(_))); |
| |
| let metadata = FileMetadata::new("no-valid-delimiter.parquet", 1024); |
| let result = BaseFile::try_from(metadata); |
| assert!(matches!(result.unwrap_err(), CoreError::FileGroup(_))); |
| } |
| } |