| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| //! # Examples of data formats and I/O |
| //! |
| //! These examples demonstrate data formats and I/O. |
| //! |
| //! ## Usage |
| //! ```bash |
| //! cargo run --example data_io -- [all|catalog|in_memory_object_store|json_shredding|parquet_adv_idx|parquet_emb_idx|parquet_enc|parquet_enc_with_kms|parquet_exec_visitor|parquet_idx|query_http_csv|remote_catalog] |
| //! ``` |
| //! |
| //! Each subcommand runs a corresponding example: |
| //! - `all` — run all examples included in this module |
| //! |
| //! - `catalog` |
| //! (file: catalog.rs, desc: Register tables into a custom catalog) |
| //! |
| //! - `in_memory_object_store` |
| //! (file: in_memory_object_store.rs, desc: Read CSV from an in-memory object store (pattern applies to JSON/Parquet)) |
| //! |
| //! - `json_shredding` |
| //! (file: json_shredding.rs, desc: Implement filter rewriting for JSON shredding) |
| //! |
| //! - `parquet_adv_idx` |
| //! (file: parquet_advanced_index.rs, desc: Create a secondary index across multiple parquet files) |
| //! |
| //! - `parquet_emb_idx` |
| //! (file: parquet_embedded_index.rs, desc: Store a custom index inside Parquet files) |
| //! |
| //! - `parquet_enc` |
| //! (file: parquet_encrypted.rs, desc: Read & write encrypted Parquet files) |
| //! |
| //! - `parquet_enc_with_kms` |
| //! (file: parquet_encrypted_with_kms.rs, desc: Encrypted Parquet I/O using a KMS-backed factory) |
| //! |
| //! - `parquet_exec_visitor` |
| //! (file: parquet_exec_visitor.rs, desc: Extract statistics by visiting an ExecutionPlan) |
| //! |
| //! - `parquet_idx` |
| //! (file: parquet_index.rs, desc: Create a secondary index) |
| //! |
| //! - `query_http_csv` |
| //! (file: query_http_csv.rs, desc: Query CSV files via HTTP) |
| //! |
| //! - `remote_catalog` |
| //! (file: remote_catalog.rs, desc: Interact with a remote catalog) |
| |
| mod catalog; |
| mod in_memory_object_store; |
| mod json_shredding; |
| mod parquet_advanced_index; |
| mod parquet_embedded_index; |
| mod parquet_encrypted; |
| mod parquet_encrypted_with_kms; |
| mod parquet_exec_visitor; |
| mod parquet_index; |
| mod query_http_csv; |
| mod remote_catalog; |
| |
| use datafusion::error::{DataFusionError, Result}; |
| use strum::{IntoEnumIterator, VariantNames}; |
| use strum_macros::{Display, EnumIter, EnumString, VariantNames}; |
| |
| #[derive(EnumIter, EnumString, Display, VariantNames)] |
| #[strum(serialize_all = "snake_case")] |
| enum ExampleKind { |
| All, |
| Catalog, |
| InMemoryObjectStore, |
| JsonShredding, |
| ParquetAdvIdx, |
| ParquetEmbIdx, |
| ParquetEnc, |
| ParquetEncWithKms, |
| ParquetExecVisitor, |
| ParquetIdx, |
| QueryHttpCsv, |
| RemoteCatalog, |
| } |
| |
| impl ExampleKind { |
| const EXAMPLE_NAME: &str = "data_io"; |
| |
| fn runnable() -> impl Iterator<Item = ExampleKind> { |
| ExampleKind::iter().filter(|v| !matches!(v, ExampleKind::All)) |
| } |
| |
| async fn run(&self) -> Result<()> { |
| match self { |
| ExampleKind::All => { |
| for example in ExampleKind::runnable() { |
| println!("Running example: {example}"); |
| Box::pin(example.run()).await?; |
| } |
| } |
| ExampleKind::Catalog => catalog::catalog().await?, |
| ExampleKind::InMemoryObjectStore => { |
| in_memory_object_store::in_memory_object_store().await? |
| } |
| ExampleKind::JsonShredding => json_shredding::json_shredding().await?, |
| ExampleKind::ParquetAdvIdx => { |
| parquet_advanced_index::parquet_advanced_index().await? |
| } |
| ExampleKind::ParquetEmbIdx => { |
| parquet_embedded_index::parquet_embedded_index().await? |
| } |
| ExampleKind::ParquetEncWithKms => { |
| parquet_encrypted_with_kms::parquet_encrypted_with_kms().await? |
| } |
| ExampleKind::ParquetEnc => parquet_encrypted::parquet_encrypted().await?, |
| ExampleKind::ParquetExecVisitor => { |
| parquet_exec_visitor::parquet_exec_visitor().await? |
| } |
| ExampleKind::ParquetIdx => parquet_index::parquet_index().await?, |
| ExampleKind::QueryHttpCsv => query_http_csv::query_http_csv().await?, |
| ExampleKind::RemoteCatalog => remote_catalog::remote_catalog().await?, |
| } |
| Ok(()) |
| } |
| } |
| |
| #[tokio::main] |
| async fn main() -> Result<()> { |
| let usage = format!( |
| "Usage: cargo run --example {} -- [{}]", |
| ExampleKind::EXAMPLE_NAME, |
| ExampleKind::VARIANTS.join("|") |
| ); |
| |
| let example: ExampleKind = std::env::args() |
| .nth(1) |
| .unwrap_or_else(|| ExampleKind::All.to_string()) |
| .parse() |
| .map_err(|_| DataFusionError::Execution(format!("Unknown example. {usage}")))?; |
| |
| example.run().await |
| } |