blob: c9ad793410b8ec753eb2ca584f4b045c84f1d6e0 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//! # Data format and I/O examples
//!
//! These examples demonstrate data formats and I/O.
//!
//! ## Usage
//! ```bash
//! cargo run --example data_io -- [catalog|json_shredding|parquet_adv_idx|parquet_emb_idx|parquet_enc_with_kms|parquet_enc|parquet_exec_visitor|parquet_idx|query_http_csv|remote_catalog]
//! ```
//!
//! Each subcommand runs a corresponding example:
//! - `catalog` — register the table into a custom catalog
//! - `json_shredding` — shows how to implement custom filter rewriting for JSON shredding
//! - `parquet_adv_idx` — create a detailed secondary index that covers the contents of several parquet files
//! - `parquet_emb_idx` — store a custom index inside a Parquet file and use it to speed up queries
//! - `parquet_enc_with_kms` — read and write encrypted Parquet files using an encryption factory
//! - `parquet_enc` — read and write encrypted Parquet files using DataFusion
//! - `parquet_exec_visitor` — extract statistics by visiting an ExecutionPlan after execution
//! - `parquet_idx` — create a secondary index over several parquet files and use it to speed up queries
//! - `query_http_csv` — configure `object_store` and run a query against files via HTTP
//! - `remote_catalog` — interfacing with a remote catalog (e.g. over a network)
mod catalog;
mod json_shredding;
mod parquet_advanced_index;
mod parquet_embedded_index;
mod parquet_encrypted;
mod parquet_encrypted_with_kms;
mod parquet_exec_visitor;
mod parquet_index;
mod query_http_csv;
mod remote_catalog;
use std::str::FromStr;
use datafusion::error::{DataFusionError, Result};
/// Identifies one runnable data-format / I/O example.
///
/// Each variant corresponds to one CLI subcommand; the string names are
/// defined by the `AsRef<str>` impl and parsed back by the `FromStr` impl.
/// Derives are added so the type can be debugged, copied, and compared,
/// which costs nothing for a fieldless enum.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ExampleKind {
    Catalog,
    JsonShredding,
    ParquetAdvIdx,
    ParquetEmbIdx,
    ParquetEnc,
    ParquetEncWithKms,
    ParquetExecVisitor,
    ParquetIdx,
    QueryHttpCsv,
    RemoteCatalog,
}
impl AsRef<str> for ExampleKind {
fn as_ref(&self) -> &str {
match self {
Self::Catalog => "catalog",
Self::JsonShredding => "json_shredding",
Self::ParquetAdvIdx => "parquet_adv_idx",
Self::ParquetEmbIdx => "parquet_emb_idx",
Self::ParquetEnc => "parquet_enc",
Self::ParquetEncWithKms => "parquet_enc_with_kms",
Self::ParquetExecVisitor => "parquet_exec_visitor",
Self::ParquetIdx => "parquet_idx",
Self::QueryHttpCsv => "query_http_csv",
Self::RemoteCatalog => "remote_catalog",
}
}
}
impl FromStr for ExampleKind {
    type Err = DataFusionError;

    /// Parses a CLI subcommand name into the matching [`ExampleKind`].
    ///
    /// Any name not produced by the `AsRef<str>` impl yields a
    /// `DataFusionError::Execution` describing the unknown example.
    fn from_str(s: &str) -> Result<Self> {
        Ok(match s {
            "catalog" => Self::Catalog,
            "json_shredding" => Self::JsonShredding,
            "parquet_adv_idx" => Self::ParquetAdvIdx,
            "parquet_emb_idx" => Self::ParquetEmbIdx,
            "parquet_enc" => Self::ParquetEnc,
            "parquet_enc_with_kms" => Self::ParquetEncWithKms,
            "parquet_exec_visitor" => Self::ParquetExecVisitor,
            "parquet_idx" => Self::ParquetIdx,
            "query_http_csv" => Self::QueryHttpCsv,
            "remote_catalog" => Self::RemoteCatalog,
            // Unknown name: surface it in the error so the user can correct it.
            _ => return Err(DataFusionError::Execution(format!("Unknown example: {s}"))),
        })
    }
}
impl ExampleKind {
    /// Every example variant, in declaration order.
    ///
    /// NOTE: this list must be kept in sync with the `ExampleKind` enum —
    /// adding a variant without extending `ALL` silently hides it from the
    /// usage string printed by `main`.
    const ALL: [Self; 10] = [
        Self::Catalog,
        Self::JsonShredding,
        Self::ParquetAdvIdx,
        Self::ParquetEmbIdx,
        Self::ParquetEnc,
        Self::ParquetEncWithKms,
        Self::ParquetExecVisitor,
        Self::ParquetIdx,
        Self::QueryHttpCsv,
        Self::RemoteCatalog,
    ];

    /// Name of this example binary, used when printing the usage string.
    // Explicit `'static` instead of relying on const lifetime elision,
    // which is clearer and compatible with older Rust toolchains.
    const EXAMPLE_NAME: &'static str = "data_io";

    /// Returns the CLI names of all examples, for building the usage string.
    fn variants() -> Vec<&'static str> {
        Self::ALL.iter().map(|x| x.as_ref()).collect()
    }
}
#[tokio::main]
async fn main() -> Result<()> {
    // Build the usage text from the known example names so the help message
    // cannot drift out of sync with the `ExampleKind` enum.
    let usage = format!(
        "Usage: cargo run --example {} -- [{}]",
        ExampleKind::EXAMPLE_NAME,
        ExampleKind::variants().join("|")
    );

    // The first CLI argument selects which example to run; print usage and
    // fail if it is absent.
    let Some(arg) = std::env::args().nth(1) else {
        eprintln!("{usage}");
        return Err(DataFusionError::Execution("Missing argument".to_string()));
    };

    // Parse the subcommand and dispatch to the matching example, awaiting
    // its completion. Arms are listed in enum declaration order.
    let example: ExampleKind = arg.parse()?;
    match example {
        ExampleKind::Catalog => catalog::catalog().await?,
        ExampleKind::JsonShredding => json_shredding::json_shredding().await?,
        ExampleKind::ParquetAdvIdx => {
            parquet_advanced_index::parquet_advanced_index().await?
        }
        ExampleKind::ParquetEmbIdx => {
            parquet_embedded_index::parquet_embedded_index().await?
        }
        ExampleKind::ParquetEnc => parquet_encrypted::parquet_encrypted().await?,
        ExampleKind::ParquetEncWithKms => {
            parquet_encrypted_with_kms::parquet_encrypted_with_kms().await?
        }
        ExampleKind::ParquetExecVisitor => {
            parquet_exec_visitor::parquet_exec_visitor().await?
        }
        ExampleKind::ParquetIdx => parquet_index::parquet_index().await?,
        ExampleKind::QueryHttpCsv => query_http_csv::query_http_csv().await?,
        ExampleKind::RemoteCatalog => remote_catalog::remote_catalog().await?,
    }
    Ok(())
}