blob: e4972411a54d13c8ea3e763a32388dc7df4a88f0 [file]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""DataFusion: an in-process query engine built on Apache Arrow.
DataFusion is not a database -- it has no server and no external dependencies.
You create a :py:class:`SessionContext`, point it at data sources (Parquet, CSV,
JSON, Arrow IPC, Pandas, Polars, or raw Python dicts/lists), and run queries
using either SQL or the DataFrame API.
Core abstractions
-----------------
- **SessionContext** -- entry point for loading data, running SQL, and creating
DataFrames.
- **DataFrame** -- lazy query builder. Every method returns a new DataFrame;
call :py:meth:`~datafusion.dataframe.DataFrame.collect` or a ``to_*``
method to execute.
- **Expr** -- expression tree node for column references, literals, and function
calls. Build with :py:func:`col` and :py:func:`lit`.
- **functions** -- 290+ built-in scalar, aggregate, and window functions.
Quick start
-----------
>>> from datafusion import SessionContext, col
>>> from datafusion import functions as F
>>> ctx = SessionContext()
>>> df = ctx.from_pydict({"a": [1, 2, 3], "b": [4, 5, 6]})
>>> result = (
... df.filter(col("a") > 1)
... .with_column("total", col("a") + col("b"))
... .aggregate([], [F.sum(col("total")).alias("grand_total")])
... )
>>> result.to_pydict()
{'grand_total': [16]}
User guide and full documentation: https://datafusion.apache.org/python
AI agent reference (SQL-to-DataFrame mappings, expression-building patterns,
common pitfalls), written in a dense, skill-oriented format:
https://github.com/apache/datafusion-python/blob/main/SKILL.md
"""
from __future__ import annotations
from typing import Any
try:
import importlib.metadata as importlib_metadata
except ImportError:
import importlib_metadata # type: ignore[import]
# Public submodules
from . import functions, object_store, substrait, unparser
# The following imports are okay to remain as opaque to the user.
from ._internal import Config
from .catalog import Catalog, Table
from .col import col, column
from .common import DFSchema
from .context import (
RuntimeEnvBuilder,
SessionConfig,
SessionContext,
SQLOptions,
)
from .dataframe import (
DataFrame,
DataFrameWriteOptions,
ExplainFormat,
InsertOp,
ParquetColumnOptions,
ParquetWriterOptions,
)
from .dataframe_formatter import configure_formatter
from .expr import Expr, WindowFrame
from .io import read_avro, read_csv, read_json, read_parquet
from .options import CsvReadOptions
from .plan import ExecutionPlan, LogicalPlan, Metric, MetricsSet
from .record_batch import RecordBatch, RecordBatchStream
from .user_defined import (
Accumulator,
AggregateUDF,
ScalarUDF,
TableFunction,
WindowUDF,
udaf,
udf,
udtf,
udwf,
)
# Resolve the package version from the installed distribution's metadata so the
# version string is defined in exactly one place (the package metadata built
# from pyproject) rather than duplicated in source.
__version__ = importlib_metadata.version(__name__)
__all__ = [
"Accumulator",
"AggregateUDF",
"Catalog",
"Config",
"CsvReadOptions",
"DFSchema",
"DataFrame",
"DataFrameWriteOptions",
"ExecutionPlan",
"ExplainFormat",
"Expr",
"InsertOp",
"LogicalPlan",
"Metric",
"MetricsSet",
"ParquetColumnOptions",
"ParquetWriterOptions",
"RecordBatch",
"RecordBatchStream",
"RuntimeEnvBuilder",
"SQLOptions",
"ScalarUDF",
"SessionConfig",
"SessionContext",
"Table",
"TableFunction",
"WindowFrame",
"WindowUDF",
"catalog",
"col",
"column",
"common",
"configure_formatter",
"expr",
"functions",
"lit",
"literal",
"object_store",
"options",
"read_avro",
"read_csv",
"read_json",
"read_parquet",
"substrait",
"udaf",
"udf",
"udtf",
"udwf",
"unparser",
]
def literal(value: Any) -> Expr:
    """Wrap *value* in a literal (constant scalar) expression.

    Accepts any value that PyArrow can represent as a scalar.
    """
    scalar_expr = Expr.literal(value)
    return scalar_expr
def string_literal(value: str) -> Expr:
    """Create a literal expression with the Arrow UTF8 type.

    This differs from :py:func:`literal`, which produces a UTF8view
    literal for string input.
    """
    utf8_expr = Expr.string_literal(value)
    return utf8_expr
def str_lit(value: str) -> Expr:
    """Shorthand alias for :py:func:`string_literal`."""
    return string_literal(value)
def lit(value: Any) -> Expr:
    """Create a literal expression.

    Shorthand for :py:func:`literal`. Delegates to it (matching how
    :py:func:`str_lit` and :py:func:`lit_with_metadata` delegate to their
    long-form counterparts) so the two spellings cannot drift apart.
    """
    return literal(value)
def literal_with_metadata(value: Any, metadata: dict[str, str]) -> Expr:
    """Build a scalar literal expression that carries field metadata.

    Args:
        value: A valid PyArrow scalar value or easily castable to one.
        metadata: Metadata to attach to the expression.

    Returns:
        A literal expression for ``value`` with ``metadata`` attached.
    """
    annotated_expr = Expr.literal_with_metadata(value, metadata)
    return annotated_expr
def lit_with_metadata(value: Any, metadata: dict[str, str]) -> Expr:
    """Shorthand alias for :py:func:`literal_with_metadata`."""
    return literal_with_metadata(value, metadata)