blob: e4972411a54d13c8ea3e763a32388dc7df4a88f0 [file]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""DataFusion: an in-process query engine built on Apache Arrow.
DataFusion is not a database -- it has no server and no external dependencies.
You create a :py:class:`SessionContext`, point it at data sources (Parquet, CSV,
JSON, Arrow IPC, Pandas, Polars, or raw Python dicts/lists), and run queries
using either SQL or the DataFrame API.
Core abstractions
-----------------
- **SessionContext** -- entry point for loading data, running SQL, and creating
DataFrames.
- **DataFrame** -- lazy query builder. Every method returns a new DataFrame;
call :py:meth:`~datafusion.dataframe.DataFrame.collect` or a ``to_*``
method to execute.
- **Expr** -- expression tree node for column references, literals, and function
calls. Build with :py:func:`col` and :py:func:`lit`.
- **functions** -- 290+ built-in scalar, aggregate, and window functions.
Quick start
-----------
>>> from datafusion import SessionContext, col
>>> from datafusion import functions as F
>>> ctx = SessionContext()
>>> df = ctx.from_pydict({"a": [1, 2, 3], "b": [4, 5, 6]})
>>> result = (
... df.filter(col("a") > 1)
... .with_column("total", col("a") + col("b"))
... .aggregate([], [F.sum(col("total")).alias("grand_total")])
... )
>>> result.to_pydict()
{'grand_total': [16]}
User guide and full documentation: https://datafusion.apache.org/python
AI agent reference (SQL-to-DataFrame mappings, expression-building patterns,
common pitfalls), written in a dense, skill-oriented format:
https://github.com/apache/datafusion-python/blob/main/SKILL.md
"""
from __future__ import annotations
from typing import Any
try:
import importlib.metadata as importlib_metadata
except ImportError:
import importlib_metadata # type: ignore[import]
# Public submodules
from . import functions, object_store, substrait, unparser
# The following imports are okay to remain as opaque to the user.
from ._internal import Config
from .catalog import Catalog, Table
from .col import col, column
from .common import DFSchema
from .context import (
RuntimeEnvBuilder,
SessionConfig,
SessionContext,
SQLOptions,
)
from .dataframe import (
DataFrame,
DataFrameWriteOptions,
ExplainFormat,
InsertOp,
ParquetColumnOptions,
ParquetWriterOptions,
)
from .dataframe_formatter import configure_formatter
from .expr import Expr, WindowFrame
from .io import read_avro, read_csv, read_json, read_parquet
from .options import CsvReadOptions
from .plan import ExecutionPlan, LogicalPlan, Metric, MetricsSet
from .record_batch import RecordBatch, RecordBatchStream
from .user_defined import (
Accumulator,
AggregateUDF,
ScalarUDF,
TableFunction,
WindowUDF,
udaf,
udf,
udtf,
udwf,
)
# Resolve the package version from the installed distribution's metadata so the
# version string is defined in exactly one place (the package metadata built
# from pyproject) rather than duplicated in source.
__version__ = importlib_metadata.version(__name__)
__all__ = [
"Accumulator",
"AggregateUDF",
"Catalog",
"Config",
"CsvReadOptions",
"DFSchema",
"DataFrame",
"DataFrameWriteOptions",
"ExecutionPlan",
"ExplainFormat",
"Expr",
"InsertOp",
"LogicalPlan",
"Metric",
"MetricsSet",
"ParquetColumnOptions",
"ParquetWriterOptions",
"RecordBatch",
"RecordBatchStream",
"RuntimeEnvBuilder",
"SQLOptions",
"ScalarUDF",
"SessionConfig",
"SessionContext",
"Table",
"TableFunction",
"WindowFrame",
"WindowUDF",
"catalog",
"col",
"column",
"common",
"configure_formatter",
"expr",
"functions",
"lit",
"literal",
"object_store",
"options",
"read_avro",
"read_csv",
"read_json",
"read_parquet",
"substrait",
"udaf",
"udf",
"udtf",
"udwf",
"unparser",
]
def literal(value: Any) -> Expr:
    """Wrap *value* in a literal (constant scalar) expression.

    Accepts any value that PyArrow can represent as a scalar.
    """
    scalar_expr = Expr.literal(value)
    return scalar_expr
def string_literal(value: str) -> Expr:
    """Create a literal expression with the Arrow UTF8 type.

    This differs from :py:func:`literal`, which produces a UTF8view
    literal for string input.
    """
    utf8_expr = Expr.string_literal(value)
    return utf8_expr
def str_lit(value: str) -> Expr:
    """Shorthand alias for :py:func:`string_literal`."""
    return string_literal(value)
def lit(value: Any) -> Expr:
    """Create a literal expression.

    Shorthand for :py:func:`literal`. Delegates to it (matching how
    :py:func:`str_lit` and :py:func:`lit_with_metadata` delegate to their
    long-form counterparts) so the two spellings cannot drift apart.
    """
    return literal(value)
def literal_with_metadata(value: Any, metadata: dict[str, str]) -> Expr:
    """Build a scalar literal expression that carries field metadata.

    Args:
        value: A valid PyArrow scalar value or easily castable to one.
        metadata: Metadata to attach to the expression.

    Returns:
        A literal expression for ``value`` with ``metadata`` attached.
    """
    annotated_expr = Expr.literal_with_metadata(value, metadata)
    return annotated_expr
def lit_with_metadata(value: Any, metadata: dict[str, str]) -> Expr:
    """Shorthand alias for :py:func:`literal_with_metadata`."""
    return literal_with_metadata(value, metadata)