# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Options for reading various file formats."""
from __future__ import annotations
import warnings
from typing import TYPE_CHECKING
import pyarrow as pa
from datafusion.expr import sort_list_to_raw_sort_list
from ._internal import options
if TYPE_CHECKING:
    from datafusion.expr import SortExpr
__all__ = ["CsvReadOptions"]
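# Default maximum number of records read from a CSV file when inferring its schema.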
DEFAULT_MAX_INFER_SCHEMA = 1000
class CsvReadOptions:
"""Options for reading CSV files.
This class provides a builder pattern for configuring CSV reading options.
All methods starting with ``with_`` return ``self`` to allow method chaining.
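
    Example of constructing options and chaining ``with_`` methods::

        options = (
            CsvReadOptions(has_header=True, delimiter=";")
            .with_file_extension(".csv")
            .with_truncated_rows(True)
        )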
"""
def __init__(
self,
*,
has_header: bool = True,
delimiter: str = ",",
quote: str = '"',
terminator: str | None = None,
escape: str | None = None,
comment: str | None = None,
newlines_in_values: bool = False,
schema: pa.Schema | None = None,
schema_infer_max_records: int = DEFAULT_MAX_INFER_SCHEMA,
file_extension: str = ".csv",
table_partition_cols: list[tuple[str, pa.DataType]] | None = None,
file_compression_type: str = "",
file_sort_order: list[list[SortExpr]] | None = None,
null_regex: str | None = None,
truncated_rows: bool = False,
) -> None:
"""Initialize CsvReadOptions.
Args:
has_header: Does the CSV file have a header row? If schema inference
is run on a file with no headers, default column names are created.
delimiter: Column delimiter character. Must be a single ASCII character.
quote: Quote character for fields containing delimiters or newlines.
Must be a single ASCII character.
terminator: Optional line terminator character. If ``None``, uses CRLF.
Must be a single ASCII character.
escape: Optional escape character for quotes. Must be a single ASCII
character.
comment: If specified, lines beginning with this character are ignored.
Must be a single ASCII character.
newlines_in_values: Whether newlines in quoted values are supported.
Parsing newlines in quoted values may be affected by execution
behavior such as parallel file scanning. Setting this to ``True``
ensures that newlines in values are parsed successfully, which may
reduce performance.
schema: Optional PyArrow schema representing the CSV files. If ``None``,
the CSV reader will try to infer it based on data in the file.
schema_infer_max_records: Maximum number of rows to read from CSV files
for schema inference if needed.
file_extension: File extension; only files with this extension are
selected for data input.
table_partition_cols: Partition columns as a list of tuples of
(column_name, data_type).
file_compression_type: File compression type. Supported values are
``"gzip"``, ``"bz2"``, ``"xz"``, ``"zstd"``, or empty string for
uncompressed.
file_sort_order: Optional sort order of the files as a list of sort
expressions per file.
null_regex: Optional regex pattern to match null values in the CSV.
            truncated_rows: Whether to allow truncated rows when parsing. By default
                this is ``False`` and an error is raised if CSV rows have different
                lengths. When set to ``True``, records with fewer than the expected
                number of columns are accepted and the missing columns are filled
                with nulls. If the record's schema is not nullable, an error is
                still returned.
"""
validate_single_character("delimiter", delimiter)
validate_single_character("quote", quote)
validate_single_character("terminator", terminator)
validate_single_character("escape", escape)
validate_single_character("comment", comment)
self.has_header = has_header
self.delimiter = delimiter
self.quote = quote
self.terminator = terminator
self.escape = escape
self.comment = comment
self.newlines_in_values = newlines_in_values
self.schema = schema
self.schema_infer_max_records = schema_infer_max_records
self.file_extension = file_extension
self.table_partition_cols = table_partition_cols or []
self.file_compression_type = file_compression_type
self.file_sort_order = file_sort_order or []
self.null_regex = null_regex
self.truncated_rows = truncated_rows
def with_has_header(self, has_header: bool) -> CsvReadOptions:
"""Configure whether the CSV has a header row."""
self.has_header = has_header
return self
    def with_delimiter(self, delimiter: str) -> CsvReadOptions:
        """Configure the column delimiter."""
        validate_single_character("delimiter", delimiter)
        self.delimiter = delimiter
        return self
    def with_quote(self, quote: str) -> CsvReadOptions:
        """Configure the quote character."""
        validate_single_character("quote", quote)
        self.quote = quote
        return self
    def with_terminator(self, terminator: str | None) -> CsvReadOptions:
        """Configure the line terminator character."""
        validate_single_character("terminator", terminator)
        self.terminator = terminator
        return self
    def with_escape(self, escape: str | None) -> CsvReadOptions:
        """Configure the escape character."""
        validate_single_character("escape", escape)
        self.escape = escape
        return self
    def with_comment(self, comment: str | None) -> CsvReadOptions:
        """Configure the comment character."""
        validate_single_character("comment", comment)
        self.comment = comment
        return self
def with_newlines_in_values(self, newlines_in_values: bool) -> CsvReadOptions:
"""Configure whether newlines in values are supported."""
self.newlines_in_values = newlines_in_values
return self
def with_schema(self, schema: pa.Schema | None) -> CsvReadOptions:
"""Configure the schema."""
self.schema = schema
return self
def with_schema_infer_max_records(
self, schema_infer_max_records: int
) -> CsvReadOptions:
"""Configure maximum records for schema inference."""
self.schema_infer_max_records = schema_infer_max_records
return self
def with_file_extension(self, file_extension: str) -> CsvReadOptions:
"""Configure the file extension filter."""
self.file_extension = file_extension
return self
def with_table_partition_cols(
self, table_partition_cols: list[tuple[str, pa.DataType]]
) -> CsvReadOptions:
"""Configure table partition columns."""
self.table_partition_cols = table_partition_cols
return self
def with_file_compression_type(self, file_compression_type: str) -> CsvReadOptions:
"""Configure file compression type."""
self.file_compression_type = file_compression_type
return self
def with_file_sort_order(
self, file_sort_order: list[list[SortExpr]]
) -> CsvReadOptions:
"""Configure file sort order."""
self.file_sort_order = file_sort_order
return self
def with_null_regex(self, null_regex: str | None) -> CsvReadOptions:
"""Configure null value regex pattern."""
self.null_regex = null_regex
return self
def with_truncated_rows(self, truncated_rows: bool) -> CsvReadOptions:
"""Configure whether to allow truncated rows."""
self.truncated_rows = truncated_rows
return self
def to_inner(self) -> options.CsvReadOptions:
"""Convert this object into the underlying Rust structure.
This is intended for internal use only.
"""
file_sort_order = (
[]
if self.file_sort_order is None
else [
sort_list_to_raw_sort_list(sort_list)
for sort_list in self.file_sort_order
]
)
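        # Single-character options are passed to the underlying Rust structure
        # as integer code points, hence the ord() conversions below.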
return options.CsvReadOptions(
has_header=self.has_header,
delimiter=ord(self.delimiter[0]) if self.delimiter else ord(","),
quote=ord(self.quote[0]) if self.quote else ord('"'),
terminator=ord(self.terminator[0]) if self.terminator else None,
escape=ord(self.escape[0]) if self.escape else None,
comment=ord(self.comment[0]) if self.comment else None,
newlines_in_values=self.newlines_in_values,
schema=self.schema,
schema_infer_max_records=self.schema_infer_max_records,
file_extension=self.file_extension,
table_partition_cols=_convert_table_partition_cols(
self.table_partition_cols
),
file_compression_type=self.file_compression_type or "",
file_sort_order=file_sort_order,
null_regex=self.null_regex,
truncated_rows=self.truncated_rows,
)
def validate_single_character(name: str, value: str | None) -> None:
    """Raise ``ValueError`` if ``value`` is not ``None`` and not exactly one character."""
if value is not None and len(value) != 1:
message = f"{name} must be a single character"
raise ValueError(message)
def _convert_table_partition_cols(
table_partition_cols: list[tuple[str, str | pa.DataType]],
) -> list[tuple[str, pa.DataType]]:
    """Convert partition column types given as string literals to PyArrow data types."""
warn = False
converted_table_partition_cols = []
for col, data_type in table_partition_cols:
if isinstance(data_type, str):
warn = True
if data_type == "string":
converted_data_type = pa.string()
elif data_type == "int":
converted_data_type = pa.int32()
else:
message = (
f"Unsupported literal data type '{data_type}' for partition "
"column. Supported types are 'string' and 'int'"
)
raise ValueError(message)
else:
converted_data_type = data_type
converted_table_partition_cols.append((col, converted_data_type))
if warn:
        message = (
            "using literals for table_partition_cols data types is deprecated; "
            "use pyarrow types instead"
        )
warnings.warn(
message,
category=DeprecationWarning,
stacklevel=2,
)
return converted_table_partition_cols