# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# cython: profile=False
# distutils: language = c++
# cython: embedsignature = True
# cython: language_level = 3

from __future__ import absolute_import

from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.lib cimport (check_status, Field, MemoryPool, ensure_type,
maybe_unbox_memory_pool, get_input_stream,
pyarrow_wrap_table, pyarrow_wrap_data_type,
pyarrow_unwrap_data_type)
from pyarrow.compat import frombytes, tobytes, Mapping


cdef unsigned char _single_char(s) except 0:
val = ord(s)
if val == 0 or val > 127:
raise ValueError("Expecting an ASCII character")
    return <unsigned char> val


cdef class ReadOptions:
"""
    Options for reading CSV files.

    Parameters
    ----------
    use_threads : bool, optional (default True)
        Whether to use multiple threads to accelerate reading.
    block_size : int, optional
        How many bytes to process at a time from the input stream.
This will determine multi-threading granularity as well as
the size of individual chunks in the Table.
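
    Examples
    --------
    A rough usage sketch; the block size below is an illustrative
    assumption, not a recommended value.

    >>> import pyarrow.csv as csv
    >>> opts = csv.ReadOptions(use_threads=True, block_size=4096)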
"""
cdef:
CCSVReadOptions options
    # Avoid mistakenly creating attributes
__slots__ = ()
def __init__(self, use_threads=None, block_size=None):
self.options = CCSVReadOptions.Defaults()
if use_threads is not None:
self.use_threads = use_threads
if block_size is not None:
self.block_size = block_size
@property
def use_threads(self):
"""
Whether to use multiple threads to accelerate reading.
"""
return self.options.use_threads
@use_threads.setter
def use_threads(self, value):
self.options.use_threads = value
@property
def block_size(self):
"""
        How many bytes to process at a time from the input stream.
This will determine multi-threading granularity as well as
the size of individual chunks in the Table.
"""
return self.options.block_size
@block_size.setter
def block_size(self, value):
        self.options.block_size = value


cdef class ParseOptions:
"""
    Options for parsing CSV files.

    Parameters
----------
delimiter: 1-character string, optional (default ',')
The character delimiting individual cells in the CSV data.
quote_char: 1-character string or False, optional (default '"')
The character used optionally for quoting CSV values
(False if quoting is not allowed).
double_quote: bool, optional (default True)
Whether two quotes in a quoted CSV value denote a single quote
in the data.
escape_char: 1-character string or False, optional (default False)
The character used optionally for escaping special characters
(False if escaping is not allowed).
header_rows: int, optional (default 1)
The number of rows to skip at the start of the CSV data.
newlines_in_values: bool, optional (default False)
Whether newline characters are allowed in CSV values.
Setting this to True reduces the performance of multi-threaded
CSV reading.
ignore_empty_lines: bool, optional (default True)
Whether empty lines are ignored in CSV input.
If False, an empty line is interpreted as containing a single empty
value (assuming a one-column CSV file).
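
    Examples
    --------
    A minimal sketch; the semicolon delimiter is an illustrative
    assumption about the input data.

    >>> import pyarrow.csv as csv
    >>> opts = csv.ParseOptions(delimiter=';', ignore_empty_lines=True)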
"""
cdef:
CCSVParseOptions options
__slots__ = ()
def __init__(self, delimiter=None, quote_char=None, double_quote=None,
escape_char=None, header_rows=None, newlines_in_values=None,
ignore_empty_lines=None):
self.options = CCSVParseOptions.Defaults()
if delimiter is not None:
self.delimiter = delimiter
if quote_char is not None:
self.quote_char = quote_char
if double_quote is not None:
self.double_quote = double_quote
if escape_char is not None:
self.escape_char = escape_char
if header_rows is not None:
self.header_rows = header_rows
if newlines_in_values is not None:
self.newlines_in_values = newlines_in_values
if ignore_empty_lines is not None:
self.ignore_empty_lines = ignore_empty_lines
@property
def delimiter(self):
"""
The character delimiting individual cells in the CSV data.
"""
return chr(self.options.delimiter)
@delimiter.setter
def delimiter(self, value):
self.options.delimiter = _single_char(value)
@property
def quote_char(self):
"""
The character used optionally for quoting CSV values
(False if quoting is not allowed).
"""
if self.options.quoting:
return chr(self.options.quote_char)
else:
return False
@quote_char.setter
def quote_char(self, value):
if value is False:
self.options.quoting = False
else:
self.options.quote_char = _single_char(value)
self.options.quoting = True
@property
def double_quote(self):
"""
Whether two quotes in a quoted CSV value denote a single quote
in the data.
"""
return self.options.double_quote
@double_quote.setter
def double_quote(self, value):
self.options.double_quote = value
@property
def escape_char(self):
"""
The character used optionally for escaping special characters
(False if escaping is not allowed).
"""
if self.options.escaping:
return chr(self.options.escape_char)
else:
return False
@escape_char.setter
def escape_char(self, value):
if value is False:
self.options.escaping = False
else:
self.options.escape_char = _single_char(value)
self.options.escaping = True
@property
def header_rows(self):
"""
The number of rows to skip at the start of the CSV data.
"""
return self.options.header_rows
@header_rows.setter
def header_rows(self, value):
self.options.header_rows = value
@property
def newlines_in_values(self):
"""
Whether newline characters are allowed in CSV values.
Setting this to True reduces the performance of multi-threaded
CSV reading.
"""
return self.options.newlines_in_values
@newlines_in_values.setter
def newlines_in_values(self, value):
self.options.newlines_in_values = value
@property
def ignore_empty_lines(self):
"""
Whether empty lines are ignored in CSV input.
If False, an empty line is interpreted as containing a single empty
value (assuming a one-column CSV file).
"""
return self.options.ignore_empty_lines
@ignore_empty_lines.setter
def ignore_empty_lines(self, value):
        self.options.ignore_empty_lines = value


cdef class ConvertOptions:
"""
    Options for converting CSV data.

    Parameters
----------
check_utf8 : bool, optional (default True)
Whether to check UTF8 validity of string columns.
column_types: dict, optional
Map column names to column types
(disabling type inference on those columns).
null_values: list, optional
A sequence of strings that denote nulls in the data
(defaults are appropriate in most cases).
true_values: list, optional
A sequence of strings that denote true booleans in the data
(defaults are appropriate in most cases).
false_values: list, optional
A sequence of strings that denote false booleans in the data
(defaults are appropriate in most cases).
strings_can_be_null: bool, optional (default False)
Whether string / binary columns can have null values.
If true, then strings in null_values are considered null for
string columns.
If false, then all strings are valid string values.
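
    Examples
    --------
    A rough sketch; the column name "id" and the null spellings are
    illustrative assumptions about the data being read.

    >>> import pyarrow as pa
    >>> import pyarrow.csv as csv
    >>> opts = csv.ConvertOptions(column_types={'id': pa.int64()},
    ...                           null_values=['', 'NULL', 'N/A'])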
"""
cdef:
CCSVConvertOptions options
    # Avoid mistakenly creating attributes
__slots__ = ()
def __init__(self, check_utf8=None, column_types=None, null_values=None,
true_values=None, false_values=None,
strings_can_be_null=None):
self.options = CCSVConvertOptions.Defaults()
if check_utf8 is not None:
self.check_utf8 = check_utf8
if column_types is not None:
self.column_types = column_types
if null_values is not None:
self.null_values = null_values
if true_values is not None:
self.true_values = true_values
if false_values is not None:
self.false_values = false_values
if strings_can_be_null is not None:
self.strings_can_be_null = strings_can_be_null
@property
def check_utf8(self):
"""
Whether to check UTF8 validity of string columns.
"""
return self.options.check_utf8
@check_utf8.setter
def check_utf8(self, value):
self.options.check_utf8 = value
@property
def strings_can_be_null(self):
"""
Whether string / binary columns can have null values.
"""
return self.options.strings_can_be_null
@strings_can_be_null.setter
def strings_can_be_null(self, value):
self.options.strings_can_be_null = value
@property
def column_types(self):
"""
Map column names to column types
(disabling type inference on those columns).
"""
d = {frombytes(item.first): pyarrow_wrap_data_type(item.second)
for item in self.options.column_types}
return d
@column_types.setter
def column_types(self, value):
cdef:
shared_ptr[CDataType] typ
if isinstance(value, Mapping):
value = value.items()
self.options.column_types.clear()
for item in value:
if isinstance(item, Field):
k = item.name
v = item.type
else:
k, v = item
typ = pyarrow_unwrap_data_type(ensure_type(v))
assert typ != NULL
self.options.column_types[tobytes(k)] = typ
@property
def null_values(self):
"""
A sequence of strings that denote nulls in the data.
"""
return [frombytes(x) for x in self.options.null_values]
@null_values.setter
def null_values(self, value):
self.options.null_values = [tobytes(x) for x in value]
@property
def true_values(self):
"""
A sequence of strings that denote true booleans in the data.
"""
return [frombytes(x) for x in self.options.true_values]
@true_values.setter
def true_values(self, value):
self.options.true_values = [tobytes(x) for x in value]
@property
def false_values(self):
"""
A sequence of strings that denote false booleans in the data.
"""
return [frombytes(x) for x in self.options.false_values]
@false_values.setter
def false_values(self, value):
        self.options.false_values = [tobytes(x) for x in value]


cdef _get_reader(input_file, shared_ptr[InputStream]* out):
use_memory_map = False
    get_input_stream(input_file, use_memory_map, out)


cdef _get_read_options(ReadOptions read_options, CCSVReadOptions* out):
if read_options is None:
out[0] = CCSVReadOptions.Defaults()
else:
        out[0] = read_options.options


cdef _get_parse_options(ParseOptions parse_options, CCSVParseOptions* out):
if parse_options is None:
out[0] = CCSVParseOptions.Defaults()
else:
        out[0] = parse_options.options


cdef _get_convert_options(ConvertOptions convert_options,
CCSVConvertOptions* out):
if convert_options is None:
out[0] = CCSVConvertOptions.Defaults()
else:
        out[0] = convert_options.options


def read_csv(input_file, read_options=None, parse_options=None,
convert_options=None, MemoryPool memory_pool=None):
"""
    Read a Table from a stream of CSV data.

    Parameters
----------
input_file: string, path or file-like object
The location of CSV data. If a string or path, and if it ends
with a recognized compressed file extension (e.g. ".gz" or ".bz2"),
the data is automatically decompressed when reading.
read_options: pyarrow.csv.ReadOptions, optional
Options for the CSV reader (see pyarrow.csv.ReadOptions constructor
for defaults)
parse_options: pyarrow.csv.ParseOptions, optional
Options for the CSV parser
(see pyarrow.csv.ParseOptions constructor for defaults)
convert_options: pyarrow.csv.ConvertOptions, optional
Options for converting CSV data
(see pyarrow.csv.ConvertOptions constructor for defaults)
memory_pool: MemoryPool, optional
        Pool to allocate Table memory from.

    Returns
    -------
    :class:`pyarrow.Table`
        Contents of the CSV file as an in-memory table.
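
    Examples
    --------
    A rough sketch of typical usage; the file name "example.csv" is an
    illustrative assumption.

    >>> import pyarrow.csv as csv
    >>> table = csv.read_csv("example.csv")
    >>> df = table.to_pandas()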
"""
cdef:
shared_ptr[InputStream] stream
CCSVReadOptions c_read_options
CCSVParseOptions c_parse_options
CCSVConvertOptions c_convert_options
shared_ptr[CCSVReader] reader
shared_ptr[CTable] table
_get_reader(input_file, &stream)
_get_read_options(read_options, &c_read_options)
_get_parse_options(parse_options, &c_parse_options)
_get_convert_options(convert_options, &c_convert_options)
check_status(CCSVReader.Make(maybe_unbox_memory_pool(memory_pool),
stream, c_read_options, c_parse_options,
c_convert_options, &reader))
with nogil:
check_status(reader.get().Read(&table))
return pyarrow_wrap_table(table)