python/pyarrow/_json.pyx - arrow - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 # cython: profile=False
 # distutils: language = c++
 # cython: embedsignature = True
 # cython: language_level = 3

 from pyarrow.includes.common cimport *
 from pyarrow.includes.libarrow cimport *
 from pyarrow.lib cimport (check_status, Field, MemoryPool, ensure_type,
                           maybe_unbox_memory_pool, get_input_stream,
                           pyarrow_wrap_table, pyarrow_wrap_data_type,
                           pyarrow_unwrap_data_type, pyarrow_wrap_schema,
                           pyarrow_unwrap_schema)


 cdef class ReadOptions:
     """
     Options for reading JSON files.

     Parameters
     ----------
     use_threads : bool, optional (default True)
         Whether to use multiple threads to accelerate reading
     block_size : int, optional
         How much bytes to process at a time from the input stream.
         This will determine multi-threading granularity as well as
         the size of individual chunks in the Table.
     """
     cdef:
         CJSONReadOptions options

     # Avoid mistakingly creating attributes
     __slots__ = ()

     def __init__(self, use_threads=None, block_size=None):
         self.options = CJSONReadOptions.Defaults()
         if use_threads is not None:
             self.use_threads = use_threads
         if block_size is not None:
             self.block_size = block_size

     @property
     def use_threads(self):
         """
         Whether to use multiple threads to accelerate reading.
         """
         return self.options.use_threads

     @use_threads.setter
     def use_threads(self, value):
         self.options.use_threads = value

     @property
     def block_size(self):
         """
         How much bytes to process at a time from the input stream.

         This will determine multi-threading granularity as well as the size of
         individual chunks in the Table.
         """
         return self.options.block_size

     @block_size.setter
     def block_size(self, value):
         self.options.block_size = value


 cdef class ParseOptions:
     """
     Options for parsing JSON files.

     Parameters
     ----------
     explicit_schema: Schema, optional (default None)
         Optional explicit schema (no type inference, ignores other fields).
     newlines_in_values: bool, optional (default False)
         Whether objects may be printed across multiple lines (for example
         pretty printed). If false, input must end with an empty line.
     unexpected_field_behavior: str, default "infer"
         How JSON fields outside of explicit_schema (if given) are treated.

         Possible behaviors:

          - "ignore": unexpected JSON fields are ignored
          - "error": error out on unexpected JSON fields
          - "infer": unexpected JSON fields are type-inferred and included in
            the output
     """

     cdef:
         CJSONParseOptions options

     __slots__ = ()

     def __init__(self, explicit_schema=None, newlines_in_values=None,
                  unexpected_field_behavior=None):
         self.options = CJSONParseOptions.Defaults()
         if explicit_schema is not None:
             self.explicit_schema = explicit_schema
         if newlines_in_values is not None:
             self.newlines_in_values = newlines_in_values
         if unexpected_field_behavior is not None:
             self.unexpected_field_behavior = unexpected_field_behavior

     @property
     def explicit_schema(self):
         """
         Optional explicit schema (no type inference, ignores other fields)
         """
         if self.options.explicit_schema.get() == NULL:
             return None
         else:
             return pyarrow_wrap_schema(self.options.explicit_schema)

     @explicit_schema.setter
     def explicit_schema(self, value):
         self.options.explicit_schema = pyarrow_unwrap_schema(value)

     @property
     def newlines_in_values(self):
         """
         Whether newline characters are allowed in JSON values.
         Setting this to True reduces the performance of multi-threaded
         JSON reading.
         """
         return self.options.newlines_in_values

     @newlines_in_values.setter
     def newlines_in_values(self, value):
         self.options.newlines_in_values = value

     @property
     def unexpected_field_behavior(self):
         """
         How JSON fields outside of explicit_schema (if given) are treated.

         Possible behaviors:

          - "ignore": unexpected JSON fields are ignored
          - "error": error out on unexpected JSON fields
          - "infer": unexpected JSON fields are type-inferred and included in
            the output

         Set to "infer" by default.
         """
         v = self.options.unexpected_field_behavior
         if v == CUnexpectedFieldBehavior_Ignore:
             return "ignore"
         elif v == CUnexpectedFieldBehavior_Error:
             return "error"
         elif v == CUnexpectedFieldBehavior_InferType:
             return "infer"
         else:
             raise ValueError('Unexpected value for unexpected_field_behavior')

     @unexpected_field_behavior.setter
     def unexpected_field_behavior(self, value):
         cdef CUnexpectedFieldBehavior v

         if value == "ignore":
             v = CUnexpectedFieldBehavior_Ignore
         elif value == "error":
             v = CUnexpectedFieldBehavior_Error
         elif value == "infer":
             v = CUnexpectedFieldBehavior_InferType
         else:
             raise ValueError(
                 "Unexpected value `{}` for `unexpected_field_behavior`, pass "
                 "either `ignore`, `error` or `infer`.".format(value)
             )

         self.options.unexpected_field_behavior = v


 cdef _get_reader(input_file, shared_ptr[CInputStream]* out):
     use_memory_map = False
     get_input_stream(input_file, use_memory_map, out)

 cdef _get_read_options(ReadOptions read_options, CJSONReadOptions* out):
     if read_options is None:
         out[0] = CJSONReadOptions.Defaults()
     else:
         out[0] = read_options.options

 cdef _get_parse_options(ParseOptions parse_options, CJSONParseOptions* out):
     if parse_options is None:
         out[0] = CJSONParseOptions.Defaults()
     else:
         out[0] = parse_options.options


 def read_json(input_file, read_options=None, parse_options=None,
               MemoryPool memory_pool=None):
     """
     Read a Table from a stream of JSON data.

     Parameters
     ----------
     input_file: string, path or file-like object
         The location of JSON data.
     read_options: pyarrow.json.ReadOptions, optional
         Options for the JSON reader (see ReadOptions constructor for defaults)
     parse_options: pyarrow.json.ParseOptions, optional
         Options for the JSON parser
         (see ParseOptions constructor for defaults)
     memory_pool: MemoryPool, optional
         Pool to allocate Table memory from

     Returns
     -------
     :class:`pyarrow.Table`
         Contents of the JSON file as a in-memory table.
     """
     cdef:
         shared_ptr[CInputStream] stream
         CJSONReadOptions c_read_options
         CJSONParseOptions c_parse_options
         shared_ptr[CJSONReader] reader
         shared_ptr[CTable] table

     _get_reader(input_file, &stream)
     _get_read_options(read_options, &c_read_options)
     _get_parse_options(parse_options, &c_parse_options)

     reader = GetResultValue(
         CJSONReader.Make(maybe_unbox_memory_pool(memory_pool),
                          stream, c_read_options, c_parse_options))

     with nogil:
         table = GetResultValue(reader.get().Read())

     return pyarrow_wrap_table(table)
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	# cython: profile=False
	# distutils: language = c++
	# cython: embedsignature = True
	# cython: language_level = 3

	from pyarrow.includes.common cimport *
	from pyarrow.includes.libarrow cimport *
	from pyarrow.lib cimport (check_status, Field, MemoryPool, ensure_type,
	maybe_unbox_memory_pool, get_input_stream,
	pyarrow_wrap_table, pyarrow_wrap_data_type,
	pyarrow_unwrap_data_type, pyarrow_wrap_schema,
	pyarrow_unwrap_schema)


	cdef class ReadOptions:
	"""
	Options for reading JSON files.

	Parameters
	----------
	use_threads : bool, optional (default True)
	Whether to use multiple threads to accelerate reading
	block_size : int, optional
	How much bytes to process at a time from the input stream.
	This will determine multi-threading granularity as well as
	the size of individual chunks in the Table.
	"""
	cdef:
	CJSONReadOptions options

	# Avoid mistakingly creating attributes
	__slots__ = ()

	def __init__(self, use_threads=None, block_size=None):
	self.options = CJSONReadOptions.Defaults()
	if use_threads is not None:
	self.use_threads = use_threads
	if block_size is not None:
	self.block_size = block_size

	@property
	def use_threads(self):
	"""
	Whether to use multiple threads to accelerate reading.
	"""
	return self.options.use_threads

	@use_threads.setter
	def use_threads(self, value):
	self.options.use_threads = value

	@property
	def block_size(self):
	"""
	How much bytes to process at a time from the input stream.

	This will determine multi-threading granularity as well as the size of
	individual chunks in the Table.
	"""
	return self.options.block_size

	@block_size.setter
	def block_size(self, value):
	self.options.block_size = value


	cdef class ParseOptions:
	"""
	Options for parsing JSON files.

	Parameters
	----------
	explicit_schema: Schema, optional (default None)
	Optional explicit schema (no type inference, ignores other fields).
	newlines_in_values: bool, optional (default False)
	Whether objects may be printed across multiple lines (for example
	pretty printed). If false, input must end with an empty line.
	unexpected_field_behavior: str, default "infer"
	How JSON fields outside of explicit_schema (if given) are treated.

	Possible behaviors:

	- "ignore": unexpected JSON fields are ignored
	- "error": error out on unexpected JSON fields
	- "infer": unexpected JSON fields are type-inferred and included in
	the output
	"""

	cdef:
	CJSONParseOptions options

	__slots__ = ()

	def __init__(self, explicit_schema=None, newlines_in_values=None,
	unexpected_field_behavior=None):
	self.options = CJSONParseOptions.Defaults()
	if explicit_schema is not None:
	self.explicit_schema = explicit_schema
	if newlines_in_values is not None:
	self.newlines_in_values = newlines_in_values
	if unexpected_field_behavior is not None:
	self.unexpected_field_behavior = unexpected_field_behavior

	@property
	def explicit_schema(self):
	"""
	Optional explicit schema (no type inference, ignores other fields)
	"""
	if self.options.explicit_schema.get() == NULL:
	return None
	else:
	return pyarrow_wrap_schema(self.options.explicit_schema)

	@explicit_schema.setter
	def explicit_schema(self, value):
	self.options.explicit_schema = pyarrow_unwrap_schema(value)

	@property
	def newlines_in_values(self):
	"""
	Whether newline characters are allowed in JSON values.
	Setting this to True reduces the performance of multi-threaded
	JSON reading.
	"""
	return self.options.newlines_in_values

	@newlines_in_values.setter
	def newlines_in_values(self, value):
	self.options.newlines_in_values = value

	@property
	def unexpected_field_behavior(self):
	"""
	How JSON fields outside of explicit_schema (if given) are treated.

	Possible behaviors:

	- "ignore": unexpected JSON fields are ignored
	- "error": error out on unexpected JSON fields
	- "infer": unexpected JSON fields are type-inferred and included in
	the output

	Set to "infer" by default.
	"""
	v = self.options.unexpected_field_behavior
	if v == CUnexpectedFieldBehavior_Ignore:
	return "ignore"
	elif v == CUnexpectedFieldBehavior_Error:
	return "error"
	elif v == CUnexpectedFieldBehavior_InferType:
	return "infer"
	else:
	raise ValueError('Unexpected value for unexpected_field_behavior')

	@unexpected_field_behavior.setter
	def unexpected_field_behavior(self, value):
	cdef CUnexpectedFieldBehavior v

	if value == "ignore":
	v = CUnexpectedFieldBehavior_Ignore
	elif value == "error":
	v = CUnexpectedFieldBehavior_Error
	elif value == "infer":
	v = CUnexpectedFieldBehavior_InferType
	else:
	raise ValueError(
	"Unexpected value `{}` for `unexpected_field_behavior`, pass "
	"either `ignore`, `error` or `infer`.".format(value)
	)

	self.options.unexpected_field_behavior = v


	cdef _get_reader(input_file, shared_ptr[CInputStream]* out):
	use_memory_map = False
	get_input_stream(input_file, use_memory_map, out)

	cdef _get_read_options(ReadOptions read_options, CJSONReadOptions* out):
	if read_options is None:
	out[0] = CJSONReadOptions.Defaults()
	else:
	out[0] = read_options.options

	cdef _get_parse_options(ParseOptions parse_options, CJSONParseOptions* out):
	if parse_options is None:
	out[0] = CJSONParseOptions.Defaults()
	else:
	out[0] = parse_options.options


	def read_json(input_file, read_options=None, parse_options=None,
	MemoryPool memory_pool=None):
	"""
	Read a Table from a stream of JSON data.

	Parameters
	----------
	input_file: string, path or file-like object
	The location of JSON data.
	read_options: pyarrow.json.ReadOptions, optional
	Options for the JSON reader (see ReadOptions constructor for defaults)
	parse_options: pyarrow.json.ParseOptions, optional
	Options for the JSON parser
	(see ParseOptions constructor for defaults)
	memory_pool: MemoryPool, optional
	Pool to allocate Table memory from

	Returns
	-------
	:class:`pyarrow.Table`
	Contents of the JSON file as a in-memory table.
	"""
	cdef:
	shared_ptr[CInputStream] stream
	CJSONReadOptions c_read_options
	CJSONParseOptions c_parse_options
	shared_ptr[CJSONReader] reader
	shared_ptr[CTable] table

	_get_reader(input_file, &stream)
	_get_read_options(read_options, &c_read_options)
	_get_parse_options(parse_options, &c_parse_options)

	reader = GetResultValue(
	CJSONReader.Make(maybe_unbox_memory_pool(memory_pool),
	stream, c_read_options, c_parse_options))

	with nogil:
	table = GetResultValue(reader.get().Read())

	return pyarrow_wrap_table(table)