| import os |
| import re |
| import functools |
| import itertools |
| import warnings |
| import weakref |
| import contextlib |
| import operator |
| from operator import itemgetter, index as opindex, methodcaller |
| from collections.abc import Mapping |
| |
| import numpy as np |
| from . import format |
| from ._datasource import DataSource |
| from numpy.core import overrides |
| from numpy.core.multiarray import packbits, unpackbits |
| from numpy.core._multiarray_umath import _load_from_filelike |
| from numpy.core.overrides import set_array_function_like_doc, set_module |
| from ._iotools import ( |
| LineSplitter, NameValidator, StringConverter, ConverterError, |
| ConverterLockError, ConversionWarning, _is_string_like, |
| has_nested_fields, flatten_dtype, easy_dtype, _decode_line |
| ) |
| |
| from numpy.compat import ( |
| asbytes, asstr, asunicode, os_fspath, os_PathLike, |
| pickle |
| ) |
| |
| |
| __all__ = [ |
| 'savetxt', 'loadtxt', 'genfromtxt', |
| 'recfromtxt', 'recfromcsv', 'load', 'save', 'savez', |
| 'savez_compressed', 'packbits', 'unpackbits', 'fromregex', 'DataSource' |
| ] |
| |
| |
| array_function_dispatch = functools.partial( |
| overrides.array_function_dispatch, module='numpy') |
| |
| |
| class BagObj: |
| """ |
| BagObj(obj) |
| |
| Convert attribute look-ups to getitems on the object passed in. |
| |
| Parameters |
| ---------- |
| obj : class instance |
| Object on which attribute look-up is performed. |
| |
| Examples |
| -------- |
| >>> from numpy.lib.npyio import BagObj as BO |
| >>> class BagDemo: |
| ... def __getitem__(self, key): # An instance of BagObj(BagDemo) |
| ... # will call this method when any |
| ... # attribute look-up is required |
| ... result = "Doesn't matter what you want, " |
| ... return result + "you're gonna get this" |
| ... |
| >>> demo_obj = BagDemo() |
| >>> bagobj = BO(demo_obj) |
| >>> bagobj.hello_there |
| "Doesn't matter what you want, you're gonna get this" |
| >>> bagobj.I_can_be_anything |
| "Doesn't matter what you want, you're gonna get this" |
| |
| """ |
| |
| def __init__(self, obj): |
| # Use weakref to make NpzFile objects collectable by refcount |
| self._obj = weakref.proxy(obj) |
| |
| def __getattribute__(self, key): |
| try: |
| return object.__getattribute__(self, '_obj')[key] |
| except KeyError: |
| raise AttributeError(key) from None |
| |
| def __dir__(self): |
| """ |
| Enables dir(bagobj) to list the files in an NpzFile. |
| |
| This also enables tab-completion in an interpreter or IPython. |
| """ |
| return list(object.__getattribute__(self, '_obj').keys()) |
| |
| |
| def zipfile_factory(file, *args, **kwargs): |
| """ |
| Create a ZipFile. |
| |
| Allows for Zip64, and the `file` argument can accept file, str, or |
| pathlib.Path objects. `args` and `kwargs` are passed to the zipfile.ZipFile |
| constructor. |
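
    A minimal usage sketch (writing to an in-memory buffer):

    >>> import io
    >>> zf = zipfile_factory(io.BytesIO(), mode="w")
    >>> zf.writestr("hello.txt", b"hi")
    >>> zf.close()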
| """ |
| if not hasattr(file, 'read'): |
| file = os_fspath(file) |
| import zipfile |
| kwargs['allowZip64'] = True |
| return zipfile.ZipFile(file, *args, **kwargs) |
| |
| |
| class NpzFile(Mapping): |
| """ |
| NpzFile(fid) |
| |
| A dictionary-like object with lazy-loading of files in the zipped |
| archive provided on construction. |
| |
| `NpzFile` is used to load files in the NumPy ``.npz`` data archive |
    format. It assumes that files in the archive have a ``.npy`` extension;
    other files are ignored.
| |
| The arrays and file strings are lazily loaded on either |
| getitem access using ``obj['key']`` or attribute lookup using |
| ``obj.f.key``. A list of all files (without ``.npy`` extensions) can |
| be obtained with ``obj.files`` and the ZipFile object itself using |
| ``obj.zip``. |
| |
| Attributes |
| ---------- |
| files : list of str |
| List of all files in the archive with a ``.npy`` extension. |
| zip : ZipFile instance |
| The ZipFile object initialized with the zipped archive. |
| f : BagObj instance |
        An object on which attribute look-up can be performed as an
        alternative to getitem access on the `NpzFile` instance itself.
| allow_pickle : bool, optional |
| Allow loading pickled data. Default: False |
| |
| .. versionchanged:: 1.16.3 |
| Made default False in response to CVE-2019-6446. |
| |
| pickle_kwargs : dict, optional |
| Additional keyword arguments to pass on to pickle.load. |
| These are only useful when loading object arrays saved on |
| Python 2 when using Python 3. |
| max_header_size : int, optional |
| Maximum allowed size of the header. Large headers may not be safe |
| to load securely and thus require explicitly passing a larger value. |
| See :py:func:`ast.literal_eval()` for details. |
| This option is ignored when `allow_pickle` is passed. In that case |
| the file is by definition trusted and the limit is unnecessary. |
| |
| Parameters |
| ---------- |
| fid : file or str |
| The zipped archive to open. This is either a file-like object |
| or a string containing the path to the archive. |
| own_fid : bool, optional |
| Whether NpzFile should close the file handle. |
| Requires that `fid` is a file-like object. |
| |
| Examples |
| -------- |
| >>> from tempfile import TemporaryFile |
| >>> outfile = TemporaryFile() |
| >>> x = np.arange(10) |
| >>> y = np.sin(x) |
| >>> np.savez(outfile, x=x, y=y) |
| >>> _ = outfile.seek(0) |
| |
| >>> npz = np.load(outfile) |
| >>> isinstance(npz, np.lib.npyio.NpzFile) |
| True |
| >>> npz |
    NpzFile 'object' with keys: x, y
| >>> sorted(npz.files) |
| ['x', 'y'] |
| >>> npz['x'] # getitem access |
| array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) |
| >>> npz.f.x # attribute lookup |
| array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) |
| |
| """ |
| # Make __exit__ safe if zipfile_factory raises an exception |
| zip = None |
| fid = None |
| _MAX_REPR_ARRAY_COUNT = 5 |
| |
| def __init__(self, fid, own_fid=False, allow_pickle=False, |
| pickle_kwargs=None, *, |
| max_header_size=format._MAX_HEADER_SIZE): |
| # Import is postponed to here since zipfile depends on gzip, an |
| # optional component of the so-called standard library. |
| _zip = zipfile_factory(fid) |
| self._files = _zip.namelist() |
| self.files = [] |
| self.allow_pickle = allow_pickle |
| self.max_header_size = max_header_size |
| self.pickle_kwargs = pickle_kwargs |
| for x in self._files: |
| if x.endswith('.npy'): |
| self.files.append(x[:-4]) |
| else: |
| self.files.append(x) |
| self.zip = _zip |
| self.f = BagObj(self) |
| if own_fid: |
| self.fid = fid |
| |
| def __enter__(self): |
| return self |
| |
| def __exit__(self, exc_type, exc_value, traceback): |
| self.close() |
| |
| def close(self): |
| """ |
| Close the file. |
| |
| """ |
| if self.zip is not None: |
| self.zip.close() |
| self.zip = None |
| if self.fid is not None: |
| self.fid.close() |
| self.fid = None |
| self.f = None # break reference cycle |
| |
| def __del__(self): |
| self.close() |
| |
| # Implement the Mapping ABC |
| def __iter__(self): |
| return iter(self.files) |
| |
| def __len__(self): |
| return len(self.files) |
| |
| def __getitem__(self, key): |
| # FIXME: This seems like it will copy strings around |
| # more than is strictly necessary. The zipfile |
| # will read the string and then |
| # the format.read_array will copy the string |
| # to another place in memory. |
| # It would be better if the zipfile could read |
| # (or at least uncompress) the data |
| # directly into the array memory. |
| member = False |
| if key in self._files: |
| member = True |
| elif key in self.files: |
| member = True |
| key += '.npy' |
| if member: |
| bytes = self.zip.open(key) |
| magic = bytes.read(len(format.MAGIC_PREFIX)) |
| bytes.close() |
| if magic == format.MAGIC_PREFIX: |
| bytes = self.zip.open(key) |
| return format.read_array(bytes, |
| allow_pickle=self.allow_pickle, |
| pickle_kwargs=self.pickle_kwargs, |
| max_header_size=self.max_header_size) |
| else: |
| return self.zip.read(key) |
| else: |
| raise KeyError(f"{key} is not a file in the archive") |
| |
| def __contains__(self, key): |
| return (key in self._files or key in self.files) |
| |
| def __repr__(self): |
| # Get filename or default to `object` |
| if isinstance(self.fid, str): |
| filename = self.fid |
| else: |
| filename = getattr(self.fid, "name", "object") |
| |
        # Get the names of the arrays
| array_names = ', '.join(self.files[:self._MAX_REPR_ARRAY_COUNT]) |
| if len(self.files) > self._MAX_REPR_ARRAY_COUNT: |
| array_names += "..." |
| return f"NpzFile {filename!r} with keys: {array_names}" |
| |
| |
| @set_module('numpy') |
| def load(file, mmap_mode=None, allow_pickle=False, fix_imports=True, |
| encoding='ASCII', *, max_header_size=format._MAX_HEADER_SIZE): |
| """ |
| Load arrays or pickled objects from ``.npy``, ``.npz`` or pickled files. |
| |
| .. warning:: Loading files that contain object arrays uses the ``pickle`` |
| module, which is not secure against erroneous or maliciously |
| constructed data. Consider passing ``allow_pickle=False`` to |
| load data that is known not to contain object arrays for the |
| safer handling of untrusted sources. |
| |
| Parameters |
| ---------- |
| file : file-like object, string, or pathlib.Path |
| The file to read. File-like objects must support the |
| ``seek()`` and ``read()`` methods and must always |
| be opened in binary mode. Pickled files require that the |
| file-like object support the ``readline()`` method as well. |
| mmap_mode : {None, 'r+', 'r', 'w+', 'c'}, optional |
| If not None, then memory-map the file, using the given mode (see |
| `numpy.memmap` for a detailed description of the modes). A |
| memory-mapped array is kept on disk. However, it can be accessed |
| and sliced like any ndarray. Memory mapping is especially useful |
| for accessing small fragments of large files without reading the |
| entire file into memory. |
| allow_pickle : bool, optional |
| Allow loading pickled object arrays stored in npy files. Reasons for |
| disallowing pickles include security, as loading pickled data can |
| execute arbitrary code. If pickles are disallowed, loading object |
| arrays will fail. Default: False |
| |
| .. versionchanged:: 1.16.3 |
| Made default False in response to CVE-2019-6446. |
| |
| fix_imports : bool, optional |
| Only useful when loading Python 2 generated pickled files on Python 3, |
| which includes npy/npz files containing object arrays. If `fix_imports` |
| is True, pickle will try to map the old Python 2 names to the new names |
| used in Python 3. |
| encoding : str, optional |
| What encoding to use when reading Python 2 strings. Only useful when |
| loading Python 2 generated pickled files in Python 3, which includes |
| npy/npz files containing object arrays. Values other than 'latin1', |
| 'ASCII', and 'bytes' are not allowed, as they can corrupt numerical |
| data. Default: 'ASCII' |
| max_header_size : int, optional |
| Maximum allowed size of the header. Large headers may not be safe |
| to load securely and thus require explicitly passing a larger value. |
| See :py:func:`ast.literal_eval()` for details. |
| This option is ignored when `allow_pickle` is passed. In that case |
| the file is by definition trusted and the limit is unnecessary. |
| |
| Returns |
| ------- |
| result : array, tuple, dict, etc. |
| Data stored in the file. For ``.npz`` files, the returned instance |
| of NpzFile class must be closed to avoid leaking file descriptors. |
| |
| Raises |
| ------ |
| OSError |
| If the input file does not exist or cannot be read. |
| UnpicklingError |
| If ``allow_pickle=True``, but the file cannot be loaded as a pickle. |
| ValueError |
        The file contains an object array, but ``allow_pickle=False``
        was given.
| EOFError |
| When calling ``np.load`` multiple times on the same file handle, |
| if all data has already been read |
| |
| See Also |
| -------- |
| save, savez, savez_compressed, loadtxt |
| memmap : Create a memory-map to an array stored in a file on disk. |
| lib.format.open_memmap : Create or load a memory-mapped ``.npy`` file. |
| |
| Notes |
| ----- |
| - If the file contains pickle data, then whatever object is stored |
| in the pickle is returned. |
| - If the file is a ``.npy`` file, then a single array is returned. |
| - If the file is a ``.npz`` file, then a dictionary-like object is |
| returned, containing ``{filename: array}`` key-value pairs, one for |
| each file in the archive. |
| - If the file is a ``.npz`` file, the returned value supports the |
| context manager protocol in a similar fashion to the open function:: |
| |
| with load('foo.npz') as data: |
| a = data['a'] |
| |
| The underlying file descriptor is closed when exiting the 'with' |
| block. |
| |
| Examples |
| -------- |
| Store data to disk, and load it again: |
| |
| >>> np.save('/tmp/123', np.array([[1, 2, 3], [4, 5, 6]])) |
| >>> np.load('/tmp/123.npy') |
| array([[1, 2, 3], |
| [4, 5, 6]]) |
| |
| Store compressed data to disk, and load it again: |
| |
| >>> a=np.array([[1, 2, 3], [4, 5, 6]]) |
| >>> b=np.array([1, 2]) |
| >>> np.savez('/tmp/123.npz', a=a, b=b) |
| >>> data = np.load('/tmp/123.npz') |
| >>> data['a'] |
| array([[1, 2, 3], |
| [4, 5, 6]]) |
| >>> data['b'] |
| array([1, 2]) |
| >>> data.close() |
| |
| Mem-map the stored array, and then access the second row |
| directly from disk: |
| |
| >>> X = np.load('/tmp/123.npy', mmap_mode='r') |
| >>> X[1, :] |
| memmap([4, 5, 6]) |
| |
| """ |
| if encoding not in ('ASCII', 'latin1', 'bytes'): |
| # The 'encoding' value for pickle also affects what encoding |
| # the serialized binary data of NumPy arrays is loaded |
| # in. Pickle does not pass on the encoding information to |
| # NumPy. The unpickling code in numpy.core.multiarray is |
| # written to assume that unicode data appearing where binary |
| # should be is in 'latin1'. 'bytes' is also safe, as is 'ASCII'. |
| # |
| # Other encoding values can corrupt binary data, and we |
        # purposefully disallow them. For the same reason, the errors=
        # argument is not exposed: values other than 'strict' can
        # similarly corrupt numerical data silently.
| raise ValueError("encoding must be 'ASCII', 'latin1', or 'bytes'") |
| |
| pickle_kwargs = dict(encoding=encoding, fix_imports=fix_imports) |
| |
| with contextlib.ExitStack() as stack: |
| if hasattr(file, 'read'): |
| fid = file |
| own_fid = False |
| else: |
| fid = stack.enter_context(open(os_fspath(file), "rb")) |
| own_fid = True |
| |
        # Code to distinguish NumPy binary files from pickles.
| _ZIP_PREFIX = b'PK\x03\x04' |
| _ZIP_SUFFIX = b'PK\x05\x06' # empty zip files start with this |
| N = len(format.MAGIC_PREFIX) |
| magic = fid.read(N) |
| if not magic: |
| raise EOFError("No data left in file") |
| # If the file size is less than N, we need to make sure not |
| # to seek past the beginning of the file |
| fid.seek(-min(N, len(magic)), 1) # back-up |
| if magic.startswith(_ZIP_PREFIX) or magic.startswith(_ZIP_SUFFIX): |
| # zip-file (assume .npz) |
| # Potentially transfer file ownership to NpzFile |
| stack.pop_all() |
| ret = NpzFile(fid, own_fid=own_fid, allow_pickle=allow_pickle, |
| pickle_kwargs=pickle_kwargs, |
| max_header_size=max_header_size) |
| return ret |
| elif magic == format.MAGIC_PREFIX: |
| # .npy file |
| if mmap_mode: |
| if allow_pickle: |
| max_header_size = 2**64 |
| return format.open_memmap(file, mode=mmap_mode, |
| max_header_size=max_header_size) |
| else: |
| return format.read_array(fid, allow_pickle=allow_pickle, |
| pickle_kwargs=pickle_kwargs, |
| max_header_size=max_header_size) |
| else: |
| # Try a pickle |
| if not allow_pickle: |
| raise ValueError("Cannot load file containing pickled data " |
| "when allow_pickle=False") |
| try: |
| return pickle.load(fid, **pickle_kwargs) |
| except Exception as e: |
| raise pickle.UnpicklingError( |
| f"Failed to interpret file {file!r} as a pickle") from e |
| |
| |
| def _save_dispatcher(file, arr, allow_pickle=None, fix_imports=None): |
| return (arr,) |
| |
| |
| @array_function_dispatch(_save_dispatcher) |
| def save(file, arr, allow_pickle=True, fix_imports=True): |
| """ |
| Save an array to a binary file in NumPy ``.npy`` format. |
| |
| Parameters |
| ---------- |
| file : file, str, or pathlib.Path |
| File or filename to which the data is saved. If file is a file-object, |
| then the filename is unchanged. If file is a string or Path, a ``.npy`` |
| extension will be appended to the filename if it does not already |
| have one. |
| arr : array_like |
| Array data to be saved. |
| allow_pickle : bool, optional |
| Allow saving object arrays using Python pickles. Reasons for disallowing |
| pickles include security (loading pickled data can execute arbitrary |
| code) and portability (pickled objects may not be loadable on different |
| Python installations, for example if the stored objects require libraries |
| that are not available, and not all pickled data is compatible between |
| Python 2 and Python 3). |
| Default: True |
| fix_imports : bool, optional |
| Only useful in forcing objects in object arrays on Python 3 to be |
| pickled in a Python 2 compatible way. If `fix_imports` is True, pickle |
| will try to map the new Python 3 names to the old module names used in |
| Python 2, so that the pickle data stream is readable with Python 2. |
| |
| See Also |
| -------- |
| savez : Save several arrays into a ``.npz`` archive |
| savetxt, load |
| |
| Notes |
| ----- |
| For a description of the ``.npy`` format, see :py:mod:`numpy.lib.format`. |
| |
| Any data saved to the file is appended to the end of the file. |
| |
| Examples |
| -------- |
| >>> from tempfile import TemporaryFile |
| >>> outfile = TemporaryFile() |
| |
| >>> x = np.arange(10) |
| >>> np.save(outfile, x) |
| |
| >>> _ = outfile.seek(0) # Only needed here to simulate closing & reopening file |
| >>> np.load(outfile) |
| array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) |
| |
| |
| >>> with open('test.npy', 'wb') as f: |
| ... np.save(f, np.array([1, 2])) |
| ... np.save(f, np.array([1, 3])) |
| >>> with open('test.npy', 'rb') as f: |
| ... a = np.load(f) |
| ... b = np.load(f) |
| >>> print(a, b) |
| # [1 2] [1 3] |
| """ |
| if hasattr(file, 'write'): |
| file_ctx = contextlib.nullcontext(file) |
| else: |
| file = os_fspath(file) |
| if not file.endswith('.npy'): |
| file = file + '.npy' |
| file_ctx = open(file, "wb") |
| |
| with file_ctx as fid: |
| arr = np.asanyarray(arr) |
| format.write_array(fid, arr, allow_pickle=allow_pickle, |
| pickle_kwargs=dict(fix_imports=fix_imports)) |
| |
| |
| def _savez_dispatcher(file, *args, **kwds): |
| yield from args |
| yield from kwds.values() |
| |
| |
| @array_function_dispatch(_savez_dispatcher) |
| def savez(file, *args, **kwds): |
| """Save several arrays into a single file in uncompressed ``.npz`` format. |
| |
| Provide arrays as keyword arguments to store them under the |
| corresponding name in the output file: ``savez(fn, x=x, y=y)``. |
| |
| If arrays are specified as positional arguments, i.e., ``savez(fn, |
| x, y)``, their names will be `arr_0`, `arr_1`, etc. |
| |
| Parameters |
| ---------- |
| file : str or file |
| Either the filename (string) or an open file (file-like object) |
| where the data will be saved. If file is a string or a Path, the |
| ``.npz`` extension will be appended to the filename if it is not |
| already there. |
| args : Arguments, optional |
| Arrays to save to the file. Please use keyword arguments (see |
| `kwds` below) to assign names to arrays. Arrays specified as |
| args will be named "arr_0", "arr_1", and so on. |
| kwds : Keyword arguments, optional |
| Arrays to save to the file. Each array will be saved to the |
| output file with its corresponding keyword name. |
| |
| Returns |
| ------- |
| None |
| |
| See Also |
| -------- |
| save : Save a single array to a binary file in NumPy format. |
| savetxt : Save an array to a file as plain text. |
| savez_compressed : Save several arrays into a compressed ``.npz`` archive |
| |
| Notes |
| ----- |
| The ``.npz`` file format is a zipped archive of files named after the |
| variables they contain. The archive is not compressed and each file |
| in the archive contains one variable in ``.npy`` format. For a |
| description of the ``.npy`` format, see :py:mod:`numpy.lib.format`. |
| |
| When opening the saved ``.npz`` file with `load` a `NpzFile` object is |
| returned. This is a dictionary-like object which can be queried for |
| its list of arrays (with the ``.files`` attribute), and for the arrays |
| themselves. |
| |
| Keys passed in `kwds` are used as filenames inside the ZIP archive. |
| Therefore, keys should be valid filenames; e.g., avoid keys that begin with |
| ``/`` or contain ``.``. |
| |
| When naming variables with keyword arguments, it is not possible to name a |
| variable ``file``, as this would cause the ``file`` argument to be defined |
| twice in the call to ``savez``. |
| |
| Examples |
| -------- |
| >>> from tempfile import TemporaryFile |
| >>> outfile = TemporaryFile() |
| >>> x = np.arange(10) |
| >>> y = np.sin(x) |
| |
| Using `savez` with \\*args, the arrays are saved with default names. |
| |
| >>> np.savez(outfile, x, y) |
| >>> _ = outfile.seek(0) # Only needed here to simulate closing & reopening file |
| >>> npzfile = np.load(outfile) |
| >>> npzfile.files |
| ['arr_0', 'arr_1'] |
| >>> npzfile['arr_0'] |
| array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) |
| |
| Using `savez` with \\**kwds, the arrays are saved with the keyword names. |
| |
| >>> outfile = TemporaryFile() |
| >>> np.savez(outfile, x=x, y=y) |
| >>> _ = outfile.seek(0) |
| >>> npzfile = np.load(outfile) |
| >>> sorted(npzfile.files) |
| ['x', 'y'] |
| >>> npzfile['x'] |
| array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) |
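
    The archive itself is an ordinary zip file whose members are ``.npy``
    files, so it can also be inspected with the standard library (a small
    sketch, reusing ``outfile`` from above):

    >>> import zipfile
    >>> _ = outfile.seek(0)
    >>> sorted(zipfile.ZipFile(outfile).namelist())
    ['x.npy', 'y.npy']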
| |
| """ |
| _savez(file, args, kwds, False) |
| |
| |
| def _savez_compressed_dispatcher(file, *args, **kwds): |
| yield from args |
| yield from kwds.values() |
| |
| |
| @array_function_dispatch(_savez_compressed_dispatcher) |
| def savez_compressed(file, *args, **kwds): |
| """ |
| Save several arrays into a single file in compressed ``.npz`` format. |
| |
| Provide arrays as keyword arguments to store them under the |
| corresponding name in the output file: ``savez(fn, x=x, y=y)``. |
| |
| If arrays are specified as positional arguments, i.e., ``savez(fn, |
| x, y)``, their names will be `arr_0`, `arr_1`, etc. |
| |
| Parameters |
| ---------- |
| file : str or file |
| Either the filename (string) or an open file (file-like object) |
| where the data will be saved. If file is a string or a Path, the |
| ``.npz`` extension will be appended to the filename if it is not |
| already there. |
| args : Arguments, optional |
| Arrays to save to the file. Please use keyword arguments (see |
| `kwds` below) to assign names to arrays. Arrays specified as |
| args will be named "arr_0", "arr_1", and so on. |
| kwds : Keyword arguments, optional |
| Arrays to save to the file. Each array will be saved to the |
| output file with its corresponding keyword name. |
| |
| Returns |
| ------- |
| None |
| |
| See Also |
| -------- |
| numpy.save : Save a single array to a binary file in NumPy format. |
| numpy.savetxt : Save an array to a file as plain text. |
| numpy.savez : Save several arrays into an uncompressed ``.npz`` file format |
| numpy.load : Load the files created by savez_compressed. |
| |
| Notes |
| ----- |
| The ``.npz`` file format is a zipped archive of files named after the |
| variables they contain. The archive is compressed with |
| ``zipfile.ZIP_DEFLATED`` and each file in the archive contains one variable |
| in ``.npy`` format. For a description of the ``.npy`` format, see |
    :py:mod:`numpy.lib.format`.

| When opening the saved ``.npz`` file with `load` a `NpzFile` object is |
| returned. This is a dictionary-like object which can be queried for |
| its list of arrays (with the ``.files`` attribute), and for the arrays |
| themselves. |
| |
| Examples |
| -------- |
| >>> test_array = np.random.rand(3, 2) |
| >>> test_vector = np.random.rand(4) |
| >>> np.savez_compressed('/tmp/123', a=test_array, b=test_vector) |
| >>> loaded = np.load('/tmp/123.npz') |
| >>> print(np.array_equal(test_array, loaded['a'])) |
| True |
| >>> print(np.array_equal(test_vector, loaded['b'])) |
| True |
| |
| """ |
| _savez(file, args, kwds, True) |
| |
| |
| def _savez(file, args, kwds, compress, allow_pickle=True, pickle_kwargs=None): |
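    """Backend for savez and savez_compressed.

    A rough sketch of how the public wrappers call this helper (``x`` and
    ``y`` stand for arbitrary arrays)::

        _savez('/tmp/out.npz', args=(x,), kwds={'y': y}, compress=False)
    """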
| # Import is postponed to here since zipfile depends on gzip, an optional |
| # component of the so-called standard library. |
| import zipfile |
| |
| if not hasattr(file, 'write'): |
| file = os_fspath(file) |
| if not file.endswith('.npz'): |
| file = file + '.npz' |
| |
| namedict = kwds |
| for i, val in enumerate(args): |
| key = 'arr_%d' % i |
| if key in namedict.keys(): |
| raise ValueError( |
| "Cannot use un-named variables and keyword %s" % key) |
| namedict[key] = val |
| |
| if compress: |
| compression = zipfile.ZIP_DEFLATED |
| else: |
| compression = zipfile.ZIP_STORED |
| |
| zipf = zipfile_factory(file, mode="w", compression=compression) |
| |
| for key, val in namedict.items(): |
| fname = key + '.npy' |
| val = np.asanyarray(val) |
| # always force zip64, gh-10776 |
| with zipf.open(fname, 'w', force_zip64=True) as fid: |
| format.write_array(fid, val, |
| allow_pickle=allow_pickle, |
| pickle_kwargs=pickle_kwargs) |
| |
| zipf.close() |
| |
| |
| def _ensure_ndmin_ndarray_check_param(ndmin): |
| """Just checks if the param ndmin is supported on |
| _ensure_ndmin_ndarray. It is intended to be used as |
| verification before running anything expensive. |
| e.g. loadtxt, genfromtxt |
| """ |
| # Check correctness of the values of `ndmin` |
| if ndmin not in [0, 1, 2]: |
| raise ValueError(f"Illegal value of ndmin keyword: {ndmin}") |
| |

def _ensure_ndmin_ndarray(a, *, ndmin: int):
| """This is a helper function of loadtxt and genfromtxt to ensure |
| proper minimum dimension as requested |
| |
    ndmin : int. Supported values 0, 1, 2
| ^^ whenever this changes, keep in sync with |
| _ensure_ndmin_ndarray_check_param |
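
    A rough sketch of the intended behaviour:

    >>> _ensure_ndmin_ndarray(np.array(1.0), ndmin=2).shape
    (1, 1)
    >>> _ensure_ndmin_ndarray(np.array([[1.0, 2.0]]), ndmin=1).shape
    (2,)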
| """ |
| # Verify that the array has at least dimensions `ndmin`. |
| # Tweak the size and shape of the arrays - remove extraneous dimensions |
| if a.ndim > ndmin: |
| a = np.squeeze(a) |
| # and ensure we have the minimum number of dimensions asked for |
| # - has to be in this order for the odd case ndmin=1, a.squeeze().ndim=0 |
| if a.ndim < ndmin: |
| if ndmin == 1: |
| a = np.atleast_1d(a) |
| elif ndmin == 2: |
| a = np.atleast_2d(a).T |
| |
| return a |
| |
| |
| # amount of lines loadtxt reads in one chunk, can be overridden for testing |
| _loadtxt_chunksize = 50000 |
| |
| |
| def _check_nonneg_int(value, name="argument"): |
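    """Validate that `value` is a nonnegative integer; a small sketch:

    >>> _check_nonneg_int(3, "max_rows")
    >>> _check_nonneg_int(-1, "max_rows")
    Traceback (most recent call last):
        ...
    ValueError: max_rows must be nonnegative
    """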
| try: |
| operator.index(value) |
| except TypeError: |
| raise TypeError(f"{name} must be an integer") from None |
| if value < 0: |
| raise ValueError(f"{name} must be nonnegative") |
| |
| |
| def _preprocess_comments(iterable, comments, encoding): |
| """ |
    Generator that consumes an iterable of lines and strips out the
    multiple (or multi-character) comments from each line.
    This is a pre-processing step to achieve feature parity with loadtxt
    (we assume that this is a niche feature).
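
    A small sketch of the intended behaviour:

    >>> gen = _preprocess_comments(
    ...     iter(["a 1 # x", "b 2 // y"]), comments=("#", "//"), encoding=None)
    >>> list(gen)
    ['a 1 ', 'b 2 ']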
| """ |
| for line in iterable: |
| if isinstance(line, bytes): |
| # Need to handle conversion here, or the splitting would fail |
| line = line.decode(encoding) |
| |
| for c in comments: |
| line = line.split(c, 1)[0] |
| |
| yield line |
| |
| |
| |
| def _read(fname, *, delimiter=',', comment='#', quote='"', |
| imaginary_unit='j', usecols=None, skiplines=0, |
| max_rows=None, converters=None, ndmin=None, unpack=False, |
| dtype=np.float64, encoding="bytes"): |
| r""" |
| Read a NumPy array from a text file. |
| |
| Parameters |
| ---------- |
| fname : str or file object |
| The filename or the file to be read. |
| delimiter : str, optional |
        The character used to separate fields in a line of the file.
        Default is a comma, ','. If None, any sequence of whitespace is
        considered a delimiter.
| comment : str or sequence of str or None, optional |
| Character that begins a comment. All text from the comment |
| character to the end of the line is ignored. |
| Multiple comments or multiple-character comment strings are supported, |
| but may be slower and `quote` must be empty if used. |
| Use None to disable all use of comments. |
| quote : str or None, optional |
| Character that is used to quote string fields. Default is '"' |
| (a double quote). Use None to disable quote support. |
| imaginary_unit : str, optional |
        Character that represents the imaginary unit ``sqrt(-1)``.
| Default is 'j'. |
| usecols : array_like, optional |
| A one-dimensional array of integer column numbers. These are the |
| columns from the file to be included in the array. If this value |
| is not given, all the columns are used. |
| skiplines : int, optional |
| Number of lines to skip before interpreting the data in the file. |
| max_rows : int, optional |
| Maximum number of rows of data to read. Default is to read the |
| entire file. |
| converters : dict or callable, optional |
| A function to parse all columns strings into the desired value, or |
| a dictionary mapping column number to a parser function. |
| E.g. if column 0 is a date string: ``converters = {0: datestr2num}``. |
| Converters can also be used to provide a default value for missing |
| data, e.g. ``converters = lambda s: float(s.strip() or 0)`` will |
| convert empty fields to 0. |
| Default: None |
| ndmin : int, optional |
| Minimum dimension of the array returned. |
| Allowed values are 0, 1 or 2. Default is 0. |
| unpack : bool, optional |
| If True, the returned array is transposed, so that arguments may be |
| unpacked using ``x, y, z = read(...)``. When used with a structured |
| data-type, arrays are returned for each field. Default is False. |
| dtype : numpy data type |
        A NumPy dtype instance; it can be a structured dtype mapping to the
        columns of the file.
| encoding : str, optional |
        Encoding used to decode the input file. The special value 'bytes'
| (the default) enables backwards-compatible behavior for `converters`, |
| ensuring that inputs to the converter functions are encoded |
| bytes objects. The special value 'bytes' has no additional effect if |
| ``converters=None``. If encoding is ``'bytes'`` or ``None``, the |
| default system encoding is used. |
| |
| Returns |
| ------- |
| ndarray |
| NumPy array. |
| |
| Examples |
| -------- |
| First we create a file for the example. |
| |
| >>> s1 = '1.0,2.0,3.0\n4.0,5.0,6.0\n' |
| >>> with open('example1.csv', 'w') as f: |
| ... f.write(s1) |
    >>> a1 = _read('example1.csv')
| >>> a1 |
| array([[1., 2., 3.], |
| [4., 5., 6.]]) |
| |
| The second example has columns with different data types, so a |
| one-dimensional array with a structured data type is returned. |
| The tab character is used as the field delimiter. |
| |
| >>> s2 = '1.0\t10\talpha\n2.3\t25\tbeta\n4.5\t16\tgamma\n' |
| >>> with open('example2.tsv', 'w') as f: |
| ... f.write(s2) |
    >>> a2 = _read('example2.tsv', delimiter='\t')
| >>> a2 |
| array([(1. , 10, b'alpha'), (2.3, 25, b'beta'), (4.5, 16, b'gamma')], |
| dtype=[('f0', '<f8'), ('f1', 'u1'), ('f2', 'S5')]) |
| """ |
| # Handle special 'bytes' keyword for encoding |
| byte_converters = False |
| if encoding == 'bytes': |
| encoding = None |
| byte_converters = True |
| |
| if dtype is None: |
| raise TypeError("a dtype must be provided.") |
| dtype = np.dtype(dtype) |
| |
| read_dtype_via_object_chunks = None |
| if dtype.kind in 'SUM' and ( |
| dtype == "S0" or dtype == "U0" or dtype == "M8" or dtype == 'm8'): |
| # This is a legacy "flexible" dtype. We do not truly support |
| # parametric dtypes currently (no dtype discovery step in the core), |
| # but have to support these for backward compatibility. |
| read_dtype_via_object_chunks = dtype |
| dtype = np.dtype(object) |
| |
| if usecols is not None: |
| # Allow usecols to be a single int or a sequence of ints, the C-code |
| # handles the rest |
| try: |
| usecols = list(usecols) |
| except TypeError: |
| usecols = [usecols] |
| |
| _ensure_ndmin_ndarray_check_param(ndmin) |
| |
| if comment is None: |
| comments = None |
| else: |
| # assume comments are a sequence of strings |
| if "" in comment: |
| raise ValueError( |
| "comments cannot be an empty string. Use comments=None to " |
| "disable comments." |
| ) |
| comments = tuple(comment) |
| comment = None |
| if len(comments) == 0: |
| comments = None # No comments at all |
| elif len(comments) == 1: |
| # If there is only one comment, and that comment has one character, |
| # the normal parsing can deal with it just fine. |
| if isinstance(comments[0], str) and len(comments[0]) == 1: |
| comment = comments[0] |
| comments = None |
| else: |
| # Input validation if there are multiple comment characters |
| if delimiter in comments: |
| raise TypeError( |
| f"Comment characters '{comments}' cannot include the " |
| f"delimiter '{delimiter}'" |
| ) |
| |
| # comment is now either a 1 or 0 character string or a tuple: |
| if comments is not None: |
        # Note: An earlier version supported two-character comments (and could
        #       have been extended to multiple characters); we assume this is
        #       rare enough not to optimize for.
| if quote is not None: |
| raise ValueError( |
| "when multiple comments or a multi-character comment is " |
| "given, quotes are not supported. In this case quotechar " |
| "must be set to None.") |
| |
| if len(imaginary_unit) != 1: |
| raise ValueError('len(imaginary_unit) must be 1.') |
| |
| _check_nonneg_int(skiplines) |
| if max_rows is not None: |
| _check_nonneg_int(max_rows) |
| else: |
| # Passing -1 to the C code means "read the entire file". |
| max_rows = -1 |
| |
| fh_closing_ctx = contextlib.nullcontext() |
| filelike = False |
| try: |
| if isinstance(fname, os.PathLike): |
| fname = os.fspath(fname) |
| if isinstance(fname, str): |
| fh = np.lib._datasource.open(fname, 'rt', encoding=encoding) |
| if encoding is None: |
| encoding = getattr(fh, 'encoding', 'latin1') |
| |
| fh_closing_ctx = contextlib.closing(fh) |
| data = fh |
| filelike = True |
| else: |
| if encoding is None: |
| encoding = getattr(fname, 'encoding', 'latin1') |
| data = iter(fname) |
| except TypeError as e: |
| raise ValueError( |
| f"fname must be a string, filehandle, list of strings,\n" |
| f"or generator. Got {type(fname)} instead.") from e |
| |
| with fh_closing_ctx: |
| if comments is not None: |
| if filelike: |
| data = iter(data) |
| filelike = False |
| data = _preprocess_comments(data, comments, encoding) |
| |
| if read_dtype_via_object_chunks is None: |
| arr = _load_from_filelike( |
| data, delimiter=delimiter, comment=comment, quote=quote, |
| imaginary_unit=imaginary_unit, |
| usecols=usecols, skiplines=skiplines, max_rows=max_rows, |
| converters=converters, dtype=dtype, |
| encoding=encoding, filelike=filelike, |
| byte_converters=byte_converters) |
| |
| else: |
| # This branch reads the file into chunks of object arrays and then |
| # casts them to the desired actual dtype. This ensures correct |
| # string-length and datetime-unit discovery (like `arr.astype()`). |
| # Due to chunking, certain error reports are less clear, currently. |
| if filelike: |
| data = iter(data) # cannot chunk when reading from file |
| |
| c_byte_converters = False |
| if read_dtype_via_object_chunks == "S": |
| c_byte_converters = True # Use latin1 rather than ascii |
| |
| chunks = [] |
| while max_rows != 0: |
| if max_rows < 0: |
| chunk_size = _loadtxt_chunksize |
| else: |
| chunk_size = min(_loadtxt_chunksize, max_rows) |
| |
| next_arr = _load_from_filelike( |
| data, delimiter=delimiter, comment=comment, quote=quote, |
| imaginary_unit=imaginary_unit, |
| usecols=usecols, skiplines=skiplines, max_rows=max_rows, |
| converters=converters, dtype=dtype, |
| encoding=encoding, filelike=filelike, |
| byte_converters=byte_converters, |
| c_byte_converters=c_byte_converters) |
| # Cast here already. We hope that this is better even for |
| # large files because the storage is more compact. It could |
| # be adapted (in principle the concatenate could cast). |
| chunks.append(next_arr.astype(read_dtype_via_object_chunks)) |
| |
                skiplines = 0  # Only have to skip for first chunk
| if max_rows >= 0: |
| max_rows -= chunk_size |
| if len(next_arr) < chunk_size: |
| # There was less data than requested, so we are done. |
| break |
| |
| # Need at least one chunk, but if empty, the last one may have |
| # the wrong shape. |
| if len(chunks) > 1 and len(chunks[-1]) == 0: |
| del chunks[-1] |
| if len(chunks) == 1: |
| arr = chunks[0] |
| else: |
| arr = np.concatenate(chunks, axis=0) |
| |
| # NOTE: ndmin works as advertised for structured dtypes, but normally |
| # these would return a 1D result plus the structured dimension, |
| # so ndmin=2 adds a third dimension even when no squeezing occurs. |
| # A `squeeze=False` could be a better solution (pandas uses squeeze). |
| arr = _ensure_ndmin_ndarray(arr, ndmin=ndmin) |
| |
        if arr.shape and arr.shape[0] == 0:
            warnings.warn(
                f'loadtxt: input contained no data: "{fname}"',
                category=UserWarning,
                stacklevel=3
            )
| |
| if unpack: |
| # Unpack structured dtypes if requested: |
| dt = arr.dtype |
| if dt.names is not None: |
| # For structured arrays, return an array for each field. |
| return [arr[field] for field in dt.names] |
| else: |
| return arr.T |
| else: |
| return arr |
| |
| |
| @set_array_function_like_doc |
| @set_module('numpy') |
| def loadtxt(fname, dtype=float, comments='#', delimiter=None, |
| converters=None, skiprows=0, usecols=None, unpack=False, |
| ndmin=0, encoding='bytes', max_rows=None, *, quotechar=None, |
| like=None): |
| r""" |
| Load data from a text file. |
| |
| Parameters |
| ---------- |
| fname : file, str, pathlib.Path, list of str, generator |
| File, filename, list, or generator to read. If the filename |
| extension is ``.gz`` or ``.bz2``, the file is first decompressed. Note |
| that generators must return bytes or strings. The strings |
| in a list or produced by a generator are treated as lines. |
| dtype : data-type, optional |
| Data-type of the resulting array; default: float. If this is a |
| structured data-type, the resulting array will be 1-dimensional, and |
| each row will be interpreted as an element of the array. In this |
| case, the number of columns used must match the number of fields in |
| the data-type. |
| comments : str or sequence of str or None, optional |
| The characters or list of characters used to indicate the start of a |
| comment. None implies no comments. For backwards compatibility, byte |
| strings will be decoded as 'latin1'. The default is '#'. |
| delimiter : str, optional |
| The character used to separate the values. For backwards compatibility, |
| byte strings will be decoded as 'latin1'. The default is whitespace. |
| |
| .. versionchanged:: 1.23.0 |
| Only single character delimiters are supported. Newline characters |
| cannot be used as the delimiter. |
| |
| converters : dict or callable, optional |
| Converter functions to customize value parsing. If `converters` is |
| callable, the function is applied to all columns, else it must be a |
| dict that maps column number to a parser function. |
| See examples for further details. |
| Default: None. |
| |
| .. versionchanged:: 1.23.0 |
| The ability to pass a single callable to be applied to all columns |
| was added. |
| |
| skiprows : int, optional |
| Skip the first `skiprows` lines, including comments; default: 0. |
| usecols : int or sequence, optional |
| Which columns to read, with 0 being the first. For example, |
| ``usecols = (1,4,5)`` will extract the 2nd, 5th and 6th columns. |
| The default, None, results in all columns being read. |
| |
| .. versionchanged:: 1.11.0 |
| When a single column has to be read it is possible to use |
            an integer instead of a tuple. E.g. ``usecols = 3`` reads the
| fourth column the same way as ``usecols = (3,)`` would. |
| unpack : bool, optional |
| If True, the returned array is transposed, so that arguments may be |
| unpacked using ``x, y, z = loadtxt(...)``. When used with a |
| structured data-type, arrays are returned for each field. |
| Default is False. |
| ndmin : int, optional |
| The returned array will have at least `ndmin` dimensions. |
| Otherwise mono-dimensional axes will be squeezed. |
| Legal values: 0 (default), 1 or 2. |
| |
| .. versionadded:: 1.6.0 |
| encoding : str, optional |
        Encoding used to decode the input file. Does not apply to input
        streams. The special value 'bytes' enables backward compatibility
        workarounds that ensure you receive byte arrays as results if
        possible and pass 'latin1' encoded strings to converters. Override
        this value to receive
| unicode arrays and pass strings as input to converters. If set to None |
| the system default is used. The default value is 'bytes'. |
| |
| .. versionadded:: 1.14.0 |
| max_rows : int, optional |
| Read `max_rows` rows of content after `skiprows` lines. The default is |
| to read all the rows. Note that empty rows containing no data such as |
| empty lines and comment lines are not counted towards `max_rows`, |
| while such lines are counted in `skiprows`. |
| |
| .. versionadded:: 1.16.0 |
| |
| .. versionchanged:: 1.23.0 |
| Lines containing no data, including comment lines (e.g., lines |
| starting with '#' or as specified via `comments`) are not counted |
| towards `max_rows`. |
| quotechar : unicode character or None, optional |
| The character used to denote the start and end of a quoted item. |
| Occurrences of the delimiter or comment characters are ignored within |
| a quoted item. The default value is ``quotechar=None``, which means |
| quoting support is disabled. |
| |
| If two consecutive instances of `quotechar` are found within a quoted |
| field, the first is treated as an escape character. See examples. |
| |
| .. versionadded:: 1.23.0 |
| ${ARRAY_FUNCTION_LIKE} |
| |
| .. versionadded:: 1.20.0 |
| |
| Returns |
| ------- |
| out : ndarray |
| Data read from the text file. |
| |
| See Also |
| -------- |
| load, fromstring, fromregex |
| genfromtxt : Load data with missing values handled as specified. |
| scipy.io.loadmat : reads MATLAB data files |
| |
| Notes |
| ----- |
| This function aims to be a fast reader for simply formatted files. The |
| `genfromtxt` function provides more sophisticated handling of, e.g., |
| lines with missing values. |
| |
| Each row in the input text file must have the same number of values to be |
    able to read all values. If not all rows have the same number of values, a
| subset of up to n columns (where n is the least number of values present |
| in all rows) can be read by specifying the columns via `usecols`. |
| |
| .. versionadded:: 1.10.0 |
| |
| The strings produced by the Python float.hex method can be used as |
| input for floats. |
| |
| Examples |
| -------- |
| >>> from io import StringIO # StringIO behaves like a file object |
| >>> c = StringIO("0 1\n2 3") |
| >>> np.loadtxt(c) |
| array([[0., 1.], |
| [2., 3.]]) |
| |
| >>> d = StringIO("M 21 72\nF 35 58") |
| >>> np.loadtxt(d, dtype={'names': ('gender', 'age', 'weight'), |
| ... 'formats': ('S1', 'i4', 'f4')}) |
| array([(b'M', 21, 72.), (b'F', 35, 58.)], |
| dtype=[('gender', 'S1'), ('age', '<i4'), ('weight', '<f4')]) |
| |
| >>> c = StringIO("1,0,2\n3,0,4") |
| >>> x, y = np.loadtxt(c, delimiter=',', usecols=(0, 2), unpack=True) |
| >>> x |
| array([1., 3.]) |
| >>> y |
| array([2., 4.]) |
| |
| The `converters` argument is used to specify functions to preprocess the |
| text prior to parsing. `converters` can be a dictionary that maps |
| preprocessing functions to each column: |
| |
| >>> s = StringIO("1.618, 2.296\n3.141, 4.669\n") |
| >>> conv = { |
| ... 0: lambda x: np.floor(float(x)), # conversion fn for column 0 |
| ... 1: lambda x: np.ceil(float(x)), # conversion fn for column 1 |
| ... } |
| >>> np.loadtxt(s, delimiter=",", converters=conv) |
| array([[1., 3.], |
| [3., 5.]]) |
| |
| `converters` can be a callable instead of a dictionary, in which case it |
| is applied to all columns: |
| |
| >>> s = StringIO("0xDE 0xAD\n0xC0 0xDE") |
| >>> import functools |
| >>> conv = functools.partial(int, base=16) |
| >>> np.loadtxt(s, converters=conv) |
| array([[222., 173.], |
| [192., 222.]]) |
| |
| This example shows how `converters` can be used to convert a field |
| with a trailing minus sign into a negative number. |
| |
| >>> s = StringIO('10.01 31.25-\n19.22 64.31\n17.57- 63.94') |
| >>> def conv(fld): |
| ... return -float(fld[:-1]) if fld.endswith(b'-') else float(fld) |
| ... |
| >>> np.loadtxt(s, converters=conv) |
| array([[ 10.01, -31.25], |
| [ 19.22, 64.31], |
| [-17.57, 63.94]]) |
| |
| Using a callable as the converter can be particularly useful for handling |
| values with different formatting, e.g. floats with underscores: |
| |
| >>> s = StringIO("1 2.7 100_000") |
| >>> np.loadtxt(s, converters=float) |
| array([1.e+00, 2.7e+00, 1.e+05]) |
| |
| This idea can be extended to automatically handle values specified in |
| many different formats: |
| |
| >>> def conv(val): |
| ... try: |
| ... return float(val) |
| ... except ValueError: |
| ... return float.fromhex(val) |
| >>> s = StringIO("1, 2.5, 3_000, 0b4, 0x1.4000000000000p+2") |
| >>> np.loadtxt(s, delimiter=",", converters=conv, encoding=None) |
| array([1.0e+00, 2.5e+00, 3.0e+03, 1.8e+02, 5.0e+00]) |
| |
| Note that with the default ``encoding="bytes"``, the inputs to the |
| converter function are latin-1 encoded byte strings. To deactivate the |
| implicit encoding prior to conversion, use ``encoding=None`` |
| |
| >>> s = StringIO('10.01 31.25-\n19.22 64.31\n17.57- 63.94') |
| >>> conv = lambda x: -float(x[:-1]) if x.endswith('-') else float(x) |
| >>> np.loadtxt(s, converters=conv, encoding=None) |
| array([[ 10.01, -31.25], |
| [ 19.22, 64.31], |
| [-17.57, 63.94]]) |
| |
| Support for quoted fields is enabled with the `quotechar` parameter. |
| Comment and delimiter characters are ignored when they appear within a |
| quoted item delineated by `quotechar`: |
| |
| >>> s = StringIO('"alpha, #42", 10.0\n"beta, #64", 2.0\n') |
| >>> dtype = np.dtype([("label", "U12"), ("value", float)]) |
| >>> np.loadtxt(s, dtype=dtype, delimiter=",", quotechar='"') |
| array([('alpha, #42', 10.), ('beta, #64', 2.)], |
| dtype=[('label', '<U12'), ('value', '<f8')]) |
| |
| Quoted fields can be separated by multiple whitespace characters: |
| |
| >>> s = StringIO('"alpha, #42" 10.0\n"beta, #64" 2.0\n') |
| >>> dtype = np.dtype([("label", "U12"), ("value", float)]) |
| >>> np.loadtxt(s, dtype=dtype, delimiter=None, quotechar='"') |
| array([('alpha, #42', 10.), ('beta, #64', 2.)], |
| dtype=[('label', '<U12'), ('value', '<f8')]) |
| |
| Two consecutive quote characters within a quoted field are treated as a |
| single escaped character: |
| |
| >>> s = StringIO('"Hello, my name is ""Monty""!"') |
| >>> np.loadtxt(s, dtype="U", delimiter=",", quotechar='"') |
| array('Hello, my name is "Monty"!', dtype='<U26') |
| |
    Read a subset of columns when not all rows contain an equal number of
    values:
| |
| >>> d = StringIO("1 2\n2 4\n3 9 12\n4 16 20") |
| >>> np.loadtxt(d, usecols=(0, 1)) |
| array([[ 1., 2.], |
| [ 2., 4.], |
| [ 3., 9.], |
| [ 4., 16.]]) |
| |
| """ |
| |
| if like is not None: |
| return _loadtxt_with_like( |
| like, fname, dtype=dtype, comments=comments, delimiter=delimiter, |
| converters=converters, skiprows=skiprows, usecols=usecols, |
| unpack=unpack, ndmin=ndmin, encoding=encoding, |
| max_rows=max_rows |
| ) |
| |
| if dtype is None: |
| dtype = np.float64 |
| |
| comment = comments |
| # Control character type conversions for Py3 convenience |
| if comment is not None: |
| if isinstance(comment, (str, bytes)): |
| comment = [comment] |
| comment = [ |
| x.decode('latin1') if isinstance(x, bytes) else x for x in comment] |
| if isinstance(delimiter, bytes): |
| delimiter = delimiter.decode('latin1') |
| |
| arr = _read(fname, dtype=dtype, comment=comment, delimiter=delimiter, |
| converters=converters, skiplines=skiprows, usecols=usecols, |
| unpack=unpack, ndmin=ndmin, encoding=encoding, |
| max_rows=max_rows, quote=quotechar) |
| |
| return arr |
| |
| |
| _loadtxt_with_like = array_function_dispatch()(loadtxt) |
| |
| |
| def _savetxt_dispatcher(fname, X, fmt=None, delimiter=None, newline=None, |
| header=None, footer=None, comments=None, |
| encoding=None): |
| return (X,) |
| |
| |
| @array_function_dispatch(_savetxt_dispatcher) |
| def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='', |
| footer='', comments='# ', encoding=None): |
| """ |
| Save an array to a text file. |
| |
| Parameters |
| ---------- |
| fname : filename or file handle |
| If the filename ends in ``.gz``, the file is automatically saved in |
| compressed gzip format. `loadtxt` understands gzipped files |
| transparently. |
| X : 1D or 2D array_like |
| Data to be saved to a text file. |
| fmt : str or sequence of strs, optional |
| A single format (%10.5f), a sequence of formats, or a |
| multi-format string, e.g. 'Iteration %d -- %10.5f', in which |
| case `delimiter` is ignored. For complex `X`, the legal options |
| for `fmt` are: |
| |
| * a single specifier, `fmt='%.4e'`, resulting in numbers formatted |
| like `' (%s+%sj)' % (fmt, fmt)` |
| * a full string specifying every real and imaginary part, e.g. |
| `' %.4e %+.4ej %.4e %+.4ej %.4e %+.4ej'` for 3 columns |
| * a list of specifiers, one per column - in this case, the real |
| and imaginary part must have separate specifiers, |
| e.g. `['%.3e + %.3ej', '(%.15e%+.15ej)']` for 2 columns |
| delimiter : str, optional |
| String or character separating columns. |
| newline : str, optional |
| String or character separating lines. |
| |
| .. versionadded:: 1.5.0 |
| header : str, optional |
| String that will be written at the beginning of the file. |
| |
| .. versionadded:: 1.7.0 |
| footer : str, optional |
| String that will be written at the end of the file. |
| |
| .. versionadded:: 1.7.0 |
| comments : str, optional |
| String that will be prepended to the ``header`` and ``footer`` strings, |
| to mark them as comments. Default: '# ', as expected by e.g. |
| ``numpy.loadtxt``. |
| |
| .. versionadded:: 1.7.0 |
| encoding : {None, str}, optional |
        Encoding used to encode the output file. Does not apply to output
| streams. If the encoding is something other than 'bytes' or 'latin1' |
| you will not be able to load the file in NumPy versions < 1.14. Default |
| is 'latin1'. |
| |
| .. versionadded:: 1.14.0 |
| |
| |
| See Also |
| -------- |
| save : Save an array to a binary file in NumPy ``.npy`` format |
| savez : Save several arrays into an uncompressed ``.npz`` archive |
| savez_compressed : Save several arrays into a compressed ``.npz`` archive |
| |
| Notes |
| ----- |
| Further explanation of the `fmt` parameter |
| (``%[flag]width[.precision]specifier``): |
| |
| flags: |
| ``-`` : left justify |
| |
| ``+`` : Forces to precede result with + or -. |
| |
| ``0`` : Left pad the number with zeros instead of space (see width). |
| |
| width: |
| Minimum number of characters to be printed. The value is not truncated |
| if it has more characters. |
| |
| precision: |
| - For integer specifiers (eg. ``d,i,o,x``), the minimum number of |
| digits. |
| - For ``e, E`` and ``f`` specifiers, the number of digits to print |
| after the decimal point. |
| - For ``g`` and ``G``, the maximum number of significant digits. |
| - For ``s``, the maximum number of characters. |
| |
| specifiers: |
| ``c`` : character |
| |
| ``d`` or ``i`` : signed decimal integer |
| |
| ``e`` or ``E`` : scientific notation with ``e`` or ``E``. |
| |
| ``f`` : decimal floating point |
| |
| ``g,G`` : use the shorter of ``e,E`` or ``f`` |
| |
| ``o`` : signed octal |
| |
| ``s`` : string of characters |
| |
| ``u`` : unsigned decimal integer |
| |
| ``x,X`` : unsigned hexadecimal integer |
| |
| This explanation of ``fmt`` is not complete, for an exhaustive |
| specification see [1]_. |
| |
| References |
| ---------- |
| .. [1] `Format Specification Mini-Language |
| <https://docs.python.org/library/string.html#format-specification-mini-language>`_, |
| Python Documentation. |
| |
| Examples |
| -------- |
| >>> x = y = z = np.arange(0.0,5.0,1.0) |
| >>> np.savetxt('test.out', x, delimiter=',') # X is an array |
| >>> np.savetxt('test.out', (x,y,z)) # x,y,z equal sized 1D arrays |
| >>> np.savetxt('test.out', x, fmt='%1.4e') # use exponential notation |
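
    For a complex array, a single format specifier is applied to both the
    real and imaginary part of each value; a sketch (rows come out like
    ``(1.00e+00+2.00e+00j)``):

    >>> z = np.array([1.0 + 2.0j, 3.0 - 4.0j])
    >>> np.savetxt('test.out', z, fmt='%.2e')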
| |
| """ |
| |
| # Py3 conversions first |
| if isinstance(fmt, bytes): |
| fmt = asstr(fmt) |
| delimiter = asstr(delimiter) |
| |
| class WriteWrap: |
| """Convert to bytes on bytestream inputs. |
| |
| """ |
| def __init__(self, fh, encoding): |
| self.fh = fh |
| self.encoding = encoding |
| self.do_write = self.first_write |
| |
| def close(self): |
| self.fh.close() |
| |
| def write(self, v): |
| self.do_write(v) |
| |
| def write_bytes(self, v): |
| if isinstance(v, bytes): |
| self.fh.write(v) |
| else: |
| self.fh.write(v.encode(self.encoding)) |
| |
| def write_normal(self, v): |
| self.fh.write(asunicode(v)) |
| |
| def first_write(self, v): |
| try: |
| self.write_normal(v) |
| self.write = self.write_normal |
| except TypeError: |
| # input is probably a bytestream |
| self.write_bytes(v) |
| self.write = self.write_bytes |
| |
| own_fh = False |
| if isinstance(fname, os_PathLike): |
| fname = os_fspath(fname) |
| if _is_string_like(fname): |
| # datasource doesn't support creating a new file ... |
| open(fname, 'wt').close() |
| fh = np.lib._datasource.open(fname, 'wt', encoding=encoding) |
| own_fh = True |
| elif hasattr(fname, 'write'): |
| # wrap to handle byte output streams |
| fh = WriteWrap(fname, encoding or 'latin1') |
| else: |
| raise ValueError('fname must be a string or file handle') |
| |
| try: |
| X = np.asarray(X) |
| |
| # Handle 1-dimensional arrays |
| if X.ndim == 0 or X.ndim > 2: |
| raise ValueError( |
| "Expected 1D or 2D array, got %dD array instead" % X.ndim) |
| elif X.ndim == 1: |
| # Common case -- 1d array of numbers |
| if X.dtype.names is None: |
| X = np.atleast_2d(X).T |
| ncol = 1 |
| |
| # Complex dtype -- each field indicates a separate column |
| else: |
| ncol = len(X.dtype.names) |
| else: |
| ncol = X.shape[1] |
| |
| iscomplex_X = np.iscomplexobj(X) |
| # `fmt` can be a string with multiple insertion points or a |
        # list of formats.  E.g. '%10.5f\t%10d' or ('%10.5f', '%10d')
| if type(fmt) in (list, tuple): |
| if len(fmt) != ncol: |
| raise AttributeError('fmt has wrong shape. %s' % str(fmt)) |
| format = asstr(delimiter).join(map(asstr, fmt)) |
| elif isinstance(fmt, str): |
| n_fmt_chars = fmt.count('%') |
| error = ValueError('fmt has wrong number of %% formats: %s' % fmt) |
| if n_fmt_chars == 1: |
| if iscomplex_X: |
| fmt = [' (%s+%sj)' % (fmt, fmt), ] * ncol |
| else: |
| fmt = [fmt, ] * ncol |
| format = delimiter.join(fmt) |
| elif iscomplex_X and n_fmt_chars != (2 * ncol): |
| raise error |
| elif ((not iscomplex_X) and n_fmt_chars != ncol): |
| raise error |
| else: |
| format = fmt |
| else: |
| raise ValueError('invalid fmt: %r' % (fmt,)) |
| |
| if len(header) > 0: |
| header = header.replace('\n', '\n' + comments) |
| fh.write(comments + header + newline) |
| if iscomplex_X: |
| for row in X: |
| row2 = [] |
| for number in row: |
| row2.append(number.real) |
| row2.append(number.imag) |
| s = format % tuple(row2) + newline |
| fh.write(s.replace('+-', '-')) |
| else: |
| for row in X: |
| try: |
| v = format % tuple(row) + newline |
| except TypeError as e: |
| raise TypeError("Mismatch between array dtype ('%s') and " |
| "format specifier ('%s')" |
| % (str(X.dtype), format)) from e |
| fh.write(v) |
| |
| if len(footer) > 0: |
| footer = footer.replace('\n', '\n' + comments) |
| fh.write(comments + footer + newline) |
| finally: |
| if own_fh: |
| fh.close() |
| |
| |
| @set_module('numpy') |
| def fromregex(file, regexp, dtype, encoding=None): |
| r""" |
| Construct an array from a text file, using regular expression parsing. |
| |
| The returned array is always a structured array, and is constructed from |
| all matches of the regular expression in the file. Groups in the regular |
| expression are converted to fields of the structured array. |
| |
| Parameters |
| ---------- |
| file : path or file |
| Filename or file object to read. |
| |
| .. versionchanged:: 1.22.0 |
| Now accepts `os.PathLike` implementations. |
| regexp : str or regexp |
| Regular expression used to parse the file. |
| Groups in the regular expression correspond to fields in the dtype. |
| dtype : dtype or list of dtypes |
| Dtype for the structured array; must be a structured datatype. |
| encoding : str, optional |
        Encoding used to decode the input file. Does not apply to input streams.
| |
| .. versionadded:: 1.14.0 |
| |
| Returns |
| ------- |
| output : ndarray |
| The output array, containing the part of the content of `file` that |
| was matched by `regexp`. `output` is always a structured array. |
| |
| Raises |
| ------ |
| TypeError |
| When `dtype` is not a valid dtype for a structured array. |
| |
| See Also |
| -------- |
| fromstring, loadtxt |
| |
| Notes |
| ----- |
| Dtypes for structured arrays can be specified in several forms, but all |
| forms specify at least the data type and field name. For details see |
| `basics.rec`. |
| |
| Examples |
| -------- |
| >>> from io import StringIO |
| >>> text = StringIO("1312 foo\n1534 bar\n444 qux") |
| |
| >>> regexp = r"(\d+)\s+(...)" # match [digits, whitespace, anything] |
| >>> output = np.fromregex(text, regexp, |
| ... [('num', np.int64), ('key', 'S3')]) |
| >>> output |
| array([(1312, b'foo'), (1534, b'bar'), ( 444, b'qux')], |
| dtype=[('num', '<i8'), ('key', 'S3')]) |
| >>> output['num'] |
    array([1312, 1534,  444])
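
    A pattern with a single group still yields a structured array with one
    field (an illustrative addition reusing the buffer from above; ``'num'``
    is just an example field name):

    >>> _ = text.seek(0)
    >>> np.fromregex(text, r"(\d+)", [('num', np.int64)])
    array([(1312,), (1534,), ( 444,)], dtype=[('num', '<i8')])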
| |
| """ |
| own_fh = False |
| if not hasattr(file, "read"): |
| file = os.fspath(file) |
| file = np.lib._datasource.open(file, 'rt', encoding=encoding) |
| own_fh = True |
| |
| try: |
| if not isinstance(dtype, np.dtype): |
| dtype = np.dtype(dtype) |
| if dtype.names is None: |
| raise TypeError('dtype must be a structured datatype.') |
| |
| content = file.read() |
| if isinstance(content, bytes) and isinstance(regexp, str): |
| regexp = asbytes(regexp) |
| elif isinstance(content, str) and isinstance(regexp, bytes): |
| regexp = asstr(regexp) |
| |
| if not hasattr(regexp, 'match'): |
| regexp = re.compile(regexp) |
| seq = regexp.findall(content) |
| if seq and not isinstance(seq[0], tuple): |
| # Only one group is in the regexp. |
| # Create the new array as a single data-type and then |
| # re-interpret as a single-field structured array. |
| newdtype = np.dtype(dtype[dtype.names[0]]) |
| output = np.array(seq, dtype=newdtype) |
| output.dtype = dtype |
| else: |
| output = np.array(seq, dtype=dtype) |
| |
| return output |
| finally: |
| if own_fh: |
| file.close() |
| |
| |
| #####-------------------------------------------------------------------------- |
| #---- --- ASCII functions --- |
| #####-------------------------------------------------------------------------- |
| |
| |
| @set_array_function_like_doc |
| @set_module('numpy') |
| def genfromtxt(fname, dtype=float, comments='#', delimiter=None, |
| skip_header=0, skip_footer=0, converters=None, |
| missing_values=None, filling_values=None, usecols=None, |
| names=None, excludelist=None, |
| deletechars=''.join(sorted(NameValidator.defaultdeletechars)), |
| replace_space='_', autostrip=False, case_sensitive=True, |
| defaultfmt="f%i", unpack=None, usemask=False, loose=True, |
| invalid_raise=True, max_rows=None, encoding='bytes', |
| *, ndmin=0, like=None): |
| """ |
| Load data from a text file, with missing values handled as specified. |
| |
| Each line past the first `skip_header` lines is split at the `delimiter` |
| character, and characters following the `comments` character are discarded. |
| |
| Parameters |
| ---------- |
| fname : file, str, pathlib.Path, list of str, generator |
| File, filename, list, or generator to read. If the filename |
| extension is ``.gz`` or ``.bz2``, the file is first decompressed. Note |
| that generators must return bytes or strings. The strings |
| in a list or produced by a generator are treated as lines. |
| dtype : dtype, optional |
| Data type of the resulting array. |
| If None, the dtypes will be determined by the contents of each |
| column, individually. |
| comments : str, optional |
| The character used to indicate the start of a comment. |
| All the characters occurring on a line after a comment are discarded. |
| delimiter : str, int, or sequence, optional |
| The string used to separate values. By default, any consecutive |
| whitespaces act as delimiter. An integer or sequence of integers |
| can also be provided as width(s) of each field. |
| skiprows : int, optional |
| `skiprows` was removed in numpy 1.10. Please use `skip_header` instead. |
| skip_header : int, optional |
| The number of lines to skip at the beginning of the file. |
| skip_footer : int, optional |
| The number of lines to skip at the end of the file. |
| converters : variable, optional |
| The set of functions that convert the data of a column to a value. |
| The converters can also be used to provide a default value |
| for missing data: ``converters = {3: lambda s: float(s or 0)}``. |
| missing : variable, optional |
| `missing` was removed in numpy 1.10. Please use `missing_values` |
| instead. |
| missing_values : variable, optional |
| The set of strings corresponding to missing data. |
| filling_values : variable, optional |
| The set of values to be used as default when the data are missing. |
| usecols : sequence, optional |
| Which columns to read, with 0 being the first. For example, |
| ``usecols = (1, 4, 5)`` will extract the 2nd, 5th and 6th columns. |
| names : {None, True, str, sequence}, optional |
| If `names` is True, the field names are read from the first line after |
| the first `skip_header` lines. This line can optionally be preceded |
        by a comment delimiter. If `names` is a sequence or a single string of
| comma-separated names, the names will be used to define the field names |
| in a structured dtype. If `names` is None, the names of the dtype |
| fields will be used, if any. |
| excludelist : sequence, optional |
| A list of names to exclude. This list is appended to the default list |
| ['return','file','print']. Excluded names are appended with an |
| underscore: for example, `file` would become `file_`. |
| deletechars : str, optional |
| A string combining invalid characters that must be deleted from the |
| names. |
| defaultfmt : str, optional |
| A format used to define default field names, such as "f%i" or "f_%02i". |
| autostrip : bool, optional |
| Whether to automatically strip white spaces from the variables. |
| replace_space : char, optional |
        Character(s) used to replace white spaces in the variable names.
        By default, '_' is used.
| case_sensitive : {True, False, 'upper', 'lower'}, optional |
| If True, field names are case sensitive. |
| If False or 'upper', field names are converted to upper case. |
| If 'lower', field names are converted to lower case. |
| unpack : bool, optional |
| If True, the returned array is transposed, so that arguments may be |
| unpacked using ``x, y, z = genfromtxt(...)``. When used with a |
| structured data-type, arrays are returned for each field. |
| Default is False. |
| usemask : bool, optional |
| If True, return a masked array. |
| If False, return a regular array. |
| loose : bool, optional |
| If True, do not raise errors for invalid values. |
| invalid_raise : bool, optional |
| If True, an exception is raised if an inconsistency is detected in the |
| number of columns. |
| If False, a warning is emitted and the offending lines are skipped. |
| max_rows : int, optional |
        The maximum number of rows to read. Must not be used together with
        `skip_footer`. If given, the value must be at least 1. Default is
        to read the entire file.
| |
| .. versionadded:: 1.10.0 |
| encoding : str, optional |
        Encoding used to decode the input file. Does not apply when `fname` is
| a file object. The special value 'bytes' enables backward compatibility |
| workarounds that ensure that you receive byte arrays when possible |
| and passes latin1 encoded strings to converters. Override this value to |
| receive unicode arrays and pass strings as input to converters. If set |
| to None the system default is used. The default value is 'bytes'. |
| |
| .. versionadded:: 1.14.0 |
| ndmin : int, optional |
        Same parameter as `loadtxt`.
| |
| .. versionadded:: 1.23.0 |
| ${ARRAY_FUNCTION_LIKE} |
| |
| .. versionadded:: 1.20.0 |
| |
| Returns |
| ------- |
| out : ndarray |
| Data read from the text file. If `usemask` is True, this is a |
| masked array. |
| |
| See Also |
| -------- |
| numpy.loadtxt : equivalent function when no data is missing. |
| |
| Notes |
| ----- |
| * When spaces are used as delimiters, or when no delimiter has been given |
| as input, there should not be any missing data between two fields. |
| * When the variables are named (either by a flexible dtype or with `names`), |
| there must not be any header in the file (else a ValueError |
| exception is raised). |
| * Individual values are not stripped of spaces by default. |
| When using a custom converter, make sure the function does remove spaces. |
| |
| References |
| ---------- |
| .. [1] NumPy User Guide, section `I/O with NumPy |
| <https://docs.scipy.org/doc/numpy/user/basics.io.genfromtxt.html>`_. |
| |
| Examples |
| -------- |
| >>> from io import StringIO |
| >>> import numpy as np |
| |
    Comma-delimited file with mixed dtype
| |
| >>> s = StringIO(u"1,1.3,abcde") |
| >>> data = np.genfromtxt(s, dtype=[('myint','i8'),('myfloat','f8'), |
| ... ('mystring','S5')], delimiter=",") |
| >>> data |
| array((1, 1.3, b'abcde'), |
| dtype=[('myint', '<i8'), ('myfloat', '<f8'), ('mystring', 'S5')]) |
| |
| Using dtype = None |
| |
| >>> _ = s.seek(0) # needed for StringIO example only |
| >>> data = np.genfromtxt(s, dtype=None, |
| ... names = ['myint','myfloat','mystring'], delimiter=",") |
| >>> data |
| array((1, 1.3, b'abcde'), |
| dtype=[('myint', '<i8'), ('myfloat', '<f8'), ('mystring', 'S5')]) |
| |
| Specifying dtype and names |
| |
| >>> _ = s.seek(0) |
| >>> data = np.genfromtxt(s, dtype="i8,f8,S5", |
| ... names=['myint','myfloat','mystring'], delimiter=",") |
| >>> data |
| array((1, 1.3, b'abcde'), |
| dtype=[('myint', '<i8'), ('myfloat', '<f8'), ('mystring', 'S5')]) |
| |
| An example with fixed-width columns |
| |
| >>> s = StringIO(u"11.3abcde") |
| >>> data = np.genfromtxt(s, dtype=None, names=['intvar','fltvar','strvar'], |
| ... delimiter=[1,3,5]) |
| >>> data |
| array((1, 1.3, b'abcde'), |
| dtype=[('intvar', '<i8'), ('fltvar', '<f8'), ('strvar', 'S5')]) |
| |
| An example to show comments |
| |
| >>> f = StringIO(''' |
| ... text,# of chars |
| ... hello world,11 |
| ... numpy,5''') |
| >>> np.genfromtxt(f, dtype='S12,S12', delimiter=',') |
| array([(b'text', b''), (b'hello world', b'11'), (b'numpy', b'5')], |
| dtype=[('f0', 'S12'), ('f1', 'S12')]) |
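
    An example with a missing value filled in; the sentinel ``-999`` below
    is an arbitrary illustrative choice

    >>> s = StringIO(u'''1,,3
    ... 4,5,6''')
    >>> np.genfromtxt(s, delimiter=",", filling_values=-999)
    array([[   1., -999.,    3.],
           [   4.,    5.,    6.]])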
| |
| """ |
| |
| if like is not None: |
| return _genfromtxt_with_like( |
| like, fname, dtype=dtype, comments=comments, delimiter=delimiter, |
| skip_header=skip_header, skip_footer=skip_footer, |
| converters=converters, missing_values=missing_values, |
| filling_values=filling_values, usecols=usecols, names=names, |
| excludelist=excludelist, deletechars=deletechars, |
| replace_space=replace_space, autostrip=autostrip, |
| case_sensitive=case_sensitive, defaultfmt=defaultfmt, |
| unpack=unpack, usemask=usemask, loose=loose, |
| invalid_raise=invalid_raise, max_rows=max_rows, encoding=encoding, |
| ndmin=ndmin, |
| ) |
| |
| _ensure_ndmin_ndarray_check_param(ndmin) |
| |
| if max_rows is not None: |
| if skip_footer: |
| raise ValueError( |
| "The keywords 'skip_footer' and 'max_rows' can not be " |
| "specified at the same time.") |
| if max_rows < 1: |
| raise ValueError("'max_rows' must be at least 1.") |
| |
| if usemask: |
| from numpy.ma import MaskedArray, make_mask_descr |
| # Check the input dictionary of converters |
| user_converters = converters or {} |
| if not isinstance(user_converters, dict): |
| raise TypeError( |
| "The input argument 'converter' should be a valid dictionary " |
| "(got '%s' instead)" % type(user_converters)) |
| |
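    # The special value 'bytes' (the default) means: decode internally with
    # latin1 but keep handing byte strings to converters and prefer bytes
    # columns in the output, for backward compatibility (see the `encoding`
    # parameter in the docstring above).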
| if encoding == 'bytes': |
| encoding = None |
| byte_converters = True |
| else: |
| byte_converters = False |
| |
| # Initialize the filehandle, the LineSplitter and the NameValidator |
| if isinstance(fname, os_PathLike): |
| fname = os_fspath(fname) |
| if isinstance(fname, str): |
| fid = np.lib._datasource.open(fname, 'rt', encoding=encoding) |
| fid_ctx = contextlib.closing(fid) |
| else: |
| fid = fname |
| fid_ctx = contextlib.nullcontext(fid) |
| try: |
| fhd = iter(fid) |
| except TypeError as e: |
| raise TypeError( |
| "fname must be a string, a filehandle, a sequence of strings,\n" |
| f"or an iterator of strings. Got {type(fname)} instead." |
| ) from e |
| with fid_ctx: |
| split_line = LineSplitter(delimiter=delimiter, comments=comments, |
| autostrip=autostrip, encoding=encoding) |
| validate_names = NameValidator(excludelist=excludelist, |
| deletechars=deletechars, |
| case_sensitive=case_sensitive, |
| replace_space=replace_space) |
| |
| # Skip the first `skip_header` rows |
| try: |
| for i in range(skip_header): |
| next(fhd) |
| |
| # Keep on until we find the first valid values |
| first_values = None |
| |
| while not first_values: |
| first_line = _decode_line(next(fhd), encoding) |
| if (names is True) and (comments is not None): |
| if comments in first_line: |
| first_line = ( |
| ''.join(first_line.split(comments)[1:])) |
| first_values = split_line(first_line) |
| except StopIteration: |
| # return an empty array if the datafile is empty |
| first_line = '' |
| first_values = [] |
| warnings.warn('genfromtxt: Empty input file: "%s"' % fname, stacklevel=2) |
| |
        # Should we take the first values as names?
| if names is True: |
| fval = first_values[0].strip() |
| if comments is not None: |
| if fval in comments: |
| del first_values[0] |
| |
| # Check the columns to use: make sure `usecols` is a list |
| if usecols is not None: |
| try: |
| usecols = [_.strip() for _ in usecols.split(",")] |
| except AttributeError: |
| try: |
| usecols = list(usecols) |
| except TypeError: |
| usecols = [usecols, ] |
| nbcols = len(usecols or first_values) |
| |
| # Check the names and overwrite the dtype.names if needed |
| if names is True: |
| names = validate_names([str(_.strip()) for _ in first_values]) |
| first_line = '' |
| elif _is_string_like(names): |
| names = validate_names([_.strip() for _ in names.split(',')]) |
| elif names: |
| names = validate_names(names) |
| # Get the dtype |
| if dtype is not None: |
| dtype = easy_dtype(dtype, defaultfmt=defaultfmt, names=names, |
| excludelist=excludelist, |
| deletechars=deletechars, |
| case_sensitive=case_sensitive, |
| replace_space=replace_space) |
        # Make sure `names` is a list
| if names is not None: |
| names = list(names) |
| |
| if usecols: |
| for (i, current) in enumerate(usecols): |
| # if usecols is a list of names, convert to a list of indices |
| if _is_string_like(current): |
| usecols[i] = names.index(current) |
| elif current < 0: |
| usecols[i] = current + len(first_values) |
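            # e.g. with names ['a', 'b', 'c'], usecols=('a', -1) is now [0, 2]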
| # If the dtype is not None, make sure we update it |
| if (dtype is not None) and (len(dtype) > nbcols): |
| descr = dtype.descr |
| dtype = np.dtype([descr[_] for _ in usecols]) |
| names = list(dtype.names) |
| # If `names` is not None, update the names |
| elif (names is not None) and (len(names) > nbcols): |
| names = [names[_] for _ in usecols] |
| elif (names is not None) and (dtype is not None): |
| names = list(dtype.names) |
| |
| # Process the missing values ............................... |
| # Rename missing_values for convenience |
| user_missing_values = missing_values or () |
| if isinstance(user_missing_values, bytes): |
| user_missing_values = user_missing_values.decode('latin1') |
| |
| # Define the list of missing_values (one column: one list) |
        missing_values = [[''] for _ in range(nbcols)]
| |
| # We have a dictionary: process it field by field |
| if isinstance(user_missing_values, dict): |
| # Loop on the items |
| for (key, val) in user_missing_values.items(): |
                # Is the key a string?
| if _is_string_like(key): |
| try: |
| # Transform it into an integer |
| key = names.index(key) |
| except ValueError: |
| # We couldn't find it: the name must have been dropped |
| continue |
| # Redefine the key as needed if it's a column number |
| if usecols: |
| try: |
| key = usecols.index(key) |
| except ValueError: |
| pass |
                # Transform the value into a list of strings
| if isinstance(val, (list, tuple)): |
| val = [str(_) for _ in val] |
| else: |
| val = [str(val), ] |
| # Add the value(s) to the current list of missing |
| if key is None: |
| # None acts as default |
| for miss in missing_values: |
| miss.extend(val) |
| else: |
| missing_values[key].extend(val) |
        # We have a sequence: each item matches a column
| elif isinstance(user_missing_values, (list, tuple)): |
| for (value, entry) in zip(user_missing_values, missing_values): |
| value = str(value) |
| if value not in entry: |
| entry.append(value) |
        # We have a string: apply it to all entries
| elif isinstance(user_missing_values, str): |
| user_value = user_missing_values.split(",") |
| for entry in missing_values: |
| entry.extend(user_value) |
| # We have something else: apply it to all entries |
| else: |
| for entry in missing_values: |
| entry.extend([str(user_missing_values)]) |
| |
| # Process the filling_values ............................... |
| # Rename the input for convenience |
| user_filling_values = filling_values |
| if user_filling_values is None: |
| user_filling_values = [] |
| # Define the default |
| filling_values = [None] * nbcols |
        # We have a dictionary: update each entry individually
| if isinstance(user_filling_values, dict): |
| for (key, val) in user_filling_values.items(): |
| if _is_string_like(key): |
| try: |
| # Transform it into an integer |
| key = names.index(key) |
| except ValueError: |
                        # We couldn't find it: the name must have been dropped
| continue |
| # Redefine the key if it's a column number and usecols is defined |
| if usecols: |
| try: |
| key = usecols.index(key) |
| except ValueError: |
| pass |
| # Add the value to the list |
| filling_values[key] = val |
        # We have a sequence: update on a one-to-one basis
| elif isinstance(user_filling_values, (list, tuple)): |
| n = len(user_filling_values) |
| if (n <= nbcols): |
| filling_values[:n] = user_filling_values |
| else: |
| filling_values = user_filling_values[:nbcols] |
        # We have something else: use it for all entries
| else: |
| filling_values = [user_filling_values] * nbcols |
| |
| # Initialize the converters ................................ |
| if dtype is None: |
            # Note: we can't use `[...] * nbcols` here, as that would repeat
            # the same converter nbcols times; each column needs its own
            # converter so it can be upgraded independently.
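            # (a StringConverter created with dtype None starts at bool and
            # upgrades independently through wider types -- integer, float,
            # complex, then string -- as the column's values require)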
| converters = [StringConverter(None, missing_values=miss, default=fill) |
| for (miss, fill) in zip(missing_values, filling_values)] |
| else: |
| dtype_flat = flatten_dtype(dtype, flatten_base=True) |
| # Initialize the converters |
| if len(dtype_flat) > 1: |
| # Flexible type : get a converter from each dtype |
| zipit = zip(dtype_flat, missing_values, filling_values) |
| converters = [StringConverter(dt, locked=True, |
| missing_values=miss, default=fill) |
| for (dt, miss, fill) in zipit] |
| else: |
| # Set to a default converter (but w/ different missing values) |
| zipit = zip(missing_values, filling_values) |
| converters = [StringConverter(dtype, locked=True, |
| missing_values=miss, default=fill) |
| for (miss, fill) in zipit] |
| # Update the converters to use the user-defined ones |
| uc_update = [] |
| for (j, conv) in user_converters.items(): |
| # If the converter is specified by column names, use the index instead |
| if _is_string_like(j): |
| try: |
| j = names.index(j) |
| i = j |
| except ValueError: |
| continue |
| elif usecols: |
| try: |
| i = usecols.index(j) |
| except ValueError: |
| # Unused converter specified |
| continue |
| else: |
| i = j |
| # Find the value to test - first_line is not filtered by usecols: |
| if len(first_line): |
| testing_value = first_values[j] |
| else: |
| testing_value = None |
| if conv is bytes: |
| user_conv = asbytes |
| elif byte_converters: |
                # converters may use decode to work around numpy's old
                # behaviour, so encode the string again before passing it
                # to the user converter
| def tobytes_first(x, conv): |
| if type(x) is bytes: |
| return conv(x) |
| return conv(x.encode("latin1")) |
| user_conv = functools.partial(tobytes_first, conv=conv) |
| else: |
| user_conv = conv |
| converters[i].update(user_conv, locked=True, |
| testing_value=testing_value, |
| default=filling_values[i], |
| missing_values=missing_values[i],) |
| uc_update.append((i, user_conv)) |
| # Make sure we have the corrected keys in user_converters... |
| user_converters.update(uc_update) |
| |
    # FIXME: possibly an error, as the following variable is never used.
    # miss_chars = [_.missing_values for _ in converters]
| |
| # Initialize the output lists ... |
| # ... rows |
| rows = [] |
| append_to_rows = rows.append |
| # ... masks |
| if usemask: |
| masks = [] |
| append_to_masks = masks.append |
| # ... invalid |
| invalid = [] |
| append_to_invalid = invalid.append |
| |
| # Parse each line |
| for (i, line) in enumerate(itertools.chain([first_line, ], fhd)): |
| values = split_line(line) |
| nbvalues = len(values) |
| # Skip an empty line |
| if nbvalues == 0: |
| continue |
| if usecols: |
| # Select only the columns we need |
| try: |
| values = [values[_] for _ in usecols] |
| except IndexError: |
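                    # record a 1-based line number for the report: row 0 of
                    # this loop corresponds to line skip_header + 1 of the file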
| append_to_invalid((i + skip_header + 1, nbvalues)) |
| continue |
| elif nbvalues != nbcols: |
| append_to_invalid((i + skip_header + 1, nbvalues)) |
| continue |
| # Store the values |
| append_to_rows(tuple(values)) |
| if usemask: |
| append_to_masks(tuple([v.strip() in m |
| for (v, m) in zip(values, |
| missing_values)])) |
| if len(rows) == max_rows: |
| break |
| |
| # Upgrade the converters (if needed) |
| if dtype is None: |
| for (i, converter) in enumerate(converters): |
| current_column = [itemgetter(i)(_m) for _m in rows] |
| try: |
| converter.iterupgrade(current_column) |
| except ConverterLockError: |
| errmsg = "Converter #%i is locked and cannot be upgraded: " % i |
| current_column = map(itemgetter(i), rows) |
| for (j, value) in enumerate(current_column): |
| try: |
| converter.upgrade(value) |
| except (ConverterError, ValueError): |
| errmsg += "(occurred line #%i for value '%s')" |
| errmsg %= (j + 1 + skip_header, value) |
| raise ConverterError(errmsg) |
| |
| # Check that we don't have invalid values |
| nbinvalid = len(invalid) |
| if nbinvalid > 0: |
| nbrows = len(rows) + nbinvalid - skip_footer |
| # Construct the error message |
| template = " Line #%%i (got %%i columns instead of %i)" % nbcols |
| if skip_footer > 0: |
| nbinvalid_skipped = len([_ for _ in invalid |
| if _[0] > nbrows + skip_header]) |
| invalid = invalid[:nbinvalid - nbinvalid_skipped] |
| skip_footer -= nbinvalid_skipped |
| # |
| # nbrows -= skip_footer |
| # errmsg = [template % (i, nb) |
| # for (i, nb) in invalid if i < nbrows] |
| # else: |
| errmsg = [template % (i, nb) |
| for (i, nb) in invalid] |
| if len(errmsg): |
| errmsg.insert(0, "Some errors were detected !") |
| errmsg = "\n".join(errmsg) |
            # Raise an exception?
| if invalid_raise: |
| raise ValueError(errmsg) |
            # Issue a warning?
| else: |
| warnings.warn(errmsg, ConversionWarning, stacklevel=2) |
| |
| # Strip the last skip_footer data |
| if skip_footer > 0: |
| rows = rows[:-skip_footer] |
| if usemask: |
| masks = masks[:-skip_footer] |
| |
| # Convert each value according to the converter: |
| # We want to modify the list in place to avoid creating a new one... |
| if loose: |
| rows = list( |
| zip(*[[conv._loose_call(_r) for _r in map(itemgetter(i), rows)] |
| for (i, conv) in enumerate(converters)])) |
| else: |
| rows = list( |
| zip(*[[conv._strict_call(_r) for _r in map(itemgetter(i), rows)] |
| for (i, conv) in enumerate(converters)])) |
| |
| # Reset the dtype |
| data = rows |
| if dtype is None: |
| # Get the dtypes from the types of the converters |
| column_types = [conv.type for conv in converters] |
| # Find the columns with strings... |
| strcolidx = [i for (i, v) in enumerate(column_types) |
| if v == np.str_] |
| |
| if byte_converters and strcolidx: |
| # convert strings back to bytes for backward compatibility |
| warnings.warn( |
| "Reading unicode strings without specifying the encoding " |
| "argument is deprecated. Set the encoding, use None for the " |
| "system default.", |
| np.VisibleDeprecationWarning, stacklevel=2) |
| def encode_unicode_cols(row_tup): |
| row = list(row_tup) |
| for i in strcolidx: |
| row[i] = row[i].encode('latin1') |
| return tuple(row) |
| |
| try: |
| data = [encode_unicode_cols(r) for r in data] |
| except UnicodeEncodeError: |
| pass |
| else: |
| for i in strcolidx: |
| column_types[i] = np.bytes_ |
| |
| # Update string types to be the right length |
| sized_column_types = column_types[:] |
| for i, col_type in enumerate(column_types): |
| if np.issubdtype(col_type, np.character): |
| n_chars = max(len(row[i]) for row in data) |
| sized_column_types[i] = (col_type, n_chars) |
| |
| if names is None: |
| # If the dtype is uniform (before sizing strings) |
| base = { |
| c_type |
| for c, c_type in zip(converters, column_types) |
| if c._checked} |
| if len(base) == 1: |
| uniform_type, = base |
| (ddtype, mdtype) = (uniform_type, bool) |
| else: |
| ddtype = [(defaultfmt % i, dt) |
| for (i, dt) in enumerate(sized_column_types)] |
| if usemask: |
| mdtype = [(defaultfmt % i, bool) |
| for (i, dt) in enumerate(sized_column_types)] |
| else: |
| ddtype = list(zip(names, sized_column_types)) |
| mdtype = list(zip(names, [bool] * len(sized_column_types))) |
| output = np.array(data, dtype=ddtype) |
| if usemask: |
| outputmask = np.array(masks, dtype=mdtype) |
| else: |
| # Overwrite the initial dtype names if needed |
| if names and dtype.names is not None: |
| dtype.names = names |
        # Case #1. We have a structured type
| if len(dtype_flat) > 1: |
            # Nested dtype, e.g. [('a', int), ('b', [('b0', int), ('b1', 'f4')])]
| # First, create the array using a flattened dtype: |
| # [('a', int), ('b1', int), ('b2', float)] |
| # Then, view the array using the specified dtype. |
| if 'O' in (_.char for _ in dtype_flat): |
| if has_nested_fields(dtype): |
| raise NotImplementedError( |
| "Nested fields involving objects are not supported...") |
| else: |
| output = np.array(data, dtype=dtype) |
| else: |
| rows = np.array(data, dtype=[('', _) for _ in dtype_flat]) |
| output = rows.view(dtype) |
| # Now, process the rowmasks the same way |
| if usemask: |
| rowmasks = np.array( |
| masks, dtype=np.dtype([('', bool) for t in dtype_flat])) |
| # Construct the new dtype |
| mdtype = make_mask_descr(dtype) |
| outputmask = rowmasks.view(mdtype) |
| # Case #2. We have a basic dtype |
| else: |
| # We used some user-defined converters |
| if user_converters: |
| ishomogeneous = True |
| descr = [] |
| for i, ttype in enumerate([conv.type for conv in converters]): |
| # Keep the dtype of the current converter |
| if i in user_converters: |
| ishomogeneous &= (ttype == dtype.type) |
| if np.issubdtype(ttype, np.character): |
| ttype = (ttype, max(len(row[i]) for row in data)) |
| descr.append(('', ttype)) |
| else: |
| descr.append(('', dtype)) |
                # Did we change the dtype?
| if not ishomogeneous: |
| # We have more than one field |
| if len(descr) > 1: |
| dtype = np.dtype(descr) |
| # We have only one field: drop the name if not needed. |
| else: |
| dtype = np.dtype(ttype) |
| # |
| output = np.array(data, dtype) |
| if usemask: |
| if dtype.names is not None: |
| mdtype = [(_, bool) for _ in dtype.names] |
| else: |
| mdtype = bool |
| outputmask = np.array(masks, dtype=mdtype) |
| # Try to take care of the missing data we missed |
| names = output.dtype.names |
| if usemask and names: |
| for (name, conv) in zip(names, converters): |
| missing_values = [conv(_) for _ in conv.missing_values |
| if _ != ''] |
| for mval in missing_values: |
| outputmask[name] |= (output[name] == mval) |
| # Construct the final array |
| if usemask: |
| output = output.view(MaskedArray) |
| output._mask = outputmask |
| |
| output = _ensure_ndmin_ndarray(output, ndmin=ndmin) |
| |
| if unpack: |
| if names is None: |
| return output.T |
| elif len(names) == 1: |
| # squeeze single-name dtypes too |
| return output[names[0]] |
| else: |
| # For structured arrays with multiple fields, |
| # return an array for each field. |
| return [output[field] for field in names] |
| return output |
| |
| |
| _genfromtxt_with_like = array_function_dispatch()(genfromtxt) |
| |
| |
| def recfromtxt(fname, **kwargs): |
| """ |
| Load ASCII data from a file and return it in a record array. |
| |
    If ``usemask=False`` a standard `recarray` is returned;
    if ``usemask=True`` a MaskedRecords array is returned.
| |
| Parameters |
| ---------- |
| fname, kwargs : For a description of input parameters, see `genfromtxt`. |
| |
| See Also |
| -------- |
| numpy.genfromtxt : generic function |
| |
| Notes |
| ----- |
| By default, `dtype` is None, which means that the data-type of the output |
| array will be determined from the data. |
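
    Examples
    --------
    A minimal sketch with illustrative data; the field names are taken
    from the header line because of ``names=True``:

    >>> from io import StringIO
    >>> s = StringIO(u'''x y
    ... 1 2
    ... 3 4''')
    >>> r = np.recfromtxt(s, names=True)
    >>> r.x
    array([1, 3])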
| |
| """ |
| kwargs.setdefault("dtype", None) |
| usemask = kwargs.get('usemask', False) |
| output = genfromtxt(fname, **kwargs) |
| if usemask: |
| from numpy.ma.mrecords import MaskedRecords |
| output = output.view(MaskedRecords) |
| else: |
| output = output.view(np.recarray) |
| return output |
| |
| |
| def recfromcsv(fname, **kwargs): |
| """ |
| Load ASCII data stored in a comma-separated file. |
| |
| The returned array is a record array (if ``usemask=False``, see |
| `recarray`) or a masked record array (if ``usemask=True``, |
| see `ma.mrecords.MaskedRecords`). |
| |
| Parameters |
| ---------- |
| fname, kwargs : For a description of input parameters, see `genfromtxt`. |
| |
| See Also |
| -------- |
| numpy.genfromtxt : generic function to load ASCII data. |
| |
| Notes |
| ----- |
| By default, `dtype` is None, which means that the data-type of the output |
| array will be determined from the data. |
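
    Examples
    --------
    A minimal sketch with illustrative data; the header names are
    lowercased because ``case_sensitive="lower"`` is the default here:

    >>> from io import StringIO
    >>> s = StringIO(u'''A,B
    ... 1,2
    ... 3,4''')
    >>> r = np.recfromcsv(s)
    >>> r.a
    array([1, 3])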
| |
| """ |
| # Set default kwargs for genfromtxt as relevant to csv import. |
| kwargs.setdefault("case_sensitive", "lower") |
| kwargs.setdefault("names", True) |
| kwargs.setdefault("delimiter", ",") |
| kwargs.setdefault("dtype", None) |
| output = genfromtxt(fname, **kwargs) |
| |
| usemask = kwargs.get("usemask", False) |
| if usemask: |
| from numpy.ma.mrecords import MaskedRecords |
| output = output.view(MaskedRecords) |
| else: |
| output = output.view(np.recarray) |
| return output |