| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| import pytest |
| import sys |
| import weakref |
| |
| import numpy as np |
| import pyarrow as pa |
| |
| try: |
| from scipy.sparse import csr_matrix, coo_matrix |
| except ImportError: |
| coo_matrix = None |
| csr_matrix = None |
| |
| try: |
| import sparse |
| except ImportError: |
| sparse = None |
| |
| |
# (NumPy dtype code, Arrow data type) pairs; the dtype-parametrized tests
# in this module receive them as ``dtype_str, arrow_type``.
tensor_type_pairs = list(zip(
    ['i1', 'i2', 'i4', 'i8',
     'u1', 'u2', 'u4', 'u8',
     'f2', 'f4', 'f8'],
    [pa.int8(), pa.int16(), pa.int32(), pa.int64(),
     pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64(),
     pa.float16(), pa.float32(), pa.float64()],
))
| |
| |
@pytest.mark.parametrize('sparse_tensor_type', [
    pa.SparseCSRMatrix,
    pa.SparseCSCMatrix,
    pa.SparseCOOTensor,
    pa.SparseCSFTensor,
])
def test_sparse_tensor_attrs(sparse_tensor_type):
    """Check the basic attributes of a sparse tensor built from a dense
    4x6 array, then verify that a weak reference to it dies once the only
    local name is deleted."""
    dense = np.array([
        [8, 0, 2, 0, 0, 0],
        [0, 0, 0, 0, 0, 5],
        [3, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 4, 6],
    ])
    names = ('x', 'y')
    st = sparse_tensor_type.from_dense_numpy(dense, names)

    assert st.ndim == 2
    assert st.size == 24
    assert st.shape == dense.shape
    assert st.is_mutable
    assert st.dim_name(0) == names[0]
    assert st.dim_names == names
    assert st.non_zero_length == 6

    # No other strong reference is taken, so deleting the local name must
    # leave the weak reference dead.
    ref = weakref.ref(st)
    assert ref() is not None
    del st
    assert ref() is None
| |
| |
def test_sparse_coo_tensor_base_object():
    """Check reference handling of ``SparseCOOTensor.to_numpy()``.

    The two returned arrays are expected to raise the tensor's reference
    count by two, and to still hold the expected contents after the local
    name for the tensor has been rebound to None.
    """
    want_values = np.array([[8, 2, 5, 3, 4, 6]]).T
    want_coords = np.array([
        [0, 0, 1, 2, 3, 3],
        [0, 2, 5, 0, 4, 5],
    ]).T
    dense = np.array([
        [8, 0, 2, 0, 0, 0],
        [0, 0, 0, 0, 0, 5],
        [3, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 4, 6],
    ])

    coo = pa.SparseCOOTensor.from_dense_numpy(dense)
    refs_before = sys.getrefcount(coo)
    got_values, got_coords = coo.to_numpy()
    assert coo.has_canonical_format
    assert sys.getrefcount(coo) == refs_before + 2

    # Drop the local reference before looking at the arrays.
    coo = None
    assert np.array_equal(want_values, got_values)
    assert np.array_equal(want_coords, got_coords)
    assert got_coords.flags.c_contiguous  # row-major
| |
| |
def test_sparse_csr_matrix_base_object():
    """Check reference handling of ``SparseCSRMatrix.to_numpy()``.

    The three returned arrays are expected to raise the matrix's reference
    count by three, and to still hold the expected contents after the
    local name for the matrix has been rebound to None.
    """
    want_values = np.array([[8, 2, 5, 3, 4, 6]]).T
    want_indptr = np.array([0, 2, 3, 4, 6])
    want_indices = np.array([0, 2, 5, 0, 4, 5])
    dense = np.array([
        [8, 0, 2, 0, 0, 0],
        [0, 0, 0, 0, 0, 5],
        [3, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 4, 6],
    ])

    csr = pa.SparseCSRMatrix.from_dense_numpy(dense)
    refs_before = sys.getrefcount(csr)
    got_values, got_indptr, got_indices = csr.to_numpy()
    assert sys.getrefcount(csr) == refs_before + 3

    # Drop the local reference before looking at the arrays.
    csr = None
    assert np.array_equal(want_values, got_values)
    assert np.array_equal(want_indptr, got_indptr)
    assert np.array_equal(want_indices, got_indices)
| |
| |
def test_sparse_csf_tensor_base_object():
    """Check reference handling of ``SparseCSFTensor.to_numpy()``.

    The refcount of the tensor is expected to grow by four after the call,
    and the returned data / indptr / indices must still hold the expected
    contents after the local name for the tensor has been rebound to None.
    """
    want_values = np.array([[8, 2, 5, 3, 4, 6]]).T
    want_indptr = [np.array([0, 2, 3, 4, 6])]
    want_indices = [
        np.array([0, 1, 2, 3]),
        np.array([0, 2, 5, 0, 4, 5]),
    ]
    dense = np.array([
        [8, 0, 2, 0, 0, 0],
        [0, 0, 0, 0, 0, 5],
        [3, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 4, 6],
    ])

    csf = pa.SparseCSFTensor.from_dense_numpy(dense)
    refs_before = sys.getrefcount(csf)
    got_values, got_indptr, got_indices = csf.to_numpy()
    assert sys.getrefcount(csf) == refs_before + 4

    # Drop the local reference before looking at the arrays.
    csf = None
    assert np.array_equal(want_values, got_values)
    assert np.array_equal(want_indptr[0], got_indptr[0])
    for level, expected in enumerate(want_indices):
        assert np.array_equal(expected, got_indices[level])
| |
| |
@pytest.mark.parametrize('sparse_tensor_type', [
    pa.SparseCSRMatrix,
    pa.SparseCSCMatrix,
    pa.SparseCOOTensor,
    pa.SparseCSFTensor,
])
def test_sparse_tensor_equals(sparse_tensor_type):
    """Exercise ``equals()``, ``==`` and ``!=`` on sparse tensors built
    from a strided view, a contiguous copy of it, and a modified copy."""

    def assert_same(left, right):
        assert left.equals(right)
        assert left == right
        assert not (left != right)

    def assert_different(left, right):
        assert not left.equals(right)
        assert not (left == right)
        assert left != right

    # Every other column of a random 10x6 array: a non-contiguous view.
    strided = np.random.randn(10, 6)[:, ::2]
    from_view = sparse_tensor_type.from_dense_numpy(strided)
    from_contiguous = sparse_tensor_type.from_dense_numpy(
        np.ascontiguousarray(strided))
    assert_same(from_view, from_contiguous)

    # Overwrite a single element of a copy; the tensors must now differ.
    altered = strided.copy()
    altered[9, 0] = 1.0
    from_altered = sparse_tensor_type.from_dense_numpy(
        np.ascontiguousarray(altered))
    assert_different(from_view, from_altered)
| |
| |
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_sparse_coo_tensor_from_dense(dtype_str, arrow_type):
    """Build a SparseCOOTensor from a dense NumPy array and from a
    ``pa.Tensor``, and compare the type, data and coordinates it reports
    against the expected values."""
    np_dtype = np.dtype(dtype_str)
    want_values = np.array([[8, 2, 5, 3, 4, 6]]).T.astype(np_dtype)
    want_coords = np.array([
        [0, 0, 1, 2, 3, 3],
        [0, 2, 5, 0, 4, 5],
    ]).T
    dense = np.array([
        [8, 0, 2, 0, 0, 0],
        [0, 0, 0, 0, 0, 5],
        [3, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 4, 6],
    ]).astype(np_dtype)
    dense_tensor = pa.Tensor.from_numpy(dense)

    # Same checks for both construction paths: NumPy array, then Tensor.
    construction_paths = [
        (pa.SparseCOOTensor.from_dense_numpy, dense),
        (pa.SparseCOOTensor.from_tensor, dense_tensor),
    ]
    for build, source in construction_paths:
        coo = build(source)
        repr(coo)
        got_values, got_coords = coo.to_numpy()
        assert coo.type == arrow_type
        assert np.array_equal(want_values, got_values)
        assert np.array_equal(want_coords, got_coords)
| |
| |
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_sparse_csr_matrix_from_dense(dtype_str, arrow_type):
    """Build a SparseCSRMatrix from a dense NumPy array and from a
    ``pa.Tensor``, and compare the type, data, indptr and indices it
    reports against the expected values."""
    np_dtype = np.dtype(dtype_str)
    want_values = np.array([[8, 2, 5, 3, 4, 6]]).T.astype(np_dtype)
    want_indptr = np.array([0, 2, 3, 4, 6])
    want_indices = np.array([0, 2, 5, 0, 4, 5])
    dense = np.array([
        [8, 0, 2, 0, 0, 0],
        [0, 0, 0, 0, 0, 5],
        [3, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 4, 6],
    ]).astype(np_dtype)
    dense_tensor = pa.Tensor.from_numpy(dense)

    # Same checks for both construction paths: NumPy array, then Tensor.
    construction_paths = [
        (pa.SparseCSRMatrix.from_dense_numpy, dense),
        (pa.SparseCSRMatrix.from_tensor, dense_tensor),
    ]
    for build, source in construction_paths:
        csr = build(source)
        repr(csr)
        got_values, got_indptr, got_indices = csr.to_numpy()
        assert csr.type == arrow_type
        assert np.array_equal(want_values, got_values)
        assert np.array_equal(want_indptr, got_indptr)
        assert np.array_equal(want_indices, got_indices)
| |
| |
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_sparse_csf_tensor_from_dense_numpy(dtype_str, arrow_type):
    """Build a SparseCSFTensor from a dense NumPy array and compare the
    type, data, indptr and indices it reports against the expected
    values."""
    np_dtype = np.dtype(dtype_str)
    want_values = np.array([[8, 2, 5, 3, 4, 6]]).T.astype(np_dtype)
    want_indptr = [np.array([0, 2, 3, 4, 6])]
    want_indices = [
        np.array([0, 1, 2, 3]),
        np.array([0, 2, 5, 0, 4, 5]),
    ]
    dense = np.array([
        [8, 0, 2, 0, 0, 0],
        [0, 0, 0, 0, 0, 5],
        [3, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 4, 6],
    ]).astype(np_dtype)

    # Test from numpy array
    csf = pa.SparseCSFTensor.from_dense_numpy(dense)
    repr(csf)
    got_values, got_indptr, got_indices = csf.to_numpy()
    assert csf.type == arrow_type
    assert np.array_equal(want_values, got_values)
    assert np.array_equal(want_indptr[0], got_indptr[0])
    for level, expected in enumerate(want_indices):
        assert np.array_equal(expected, got_indices[level])
| |
| |
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_sparse_csf_tensor_from_dense_tensor(dtype_str, arrow_type):
    """Build a SparseCSFTensor from a dense ``pa.Tensor`` and compare the
    type, data, indptr and indices it reports against the expected
    values."""
    np_dtype = np.dtype(dtype_str)
    want_values = np.array([[8, 2, 5, 3, 4, 6]]).T.astype(np_dtype)
    want_indptr = [np.array([0, 2, 3, 4, 6])]
    want_indices = [
        np.array([0, 1, 2, 3]),
        np.array([0, 2, 5, 0, 4, 5]),
    ]
    dense = np.array([
        [8, 0, 2, 0, 0, 0],
        [0, 0, 0, 0, 0, 5],
        [3, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 4, 6],
    ]).astype(np_dtype)
    dense_tensor = pa.Tensor.from_numpy(dense)

    # Test from Tensor
    csf = pa.SparseCSFTensor.from_tensor(dense_tensor)
    repr(csf)
    got_values, got_indptr, got_indices = csf.to_numpy()
    assert csf.type == arrow_type
    assert np.array_equal(want_values, got_values)
    assert np.array_equal(want_indptr[0], got_indptr[0])
    for level, expected in enumerate(want_indices):
        assert np.array_equal(expected, got_indices[level])
| |
| |
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_sparse_coo_tensor_numpy_roundtrip(dtype_str, arrow_type):
    """Create a SparseCOOTensor from data and coordinate arrays and check
    that ``to_numpy()`` hands back equal arrays."""
    np_dtype = np.dtype(dtype_str)
    values = np.array([[1, 2, 3, 4, 5, 6]]).T.astype(np_dtype)
    coordinates = np.array([
        [0, 0, 2, 3, 1, 3],
        [0, 2, 0, 4, 5, 5],
    ]).T
    tensor_shape = (4, 6)
    names = ('x', 'y')

    coo = pa.SparseCOOTensor.from_numpy(
        values, coordinates, tensor_shape, names)
    repr(coo)
    got_values, got_coordinates = coo.to_numpy()

    assert coo.type == arrow_type
    assert np.array_equal(values, got_values)
    assert np.array_equal(coordinates, got_coordinates)
    assert coo.dim_names == names
| |
| |
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_sparse_csr_matrix_numpy_roundtrip(dtype_str, arrow_type):
    """Create a SparseCSRMatrix from data, indptr and indices arrays and
    check that ``to_numpy()`` hands back equal arrays."""
    np_dtype = np.dtype(dtype_str)
    values = np.array([[8, 2, 5, 3, 4, 6]]).T.astype(np_dtype)
    row_pointers = np.array([0, 2, 3, 4, 6])
    column_indices = np.array([0, 2, 5, 0, 4, 5])
    matrix_shape = (4, 6)
    names = ('x', 'y')

    csr = pa.SparseCSRMatrix.from_numpy(
        values, row_pointers, column_indices, matrix_shape, names)
    repr(csr)
    got_values, got_indptr, got_indices = csr.to_numpy()

    assert csr.type == arrow_type
    assert np.array_equal(values, got_values)
    assert np.array_equal(row_pointers, got_indptr)
    assert np.array_equal(column_indices, got_indices)
    assert csr.dim_names == names
| |
| |
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_sparse_csf_tensor_numpy_roundtrip(dtype_str, arrow_type):
    """Create a SparseCSFTensor from data, indptr and indices arrays and
    check that ``to_numpy()`` hands back equal arrays."""
    np_dtype = np.dtype(dtype_str)
    values = np.array([[8, 2, 5, 3, 4, 6]]).T.astype(np_dtype)
    level_pointers = [np.array([0, 2, 3, 4, 6])]
    level_indices = [
        np.array([0, 1, 2, 3]),
        np.array([0, 2, 5, 0, 4, 5]),
    ]
    order_of_axes = (0, 1)
    tensor_shape = (4, 6)
    names = ('x', 'y')

    csf = pa.SparseCSFTensor.from_numpy(
        values, level_pointers, level_indices,
        tensor_shape, order_of_axes, names)
    repr(csf)
    got_values, got_indptr, got_indices = csf.to_numpy()

    assert csf.type == arrow_type
    assert np.array_equal(values, got_values)
    assert np.array_equal(level_pointers[0], got_indptr[0])
    for level, expected in enumerate(level_indices):
        assert np.array_equal(expected, got_indices[level])
    assert csf.dim_names == names
| |
| |
@pytest.mark.parametrize('sparse_tensor_type', [
    pa.SparseCSRMatrix,
    pa.SparseCSCMatrix,
    pa.SparseCOOTensor,
    pa.SparseCSFTensor,
])
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_dense_to_sparse_tensor(dtype_str, arrow_type, sparse_tensor_type):
    """Convert a dense 4x4 array to a sparse tensor and back to a dense
    tensor, checking types, dimension names and contents."""
    np_dtype = np.dtype(dtype_str)
    dense = np.array([
        [4, 0, 9, 0],
        [0, 7, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 5],
    ]).astype(np_dtype)
    names = ('x', 'y')

    as_sparse = sparse_tensor_type.from_dense_numpy(dense, names)
    back_to_dense = as_sparse.to_tensor()
    roundtripped = back_to_dense.to_numpy()

    assert as_sparse.type == arrow_type
    assert back_to_dense.type == arrow_type
    assert as_sparse.dim_names == names
    assert np.array_equal(dense, roundtripped)
| |
| |
@pytest.mark.skipif(not coo_matrix, reason="requires scipy")
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_sparse_coo_tensor_scipy_roundtrip(dtype_str, arrow_type):
    """Round-trip a scipy ``coo_matrix`` through SparseCOOTensor, first in
    non-canonical form and then after ``sum_duplicates()``."""
    np_dtype = np.dtype(dtype_str)
    values = np.array([1, 2, 3, 4, 5, 6]).astype(np_dtype)
    rows = np.array([0, 0, 2, 3, 1, 3])
    cols = np.array([0, 2, 0, 4, 5, 5])
    matrix_shape = (4, 6)
    names = ('x', 'y')

    # non-canonical sparse coo matrix
    original = coo_matrix((values, (rows, cols)), shape=matrix_shape)
    coo = pa.SparseCOOTensor.from_scipy(original, dim_names=names)
    restored = coo.to_scipy()

    assert not original.has_canonical_format
    assert not coo.has_canonical_format
    assert not restored.has_canonical_format
    assert coo.type == arrow_type
    assert coo.dim_names == names
    assert original.dtype == restored.dtype
    assert np.array_equal(original.data, restored.data)
    assert np.array_equal(original.row, restored.row)
    assert np.array_equal(original.col, restored.col)

    # For float16 the dense array is produced via a float32 copy of the
    # matrix and cast back afterwards.
    dense = (
        original.astype(np.float32).toarray().astype(np.float16)
        if dtype_str == 'f2'
        else original.toarray()
    )
    assert np.array_equal(dense, coo.to_tensor().to_numpy())

    # canonical sparse coo matrix
    original.sum_duplicates()
    coo = pa.SparseCOOTensor.from_scipy(original, dim_names=names)
    restored = coo.to_scipy()

    assert original.has_canonical_format
    assert coo.has_canonical_format
    assert restored.has_canonical_format
| |
| |
@pytest.mark.skipif(not csr_matrix, reason="requires scipy")
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_sparse_csr_matrix_scipy_roundtrip(dtype_str, arrow_type):
    """Round-trip a scipy ``csr_matrix`` through SparseCSRMatrix and
    compare dtype, data, indptr, indices and the dense contents."""
    np_dtype = np.dtype(dtype_str)
    values = np.array([8, 2, 5, 3, 4, 6]).astype(np_dtype)
    row_pointers = np.array([0, 2, 3, 4, 6])
    column_indices = np.array([0, 2, 5, 0, 4, 5])
    matrix_shape = (4, 6)
    names = ('x', 'y')

    original = csr_matrix(
        (values, column_indices, row_pointers), shape=matrix_shape)
    csr = pa.SparseCSRMatrix.from_scipy(original, dim_names=names)
    restored = csr.to_scipy()

    assert csr.type == arrow_type
    assert csr.dim_names == names
    assert original.dtype == restored.dtype
    assert np.array_equal(original.data, restored.data)
    assert np.array_equal(original.indptr, restored.indptr)
    assert np.array_equal(original.indices, restored.indices)

    # For float16 the dense array is produced via a float32 copy of the
    # matrix and cast back afterwards.
    dense = (
        original.astype(np.float32).toarray().astype(np.float16)
        if dtype_str == 'f2'
        else original.toarray()
    )
    assert np.array_equal(dense, csr.to_tensor().to_numpy())
| |
| |
@pytest.mark.skipif(not sparse, reason="requires pydata/sparse")
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_pydata_sparse_sparse_coo_tensor_roundtrip(dtype_str, arrow_type):
    """Round-trip a pydata/sparse ``COO`` array through SparseCOOTensor and
    compare dtype, data, coords and the dense contents."""
    np_dtype = np.dtype(dtype_str)
    values = np.array([1, 2, 3, 4, 5, 6]).astype(np_dtype)
    coordinates = np.array([
        [0, 0, 2, 3, 1, 3],
        [0, 2, 0, 4, 5, 5],
    ])
    array_shape = (4, 6)
    names = ("x", "y")

    original = sparse.COO(
        data=values, coords=coordinates, shape=array_shape)
    coo = pa.SparseCOOTensor.from_pydata_sparse(original, dim_names=names)
    restored = coo.to_pydata_sparse()

    assert coo.type == arrow_type
    assert coo.dim_names == names
    assert original.dtype == restored.dtype
    assert np.array_equal(original.data, restored.data)
    assert np.array_equal(original.coords, restored.coords)
    assert np.array_equal(original.todense(), coo.to_tensor().to_numpy())