# python/pyspark/pandas/tests/indexes/test_indexing_iloc.py - spark - Git at Google

 #
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 import unittest

 import numpy as np
 import pandas as pd

 from pyspark import pandas as ps
 from pyspark.pandas.exceptions import SparkPandasIndexingError, SparkPandasNotImplementedError
 from pyspark.testing.pandasutils import PandasOnSparkTestCase
 from pyspark.testing.sqlutils import SQLTestUtils


 class IndexingILocMixin:
     """Compare ``.iloc`` reads, writes and error cases on pandas and pandas-on-Spark objects.

     The methods call ``assert_eq``, ``subTest`` and the ``assertRaises*`` helpers on
     ``self``, so the mixin must be combined with a test-case base that supplies them.
     """

     @property
     def pdf(self):
         """Build a new pandas frame with columns "a" and "b" and a repeated index label."""
         labels = [0, 1, 3, 5, 6, 8, 9, 9, 9]
         data = {
             "a": [1, 2, 3, 4, 5, 6, 7, 8, 9],
             "b": [4, 5, 6, 3, 2, 1, 0, 0, 0],
         }
         return pd.DataFrame(data, index=labels)

     @property
     def pdf2(self):
         """Build the same values and index as ``pdf`` under the integer column labels 0 and 1."""
         labels = [0, 1, 3, 5, 6, 8, 9, 9, 9]
         data = {
             0: [1, 2, 3, 4, 5, 6, 7, 8, 9],
             1: [4, 5, 6, 3, 2, 1, 0, 0, 0],
         }
         return pd.DataFrame(data, index=labels)

     @property
     def psdf(self):
         """Convert a freshly built ``pdf`` with ``ps.from_pandas``."""
         pandas_frame = self.pdf
         return ps.from_pandas(pandas_frame)

     @property
     def psdf2(self):
         """Convert a freshly built ``pdf2`` with ``ps.from_pandas``."""
         pandas_frame = self.pdf2
         return ps.from_pandas(pandas_frame)

     def test_iloc(self):
         # One scalar lookup, then every column selector against three row slices.
         pdf = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]})
         psdf = ps.from_pandas(pdf)

         self.assert_eq(psdf.iloc[0, 0], pdf.iloc[0, 0])

         row_slices = (slice(None), slice(None, 1), slice(None, -1))  # ":", ":1", ":-1"
         col_selectors = [0, [0], [0, 1], [1, 0], [False, True, True], slice(0, 1)]
         for cols in col_selectors:
             for rows in row_slices:
                 self.assert_eq(psdf.iloc[rows, cols], pdf.iloc[rows, cols])
             # NOTE(review): the boolean-mask row selector below is disabled; the reason
             # is not recorded in this file.
             # self.assert_eq(psdf.iloc[psdf.index == 2, cols], pdf.iloc[pdf.index == 2, cols])

         # A string used as the row key has to be rejected.
         with self.assertRaisesRegex(
             SparkPandasNotImplementedError,
             ".iloc requires numeric slice, conditional boolean",
         ):
             ps.range(10).iloc["a", :]

     def test_iloc_multiindex_columns(self):
         # Same column-selector sweep as test_iloc, on a frame whose column labels
         # are built from two parallel arrays.
         level_values = [
             np.array(["bar", "bar", "baz", "baz"]),
             np.array(["one", "two", "one", "two"]),
         ]
         pdf = pd.DataFrame(np.random.randn(3, 4), index=["A", "B", "C"], columns=level_values)
         psdf = ps.from_pandas(pdf)

         row_slices = (slice(None), slice(None, 1), slice(None, -1))  # ":", ":1", ":-1"
         for cols in [0, [0], [0, 1], [1, 0], [False, True, True, True], slice(0, 1)]:
             for rows in row_slices:
                 self.assert_eq(psdf.iloc[rows, cols], pdf.iloc[rows, cols])
             # NOTE(review): disabled boolean-mask row selector, kept for reference.
             # self.assert_eq(psdf.iloc[psdf.index == "B", cols],
             #                pdf.iloc[pdf.index == "B", cols])

     def test_iloc_series(self):
         # Positional reads on a Series and on an expression derived from it.
         pser = pd.Series([1, 2, 3])
         psser = ps.from_pandas(pser)

         keys = (0, slice(None), slice(None, 1), slice(None, -1))  # 0, ":", ":1", ":-1"
         for key in keys:
             self.assert_eq(psser.iloc[key], pser.iloc[key])
         for key in keys:
             self.assert_eq((psser + 1).iloc[key], (pser + 1).iloc[key])

     def test_iloc_slice_rows_sel(self):
         # Row slices with positive, negative and missing bounds and steps. Both sides
         # are passed through sort_index() before they are compared.
         pdf = pd.DataFrame({"A": [1, 2] * 5, "B": [3, 4] * 5, "C": [5, 6] * 5})
         psdf = ps.from_pandas(pdf)

         slice_args = [
             (None,),
             (0, 1),
             (1, 2),
             (-3, None),
             (None, -3),
             (None, 0),
             (None, None, 3),
             (3, 8, 2),
             (None, None, -2),
             (8, 3, -2),
             (8, None, -2),
             (None, 3, -2),
         ]
         for args in slice_args:
             rows_sel = slice(*args)
             with self.subTest(rows_sel=rows_sel):
                 # Whole frame, one column, and an expression on that column.
                 for ps_obj, pd_obj in (
                     (psdf, pdf),
                     (psdf.A, pdf.A),
                     (psdf.A + 1, pdf.A + 1),
                 ):
                     self.assert_eq(
                         ps_obj.iloc[rows_sel].sort_index(),
                         pd_obj.iloc[rows_sel].sort_index(),
                     )

     def test_iloc_iterable_rows_sel(self):
         # Lists and NumPy arrays (empty, positive and negative positions) as row selectors.
         pdf = pd.DataFrame({"A": [1, 2] * 5, "B": [3, 4] * 5, "C": [5, 6] * 5})
         psdf = ps.from_pandas(pdf)

         selectors = [
             [],
             np.array([0, 1]),
             [1, 2],
             np.array([-3]),
             [3],
             np.array([-2]),
             [8, 3, -5],
         ]
         for rows_sel in selectors:
             with self.subTest(rows_sel=rows_sel):
                 for ps_obj, pd_obj in (
                     (psdf, pdf),
                     (psdf.A, pdf.A),
                     (psdf.A + 1, pdf.A + 1),
                 ):
                     self.assert_eq(
                         ps_obj.iloc[rows_sel].sort_index(),
                         pd_obj.iloc[rows_sel].sort_index(),
                     )

             # Row selector combined with a column slice: all columns, then the first one.
             # Each combination keeps its own subTest so one failure does not hide the other.
             for cols in (slice(None), slice(None, 1)):
                 with self.subTest(rows_sel=rows_sel):
                     self.assert_eq(
                         psdf.iloc[rows_sel, cols].sort_index(),
                         pdf.iloc[rows_sel, cols].sort_index(),
                     )

     def test_frame_iloc_setitem(self):
         # Scalar writes through DataFrame.iloc, followed by rejected and accepted
         # non-scalar values.
         pdf = pd.DataFrame(
             [[1, 2], [4, 5], [7, 8]],
             index=["cobra", "viper", "sidewinder"],
             columns=["max_speed", "shield"],
         )
         psdf = ps.from_pandas(pdf)

         scalar_writes = [
             (([1, 2], [1, 0]), 10),
             ((0, 1), 50),
         ]
         for key, value in scalar_writes:
             pdf.iloc[key] = value
             psdf.iloc[key] = value
             self.assert_eq(psdf, pdf)

         # A Series value is rejected for a single cell and for a two-column target.
         for key, pattern in [
             ((0, 0), "setting an array element with a sequence."),
             ((slice(None), [1, 0]), "shape mismatch"),
         ]:
             with self.assertRaisesRegex(ValueError, pattern):
                 psdf.iloc[key] = -psdf.max_speed
         # A two-column frame is rejected as the value for one column.
         with self.assertRaisesRegex(ValueError, "Only a dataframe with one column can be assigned"):
             psdf.iloc[:, 0] = psdf

         # A one-column frame is accepted as the value for its own single column.
         pdf = pd.DataFrame(
             [[1], [4], [7]], index=["cobra", "viper", "sidewinder"], columns=["max_speed"]
         )
         psdf = ps.from_pandas(pdf)

         pdf.iloc[:, 0] = pdf
         psdf.iloc[:, 0] = psdf
         self.assert_eq(psdf, pdf)

     def test_series_iloc_setitem(self):
         # Writes go through a column Series; the parent frame and the sibling column
         # are compared after every write as well.
         pdf = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}, index=["cobra", "viper", "sidewinder"])
         psdf = ps.from_pandas(pdf)

         pser, psery = pdf.x, pdf.y
         psser, pssery = psdf.x, psdf.y

         # Indexer objects captured once and reused for the negated writes below.
         pandas_iloc = pser.iloc
         spark_iloc = psser.iloc

         # Series computed from the columns rather than taken from the frames.
         pser1 = pser + 1
         psser1 = psser + 1

         def check(ps_target, pd_target):
             # Written series, parent frames and the untouched "y" columns must all agree.
             self.assert_eq(ps_target, pd_target)
             self.assert_eq(psdf, pdf)
             self.assert_eq(pssery, psery)

         writes = [
             ([1, 2], 10),
             (1, 50),
             (slice(None), 10),
             (slice(None, 1), 20),
             (slice(1, None), 30),
         ]
         for key, value in writes:
             with self.subTest(key=key, value=value):
                 pser.iloc[key] = value
                 psser.iloc[key] = value
                 check(psser, pser)

                 pandas_iloc[key] = -value
                 spark_iloc[key] = -value
                 check(psser, pser)

                 pser1.iloc[key] = value
                 psser1.iloc[key] = value
                 check(psser1, pser1)

         # A Series value for a single position is rejected.
         with self.assertRaises(ValueError):
             psser.iloc[1] = -psser

         # Series created from an Index, plus a series computed from each.
         pser = pd.Index([1, 2, 3]).to_series()
         psser = ps.Index([1, 2, 3]).to_series()
         pser1 = pser + 1
         psser1 = psser + 1
         for pd_target, ps_target, value in ((pser, psser, 10), (pser1, psser1, 20)):
             pd_target.iloc[0] = value
             ps_target.iloc[0] = value
             self.assert_eq(ps_target, pd_target)

         # Series-valued assignment over a list of positions.
         pdf_ab = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
         psdf_ab = ps.from_pandas(pdf_ab)
         pser, psser = pdf_ab.a, psdf_ab.a

         pser.iloc[[0, 1, 2]] = -pdf_ab.b
         psser.iloc[[0, 1, 2]] = -psdf_ab.b
         self.assert_eq(psser, pser)
         self.assert_eq(psdf_ab, pdf_ab)

         # A one-column frame is rejected as the value for a single position.
         with self.assertRaisesRegex(ValueError, "setting an array element with a sequence."):
             psser.iloc[1] = psdf_ab[["b"]]

     def test_iloc_raises(self):
         # Each entry: expected exception type, message pattern, offending access.
         psdf = ps.from_pandas(pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}))

         invalid_accesses = [
             (
                 SparkPandasIndexingError,
                 "Only accepts pairs of candidates",
                 lambda: psdf.iloc[[0, 1], [0, 1], [1, 2]],
             ),
             (
                 SparkPandasIndexingError,
                 "Too many indexers",
                 lambda: psdf.A.iloc[[0, 1], [0, 1]],
             ),
             (
                 TypeError,
                 "cannot do slice indexing with these indexers",
                 lambda: psdf.iloc[:"b", :],
             ),
             (
                 TypeError,
                 "cannot do slice indexing with these indexers",
                 lambda: psdf.iloc[:, :"b"],
             ),
             (
                 TypeError,
                 "cannot perform reduce with flexible type",
                 lambda: psdf.iloc[:, ["A"]],
             ),
             (
                 ValueError,
                 "Location based indexing can only have",
                 lambda: psdf.iloc[:, "A"],
             ),
             (IndexError, "out of range", lambda: psdf.iloc[:, [5, 6]]),
         ]
         for error_type, pattern, access in invalid_accesses:
             with self.assertRaisesRegex(error_type, pattern):
                 access()


 class IndexingILocTests(IndexingILocMixin, PandasOnSparkTestCase, SQLTestUtils):
     """Concrete test case: the ``.iloc`` checks combined with the imported test bases."""


 if __name__ == "__main__":
     from pyspark.pandas.tests.indexes.test_indexing_iloc import *  # noqa: F401

     # xmlrunner is optional: when it cannot be imported the runner stays None and
     # unittest.main() receives testRunner=None.
     testRunner = None
     try:
         import xmlrunner

         testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
     except ImportError:
         pass
     unittest.main(testRunner=testRunner, verbosity=2)
	#
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	#
	import unittest

	import numpy as np
	import pandas as pd

	from pyspark import pandas as ps
	from pyspark.pandas.exceptions import SparkPandasIndexingError, SparkPandasNotImplementedError
	from pyspark.testing.pandasutils import PandasOnSparkTestCase
	from pyspark.testing.sqlutils import SQLTestUtils


	class IndexingILocMixin:
	    # Checks of ``.iloc`` reads, writes and error cases, comparing pandas objects with
	    # their pandas-on-Spark counterparts. The methods call ``assert_eq``, ``subTest``
	    # and the ``assertRaises*`` helpers on ``self``, so a test-case base has to supply them.

	    @property
	    def pdf(self):
	        # New pandas frame on every access; index label 9 appears three times.
	        return pd.DataFrame(
	            {"a": [1, 2, 3, 4, 5, 6, 7, 8, 9], "b": [4, 5, 6, 3, 2, 1, 0, 0, 0]},
	            index=[0, 1, 3, 5, 6, 8, 9, 9, 9],
	        )

	    @property
	    def pdf2(self):
	        # Same values and index as ``pdf`` with integer column labels 0 and 1.
	        return pd.DataFrame(
	            {0: [1, 2, 3, 4, 5, 6, 7, 8, 9], 1: [4, 5, 6, 3, 2, 1, 0, 0, 0]},
	            index=[0, 1, 3, 5, 6, 8, 9, 9, 9],
	        )

	    @property
	    def psdf(self):
	        # ``pdf`` converted with ps.from_pandas.
	        return ps.from_pandas(self.pdf)

	    @property
	    def psdf2(self):
	        # ``pdf2`` converted with ps.from_pandas.
	        return ps.from_pandas(self.pdf2)

	    def test_iloc(self):
	        # One scalar lookup, then each column selector with the row slices ":", ":1", ":-1".
	        pdf = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]})
	        psdf = ps.from_pandas(pdf)

	        self.assert_eq(psdf.iloc[0, 0], pdf.iloc[0, 0])
	        for indexer in [0, [0], [0, 1], [1, 0], [False, True, True], slice(0, 1)]:
	            self.assert_eq(psdf.iloc[:, indexer], pdf.iloc[:, indexer])
	            self.assert_eq(psdf.iloc[:1, indexer], pdf.iloc[:1, indexer])
	            self.assert_eq(psdf.iloc[:-1, indexer], pdf.iloc[:-1, indexer])
	            # self.assert_eq(psdf.iloc[psdf.index == 2, indexer], pdf.iloc[pdf.index == 2, indexer])

	        # A string used as the row key has to be rejected.
	        self.assertRaisesRegex(
	            SparkPandasNotImplementedError,
	            ".iloc requires numeric slice, conditional boolean",
	            lambda: ps.range(10).iloc["a", :],
	        )

	    def test_iloc_multiindex_columns(self):
	        # Same selector sweep on a frame whose column labels come from two parallel arrays.
	        arrays = [np.array(["bar", "bar", "baz", "baz"]), np.array(["one", "two", "one", "two"])]

	        pdf = pd.DataFrame(np.random.randn(3, 4), index=["A", "B", "C"], columns=arrays)
	        psdf = ps.from_pandas(pdf)

	        for indexer in [0, [0], [0, 1], [1, 0], [False, True, True, True], slice(0, 1)]:
	            self.assert_eq(psdf.iloc[:, indexer], pdf.iloc[:, indexer])
	            self.assert_eq(psdf.iloc[:1, indexer], pdf.iloc[:1, indexer])
	            self.assert_eq(psdf.iloc[:-1, indexer], pdf.iloc[:-1, indexer])
	            # self.assert_eq(psdf.iloc[psdf.index == "B", indexer],
	            #                pdf.iloc[pdf.index == "B", indexer])

	    def test_iloc_series(self):
	        # Positional reads on a Series and on an expression derived from it.
	        pser = pd.Series([1, 2, 3])
	        psser = ps.from_pandas(pser)

	        self.assert_eq(psser.iloc[0], pser.iloc[0])
	        self.assert_eq(psser.iloc[:], pser.iloc[:])
	        self.assert_eq(psser.iloc[:1], pser.iloc[:1])
	        self.assert_eq(psser.iloc[:-1], pser.iloc[:-1])

	        self.assert_eq((psser + 1).iloc[0], (pser + 1).iloc[0])
	        self.assert_eq((psser + 1).iloc[:], (pser + 1).iloc[:])
	        self.assert_eq((psser + 1).iloc[:1], (pser + 1).iloc[:1])
	        self.assert_eq((psser + 1).iloc[:-1], (pser + 1).iloc[:-1])

	    def test_iloc_slice_rows_sel(self):
	        # Row slices with positive, negative and missing bounds and steps. Both sides
	        # are passed through sort_index() before they are compared.
	        pdf = pd.DataFrame({"A": [1, 2] * 5, "B": [3, 4] * 5, "C": [5, 6] * 5})
	        psdf = ps.from_pandas(pdf)

	        for rows_sel in [
	            slice(None),
	            slice(0, 1),
	            slice(1, 2),
	            slice(-3, None),
	            slice(None, -3),
	            slice(None, 0),
	            slice(None, None, 3),
	            slice(3, 8, 2),
	            slice(None, None, -2),
	            slice(8, 3, -2),
	            slice(8, None, -2),
	            slice(None, 3, -2),
	        ]:
	            with self.subTest(rows_sel=rows_sel):
	                self.assert_eq(psdf.iloc[rows_sel].sort_index(), pdf.iloc[rows_sel].sort_index())
	                self.assert_eq(
	                    psdf.A.iloc[rows_sel].sort_index(), pdf.A.iloc[rows_sel].sort_index()
	                )
	                self.assert_eq(
	                    (psdf.A + 1).iloc[rows_sel].sort_index(),
	                    (pdf.A + 1).iloc[rows_sel].sort_index(),
	                )

	    def test_iloc_iterable_rows_sel(self):
	        # Lists and NumPy arrays (empty, positive and negative positions) as row selectors.
	        pdf = pd.DataFrame({"A": [1, 2] * 5, "B": [3, 4] * 5, "C": [5, 6] * 5})
	        psdf = ps.from_pandas(pdf)

	        for rows_sel in [
	            [],
	            np.array([0, 1]),
	            [1, 2],
	            np.array([-3]),
	            [3],
	            np.array([-2]),
	            [8, 3, -5],
	        ]:
	            with self.subTest(rows_sel=rows_sel):
	                self.assert_eq(psdf.iloc[rows_sel].sort_index(), pdf.iloc[rows_sel].sort_index())
	                self.assert_eq(
	                    psdf.A.iloc[rows_sel].sort_index(), pdf.A.iloc[rows_sel].sort_index()
	                )
	                self.assert_eq(
	                    (psdf.A + 1).iloc[rows_sel].sort_index(),
	                    (pdf.A + 1).iloc[rows_sel].sort_index(),
	                )

	            # Row selector combined with all columns.
	            with self.subTest(rows_sel=rows_sel):
	                self.assert_eq(
	                    psdf.iloc[rows_sel, :].sort_index(), pdf.iloc[rows_sel, :].sort_index()
	                )

	            # Row selector combined with the first column only.
	            with self.subTest(rows_sel=rows_sel):
	                self.assert_eq(
	                    psdf.iloc[rows_sel, :1].sort_index(), pdf.iloc[rows_sel, :1].sort_index()
	                )

	    def test_frame_iloc_setitem(self):
	        # Scalar writes through DataFrame.iloc, then rejected and accepted non-scalar values.
	        pdf = pd.DataFrame(
	            [[1, 2], [4, 5], [7, 8]],
	            index=["cobra", "viper", "sidewinder"],
	            columns=["max_speed", "shield"],
	        )
	        psdf = ps.from_pandas(pdf)

	        pdf.iloc[[1, 2], [1, 0]] = 10
	        psdf.iloc[[1, 2], [1, 0]] = 10
	        self.assert_eq(psdf, pdf)

	        pdf.iloc[0, 1] = 50
	        psdf.iloc[0, 1] = 50
	        self.assert_eq(psdf, pdf)

	        with self.assertRaisesRegex(ValueError, "setting an array element with a sequence."):
	            psdf.iloc[0, 0] = -psdf.max_speed
	        with self.assertRaisesRegex(ValueError, "shape mismatch"):
	            psdf.iloc[:, [1, 0]] = -psdf.max_speed
	        with self.assertRaisesRegex(ValueError, "Only a dataframe with one column can be assigned"):
	            psdf.iloc[:, 0] = psdf

	        # A one-column frame is accepted as the value for its own single column.
	        pdf = pd.DataFrame(
	            [[1], [4], [7]], index=["cobra", "viper", "sidewinder"], columns=["max_speed"]
	        )
	        psdf = ps.from_pandas(pdf)

	        pdf.iloc[:, 0] = pdf
	        psdf.iloc[:, 0] = psdf
	        self.assert_eq(psdf, pdf)

	    def test_series_iloc_setitem(self):
	        # Writes go through a column Series; the parent frame and the sibling column
	        # are compared after every write as well.
	        pdf = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}, index=["cobra", "viper", "sidewinder"])
	        psdf = ps.from_pandas(pdf)

	        pser = pdf.x
	        psery = pdf.y
	        psser = psdf.x
	        pssery = psdf.y

	        # Indexer objects captured once and reused for the negated writes below.
	        piloc = pser.iloc
	        kiloc = psser.iloc

	        # Series computed from the columns rather than taken from the frames.
	        pser1 = pser + 1
	        psser1 = psser + 1

	        for key, value in [
	            ([1, 2], 10),
	            (1, 50),
	            (slice(None), 10),
	            (slice(None, 1), 20),
	            (slice(1, None), 30),
	        ]:
	            with self.subTest(key=key, value=value):
	                pser.iloc[key] = value
	                psser.iloc[key] = value
	                self.assert_eq(psser, pser)
	                self.assert_eq(psdf, pdf)
	                self.assert_eq(pssery, psery)

	                piloc[key] = -value
	                kiloc[key] = -value
	                self.assert_eq(psser, pser)
	                self.assert_eq(psdf, pdf)
	                self.assert_eq(pssery, psery)

	                pser1.iloc[key] = value
	                psser1.iloc[key] = value
	                self.assert_eq(psser1, pser1)
	                self.assert_eq(psdf, pdf)
	                self.assert_eq(pssery, psery)

	        # A Series value for a single position is rejected.
	        with self.assertRaises(ValueError):
	            psser.iloc[1] = -psser

	        # Series created from an Index, plus a series computed from each.
	        pser = pd.Index([1, 2, 3]).to_series()
	        psser = ps.Index([1, 2, 3]).to_series()

	        pser1 = pser + 1
	        psser1 = psser + 1

	        pser.iloc[0] = 10
	        psser.iloc[0] = 10
	        self.assert_eq(psser, pser)

	        pser1.iloc[0] = 20
	        psser1.iloc[0] = 20
	        self.assert_eq(psser1, pser1)

	        # Series-valued assignment over a list of positions.
	        pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
	        psdf = ps.from_pandas(pdf)

	        pser = pdf.a
	        psser = psdf.a

	        pser.iloc[[0, 1, 2]] = -pdf.b
	        psser.iloc[[0, 1, 2]] = -psdf.b
	        self.assert_eq(psser, pser)
	        self.assert_eq(psdf, pdf)

	        with self.assertRaisesRegex(ValueError, "setting an array element with a sequence."):
	            psser.iloc[1] = psdf[["b"]]

	    def test_iloc_raises(self):
	        # Invalid indexers and the exception type / message each one must produce.
	        pdf = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]})
	        psdf = ps.from_pandas(pdf)

	        with self.assertRaisesRegex(SparkPandasIndexingError, "Only accepts pairs of candidates"):
	            psdf.iloc[[0, 1], [0, 1], [1, 2]]

	        with self.assertRaisesRegex(SparkPandasIndexingError, "Too many indexers"):
	            psdf.A.iloc[[0, 1], [0, 1]]

	        with self.assertRaisesRegex(TypeError, "cannot do slice indexing with these indexers"):
	            psdf.iloc[:"b", :]

	        with self.assertRaisesRegex(TypeError, "cannot do slice indexing with these indexers"):
	            psdf.iloc[:, :"b"]

	        with self.assertRaisesRegex(TypeError, "cannot perform reduce with flexible type"):
	            psdf.iloc[:, ["A"]]

	        with self.assertRaisesRegex(ValueError, "Location based indexing can only have"):
	            psdf.iloc[:, "A"]

	        with self.assertRaisesRegex(IndexError, "out of range"):
	            psdf.iloc[:, [5, 6]]


	class IndexingILocTests(
	    IndexingILocMixin,
	    PandasOnSparkTestCase,
	    SQLTestUtils,
	):
	    # Concrete test case: the ``.iloc`` checks combined with the imported test bases.
	    pass


	if __name__ == "__main__":
	    from pyspark.pandas.tests.indexes.test_indexing_iloc import *  # noqa: F401

	    # xmlrunner is optional: when it cannot be imported, unittest.main() receives
	    # testRunner=None.
	    try:
	        import xmlrunner

	        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
	    except ImportError:
	        testRunner = None
	    unittest.main(testRunner=testRunner, verbosity=2)