#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import pandas as pd

from apache_beam.dataframe.partitionings import Arbitrary
from apache_beam.dataframe.partitionings import Index
from apache_beam.dataframe.partitionings import Singleton


class PartitioningsTest(unittest.TestCase):
  # pylint: disable=range-builtin-not-iterating
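  # A 24-row fixture with a three-level MultiIndex (shape, color, size) and a
  # unique 'value' column, shared by the partitioning tests below.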
  multi_index_df = pd.DataFrame({
      'shape': ['dodecahedron', 'icosahedron'] * 12,
      'color': ['red', 'yellow', 'blue'] * 8,
      'size': range(24),
      'value': range(24)
  }).set_index(['shape', 'color', 'size'])

  def test_index_is_subpartition(self):
    # Each entry in this list is a sub-partitioning of the entry before it,
    # but not vice versa; adjacent pairs are checked in both directions.
    ordered_list = [
        Singleton(), Index([3]), Index([1, 3]), Index(), Arbitrary()
    ]
    for loose, strict in zip(ordered_list[:-1], ordered_list[1:]):
      self.assertTrue(strict.is_subpartitioning_of(loose), (strict, loose))
      self.assertFalse(loose.is_subpartitioning_of(strict), (loose, strict))

    # Incomparable: neither is a sub-partitioning of the other.
    self.assertFalse(Index([1, 2]).is_subpartitioning_of(Index([1, 3])))
    self.assertFalse(Index([1, 3]).is_subpartitioning_of(Index([1, 2])))
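
  # Checks that partitioning multi_index_df into 1000 parts yields exactly
  # that many parts, that the number of non-empty parts falls within
  # [min_non_empty, max_non_empty], and that no rows are lost or duplicated.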
  def _check_partition(self, partitioning, min_non_empty, max_non_empty=None):
    num_partitions = 1000
    if max_non_empty is None:
      max_non_empty = min_non_empty
    parts = list(
        partitioning.partition_fn(self.multi_index_df, num_partitions))
    self.assertEqual(num_partitions, len(parts))
    self.assertGreaterEqual(len([p for _, p in parts if len(p)]), min_non_empty)
    self.assertLessEqual(len([p for _, p in parts if len(p)]), max_non_empty)
    self.assertEqual(
        sorted(self.multi_index_df.value),
        sorted(sum((list(p.value) for _, p in parts), [])))
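
  # The expected number of non-empty partitions tracks the number of distinct
  # key combinations in the chosen index levels; a range is given where the
  # keys are numerous enough that distinct keys may land in the same
  # partition.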
  def test_index_partition(self):
    self._check_partition(Index([0]), 2)
    self._check_partition(Index([0, 1]), 6)
    self._check_partition(Index([1]), 3)
    self._check_partition(Index([2]), 7, 24)
    self._check_partition(Index([0, 2]), 7, 24)
    self._check_partition(Index(), 7, 24)
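
  # Arbitrary imposes no constraint, so it is a sub-partitioning of every
  # other partitioning checked here.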
  def test_nothing_subpartition(self):
    for p in [Index([1]), Index([1, 2]), Index(), Singleton()]:
      self.assertTrue(Arbitrary().is_subpartitioning_of(p), p)
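
  # Singleton is only a sub-partitioning of itself.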
  def test_singleton_subpartition(self):
    self.assertTrue(Singleton().is_subpartitioning_of(Singleton()))
    for p in [Arbitrary(), Index([1]), Index([1, 2]), Index()]:
      self.assertFalse(Singleton().is_subpartitioning_of(p), p)
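
  # Singleton places all of the data in a single partition, no matter how
  # many partitions are requested.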
  def test_singleton_partition(self):
    parts = list(Singleton().partition_fn(pd.Series(range(10)), 1000))
    self.assertEqual(1, len(parts))


if __name__ == '__main__':
  unittest.main()