blob: 2e89b866986f45543ca8edfcfdfb58d939d07127 [file] [log] [blame]
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Tests for side input utilities."""
# pytype: skip-file
import logging
import time
import unittest
import mock
from apache_beam.coders import observable
from apache_beam.runners.worker import sideinputs
def strip_windows(iterator):
    """Return the ``value`` attribute of every element of ``iterator``.

    The elements are consumed in iteration order and the extracted values
    are returned as a new list.
    """
    values = []
    for windowed_value in iterator:
        values.append(windowed_value.value)
    return values
class FakeSource:
    """Minimal in-memory source used by the tests in this module.

    Stores an iterable of items and creates ``FakeSourceReader`` objects
    over it on demand.
    """

    def __init__(self, items, notify_observers=False):
        # Both values are handed unchanged to every reader that
        # ``reader()`` creates.
        self._should_notify_observers = notify_observers
        self.items = items

    def reader(self):
        """Return a new ``FakeSourceReader`` over this source's items."""
        return FakeSourceReader(
            self.items, notify_observers=self._should_notify_observers)
class FakeSourceReader(observable.ObservableMixin):
    """Reader over an in-memory item collection, usable in a ``with`` block.

    Records whether the context manager was entered and exited, and can
    optionally call ``notify_observers`` at the start of each iteration.
    """

    def __init__(self, items, notify_observers=False):
        super().__init__()
        self._should_notify_observers = notify_observers
        self.items = items
        # Flipped by __enter__ / __exit__ respectively.
        self.entered = False
        self.exited = False

    @property
    def returns_windowed_values(self):
        """Always ``False`` for this fake reader."""
        return False

    def __enter__(self):
        self.entered = True
        return self

    def __exit__(self, exception_type, exception_value, traceback):
        # Falls through returning None, so an exception raised inside the
        # ``with`` block is not suppressed.
        self.exited = True

    def __iter__(self):
        """Yield the stored items one by one.

        When notification is enabled, observers are first notified with the
        number of items, flagged as a record size.
        """
        if self._should_notify_observers:
            self.notify_observers(len(self.items), is_record_size=True)
        for element in self.items:
            yield element
class PrefetchingSourceIteratorTest(unittest.TestCase):
    """Tests for ``sideinputs.get_iterator_fn_for_sources``.

    All checks use ``unittest`` assertion methods rather than bare
    ``assert`` statements, so they still run under ``python -O`` and report
    a useful diff on failure.
    """

    def test_single_source_iterator_fn(self):
        """A single source is read back completely and in order."""
        sources = [
            FakeSource([0, 1, 2, 3, 4, 5]),
        ]
        iterator_fn = sideinputs.get_iterator_fn_for_sources(
            sources, max_reader_threads=2)
        self.assertEqual(strip_windows(iterator_fn()), list(range(6)))

    def test_bytes_read_are_reported(self):
        """The size notified by the reader is forwarded to the read counter."""
        mock_read_counter = mock.MagicMock()
        source_records = ['a', 'b', 'c', 'd']
        sources = [
            FakeSource(source_records, notify_observers=True),
        ]
        iterator_fn = sideinputs.get_iterator_fn_for_sources(
            sources, max_reader_threads=3, read_counter=mock_read_counter)
        self.assertEqual(strip_windows(iterator_fn()), source_records)
        # FakeSourceReader notifies observers with len(items), i.e. 4 here.
        mock_read_counter.add_bytes_read.assert_called_with(4)

    def test_multiple_sources_iterator_fn(self):
        """Several sources read with several threads yield every element."""
        sources = [
            FakeSource([0]),
            FakeSource([1, 2, 3, 4, 5]),
            FakeSource([]),
            FakeSource([6, 7, 8, 9, 10]),
        ]
        iterator_fn = sideinputs.get_iterator_fn_for_sources(
            sources, max_reader_threads=3)
        # Only the set of values is checked, hence the sort before comparing.
        self.assertEqual(
            sorted(strip_windows(iterator_fn())), list(range(11)))

    def test_multiple_sources_single_reader_iterator_fn(self):
        """With one reader thread the sources are read back in order."""
        sources = [
            FakeSource([0]),
            FakeSource([1, 2, 3, 4, 5]),
            FakeSource([]),
            FakeSource([6, 7, 8, 9, 10]),
        ]
        iterator_fn = sideinputs.get_iterator_fn_for_sources(
            sources, max_reader_threads=1)
        self.assertEqual(strip_windows(iterator_fn()), list(range(11)))

    def test_source_iterator_single_source_exception(self):
        """An exception raised while reading a source reaches the consumer."""
        class MyException(Exception):
            pass

        def exception_generator():
            yield 0
            raise MyException('I am an exception!')

        sources = [
            FakeSource(exception_generator()),
        ]
        iterator_fn = sideinputs.get_iterator_fn_for_sources(sources)
        seen = set()
        with self.assertRaises(MyException):
            for value in iterator_fn():
                seen.add(value.value)
        # The value yielded before the failure must still have been seen.
        self.assertEqual(sorted(seen), [0])

    def test_source_iterator_fn_exception(self):
        """One failing source aborts iteration despite never-ending sources."""
        class MyException(Exception):
            pass

        def exception_generator():
            yield 0
            # NOTE(review): presumably this delay gives the other readers
            # time to emit their first value before the failure; confirm.
            time.sleep(0.1)
            raise MyException('I am an exception!')

        def perpetual_generator(value):
            while True:
                yield value
                time.sleep(0.1)

        sources = [
            FakeSource(perpetual_generator(1)),
            FakeSource(perpetual_generator(2)),
            FakeSource(perpetual_generator(3)),
            FakeSource(perpetual_generator(4)),
            FakeSource(exception_generator()),
        ]
        iterator_fn = sideinputs.get_iterator_fn_for_sources(sources)
        seen = set()
        with self.assertRaises(MyException):
            for value in iterator_fn():
                seen.add(value.value)
        # Every source is expected to have contributed at least one value.
        self.assertEqual(sorted(seen), list(range(5)))
class EmulatedCollectionsTest(unittest.TestCase):
    """Tests for ``sideinputs.EmulatedIterable``."""

    def test_emulated_iterable(self):
        """The iterable can be traversed repeatedly with the same contents."""
        def _iterable_fn():
            for i in range(10):
                yield i

        iterable = sideinputs.EmulatedIterable(_iterable_fn)
        # Check that multiple iterations are supported.
        for _ in range(5):
            element_count = 0
            for i, j in enumerate(iterable):
                self.assertEqual(i, j)
                element_count += 1
            # Guard against a vacuous pass: an iteration that yields nothing
            # would skip every assertion in the loop above.
            self.assertEqual(element_count, 10)

    def test_large_iterable_values(self):
        """Large values are produced one at a time on every traversal."""
        # Here, we create a large collection that would be too big for
        # memory-constrained test environments, but should be under the
        # memory limit if materialized one at a time.
        repeat = 200 * 1024 * 1024

        def _iterable_fn():
            for i in range(10):
                yield ('%d' % i) * repeat

        iterable = sideinputs.EmulatedIterable(_iterable_fn)
        # Check that multiple iterations are supported.
        for _ in range(3):
            element_count = 0
            for i, j in enumerate(iterable):
                self.assertEqual(('%d' % i) * repeat, j)
                element_count += 1
            # Counted incrementally so the values are never all held at once.
            self.assertEqual(element_count, 10)
if __name__ == '__main__':
    # Set the root logger to INFO before handing control to the unittest
    # runner.
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)
    unittest.main()