#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Unit tests for the transform.util classes."""
# pytype: skip-file
# pylint: disable=too-many-function-args
import collections
import hashlib
import hmac
import importlib
import logging
import math
import random
import re
import time
import unittest
import warnings
from collections.abc import Mapping
from datetime import datetime
import mock
import pytest
import pytz
from cryptography.fernet import Fernet
from cryptography.fernet import InvalidToken
from parameterized import param
from parameterized import parameterized
import apache_beam as beam
from apache_beam import GroupByKey
from apache_beam import Map
from apache_beam import WindowInto
from apache_beam.coders import coders
from apache_beam.metrics import MetricsFilter
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.options.pipeline_options import TypeOptions
from apache_beam.portability import common_urns
from apache_beam.portability.api import beam_runner_api_pb2
from apache_beam.pvalue import AsList
from apache_beam.pvalue import AsSingleton
from apache_beam.runners import pipeline_context
from apache_beam.testing.synthetic_pipeline import SyntheticSource
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.test_stream import TestStream
from apache_beam.testing.util import SortLists
from apache_beam.testing.util import TestWindowedValue
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import contains_in_any_order
from apache_beam.testing.util import equal_to
from apache_beam.transforms import trigger
from apache_beam.transforms import util
from apache_beam.transforms import window
from apache_beam.transforms.core import FlatMapTuple
from apache_beam.transforms.trigger import AfterCount
from apache_beam.transforms.trigger import Repeatedly
from apache_beam.transforms.util import GcpHsmGeneratedSecret
from apache_beam.transforms.util import GcpSecret
from apache_beam.transforms.util import Secret
from apache_beam.transforms.window import FixedWindows
from apache_beam.transforms.window import GlobalWindow
from apache_beam.transforms.window import GlobalWindows
from apache_beam.transforms.window import IntervalWindow
from apache_beam.transforms.window import Sessions
from apache_beam.transforms.window import SlidingWindows
from apache_beam.transforms.window import TimestampedValue
from apache_beam.typehints import typehints
from apache_beam.typehints.sharded_key_type import ShardedKeyType
from apache_beam.utils import proto_utils
from apache_beam.utils import timestamp
from apache_beam.utils.timestamp import MAX_TIMESTAMP
from apache_beam.utils.timestamp import MIN_TIMESTAMP
from apache_beam.utils.windowed_value import PANE_INFO_UNKNOWN
from apache_beam.utils.windowed_value import PaneInfo
from apache_beam.utils.windowed_value import PaneInfoTiming
from apache_beam.utils.windowed_value import WindowedValue
try:
from google.cloud import secretmanager
except ImportError:
secretmanager = None # type: ignore[assignment]
warnings.filterwarnings(
    'ignore',
    category=FutureWarning,
    module='apache_beam.transforms.util_test')
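# Test helpers for exercising custom-coder code paths: _Unpicklable refuses to
# pickle, so any transform that round-trips it must use the registered
# _UnpicklableCoder rather than the default pickling fallback.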
class _Unpicklable(object):
def __init__(self, value):
self.value = value
def __getstate__(self):
raise NotImplementedError()
def __setstate__(self, state):
raise NotImplementedError()
class _UnpicklableCoder(beam.coders.Coder):
def encode(self, value):
return str(value.value).encode()
def decode(self, encoded):
return _Unpicklable(int(encoded.decode()))
def to_type_hint(self):
return _Unpicklable
def is_deterministic(self):
return True
class CoGroupByKeyTest(unittest.TestCase):
def test_co_group_by_key_on_tuple(self):
with TestPipeline() as pipeline:
pcoll_1 = pipeline | 'Start 1' >> beam.Create([('a', 1), ('a', 2),
('b', 3), ('c', 4)])
pcoll_2 = pipeline | 'Start 2' >> beam.Create([('a', 5), ('a', 6),
('c', 7), ('c', 8)])
result = (pcoll_1, pcoll_2) | beam.CoGroupByKey() | SortLists
assert_that(
result,
equal_to([('a', ([1, 2], [5, 6])), ('b', ([3], [])),
('c', ([4], [7, 8]))]))
def test_co_group_by_key_on_iterable(self):
with TestPipeline() as pipeline:
pcoll_1 = pipeline | 'Start 1' >> beam.Create([('a', 1), ('a', 2),
('b', 3), ('c', 4)])
pcoll_2 = pipeline | 'Start 2' >> beam.Create([('a', 5), ('a', 6),
('c', 7), ('c', 8)])
result = iter([pcoll_1, pcoll_2]) | beam.CoGroupByKey() | SortLists
assert_that(
result,
equal_to([('a', ([1, 2], [5, 6])), ('b', ([3], [])),
('c', ([4], [7, 8]))]))
def test_co_group_by_key_on_list(self):
with TestPipeline() as pipeline:
pcoll_1 = pipeline | 'Start 1' >> beam.Create([('a', 1), ('a', 2),
('b', 3), ('c', 4)])
pcoll_2 = pipeline | 'Start 2' >> beam.Create([('a', 5), ('a', 6),
('c', 7), ('c', 8)])
result = [pcoll_1, pcoll_2] | beam.CoGroupByKey() | SortLists
assert_that(
result,
equal_to([('a', ([1, 2], [5, 6])), ('b', ([3], [])),
('c', ([4], [7, 8]))]))
def test_co_group_by_key_on_dict(self):
with TestPipeline() as pipeline:
pcoll_1 = pipeline | 'Start 1' >> beam.Create([('a', 1), ('a', 2),
('b', 3), ('c', 4)])
pcoll_2 = pipeline | 'Start 2' >> beam.Create([('a', 5), ('a', 6),
('c', 7), ('c', 8)])
result = {'X': pcoll_1, 'Y': pcoll_2} | beam.CoGroupByKey() | SortLists
assert_that(
result,
          equal_to([
              ('a', {'X': [1, 2], 'Y': [5, 6]}),
              ('b', {'X': [3], 'Y': []}),
              ('c', {'X': [4], 'Y': [7, 8]}),
          ]))
def test_co_group_by_key_on_dict_with_tuple_keys(self):
with TestPipeline() as pipeline:
key = ('a', ('b', 'c'))
pcoll_1 = pipeline | 'Start 1' >> beam.Create([(key, 1)])
pcoll_2 = pipeline | 'Start 2' >> beam.Create([(key, 2)])
result = {'X': pcoll_1, 'Y': pcoll_2} | beam.CoGroupByKey() | SortLists
assert_that(result, equal_to([(key, {'X': [1], 'Y': [2]})]))
def test_co_group_by_key_on_empty(self):
with TestPipeline() as pipeline:
assert_that(
tuple() | 'EmptyTuple' >> beam.CoGroupByKey(pipeline=pipeline),
equal_to([]),
label='AssertEmptyTuple')
assert_that([] | 'EmptyList' >> beam.CoGroupByKey(pipeline=pipeline),
equal_to([]),
label='AssertEmptyList')
assert_that(
iter([]) | 'EmptyIterable' >> beam.CoGroupByKey(pipeline=pipeline),
equal_to([]),
label='AssertEmptyIterable')
assert_that({} | 'EmptyDict' >> beam.CoGroupByKey(pipeline=pipeline),
equal_to([]),
label='AssertEmptyDict')
def test_co_group_by_key_on_one(self):
with TestPipeline() as pipeline:
pcoll = pipeline | beam.Create([('a', 1), ('b', 2)])
expected = [('a', ([1], )), ('b', ([2], ))]
assert_that((pcoll, ) | 'OneTuple' >> beam.CoGroupByKey(),
equal_to(expected),
label='AssertOneTuple')
assert_that([pcoll] | 'OneList' >> beam.CoGroupByKey(),
equal_to(expected),
label='AssertOneList')
assert_that(
iter([pcoll]) | 'OneIterable' >> beam.CoGroupByKey(),
equal_to(expected),
label='AssertOneIterable')
assert_that({'tag': pcoll}
| 'OneDict' >> beam.CoGroupByKey()
| beam.MapTuple(lambda k, v: (k, (v['tag'], ))),
equal_to(expected),
label='AssertOneDict')
def test_co_group_by_key_on_unpickled(self):
beam.coders.registry.register_coder(_Unpicklable, _UnpicklableCoder)
values = [_Unpicklable(i) for i in range(5)]
with TestPipeline() as pipeline:
xs = pipeline | beam.Create(values) | beam.WithKeys(lambda x: x)
      pcoll = (
          {'x': xs}
          | beam.CoGroupByKey()
          | beam.FlatMapTuple(
              lambda k, tagged: (k.value, tagged['x'][0].value * 2)))
expected = [0, 0, 1, 2, 2, 4, 3, 6, 4, 8]
assert_that(pcoll, equal_to(expected))
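# A beam.Secret test double that returns a fixed Fernet key without any
# network calls; should_throw=True simulates a retrieval failure.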
class FakeSecret(beam.Secret):
def __init__(self, version_name=None, should_throw=False):
self._secret = b'aKwI2PmqYFt2p5tNKCyBS5qYmHhHsGZcyZrnZQiQ-uE='
self._should_throw = should_throw
def get_secret_bytes(self) -> bytes:
if self._should_throw:
raise RuntimeError('Exception retrieving secret')
return self._secret
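# Wraps the decryption DoFn so tests can observe what actually flowed through
# the GroupByKey: every key and value seen here must be a valid Fernet token,
# otherwise the GBK operated on unencrypted data and we raise.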
class MockNoOpDecrypt(beam.transforms.util._DecryptMessage):
def __init__(self, hmac_key_secret, key_coder, value_coder):
hmac_key = hmac_key_secret.get_secret_bytes()
self.fernet_tester = Fernet(hmac_key)
self.known_hmacs = []
for key in ['a', 'b', 'c']:
self.known_hmacs.append(
hmac.new(hmac_key, key_coder.encode(key), hashlib.sha256).digest())
super().__init__(hmac_key_secret, key_coder, value_coder)
def process(self, element):
final_elements = list(super().process(element))
    # Check whether we're looking at the actual test elements being
    # encoded/decoded. assert_that itself also performs a GroupByKey, which
    # uses None as the key type.
final_element_keys = [e for e in final_elements if e[0] in ['a', 'b', 'c']]
if len(final_element_keys) == 0:
return final_elements
hmac_key, actual_elements = element
if hmac_key not in self.known_hmacs:
raise ValueError(f'GBK produced unencrypted value {hmac_key}')
for e in actual_elements:
try:
self.fernet_tester.decrypt(e[0], None)
except InvalidToken:
raise ValueError(f'GBK produced unencrypted value {e[0]}')
try:
self.fernet_tester.decrypt(e[1], None)
except InvalidToken:
raise ValueError(f'GBK produced unencrypted value {e[1]}')
return final_elements
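# Secret pipeline options are strings of the form
# 'type:<SecretType>;<param>:<value>', e.g.
# 'type:GcpSecret;version_name:my_secret/versions/latest'; the type match is
# case-insensitive, as the tests below verify.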
class SecretTest(unittest.TestCase):
@parameterized.expand([
param(
secret_string='type:GcpSecret;version_name:my_secret/versions/latest',
secret=GcpSecret('my_secret/versions/latest')),
param(
secret_string='type:GcpSecret;version_name:foo',
secret=GcpSecret('foo')),
param(
secret_string='type:gcpsecreT;version_name:my_secret/versions/latest',
secret=GcpSecret('my_secret/versions/latest')),
])
def test_secret_manager_parses_correctly(self, secret_string, secret):
self.assertEqual(secret, Secret.parse_secret_option(secret_string))
@parameterized.expand([
param(
secret_string='version_name:foo',
exception_str='must contain a valid type parameter'),
param(
secret_string='type:gcpsecreT',
exception_str='missing 1 required positional argument'),
param(
secret_string='type:gcpsecreT;version_name:foo;extra:val',
exception_str='Invalid secret parameter extra'),
])
def test_secret_manager_throws_on_invalid(self, secret_string, exception_str):
with self.assertRaisesRegex(Exception, exception_str):
Secret.parse_secret_option(secret_string)
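# When GCP dependencies are available, these tests provision (or reuse) a real
# Secret Manager secret in the apache-beam-testing project; the GCP-dependent
# tests are skipped otherwise.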
class GroupByEncryptedKeyTest(unittest.TestCase):
@classmethod
def setUpClass(cls):
if secretmanager is not None:
cls.project_id = 'apache-beam-testing'
cls.secret_id = 'gbek_util_secret_tests'
cls.client = secretmanager.SecretManagerServiceClient()
cls.project_path = f'projects/{cls.project_id}'
cls.secret_path = f'{cls.project_path}/secrets/{cls.secret_id}'
try:
cls.client.get_secret(request={'name': cls.secret_path})
except Exception:
cls.client.create_secret(
request={
'parent': cls.project_path,
'secret_id': cls.secret_id,
'secret': {
'replication': {
'automatic': {}
}
}
})
cls.client.add_secret_version(
request={
'parent': cls.secret_path,
'payload': {
'data': Secret.generate_secret_bytes()
}
})
version_name = f'{cls.secret_path}/versions/latest'
cls.gcp_secret = GcpSecret(version_name)
cls.secret_option = f'type:GcpSecret;version_name:{version_name}'
def test_gbek_fake_secret_manager_roundtrips(self):
    fake_secret = FakeSecret()
with TestPipeline() as pipeline:
pcoll_1 = pipeline | 'Start 1' >> beam.Create([('a', 1), ('a', 2),
('b', 3), ('c', 4)])
      result = pcoll_1 | beam.GroupByEncryptedKey(fake_secret)
      assert_that(
          result, equal_to([('a', [1, 2]), ('b', [3]), ('c', [4])]))
@unittest.skipIf(secretmanager is None, 'GCP dependencies are not installed')
  def test_gbk_with_gbek_option_gcp_secret_manager_roundtrips(self):
options = PipelineOptions()
options.view_as(SetupOptions).gbek = self.secret_option
with beam.Pipeline(options=options) as pipeline:
pcoll_1 = pipeline | 'Start 1' >> beam.Create([('a', 1), ('a', 2),
('b', 3), ('c', 4)])
      result = pcoll_1 | beam.GroupByKey()
      sorted_result = result | beam.Map(lambda x: (x[0], sorted(x[1])))
      assert_that(
          sorted_result,
          equal_to([('a', [1, 2]), ('b', [3]), ('c', [4])]))
@mock.patch('apache_beam.transforms.util._DecryptMessage', MockNoOpDecrypt)
def test_gbek_fake_secret_manager_actually_does_encryption(self):
    fake_secret = FakeSecret()
with TestPipeline('FnApiRunner') as pipeline:
pcoll_1 = pipeline | 'Start 1' >> beam.Create([('a', 1), ('a', 2),
('b', 3), ('c', 4)])
      result = pcoll_1 | beam.GroupByEncryptedKey(fake_secret)
      assert_that(
          result, equal_to([('a', [1, 2]), ('b', [3]), ('c', [4])]))
@mock.patch('apache_beam.transforms.util._DecryptMessage', MockNoOpDecrypt)
@mock.patch('apache_beam.transforms.util.GcpSecret', FakeSecret)
def test_gbk_actually_does_encryption(self):
options = PipelineOptions()
# Version of GcpSecret doesn't matter since it is replaced by FakeSecret
options.view_as(SetupOptions).gbek = 'type:GcpSecret;version_name:Foo'
with TestPipeline('FnApiRunner', options=options) as pipeline:
pcoll_1 = pipeline | 'Start 1' >> beam.Create([('a', 1), ('a', 2),
('b', 3), ('c', 4)],
reshuffle=False)
result = pcoll_1 | beam.GroupByKey()
      assert_that(
          result, equal_to([('a', [1, 2]), ('b', [3]), ('c', [4])]))
def test_gbek_fake_secret_manager_throws(self):
    fake_secret = FakeSecret(should_throw=True)
with self.assertRaisesRegex(RuntimeError, r'Exception retrieving secret'):
with TestPipeline() as pipeline:
pcoll_1 = pipeline | 'Start 1' >> beam.Create([('a', 1), ('a', 2),
('b', 3), ('c', 4)])
        result = pcoll_1 | beam.GroupByEncryptedKey(fake_secret)
        assert_that(
            result, equal_to([('a', [1, 2]), ('b', [3]), ('c', [4])]))
@unittest.skipIf(secretmanager is None, 'GCP dependencies are not installed')
def test_gbek_gcp_secret_manager_roundtrips(self):
with TestPipeline() as pipeline:
pcoll_1 = pipeline | 'Start 1' >> beam.Create([('a', 1), ('a', 2),
('b', 3), ('c', 4)])
      result = pcoll_1 | beam.GroupByEncryptedKey(self.gcp_secret)
      assert_that(
          result, equal_to([('a', [1, 2]), ('b', [3]), ('c', [4])]))
@unittest.skipIf(secretmanager is None, 'GCP dependencies are not installed')
def test_gbek_gcp_secret_manager_throws(self):
gcp_secret = GcpSecret('bad_path/versions/latest')
with self.assertRaisesRegex(RuntimeError,
r'Failed to retrieve secret bytes'):
with TestPipeline() as pipeline:
pcoll_1 = pipeline | 'Start 1' >> beam.Create([('a', 1), ('a', 2),
('b', 3), ('c', 4)])
        result = pcoll_1 | beam.GroupByEncryptedKey(gcp_secret)
        assert_that(
            result, equal_to([('a', [1, 2]), ('b', [3]), ('c', [4])]))
@unittest.skipIf(secretmanager is None, 'GCP dependencies are not installed')
class GcpHsmGeneratedSecretTest(unittest.TestCase):
def setUp(self):
self.mock_secret_manager_client = mock.MagicMock()
self.mock_kms_client = mock.MagicMock()
# Patch the clients
self.secretmanager_patcher = mock.patch(
'google.cloud.secretmanager.SecretManagerServiceClient',
return_value=self.mock_secret_manager_client)
self.kms_patcher = mock.patch(
'google.cloud.kms.KeyManagementServiceClient',
return_value=self.mock_kms_client)
self.os_urandom_patcher = mock.patch('os.urandom', return_value=b'0' * 32)
self.hkdf_patcher = mock.patch(
'cryptography.hazmat.primitives.kdf.hkdf.HKDF.derive',
return_value=b'derived_key')
self.secretmanager_patcher.start()
self.kms_patcher.start()
self.os_urandom_patcher.start()
self.hkdf_patcher.start()
def tearDown(self):
self.secretmanager_patcher.stop()
self.kms_patcher.stop()
self.os_urandom_patcher.stop()
self.hkdf_patcher.stop()
def test_happy_path_secret_creation(self):
from google.api_core import exceptions as api_exceptions
project_id = 'test-project'
location_id = 'global'
key_ring_id = 'test-key-ring'
key_id = 'test-key'
job_name = 'test-job'
secret = GcpHsmGeneratedSecret(
project_id, location_id, key_ring_id, key_id, job_name)
# Mock responses for secret creation path
self.mock_secret_manager_client.access_secret_version.side_effect = [
api_exceptions.NotFound('not found'), # first check
api_exceptions.NotFound('not found'), # second check
mock.MagicMock(payload=mock.MagicMock(data=b'derived_key'))
]
self.mock_kms_client.encrypt.return_value = mock.MagicMock(
ciphertext=b'encrypted_nonce')
secret_bytes = secret.get_secret_bytes()
self.assertEqual(secret_bytes, b'derived_key')
# Assertions on mocks
secret_version_path = (
f'projects/{project_id}/secrets/{secret._secret_version_name}'
'/versions/1')
self.mock_secret_manager_client.access_secret_version.assert_any_call(
request={'name': secret_version_path})
self.assertEqual(
self.mock_secret_manager_client.access_secret_version.call_count, 3)
self.mock_secret_manager_client.create_secret.assert_called_once()
self.mock_kms_client.encrypt.assert_called_once()
self.mock_secret_manager_client.add_secret_version.assert_called_once()
def test_secret_already_exists(self):
from google.api_core import exceptions as api_exceptions
project_id = 'test-project'
location_id = 'global'
key_ring_id = 'test-key-ring'
key_id = 'test-key'
job_name = 'test-job'
secret = GcpHsmGeneratedSecret(
project_id, location_id, key_ring_id, key_id, job_name)
# Mock responses for secret creation path
self.mock_secret_manager_client.access_secret_version.side_effect = [
api_exceptions.NotFound('not found'),
api_exceptions.NotFound('not found'),
mock.MagicMock(payload=mock.MagicMock(data=b'derived_key'))
]
self.mock_secret_manager_client.create_secret.side_effect = (
api_exceptions.AlreadyExists('exists'))
self.mock_kms_client.encrypt.return_value = mock.MagicMock(
ciphertext=b'encrypted_nonce')
secret_bytes = secret.get_secret_bytes()
self.assertEqual(secret_bytes, b'derived_key')
# Assertions on mocks
self.mock_secret_manager_client.create_secret.assert_called_once()
self.mock_secret_manager_client.add_secret_version.assert_called_once()
def test_secret_version_already_exists(self):
project_id = 'test-project'
location_id = 'global'
key_ring_id = 'test-key-ring'
key_id = 'test-key'
job_name = 'test-job'
secret = GcpHsmGeneratedSecret(
project_id, location_id, key_ring_id, key_id, job_name)
self.mock_secret_manager_client.access_secret_version.return_value = (
mock.MagicMock(payload=mock.MagicMock(data=b'existing_dek')))
secret_bytes = secret.get_secret_bytes()
self.assertEqual(secret_bytes, b'existing_dek')
# Assertions
self.mock_secret_manager_client.access_secret_version.assert_called_once()
self.mock_secret_manager_client.create_secret.assert_not_called()
self.mock_secret_manager_client.add_secret_version.assert_not_called()
self.mock_kms_client.encrypt.assert_not_called()
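# FakeClock stands in for time.time so the batching tests can advance time
# deterministically, e.g.:
#
#   clock = FakeClock(now=0)
#   clock.sleep(10)  # no real waiting; the next clock() call returns 10
#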
class FakeClock(object):
  def __init__(self, now=None):
    # Avoid time.time() as a default argument: defaults are evaluated once at
    # class-definition time, not per instantiation.
    self._now = time.time() if now is None else now
def __call__(self):
return self._now
def sleep(self, duration):
self._now += duration
class BatchElementsTest(unittest.TestCase):
NUM_ELEMENTS = 10
BATCH_SIZE = 5
@staticmethod
def _create_test_data():
scientists = [
"Einstein",
"Darwin",
"Copernicus",
"Pasteur",
"Curie",
"Faraday",
"Newton",
"Bohr",
"Galilei",
"Maxwell"
]
data = []
for i in range(BatchElementsTest.NUM_ELEMENTS):
index = i % len(scientists)
data.append(scientists[index])
return data
def test_constant_batch(self):
# Assumes a single bundle...
p = TestPipeline()
output = (
p
| beam.Create(range(35))
| util.BatchElements(min_batch_size=10, max_batch_size=10)
| beam.Map(len))
assert_that(output, equal_to([10, 10, 10, 5]))
res = p.run()
res.wait_until_finish()
metrics = res.metrics()
results = metrics.query(MetricsFilter().with_name("batch_size"))
self.assertEqual(len(results["distributions"]), 1)
def test_constant_batch_no_metrics(self):
p = TestPipeline()
output = (
p
| beam.Create(range(35))
| util.BatchElements(
min_batch_size=10, max_batch_size=10, record_metrics=False)
| beam.Map(len))
assert_that(output, equal_to([10, 10, 10, 5]))
res = p.run()
res.wait_until_finish()
metrics = res.metrics()
results = metrics.query(MetricsFilter().with_name("batch_size"))
self.assertEqual(len(results["distributions"]), 0)
def test_grows_to_max_batch(self):
# Assumes a single bundle, so we pin to the FnApiRunner
with TestPipeline('FnApiRunner') as p:
res = (
p
| beam.Create(range(164))
| util.BatchElements(
min_batch_size=1, max_batch_size=50, clock=FakeClock())
| beam.Map(len))
assert_that(res, equal_to([1, 1, 2, 4, 8, 16, 32, 50, 50]))
def test_windowed_batches(self):
    # Assumes a single in-order bundle, so we pin to the FnApiRunner
with TestPipeline('FnApiRunner') as p:
res = (
p
| beam.Create(range(47), reshuffle=False)
| beam.Map(lambda t: window.TimestampedValue(t, t))
| beam.WindowInto(window.FixedWindows(30))
| util.BatchElements(
min_batch_size=5, max_batch_size=10, clock=FakeClock())
| beam.Map(len))
assert_that(
res,
equal_to([
5,
5,
10,
10, # elements in [0, 30)
10,
7, # elements in [30, 47)
]))
def test_global_batch_timestamps(self):
# Assumes a single bundle, so we pin to the FnApiRunner
with TestPipeline('FnApiRunner') as p:
res = (
p
| beam.Create(range(3), reshuffle=False)
| util.BatchElements(min_batch_size=2, max_batch_size=2)
| beam.Map(
lambda batch, timestamp=beam.DoFn.TimestampParam:
(len(batch), timestamp)))
assert_that(
res,
equal_to([
(2, GlobalWindow().max_timestamp()),
(1, GlobalWindow().max_timestamp()),
]))
def test_sized_batches(self):
with TestPipeline() as p:
res = (
p
| beam.Create(
[
'a',
'a', # First batch.
'aaaaaaaaaa', # Second batch.
'aaaaa',
'aaaaa', # Third batch.
'a',
'aaaaaaa',
'a',
'a' # Fourth batch.
],
reshuffle=False)
| util.BatchElements(
min_batch_size=10, max_batch_size=10, element_size_fn=len)
| beam.Map(lambda batch: ''.join(batch))
| beam.Map(len))
assert_that(res, equal_to([2, 10, 10, 10]))
def test_sized_windowed_batches(self):
    # Assumes a single in-order bundle, so we pin to the FnApiRunner
with TestPipeline('FnApiRunner') as p:
res = (
p
| beam.Create(range(1, 8), reshuffle=False)
| beam.Map(lambda t: window.TimestampedValue('a' * t, t))
| beam.WindowInto(window.FixedWindows(3))
| util.BatchElements(
min_batch_size=11,
max_batch_size=11,
element_size_fn=len,
clock=FakeClock())
| beam.Map(lambda batch: ''.join(batch)))
assert_that(
res,
equal_to([
'a' * (1 + 2), # Elements in [1, 3)
'a' * (3 + 4), # Elements in [3, 6)
'a' * 5,
'a' * 6, # Elements in [6, 9)
'a' * 7,
]))
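  # The estimator tests below model batch cost as duration(n) = 1 + .7 * n
  # (a 1s fixed cost plus 0.7s per element) and check the steady-state batch
  # size the estimator converges to for each target.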
def test_target_duration(self):
clock = FakeClock()
batch_estimator = util._BatchSizeEstimator(
target_batch_overhead=None, target_batch_duration_secs=10, clock=clock)
batch_duration = lambda batch_size: 1 + .7 * batch_size
    # 14 * .7 = 9.8 is the closest we can get to 10 without exceeding it.
expected_sizes = [1, 2, 4, 8, 14, 14, 14]
actual_sizes = []
for _ in range(len(expected_sizes)):
actual_sizes.append(batch_estimator.next_batch_size())
with batch_estimator.record_time(actual_sizes[-1]):
clock.sleep(batch_duration(actual_sizes[-1]))
self.assertEqual(expected_sizes, actual_sizes)
def test_target_duration_including_fixed_cost(self):
clock = FakeClock()
batch_estimator = util._BatchSizeEstimator(
target_batch_overhead=None,
target_batch_duration_secs_including_fixed_cost=10,
clock=clock)
batch_duration = lambda batch_size: 1 + .7 * batch_size
    # 1 + 12 * .7 = 9.4 is the closest we can get to 10 without exceeding it.
expected_sizes = [1, 2, 4, 8, 12, 12, 12]
actual_sizes = []
for _ in range(len(expected_sizes)):
actual_sizes.append(batch_estimator.next_batch_size())
with batch_estimator.record_time(actual_sizes[-1]):
clock.sleep(batch_duration(actual_sizes[-1]))
self.assertEqual(expected_sizes, actual_sizes)
def test_target_overhead(self):
clock = FakeClock()
batch_estimator = util._BatchSizeEstimator(
target_batch_overhead=.05, target_batch_duration_secs=None, clock=clock)
batch_duration = lambda batch_size: 1 + .7 * batch_size
# At 27 items, a batch takes ~20 seconds with 5% (~1 second) overhead.
expected_sizes = [1, 2, 4, 8, 16, 27, 27, 27]
actual_sizes = []
for _ in range(len(expected_sizes)):
actual_sizes.append(batch_estimator.next_batch_size())
with batch_estimator.record_time(actual_sizes[-1]):
clock.sleep(batch_duration(actual_sizes[-1]))
self.assertEqual(expected_sizes, actual_sizes)
def test_variance(self):
clock = FakeClock()
variance = 0.25
batch_estimator = util._BatchSizeEstimator(
target_batch_overhead=.05,
target_batch_duration_secs=None,
variance=variance,
clock=clock)
batch_duration = lambda batch_size: 1 + .7 * batch_size
expected_target = 27
actual_sizes = []
for _ in range(util._BatchSizeEstimator._MAX_DATA_POINTS - 1):
actual_sizes.append(batch_estimator.next_batch_size())
with batch_estimator.record_time(actual_sizes[-1]):
clock.sleep(batch_duration(actual_sizes[-1]))
# Check that we're testing a good range of values.
stable_set = set(actual_sizes[-20:])
self.assertGreater(len(stable_set), 3)
self.assertGreater(
min(stable_set), expected_target - expected_target * variance)
self.assertLess(
max(stable_set), expected_target + expected_target * variance)
def test_ignore_first_n_batch_size(self):
clock = FakeClock()
batch_estimator = util._BatchSizeEstimator(
clock=clock, ignore_first_n_seen_per_batch_size=2)
expected_sizes = [
1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 32, 64, 64, 64
]
actual_sizes = []
for i in range(len(expected_sizes)):
actual_sizes.append(batch_estimator.next_batch_size())
with batch_estimator.record_time(actual_sizes[-1]):
if i % 3 == 2:
clock.sleep(0.01)
else:
clock.sleep(1)
self.assertEqual(expected_sizes, actual_sizes)
    # Check that only the third timing seen for each batch size is recorded.
expected_data_batch_sizes = [1, 2, 4, 8, 16, 32, 64]
actual_data_batch_sizes = [x[0] for x in batch_estimator._data]
self.assertEqual(expected_data_batch_sizes, actual_data_batch_sizes)
expected_data_timing = [0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01]
for i in range(len(expected_data_timing)):
self.assertAlmostEqual(
expected_data_timing[i], batch_estimator._data[i][1])
def test_ignore_next_timing(self):
clock = FakeClock()
batch_estimator = util._BatchSizeEstimator(clock=clock)
batch_estimator.ignore_next_timing()
expected_sizes = [1, 1, 2, 4, 8, 16]
actual_sizes = []
for i in range(len(expected_sizes)):
actual_sizes.append(batch_estimator.next_batch_size())
with batch_estimator.record_time(actual_sizes[-1]):
if i == 0:
clock.sleep(1)
else:
clock.sleep(0.01)
self.assertEqual(expected_sizes, actual_sizes)
# Check the first record_time was skipped.
expected_data_batch_sizes = [1, 2, 4, 8, 16]
actual_data_batch_sizes = [x[0] for x in batch_estimator._data]
self.assertEqual(expected_data_batch_sizes, actual_data_batch_sizes)
expected_data_timing = [0.01, 0.01, 0.01, 0.01, 0.01]
for i in range(len(expected_data_timing)):
self.assertAlmostEqual(
expected_data_timing[i], batch_estimator._data[i][1])
def _run_regression_test(self, linear_regression_fn, test_outliers):
xs = [random.random() for _ in range(10)]
ys = [2 * x + 1 for x in xs]
a, b = linear_regression_fn(xs, ys)
self.assertAlmostEqual(a, 1)
self.assertAlmostEqual(b, 2)
xs = [1 + random.random() for _ in range(100)]
ys = [7 * x + 5 + 0.01 * random.random() for x in xs]
a, b = linear_regression_fn(xs, ys)
self.assertAlmostEqual(a, 5, delta=0.02)
self.assertAlmostEqual(b, 7, delta=0.02)
    # Test repeated xs: with a single distinct x the fit is underdetermined,
    # so we expect a zero intercept and a slope equal to mean(ys) / x.
xs = [1 + random.random()] * 100
ys = [7 * x + 5 + 0.01 * random.random() for x in xs]
a, b = linear_regression_fn(xs, ys)
self.assertAlmostEqual(a, 0, delta=0.02)
self.assertAlmostEqual(b, sum(ys) / (len(ys) * xs[0]), delta=0.02)
if test_outliers:
xs = [1 + random.random() for _ in range(100)]
ys = [2 * x + 1 for x in xs]
a, b = linear_regression_fn(xs, ys)
self.assertAlmostEqual(a, 1)
self.assertAlmostEqual(b, 2)
# An outlier or two doesn't affect the result.
for _ in range(2):
xs += [10]
ys += [30]
a, b = linear_regression_fn(xs, ys)
self.assertAlmostEqual(a, 1)
self.assertAlmostEqual(b, 2)
# But enough of them, and they're no longer outliers.
xs += [10] * 10
ys += [30] * 10
a, b = linear_regression_fn(xs, ys)
self.assertLess(a, 0.5)
self.assertGreater(b, 2.5)
def test_no_numpy_regression(self):
self._run_regression_test(
util._BatchSizeEstimator.linear_regression_no_numpy, False)
def test_numpy_regression(self):
try:
# pylint: disable=wrong-import-order, wrong-import-position
import numpy as _
except ImportError:
self.skipTest('numpy not available')
self._run_regression_test(
util._BatchSizeEstimator.linear_regression_numpy, True)
def test_stateful_constant_batch(self):
# Assumes a single bundle, so we pin to the FnApiRunner
p = TestPipeline('FnApiRunner')
output = (
p
| beam.Create(range(35))
| util.BatchElements(
min_batch_size=10, max_batch_size=10, max_batch_duration_secs=100)
| beam.Map(len))
assert_that(output, equal_to([10, 10, 10, 5]))
res = p.run()
res.wait_until_finish()
def test_stateful_in_global_window(self):
with TestPipeline() as pipeline:
collection = pipeline \
| beam.Create(
BatchElementsTest._create_test_data()) \
| util.BatchElements(
min_batch_size=BatchElementsTest.BATCH_SIZE,
max_batch_size=BatchElementsTest.BATCH_SIZE,
max_batch_duration_secs=100)
num_batches = collection | beam.combiners.Count.Globally()
assert_that(
num_batches,
equal_to([
int(
math.ceil(
BatchElementsTest.NUM_ELEMENTS /
BatchElementsTest.BATCH_SIZE))
]))
def test_stateful_buffering_timer_in_fixed_window_streaming(self):
window_duration = 6
max_buffering_duration_secs = 100
start_time = timestamp.Timestamp(0)
    test_stream = (
        TestStream()
        .add_elements([
            TimestampedValue(value, start_time + i)
            for i, value in enumerate(BatchElementsTest._create_test_data())
        ])
        .advance_processing_time(150)
        .advance_watermark_to(start_time + window_duration)
        .advance_watermark_to(start_time + window_duration + 1)
        .advance_watermark_to_infinity())
with TestPipeline(options=StandardOptions(streaming=True)) as pipeline:
      # To trigger the processing time timer, use a fake clock starting at
      # Timestamp(0).
fake_clock = FakeClock(now=start_time)
num_elements_per_batch = (
pipeline | test_stream
| "fixed window" >> WindowInto(FixedWindows(window_duration))
| util.BatchElements(
min_batch_size=BatchElementsTest.BATCH_SIZE,
max_batch_size=BatchElementsTest.BATCH_SIZE,
max_batch_duration_secs=max_buffering_duration_secs,
clock=fake_clock)
| "count elements in batch" >> Map(lambda x: (None, len(x)))
| GroupByKey()
| "global window" >> WindowInto(GlobalWindows())
| FlatMapTuple(lambda k, vs: vs))
      # The window duration is 6 and the batch size is 5, so the first batch
      # has 5 elements (flushed because the batch size was reached).
      expected_0 = 5
      # Only one element is left in the window, so the next batch has 1
      # element (flushed because the max buffering duration was reached).
      expected_1 = 1
      # The collection has 10 elements, so 4 remain for the second window
      # (flushed because the end of the window was reached).
      expected_2 = 4
assert_that(
num_elements_per_batch,
equal_to([expected_0, expected_1, expected_2]),
"assert2")
def test_stateful_buffering_timer_in_global_window_streaming(self):
max_buffering_duration_secs = 42
start_time = timestamp.Timestamp(0)
test_stream = TestStream().advance_watermark_to(start_time)
for i, value in enumerate(BatchElementsTest._create_test_data()):
test_stream.add_elements(
[TimestampedValue(value, start_time + i)]) \
.advance_processing_time(5)
test_stream.advance_watermark_to(
start_time + BatchElementsTest.NUM_ELEMENTS + 1) \
.advance_watermark_to_infinity()
with TestPipeline(options=StandardOptions(streaming=True)) as pipeline:
      # Set a batch size larger than the total number of elements. Since we
      # are in a global window, without the buffering time limit we would
      # wait for all elements to arrive before flushing.
      batch_size = BatchElementsTest.NUM_ELEMENTS * 2
      # To trigger the processing time timer, use a fake clock starting at
      # Timestamp(0). The fake clock never advances during pipeline
      # execution, so the timer is always set to the same value and fires on
      # every element after the first firing.
      fake_clock = FakeClock(now=start_time)
num_elements_per_batch = (
pipeline | test_stream
| WindowInto(
GlobalWindows(),
trigger=Repeatedly(AfterCount(1)),
accumulation_mode=trigger.AccumulationMode.DISCARDING)
| util.BatchElements(
min_batch_size=batch_size,
max_batch_size=batch_size,
max_batch_duration_secs=max_buffering_duration_secs,
clock=fake_clock)
| 'count elements in batch' >> Map(lambda x: (None, len(x)))
| GroupByKey()
| FlatMapTuple(lambda k, vs: vs))
      # We flush twice: once when the max buffering duration is reached and
      # once when the global window ends.
assert_that(num_elements_per_batch, equal_to([9, 1]))
def test_stateful_grows_to_max_batch(self):
# Assumes a single bundle, so we pin to the FnApiRunner
with TestPipeline('FnApiRunner') as p:
res = (
p
| beam.Create(range(164))
| util.BatchElements(
min_batch_size=1,
max_batch_size=50,
max_batch_duration_secs=100,
clock=FakeClock())
| beam.Map(len))
assert_that(res, equal_to([1, 1, 2, 4, 8, 16, 32, 50, 50]))
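# _IdentityWindowFn reassigns each element to the window it already occupies;
# Reshuffle uses it internally to preserve windowing across the shuffle.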
class IdentityWindowTest(unittest.TestCase):
def test_window_preserved(self):
expected_timestamp = timestamp.Timestamp(5)
expected_window = window.IntervalWindow(1.0, 2.0)
class AddWindowDoFn(beam.DoFn):
def process(self, element):
yield WindowedValue(element, expected_timestamp, [expected_window])
with TestPipeline() as pipeline:
data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
expected_windows = [
TestWindowedValue(kv, expected_timestamp, [expected_window])
for kv in data
]
before_identity = (
pipeline
| 'start' >> beam.Create(data)
| 'add_windows' >> beam.ParDo(AddWindowDoFn()))
assert_that(
before_identity,
equal_to(expected_windows),
label='before_identity',
reify_windows=True)
after_identity = (
before_identity
| 'window' >> beam.WindowInto(
beam.transforms.util._IdentityWindowFn(
coders.IntervalWindowCoder())))
assert_that(
after_identity,
equal_to(expected_windows),
label='after_identity',
reify_windows=True)
def test_no_window_context_fails(self):
expected_timestamp = timestamp.Timestamp(5)
# Assuming the default window function is window.GlobalWindows.
expected_window = window.GlobalWindow()
class AddTimestampDoFn(beam.DoFn):
def process(self, element):
yield window.TimestampedValue(element, expected_timestamp)
with self.assertRaisesRegex(Exception, r'.*window.*None.*add_timestamps2'):
with TestPipeline() as pipeline:
data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
expected_windows = [
TestWindowedValue(kv, expected_timestamp, [expected_window])
for kv in data
]
before_identity = (
pipeline
| 'start' >> beam.Create(data)
| 'add_timestamps' >> beam.ParDo(AddTimestampDoFn()))
assert_that(
before_identity,
equal_to(expected_windows),
label='before_identity',
reify_windows=True)
_ = (
before_identity
| 'window' >> beam.WindowInto(
beam.transforms.util._IdentityWindowFn(
coders.GlobalWindowCoder()))
# This DoFn will return TimestampedValues, making
# WindowFn.AssignContext passed to IdentityWindowFn
# contain a window of None. IdentityWindowFn should
# raise an exception.
| 'add_timestamps2' >> beam.ParDo(AddTimestampDoFn()))
class ReshuffleTest(unittest.TestCase):
def test_reshuffle_contents_unchanged(self):
with TestPipeline() as pipeline:
data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 3)]
result = (pipeline | beam.Create(data) | beam.Reshuffle())
assert_that(result, equal_to(data))
def test_reshuffle_contents_unchanged_with_buckets(self):
with TestPipeline() as pipeline:
data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 3)]
buckets = 2
result = (pipeline | beam.Create(data) | beam.Reshuffle(buckets))
assert_that(result, equal_to(data))
def test_reshuffle_contents_unchanged_with_wrong_buckets(self):
wrong_buckets = [0, -1, "wrong", 2.5]
for wrong_bucket in wrong_buckets:
with self.assertRaisesRegex(ValueError,
'If `num_buckets` is set, it has to be an '
'integer greater than 0, got %s' %
wrong_bucket):
beam.Reshuffle(wrong_bucket)
def test_reshuffle_after_gbk_contents_unchanged(self):
with TestPipeline() as pipeline:
data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 3)]
expected_result = [(1, [1, 2, 3]), (2, [1, 2]), (3, [1])]
after_gbk = (
pipeline
| beam.Create(data)
| beam.GroupByKey()
| beam.MapTuple(lambda k, vs: (k, sorted(vs))))
assert_that(after_gbk, equal_to(expected_result), label='after_gbk')
after_reshuffle = after_gbk | beam.Reshuffle()
assert_that(
after_reshuffle, equal_to(expected_result), label='after_reshuffle')
def test_reshuffle_timestamps_unchanged(self):
with TestPipeline() as pipeline:
timestamp = 5
data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 3)]
expected_result = [
TestWindowedValue(v, timestamp, [GlobalWindow()]) for v in data
]
before_reshuffle = (
pipeline
| 'start' >> beam.Create(data)
| 'add_timestamp' >>
beam.Map(lambda v: beam.window.TimestampedValue(v, timestamp)))
assert_that(
before_reshuffle,
equal_to(expected_result),
label='before_reshuffle',
reify_windows=True)
after_reshuffle = before_reshuffle | beam.Reshuffle()
assert_that(
after_reshuffle,
equal_to(expected_result),
label='after_reshuffle',
reify_windows=True)
def test_reshuffle_windows_unchanged(self):
with TestPipeline() as pipeline:
data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
      expected_data = [
          TestWindowedValue(
              v,
              t - .001, [w],
              pane_info=PaneInfo(True, False, PaneInfoTiming.ON_TIME, 0, 0))
          for (v, t, w) in [
              ((1, contains_in_any_order([2, 1])), 4.0,
               IntervalWindow(1.0, 4.0)),
              ((2, contains_in_any_order([2, 1])), 4.0,
               IntervalWindow(1.0, 4.0)),
              ((3, [1]), 3.0, IntervalWindow(1.0, 3.0)),
              ((1, [4]), 6.0, IntervalWindow(4.0, 6.0)),
          ]
      ]
before_reshuffle = (
pipeline
| 'start' >> beam.Create(data)
| 'add_timestamp' >>
beam.Map(lambda v: beam.window.TimestampedValue(v, v[1]))
| 'window' >> beam.WindowInto(Sessions(gap_size=2))
| 'group_by_key' >> beam.GroupByKey())
assert_that(
before_reshuffle,
equal_to(expected_data),
label='before_reshuffle',
reify_windows=True)
after_reshuffle = before_reshuffle | beam.Reshuffle()
assert_that(
after_reshuffle,
equal_to(expected_data),
label='after reshuffle',
reify_windows=True)
def test_reshuffle_window_fn_preserved(self):
any_order = contains_in_any_order
with TestPipeline() as pipeline:
data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
      expected_windows = [
          TestWindowedValue(v, t, [w]) for (v, t, w) in [
              ((1, 1), 1.0, IntervalWindow(1.0, 3.0)),
              ((2, 1), 1.0, IntervalWindow(1.0, 3.0)),
              ((3, 1), 1.0, IntervalWindow(1.0, 3.0)),
              ((1, 2), 2.0, IntervalWindow(2.0, 4.0)),
              ((2, 2), 2.0, IntervalWindow(2.0, 4.0)),
              ((1, 4), 4.0, IntervalWindow(4.0, 6.0)),
          ]
      ]
      expected_merged_windows = [
          TestWindowedValue(
              v,
              t - .001, [w],
              pane_info=PaneInfo(True, False, PaneInfoTiming.ON_TIME, 0, 0))
          for (v, t, w) in [
              ((1, any_order([2, 1])), 4.0, IntervalWindow(1.0, 4.0)),
              ((2, any_order([2, 1])), 4.0, IntervalWindow(1.0, 4.0)),
              ((3, [1]), 3.0, IntervalWindow(1.0, 3.0)),
              ((1, [4]), 6.0, IntervalWindow(4.0, 6.0)),
          ]
      ]
before_reshuffle = (
pipeline
| 'start' >> beam.Create(data)
| 'add_timestamp' >> beam.Map(lambda v: TimestampedValue(v, v[1]))
| 'window' >> beam.WindowInto(Sessions(gap_size=2)))
assert_that(
before_reshuffle,
equal_to(expected_windows),
label='before_reshuffle',
reify_windows=True)
after_reshuffle = before_reshuffle | beam.Reshuffle()
assert_that(
after_reshuffle,
equal_to(expected_windows),
label='after_reshuffle',
reify_windows=True)
after_group = after_reshuffle | beam.GroupByKey()
assert_that(
after_group,
equal_to(expected_merged_windows),
label='after_group',
reify_windows=True)
def test_reshuffle_global_window(self):
with TestPipeline() as pipeline:
data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
expected_data = [(1, [1, 2, 4]), (2, [1, 2]), (3, [1])]
before_reshuffle = (
pipeline
| beam.Create(data)
| beam.WindowInto(GlobalWindows())
| beam.GroupByKey()
| beam.MapTuple(lambda k, vs: (k, sorted(vs))))
assert_that(
before_reshuffle, equal_to(expected_data), label='before_reshuffle')
after_reshuffle = before_reshuffle | beam.Reshuffle()
assert_that(
after_reshuffle, equal_to(expected_data), label='after reshuffle')
def test_reshuffle_sliding_window(self):
with TestPipeline() as pipeline:
data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
window_size = 2
expected_data = [(1, [1, 2, 4]), (2, [1, 2]), (3, [1])] * window_size
before_reshuffle = (
pipeline
| beam.Create(data)
| beam.WindowInto(SlidingWindows(size=window_size, period=1))
| beam.GroupByKey()
| beam.MapTuple(lambda k, vs: (k, sorted(vs))))
assert_that(
before_reshuffle, equal_to(expected_data), label='before_reshuffle')
after_reshuffle = before_reshuffle | beam.Reshuffle()
      # If Reshuffle applied the sliding window function a second time, there
      # would be extra values for each key.
assert_that(
after_reshuffle, equal_to(expected_data), label='after reshuffle')
def test_reshuffle_streaming_global_window(self):
options = PipelineOptions()
options.view_as(StandardOptions).streaming = True
with TestPipeline(options=options) as pipeline:
data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
expected_data = [(1, [1, 2, 4]), (2, [1, 2]), (3, [1])]
before_reshuffle = (
pipeline
| beam.Create(data)
| beam.WindowInto(GlobalWindows())
| beam.GroupByKey()
| beam.MapTuple(lambda k, vs: (k, sorted(vs))))
assert_that(
before_reshuffle, equal_to(expected_data), label='before_reshuffle')
after_reshuffle = before_reshuffle | beam.Reshuffle()
assert_that(
after_reshuffle, equal_to(expected_data), label='after reshuffle')
def test_reshuffle_streaming_global_window_with_buckets(self):
options = PipelineOptions()
options.view_as(StandardOptions).streaming = True
with TestPipeline(options=options) as pipeline:
data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
expected_data = [(1, [1, 2, 4]), (2, [1, 2]), (3, [1])]
buckets = 2
before_reshuffle = (
pipeline
| beam.Create(data)
| beam.WindowInto(GlobalWindows())
| beam.GroupByKey()
| beam.MapTuple(lambda k, vs: (k, sorted(vs))))
assert_that(
before_reshuffle, equal_to(expected_data), label='before_reshuffle')
after_reshuffle = before_reshuffle | beam.Reshuffle(buckets)
assert_that(
after_reshuffle, equal_to(expected_data), label='after reshuffle')
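  # Passing update_compatibility_version pins Reshuffle to its pre-2.65
  # behavior, where pane info was not preserved (PANE_INFO_UNKNOWN); the
  # parameterized tests below exercise both behaviors.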
@parameterized.expand([
param(compat_version=None),
param(compat_version="2.64.0"),
])
def test_reshuffle_custom_window_preserves_metadata(self, compat_version):
"""Tests that Reshuffle preserves pane info."""
from apache_beam.coders import typecoders
typecoders.registry.force_dill_deterministic_coders = True
element_count = 12
timestamp_value = timestamp.Timestamp(0)
    elements = [
        TimestampedValue(("key", i), timestamp_value)
        for i in range(element_count)
    ]
expected_timestamp = GlobalWindow().max_timestamp()
expected = [
TestWindowedValue(
('key', [0, 1, 2]),
expected_timestamp,
[GlobalWindow()],
pane_info=PaneInfo(
is_first=True,
is_last=False,
timing=PaneInfoTiming.EARLY, # 0
index=0,
nonspeculative_index=-1)),
TestWindowedValue(
('key', [3, 4, 5]),
expected_timestamp,
[GlobalWindow()],
pane_info=PaneInfo(
is_first=False,
is_last=False,
timing=PaneInfoTiming.EARLY, # 0
index=1,
nonspeculative_index=-1)),
TestWindowedValue(
('key', [6, 7, 8]),
expected_timestamp,
[GlobalWindow()],
pane_info=PaneInfo(
is_first=False,
is_last=False,
timing=PaneInfoTiming.EARLY, # 0
index=2,
nonspeculative_index=-1)),
TestWindowedValue(
('key', [9, 10, 11]),
expected_timestamp,
[GlobalWindow()],
pane_info=PaneInfo(
is_first=False,
is_last=False,
timing=PaneInfoTiming.EARLY, # 0
index=3,
nonspeculative_index=-1))
] if compat_version is None else ([
TestWindowedValue(('key', [0, 1, 2]),
expected_timestamp, [GlobalWindow()],
PANE_INFO_UNKNOWN),
TestWindowedValue(('key', [3, 4, 5]),
expected_timestamp, [GlobalWindow()],
PANE_INFO_UNKNOWN),
TestWindowedValue(('key', [6, 7, 8]),
expected_timestamp, [GlobalWindow()],
PANE_INFO_UNKNOWN),
TestWindowedValue(('key', [9, 10, 11]),
expected_timestamp, [GlobalWindow()],
PANE_INFO_UNKNOWN)
])
options = PipelineOptions(update_compatibility_version=compat_version)
options.view_as(StandardOptions).streaming = True
with beam.Pipeline(options=options) as p:
      stream_source = (
          TestStream()
          .advance_watermark_to(0)
          .advance_processing_time(100)
          .add_elements(elements[:element_count // 4])
          .advance_processing_time(100)
          .advance_watermark_to(100)
          .add_elements(elements[element_count // 4:2 * element_count // 4])
          .advance_processing_time(100)
          .advance_watermark_to(200)
          .add_elements(
              elements[2 * element_count // 4:3 * element_count // 4])
          .advance_processing_time(100)
          .advance_watermark_to(300)
          .add_elements(elements[3 * element_count // 4:])
          .advance_processing_time(100)
          .advance_watermark_to_infinity())
grouped = (
p | stream_source
| "Rewindow" >> beam.WindowInto(
beam.window.GlobalWindows(),
trigger=trigger.Repeatedly(trigger.AfterProcessingTime(1)),
accumulation_mode=trigger.AccumulationMode.DISCARDING)
| beam.GroupByKey())
after_reshuffle = (grouped | 'Reshuffle' >> beam.Reshuffle())
assert_that(
after_reshuffle,
equal_to(expected),
label='CheckMetadataPreserved',
reify_windows=True)
typecoders.registry.force_dill_deterministic_coders = False
@parameterized.expand([
param(compat_version=None),
param(compat_version="2.64.0"),
])
def test_reshuffle_default_window_preserves_metadata(self, compat_version):
"""Tests that Reshuffle preserves timestamp, window, and pane info
metadata."""
from apache_beam.coders import typecoders
typecoders.registry.force_dill_deterministic_coders = True
no_firing = PaneInfo(
is_first=True,
is_last=True,
timing=PaneInfoTiming.UNKNOWN,
index=0,
nonspeculative_index=0)
on_time_only = PaneInfo(
is_first=True,
is_last=True,
timing=PaneInfoTiming.ON_TIME,
index=0,
nonspeculative_index=0)
late_firing = PaneInfo(
is_first=False,
is_last=False,
timing=PaneInfoTiming.LATE,
index=1,
nonspeculative_index=1)
    # Portable runners may not support full timestamp precision; compute the
    # largest supported timestamp by truncating the unsupported low-order
    # bits.
gt = GlobalWindow().max_timestamp()
truncated_gt = gt - (gt % 0.001)
expected_preserved = [
TestWindowedValue('a', MIN_TIMESTAMP, [GlobalWindow()], no_firing),
TestWindowedValue(
'b', timestamp.Timestamp(0), [GlobalWindow()], on_time_only),
TestWindowedValue(
'c', timestamp.Timestamp(33), [GlobalWindow()], late_firing),
TestWindowedValue('d', truncated_gt, [GlobalWindow()], no_firing)
]
expected_not_preserved = [
TestWindowedValue(
'a', MIN_TIMESTAMP, [GlobalWindow()], PANE_INFO_UNKNOWN),
TestWindowedValue(
'b', timestamp.Timestamp(0), [GlobalWindow()], PANE_INFO_UNKNOWN),
TestWindowedValue(
'c', timestamp.Timestamp(33), [GlobalWindow()], PANE_INFO_UNKNOWN),
TestWindowedValue(
'd', truncated_gt, [GlobalWindow()], PANE_INFO_UNKNOWN)
]
expected = (
expected_preserved
if compat_version is None else expected_not_preserved)
options = PipelineOptions(update_compatibility_version=compat_version)
with TestPipeline(options=options) as pipeline:
# Create windowed values with specific metadata
elements = [
WindowedValue('a', MIN_TIMESTAMP, [GlobalWindow()], no_firing),
WindowedValue(
'b', timestamp.Timestamp(0), [GlobalWindow()], on_time_only),
WindowedValue(
'c', timestamp.Timestamp(33), [GlobalWindow()], late_firing),
WindowedValue('d', truncated_gt, [GlobalWindow()], no_firing)
]
after_reshuffle = (
pipeline
| 'Create' >> beam.Create(elements)
| 'Reshuffle' >> beam.Reshuffle())
assert_that(
after_reshuffle,
equal_to(expected),
label='CheckMetadataPreserved',
reify_windows=True)
typecoders.registry.force_dill_deterministic_coders = False
@pytest.mark.it_validatesrunner
def test_reshuffle_preserves_timestamps(self):
with TestPipeline() as pipeline:
      # Create a PCollection and assign a different timestamp to each element.
before_reshuffle = (
pipeline
          | beam.Create([
              {'name': 'foo', 'timestamp': MIN_TIMESTAMP},
              {'name': 'foo', 'timestamp': 0},
              {'name': 'bar', 'timestamp': 33},
              {'name': 'bar', 'timestamp': 0},
          ])
| beam.Map(
lambda element: beam.window.TimestampedValue(
element, element['timestamp'])))
      # Reshuffle the PCollection above; each element should keep its
      # original timestamp.
after_reshuffle = before_reshuffle | beam.Reshuffle()
      # Given an element, emit a string containing its timestamp and its
      # name field.
def format_with_timestamp(element, timestamp=beam.DoFn.TimestampParam):
t = str(timestamp)
if timestamp == MIN_TIMESTAMP:
t = 'MIN_TIMESTAMP'
elif timestamp == MAX_TIMESTAMP:
t = 'MAX_TIMESTAMP'
return '{} - {}'.format(t, element['name'])
# Combine each element in before_reshuffle with its timestamp.
formatted_before_reshuffle = (
before_reshuffle
| "Get before_reshuffle timestamp" >> beam.Map(format_with_timestamp))
# Combine each element in after_reshuffle with its timestamp.
formatted_after_reshuffle = (
after_reshuffle
| "Get after_reshuffle timestamp" >> beam.Map(format_with_timestamp))
expected_data = [
'MIN_TIMESTAMP - foo',
'Timestamp(0) - foo',
'Timestamp(33) - bar',
'Timestamp(0) - bar'
]
# Can't compare formatted_before_reshuffle and formatted_after_reshuffle
# directly, because they are deferred PCollections while equal_to only
# takes a concrete argument.
assert_that(
formatted_before_reshuffle,
equal_to(expected_data),
label="formatted_before_reshuffle")
assert_that(
formatted_after_reshuffle,
equal_to(expected_data),
label="formatted_after_reshuffle")
def reshuffle_unpicklable_in_global_window_helper(
self, update_compatibility_version=None):
with TestPipeline(options=PipelineOptions(
update_compatibility_version=update_compatibility_version)) as pipeline:
data = [_Unpicklable(i) for i in range(5)]
expected_data = [0, 10, 20, 30, 40]
result = (
pipeline
| beam.Create(data)
| beam.WindowInto(GlobalWindows())
| beam.Reshuffle()
| beam.Map(lambda u: u.value * 10))
assert_that(result, equal_to(expected_data))
def test_reshuffle_unpicklable_in_global_window(self):
beam.coders.registry.register_coder(_Unpicklable, _UnpicklableCoder)
self.reshuffle_unpicklable_in_global_window_helper()
    # Prior to 2.64.0, reshuffling unpicklable objects raised an exception.
self.assertRaises(
RuntimeError,
self.reshuffle_unpicklable_in_global_window_helper,
"2.63.0")
def reshuffle_unpicklable_in_non_global_window_helper(
self, update_compatibility_version=None):
with TestPipeline(options=PipelineOptions(
update_compatibility_version=update_compatibility_version)) as pipeline:
data = [_Unpicklable(i) for i in range(5)]
expected_data = [0, 0, 0, 10, 10, 10, 20, 20, 20, 30, 30, 30, 40, 40, 40]
result = (
pipeline
| beam.Create(data)
| beam.WindowInto(window.SlidingWindows(size=3, period=1))
| beam.Reshuffle()
| beam.Map(lambda u: u.value * 10))
assert_that(result, equal_to(expected_data))
def test_reshuffle_unpicklable_in_non_global_window(self):
beam.coders.registry.register_coder(_Unpicklable, _UnpicklableCoder)
self.reshuffle_unpicklable_in_non_global_window_helper()
    # Prior to 2.64.0, reshuffling unpicklable objects raised an exception.
self.assertRaises(
RuntimeError,
self.reshuffle_unpicklable_in_non_global_window_helper,
"2.63.0")
class WithKeysTest(unittest.TestCase):
def setUp(self):
self.l = [1, 2, 3]
def test_constant_k(self):
with TestPipeline() as p:
pc = p | beam.Create(self.l)
with_keys = pc | util.WithKeys('k')
      assert_that(with_keys, equal_to([('k', 1), ('k', 2), ('k', 3)]))
def test_callable_k(self):
with TestPipeline() as p:
pc = p | beam.Create(self.l)
with_keys = pc | util.WithKeys(lambda x: x * x)
assert_that(with_keys, equal_to([(1, 1), (4, 2), (9, 3)]))
@staticmethod
def _test_args_kwargs_fn(x, multiply, subtract):
return x * multiply - subtract
def test_args_kwargs_k(self):
with TestPipeline() as p:
pc = p | beam.Create(self.l)
with_keys = pc | util.WithKeys(
WithKeysTest._test_args_kwargs_fn, 2, subtract=1)
assert_that(with_keys, equal_to([(1, 1), (3, 2), (5, 3)]))
def test_sideinputs(self):
with TestPipeline() as p:
pc = p | beam.Create(self.l)
si1 = AsList(p | "side input 1" >> beam.Create([1, 2, 3]))
si2 = AsSingleton(p | "side input 2" >> beam.Create([10]))
with_keys = pc | util.WithKeys(
lambda x, the_list, the_singleton: x + sum(the_list) + the_singleton,
si1,
the_singleton=si2)
assert_that(with_keys, equal_to([(17, 1), (18, 2), (19, 3)]))
class GroupIntoBatchesTest(unittest.TestCase):
NUM_ELEMENTS = 10
BATCH_SIZE = 5
@staticmethod
def _create_test_data():
scientists = [
"Einstein",
"Darwin",
"Copernicus",
"Pasteur",
"Curie",
"Faraday",
"Newton",
"Bohr",
"Galilei",
"Maxwell"
]
data = []
for i in range(GroupIntoBatchesTest.NUM_ELEMENTS):
index = i % len(scientists)
data.append(("key", scientists[index]))
return data
def test_in_global_window(self):
with TestPipeline() as pipeline:
collection = pipeline \
| beam.Create(GroupIntoBatchesTest._create_test_data()) \
| util.GroupIntoBatches(GroupIntoBatchesTest.BATCH_SIZE)
num_batches = collection | beam.combiners.Count.Globally()
assert_that(
num_batches,
equal_to([
int(
math.ceil(
GroupIntoBatchesTest.NUM_ELEMENTS /
GroupIntoBatchesTest.BATCH_SIZE))
]))
def test_in_global_window_with_synthetic_source(self):
with beam.Pipeline() as pipeline:
collection = (
pipeline
| beam.io.Read(
SyntheticSource({
"numRecords": 10, "keySizeBytes": 1, "valueSizeBytes": 1
}))
| "identical keys" >> beam.Map(lambda x: (None, x[1]))
| "Group key" >> beam.GroupIntoBatches(2)
| "count size" >> beam.Map(lambda x: len(x[1])))
assert_that(collection, equal_to([2, 2, 2, 2, 2]))
def test_with_sharded_key_in_global_window(self):
with TestPipeline() as pipeline:
collection = (
pipeline
| beam.Create(GroupIntoBatchesTest._create_test_data())
| util.GroupIntoBatches.WithShardedKey(
GroupIntoBatchesTest.BATCH_SIZE))
num_batches = collection | beam.combiners.Count.Globally()
assert_that(
num_batches,
equal_to([
int(
math.ceil(
GroupIntoBatchesTest.NUM_ELEMENTS /
GroupIntoBatchesTest.BATCH_SIZE))
]))
def test_buffering_timer_in_fixed_window_streaming(self):
window_duration = 6
max_buffering_duration_secs = 100
start_time = timestamp.Timestamp(0)
    test_stream = (
        TestStream()
        .add_elements([
            TimestampedValue(value, start_time + i)
            for i, value in enumerate(GroupIntoBatchesTest._create_test_data())
        ])
        .advance_processing_time(150)
        .advance_watermark_to(start_time + window_duration)
        .advance_watermark_to(start_time + window_duration + 1)
        .advance_watermark_to_infinity())
with TestPipeline(options=StandardOptions(streaming=True)) as pipeline:
      # To trigger the processing time timer, use a fake clock starting at
      # Timestamp(0).
fake_clock = FakeClock(now=start_time)
num_elements_per_batch = (
pipeline | test_stream
| "fixed window" >> WindowInto(FixedWindows(window_duration))
| util.GroupIntoBatches(
GroupIntoBatchesTest.BATCH_SIZE,
max_buffering_duration_secs,
fake_clock)
| "count elements in batch" >> Map(lambda x: (None, len(x[1])))
| GroupByKey()
| "global window" >> WindowInto(GlobalWindows())
| FlatMapTuple(lambda k, vs: vs))
      # The window duration is 6 and the batch size is 5, so the first batch
      # has 5 elements (flushed because the batch size was reached).
      expected_0 = 5
      # Only one element is left in the window, so the next batch has 1
      # element (flushed because the max buffering duration was reached).
      expected_1 = 1
      # The collection has 10 elements, so 4 remain for the second window
      # (flushed because the end of the window was reached).
      expected_2 = 4
assert_that(
num_elements_per_batch,
equal_to([expected_0, expected_1, expected_2]),
"assert2")
def test_buffering_timer_in_global_window_streaming(self):
max_buffering_duration_secs = 42
start_time = timestamp.Timestamp(0)
test_stream = TestStream().advance_watermark_to(start_time)
for i, value in enumerate(GroupIntoBatchesTest._create_test_data()):
test_stream.add_elements(
[TimestampedValue(value, start_time + i)]) \
.advance_processing_time(5)
test_stream.advance_watermark_to(
start_time + GroupIntoBatchesTest.NUM_ELEMENTS + 1) \
.advance_watermark_to_infinity()
with TestPipeline(options=StandardOptions(streaming=True)) as pipeline:
      # Set a batch size larger than the total number of elements. Since we
      # are in a global window, without the buffering time limit we would
      # wait for all elements to arrive before flushing.
      batch_size = GroupIntoBatchesTest.NUM_ELEMENTS * 2
      # To trigger the processing time timer, use a fake clock starting at
      # Timestamp(0). The fake clock never advances during pipeline
      # execution, so the timer is always set to the same value and fires on
      # every element after the first firing.
      fake_clock = FakeClock(now=start_time)
num_elements_per_batch = (
pipeline | test_stream
| WindowInto(
GlobalWindows(),
trigger=Repeatedly(AfterCount(1)),
accumulation_mode=trigger.AccumulationMode.DISCARDING)
| util.GroupIntoBatches(
batch_size, max_buffering_duration_secs, fake_clock)
| 'count elements in batch' >> Map(lambda x: (None, len(x[1])))
| GroupByKey()
| FlatMapTuple(lambda k, vs: vs))
      # We flush twice: once when the max buffering duration is reached and
      # once when the global window ends.
assert_that(num_elements_per_batch, equal_to([9, 1]))
def test_output_typehints(self):
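# WithShardedKey wraps each key in a ShardedKey, so the inferred output
# type should be Tuple[ShardedKeyType[K], Iterable[V]].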
transform = util.GroupIntoBatches.WithShardedKey(
GroupIntoBatchesTest.BATCH_SIZE)
unused_input_type = typehints.Tuple[str, str]
output_type = transform.infer_output_type(unused_input_type)
self.assertIsInstance(output_type, typehints.TupleConstraint)
k, v = output_type.tuple_types
self.assertIsInstance(k, ShardedKeyType)
self.assertIsInstance(v, typehints.IterableTypeConstraint)
with TestPipeline() as pipeline:
collection = (
pipeline
| beam.Create([((1, 2), 'a'), ((2, 3), 'b')])
| util.GroupIntoBatches.WithShardedKey(
GroupIntoBatchesTest.BATCH_SIZE))
self.assertEqual(
collection.element_type,
typehints.Tuple[
ShardedKeyType[typehints.Tuple[int, int]],  # type: ignore[misc]
typehints.Iterable[str]])
def test_runtime_type_check(self):
options = PipelineOptions()
options.view_as(TypeOptions).runtime_type_check = True
with TestPipeline(options=options) as pipeline:
collection = (
pipeline
| beam.Create(GroupIntoBatchesTest._create_test_data())
| util.GroupIntoBatches(GroupIntoBatchesTest.BATCH_SIZE))
num_batches = collection | beam.combiners.Count.Globally()
assert_that(
num_batches,
equal_to([
int(
math.ceil(
GroupIntoBatchesTest.NUM_ELEMENTS /
GroupIntoBatchesTest.BATCH_SIZE))
]))
def _test_runner_api_round_trip(self, transform, urn):
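# Round-trip the transform through its runner API proto representation:
# check the URN and payload fields, then reconstruct the transform and
# verify that its params survive unchanged.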
context = pipeline_context.PipelineContext()
proto = transform.to_runner_api(context)
self.assertEqual(urn, proto.urn)
payload = (
proto_utils.parse_Bytes(
proto.payload, beam_runner_api_pb2.GroupIntoBatchesPayload))
self.assertEqual(transform.params.batch_size, payload.batch_size)
self.assertEqual(
transform.params.max_buffering_duration_secs * 1000,
payload.max_buffering_duration_millis)
transform_from_proto = (
transform.__class__.from_runner_api_parameter(None, payload, None))
self.assertIsInstance(transform_from_proto, transform.__class__)
self.assertEqual(transform.params, transform_from_proto.params)
def test_runner_api(self):
batch_size = 10
max_buffering_duration_secs = [None, 0, 5]
for duration in max_buffering_duration_secs:
self._test_runner_api_round_trip(
util.GroupIntoBatches(batch_size, duration),
common_urns.group_into_batches_components.GROUP_INTO_BATCHES.urn)
self._test_runner_api_round_trip(
util.GroupIntoBatches(batch_size),
common_urns.group_into_batches_components.GROUP_INTO_BATCHES.urn)
for duration in max_buffering_duration_secs:
self._test_runner_api_round_trip(
util.GroupIntoBatches.WithShardedKey(batch_size, duration),
common_urns.composites.GROUP_INTO_BATCHES_WITH_SHARDED_KEY.urn)
self._test_runner_api_round_trip(
util.GroupIntoBatches.WithShardedKey(batch_size),
common_urns.composites.GROUP_INTO_BATCHES_WITH_SHARDED_KEY.urn)
class ToStringTest(unittest.TestCase):
def test_tostring_elements(self):
with TestPipeline() as p:
result = (p | beam.Create([1, 1, 2, 3]) | util.ToString.Element())
assert_that(result, equal_to(["1", "1", "2", "3"]))
def test_tostring_iterables(self):
with TestPipeline() as p:
result = (
p | beam.Create([("one", "two", "three"), ("four", "five", "six")])
| util.ToString.Iterables())
assert_that(result, equal_to(["one,two,three", "four,five,six"]))
def test_tostring_iterables_with_delimiter(self):
with TestPipeline() as p:
data = [("one", "two", "three"), ("four", "five", "six")]
result = (p | beam.Create(data) | util.ToString.Iterables("\t"))
assert_that(result, equal_to(["one\ttwo\tthree", "four\tfive\tsix"]))
def test_tostring_kvs(self):
with TestPipeline() as p:
result = (p | beam.Create([("one", 1), ("two", 2)]) | util.ToString.Kvs())
assert_that(result, equal_to(["one,1", "two,2"]))
def test_tostring_kvs_delimiter(self):
with TestPipeline() as p:
result = (
p | beam.Create([("one", 1), ("two", 2)]) | util.ToString.Kvs("\t"))
assert_that(result, equal_to(["one\t1", "two\t2"]))
def test_tostring_kvs_empty_delimiter(self):
with TestPipeline() as p:
result = (
p | beam.Create([("one", 1), ("two", 2)]) | util.ToString.Kvs(""))
assert_that(result, equal_to(["one1", "two2"]))
class LogElementsTest(unittest.TestCase):
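# Note: pytest binds fixtures defined inside a TestCase as methods, so
# the first parameter (named `request` here, conventionally `self`)
# receives the test instance. Setting `request.captured_stdout` in the
# fixture therefore makes the captured output available as
# `self.captured_stdout` in the tests below.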
@pytest.fixture(scope="function")
def _capture_stdout_log(request, capsys):
with TestPipeline() as p:
result = (
p | beam.Create([
TimestampedValue(
"event",
datetime(2022, 10, 1, 0, 0, 0, 0,
tzinfo=pytz.UTC).timestamp()),
TimestampedValue(
"event",
datetime(2022, 10, 2, 0, 0, 0, 0,
tzinfo=pytz.UTC).timestamp()),
])
| beam.WindowInto(FixedWindows(60))
| util.LogElements(
prefix='prefix_',
with_window=True,
with_timestamp=True,
with_pane_info=True))
request.captured_stdout = capsys.readouterr().out
return result
@pytest.mark.usefixtures("_capture_stdout_log")
def test_stdout_logs(self):
assert self.captured_stdout == \
("prefix_event, timestamp='2022-10-01T00:00:00Z', "
"window(start=2022-10-01T00:00:00Z, end=2022-10-01T00:01:00Z), "
"pane_info=PaneInfo(first: True, last: True, timing: UNKNOWN, "
"index: 0, nonspeculative_index: 0)\n"
"prefix_event, timestamp='2022-10-02T00:00:00Z', "
"window(start=2022-10-02T00:00:00Z, end=2022-10-02T00:01:00Z), "
"pane_info=PaneInfo(first: True, last: True, timing: UNKNOWN, "
"index: 0, nonspeculative_index: 0)\n"), \
f'Received from stdout: {self.captured_stdout}'
@pytest.fixture(scope="function")
def _capture_stdout_log_without_rfc3339(request, capsys):
with TestPipeline() as p:
result = (
p | beam.Create([
TimestampedValue(
"event",
datetime(2022, 10, 1, 0, 0, 0, 0,
tzinfo=pytz.UTC).timestamp()),
TimestampedValue(
"event",
datetime(2022, 10, 2, 0, 0, 0, 0,
tzinfo=pytz.UTC).timestamp()),
])
| beam.WindowInto(FixedWindows(60))
| util.LogElements(
prefix='prefix_',
with_window=True,
with_timestamp=True,
use_epoch_time=True))
request.captured_stdout = capsys.readouterr().out
return result
@pytest.mark.usefixtures("_capture_stdout_log_without_rfc3339")
def test_stdout_logs_without_rfc3339(self):
assert self.captured_stdout == \
("prefix_event, timestamp=1664582400, "
"window(start=1664582400, end=1664582460)\n"
"prefix_event, timestamp=1664668800, "
"window(start=1664668800, end=1664668860)\n"), \
f'Received from stdout: {self.captured_stdout}'
def test_ptransform_output(self):
with TestPipeline() as p:
result = (
p
| beam.Create(['a', 'b', 'c'])
| util.LogElements(prefix='prefix_'))
assert_that(result, equal_to(['a', 'b', 'c']))
@pytest.fixture(scope="function")
def _capture_logs(request, caplog):
with caplog.at_level(logging.INFO):
with TestPipeline() as p:
_ = (
p | "info" >> beam.Create(["element"])
| "I" >> beam.LogElements(prefix='info_', level=logging.INFO))
_ = (
p | "warning" >> beam.Create(["element"])
| "W" >> beam.LogElements(prefix='warning_', level=logging.WARNING))
_ = (
p | "error" >> beam.Create(["element"])
| "E" >> beam.LogElements(prefix='error_', level=logging.ERROR))
request.captured_log = caplog.text
@pytest.mark.usefixtures("_capture_logs")
def test_setting_level_uses_appropriate_log_channel(self):
self.assertTrue(
re.compile('INFO(.*)info_element').search(self.captured_log))
self.assertTrue(
re.compile('WARNING(.*)warning_element').search(self.captured_log))
self.assertTrue(
re.compile('ERROR(.*)error_element').search(self.captured_log))
class ReifyTest(unittest.TestCase):
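# util.Reify surfaces implicit element metadata (timestamps, windows) in
# the data itself: Timestamp and Window operate on whole elements, while
# TimestampInValue and WindowInValue operate on the value of a
# (key, value) pair, as the expected outputs below illustrate.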
def test_timestamp(self):
l = [
TimestampedValue('a', 100),
TimestampedValue('b', 200),
TimestampedValue('c', 300)
]
expected = [
TestWindowedValue('a', 100, [GlobalWindow()]),
TestWindowedValue('b', 200, [GlobalWindow()]),
TestWindowedValue('c', 300, [GlobalWindow()])
]
with TestPipeline() as p:
# Map(lambda x: x) PTransform is added after Create here, because when
# a PCollection of TimestampedValues is created with the Create
# PTransform, the timestamps are not assigned to it. Adding a Map forces
# the PCollection to go through a DoFn so that the PCollection consists
# of the elements with timestamps assigned to them instead of a
# PCollection of TimestampedValue(element, timestamp).
pc = p | beam.Create(l) | beam.Map(lambda x: x)
reified_pc = pc | util.Reify.Timestamp()
assert_that(reified_pc, equal_to(expected), reify_windows=True)
def test_window(self):
l = [
GlobalWindows.windowed_value('a', 100),
GlobalWindows.windowed_value('b', 200),
GlobalWindows.windowed_value('c', 300)
]
expected = [
TestWindowedValue(('a', 100, GlobalWindow()), 100, [GlobalWindow()]),
TestWindowedValue(('b', 200, GlobalWindow()), 200, [GlobalWindow()]),
TestWindowedValue(('c', 300, GlobalWindow()), 300, [GlobalWindow()])
]
with TestPipeline() as p:
pc = p | beam.Create(l)
# Map(lambda x: x) PTransform is added after Create here, because when
# a PCollection of WindowedValues is created with the Create PTransform,
# the windows are not assigned to it. Adding a Map forces the
# PCollection to go through a DoFn so that the PCollection consists of
# the elements with windows and timestamps assigned to them instead of
# a PCollection of WindowedValue(element, timestamp, window).
pc = pc | beam.Map(lambda x: x)
reified_pc = pc | util.Reify.Window()
assert_that(reified_pc, equal_to(expected), reify_windows=True)
def test_timestamp_in_value(self):
l = [
TimestampedValue(('a', 1), 100),
TimestampedValue(('b', 2), 200),
TimestampedValue(('c', 3), 300)
]
expected = [
TestWindowedValue(('a', TimestampedValue(1, 100)),
100, [GlobalWindow()]),
TestWindowedValue(('b', TimestampedValue(2, 200)),
200, [GlobalWindow()]),
TestWindowedValue(('c', TimestampedValue(3, 300)),
300, [GlobalWindow()])
]
with TestPipeline() as p:
pc = p | beam.Create(l) | beam.Map(lambda x: x)
reified_pc = pc | util.Reify.TimestampInValue()
assert_that(reified_pc, equal_to(expected), reify_windows=True)
def test_window_in_value(self):
l = [
GlobalWindows.windowed_value(('a', 1), 100),
GlobalWindows.windowed_value(('b', 2), 200),
GlobalWindows.windowed_value(('c', 3), 300)
]
expected = [
TestWindowedValue(('a', (1, 100, GlobalWindow())),
100, [GlobalWindow()]),
TestWindowedValue(('b', (2, 200, GlobalWindow())),
200, [GlobalWindow()]),
TestWindowedValue(('c', (3, 300, GlobalWindow())),
300, [GlobalWindow()])
]
with TestPipeline() as p:
# Map(lambda x: x) hack is used for the same reason here.
# Also, this makes the typehint on Reify.WindowInValue work.
pc = p | beam.Create(l) | beam.Map(lambda x: x)
reified_pc = pc | util.Reify.WindowInValue()
assert_that(reified_pc, equal_to(expected), reify_windows=True)
class RegexTest(unittest.TestCase):
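# util.Regex mirrors Python's re semantics: find searches anywhere in
# the element (like re.search), while matches only matches at the start
# of the element (like re.match).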
def test_find(self):
with TestPipeline() as p:
result = (
p | beam.Create(["aj", "xj", "yj", "zj"])
| util.Regex.find("[xyz]"))
assert_that(result, equal_to(["x", "y", "z"]))
def test_find_pattern(self):
with TestPipeline() as p:
rc = re.compile("[xyz]")
result = (p | beam.Create(["aj", "xj", "yj", "zj"]) | util.Regex.find(rc))
assert_that(result, equal_to(["x", "y", "z"]))
def test_find_group(self):
with TestPipeline() as p:
result = (
p | beam.Create(["aj", "xj", "yj", "zj"])
| util.Regex.find("([xyz])j", group=1))
assert_that(result, equal_to(["x", "y", "z"]))
def test_find_empty(self):
with TestPipeline() as p:
result = (
p | beam.Create(["a", "b", "c", "d"])
| util.Regex.find("[xyz]"))
assert_that(result, equal_to([]))
def test_find_group_name(self):
with TestPipeline() as p:
result = (
p | beam.Create(["aj", "xj", "yj", "zj"])
| util.Regex.find("(?P<namedgroup>[xyz])j", group="namedgroup"))
assert_that(result, equal_to(["x", "y", "z"]))
def test_find_group_name_pattern(self):
with TestPipeline() as p:
rc = re.compile("(?P<namedgroup>[xyz])j")
result = (
p | beam.Create(["aj", "xj", "yj", "zj"])
| util.Regex.find(rc, group="namedgroup"))
assert_that(result, equal_to(["x", "y", "z"]))
def test_find_all_groups(self):
data = ["abb ax abbb", "abc qwerty abcabcd xyz"]
with TestPipeline() as p:
pcol = (p | beam.Create(data))
assert_that(
pcol | 'with default values' >> util.Regex.find_all('a(b*)'),
equal_to([['abb', 'a', 'abbb'], ['ab', 'ab', 'ab']]),
label='CheckWithDefaultValues')
assert_that(
pcol | 'group 1' >> util.Regex.find_all('a(b*)', 1),
equal_to([['b', 'b', 'b'], ['bb', '', 'bbb']]),
label='CheckWithGroup1')
assert_that(
pcol | 'group 1 non empty' >> util.Regex.find_all(
'a(b*)', 1, outputEmpty=False),
equal_to([['b', 'b', 'b'], ['bb', 'bbb']]),
label='CheckGroup1NonEmpty')
assert_that(
pcol | 'named group' >> util.Regex.find_all(
'a(?P<namedgroup>b*)', 'namedgroup'),
equal_to([['b', 'b', 'b'], ['bb', '', 'bbb']]),
label='CheckNamedGroup')
assert_that(
pcol | 'all groups' >> util.Regex.find_all(
'a(?P<namedgroup>b*)', util.Regex.ALL),
equal_to([[('ab', 'b'), ('ab', 'b'), ('ab', 'b')],
[('abb', 'bb'), ('a', ''), ('abbb', 'bbb')]]),
label='CheckAllGroups')
assert_that(
pcol | 'all non empty groups' >> util.Regex.find_all(
'a(b*)', util.Regex.ALL, outputEmpty=False),
equal_to([[('ab', 'b'), ('ab', 'b'), ('ab', 'b')],
[('abb', 'bb'), ('abbb', 'bbb')]]),
label='CheckAllNonEmptyGroups')
def test_find_kv(self):
with TestPipeline() as p:
pcol = (p | beam.Create(['a b c d']))
assert_that(
pcol | 'key 1' >> util.Regex.find_kv(
'a (b) (c)',
1,
),
equal_to([('b', 'a b c')]),
label='CheckKey1')
assert_that(
pcol | 'key 1 group 1' >> util.Regex.find_kv('a (b) (c)', 1, 2),
equal_to([('b', 'c')]),
label='CheckKey1Group1')
def test_find_kv_pattern(self):
with TestPipeline() as p:
rc = re.compile("a (b) (c)")
result = (p | beam.Create(["a b c"]) | util.Regex.find_kv(rc, 1, 2))
assert_that(result, equal_to([("b", "c")]))
def test_find_kv_none(self):
with TestPipeline() as p:
result = (
p | beam.Create(["x y z"])
| util.Regex.find_kv("a (b) (c)", 1, 2))
assert_that(result, equal_to([]))
def test_match(self):
with TestPipeline() as p:
result = (
p | beam.Create(["a", "x", "y", "z"])
| util.Regex.matches("[xyz]"))
assert_that(result, equal_to(["x", "y", "z"]))
with TestPipeline() as p:
result = (
p | beam.Create(["a", "ax", "yby", "zzc"])
| util.Regex.matches("[xyz]"))
assert_that(result, equal_to(["y", "z"]))
def test_match_entire_line(self):
with TestPipeline() as p:
result = (
p | beam.Create(["a", "x", "y", "ay", "zz"])
| util.Regex.matches("[xyz]$"))
assert_that(result, equal_to(["x", "y"]))
def test_match_pattern(self):
with TestPipeline() as p:
rc = re.compile("[xyz]")
result = (p | beam.Create(["a", "x", "y", "z"]) | util.Regex.matches(rc))
assert_that(result, equal_to(["x", "y", "z"]))
def test_match_none(self):
with TestPipeline() as p:
result = (
p | beam.Create(["a", "b", "c", "d"])
| util.Regex.matches("[xyz]"))
assert_that(result, equal_to([]))
def test_match_group(self):
with TestPipeline() as p:
result = (
p | beam.Create(["a", "x xxx", "x yyy", "x zzz"])
| util.Regex.matches("x ([xyz]*)", 1))
assert_that(result, equal_to(["xxx", "yyy", "zzz"]))
def test_match_group_name(self):
with TestPipeline() as p:
result = (
p | beam.Create(["a", "x xxx", "x yyy", "x zzz"])
| util.Regex.matches("x (?P<namedgroup>[xyz]*)", 'namedgroup'))
assert_that(result, equal_to(["xxx", "yyy", "zzz"]))
def test_match_group_name_pattern(self):
with TestPipeline() as p:
rc = re.compile("x (?P<namedgroup>[xyz]*)")
result = (
p | beam.Create(["a", "x xxx", "x yyy", "x zzz"])
| util.Regex.matches(rc, 'namedgroup'))
assert_that(result, equal_to(["xxx", "yyy", "zzz"]))
def test_match_group_empty(self):
with TestPipeline() as p:
result = (
p | beam.Create(["a", "b", "c", "d"])
| util.Regex.matches("x (?P<namedgroup>[xyz]*)", 'namedgroup'))
assert_that(result, equal_to([]))
def test_all_matched(self):
with TestPipeline() as p:
result = (
p | beam.Create(["a x", "x x", "y y", "z z"])
| util.Regex.all_matches("([xyz]) ([xyz])"))
expected_result = [["x x", "x", "x"], ["y y", "y", "y"],
["z z", "z", "z"]]
assert_that(result, equal_to(expected_result))
def test_all_matched_pattern(self):
with TestPipeline() as p:
rc = re.compile("([xyz]) ([xyz])")
result = (
p | beam.Create(["a x", "x x", "y y", "z z"])
| util.Regex.all_matches(rc))
expected_result = [["x x", "x", "x"], ["y y", "y", "y"],
["z z", "z", "z"]]
assert_that(result, equal_to(expected_result))
def test_match_group_kv(self):
with TestPipeline() as p:
result = (
p | beam.Create(["a b c"])
| util.Regex.matches_kv("a (b) (c)", 1, 2))
assert_that(result, equal_to([("b", "c")]))
def test_match_group_kv_pattern(self):
with TestPipeline() as p:
rc = re.compile("a (b) (c)")
pcol = (p | beam.Create(["a b c"]))
assert_that(
pcol | 'key 1' >> util.Regex.matches_kv(rc, 1),
equal_to([("b", "a b c")]),
label="CheckKey1")
assert_that(
pcol | 'key 1 group 2' >> util.Regex.matches_kv(rc, 1, 2),
equal_to([("b", "c")]),
label="CheckKey1Group2")
def test_match_group_kv_none(self):
with TestPipeline() as p:
result = (
p | beam.Create(["x y z"])
| util.Regex.matches_kv("a (b) (c)", 1, 2))
assert_that(result, equal_to([]))
def test_match_kv_group_names(self):
with TestPipeline() as p:
result = (
p | beam.Create(["a b c"]) | util.Regex.matches_kv(
"a (?P<keyname>b) (?P<valuename>c)", 'keyname', 'valuename'))
assert_that(result, equal_to([("b", "c")]))
def test_match_kv_group_names_pattern(self):
with TestPipeline() as p:
rc = re.compile("a (?P<keyname>b) (?P<valuename>c)")
result = (
p | beam.Create(["a b c"])
| util.Regex.matches_kv(rc, 'keyname', 'valuename'))
assert_that(result, equal_to([("b", "c")]))
def test_match_kv_group_name_none(self):
with TestPipeline() as p:
result = (
p | beam.Create(["x y z"]) | util.Regex.matches_kv(
"a (?P<keyname>b) (?P<valuename>c)", 'keyname', 'valuename'))
assert_that(result, equal_to([]))
def test_replace_all(self):
with TestPipeline() as p:
result = (
p | beam.Create(["xj", "yj", "zj"])
| util.Regex.replace_all("[xyz]", "new"))
assert_that(result, equal_to(["newj", "newj", "newj"]))
def test_replace_all_mixed(self):
with TestPipeline() as p:
result = (
p | beam.Create(["abc", "xj", "yj", "zj", "def"])
| util.Regex.replace_all("[xyz]", 'new'))
assert_that(result, equal_to(["abc", "newj", "newj", "newj", "def"]))
def test_replace_all_mixed_pattern(self):
with TestPipeline() as p:
rc = re.compile("[xyz]")
result = (
p | beam.Create(["abc", "xj", "yj", "zj", "def"])
| util.Regex.replace_all(rc, 'new'))
assert_that(result, equal_to(["abc", "newj", "newj", "newj", "def"]))
def test_replace_first(self):
with TestPipeline() as p:
result = (
p | beam.Create(["xjx", "yjy", "zjz"])
| util.Regex.replace_first("[xyz]", 'new'))
assert_that(result, equal_to(["newjx", "newjy", "newjz"]))
def test_replace_first_mixed(self):
with TestPipeline() as p:
result = (
p | beam.Create(["abc", "xjx", "yjy", "zjz", "def"])
| util.Regex.replace_first("[xyz]", 'new'))
assert_that(result, equal_to(["abc", "newjx", "newjy", "newjz", "def"]))
def test_replace_first_mixed_pattern(self):
with TestPipeline() as p:
rc = re.compile("[xyz]")
result = (
p | beam.Create(["abc", "xjx", "yjy", "zjz", "def"])
| util.Regex.replace_first(rc, 'new'))
assert_that(result, equal_to(["abc", "newjx", "newjy", "newjz", "def"]))
def test_split(self):
with TestPipeline() as p:
data = ["The quick brown fox jumps over the lazy dog"]
result = (p | beam.Create(data) | util.Regex.split("\\W+"))
expected_result = [[
"The", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"
]]
assert_that(result, equal_to(expected_result))
def test_split_pattern(self):
with TestPipeline() as p:
data = ["The quick brown fox jumps over the lazy dog"]
rc = re.compile("\\W+")
result = (p | beam.Create(data) | util.Regex.split(rc))
expected_result = [[
"The", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"
]]
assert_that(result, equal_to(expected_result))
def test_split_with_empty(self):
with TestPipeline() as p:
data = ["The quick brown fox jumps over the lazy dog"]
result = (p | beam.Create(data) | util.Regex.split("\\s", True))
expected_result = [[
'The',
'',
'quick',
'',
'',
'brown',
'fox',
'jumps',
'over',
'',
'',
'',
'the',
'lazy',
'dog'
]]
assert_that(result, equal_to(expected_result))
def test_split_without_empty(self):
with TestPipeline() as p:
data = ["The quick brown fox jumps over the lazy dog"]
result = (p | beam.Create(data) | util.Regex.split("\\s", False))
expected_result = [[
"The", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"
]]
assert_that(result, equal_to(expected_result))
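# beam.Tee applies the given transform (or callable) to a PCollection
# for its side effects while passing the original PCollection through
# unchanged, analogous to the Unix tee command.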
class TeeTest(unittest.TestCase):
_side_effects: Mapping[str, int] = collections.defaultdict(int)
def test_tee(self):
# The imports here are to avoid issues with the class (and its attributes)
# possibly being pickled rather than referenced.
def cause_side_effect(element):
importlib.import_module(__name__).TeeTest._side_effects[element] += 1
def count_side_effects(element):
return importlib.import_module(__name__).TeeTest._side_effects[element]
with TestPipeline() as p:
result = (
p
| beam.Create(['a', 'b', 'c'])
| 'TeePTransform' >> beam.Tee(beam.Map(cause_side_effect))
| 'TeeCallable' >> beam.Tee(
lambda pcoll: pcoll | beam.Map(
lambda element: cause_side_effect('X' + element))))
assert_that(result, equal_to(['a', 'b', 'c']))
self.assertEqual(count_side_effects('a'), 1)
self.assertEqual(count_side_effects('Xa'), 1)
class WaitOnTest(unittest.TestCase):
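# util.WaitOn holds back its main input until the given signal
# PCollections have been computed. The globally increasing counter
# records processing order, so the assertions below verify that x is
# processed before y, y before z, and both before result.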
def test_wait_on(self):
# We need a shared reference that survives pickling.
def increment_global_counter():
try:
value = getattr(beam, '_WAIT_ON_TEST_COUNTER', 0)
return value
finally:
setattr(beam, '_WAIT_ON_TEST_COUNTER', value + 1)
def record(tag):
return f'Record({tag})' >> beam.Map(
lambda x: (x[0], tag, increment_global_counter()))
with TestPipeline() as p:
start = p | beam.Create([(None, ), (None, )])
x = start | record('x')
y = start | 'WaitForX' >> util.WaitOn(x) | record('y')
z = start | 'WaitForY' >> util.WaitOn(y) | record('z')
result = x | 'WaitForYZ' >> util.WaitOn(y, z) | record('result')
assert_that(x, equal_to([(None, 'x', 0), (None, 'x', 1)]), label='x')
assert_that(y, equal_to([(None, 'y', 2), (None, 'y', 3)]), label='y')
assert_that(z, equal_to([(None, 'z', 4), (None, 'z', 5)]), label='z')
assert_that(
result,
equal_to([(None, 'result', 6), (None, 'result', 7)]),
label='result')
class CompatCheckTest(unittest.TestCase):
def test_is_v1_prior_to_v2(self):
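# is_v1_prior_to_v2 compares dotted numeric version strings, treating
# missing components as zero (so "1.0" equals "1.0.0"); it returns
# False when v1 is None.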
test_cases = [
# Basic comparison cases
("1.0.0", "2.0.0", True), # v1 < v2 in major
("2.0.0", "1.0.0", False), # v1 > v2 in major
("1.1.0", "1.2.0", True), # v1 < v2 in minor
("1.2.0", "1.1.0", False), # v1 > v2 in minor
("1.0.1", "1.0.2", True), # v1 < v2 in patch
("1.0.2", "1.0.1", False), # v1 > v2 in patch
# Equal versions
("1.0.0", "1.0.0", False), # Identical
("0.0.0", "0.0.0", False), # Both zero
# Different lengths - shorter vs longer
("1.0", "1.0.0", False), # Should be equal (1.0 = 1.0.0)
("1.0", "1.0.1", True), # 1.0.0 < 1.0.1
("1.2", "1.2.0", False), # Should be equal (1.2 = 1.2.0)
("1.2", "1.2.3", True), # 1.2.0 < 1.2.3
("2", "2.0.0", False), # Should be equal (2 = 2.0.0)
("2", "2.0.1", True), # 2.0.0 < 2.0.1
("1", "2.0", True), # 1.0.0 < 2.0.0
# Different lengths - longer vs shorter
("1.0.0", "1.0", False), # Should be equal
("1.0.1", "1.0", False), # 1.0.1 > 1.0.0
("1.2.0", "1.2", False), # Should be equal
("1.2.3", "1.2", False), # 1.2.3 > 1.2.0
("2.0.0", "2", False), # Should be equal
("2.0.1", "2", False), # 2.0.1 > 2.0.0
("2.0", "1", False), # 2.0.0 > 1.0.0
# Mixed length comparisons
("1.0", "2.0.0", True), # 1.0.0 < 2.0.0
("2.0", "1.0.0", False), # 2.0.0 > 1.0.0
("1", "1.0.1", True), # 1.0.0 < 1.0.1
("1.1", "1.0.9", False), # 1.1.0 > 1.0.9
# Large numbers
("1.9.9", "2.0.0", True), # 1.9.9 < 2.0.0
("10.0.0", "9.9.9", False), # 10.0.0 > 9.9.9
("1.10.0", "1.9.0", False), # 1.10.0 > 1.9.0
("1.2.10", "1.2.9", False), # 1.2.10 > 1.2.9
# Sequential versions
("1.0.0", "1.0.1", True),
("1.0.1", "1.0.2", True),
("1.0.9", "1.1.0", True),
("1.9.9", "2.0.0", True),
# Null/None cases
(None, "1.0.0", False), # v1 is None
]
for v1, v2, expected in test_cases:
self.assertEqual(
util.is_v1_prior_to_v2(v1=v1, v2=v2),
expected,
msg=f"Failed {v1} < {v2} == {expected}")
if __name__ == '__main__':
logging.getLogger().setLevel(logging.INFO)
unittest.main()