sdks/python/apache_beam/testing/test_stream_it_test.py - beam - Git at Google

 #
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #

 """Integration tests for the test_stream module."""

 # pytype: skip-file

 import unittest
 from functools import wraps

 import pytest

 import apache_beam as beam
 from apache_beam.options.pipeline_options import StandardOptions
 from apache_beam.testing.test_pipeline import TestPipeline
 from apache_beam.testing.test_stream import TestStream
 from apache_beam.testing.util import assert_that
 from apache_beam.testing.util import equal_to
 from apache_beam.testing.util import equal_to_per_window
 from apache_beam.transforms import trigger
 from apache_beam.transforms import window
 from apache_beam.transforms.window import FixedWindows
 from apache_beam.transforms.window import TimestampedValue
 from apache_beam.utils import timestamp
 from apache_beam.utils.timestamp import Timestamp


 def supported(runners):
   """Returns a decorator that skips tests on unsupported runners.

   Args:
     runners: a single runner class name, or a list, tuple or set of runner
       class names, on which the decorated test is allowed to run.

   Returns:
     A decorator for test methods taking only ``self``. The decorated method
     compares ``self.runner_name`` against ``runners``: on a match it runs the
     test and returns its result, otherwise it calls ``self.skipTest``.
   """
   if isinstance(runners, (tuple, set, frozenset)):
     # Expand other collections element-wise; wrapping them whole would yield a
     # one-element list that no runner name could ever match.
     runners = list(runners)
   elif not isinstance(runners, list):
     # Anything else (typically one runner name) becomes a one-element list.
     runners = [runners]

   def inner(fn):
     @wraps(fn)
     def wrapped(self):
       if self.runner_name in runners:
         return fn(self)
       self.skipTest(
           'The "{}" does not support the TestStream transform. '
           'Supported runners: {}'.format(self.runner_name, runners))
       return None

     return wrapped

   return inner


 class TestStreamIntegrationTests(unittest.TestCase):
   """Integration tests that drive pipelines from a TestStream source.

   Every test is gated by ``supported`` to the DirectRunner and
   SwitchingDirectRunner and carries the ``it_postcommit`` pytest marker.
   """
   @classmethod
   def setUpClass(cls):
     """Caches the integration pipeline, its args, runner name and project."""
     pipeline = TestPipeline(is_integration_test=True)
     cls.test_pipeline = pipeline
     cls.args = pipeline.get_full_options_as_args()
     # The ``supported`` decorator reads this name to decide whether to skip.
     cls.runner_name = type(pipeline.runner).__name__
     cls.project = pipeline.get_option('project')

   @supported(['DirectRunner', 'SwitchingDirectRunner'])
   @pytest.mark.it_postcommit
   def test_basic_execution(self):
     """Tests that a single-output TestStream emits the expected records."""
     class RecordFn(beam.DoFn):
       """Pairs every element with its timestamp."""
       def process(
           self,
           element=beam.DoFn.ElementParam,
           timestamp=beam.DoFn.TimestampParam):
         yield (element, timestamp)

     stream = (
         TestStream()
         .advance_watermark_to(10)
         .add_elements(['a', 'b', 'c'])
         .advance_watermark_to(20)
         .add_elements(['d'])
         .add_elements(['e'])
         .advance_processing_time(10)
         .advance_watermark_to(300)
         .add_elements([TimestampedValue('late', 12)])
         .add_elements([TimestampedValue('last', 310)])
         .advance_watermark_to_infinity())

     # Elements added without an explicit timestamp are expected to carry the
     # watermark value that was current when they were added.
     expected = [(value, Timestamp(seconds)) for value, seconds in (
         ('a', 10), ('b', 10), ('c', 10), ('d', 20), ('e', 20),
         ('late', 12), ('last', 310))]

     with beam.Pipeline(argv=self.args) as pipeline:
       recorded = pipeline | stream | beam.ParDo(RecordFn())
       assert_that(recorded, equal_to(expected))

   @supported(['DirectRunner', 'SwitchingDirectRunner'])
   @pytest.mark.it_postcommit
   def test_multiple_outputs(self):
     """Tests that the TestStream supports emitting to multiple PCollections."""
     class RecordFn(beam.DoFn):
       """Pairs every element with its timestamp."""
       def process(
           self,
           element=beam.DoFn.ElementParam,
           timestamp=beam.DoFn.TimestampParam):
         yield (element, timestamp)

     # (value, event time in seconds) pairs for each tagged output.
     letter_stamps = [('a', 6), ('b', 7), ('c', 8)]
     number_stamps = [('1', 11), ('2', 12), ('3', 13)]

     stream = (
         TestStream()
         .advance_watermark_to(5, tag='letters')
         .add_elements(
             [TimestampedValue(v, t) for v, t in letter_stamps], tag='letters')
         .advance_watermark_to(10, tag='numbers')
         .add_elements(
             [TimestampedValue(v, t) for v, t in number_stamps], tag='numbers'))

     pipeline = TestPipeline(
         is_integration_test=True, options=StandardOptions(streaming=True))

     outputs = pipeline | stream
     letters = outputs['letters'] | 'record letters' >> beam.ParDo(RecordFn())
     numbers = outputs['numbers'] | 'record numbers' >> beam.ParDo(RecordFn())

     assert_that(
         letters,
         equal_to([(v, Timestamp(t)) for v, t in letter_stamps]),
         label='assert letters')
     assert_that(
         numbers,
         equal_to([(v, Timestamp(t)) for v, t in number_stamps]),
         label='assert numbers')

     pipeline.run()

   @supported(['DirectRunner', 'SwitchingDirectRunner'])
   @pytest.mark.it_postcommit
   def test_multiple_outputs_with_watermark_advancement(self):
     """Tests that the TestStream can independently control output watermarks."""

     # The 'numbers' watermark is deliberately moved to 20 before the 'letters'
     # watermark is moved to 5, to show that watermarks advance per PCollection.
     #
     # Two PCollections result, (a, b, c) and (1, 2, 3), emitted at different
     # times so that they fall into different windows; the windows are what
     # verify the watermark advancement. Had the watermark not advanced, the
     # windows would be [-inf, -inf). Had the watermarks not advanced
     # separately, both PCollections would be windowed in [15, 30).
     letter_values = [
         TimestampedValue(v, t) for v, t in (('a', 6), ('b', 7), ('c', 8))
     ]
     number_values = [
         TimestampedValue(v, t) for v, t in (('1', 21), ('2', 22), ('3', 23))
     ]
     stream = (
         TestStream()
         .advance_watermark_to(0, tag='letters')
         .advance_watermark_to(0, tag='numbers')
         .advance_watermark_to(20, tag='numbers')
         .advance_watermark_to(5, tag='letters')
         .add_elements(letter_values, tag='letters')
         .advance_watermark_to(10, tag='letters')
         .add_elements(number_values, tag='numbers')
         .advance_watermark_to(30, tag='numbers'))

     pipeline = TestPipeline(
         is_integration_test=True, options=StandardOptions(streaming=True))

     outputs = pipeline | stream

     # An AfterWatermark trigger with an early firing checks both that the
     # watermark advances properly and that elements land in the right window.
     windowed_letters = outputs['letters'] | 'letter windows' >> beam.WindowInto(
         FixedWindows(15),
         trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
         accumulation_mode=trigger.AccumulationMode.DISCARDING)
     keyed_letters = (
         windowed_letters
         | 'letter with key' >> beam.Map(lambda value: ('k', value)))
     letters = keyed_letters | 'letter gbk' >> beam.GroupByKey()

     windowed_numbers = outputs['numbers'] | 'number windows' >> beam.WindowInto(
         FixedWindows(15),
         trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
         accumulation_mode=trigger.AccumulationMode.DISCARDING)
     keyed_numbers = (
         windowed_numbers
         | 'number with key' >> beam.Map(lambda value: ('k', value)))
     numbers = keyed_numbers | 'number gbk' >> beam.GroupByKey()

     # The letters were emitted while their watermark was at 5, so they belong
     # to the [0, 15) window. The early trigger is there to make sure the
     # TestStream also produces the empty ON_TIME pane: the early firing emits
     # the elements before the window ends, and the discarding accumulation
     # mode drops them afterwards, leaving that pane without data.
     assert_that(
         letters,
         equal_to_per_window({
             window.IntervalWindow(0, 15): [
                 ('k', ['a', 'b', 'c']),
                 ('k', []),
             ],
         }),
         label='letters assert per window')

     # Likewise for the numbers, which were emitted at watermark = 20 and so
     # belong to the [15, 30) window.
     assert_that(
         numbers,
         equal_to_per_window({
             window.IntervalWindow(15, 30): [
                 ('k', ['1', '2', '3']),
                 ('k', []),
             ],
         }),
         label='numbers assert per window')

     pipeline.run()


 # Run this module's tests with the unittest runner when executed as a script.
 if __name__ == '__main__':
   unittest.main()
	#
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	#

	"""Integration tests for the test_stream module."""

	# pytype: skip-file

	import unittest
	from functools import wraps

	import pytest

	import apache_beam as beam
	from apache_beam.options.pipeline_options import StandardOptions
	from apache_beam.testing.test_pipeline import TestPipeline
	from apache_beam.testing.test_stream import TestStream
	from apache_beam.testing.util import assert_that
	from apache_beam.testing.util import equal_to
	from apache_beam.testing.util import equal_to_per_window
	from apache_beam.transforms import trigger
	from apache_beam.transforms import window
	from apache_beam.transforms.window import FixedWindows
	from apache_beam.transforms.window import TimestampedValue
	from apache_beam.utils import timestamp
	from apache_beam.utils.timestamp import Timestamp


	def supported(runners):
	  """Returns a decorator that skips tests on unsupported runners.

	  Args:
	    runners: a single runner class name, or a list, tuple or set of runner
	      class names, on which the decorated test is allowed to run.

	  Returns:
	    A decorator for test methods taking only ``self``. The decorated method
	    compares ``self.runner_name`` against ``runners``: on a match it runs the
	    test and returns its result, otherwise it calls ``self.skipTest``.
	  """
	  if isinstance(runners, (tuple, set, frozenset)):
	    # Expand other collections element-wise; wrapping them whole would yield
	    # a one-element list that no runner name could ever match.
	    runners = list(runners)
	  elif not isinstance(runners, list):
	    # Anything else (typically one runner name) becomes a one-element list.
	    runners = [runners]

	  def inner(fn):
	    @wraps(fn)
	    def wrapped(self):
	      if self.runner_name in runners:
	        return fn(self)
	      self.skipTest(
	          'The "{}" does not support the TestStream transform. '
	          'Supported runners: {}'.format(self.runner_name, runners))
	      return None

	    return wrapped

	  return inner


	class TestStreamIntegrationTests(unittest.TestCase):
	  """Integration tests that drive pipelines from a TestStream source.

	  Every test is gated by ``supported`` to the DirectRunner and
	  SwitchingDirectRunner and carries the ``it_postcommit`` pytest marker.
	  """
	  @classmethod
	  def setUpClass(cls):
	    """Caches the integration pipeline, its args, runner name and project."""
	    cls.test_pipeline = TestPipeline(is_integration_test=True)
	    cls.args = cls.test_pipeline.get_full_options_as_args()
	    # The ``supported`` decorator reads this name to decide whether to skip.
	    cls.runner_name = type(cls.test_pipeline.runner).__name__
	    cls.project = cls.test_pipeline.get_option('project')

	  @supported(['DirectRunner', 'SwitchingDirectRunner'])
	  @pytest.mark.it_postcommit
	  def test_basic_execution(self):
	    """Tests that a single-output TestStream emits the expected records."""
	    test_stream = (
	        TestStream().advance_watermark_to(10).add_elements([
	            'a', 'b', 'c'
	        ]).advance_watermark_to(20).add_elements(['d']).add_elements([
	            'e'
	        ]).advance_processing_time(10).advance_watermark_to(300).add_elements([
	            TimestampedValue('late', 12)
	        ]).add_elements([TimestampedValue('last', 310)
	                         ]).advance_watermark_to_infinity())

	    class RecordFn(beam.DoFn):
	      """Pairs every element with its timestamp."""
	      def process(
	          self,
	          element=beam.DoFn.ElementParam,
	          timestamp=beam.DoFn.TimestampParam):
	        yield (element, timestamp)

	    with beam.Pipeline(argv=self.args) as p:
	      my_record_fn = RecordFn()
	      records = p | test_stream | beam.ParDo(my_record_fn)

	      # Elements added without an explicit timestamp are expected to carry
	      # the watermark value that was current when they were added.
	      assert_that(
	          records,
	          equal_to([
	              ('a', timestamp.Timestamp(10)),
	              ('b', timestamp.Timestamp(10)),
	              ('c', timestamp.Timestamp(10)),
	              ('d', timestamp.Timestamp(20)),
	              ('e', timestamp.Timestamp(20)),
	              ('late', timestamp.Timestamp(12)),
	              ('last', timestamp.Timestamp(310)),
	          ]))

	  @supported(['DirectRunner', 'SwitchingDirectRunner'])
	  @pytest.mark.it_postcommit
	  def test_multiple_outputs(self):
	    """Tests that the TestStream supports emitting to multiple PCollections."""
	    letters_elements = [
	        TimestampedValue('a', 6),
	        TimestampedValue('b', 7),
	        TimestampedValue('c', 8),
	    ]
	    numbers_elements = [
	        TimestampedValue('1', 11),
	        TimestampedValue('2', 12),
	        TimestampedValue('3', 13),
	    ]
	    test_stream = (
	        TestStream().advance_watermark_to(5, tag='letters').add_elements(
	            letters_elements,
	            tag='letters').advance_watermark_to(10, tag='numbers').add_elements(
	                numbers_elements, tag='numbers'))

	    class RecordFn(beam.DoFn):
	      """Pairs every element with its timestamp."""
	      def process(
	          self,
	          element=beam.DoFn.ElementParam,
	          timestamp=beam.DoFn.TimestampParam):
	        yield (element, timestamp)

	    options = StandardOptions(streaming=True)
	    p = TestPipeline(is_integration_test=True, options=options)

	    # Each tag used on the TestStream is read back as its own output.
	    main = p | test_stream
	    letters = main['letters'] | 'record letters' >> beam.ParDo(RecordFn())
	    numbers = main['numbers'] | 'record numbers' >> beam.ParDo(RecordFn())

	    assert_that(
	        letters,
	        equal_to([('a', Timestamp(6)), ('b', Timestamp(7)),
	                  ('c', Timestamp(8))]),
	        label='assert letters')

	    assert_that(
	        numbers,
	        equal_to([('1', Timestamp(11)), ('2', Timestamp(12)),
	                  ('3', Timestamp(13))]),
	        label='assert numbers')

	    p.run()

	  @supported(['DirectRunner', 'SwitchingDirectRunner'])
	  @pytest.mark.it_postcommit
	  def test_multiple_outputs_with_watermark_advancement(self):
	    """Tests that the TestStream can independently control output watermarks."""

	    # Purposely set the watermark of numbers to 20 then letters to 5 to test
	    # that the watermark advancement is per PCollection.
	    #
	    # This creates two PCollections, (a, b, c) and (1, 2, 3). These will be
	    # emitted at different times so that they will have different windows. The
	    # watermark advancement is checked by checking their windows. If the
	    # watermark does not advance, then the windows will be [-inf, -inf). If the
	    # windows do not advance separately, then the PCollections will both be
	    # windowed in [15, 30).
	    letters_elements = [
	        TimestampedValue('a', 6),
	        TimestampedValue('b', 7),
	        TimestampedValue('c', 8),
	    ]
	    numbers_elements = [
	        TimestampedValue('1', 21),
	        TimestampedValue('2', 22),
	        TimestampedValue('3', 23),
	    ]
	    test_stream = (
	        TestStream().advance_watermark_to(
	            0, tag='letters').advance_watermark_to(
	                0, tag='numbers').advance_watermark_to(
	                    20, tag='numbers').advance_watermark_to(
	                        5, tag='letters').add_elements(
	                            letters_elements,
	                            tag='letters').advance_watermark_to(
	                                10, tag='letters').add_elements(
	                                    numbers_elements,
	                                    tag='numbers').advance_watermark_to(
	                                        30, tag='numbers'))

	    options = StandardOptions(streaming=True)
	    p = TestPipeline(is_integration_test=True, options=options)

	    main = p | test_stream

	    # Use an AfterWatermark trigger with an early firing to test that the
	    # watermark is advancing properly and that the element is being emitted in
	    # the correct window.
	    letters = (
	        main['letters']
	        | 'letter windows' >> beam.WindowInto(
	            FixedWindows(15),
	            trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
	            accumulation_mode=trigger.AccumulationMode.DISCARDING)
	        | 'letter with key' >> beam.Map(lambda x: ('k', x))
	        | 'letter gbk' >> beam.GroupByKey())

	    numbers = (
	        main['numbers']
	        | 'number windows' >> beam.WindowInto(
	            FixedWindows(15),
	            trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
	            accumulation_mode=trigger.AccumulationMode.DISCARDING)
	        | 'number with key' >> beam.Map(lambda x: ('k', x))
	        | 'number gbk' >> beam.GroupByKey())

	    # The letters were emitted when the watermark was at 5, thus we expect to
	    # see the elements in the [0, 15) window. We used an early trigger to make
	    # sure that the ON_TIME empty pane was also emitted with a TestStream.
	    # This pane has no data because the early trigger causes the elements to
	    # fire before the end of the window and because the accumulation mode
	    # discards any data after the trigger fired.
	    expected_letters = {
	        window.IntervalWindow(0, 15): [
	            ('k', ['a', 'b', 'c']),
	            ('k', []),
	        ],
	    }

	    # Same here, except the numbers were emitted at watermark = 20, thus they
	    # are in the [15, 30) window.
	    expected_numbers = {
	        window.IntervalWindow(15, 30): [
	            ('k', ['1', '2', '3']),
	            ('k', []),
	        ],
	    }
	    assert_that(
	        letters,
	        equal_to_per_window(expected_letters),
	        label='letters assert per window')
	    assert_that(
	        numbers,
	        equal_to_per_window(expected_numbers),
	        label='numbers assert per window')

	    p.run()


	# Run this module's tests with the unittest runner when executed as a script.
	if __name__ == '__main__':
	  unittest.main()