sdks/python/apache_beam/runners/dataflow/test_dataflow_runner.py - beam - Git at Google

 #
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #

 """Wrapper of Beam runners that's built for running and verifying e2e tests."""

 from __future__ import absolute_import
 from __future__ import print_function

 import logging
 import time

 from apache_beam.internal import pickler
 from apache_beam.options.pipeline_options import GoogleCloudOptions
 from apache_beam.options.pipeline_options import StandardOptions
 from apache_beam.options.pipeline_options import TestOptions
 from apache_beam.runners.dataflow.dataflow_runner import DataflowRunner
 from apache_beam.runners.runner import PipelineState

 # Public names exported by this module.
 __all__ = ['TestDataflowRunner']

 # Default timeout, in seconds, used when waiting for a job to reach a given
 # state. Dataflow can take up to 10 minutes for the long tail of
 # starting/stopping a worker pool.
 WAIT_IN_STATE_TIMEOUT = 10 * 60


 class TestDataflowRunner(DataflowRunner):
   """DataflowRunner that runs an e2e test pipeline and verifies the result."""

   def run_pipeline(self, pipeline, options):
     """Run the test pipeline, wait for it, and apply the success matcher.

     The pickled ``on_success_matcher`` is read from the test options and then
     cleared on ``options`` before the job is submitted. After submission the
     method waits for the job to reach RUNNING (or a terminal state), waits for
     it to finish, and finally asserts the matcher against the result.

     Args:
       pipeline: The pipeline handed to the parent runner's ``run_pipeline``.
       options: Pipeline options; viewed as ``TestOptions`` and
         ``StandardOptions`` here.

     Returns:
       The pipeline result, which is also stored on ``self.result``.

     Raises:
       IOError: If the submitted result has no Dataflow job.
       RuntimeError: If a state wait times out.
     """
     test_opts = options.view_as(TestOptions)
     pickled_matcher = test_opts.on_success_matcher
     finish_duration = test_opts.wait_until_finish_duration
     streaming = options.view_as(StandardOptions).streaming

     # [BEAM-1889] The matcher is only needed locally, so drop it from the
     # options before they are sent to remote workers/executors.
     test_opts.on_success_matcher = None

     parent_runner = super(TestDataflowRunner, self)
     self.result = parent_runner.run_pipeline(pipeline, options)
     if self.result.has_job:
       # TODO(markflyhigh)(BEAM-1890): Use print since Nose doesn't show logs
       # in some cases.
       console_url = self.build_console_url(options)
       print('Found: %s.' % console_url)

     try:
       self.wait_until_in_state(PipelineState.RUNNING)

       if streaming and not finish_duration:
         logging.warning('Waiting indefinitely for streaming job.')
       self.result.wait_until_finish(duration=finish_duration)

       if pickled_matcher:
         # Imported lazily so hamcrest is only needed when a matcher is set.
         from hamcrest import assert_that as hc_assert_that
         matcher = pickler.loads(pickled_matcher)
         hc_assert_that(self.result, matcher)
     finally:
       # Never leave a job behind: cancel anything that has not reached a
       # terminal state, whether or not the steps above succeeded.
       if not self.result.is_in_terminal_state():
         self.result.cancel()
         self.wait_until_in_state(PipelineState.CANCELLED)

     return self.result

   def build_console_url(self, options):
     """Return the Cloud Console URL for the job held in ``self.result``.

     Args:
       options: Pipeline options; the project and region are read from its
         ``GoogleCloudOptions`` view.

     Returns:
       The console URL as a string.
     """
     project = options.view_as(GoogleCloudOptions).project
     region_id = options.view_as(GoogleCloudOptions).region
     url_template = (
         'https://console.cloud.google.com/dataflow/jobsDetail/locations'
         '/%s/jobs/%s?project=%s')
     return url_template % (region_id, self.result.job_id(), project)

   def wait_until_in_state(self, expected_state, timeout=WAIT_IN_STATE_TIMEOUT):
     """Poll the job until it reaches ``expected_state`` or a terminal state.

     Args:
       expected_state: The job state to wait for.
       timeout: Maximum time to keep polling, in seconds.

     Returns:
       The job state observed when the wait ended.

     Raises:
       IOError: If ``self.result`` has no Dataflow job.
       RuntimeError: If neither the expected state nor a terminal state is
         seen before ``timeout`` elapses.
     """
     if not self.result.has_job:
       raise IOError('Failed to get the Dataflow job id.')

     poll_interval_secs = 5
     began_at = time.time()
     while True:
       elapsed = time.time() - began_at
       if not elapsed <= timeout:
         break
       observed_state = self.result.state
       done = (
           self.result.is_in_terminal_state() or
           observed_state == expected_state)
       if done:
         return observed_state
       time.sleep(poll_interval_secs)

     message_template = (
         'Timeout after %d seconds while waiting for job %s '
         'enters expected state %s. Current state is %s.')
     raise RuntimeError(
         message_template % (
             timeout, self.result.job_id(), expected_state,
             self.result.state))
	#
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	#

	"""Wrapper of Beam runners that's built for running and verifying e2e tests."""

	from __future__ import absolute_import
	from __future__ import print_function

	import logging
	import time

	from apache_beam.internal import pickler
	from apache_beam.options.pipeline_options import GoogleCloudOptions
	from apache_beam.options.pipeline_options import StandardOptions
	from apache_beam.options.pipeline_options import TestOptions
	from apache_beam.runners.dataflow.dataflow_runner import DataflowRunner
	from apache_beam.runners.runner import PipelineState

	# Public names exported by this module.
	__all__ = ['TestDataflowRunner']

	# Default timeout, in seconds, used when waiting for a job to reach a given
	# state. Dataflow can take up to 10 minutes for the long tail of
	# starting/stopping a worker pool.
	WAIT_IN_STATE_TIMEOUT = 10 * 60


	class TestDataflowRunner(DataflowRunner):
	  """DataflowRunner that runs an e2e test pipeline and verifies the result."""

	  def run_pipeline(self, pipeline, options):
	    """Run the test pipeline, wait for it, and apply the success matcher.

	    The pickled ``on_success_matcher`` is read from the test options and then
	    cleared on ``options`` before the job is submitted. After submission the
	    method waits for the job to reach RUNNING (or a terminal state), waits for
	    it to finish, and finally asserts the matcher against the result.

	    Args:
	      pipeline: The pipeline handed to the parent runner's ``run_pipeline``.
	      options: Pipeline options; viewed as ``TestOptions`` and
	        ``StandardOptions`` here.

	    Returns:
	      The pipeline result, which is also stored on ``self.result``.

	    Raises:
	      IOError: If the submitted result has no Dataflow job.
	      RuntimeError: If a state wait times out.
	    """
	    test_opts = options.view_as(TestOptions)
	    pickled_matcher = test_opts.on_success_matcher
	    finish_duration = test_opts.wait_until_finish_duration
	    streaming = options.view_as(StandardOptions).streaming

	    # [BEAM-1889] The matcher is only needed locally, so drop it from the
	    # options before they are sent to remote workers/executors.
	    test_opts.on_success_matcher = None

	    parent_runner = super(TestDataflowRunner, self)
	    self.result = parent_runner.run_pipeline(pipeline, options)
	    if self.result.has_job:
	      # TODO(markflyhigh)(BEAM-1890): Use print since Nose doesn't show logs
	      # in some cases.
	      console_url = self.build_console_url(options)
	      print('Found: %s.' % console_url)

	    try:
	      self.wait_until_in_state(PipelineState.RUNNING)

	      if streaming and not finish_duration:
	        logging.warning('Waiting indefinitely for streaming job.')
	      self.result.wait_until_finish(duration=finish_duration)

	      if pickled_matcher:
	        # Imported lazily so hamcrest is only needed when a matcher is set.
	        from hamcrest import assert_that as hc_assert_that
	        matcher = pickler.loads(pickled_matcher)
	        hc_assert_that(self.result, matcher)
	    finally:
	      # Never leave a job behind: cancel anything that has not reached a
	      # terminal state, whether or not the steps above succeeded.
	      if not self.result.is_in_terminal_state():
	        self.result.cancel()
	        self.wait_until_in_state(PipelineState.CANCELLED)

	    return self.result

	  def build_console_url(self, options):
	    """Return the Cloud Console URL for the job held in ``self.result``.

	    Args:
	      options: Pipeline options; the project and region are read from its
	        ``GoogleCloudOptions`` view.

	    Returns:
	      The console URL as a string.
	    """
	    project = options.view_as(GoogleCloudOptions).project
	    region_id = options.view_as(GoogleCloudOptions).region
	    url_template = (
	        'https://console.cloud.google.com/dataflow/jobsDetail/locations'
	        '/%s/jobs/%s?project=%s')
	    return url_template % (region_id, self.result.job_id(), project)

	  def wait_until_in_state(self, expected_state, timeout=WAIT_IN_STATE_TIMEOUT):
	    """Poll the job until it reaches ``expected_state`` or a terminal state.

	    Args:
	      expected_state: The job state to wait for.
	      timeout: Maximum time to keep polling, in seconds.

	    Returns:
	      The job state observed when the wait ended.

	    Raises:
	      IOError: If ``self.result`` has no Dataflow job.
	      RuntimeError: If neither the expected state nor a terminal state is
	        seen before ``timeout`` elapses.
	    """
	    if not self.result.has_job:
	      raise IOError('Failed to get the Dataflow job id.')

	    poll_interval_secs = 5
	    began_at = time.time()
	    while True:
	      elapsed = time.time() - began_at
	      if not elapsed <= timeout:
	        break
	      observed_state = self.result.state
	      done = (
	          self.result.is_in_terminal_state() or
	          observed_state == expected_state)
	      if done:
	        return observed_state
	      time.sleep(poll_interval_secs)

	    message_template = (
	        'Timeout after %d seconds while waiting for job %s '
	        'enters expected state %s. Current state is %s.')
	    raise RuntimeError(
	        message_template % (
	            timeout, self.result.job_id(), expected_state,
	            self.result.state))