python/pyspark/testing/streamingutils.py - spark - Git at Google

 #
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 import os
 import tempfile
 import time
 import unittest

 from pyspark import SparkConf, SparkContext, RDD
 from pyspark.streaming import StreamingContext
 from pyspark.testing.sqlutils import search_jar


 # Must be same as the variable and condition defined in KinesisTestUtils.scala and modules.py
 kinesis_test_environ_var = "ENABLE_KINESIS_TESTS"
 should_skip_kinesis_tests = not os.environ.get(kinesis_test_environ_var) == "1"

 if should_skip_kinesis_tests:
     kinesis_requirement_message = (
         "Skipping all Kinesis Python tests as environmental variable 'ENABLE_KINESIS_TESTS' "
         "was not set."
     )
 else:
     kinesis_asl_assembly_jar = search_jar(
         "connector/kinesis-asl-assembly",
         "spark-streaming-kinesis-asl-assembly-",
         "spark-streaming-kinesis-asl-assembly_",
     )
     if kinesis_asl_assembly_jar is None:
         kinesis_requirement_message = (
             "Skipping all Kinesis Python tests as the optional Kinesis project was "
             "not compiled into a JAR. To run these tests, "
             "you need to build Spark with 'build/sbt -Pkinesis-asl assembly/package "
             "streaming-kinesis-asl-assembly/assembly' or "
             "'build/mvn -Pkinesis-asl package' before running this test."
         )
     else:
         existing_args = os.environ.get("PYSPARK_SUBMIT_ARGS", "pyspark-shell")
         jars_args = "--jars %s" % kinesis_asl_assembly_jar
         os.environ["PYSPARK_SUBMIT_ARGS"] = " ".join([jars_args, existing_args])
         kinesis_requirement_message = None  # type: ignore

 should_test_kinesis = kinesis_requirement_message is None


 class PySparkStreamingTestCase(unittest.TestCase):
     timeout = 30  # seconds
     duration = 0.5

     @classmethod
     def setUpClass(cls):
         class_name = cls.__name__
         conf = SparkConf().set("spark.default.parallelism", 1)
         cls.sc = SparkContext(appName=class_name, conf=conf)
         cls.sc.setCheckpointDir(tempfile.mkdtemp())

     @classmethod
     def tearDownClass(cls):
         cls.sc.stop()
         # Clean up in the JVM just in case there has been some issues in Python API
         try:
             jSparkContextOption = SparkContext._jvm.SparkContext.get()
             if jSparkContextOption.nonEmpty():
                 jSparkContextOption.get().stop()
         except BaseException:
             pass

     def setUp(self):
         self.ssc = StreamingContext(self.sc, self.duration)

     def tearDown(self):
         if self.ssc is not None:
             self.ssc.stop(False)
         # Clean up in the JVM just in case there has been some issues in Python API
         try:
             jStreamingContextOption = StreamingContext._jvm.SparkContext.getActive()
             if jStreamingContextOption.nonEmpty():
                 jStreamingContextOption.get().stop(False)
         except BaseException:
             pass

     def wait_for(self, result, n):
         start_time = time.time()
         while len(result) < n and time.time() - start_time < self.timeout:
             time.sleep(0.01)
         if len(result) < n:
             print("timeout after", self.timeout)

     def _take(self, dstream, n):
         """
         Return the first `n` elements in the stream (will start and stop).
         """
         results = []

         def take(_, rdd):
             if rdd and len(results) < n:
                 results.extend(rdd.take(n - len(results)))

         dstream.foreachRDD(take)

         self.ssc.start()
         self.wait_for(results, n)
         return results

     def _collect(self, dstream, n, block=True):
         """
         Collect each RDDs into the returned list.

         Returns
         -------
         list
             which will have the collected items.
         """
         result = []

         def get_output(_, rdd):
             if rdd and len(result) < n:
                 r = rdd.collect()
                 if r:
                     result.append(r)

         dstream.foreachRDD(get_output)

         if not block:
             return result

         self.ssc.start()
         self.wait_for(result, n)
         return result

     def _test_func(self, input, func, expected, sort=False, input2=None):
         """
         Parameters
         ----------
         input : list
             dataset for the test. This should be list of lists.
         func : function
             wrapped function. This function should return PythonDStream object.
         expected
             expected output for this testcase.
         """
         if not isinstance(input[0], RDD):
             input = [self.sc.parallelize(d, 1) for d in input]
         input_stream = self.ssc.queueStream(input)
         if input2 and not isinstance(input2[0], RDD):
             input2 = [self.sc.parallelize(d, 1) for d in input2]
         input_stream2 = self.ssc.queueStream(input2) if input2 is not None else None

         # Apply test function to stream.
         if input2:
             stream = func(input_stream, input_stream2)
         else:
             stream = func(input_stream)

         result = self._collect(stream, len(expected))
         if sort:
             self._sort_result_based_on_key(result)
             self._sort_result_based_on_key(expected)
         self.assertEqual(expected, result)

     def _sort_result_based_on_key(self, outputs):
         """Sort the list based on first value."""
         for output in outputs:
             output.sort(key=lambda x: x[0])
	#
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	#
	import os
	import tempfile
	import time
	import unittest

	from pyspark import SparkConf, SparkContext, RDD
	from pyspark.streaming import StreamingContext
	from pyspark.testing.sqlutils import search_jar


	# Must be same as the variable and condition defined in KinesisTestUtils.scala and modules.py
	kinesis_test_environ_var = "ENABLE_KINESIS_TESTS"
	should_skip_kinesis_tests = not os.environ.get(kinesis_test_environ_var) == "1"

	if should_skip_kinesis_tests:
	kinesis_requirement_message = (
	"Skipping all Kinesis Python tests as environmental variable 'ENABLE_KINESIS_TESTS' "
	"was not set."
	)
	else:
	kinesis_asl_assembly_jar = search_jar(
	"connector/kinesis-asl-assembly",
	"spark-streaming-kinesis-asl-assembly-",
	"spark-streaming-kinesis-asl-assembly_",
	)
	if kinesis_asl_assembly_jar is None:
	kinesis_requirement_message = (
	"Skipping all Kinesis Python tests as the optional Kinesis project was "
	"not compiled into a JAR. To run these tests, "
	"you need to build Spark with 'build/sbt -Pkinesis-asl assembly/package "
	"streaming-kinesis-asl-assembly/assembly' or "
	"'build/mvn -Pkinesis-asl package' before running this test."
	)
	else:
	existing_args = os.environ.get("PYSPARK_SUBMIT_ARGS", "pyspark-shell")
	jars_args = "--jars %s" % kinesis_asl_assembly_jar
	os.environ["PYSPARK_SUBMIT_ARGS"] = " ".join([jars_args, existing_args])
	kinesis_requirement_message = None # type: ignore

	should_test_kinesis = kinesis_requirement_message is None


	class PySparkStreamingTestCase(unittest.TestCase):
	timeout = 30 # seconds
	duration = 0.5

	@classmethod
	def setUpClass(cls):
	class_name = cls.__name__
	conf = SparkConf().set("spark.default.parallelism", 1)
	cls.sc = SparkContext(appName=class_name, conf=conf)
	cls.sc.setCheckpointDir(tempfile.mkdtemp())

	@classmethod
	def tearDownClass(cls):
	cls.sc.stop()
	# Clean up in the JVM just in case there has been some issues in Python API
	try:
	jSparkContextOption = SparkContext._jvm.SparkContext.get()
	if jSparkContextOption.nonEmpty():
	jSparkContextOption.get().stop()
	except BaseException:
	pass

	def setUp(self):
	self.ssc = StreamingContext(self.sc, self.duration)

	def tearDown(self):
	if self.ssc is not None:
	self.ssc.stop(False)
	# Clean up in the JVM just in case there has been some issues in Python API
	try:
	jStreamingContextOption = StreamingContext._jvm.SparkContext.getActive()
	if jStreamingContextOption.nonEmpty():
	jStreamingContextOption.get().stop(False)
	except BaseException:
	pass

	def wait_for(self, result, n):
	start_time = time.time()
	while len(result) < n and time.time() - start_time < self.timeout:
	time.sleep(0.01)
	if len(result) < n:
	print("timeout after", self.timeout)

	def _take(self, dstream, n):
	"""
	Return the first `n` elements in the stream (will start and stop).
	"""
	results = []

	def take(_, rdd):
	if rdd and len(results) < n:
	results.extend(rdd.take(n - len(results)))

	dstream.foreachRDD(take)

	self.ssc.start()
	self.wait_for(results, n)
	return results

	def _collect(self, dstream, n, block=True):
	"""
	Collect each RDDs into the returned list.

	Returns
	-------
	list
	which will have the collected items.
	"""
	result = []

	def get_output(_, rdd):
	if rdd and len(result) < n:
	r = rdd.collect()
	if r:
	result.append(r)

	dstream.foreachRDD(get_output)

	if not block:
	return result

	self.ssc.start()
	self.wait_for(result, n)
	return result

	def _test_func(self, input, func, expected, sort=False, input2=None):
	"""
	Parameters
	----------
	input : list
	dataset for the test. This should be list of lists.
	func : function
	wrapped function. This function should return PythonDStream object.
	expected
	expected output for this testcase.
	"""
	if not isinstance(input[0], RDD):
	input = [self.sc.parallelize(d, 1) for d in input]
	input_stream = self.ssc.queueStream(input)
	if input2 and not isinstance(input2[0], RDD):
	input2 = [self.sc.parallelize(d, 1) for d in input2]
	input_stream2 = self.ssc.queueStream(input2) if input2 is not None else None

	# Apply test function to stream.
	if input2:
	stream = func(input_stream, input_stream2)
	else:
	stream = func(input_stream)

	result = self._collect(stream, len(expected))
	if sort:
	self._sort_result_based_on_key(result)
	self._sort_result_based_on_key(expected)
	self.assertEqual(expected, result)

	def _sort_result_based_on_key(self, outputs):
	"""Sort the list based on first value."""
	for output in outputs:
	output.sort(key=lambda x: x[0])