sdks/python/apache_beam/testing/benchmarks/nexmark/nexmark_util.py - beam - Git at Google

 #
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #

 """Utilities for the Nexmark suite.

 The Nexmark suite is a series of queries (streaming pipelines) performed
 on a simulation of auction events. This util includes:

   - A Command class used to terminate the streaming jobs
     launched in nexmark_launcher.py by the DirectRunner.
   - A ParseEventFn DoFn to parse events received from PubSub.

 Usage:

 To run a process for a certain duration, define in the code:
   command = Command(process_to_terminate, args)
   command.run(timeout=duration)

 """

 # pytype: skip-file

 import json
 import logging
 import threading

 import apache_beam as beam
 from apache_beam.metrics import MetricsFilter
 from apache_beam.runners.runner import PipelineResult  # pylint: disable=unused-import
 from apache_beam.testing.benchmarks.nexmark.models import auction_bid
 from apache_beam.testing.benchmarks.nexmark.models import nexmark_model
 from apache_beam.testing.benchmarks.nexmark.models.field_name import FieldNames
 from apache_beam.transforms import window
 from apache_beam.utils.timestamp import Timestamp

 # Module-level logger, named after this module via __name__.
 _LOGGER = logging.getLogger(__name__)


 class Command(object):
   """Runs a callable on a daemon thread and waits for it up to a timeout.

   The constructor stores:
     cmd: the callable to invoke.
     args: an iterable of positional arguments unpacked into ``cmd``.
   """
   def __init__(self, cmd, args):
     self.cmd = cmd
     self.args = args

   def run(self, timeout):
     """Starts ``cmd(*args)`` on a daemon thread and joins it.

     Args:
       timeout: forwarded to ``Thread.join`` as the maximum time to block the
         caller; it is also formatted with ``%d`` in the log messages.

     The call returns when the callable finishes or when the join times out,
     whichever happens first. The thread is not stopped on a timed-out join.
     """
     # Not every callable carries __name__ (functools.partial objects do not);
     # fall back to repr so that logging cannot prevent the command from
     # running.
     cmd_name = getattr(self.cmd, '__name__', repr(self.cmd))

     def thread_target():
       # Both messages go through the module logger for consistency.
       _LOGGER.debug('Starting thread for %d seconds: %s', timeout, cmd_name)

       self.cmd(*self.args)
       _LOGGER.info(
           '%d seconds elapsed. Thread (%s) finished.', timeout, cmd_name)

     thread = threading.Thread(
         target=thread_target, name='Thread-timeout', daemon=True)
     thread.start()
     thread.join(timeout)


 def setup_coder():
   """Registers the coders of the Nexmark model types with Beam.

   Auction, Person, Bid and AuctionBid are registered, in that order, each
   with its matching coder class in ``beam.coders.registry``.
   """
   coder_by_model = (
       (nexmark_model.Auction, nexmark_model.AuctionCoder),
       (nexmark_model.Person, nexmark_model.PersonCoder),
       (nexmark_model.Bid, nexmark_model.BidCoder),
       (auction_bid.AuctionBid, auction_bid.AuctionBidCoder),
   )
   for model_cls, coder_cls in coder_by_model:
     beam.coders.registry.register_coder(model_cls, coder_cls)


 class ParseEventFn(beam.DoFn):
   """Original parser turning a raw comma-separated event into a model object.

   The first character of the line selects the model class and the
   comma-separated fields are passed to it positionally:

     person:  <id starting with 'p'>,name,email,credit_card,city,
              state,timestamp,extra
     auction: <id starting with 'a'>,item_name,description,initial_bid,
              reserve_price,timestamp,expires,seller,category,extra
     bid:     <auction starting with 'b'>,bidder,price,timestamp,extra

   For example:

     'p12345,maria,maria@maria.com,1234-5678-9012-3456,sunnyvale,CA,
      1528098831536'
     'a12345,car67,2012 hyundai elantra,15000,20000,1528098831536,
      20180630,maria,vehicle'
     'b12345,maria,20000,1528098831536'
   """

   # Built once at class creation instead of once per processed element.
   _MODEL_BY_PREFIX = {
       'p': nexmark_model.Person,
       'a': nexmark_model.Auction,
       'b': nexmark_model.Bid,
   }

   def process(self, elem):
     """Yields the model object described by ``elem``.

     Args:
       elem: one event line as a string.

     Raises:
       ValueError: if the line is empty or its first character is not one of
         'p', 'a' or 'b'.
     """
     row = elem.split(',')
     # elem[:1] is '' for an empty line, so such input is reported as an
     # invalid event rather than surfacing as an IndexError.
     model = self._MODEL_BY_PREFIX.get(elem[:1])
     if model is None:
       raise ValueError('Invalid event: %s.' % row)

     event = model(*row)
     _LOGGER.debug('Parsed event: %s', event)
     yield event


 class ParseJsonEventFn(beam.DoFn):
   r"""Parses a raw JSON event into a Nexmark model object.

   Each event is a JSON object with the following fields:

     person:  {id,name,email,credit_card,city,state,timestamp,extra}
     auction: {id,item_name,description,initial_bid,reserve_price,timestamp,
               expires,seller,category,extra}
     bid:     {auction,bidder,price,timestamp,extra}

   For example (wrapped here for readability):

     {"id":1000,"name":"Peter Jones","emailAddress":"nhd@xcat.com",
      "creditCard":"7241 7320 9143 4888","city":"Portland","state":"WY",
      "dateTime":1528098831026,"extra":"WN_HS_bnpVQ\\[["}

     {"id":1000,"itemName":"wkx mgee",
      "description":"eszpqxtdxrvwmmywkmogoahf","initialBid":28873,
      "reserve":29448,"dateTime":1528098831036,"expires":1528098840451,
      "seller":1000,"category":13,"extra":"zcuupiz"}

     {"auction":1000,"bidder":1001,"price":32530001,
      "dateTime":1528098831066,
      "extra":"fdiysaV^]NLVsbolvyqwgticfdrwdyiyofWPYTOuwogvszlxjrcNOORM"}
   """
   @staticmethod
   def _unwrap_millis(json_dict, field):
     """Replaces ``json_dict[field]`` with its 'millis' entry if it is a dict."""
     value = json_dict[field]
     if isinstance(value, dict):
       json_dict[field] = value['millis']

   def process(self, elem):
     """Yields a Person, Auction or Bid built from the JSON text ``elem``.

     The event kind is decided by which field is present, checked in this
     order: the name field (Person), the item-name field (Auction), the
     auction field (Bid). Time fields may be either a number of milliseconds
     or an object holding that number under 'millis'.

     Raises:
       ValueError: if none of the three distinguishing fields is present.
     """
     json_dict = json.loads(elem)
     self._unwrap_millis(json_dict, FieldNames.DATE_TIME)
     event_time = millis_to_timestamp(json_dict[FieldNames.DATE_TIME])

     if FieldNames.NAME in json_dict:
       yield nexmark_model.Person(
           json_dict[FieldNames.ID],
           json_dict[FieldNames.NAME],
           json_dict[FieldNames.EMAIL_ADDRESS],
           json_dict[FieldNames.CREDIT_CARD],
           json_dict[FieldNames.CITY],
           json_dict[FieldNames.STATE],
           event_time,
           json_dict[FieldNames.EXTRA])
     elif FieldNames.ITEM_NAME in json_dict:
       self._unwrap_millis(json_dict, FieldNames.EXPIRES)
       yield nexmark_model.Auction(
           json_dict[FieldNames.ID],
           json_dict[FieldNames.ITEM_NAME],
           json_dict[FieldNames.DESCRIPTION],
           json_dict[FieldNames.INITIAL_BID],
           json_dict[FieldNames.RESERVE],
           event_time,
           millis_to_timestamp(json_dict[FieldNames.EXPIRES]),
           json_dict[FieldNames.SELLER],
           json_dict[FieldNames.CATEGORY],
           json_dict[FieldNames.EXTRA])
     elif FieldNames.AUCTION in json_dict:
       yield nexmark_model.Bid(
           json_dict[FieldNames.AUCTION],
           json_dict[FieldNames.BIDDER],
           json_dict[FieldNames.PRICE],
           event_time,
           json_dict[FieldNames.EXTRA])
     else:
       raise ValueError('Invalid event: %s.' % str(json_dict))


 class CountAndLog(beam.PTransform):
   """Counts the elements of a PCollection globally and logs the total."""
   def expand(self, pcoll):
     """Applies global windowing, a global count and a logging map step."""
     globally_windowed = pcoll | 'window' >> beam.WindowInto(
         window.GlobalWindows())
     total = globally_windowed | "Count" >> beam.combiners.Count.Globally()
     return total | "Log" >> beam.Map(log_count_info)


 def log_count_info(count):
   """Logs ``count`` as the number of query results and returns it."""
   message = 'Query resulted in %d results'
   logging.info(message, count)
   return count


 def display(elm):
   """Logs ``elm`` at DEBUG level and hands the same object back."""
   logging.debug(elm)  # elm itself is used as the log message
   return elm


 def millis_to_timestamp(millis: int) -> Timestamp:
   """Builds a ``Timestamp`` from milliseconds by scaling to microseconds."""
   return Timestamp(micros=millis * 1000)


 def get_counter_metric(
     result: PipelineResult, namespace: str, name: str) -> int:
   """Looks up a single counter metric in a pipeline result.

   Args:
     result: the PipelineResult whose metrics are queried
     namespace: namespace of the wanted metric
     name: name of the wanted metric

   Returns:
     the counter's result, or -1 when no counter matches

   Raises:
     RuntimeError: when more than one counter matches
   """
   pipeline_metrics = result.metrics()
   metric_filter = MetricsFilter().with_namespace(namespace).with_name(name)
   counters = pipeline_metrics.query(metric_filter)['counters']
   num_matches = len(counters)
   if num_matches > 1:
     raise RuntimeError(
         '%d instead of one metric result matches name: %s in namespace %s' %
         (num_matches, name, namespace))
   if num_matches == 0:
     return -1
   return counters[0].result


 def get_start_time_metric(
     result: PipelineResult, namespace: str, name: str) -> int:
   """Finds the earliest time recorded by the given distribution metric.

   Args:
     result: the PipelineResult whose metrics are queried
     namespace: namespace of the wanted metric
     name: name of the wanted metric

   Returns:
     the smallest ``min`` over all matching distributions, or -1 when no
     distribution matches
   """
   pipeline_metrics = result.metrics()
   metric_filter = MetricsFilter().with_namespace(namespace).with_name(name)
   distributions = pipeline_metrics.query(metric_filter)['distributions']
   return min((dist.result.min for dist in distributions), default=-1)


 def get_end_time_metric(
     result: PipelineResult, namespace: str, name: str) -> int:
   """Finds the latest time recorded by the given distribution metric.

   Args:
     result: the PipelineResult whose metrics are queried
     namespace: namespace of the wanted metric
     name: name of the wanted metric

   Returns:
     the largest ``max`` over all matching distributions, or -1 when no
     distribution matches
   """
   pipeline_metrics = result.metrics()
   metric_filter = MetricsFilter().with_namespace(namespace).with_name(name)
   distributions = pipeline_metrics.query(metric_filter)['distributions']
   return max((dist.result.max for dist in distributions), default=-1)
	#
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	#

	"""Utilities for the Nexmark suite.

	The Nexmark suite is a series of queries (streaming pipelines) performed
	on a simulation of auction events. This util includes:

	- A Command class used to terminate the streaming jobs
	launched in nexmark_launcher.py by the DirectRunner.
	- A ParseEventFn DoFn to parse events received from PubSub.

	Usage:

	To run a process for a certain duration, define in the code:
	command = Command(process_to_terminate, args)
	command.run(timeout=duration)

	"""

	# pytype: skip-file

	import json
	import logging
	import threading

	import apache_beam as beam
	from apache_beam.metrics import MetricsFilter
	from apache_beam.runners.runner import PipelineResult # pylint: disable=unused-import
	from apache_beam.testing.benchmarks.nexmark.models import auction_bid
	from apache_beam.testing.benchmarks.nexmark.models import nexmark_model
	from apache_beam.testing.benchmarks.nexmark.models.field_name import FieldNames
	from apache_beam.transforms import window
	from apache_beam.utils.timestamp import Timestamp

	# Module-level logger, named after this module via __name__.
	_LOGGER = logging.getLogger(__name__)


	class Command(object):
	  """Runs a callable on a daemon thread and waits for it up to a timeout.

	  The constructor stores:
	    cmd: the callable to invoke.
	    args: an iterable of positional arguments unpacked into ``cmd``.
	  """
	  def __init__(self, cmd, args):
	    self.cmd = cmd
	    self.args = args

	  def run(self, timeout):
	    """Starts ``cmd(*args)`` on a daemon thread and joins it.

	    Args:
	      timeout: forwarded to ``Thread.join`` as the maximum time to block the
	        caller; it is also formatted with ``%d`` in the log messages.

	    The call returns when the callable finishes or when the join times out,
	    whichever happens first. The thread is not stopped on a timed-out join.
	    """
	    # Not every callable carries __name__ (functools.partial objects do
	    # not); fall back to repr so that logging cannot prevent the command
	    # from running.
	    cmd_name = getattr(self.cmd, '__name__', repr(self.cmd))

	    def thread_target():
	      # Both messages go through the module logger for consistency.
	      _LOGGER.debug('Starting thread for %d seconds: %s', timeout, cmd_name)

	      self.cmd(*self.args)
	      _LOGGER.info(
	          '%d seconds elapsed. Thread (%s) finished.', timeout, cmd_name)

	    thread = threading.Thread(
	        target=thread_target, name='Thread-timeout', daemon=True)
	    thread.start()
	    thread.join(timeout)


	def setup_coder():
	  """Registers the coders of the Nexmark model types with Beam.

	  Auction, Person, Bid and AuctionBid are registered, in that order, each
	  with its matching coder class in ``beam.coders.registry``.
	  """
	  coder_by_model = (
	      (nexmark_model.Auction, nexmark_model.AuctionCoder),
	      (nexmark_model.Person, nexmark_model.PersonCoder),
	      (nexmark_model.Bid, nexmark_model.BidCoder),
	      (auction_bid.AuctionBid, auction_bid.AuctionBidCoder),
	  )
	  for model_cls, coder_cls in coder_by_model:
	    beam.coders.registry.register_coder(model_cls, coder_cls)


	class ParseEventFn(beam.DoFn):
	  """Original parser turning a raw comma-separated event into a model object.

	  The first character of the line selects the model class and the
	  comma-separated fields are passed to it positionally:

	    person:  <id starting with 'p'>,name,email,credit_card,city,
	             state,timestamp,extra
	    auction: <id starting with 'a'>,item_name,description,initial_bid,
	             reserve_price,timestamp,expires,seller,category,extra
	    bid:     <auction starting with 'b'>,bidder,price,timestamp,extra

	  For example:

	    'p12345,maria,maria@maria.com,1234-5678-9012-3456,sunnyvale,CA,
	     1528098831536'
	    'a12345,car67,2012 hyundai elantra,15000,20000,1528098831536,
	     20180630,maria,vehicle'
	    'b12345,maria,20000,1528098831536'
	  """

	  # Built once at class creation instead of once per processed element.
	  _MODEL_BY_PREFIX = {
	      'p': nexmark_model.Person,
	      'a': nexmark_model.Auction,
	      'b': nexmark_model.Bid,
	  }

	  def process(self, elem):
	    """Yields the model object described by ``elem``.

	    Args:
	      elem: one event line as a string.

	    Raises:
	      ValueError: if the line is empty or its first character is not one of
	        'p', 'a' or 'b'.
	    """
	    row = elem.split(',')
	    # elem[:1] is '' for an empty line, so such input is reported as an
	    # invalid event rather than surfacing as an IndexError.
	    model = self._MODEL_BY_PREFIX.get(elem[:1])
	    if model is None:
	      raise ValueError('Invalid event: %s.' % row)

	    event = model(*row)
	    _LOGGER.debug('Parsed event: %s', event)
	    yield event


	class ParseJsonEventFn(beam.DoFn):
	  r"""Parses a raw JSON event into a Nexmark model object.

	  Each event is a JSON object with the following fields:

	    person:  {id,name,email,credit_card,city,state,timestamp,extra}
	    auction: {id,item_name,description,initial_bid,reserve_price,timestamp,
	              expires,seller,category,extra}
	    bid:     {auction,bidder,price,timestamp,extra}

	  For example (wrapped here for readability):

	    {"id":1000,"name":"Peter Jones","emailAddress":"nhd@xcat.com",
	     "creditCard":"7241 7320 9143 4888","city":"Portland","state":"WY",
	     "dateTime":1528098831026,"extra":"WN_HS_bnpVQ\\[["}

	    {"id":1000,"itemName":"wkx mgee",
	     "description":"eszpqxtdxrvwmmywkmogoahf","initialBid":28873,
	     "reserve":29448,"dateTime":1528098831036,"expires":1528098840451,
	     "seller":1000,"category":13,"extra":"zcuupiz"}

	    {"auction":1000,"bidder":1001,"price":32530001,
	     "dateTime":1528098831066,
	     "extra":"fdiysaV^]NLVsbolvyqwgticfdrwdyiyofWPYTOuwogvszlxjrcNOORM"}
	  """
	  @staticmethod
	  def _unwrap_millis(json_dict, field):
	    """Replaces ``json_dict[field]`` with its 'millis' entry if it is a dict."""
	    value = json_dict[field]
	    if isinstance(value, dict):
	      json_dict[field] = value['millis']

	  def process(self, elem):
	    """Yields a Person, Auction or Bid built from the JSON text ``elem``.

	    The event kind is decided by which field is present, checked in this
	    order: the name field (Person), the item-name field (Auction), the
	    auction field (Bid). Time fields may be either a number of milliseconds
	    or an object holding that number under 'millis'.

	    Raises:
	      ValueError: if none of the three distinguishing fields is present.
	    """
	    json_dict = json.loads(elem)
	    self._unwrap_millis(json_dict, FieldNames.DATE_TIME)
	    event_time = millis_to_timestamp(json_dict[FieldNames.DATE_TIME])

	    if FieldNames.NAME in json_dict:
	      yield nexmark_model.Person(
	          json_dict[FieldNames.ID],
	          json_dict[FieldNames.NAME],
	          json_dict[FieldNames.EMAIL_ADDRESS],
	          json_dict[FieldNames.CREDIT_CARD],
	          json_dict[FieldNames.CITY],
	          json_dict[FieldNames.STATE],
	          event_time,
	          json_dict[FieldNames.EXTRA])
	    elif FieldNames.ITEM_NAME in json_dict:
	      self._unwrap_millis(json_dict, FieldNames.EXPIRES)
	      yield nexmark_model.Auction(
	          json_dict[FieldNames.ID],
	          json_dict[FieldNames.ITEM_NAME],
	          json_dict[FieldNames.DESCRIPTION],
	          json_dict[FieldNames.INITIAL_BID],
	          json_dict[FieldNames.RESERVE],
	          event_time,
	          millis_to_timestamp(json_dict[FieldNames.EXPIRES]),
	          json_dict[FieldNames.SELLER],
	          json_dict[FieldNames.CATEGORY],
	          json_dict[FieldNames.EXTRA])
	    elif FieldNames.AUCTION in json_dict:
	      yield nexmark_model.Bid(
	          json_dict[FieldNames.AUCTION],
	          json_dict[FieldNames.BIDDER],
	          json_dict[FieldNames.PRICE],
	          event_time,
	          json_dict[FieldNames.EXTRA])
	    else:
	      raise ValueError('Invalid event: %s.' % str(json_dict))


	class CountAndLog(beam.PTransform):
	  """Counts the elements of a PCollection globally and logs the total."""
	  def expand(self, pcoll):
	    """Applies global windowing, a global count and a logging map step."""
	    # The pipe operators are written plainly; a backslash before '|' is not
	    # valid Python.
	    return (
	        pcoll
	        | 'window' >> beam.WindowInto(window.GlobalWindows())
	        | "Count" >> beam.combiners.Count.Globally()
	        | "Log" >> beam.Map(log_count_info))


	def log_count_info(count):
	  """Logs ``count`` as the number of query results and returns it."""
	  logging.info('Query resulted in %d results', count)
	  return count


	def display(elm):
	  """Logs ``elm`` at DEBUG level and hands the same object back."""
	  logging.debug(elm)  # elm itself is used as the log message
	  return elm


	def millis_to_timestamp(millis: int) -> Timestamp:
	  """Builds a ``Timestamp`` from milliseconds by scaling to microseconds."""
	  micro_second = millis * 1000
	  return Timestamp(micros=micro_second)


	def get_counter_metric(
	    result: PipelineResult, namespace: str, name: str) -> int:
	  """Looks up a single counter metric in a pipeline result.

	  Args:
	    result: the PipelineResult whose metrics are queried
	    namespace: namespace of the wanted metric
	    name: name of the wanted metric

	  Returns:
	    the counter's result, or -1 when no counter matches

	  Raises:
	    RuntimeError: when more than one counter matches
	  """
	  metrics = result.metrics().query(
	      MetricsFilter().with_namespace(namespace).with_name(name))
	  counters = metrics['counters']
	  if len(counters) > 1:
	    raise RuntimeError(
	        '%d instead of one metric result matches name: %s in namespace %s' %
	        (len(counters), name, namespace))
	  return counters[0].result if len(counters) > 0 else -1


	def get_start_time_metric(
	    result: PipelineResult, namespace: str, name: str) -> int:
	  """Finds the earliest time recorded by the given distribution metric.

	  Args:
	    result: the PipelineResult whose metrics are queried
	    namespace: namespace of the wanted metric
	    name: name of the wanted metric

	  Returns:
	    the smallest ``min`` over all matching distributions, or -1 when no
	    distribution matches
	  """
	  distributions = result.metrics().query(
	      MetricsFilter().with_namespace(namespace).with_name(
	          name))['distributions']
	  return min((dist.result.min for dist in distributions), default=-1)


	def get_end_time_metric(
	    result: PipelineResult, namespace: str, name: str) -> int:
	  """Finds the latest time recorded by the given distribution metric.

	  Args:
	    result: the PipelineResult whose metrics are queried
	    namespace: namespace of the wanted metric
	    name: name of the wanted metric

	  Returns:
	    the largest ``max`` over all matching distributions, or -1 when no
	    distribution matches
	  """
	  distributions = result.metrics().query(
	      MetricsFilter().with_namespace(namespace).with_name(
	          name))['distributions']
	  return max((dist.result.max for dist in distributions), default=-1)