# blob: 5da239364f8c2f47709db6dfb4a0aa2375ac790f [file] [log] [blame]
# Copyright 2016 Twitter. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""module for reduce-by-key-and-window bolt: ReduceByKeyAndWindowBolt"""
import collections
try:
  from collections.abc import Iterable  # Python 3.3+
except ImportError:  # Python 2 has no collections.abc
  from collections import Iterable

from heronpy.api.bolt.window_bolt import SlidingWindowBolt
from heronpy.api.custom_grouping import ICustomGrouping
from heronpy.api.component.component_spec import GlobalStreamId
from heronpy.api.stream import Grouping
from heronpy.dsl.streamlet import Streamlet, TimeWindow
from heronpy.dsl.dslboltbase import DslBoltBase
# pylint: disable=unused-argument
class ReduceByKeyAndWindowBolt(SlidingWindowBolt, DslBoltBase):
  """Window bolt that groups (key, value) pairs by key and reduces each group.

  For every window, the values of all tuples sharing a key are folded
  left-to-right with the configured reduce function, and one
  ``(key, result)`` pair per key is emitted on the ``output`` stream.
  """
  # Config key under which the reduce callable is handed to the bolt.
  FUNCTION = 'function'
  # Config keys re-exported from SlidingWindowBolt so callers can set the
  # window duration and slide interval through this class.
  WINDOWDURATION = SlidingWindowBolt.WINDOW_DURATION_SECS
  SLIDEINTERVAL = SlidingWindowBolt.WINDOW_SLIDEINTERVAL_SECS

  def initialize(self, config, context):
    """Initialize the parent bolt and load the reduce function from config.

    :param config: mapping that must contain ``FUNCTION`` -> a callable
    :param context: passed through unchanged to the parent ``initialize``
    :raises RuntimeError: if ``FUNCTION`` is missing or its value is not callable
    """
    super(ReduceByKeyAndWindowBolt, self).initialize(config, context)
    if ReduceByKeyAndWindowBolt.FUNCTION not in config:
      raise RuntimeError("FUNCTION not specified in reducebywindow operator")
    self.reduce_function = config[ReduceByKeyAndWindowBolt.FUNCTION]
    if not callable(self.reduce_function):
      raise RuntimeError("Reduce Function has to be callable")

  @staticmethod
  def _add(key, value, mymap):
    """Append ``value`` to the list stored under ``key`` in ``mymap``, creating it if absent."""
    mymap.setdefault(key, []).append(value)

  def processWindow(self, window_config, tuples):
    """Reduce the tuples of one window per key and emit one result per key.

    :param window_config: window configuration supplied by the caller (not used here)
    :param tuples: tuples of the current window; ``tup.values[0]`` of each one
                   must be an iterable of length 2 holding ``(key, value)``
    :raises RuntimeError: if a tuple's payload is not an iterable of length 2
    """
    # Temporary per-window map: key -> list of values in arrival order.
    grouped = {}
    for tup in tuples:
      userdata = tup.values[0]
      if not isinstance(userdata, Iterable) or len(userdata) != 2:
        raise RuntimeError("ReduceByWindow tuples must be iterable of length 2")
      self._add(userdata[0], userdata[1], grouped)

    for key, values in grouped.items():
      result = values[0]
      for value in values[1:]:
        # The accumulator has to be rebound to the function's return value;
        # otherwise only the first value of each key would ever be emitted.
        result = self.reduce_function(result, value)
      self.emit([(key, result)], stream='output')
# pylint: disable=unused-argument
class ReduceGrouping(ICustomGrouping):
  """Custom grouping that picks the target task of a (key, value) tuple by hashing its key."""

  def prepare(self, context, component, stream, target_tasks):
    """Store the candidate target tasks for later use by ``choose_tasks``.

    :param context: not used by this grouping
    :param component: not used by this grouping
    :param stream: not used by this grouping
    :param target_tasks: sequence of tasks that ``choose_tasks`` selects from
    """
    self.target_tasks = target_tasks

  def choose_tasks(self, values):
    """Return a single-element list holding the task chosen for this tuple.

    :param values: list with exactly one element, itself an iterable of
                   length 2 whose first item is the key
    :returns: ``[task]`` where ``task`` is selected from ``target_tasks`` by
              ``hash(key) % len(target_tasks)``
    :raises RuntimeError: if the payload is not an iterable of length 2
    """
    assert isinstance(values, list) and len(values) == 1
    userdata = values[0]
    if not isinstance(userdata, Iterable) or len(userdata) != 2:
      raise RuntimeError("Tuples going to reduce must be iterable of length 2")
    # Equal keys hash to the same index, so within one process all tuples of
    # a key are sent to the same target task.
    # NOTE(review): on Python 3, hash() of str/bytes keys is salted per process
    # unless PYTHONHASHSEED is fixed; if emitting instances run in separate
    # processes they may disagree on the target for the same key -- confirm.
    target_index = hash(userdata[0]) % len(self.target_tasks)
    return [self.target_tasks[target_index]]
# pylint: disable=protected-access
class ReduceByKeyAndWindowStreamlet(Streamlet):
  """Streamlet stage that adds a ReduceByKeyAndWindowBolt fed by its first parent."""

  def __init__(self, time_window, reduce_function, parents, stage_name=None, parallelism=None):
    """Record the window and reduce function; the rest is handled by ``Streamlet``.

    :param time_window: window description; checked to be a ``TimeWindow`` in ``_build_this``
    :param reduce_function: callable used to combine two values of the same key
    :param parents: parent streamlets, forwarded to ``Streamlet.__init__``
    :param stage_name: optional stage name, forwarded to ``Streamlet.__init__``
    :param parallelism: optional parallelism, forwarded to ``Streamlet.__init__``
    """
    super(ReduceByKeyAndWindowStreamlet, self).__init__(parents=parents,
                                                        stage_name=stage_name,
                                                        parallelism=parallelism)
    self._time_window = time_window
    self._reduce_function = reduce_function

  def _calculate_inputs(self):
    """Return the input spec: the first parent's output stream, grouped by ReduceGrouping."""
    parent = self._parents[0]
    return {GlobalStreamId(parent._stage_name, parent._output) :
            Grouping.custom("heronpy.dsl.reducebykeyandwindowbolt.ReduceGrouping")}

  def _calculate_stage_name(self, existing_stage_names):
    """Return a stage name derived from the reduce function that is not already taken.

    The base name is ``reducebykeyandwindow-<function name>``; if it is already
    in ``existing_stage_names``, the smallest positive integer suffix that
    makes it unique is appended.

    :param existing_stage_names: container of stage names already in use
    """
    func = self._reduce_function
    # Callables such as functools.partial objects or callable instances carry
    # no __name__; fall back to their type name so naming does not crash on
    # something that _build_this would accept as a valid reduce function.
    funcname = "reducebykeyandwindow-" + getattr(func, '__name__', type(func).__name__)
    if funcname not in existing_stage_names:
      return funcname
    index = 1
    while funcname + str(index) in existing_stage_names:
      index += 1
    return funcname + str(index)

  def _build_this(self, builder):
    """Validate the stage's settings and register its bolt with ``builder``.

    :param builder: object whose ``add_bolt`` is called to register the bolt
    :raises RuntimeError: if the reduce function is not callable or the
                          time window is not a ``TimeWindow``
    """
    if not callable(self._reduce_function):
      raise RuntimeError("reduce function must be callable")
    if not isinstance(self._time_window, TimeWindow):
      raise RuntimeError("reduce's time_window should be TimeWindow")
    builder.add_bolt(self._stage_name, ReduceByKeyAndWindowBolt, par=self._parallelism,
                     inputs=self._calculate_inputs(),
                     config={ReduceByKeyAndWindowBolt.FUNCTION : self._reduce_function,
                             ReduceByKeyAndWindowBolt.WINDOWDURATION : self._time_window.duration,
                             ReduceByKeyAndWindowBolt.SLIDEINTERVAL :
                             self._time_window.sliding_interval})