heronpy/connectors/textfiles/textfilespout.py - incubator-heron - Git at Google

 # Copyright 2016 - Twitter, Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 '''textfilespout.py: module that defines a Heron Spout that reads data
    from a list of files and emits one tuple per line'''
 from heronpy.api.spout.spout import Spout

 class TextFileSpout(Spout):
   """TextFileSpout: reads from a list of files and emits one tuple per line.

   The list stored under the ``FILES`` config key is divided among the tasks
   of this component: with the task ids sorted, the task at position ``i``
   takes every ``len(tasks)``-th file starting at index ``i``.

   Files are taken from the end of that share and lines from the end of each
   file (both through ``list.pop()``), so emission runs back to front.
   """

   # TopologyBuilder uses these constants to set config
   FILES = "files"

   def initialize(self, config, context):
     """Implements TextFile Spout's initialize method.

     Selects the files that belong to this task, loads the first batch of
     lines and resets the emit/ack/fail counters.

     Args:
       config: component config; must contain ``TextFileSpout.FILES`` mapped
         to a list.
       context: object providing ``get_component_id()``, ``get_task_id()`` and
         ``get_component_tasks(component_id)``.

     Raises:
       RuntimeError: if this task's id is not among the component's tasks, if
         the ``FILES`` key is missing, or if its value is not a list.
       IOError: if the first file to be read cannot be opened or read.
     """
     self.logger.info("Initializing TextFileSpout with the following")
     self.logger.info("Component-specific config: \n%s" % str(config))
     # sorted() builds a fresh list, so whatever the context handed out is left
     # untouched (the previous in-place sort() reordered the caller's list).
     all_spout_tasks = sorted(context.get_component_tasks(context.get_component_id()))
     task_id = context.get_task_id()
     if task_id not in all_spout_tasks:
       raise RuntimeError("TextFileSpout's task_id %d not among all TextFileSpout %s" %
                          (task_id, str(all_spout_tasks)))
     myindex = all_spout_tasks.index(task_id)
     if TextFileSpout.FILES not in config:
       raise RuntimeError("TextFileSpout's Files config not setup properly")
     all_files_to_consume = config[TextFileSpout.FILES]
     if not isinstance(all_files_to_consume, list):
       raise RuntimeError("TextFileSpout's Files config must be a list")
     # The slice is a new list, so popping from it later never alters the config.
     self.files_to_consume = all_files_to_consume[myindex::len(all_spout_tasks)]
     self.logger.info("TextFileSpout files to consume %s" % self.files_to_consume)
     self.lines_to_consume = self._get_next_lines()
     self.emit_count = 0
     self.ack_count = 0
     self.fail_count = 0

   def _get_next_lines(self):
     """Return the lines of the next non-empty file, or None when no file is left."""
     next_lines = []
     # Empty files yield an empty list; keep going until there is content or
     # the file list is exhausted (signalled by None).
     while next_lines is not None and not next_lines:
       next_lines = self._consume_next_file()
     return next_lines

   def _consume_next_file(self):
     """Read the next pending file completely.

     Returns:
       The list produced by ``readlines()`` (possibly empty), or None when
       there is no file left to read.

     Raises:
       IOError: if the file cannot be opened or read; it is logged and then
         re-raised with its original traceback.
     """
     file_to_consume = self._get_next_file_to_consume()
     if file_to_consume is None:
       self.logger.info("All files consumed")
       return None
     self.logger.info("Now reading file %s" % file_to_consume)
     try:
       # The with-block closes the handle on success and on failure alike.
       with open(file_to_consume, 'r') as filep:
         return filep.readlines()
     except IOError:
       self.logger.info("Could not open the file %s" % file_to_consume)
       raise

   def _get_next_file_to_consume(self):
     """Remove and return the last pending file name, or None if none is pending."""
     if not self.files_to_consume:
       return None
     return self.files_to_consume.pop()

   def next_tuple(self):
     """Emit one line, if any is left, and count it."""
     if self.lines_to_consume is None:
       return
     next_line = self.lines_to_consume.pop()
     # Refill before emitting so the next call sees either fresh lines or None.
     if not self.lines_to_consume:
       self.lines_to_consume = self._get_next_lines()
     self.emit(next_line)
     self.emit_count += 1

   def ack(self, tup_id):
     """Count an acknowledged tuple and log its id at debug level."""
     self.ack_count += 1
     self.logger.debug("Acked tuple %s" % str(tup_id))

   def fail(self, tup_id):
     """Count a failed tuple and log its id at debug level."""
     self.fail_count += 1
     self.logger.debug("Failed tuple %s" % str(tup_id))
	# Copyright 2016 - Twitter, Inc.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	'''textfilespout.py: module that defines a Heron Spout that reads data
	from a list of files and emits one tuple per line'''
	from heronpy.api.spout.spout import Spout

	class TextFileSpout(Spout):
	  """TextFileSpout: reads from a list of files and emits one tuple per line.

	  Each task of the component handles a share of the list stored under the
	  ``FILES`` config key: with the task ids sorted, the task at position ``i``
	  takes every ``len(tasks)``-th file starting at index ``i``.

	  Pending files and pending lines are both consumed with ``list.pop()``,
	  i.e. from the end of their lists.
	  """

	  # TopologyBuilder uses these constants to set config
	  FILES = "files"

	  def initialize(self, config, context):
	    """Implements TextFile Spout's initialize method.

	    Args:
	      config: component config; must contain ``TextFileSpout.FILES`` mapped
	        to a list.
	      context: object providing ``get_component_id()``, ``get_task_id()``
	        and ``get_component_tasks(component_id)``.

	    Raises:
	      RuntimeError: if this task's id is not among the component's tasks,
	        if the ``FILES`` key is missing, or if its value is not a list.
	      IOError: if the first file to be read cannot be opened or read.
	    """
	    self.logger.info("Initializing TextFileSpout with the following")
	    self.logger.info("Component-specific config: \n%s" % str(config))
	    # Sort into a new list instead of reordering the context's own list.
	    all_spout_tasks = sorted(context.get_component_tasks(context.get_component_id()))
	    task_id = context.get_task_id()
	    if task_id not in all_spout_tasks:
	      raise RuntimeError("TextFileSpout's task_id %d not among all TextFileSpout %s" %
	                         (task_id, str(all_spout_tasks)))
	    myindex = all_spout_tasks.index(task_id)
	    if TextFileSpout.FILES not in config:
	      raise RuntimeError("TextFileSpout's Files config not setup properly")
	    all_files_to_consume = config[TextFileSpout.FILES]
	    if not isinstance(all_files_to_consume, list):
	      raise RuntimeError("TextFileSpout's Files config must be a list")
	    # Slicing copies, so later pops do not touch the configured list.
	    self.files_to_consume = all_files_to_consume[myindex::len(all_spout_tasks)]
	    self.logger.info("TextFileSpout files to consume %s" % self.files_to_consume)
	    self.lines_to_consume = self._get_next_lines()
	    self.emit_count = 0
	    self.ack_count = 0
	    self.fail_count = 0

	  def _get_next_lines(self):
	    """Return the lines of the next non-empty file, or None when no file is left."""
	    next_lines = []
	    # Skip over empty files; None means the file list is exhausted.
	    while next_lines is not None and not next_lines:
	      next_lines = self._consume_next_file()
	    return next_lines

	  def _consume_next_file(self):
	    """Read the next pending file completely.

	    Returns:
	      The list produced by ``readlines()`` (possibly empty), or None when
	      there is no file left to read.

	    Raises:
	      IOError: if the file cannot be opened or read; it is logged and then
	        re-raised with its original traceback.
	    """
	    file_to_consume = self._get_next_file_to_consume()
	    if file_to_consume is None:
	      self.logger.info("All files consumed")
	      return None
	    self.logger.info("Now reading file %s" % file_to_consume)
	    try:
	      # The with-block guarantees the handle is closed in every case.
	      with open(file_to_consume, 'r') as filep:
	        return filep.readlines()
	    except IOError:
	      self.logger.info("Could not open the file %s" % file_to_consume)
	      raise

	  def _get_next_file_to_consume(self):
	    """Remove and return the last pending file name, or None if none is pending."""
	    if not self.files_to_consume:
	      return None
	    return self.files_to_consume.pop()

	  def next_tuple(self):
	    """Emit one line, if any is left, and count it."""
	    if self.lines_to_consume is None:
	      return
	    next_line = self.lines_to_consume.pop()
	    # Refill before emitting so the next call sees fresh lines or None.
	    if not self.lines_to_consume:
	      self.lines_to_consume = self._get_next_lines()
	    self.emit(next_line)
	    self.emit_count += 1

	  def ack(self, tup_id):
	    """Count an acknowledged tuple and log its id at debug level."""
	    self.ack_count += 1
	    self.logger.debug("Acked tuple %s" % str(tup_id))

	  def fail(self, tup_id):
	    """Count a failed tuple and log its id at debug level."""
	    self.fail_count += 1
	    self.logger.debug("Failed tuple %s" % str(tup_id))