#!/usr/bin/env python
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import argparse
import os
import json
import sys
import datetime

from common.utils import Util
from common.kerberos import Kerberos
import common.configurator as Config
from common.kafka_client import KafkaProducer

# Get master configuration.
SCRIPT_PATH = os.path.dirname(os.path.abspath(__file__))
CONF_FILE = "{0}/ingest_conf.json".format(SCRIPT_PATH)
MASTER_CONF = json.loads(open(CONF_FILE).read())
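
# MASTER_CONF is expected to provide at least the keys read below: "pipelines",
# "kafka" and "hdfs_app_path". A minimal sketch, assuming a hypothetical
# pipeline named "flow" (actual names, hosts and ports depend on the deployment):
#
#   {
#       "hdfs_app_path": "/user/spot",
#       "kafka": {
#           "kafka_server": "kafka-host",
#           "kafka_port": "9092",
#           "zookeper_server": "zookeeper-host",
#           "zookeper_port": "2181"
#       },
#       "pipelines": {
#           "flow": {"type": "flow"}
#       }
#   }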


def main():

    # Input parameters.
    parser = argparse.ArgumentParser(description="Master Collector Ingest Daemon")
    parser.add_argument('-t', '--type', dest='type', required=True,
                        help='Type of data that will be ingested (Pipeline Configuration)',
                        metavar='')
    parser.add_argument('-w', '--workers', dest='workers_num',
                        required=True, help='Number of workers for the ingest process',
                        metavar='')
    parser.add_argument('-id', '--ingestId', dest='ingest_id',
                        required=False, help='Ingest ID', metavar='')
    args = parser.parse_args()
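
    # A typical invocation, assuming a pipeline named "flow" is defined under
    # "pipelines" in ingest_conf.json (the name is illustrative):
    #
    #     python master_collector.py -t flow -w 4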

    # Start the collector based on the data source type.
    start_collector(args.type, args.workers_num, args.ingest_id)


def start_collector(type, workers_num, id=None):

    # Generate an ingest id from the current time of day, e.g. '10_30_45_123456'.
    ingest_id = str(datetime.datetime.time(datetime.datetime.now())).replace(":", "_").replace(".", "_")

    # Create logger.
    logger = Util.get_logger("SPOT.INGEST")

    # Validate that the given type exists in ingest_conf.json.
    if type not in MASTER_CONF["pipelines"]:
        logger.error("'{0}' type is not a valid configuration.".format(type))
        sys.exit(1)

    # Validate that the type maps to a valid ingest module.
    if not Util.validate_data_source(MASTER_CONF["pipelines"][type]["type"]):
        logger.error("'{0}' type is not configured. Please check your ingest conf file.".format(MASTER_CONF["pipelines"][type]["type"]))
        sys.exit(1)

    # Authenticate with Kerberos if it is enabled in the configuration.
    if Config.kerberos_enabled():
        kb = Kerberos()
        kb.authenticate()

    # Kafka server info.
    logger.info("Initializing kafka instance")
    k_server = MASTER_CONF["kafka"]['kafka_server']
    k_port = MASTER_CONF["kafka"]['kafka_port']

    # Required zookeeper info.
    zk_server = MASTER_CONF["kafka"]['zookeper_server']
    zk_port = MASTER_CONF["kafka"]['zookeper_port']

    # Use the ingest id as the Kafka topic when one is provided; otherwise
    # default to the pipeline type.
    topic = "{0}".format(type) if not id else id
    producer = KafkaProducer(topic, k_server, k_port, zk_server, zk_port, workers_num)

    # Dynamically import the collector module for the configured pipeline type.
    logger.info("Starting {0} ingest instance".format(topic))
    module = __import__("pipelines.{0}.collector".
                        format(MASTER_CONF["pipelines"][type]["type"]),
                        fromlist=['Collector'])
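
    # For example, a pipeline whose configured "type" is "flow" would resolve
    # to the module "pipelines.flow.collector" (the name is illustrative).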

    # Create and start the collector.
    ingest_collector = module.Collector(MASTER_CONF['hdfs_app_path'], producer, type)
    ingest_collector.start()


if __name__ == '__main__':
    main()