#!/usr/bin/env python
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
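# DNS ingest collector: watches a local collector path for new pcap files,
# splits them with editcap, uploads the resulting chunks to HDFS, and
# publishes their HDFS paths to a Kafka topic for the Spot workers.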
import time
import logging
import os
import json
from common.utils import Util
from common import hdfs_client as hdfs
from common.hdfs_client import HdfsException
from common.file_collector import FileWatcher
from multiprocessing import Pool
class Collector(object):
def __init__(self, hdfs_app_path, kafkaproducer, conf_type):
self._initialize_members(hdfs_app_path, kafkaproducer, conf_type)
def _initialize_members(self, hdfs_app_path, kafkaproducer, conf_type):
# getting parameters.
self._logger = logging.getLogger('SPOT.INGEST.DNS')
self._hdfs_app_path = hdfs_app_path
self._producer = kafkaproducer
# get script path
self._script_path = os.path.dirname(os.path.abspath(__file__))
# read dns configuration.
conf_file = "{0}/ingest_conf.json".format(os.path.dirname(os.path.dirname(self._script_path)))
        with open(conf_file) as conf_data:
            conf = json.loads(conf_data.read())
self._conf = conf["pipelines"][conf_type]
# set configuration.
self._collector_path = self._conf['collector_path']
self._dsource = 'dns'
self._hdfs_root_path = "{0}/{1}".format(hdfs_app_path, self._dsource)
        # pcap splitting configuration.
self._pkt_num = self._conf['pkt_num']
self._pcap_split_staging = self._conf['pcap_split_staging']
self._supported_files = self._conf['supported_files']
# create collector watcher
self._watcher = FileWatcher(self._collector_path, self._supported_files)
# Multiprocessing.
self._processes = conf["collector_processes"]
self._ingestion_interval = conf["ingestion_interval"]
self._pool = Pool(processes=self._processes)
# TODO: review re-use of hdfs.client
self._hdfs_client = hdfs.get_client()
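
    # Main ingest loop: start the file watcher, then poll it on a fixed
    # interval and hand any new files to the worker pool until interrupted.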
def start(self):
self._logger.info("Starting DNS ingest")
self._watcher.start()
try:
while True:
self._ingest_files_pool()
time.sleep(self._ingestion_interval)
except KeyboardInterrupt:
self._logger.info("Stopping DNS collector...")
Util.remove_kafka_topic(self._producer.Zookeeper, self._producer.Topic, self._logger)
self._watcher.stop()
            self._pool.terminate()
            self._pool.join()
            raise SystemExit("Ingest finished...")
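
    # Drain the watcher queue: ingest inline when a single process is
    # configured, otherwise dispatch each file to the multiprocessing pool.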
def _ingest_files_pool(self):
if self._watcher.HasFiles:
            for _ in range(self._processes):
self._logger.info('processes: {0}'.format(self._processes))
new_file = self._watcher.GetNextFile()
if self._processes <= 1:
_ingest_file(
self._hdfs_client,
new_file,
self._pkt_num,
self._pcap_split_staging,
self._hdfs_root_path,
self._producer,
self._producer.Topic
)
else:
                    result = self._pool.apply_async(_ingest_file, args=(
self._hdfs_client,
new_file,
self._pkt_num,
self._pcap_split_staging,
self._hdfs_root_path,
self._producer,
self._producer.Topic
))
                    # result.get()  # to debug, wrap in try/except to surface worker errors.
if not self._watcher.HasFiles:
break
return True
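
# Worker function; kept at module level so it can be pickled and run by the
# multiprocessing pool. It splits the pcap with editcap, uploads each chunk
# to HDFS, and notifies the workers through the Kafka producer.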
def _ingest_file(hdfs_client, new_file, pkt_num, pcap_split_staging, hdfs_root_path, producer, topic):
logger = logging.getLogger('SPOT.INGEST.DNS.{0}'.format(os.getpid()))
try:
# get file name and date.
org_file = new_file
        file_name = os.path.basename(new_file)
# split file.
name = file_name.split('.')[0]
split_cmd = "editcap -c {0} {1} {2}/{3}_spot.pcap".format(pkt_num,
new_file,
pcap_split_staging,
name)
logger.info("Splitting file: {0}".format(split_cmd))
        Util.execute_cmd(split_cmd, logger)
logger.info("Removing file: {0}".format(org_file))
rm_big_file = "rm {0}".format(org_file)
        Util.execute_cmd(rm_big_file, logger)
    except Exception as err:
        logger.error("There was a problem splitting the file: {0}".format(err))
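    # Walk the split staging area and ship every chunk editcap produced for this file.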
for currdir, subdir, files in os.walk(pcap_split_staging):
for file in files:
if file.endswith(".pcap") and "{0}_spot".format(name) in file:
# get timestamp from the file name to build hdfs path.
file_date = file.split('.')[0]
pcap_hour = file_date[-6:-4]
pcap_date_path = file_date[-14:-6]
# hdfs path with timestamp.
hdfs_path = "{0}/binary/{1}/{2}".format(hdfs_root_path, pcap_date_path, pcap_hour)
# create hdfs path.
try:
if len(hdfs.list_dir(hdfs_path, hdfs_client)) == 0:
logger.info('creating directory: ' + hdfs_path)
hdfs_client.mkdir(hdfs_path, hdfs_client)
# load file to hdfs.
hadoop_pcap_file = "{0}/{1}".format(hdfs_path,file)
result = hdfs_client.upload_file(hadoop_pcap_file, os.path.join(currdir,file))
if not result:
logger.error('File failed to upload: ' + hadoop_pcap_file)
raise HdfsException
# create event for workers to process the file.
logger.info( "Sending split file to Topic: {0}".format(topic))
producer.SendMessage(hadoop_pcap_file, topic)
logger.info("File {0} has been successfully sent to Kafka Topic to: {1}".format(file,topic))
                except HdfsException as err:
                    logger.error('Exception: {0}'.format(err))
                    logger.info('Check HDFS connection settings and server health')
                except Exception as err:
                    logger.info("File {0} failed to be sent to Kafka topic: {1}".format(new_file, topic))
                    logger.error("Error: {0}".format(err))