Merge pull request #144 from curtishoward/SPOT-181_envelope_ingest

[SPOT-181] spot-ingest for ODM with config-driven Spark streaming (Envelope)
diff --git a/spot-ml/ml_ops.sh b/spot-ml/ml_ops.sh
index 977a58e..90cddb3 100755
--- a/spot-ml/ml_ops.sh
+++ b/spot-ml/ml_ops.sh
@@ -29,11 +29,11 @@
 if [[ "${#FDATE}" != "8" || -z "${DSOURCE}" ]]; then
     echo "ml_ops.sh syntax error"
     echo "Please run ml_ops.sh again with the correct syntax:"
-    echo "./ml_ops.sh YYYYMMDD TYPE [MAX RESULTS] [TOL]"
+    echo "./ml_ops.sh YYYYMMDD TYPE [TOL] [MAX RESULTS]"
     echo "for example:"
-    echo "./ml_ops.sh 20160122 dns 1000 1e-6"
+    echo "./ml_ops.sh 20160122 dns 1e-6 1000"
     echo "./ml_ops.sh 20160122 flow"
-    echo "./ml_ops.sh 20160122 proxy 100"
+    echo "./ml_ops.sh 20160122 proxy 1"
     exit
 fi
 
diff --git a/spot-setup/odm/create_event_avro.sql b/spot-setup/odm/create_event_avro.sql
index 16945cf..34941ca 100644
--- a/spot-setup/odm/create_event_avro.sql
+++ b/spot-setup/odm/create_event_avro.sql
@@ -18,10 +18,10 @@
 CREATE EXTERNAL TABLE IF NOT EXISTS ${VAR:ODM_DBNAME}.${VAR:ODM_TABLENAME} (
 -- Common
 event_time bigint,
-begintime bigint,
-endtime bigint,
-event_insertime bigint,
-lastupdatetime bigint,
+begin_time bigint,
+end_time bigint,
+event_insert_time bigint,
+last_update_time bigint,
 duration float,
 event_id string,
 name string,
@@ -39,8 +39,8 @@
 query string,
 service string,
 state string,
-in_bytes int,
-out_bytes int,
+in_bytes bigint,
+out_bytes bigint,
 xref string,
 version string,
 api string,
@@ -55,55 +55,62 @@
 count int,
 company string,
 additional_attrs map<string,string>,
-totrust string,
-fromtrust string,
+to_trust string,
+from_trust string,
 rule string,
 threat string,
 pcap_id int,
+session_id string,
+length int,
 -- Device
 dvc_time bigint,
 dvc_ip4 bigint,
 dvc_ip4_str string,
-dvc_ip6 bigint,
+dvc_ip6 string,
 dvc_ip6_str string,
 dvc_host string,
+dvc_mac string,
 dvc_domain string,
 dvc_type string,
 dvc_vendor string,
 dvc_fwd_ip4 bigint,
 dvc_fwd_ip4_str string,
-dvc_fwd_ip6 bigint,
+dvc_fwd_ip6 string,
 dvc_fwd_ip6_str string,
 dvc_version string,
 -- Network
 src_ip4 bigint,
 src_ip4_str string,
-src_ip6 bigint,
+src_ip6 string,
 src_ip6_str string,
 src_host string,
+src_mac string,
 src_domain string,
 src_port int,
 src_country_code string,
 src_country_name string,
 src_region string,
 src_city string,
-src_lat int,
-src_long int,
+src_lat float,
+src_long float,
+src_asn bigint,
+src_tos string,
 dst_ip4 bigint,
 dst_ip4_str string,
-dst_ip6 bigint,
+dst_ip6 string,
 dst_ip6_str string,
 dst_host string,
+dst_mac string,
 dst_domain string,
 dst_port int,
 dst_country_code string,
 dst_country_name string,
 dst_region string,
 dst_city string,
-dst_lat int,
-dst_long int,
-src_asn int,
-dst_asn int,
+dst_lat float,
+dst_long float,
+dst_asn bigint,
+dst_tos string,
 net_direction string,
 net_flags string,
 -- File
@@ -112,10 +119,11 @@
 file_atime bigint,
 file_acls string,
 file_type string,
-file_size int,
+file_size bigint,
 file_desc string,
 file_hash string,
 file_hash_type string,
+file_uid string,
 -- Endpoint
 end_object string,
 end_action string,
@@ -163,7 +171,7 @@
 http_response_body_len int,
 http_response_info_code int,
 http_response_info_msg string,
-http_response_resp_fuids string,
+http_response_resp_fuids array<string>,
 http_response_mime_types string,
 http_response_headers map<string,string>,
 -- SMTP
@@ -193,7 +201,7 @@
 ftp_command string,
 ftp_arg string,
 ftp_mime_type string,
-ftp_file_size int,
+ftp_file_size bigint,
 ftp_reply_code int,
 ftp_reply_msg string,
 ftp_data_channel_passive boolean,
@@ -224,8 +232,8 @@
 tls_resumed boolean,
 tls_next_protocol string,
 tls_established boolean,
-tls_cert_chain_fuids string,
-tls_client_cert_chain_fuids string,
+tls_cert_chain_fuids array<string>,
+tls_client_cert_chain_fuids array<string>,
 tls_subject string,
 tls_issuer string,
 -- SSH
@@ -250,14 +258,17 @@
 irc_value string,
 irc_additional_data string,
 -- Flow
-flow_in_packets int,
-flow_out_packets int,
+flow_in_packets bigint,
+flow_out_packets bigint,
 flow_conn_state string,
 flow_history string,
 flow_src_dscp string,
 flow_dst_dscp string,
 flow_input string,
 flow_output string,
+flow_fwd_status string,
+flow_snmp_in string,
+flow_snmp_out string,
 -- Vulnerability
 vuln_id string,
 vuln_type string,
@@ -291,7 +302,19 @@
 av_signaturesubid string,
 av_intrusionurl string,
 av_intrusionpayloadurl string,
-objectname string)
+objectname string,
+-- Agent
+agent_severity string,
+agent_mac string,
+agent_time bigint,
+agent_id string,
+agent_description string,
+agent_type string,
+agent_ip4 bigint,
+agent_ip4_str string,
+agent_ip6 string,
+agent_ip6_str string,
+agent_host string)
 PARTITIONED BY (
 `p_dvc_vendor` string, -- i.e. Windows, PAN, Fireeye
 `p_dvc_type` string, -- i.e. Unix, Sonicwall, Windows
@@ -299,4 +322,4 @@
 )
 STORED AS AVRO
 LOCATION '${VAR:ODM_LOCATION}'
-TBLPROPERTIES ('avro.schema.url'='${VAR:ODM_AVRO_URL}');
\ No newline at end of file
+TBLPROPERTIES ('avro.schema.url'='${VAR:ODM_AVRO_URL}');
diff --git a/spot-setup/odm/create_event_pqt.sql b/spot-setup/odm/create_event_pqt.sql
index 5a2322d..908bf93 100644
--- a/spot-setup/odm/create_event_pqt.sql
+++ b/spot-setup/odm/create_event_pqt.sql
@@ -55,55 +55,62 @@
 count int,
 company string,
 additional_attrs map<string,string>,
-totrust string,
-fromtrust string,
+to_trust string,
+from_trust string,
 rule string,
 threat string,
 pcap_id int,
+session_id string,
+length int,
 -- Device
 dvc_time bigint,
 dvc_ip4 bigint,
 dvc_ip4_str string,
-dvc_ip6 bigint,
+dvc_ip6 string,
 dvc_ip6_str string,
 dvc_host string,
+dvc_mac string,
 dvc_domain string,
 dvc_type string,
 dvc_vendor string,
 dvc_fwd_ip4 bigint,
-dvc_fwd_ip4_str string, 
-dvc_fwd_ip6 bigint,
-dvc_fwd_ip6_str string, 
+dvc_fwd_ip4_str string,
+dvc_fwd_ip6 string,
+dvc_fwd_ip6_str string,
 dvc_version string,
 -- Network
 src_ip4 bigint,
 src_ip4_str string,
-src_ip6 bigint,
+src_ip6 string,
 src_ip6_str string,
 src_host string,
+src_mac string,
 src_domain string,
 src_port int,
 src_country_code string,
 src_country_name string,
 src_region string,
 src_city string,
-src_lat int,
-src_long int,
+src_lat float,
+src_long float,
+src_asn bigint,
+src_tos string,
 dst_ip4 bigint,
 dst_ip4_str string,
-dst_ip6 bigint,
+dst_ip6 string,
 dst_ip6_str string,
 dst_host string,
+dst_mac string,
 dst_domain string,
 dst_port int,
 dst_country_code string,
 dst_country_name string,
 dst_region string,
 dst_city string,
-dst_lat int,
-dst_long int,
-src_asn int,
-dst_asn int,
+dst_lat float,
+dst_long float,
+dst_asn bigint,
+dst_tos string,
 net_direction string,
 net_flags string,
 -- File
@@ -112,10 +119,11 @@
 file_atime bigint,
 file_acls string,
 file_type string,
-file_size int,
+file_size bigint,
 file_desc string,
 file_hash string,
 file_hash_type string,
+file_uid string,
 -- Endpoint
 end_object string,
 end_action string,
@@ -163,7 +171,7 @@
 http_response_body_len int,
 http_response_info_code int,
 http_response_info_msg string,
-http_response_resp_fuids string,
+http_response_resp_fuids array<string>,
 http_response_mime_types string,
 http_response_headers map<string,string>,
 -- SMTP
@@ -193,7 +201,7 @@
 ftp_command string,
 ftp_arg string,
 ftp_mime_type string,
-ftp_file_size int,
+ftp_file_size bigint,
 ftp_reply_code int,
 ftp_reply_msg string,
 ftp_data_channel_passive boolean,
@@ -224,8 +232,8 @@
 tls_resumed boolean,
 tls_next_protocol string,
 tls_established boolean,
-tls_cert_chain_fuids string,
-tls_client_cert_chain_fuids string,
+tls_cert_chain_fuids array<string>,
+tls_client_cert_chain_fuids array<string>,
 tls_subject string,
 tls_issuer string,
 -- SSH
@@ -258,6 +266,9 @@
 flow_dst_dscp string,
 flow_input string,
 flow_output string,
+flow_fwd_status string,
+flow_snmp_in string,
+flow_snmp_out string,
 -- Vulnerability
 vuln_id string,
 vuln_type string,
@@ -291,11 +302,23 @@
 av_signaturesubid string,
 av_intrusionurl string,
 av_intrusionpayloadurl string,
-objectname string)
+objectname string,
+-- Agent
+agent_severity string,
+agent_mac string,
+agent_time bigint,
+agent_id string,
+agent_description string,
+agent_type string,
+agent_ip4 bigint,
+agent_ip4_str string,
+agent_ip6 string,
+agent_ip6_str string,
+agent_host string)
 PARTITIONED BY (
 `p_dvc_vendor` string, -- i.e. Windows, PAN, Fireeye
 `p_dvc_type` string, -- i.e. Unix, Sonicwall, Windows
 `p_dt` string -- i.e. 2017-01-01
 )
 STORED AS PARQUET
-LOCATION '${VAR:ODM_LOCATION}';
\ No newline at end of file
+LOCATION '${VAR:ODM_LOCATION}';
diff --git a/spot-setup/odm/event.avsc b/spot-setup/odm/event.avsc
index 7765d67..f4a0873 100644
--- a/spot-setup/odm/event.avsc
+++ b/spot-setup/odm/event.avsc
@@ -41,54 +41,61 @@
         {"name":"sensitivity", "type":["null","string"],"doc":"Sensitivity label", "default": null},
         {"name":"count", "type":["null","int"],"doc":"Generic count", "default": null},
         {"name":"company", "type":["null","string"],"doc":"Company label", "default": null},
-        {"name":"additional_attrs","type":["null", {"type": "map", "values": "string"}],"default":null, "doc":"Additional attributes of the event"},
-        {"name":"totrust", "type":["null","string"],"doc":"TBD", "default": null},
-        {"name":"fromtrust", "type":["null","string"],"doc":"TBD", "default": null},
+        {"name":"additional_attrs","type":["null", "string"],"default":null, "doc":"Additional attributes of the event"},
+        {"name":"to_trust", "type":["null","string"],"doc":"TBD", "default": null},
+        {"name":"from_trust", "type":["null","string"],"doc":"TBD", "default": null},
         {"name":"rule", "type":["null","string"],"doc":"TBD", "default": null},
         {"name":"threat", "type":["null","string"],"doc":"TBD", "default": null},
         {"name":"pcap_id", "type":["null","int"],"doc":"TBD", "default": null},
+        {"name":"session_id", "type":["null","string"],"doc":"TBD", "default": null},
+        {"name":"length", "type":["null","int"],"doc":"TBD", "default": null},
         {"name":"dvc_time", "type":["null","long"],"doc":"UTC timestamp from device where event/alert originates or is received", "default": null},
         {"name":"dvc_ip4", "type":["null","long"],"doc":"IP address of device", "default": null},
         {"name":"dvc_ip4_str", "type":["null","string"],"doc":"IP address of device", "default": null},
-        {"name":"dvc_ip6", "type":["null","long"],"doc":"IP address of device", "default": null},
+        {"name":"dvc_ip6", "type":["null","string"],"doc":"IP address of device", "default": null},
         {"name":"dvc_ip6_str", "type":["null","string"],"doc":"IP address of device", "default": null},
         {"name":"dvc_host", "type":["null","string"],"doc":"Hostname of device", "default": null},
+        {"name":"dvc_mac", "type":["null","string"],"doc":"TBD", "default": null},
         {"name":"dvc_domain", "type":["null","string"],"doc":"Domain of device", "default": null},
         {"name":"dvc_type", "type":["null","string"],"doc":"Device type that generated the log", "default": null},
         {"name":"dvc_vendor", "type":["null","string"],"doc":"Vendor", "default": null},
         {"name":"dvc_fwd_ip4", "type":["null","long"],"doc":"Forwarded from device", "default": null},
         {"name":"dvc_fwd_ip4_str", "type":["null","string"],"doc":"Forwarded from device", "default": null},
-        {"name":"dvc_fwd_ip6", "type":["null","long"],"doc":"Forwarded from device", "default": null},
+        {"name":"dvc_fwd_ip6", "type":["null","string"],"doc":"Forwarded from device", "default": null},
         {"name":"dvc_fwd_ip6_str", "type":["null","string"],"doc":"Forwarded from device", "default": null},
         {"name":"dvc_version", "type":["null","string"],"doc":"TBD", "default": null},
         {"name":"src_ip4", "type":["null","long"],"doc":"Source ip address of event", "default": null},
         {"name":"src_ip4_str", "type":["null","string"],"doc":"Source ip address of event", "default": null},
-        {"name":"src_ip6", "type":["null","long"],"doc":"Source ip address of event", "default": null},
+        {"name":"src_ip6", "type":["null","string"],"doc":"Source ip address of event", "default": null},
         {"name":"src_ip6_str", "type":["null","string"],"doc":"Source ip address of event", "default": null},
         {"name":"src_host", "type":["null","string"],"doc":"Source FQDN of event", "default": null},
+        {"name":"src_mac", "type":["null","string"],"doc":"TBD", "default": null},
         {"name":"src_domain", "type":["null","string"],"doc":"Domain name of source address", "default": null},
         {"name":"src_port", "type":["null","int"],"doc":"Source port of event", "default": null},
         {"name":"src_country_code", "type":["null","string"],"doc":"Source country code", "default": null},
         {"name":"src_country_name", "type":["null","string"],"doc":"Source country name", "default": null},
         {"name":"src_region", "type":["null","string"],"doc":"Source region", "default": null},
         {"name":"src_city", "type":["null","string"],"doc":"Source city", "default": null},
-        {"name":"src_lat", "type":["null","int"],"doc":"Source latitude", "default": null},
-        {"name":"src_long", "type":["null","int"],"doc":"Source longitude", "default": null},
+        {"name":"src_lat", "type":["null","float"],"doc":"Source latitude", "default": null},
+        {"name":"src_long", "type":["null","float"],"doc":"Source longitude", "default": null},
+        {"name":"src_asn", "type":["null","long"],"doc":"Autonomous system number", "default": null},
+        {"name":"src_tos", "type":["null","string"],"doc":"TBD", "default": null},
         {"name":"dst_ip4", "type":["null","long"],"doc":"Destination ip address of event", "default": null},
         {"name":"dst_ip4_str", "type":["null","string"],"doc":"Destination ip address of event", "default": null},
-        {"name":"dst_ip6", "type":["null","long"],"doc":"Destination ip address of event", "default": null},
+        {"name":"dst_ip6", "type":["null","string"],"doc":"Destination ip address of event", "default": null},
         {"name":"dst_ip6_str", "type":["null","string"],"doc":"Destination ip address of event", "default": null},
         {"name":"dst_host", "type":["null","string"],"doc":"Destination FQDN of event", "default": null},
+        {"name":"dst_mac", "type":["null","string"],"doc":"TBD", "default": null},
         {"name":"dst_domain", "type":["null","string"],"doc":"Domain name of destination address", "default": null},
         {"name":"dst_port", "type":["null","int"],"doc":"Destination port of event", "default": null},
         {"name":"dst_country_code", "type":["null","string"],"doc":"Source country code", "default": null},
         {"name":"dst_country_name", "type":["null","string"],"doc":"Source country name", "default": null},
         {"name":"dst_region", "type":["null","string"],"doc":"Source region", "default": null},
         {"name":"dst_city", "type":["null","string"],"doc":"Source city", "default": null},
-        {"name":"dst_lat", "type":["null","int"],"doc":"Source latitude", "default": null},
-        {"name":"dst_long", "type":["null","int"],"doc":"Source longitude", "default": null},
-        {"name":"src_asn", "type":["null","int"],"doc":"Autonomous system number", "default": null},
-        {"name":"dst_asn", "type":["null","int"],"doc":"Autonomous system number", "default": null},
+        {"name":"dst_lat", "type":["null","float"],"doc":"Destination latitude", "default": null},
+        {"name":"dst_long", "type":["null","float"],"doc":"Destination longitude", "default": null},
+        {"name":"dst_asn", "type":["null","long"],"doc":"Autonomous system number", "default": null},
+        {"name":"dst_tos", "type":["null","string"],"doc":"TBD", "default": null},
         {"name":"net_direction", "type":["null","string"],"doc":"Direction", "default": null},
         {"name":"net_flags", "type":["null","string"],"doc":"TCP flags", "default": null},
         {"name":"file_name", "type":["null","string"],"doc":"Filename from event", "default": null},
@@ -96,10 +103,11 @@
         {"name":"file_atime", "type":["null","long"],"doc":"Timestamp (UTC) of file access", "default": null},
         {"name":"file_acls", "type":["null","string"],"doc":"File permissions", "default": null},
         {"name":"file_type", "type":["null","string"],"doc":"Type of file", "default": null},
-        {"name":"file_size", "type":["null","int"],"doc":"Size of file in bytes", "default": null},
+        {"name":"file_size", "type":["null","long"],"doc":"Size of file in bytes", "default": null},
         {"name":"file_desc", "type":["null","string"],"doc":"Description of file", "default": null},
         {"name":"file_hash", "type":["null","string"],"doc":"Hash of file", "default": null},
         {"name":"file_hash_type", "type":["null","string"],"doc":"Type of hash", "default": null},
+        {"name":"file_uid", "type":["null","string"],"doc":"File identifier", "default": null},
         {"name":"end_object", "type":["null","string"],"doc":"File/Process/ Registry", "default": null},
         {"name":"end_action", "type":["null","string"],"doc":"Action taken on object (open/delete/ edit)", "default": null},
         {"name":"end_msg", "type":["null","string"],"doc":"Message (details of action taken on object)", "default": null},
@@ -170,7 +178,7 @@
         {"name":"ftp_command", "type":["null","string"],"doc":"FTP command", "default": null},
         {"name":"ftp_arg", "type":["null","string"],"doc":"Argument", "default": null},
         {"name":"ftp_mime_type", "type":["null","string"],"doc":"Mime type", "default": null},
-        {"name":"ftp_file_size", "type":["null","int"],"doc":"File size", "default": null},
+        {"name":"ftp_file_size", "type":["null","long"],"doc":"File size", "default": null},
         {"name":"ftp_reply_code", "type":["null","int"],"doc":"Reply code", "default": null},
         {"name":"ftp_reply_msg", "type":["null","string"],"doc":"Reply message", "default": null},
         {"name":"ftp_data_channel_passive", "type":["null","boolean"],"doc":"Passive data channel?", "default": null},
@@ -229,6 +237,9 @@
         {"name":"flow_dst_dscp", "type":["null","string"],"doc":"TBD", "default": null},
         {"name":"flow_input", "type":["null","string"],"doc":"TBD", "default": null},
         {"name":"flow_output", "type":["null","string"],"doc":"TBD", "default": null},
+        {"name":"flow_fwd_status", "type":["null","string"],"doc":"TBD", "default": null},
+        {"name":"flow_snmp_in", "type":["null","string"],"doc":"TBD", "default": null},
+        {"name":"flow_snmp_out", "type":["null","string"],"doc":"TBD", "default": null},
         {"name":"vuln_id", "type":["null","string"],"doc":"TBD", "default": null},
         {"name":"vuln_type", "type":["null","string"],"doc":"TBD", "default": null},
         {"name":"vuln_status", "type":["null","string"],"doc":"TBD", "default": null},
@@ -260,7 +271,18 @@
         {"name":"av_signaturesubid", "type":["null","string"],"doc":"TBD", "default": null},
         {"name":"av_intrusionurl", "type":["null","string"],"doc":"TBD", "default": null},
         {"name":"av_intrusionpayloadurl", "type":["null","string"],"doc":"TBD", "default": null},
-        {"name":"objectname", "type":["null","string"],"doc":"TBD", "default": null}
+        {"name":"objectname", "type":["null","string"],"doc":"TBD", "default": null},
+        {"name":"agent_severity", "type":["null","string"],"doc":"TBD", "default": null},
+        {"name":"agent_mac", "type":["null","string"],"doc":"TBD", "default": null},
+        {"name":"agent_time", "type":["null","long"],"doc":"TBD", "default": null},
+        {"name":"agent_id", "type":["null","string"],"doc":"TBD", "default": null},
+        {"name":"agent_description", "type":["null","string"],"doc":"TBD", "default": null},
+        {"name":"agent_type", "type":["null","string"],"doc":"TBD", "default": null},
+        {"name":"agent_ip4", "type":["null","long"],"doc":"TBD", "default": null},
+        {"name":"agent_ip4_str", "type":["null","string"],"doc":"TBD", "default": null},
+        {"name":"agent_ip6", "type":["null","string"],"doc":"TBD", "default": null},
+        {"name":"agent_ip6_str", "type":["null","string"],"doc":"TBD", "default": null},
+        {"name":"agent_host", "type":["null","string"],"doc":"TBD", "default": null}
      ],
      "doc": "A view schema for storing Apache Spot Event data."
   }
diff --git a/spot-setup/odm/odm_setup.sh b/spot-setup/odm/odm_setup.sh
index 7fc10bd..7a48149 100755
--- a/spot-setup/odm/odm_setup.sh
+++ b/spot-setup/odm/odm_setup.sh
@@ -92,7 +92,7 @@
 
 # Check the format argument and make sure its supported
 if [ "$format" != "pqt" ] && [ "$format" != "avro" ] ; then
-    log "Format argument '$format' is not supported. Only Parquet and Avro are supported data storage formats. Use 'pqt' or 'avro'  instead (i.e. ./odm_setup pqt)."
+    log "Format argument '$format' is not supported. Only Parquet and Avro are supported data storage formats. Use 'pqt' or 'avro'  instead (i.e. ./odm_setup -f pqt)."
     exit 1
 fi
 
@@ -194,4 +194,4 @@
             ${impala_db_shell} --var=ODM_DBNAME=${DBNAME} --var=ODM_TABLENAME=${f} --var=ODM_LOCATION=${HUSER}/${d}/${f} --var=ODM_AVRO_URL=hdfs://${HUSER}/${d}/schema/${f}.avsc -c -f ${ODM_FILES_DIR}/create_${f}_avro.sql
         fi
 	done
-done
\ No newline at end of file
+done