TEZ-4098: tez-tools improvements: log-split, swimlane (László Bodor, reviewed by rbalamohan)
diff --git a/tez-tools/swimlanes/swimlane.py b/tez-tools/swimlanes/swimlane.py
index bbd54df..11976da 100644
--- a/tez-tools/swimlanes/swimlane.py
+++ b/tez-tools/swimlanes/swimlane.py
@@ -195,6 +195,7 @@
 			svg.text(marginRight+xdomain(percentX), y+marginTop+12, "%d%% (%0.1fs)" % (int(fraction*100), (percentX - dag.start)/1000.0), style="font-size:12px; text-anchor: middle")
 	out.write(svg.flush())
 	out.close()
+	print("Output svg is written into: " + str(out))
 
 if __name__ == "__main__":
 	sys.exit(main(sys.argv[1:]))
diff --git a/tez-tools/swimlanes/yarn-swimlanes.sh b/tez-tools/swimlanes/yarn-swimlanes.sh
index df4d071..02465b0 100644
--- a/tez-tools/swimlanes/yarn-swimlanes.sh
+++ b/tez-tools/swimlanes/yarn-swimlanes.sh
@@ -19,10 +19,25 @@
 set -e
 
 APPID=$1
-
-YARN=$(which yarn);
+if [[ -z "$APPID" ]]; then
+    echo "usage: $(basename "$0") <application_id | yarn-log-file>" >&2
+    exit 1
+fi
+
 TMP=$(mktemp)
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 
-echo "Fetching yarn logs for $APPID"
-$YARN logs -applicationId $APPID | grep HISTORY > $TMP 
-python swimlane.py -o $APPID.svg $TMP
+# The argument is either a local file holding "yarn logs" output or an application id.
+if [[ -f $APPID ]]; then
+    echo "Reading yarn logs from local file: $APPID"
+    # grep reads the file itself; piping it through cat is unnecessary
+    grep HISTORY "$APPID" > "$TMP"
+else
+    YARN=$(which yarn);
+    echo "Fetching yarn logs for $APPID"
+    "$YARN" logs -applicationId "$APPID" | grep HISTORY > "$TMP"
+fi
+echo "History was written into $TMP"
+
+# swimlane.py is looked up next to this script, so the working directory does not matter.
+python "$DIR/swimlane.py" -o "$APPID.svg" "$TMP"
diff --git a/tez-tools/tez-log-split/README.md b/tez-tools/tez-log-split/README.md
new file mode 100644
index 0000000..a7341a7
--- /dev/null
+++ b/tez-tools/tez-log-split/README.md
@@ -0,0 +1,77 @@
+<!--
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License. See accompanying LICENSE file.
+-->
+
+Tez log splitter
+=========
+
+This is a post-hoc analysis tool for Apache Tez which splits
+an aggregated yarn log file into separate files arranged in a hierarchical folder structure.
+
+```
+.
+├── vc0525.your.domain.com_8041
+│   └── container_e10_1575565459633_0004_01_000001
+│       ├── container-localizer-syslog
+│       ├── dag_1575565459633_0004_1-tez-dag.pb.txt
+│       ├── dag_1575565459633_0004_1.dot
+│       ├── prelaunch.err
+│       ├── prelaunch.out
+│       ├── stderr
+│       ├── stdout
+│       ├── syslog
+│       ├── syslog_dag_1575565459633_0004_1
+│       └── syslog_dag_1575565459633_0004_1_post
+├── vc0526.your.domain.com_8041
+│   └── container_e10_1575565459633_0004_01_000004
+│       ├── container-localizer-syslog
+│       ├── prelaunch.err
+│       ├── prelaunch.out
+│       ├── stderr
+│       ├── stdout
+│       ├── syslog
+│       └── syslog_attempt_1575565459633_0004_1_00_000000_2
+├── vc0528.your.domain.com_8041
+│   └── container_e10_1575565459633_0004_01_000002
+│       ├── container-localizer-syslog
+│       ├── prelaunch.err
+│       ├── prelaunch.out
+│       ├── stderr
+│       ├── stdout
+│       ├── syslog
+│       └── syslog_attempt_1575565459633_0004_1_00_000000_0
+├── vc0529.your.domain.com_8041
+│   └── container_e10_1575565459633_0004_01_000005
+│       ├── container-localizer-syslog
+│       ├── prelaunch.err
+│       ├── prelaunch.out
+│       ├── stderr
+│       ├── stdout
+│       ├── syslog
+│       └── syslog_attempt_1575565459633_0004_1_00_000000_3
+└── vc0536.your.domain.com_8041
+    └── container_e10_1575565459633_0004_01_000003
+        ├── container-localizer-syslog
+        ├── prelaunch.err
+        ├── prelaunch.out
+        ├── stderr
+        ├── stdout
+        ├── syslog
+        └── syslog_attempt_1575565459633_0004_1_00_000000_1
+```
+
+To use the tool, run e.g.
+
+- `tez-log-splitter.sh application_1576254620247_0010`  (app log is fetched from yarn)
+- `tez-log-splitter.sh ~/path/to/application_1576254620247_0010.log`  (...when app log is already on your computer)
+- `tez-log-splitter.sh ~/path/to/application_1576254620247_0010.log.gz`  (...when app log is already on your computer in gz)
diff --git a/tez-tools/tez-log-split/logsplit.py b/tez-tools/tez-log-split/logsplit.py
new file mode 100644
index 0000000..47e17da
--- /dev/null
+++ b/tez-tools/tez-log-split/logsplit.py
@@ -0,0 +1,165 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+"""Split an aggregated YARN application log into one file per container log.
+
+The input is the text printed by "yarn logs -applicationId <application_...>",
+either plain or gzip-compressed (".gz"). The split files are written to
+"<log-file>_splitlogs/<host>/<container>/<log type>".
+"""
+
+import io
+import sys
+import os
+import re
+from gzip import GzipFile as GZFile
+from getopt import getopt
+
+def usage():
+    """Write the command line help to stderr."""
+    sys.stderr.write("""
+usage: logsplit.py <log-file>
+
+Input files for this tool can be prepared by "yarn logs -applicationId <application_...>".
+""")
+
+def open_file(f):
+    """Open the log file at path f for reading text lines.
+
+    Paths ending in ".gz" are decompressed on the fly. GzipFile yields bytes on
+    Python 3, which the str patterns of AggregatedLog cannot match, so there the
+    stream is wrapped to yield str just like the plain open() branch does.
+    """
+    if f.endswith(".gz"):
+        gz_file = GZFile(f)
+        if sys.version_info[0] >= 3:
+            return io.TextIOWrapper(gz_file)
+        return gz_file
+    return open(f)
+
+class AggregatedLog(object):
+    """Line-by-line state machine that writes every container log into its own file.
+
+    Input lines it reacts to (see the patterns in __init__):
+      "Container: <container> on <host>"  starts a container section
+      "LogType:<name>"                    names the log file that follows
+      "LogContents:"                      last header row, content starts on the next line
+      "End of LogType:..."                ends the log file and the container section
+    """
+
+    def __init__(self):
+        self.in_container = False # True between a "Container:" header and "End of LogType:"
+        self.in_logfile = False # True while reading the content of a log file
+        self.current_container_header = None # the "Container: ..." line of the current section
+        self.current_container_name = None
+        self.current_host_name = None # as read from log line: "hello.my.host.com_8041"
+        self.current_file = None # output file of the log currently being written
+        self.output_folder = None # root output folder, set by process()
+        self.HEADER_CONTAINER_RE = re.compile("Container: (container_[a-z0-9_]+) on (.*)")
+        self.HEADER_LAST_ROW_RE = re.compile("^LogContents:$")
+        self.HEADER_LOG_TYPE_RE = re.compile("^LogType:(.*)")
+        self.LAST_LOG_LINE_RE = re.compile("^End of LogType:.*")
+
+    def process(self, input_file):
+        """Split all lines of input_file into the folder "<input_file.name>_splitlogs".
+
+        input_file is an iterable of text lines that has a "name" attribute.
+        """
+        self.output_folder = input_file.name + "_splitlogs"
+        # Reuse the folder of a previous run instead of failing; its files get overwritten.
+        if not os.path.isdir(self.output_folder):
+            os.mkdir(self.output_folder)
+
+        try:
+            for line in input_file:
+                self.parse(line)
+        finally:
+            # A truncated input can end without "End of LogType:", leaving the last file open.
+            if self.current_file is not None and not self.current_file.closed:
+                self.current_file.close()
+
+    def parse(self, line):
+        """Advance the state machine by one input line (trailing newline included)."""
+        if self.in_container:
+            if self.in_logfile:
+                m = self.LAST_LOG_LINE_RE.match(line)
+                if m:
+                    # End of this log file: go back to waiting for the next "Container:" header.
+                    self.in_container = False
+                    self.in_logfile = False
+                    self.current_file.close()
+                else:
+                    self.write_to_current_file(line)
+            else:
+                m = self.HEADER_LOG_TYPE_RE.match(line)
+                if m:
+                    file_name = m.group(1)
+                    self.create_file_in_current_container(file_name)
+                elif self.HEADER_LAST_ROW_RE.match(line):
+                    self.in_logfile = True
+                    self.write_to_current_file(self.current_container_header) #for host reference
+        else:
+            m = self.HEADER_CONTAINER_RE.match(line)
+            if m:
+                self.in_container = True
+                self.current_container_header = line
+                self.current_container_name = m.group(1)
+                self.current_host_name = m.group(2)
+                self.start_container_folder()
+
+    def start_container_folder(self):
+        """Create the "<host>/<container>" folder of the current container if it is missing."""
+        container_dir = os.path.join(self.output_folder, self.get_current_container_dir_name())
+        if not os.path.exists(container_dir):
+            os.makedirs(container_dir)
+
+    def create_file_in_current_container(self, file_name):
+        """Open file_name inside the current container folder as the current output file.
+
+        An existing file of the same name is truncated.
+        """
+        file_to_be_created = os.path.join(self.output_folder, self.get_current_container_dir_name(), file_name)
+        self.current_file = open(file_to_be_created, "w+")
+
+    def write_to_current_file(self, line):
+        """Append line to the current output file."""
+        self.current_file.write(line)
+
+    def get_current_container_dir_name(self):
+        """Return "<host>/<container>", relative to the output folder."""
+        return os.path.join(self.current_host_name, self.current_container_name)
+
+def main(argv):
+    """Split the aggregated log file given as the first positional argument.
+
+    Returns the process exit code: 0 on success, 1 when no log file was given.
+    """
+    (opts, args) = getopt(argv, "")
+    if not args:
+        usage()
+        return 1
+    aggregated_log = AggregatedLog()
+    # The with block closes the input even when splitting fails half-way.
+    with open_file(args[0]) as fp:
+        aggregated_log.process(fp)
+    print ("Split application logs was written into folder " + aggregated_log.output_folder)
+    return 0
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv[1:]))
diff --git a/tez-tools/tez-log-split/tez-log-splitter.sh b/tez-tools/tez-log-split/tez-log-splitter.sh
new file mode 100644
index 0000000..712e499
--- /dev/null
+++ b/tez-tools/tez-log-split/tez-log-splitter.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Splits an aggregated yarn application log into per-container files via logsplit.py.
+# The argument is either an application id (the log is fetched with "yarn logs")
+# or the path of an already downloaded log file (plain or .gz).
+
+#set -e
+
+APPID=$1
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+if [[ -z "$APPID" ]]; then
+    echo "usage: $(basename "$0") <application_id | yarn-log-file>" >&2
+    exit 1
+fi
+
+if [[ -f $APPID ]]; then
+    echo "Reading yarn logs from local file: $APPID"
+    TMP=$APPID
+else
+    YARN=$(which yarn);
+    echo "Fetching yarn logs for $APPID"
+    # Create the temp file only here so that reading a local file leaves no stray temp file behind.
+    TMP=$(mktemp)
+    "$YARN" logs -applicationId "$APPID" > "$TMP"
+    echo "Application log was written into $TMP"
+fi
+
+python "$DIR/logsplit.py" "$TMP"