TEZ-4098: tez-tools improvements: log-split, swimlane (László Bodor, reviewed by rbalamohan)
diff --git a/tez-tools/swimlanes/swimlane.py b/tez-tools/swimlanes/swimlane.py
index bbd54df..11976da 100644
--- a/tez-tools/swimlanes/swimlane.py
+++ b/tez-tools/swimlanes/swimlane.py
@@ -195,6 +195,7 @@
svg.text(marginRight+xdomain(percentX), y+marginTop+12, "%d%% (%0.1fs)" % (int(fraction*100), (percentX - dag.start)/1000.0), style="font-size:12px; text-anchor: middle")
out.write(svg.flush())
out.close()
+ print("Output svg is written into: " + str(out))
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))
diff --git a/tez-tools/swimlanes/yarn-swimlanes.sh b/tez-tools/swimlanes/yarn-swimlanes.sh
index df4d071..02465b0 100644
--- a/tez-tools/swimlanes/yarn-swimlanes.sh
+++ b/tez-tools/swimlanes/yarn-swimlanes.sh
@@ -19,10 +19,17 @@
set -e
APPID=$1
-
-YARN=$(which yarn);
TMP=$(mktemp)
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
-echo "Fetching yarn logs for $APPID"
-$YARN logs -applicationId $APPID | grep HISTORY > $TMP
-python swimlane.py -o $APPID.svg $TMP
+if [[ -f $APPID ]]; then
+ echo "Reading yarn logs from local file: $APPID"
+ cat "$APPID" | grep HISTORY > "$TMP"
+else
+ YARN=$(which yarn);
+ echo "Fetching yarn logs for $APPID"
+ $YARN logs -applicationId "$APPID" | grep HISTORY > "$TMP"
+fi
+echo "History was written into $TMP"
+
+python "$DIR/swimlane.py" -o "$APPID.svg" "$TMP"
\ No newline at end of file
diff --git a/tez-tools/tez-log-split/README.md b/tez-tools/tez-log-split/README.md
new file mode 100644
index 0000000..a7341a7
--- /dev/null
+++ b/tez-tools/tez-log-split/README.md
@@ -0,0 +1,77 @@
+<!--
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. See accompanying LICENSE file.
+-->
+
+Tez log splitter
+=========
+
+This is a post-hoc analysis tool for Apache Tez which splits an aggregated
+YARN log file into separate files arranged in a hierarchical folder structure.
+
+```
+.
+├── vc0525.your.domain.com_8041
+│ └── container_e10_1575565459633_0004_01_000001
+│ ├── container-localizer-syslog
+│ ├── dag_1575565459633_0004_1-tez-dag.pb.txt
+│ ├── dag_1575565459633_0004_1.dot
+│ ├── prelaunch.err
+│ ├── prelaunch.out
+│ ├── stderr
+│ ├── stdout
+│ ├── syslog
+│ ├── syslog_dag_1575565459633_0004_1
+│ └── syslog_dag_1575565459633_0004_1_post
+├── vc0526.your.domain.com_8041
+│ └── container_e10_1575565459633_0004_01_000004
+│ ├── container-localizer-syslog
+│ ├── prelaunch.err
+│ ├── prelaunch.out
+│ ├── stderr
+│ ├── stdout
+│ ├── syslog
+│ └── syslog_attempt_1575565459633_0004_1_00_000000_2
+├── vc0528.your.domain.com_8041
+│ └── container_e10_1575565459633_0004_01_000002
+│ ├── container-localizer-syslog
+│ ├── prelaunch.err
+│ ├── prelaunch.out
+│ ├── stderr
+│ ├── stdout
+│ ├── syslog
+│ └── syslog_attempt_1575565459633_0004_1_00_000000_0
+├── vc0529.your.domain.com_8041
+│ └── container_e10_1575565459633_0004_01_000005
+│ ├── container-localizer-syslog
+│ ├── prelaunch.err
+│ ├── prelaunch.out
+│ ├── stderr
+│ ├── stdout
+│ ├── syslog
+│ └── syslog_attempt_1575565459633_0004_1_00_000000_3
+└── vc0536.your.domain.com_8041
+ └── container_e10_1575565459633_0004_01_000003
+ ├── container-localizer-syslog
+ ├── prelaunch.err
+ ├── prelaunch.out
+ ├── stderr
+ ├── stdout
+ ├── syslog
+ └── syslog_attempt_1575565459633_0004_1_00_000000_1
+```
+
+To use the tool, run one of the following:
+
+- `tez-log-splitter.sh application_1576254620247_0010` (the app log is fetched from YARN)
+- `tez-log-splitter.sh ~/path/to/application_1576254620247_0010.log` (when the app log is already on your computer)
+- `tez-log-splitter.sh ~/path/to/application_1576254620247_0010.log.gz` (when the app log is already on your computer, gzip-compressed)
diff --git a/tez-tools/tez-log-split/logsplit.py b/tez-tools/tez-log-split/logsplit.py
new file mode 100644
index 0000000..47e17da
--- /dev/null
+++ b/tez-tools/tez-log-split/logsplit.py
@@ -0,0 +1,111 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+import sys
+import os
+import re
+from gzip import GzipFile as GZFile
+from getopt import getopt
+
+def usage():
+    """Write a short usage message for the tool to standard error."""
+    message = """
+usage: logsplit.py <log-file>
+
+Input files for this tool can be prepared by "yarn logs -applicationId <application_...>".
+"""
+    sys.stderr.write(message)
+
+def open_file(f):
+    """Open the aggregated log at path ``f`` for line-by-line reading.
+
+    Paths ending in ".gz" are treated as gzip-compressed; anything else is
+    opened as a regular file. The returned object is iterable by line and
+    exposes a ``name`` attribute.
+    """
+    if f.endswith(".gz"):
+        if sys.version_info[0] >= 3:
+            # GzipFile yields bytes on Python 3, which the str regexes and the
+            # text-mode output files used by AggregatedLog cannot handle, so
+            # decompress in text mode there.
+            import gzip
+            return gzip.open(f, "rt")
+        return GZFile(f)
+    return open(f)
+
+class AggregatedLog(object):
+    """Splits an aggregated yarn log into one file per container log.
+
+    Lines are fed one by one through a small state machine (see ``parse``).
+    Output is written to
+    ``<input file name>_splitlogs/<host>/<container>/<log type>``.
+    """
+
+    def __init__(self):
+        self.in_container = False  # a "Container: ..." header line has been seen
+        self.in_logfile = False  # currently between "LogContents:" and "End of LogType:"
+        self.current_container_header = None  # last line read while outside a container
+        self.current_container_name = None
+        self.current_host_name = None # as read from log line: "hello.my.host.com_8041"
+        self.current_file = None  # output file of the log currently being copied
+        self.HEADER_CONTAINER_RE = re.compile("Container: (container_[a-z0-9_]+) on (.*)")
+        self.HEADER_LAST_ROW_RE = re.compile("^LogContents:$")
+        self.HEADER_LOG_TYPE_RE = re.compile("^LogType:(.*)")
+        self.LAST_LOG_LINE_RE = re.compile("^End of LogType:.*")
+
+    def process(self, input_file):
+        """Split all lines of ``input_file`` into per-container log files.
+
+        ``input_file`` must be iterable by line and have a ``name`` attribute;
+        the output folder is that name with "_splitlogs" appended and is
+        stored in ``self.output_folder``.
+        """
+        self.output_folder = input_file.name + "_splitlogs"
+        # Tolerate an existing folder so the tool can be re-run on the same log.
+        if not os.path.isdir(self.output_folder):
+            os.mkdir(self.output_folder)
+
+        try:
+            for line in input_file:
+                self.parse(line)
+        finally:
+            # A truncated log may lack the closing "End of LogType:" line;
+            # do not leave the last output file open in that case.
+            self._close_current_file()
+
+    def parse(self, line):
+        """Consume one input line and advance the state machine.
+
+        Outside a container only the "Container: ... on ..." header is
+        recognised. Inside a container, "LogType:" selects the output file and
+        "LogContents:" starts the copy. "End of LogType:" ends the copy and
+        leaves the container, so the next log is only picked up after another
+        container header line.
+        """
+        if self.in_container:
+            if self.in_logfile:
+                if self.LAST_LOG_LINE_RE.match(line):
+                    self.in_container = False
+                    self.in_logfile = False
+                    self._close_current_file()
+                else:
+                    self.write_to_current_file(line)
+            else:
+                m = self.HEADER_LOG_TYPE_RE.match(line)
+                if m:
+                    file_name = m.group(1)
+                    self.create_file_in_current_container(file_name)
+                elif self.HEADER_LAST_ROW_RE.match(line):
+                    self.in_logfile = True
+                    self.write_to_current_file(self.current_container_header) #for host reference
+        else:
+            m = self.HEADER_CONTAINER_RE.match(line)
+            self.current_container_header = line
+            if m:
+                self.in_container = True
+                self.current_container_name = m.group(1)
+                self.current_host_name = m.group(2)
+                self.start_container_folder()
+
+    def start_container_folder(self):
+        """Create the folder of the current host/container if it is missing."""
+        container_dir = os.path.join(self.output_folder, self.get_current_container_dir_name())
+        if not os.path.exists(container_dir):
+            os.makedirs(container_dir)
+
+    def create_file_in_current_container(self, file_name):
+        """Open ``file_name`` inside the current container folder for writing."""
+        # Do not leak a previous output file that never saw its end marker.
+        self._close_current_file()
+        file_to_be_created = os.path.join(self.output_folder, self.get_current_container_dir_name(), file_name)
+        self.current_file = open(file_to_be_created, "w+")
+
+    def write_to_current_file(self, line):
+        """Append ``line`` to the output file of the current log."""
+        self.current_file.write(line)
+
+    def get_current_container_dir_name(self):
+        """Return the "<host>/<container>" path relative to the output folder."""
+        return os.path.join(self.current_host_name, self.current_container_name)
+
+    def _close_current_file(self):
+        """Close the current output file, if one was ever opened."""
+        if self.current_file is not None:
+            self.current_file.close()
+
+def main(argv):
+    """Split the aggregated log named by the first positional argument.
+
+    ``argv`` is the argument list without the program name. Returns 0 on
+    success and 1 (after printing the usage text) when the arguments cannot
+    be used.
+    """
+    from getopt import GetoptError
+
+    try:
+        _, args = getopt(argv, "")
+    except GetoptError as e:
+        sys.stderr.write(str(e) + "\n")
+        usage()
+        return 1
+    if not args:
+        # Without this check a missing argument surfaced as an IndexError.
+        usage()
+        return 1
+
+    fp = open_file(args[0])
+    try:
+        aggregated_log = AggregatedLog()
+        aggregated_log.process(fp)
+    finally:
+        # Close the input even when splitting fails half-way.
+        fp.close()
+    print ("Split application logs was written into folder " + aggregated_log.output_folder)
+    return 0
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv[1:]))
diff --git a/tez-tools/tez-log-split/tez-log-splitter.sh b/tez-tools/tez-log-split/tez-log-splitter.sh
new file mode 100644
index 0000000..712e499
--- /dev/null
+++ b/tez-tools/tez-log-split/tez-log-splitter.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+#set -e
+
+# Splits an aggregated yarn application log with logsplit.py.
+# The single argument is either an application id (the log is fetched with
+# "yarn logs") or the path of an already downloaded log file.
+if [[ $# -lt 1 ]]; then
+  echo "usage: $(basename "$0") <application_id | path/to/application.log[.gz]>" >&2
+  exit 1
+fi
+
+APPID=$1
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+if [[ -f "$APPID" ]]; then
+  echo "Reading yarn logs from local file: $APPID"
+  TMP=$APPID
+else
+  # Fail early instead of running an empty command name when yarn is missing.
+  if ! YARN=$(command -v yarn); then
+    echo "yarn command not found in PATH" >&2
+    exit 1
+  fi
+  # Create the temp file only on this path, so that reading a local file
+  # does not leave an unused temp file behind.
+  TMP=$(mktemp)
+  echo "Fetching yarn logs for $APPID"
+  "$YARN" logs -applicationId "$APPID" > "$TMP"
+  echo "Application log was written into $TMP"
+fi
+
+python "$DIR/logsplit.py" "$TMP"
\ No newline at end of file