| /** |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * <p/> |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * <p/> |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.tez.analyzer.plugins; |
| |
| import com.google.common.base.Predicate; |
| import com.google.common.collect.Iterables; |
| import com.google.common.collect.Lists; |
| import com.google.common.collect.Multimap; |
| |
| import org.apache.commons.logging.Log; |
| import org.apache.commons.logging.LogFactory; |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.hadoop.util.ToolRunner; |
| import org.apache.tez.analyzer.Analyzer; |
| import org.apache.tez.analyzer.CSVResult; |
| import org.apache.tez.common.counters.FileSystemCounter; |
| import org.apache.tez.common.counters.TaskCounter; |
| import org.apache.tez.common.counters.TezCounter; |
| import org.apache.tez.common.counters.TezCounters; |
| import org.apache.tez.dag.api.TezException; |
| import org.apache.tez.dag.api.oldrecords.TaskAttemptState; |
| import org.apache.tez.history.parser.datamodel.DagInfo; |
| import org.apache.tez.history.parser.datamodel.TaskAttemptInfo; |
| |
| import java.util.Collection; |
| import java.util.List; |
| |
| |
| /** |
| * This will provide the set of nodes participated in the DAG in descending order of task execution |
| * time. |
| * <p/> |
| * Combine it with other counters to understand slow nodes better. |
| */ |
| public class SlowNodeAnalyzer extends TezAnalyzerBase implements Analyzer { |
| |
| private static final Log LOG = LogFactory.getLog(SlowNodeAnalyzer.class); |
| |
| private static final String[] headers = { "nodeName", "noOfTasksExecuted", "noOfKilledTasks", |
| "noOfFailedTasks", "avgSucceededTaskExecutionTime", "avgKilledTaskExecutionTime", |
| "avgFailedTaskExecutionTime", "avgHDFSBytesRead", "avgHDFSBytesWritten", |
| "avgFileBytesRead", "avgFileBytesWritten", "avgGCTimeMillis", "avgCPUTimeMillis" }; |
| |
| private final CSVResult csvResult = new CSVResult(headers); |
| |
| public SlowNodeAnalyzer(Configuration config) { |
| super(config); |
| } |
| |
| @Override |
| public void analyze(DagInfo dagInfo) throws TezException { |
| Multimap<String, TaskAttemptInfo> nodeDetails = dagInfo.getNodeDetails(); |
| for (String nodeName : nodeDetails.keySet()) { |
| List<String> record = Lists.newLinkedList(); |
| |
| Collection<TaskAttemptInfo> taskAttemptInfos = nodeDetails.get(nodeName); |
| |
| record.add(nodeName); |
| record.add(taskAttemptInfos.size() + ""); |
| record.add(getNumberOfTasks(taskAttemptInfos, TaskAttemptState.KILLED) + ""); |
| record.add(getNumberOfTasks(taskAttemptInfos, TaskAttemptState.FAILED) + ""); |
| |
| Iterable<TaskAttemptInfo> succeedTasks = getFilteredTaskAttempts(taskAttemptInfos, |
| TaskAttemptState.SUCCEEDED); |
| record.add(getAvgTaskExecutionTime(succeedTasks) + ""); |
| |
| Iterable<TaskAttemptInfo> killedTasks = getFilteredTaskAttempts(taskAttemptInfos, |
| TaskAttemptState.KILLED); |
| record.add(getAvgTaskExecutionTime(killedTasks) + ""); |
| |
| Iterable<TaskAttemptInfo> failedTasks = getFilteredTaskAttempts(taskAttemptInfos, |
| TaskAttemptState.FAILED); |
| record.add(getAvgTaskExecutionTime(failedTasks) + ""); |
| |
| record.add(getAvgCounter(taskAttemptInfos, FileSystemCounter.class |
| .getName(), FileSystemCounter.HDFS_BYTES_READ.name()) + ""); |
| record.add(getAvgCounter(taskAttemptInfos, FileSystemCounter.class |
| .getName(), FileSystemCounter.HDFS_BYTES_WRITTEN.name()) + ""); |
| record.add(getAvgCounter(taskAttemptInfos, FileSystemCounter.class |
| .getName(), FileSystemCounter.FILE_BYTES_READ.name()) + ""); |
| record.add(getAvgCounter(taskAttemptInfos, FileSystemCounter.class |
| .getName(), FileSystemCounter.FILE_BYTES_WRITTEN.name()) + ""); |
| record.add(getAvgCounter(taskAttemptInfos, TaskCounter.class |
| .getName(), TaskCounter.GC_TIME_MILLIS.name()) + ""); |
| record.add(getAvgCounter(taskAttemptInfos, TaskCounter.class |
| .getName(), TaskCounter.CPU_MILLISECONDS.name()) + ""); |
| |
| csvResult.addRecord(record.toArray(new String[record.size()])); |
| } |
| } |
| |
| private Iterable<TaskAttemptInfo> getFilteredTaskAttempts(Collection<TaskAttemptInfo> |
| taskAttemptInfos, final TaskAttemptState status) { |
| return Iterables.filter(taskAttemptInfos, new |
| Predicate<TaskAttemptInfo>() { |
| @Override public boolean apply(TaskAttemptInfo input) { |
| return input.getStatus().equalsIgnoreCase(status.toString()); |
| } |
| }); |
| } |
| |
| private float getAvgTaskExecutionTime(Iterable<TaskAttemptInfo> taskAttemptInfos) { |
| long totalTime = 0; |
| int size = 0; |
| for (TaskAttemptInfo attemptInfo : taskAttemptInfos) { |
| totalTime += attemptInfo.getTimeTaken(); |
| size++; |
| } |
| return (size > 0) ? (totalTime * 1.0f / size) : 0; |
| } |
| |
| private int getNumberOfTasks(Collection<TaskAttemptInfo> taskAttemptInfos, TaskAttemptState |
| status) { |
| int tasks = 0; |
| for (TaskAttemptInfo attemptInfo : taskAttemptInfos) { |
| if (attemptInfo.getStatus().equalsIgnoreCase(status.toString())) { |
| tasks++; |
| } |
| } |
| return tasks; |
| } |
| |
| private float getAvgCounter(Collection<TaskAttemptInfo> taskAttemptInfos, String |
| counterGroupName, String counterName) { |
| long total = 0; |
| int taskCount = 0; |
| for (TaskAttemptInfo attemptInfo : taskAttemptInfos) { |
| TezCounters tezCounters = attemptInfo.getTezCounters(); |
| TezCounter counter = tezCounters.findCounter(counterGroupName, counterName); |
| if (counter != null) { |
| total += counter.getValue(); |
| taskCount++; |
| } else { |
| LOG.info("Could not find counterGroupName=" + counterGroupName + ", counter=" + |
| counterName + " in " + attemptInfo); |
| } |
| } |
| return (taskCount > 0) ? (total * 1.0f / taskCount) : 0; |
| } |
| |
| @Override |
| public CSVResult getResult() throws TezException { |
| return csvResult; |
| } |
| |
| @Override |
| public String getName() { |
| return "Slow Node Analyzer"; |
| } |
| |
| @Override |
| public String getDescription() { |
| StringBuilder sb = new StringBuilder(); |
| sb.append("Analyze node details for the DAG.").append("\n"); |
| sb.append("This could be used to find out the set of nodes where the tasks are taking more " |
| + "time on average.").append("\n"); |
| sb.append("This could be used to find out the set of nodes where the tasks are taking more " |
| + "time on average and to understand whether too many tasks got scheduled on a node.") |
| .append("\n"); |
| sb.append("One needs to combine the task execution time with other metrics like bytes " |
| + "read/written etc to get better idea of bad nodes. In order to understand the slow " |
| + "nodes due to network, it might be worthwhile to consider the shuffle performance " |
| + "analyzer tool in tez-tools").append("\n"); |
| return sb.toString(); |
| } |
| |
| public static void main(String[] args) throws Exception { |
| Configuration config = new Configuration(); |
| SlowNodeAnalyzer analyzer = new SlowNodeAnalyzer(config); |
| int res = ToolRunner.run(config, analyzer, args); |
| analyzer.printResults(); |
| System.exit(res); |
| } |
| } |