/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.eagle.jpm.mr.history.parser;

import org.apache.eagle.jpm.mr.history.MRHistoryJobConfig;
import org.apache.eagle.jpm.mr.historyentity.JobBaseAPIEntity;
import org.apache.eagle.jpm.mr.historyentity.TaskAttemptExecutionAPIEntity;
import org.apache.eagle.jpm.mr.historyentity.TaskFailureCountAPIEntity;
import org.apache.eagle.jpm.util.MRJobTagName;
import org.apache.eagle.service.client.IEagleServiceClient;
import org.apache.eagle.service.client.impl.EagleServiceClientImpl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import static org.apache.eagle.jpm.mr.history.MRHistoryJobConfig.EagleServiceConfig;
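
/**
 * {@link HistoryJobEntityCreationListener} that collects FAILED and KILLED task attempts,
 * classifies their error messages into categories via {@link MRErrorClassifier}, and flushes
 * the resulting {@link TaskFailureCountAPIEntity} records to the Eagle service in batches.
 */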
public class TaskFailureListener implements HistoryJobEntityCreationListener {
    private static final Logger LOG = LoggerFactory.getLogger(TaskFailureListener.class);
    private static final String MR_ERROR_CATEGORY_CONFIG_FILE_NAME = "MRErrorCategory.config";
    private static final int BATCH_SIZE = 1000;
    private static final int MAX_RETRY_TIMES = 3;

    private final List<TaskFailureCountAPIEntity> failureTasks = new ArrayList<>();
    private final MRErrorClassifier classifier;
    private EagleServiceConfig eagleServiceConfig;
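
    /**
     * Loads the error-category patterns from {@code MRErrorCategory.config} on the classpath
     * and builds the {@link MRErrorClassifier} used to tag failed task attempts.
     */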
    public TaskFailureListener(EagleServiceConfig eagleServiceConfig) {
        this.eagleServiceConfig = eagleServiceConfig;
        InputStream is = null;
        try {
            is = TaskFailureListener.class.getClassLoader().getResourceAsStream(MR_ERROR_CATEGORY_CONFIG_FILE_NAME);
            URL url = TaskFailureListener.class.getClassLoader().getResource(MR_ERROR_CATEGORY_CONFIG_FILE_NAME);
            if (url != null) {
                LOG.info("Feeder is going to load configuration file: " + url.toString());
            }
            classifier = new MRErrorClassifier(is);
        } catch (IOException ex) {
            throw new RuntimeException("Can't find " + MR_ERROR_CATEGORY_CONFIG_FILE_NAME + " to configure MRErrorCategory", ex);
        } finally {
            if (is != null) {
                try {
                    is.close();
                } catch (IOException e) {
                    LOG.warn("Failed to close " + MR_ERROR_CATEGORY_CONFIG_FILE_NAME + " input stream", e);
                }
            }
        }
    }
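
    /**
     * Records a {@link TaskFailureCountAPIEntity} for every FAILED or KILLED task attempt,
     * tagging it with the classified error category, and flushes once {@code BATCH_SIZE}
     * entities have accumulated.
     */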
    @Override
    public void jobEntityCreated(JobBaseAPIEntity entity) throws Exception {
        if (!(entity instanceof TaskAttemptExecutionAPIEntity)) {
            return;
        }

        TaskAttemptExecutionAPIEntity e = (TaskAttemptExecutionAPIEntity) entity;
        // only store those killed or failed tasks
        if (!e.getTaskStatus().equals(EagleTaskStatus.FAILED.name()) && !e.getTaskStatus().equals(EagleTaskStatus.KILLED.name())) {
            return;
        }

        TaskFailureCountAPIEntity failureTask = new TaskFailureCountAPIEntity();
        Map<String, String> tags = new HashMap<>();
        failureTask.setTags(tags);
        tags.put(MRJobTagName.SITE.toString(), e.getTags().get(MRJobTagName.SITE.toString()));
        tags.put(MRJobTagName.JOD_DEF_ID.toString(), e.getTags().get(MRJobTagName.JOD_DEF_ID.toString()));
        tags.put(MRJobTagName.RACK.toString(), e.getTags().get(MRJobTagName.RACK.toString()));
        tags.put(MRJobTagName.HOSTNAME.toString(), e.getTags().get(MRJobTagName.HOSTNAME.toString()));
        tags.put(MRJobTagName.JOB_ID.toString(), e.getTags().get(MRJobTagName.JOB_ID.toString()));
        tags.put(MRJobTagName.TASK_ATTEMPT_ID.toString(), e.getTags().get(MRJobTagName.TASK_ATTEMPT_ID.toString()));
        tags.put(MRJobTagName.TASK_TYPE.toString(), e.getTags().get(MRJobTagName.TASK_TYPE.toString()));

        // TODO: optimize by matching once and capturing the error category in the same pass
        final String errCategory = classifier.classifyError(e.getError());
        tags.put(MRJobTagName.ERROR_CATEGORY.toString(), errCategory);
        entity.getTags().put(MRJobTagName.ERROR_CATEGORY.toString(), errCategory);

        failureTask.setError(e.getError());
        failureTask.setFailureCount(1); // hard coded to 1 unless we do pre-aggregation in the future
        failureTask.setTimestamp(e.getTimestamp());
        failureTask.setTaskStatus(e.getTaskStatus());
        failureTasks.add(failureTask);

        if (failureTasks.size() >= BATCH_SIZE) {
            flush();
        }
    }
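
    /**
     * Writes the buffered {@link TaskFailureCountAPIEntity} batch to the Eagle service,
     * retrying up to {@code MAX_RETRY_TIMES} times before rethrowing the last failure.
     */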
    @Override
    public void flush() throws Exception {
        IEagleServiceClient client = new EagleServiceClientImpl(
            eagleServiceConfig.eagleServiceHost,
            eagleServiceConfig.eagleServicePort,
            eagleServiceConfig.username,
            eagleServiceConfig.password);
        client.setReadTimeout(eagleServiceConfig.readTimeoutSeconds * 1000);

        try {
            int tried = 0;
            while (tried <= MAX_RETRY_TIMES) {
                try {
                    LOG.info("start flushing " + failureTasks.size() + " entities");
                    client.create(failureTasks);
                    LOG.info("finished flushing " + failureTasks.size() + " entities");
                    failureTasks.clear();
                    break;
                } catch (Exception ex) {
                    if (tried < MAX_RETRY_TIMES) {
                        LOG.error("Got exception while flushing, retrying (attempt " + (tried + 1) + " of " + MAX_RETRY_TIMES + ")", ex);
                    } else {
                        LOG.error("Got exception while flushing, reached max retry times " + MAX_RETRY_TIMES, ex);
                        throw ex;
                    }
                }
                tried++;
            }
        } finally {
            // always release the client, even when all retries fail
            client.close();
        }
    }
}