/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.contrib.failmon;

import java.util.ArrayList;

import org.apache.hadoop.conf.Configuration;

/**********************************************************
 * This class executes monitoring jobs on every node of the
 * cluster on which we intend to gather failure metrics. It
 * is essentially a thread that sleeps and periodically wakes
 * up to execute the monitoring jobs and ship all gathered
 * data to a "safe" location, which in most cases will be the
 * HDFS filesystem of the monitored cluster.
 *
 **********************************************************/

public class Executor implements Runnable {

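  /*
   * A minimal usage sketch, kept in a comment so it does not change the
   * class itself. It assumes stand-alone mode (a null Configuration) and
   * that conf/failmon.properties is readable from the working directory;
   * the thread name is purely illustrative:
   *
   *   Executor executor = new Executor(null);            // stand-alone mode
   *   new Thread(executor, "failmon-executor").start();  // poll and upload forever
   */
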
  // Default logging interval, in seconds.
  public static final int DEFAULT_LOG_INTERVAL = 3600;

  // Default polling interval, in seconds.
  public static final int DEFAULT_POLL_INTERVAL = 360;

  // Minimum allowed polling interval, in seconds.
  public static int MIN_INTERVAL = 5;

  // Number of active Executor instances.
  public static int instances = 0;

  // Local store in which monitoring jobs record the data they gather.
  LocalStore lstore;

  // The monitoring jobs to run on this node.
  ArrayList<MonitorJob> monitors;

  // Number of seconds the thread sleeps between successive polls.
  int interval;

  // Number of seconds between uploads of the local store, and the
  // countdown (in seconds) until the next upload.
  int upload_interval;
  int upload_counter;

  /**
   * Create an instance of the class and read the configuration
   * file to determine the set of jobs that will be run and the
   * maximum interval for which the thread can sleep before it
   * wakes up to execute a monitoring job on the node.
   *
   * @param conf the Hadoop configuration to read settings from; may be
   * null, in which case the Executor runs as a stand-alone application
   */

  public Executor(Configuration conf) {

    Environment.prepare("conf/failmon.properties");

    String localTmpDir;

    if (conf == null) {
      // running as a stand-alone application
      localTmpDir = System.getProperty("java.io.tmpdir");
      Environment.setProperty("local.tmp.dir", localTmpDir);
    } else {
      // running from within Hadoop
      localTmpDir = conf.get("hadoop.tmp.dir");
      String hadoopLogPath = System.getProperty("hadoop.log.dir") + "/"
          + System.getProperty("hadoop.log.file");
      Environment.setProperty("hadoop.log.file", hadoopLogPath);
      Environment.setProperty("local.tmp.dir", localTmpDir);
    }

    monitors = Environment.getJobs();
    interval = Environment.getInterval(monitors);
    upload_interval = LocalStore.UPLOAD_INTERVAL;
    lstore = new LocalStore();

    // allow the configuration to override the default upload interval
    if (Environment.getProperty("local.upload.interval") != null) {
      upload_interval = Integer.parseInt(Environment.getProperty("local.upload.interval"));
    }

    instances++;
  }
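
  /*
   * A hedged sketch of embedded use within Hadoop: the caller is expected
   * to pass a populated Configuration so that hadoop.tmp.dir can be
   * resolved, and the hadoop.log.dir / hadoop.log.file system properties
   * are assumed to have been set by the surrounding daemon:
   *
   *   Configuration conf = new Configuration();
   *   Executor executor = new Executor(conf);
   *   new Thread(executor).start();
   */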

  /**
   * Periodically wake up, execute any monitoring jobs whose polling
   * interval has elapsed, and upload the gathered data when the
   * upload interval has elapsed.
   */
  public void run() {
    upload_counter = upload_interval;

    Environment.logInfo("Failmon Executor thread started successfully.");
    while (true) {
      try {
        // sleep until the next poll
        Thread.sleep(interval * 1000);
        // run every monitoring job whose countdown has expired, resetting it first
        for (int i = 0; i < monitors.size(); i++) {
          monitors.get(i).counter -= interval;
          if (monitors.get(i).counter <= 0) {
            monitors.get(i).reset();
            Environment.logInfo("Calling " + monitors.get(i).job.getInfo() + "...\t");
            monitors.get(i).job.monitor(lstore);
          }
        }
        // upload the locally stored data when the upload countdown expires
        upload_counter -= interval;
        if (upload_counter <= 0) {
          lstore.upload();
          upload_counter = upload_interval;
        }
      } catch (InterruptedException e) {
        // log the interruption and keep polling
        e.printStackTrace();
      }
    }
  }

  /**
   * Decrement the number of active Executor instances.
   */
  public void cleanup() {
    instances--;
  }
}