/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.alibaba.jstorm.cluster;
import backtype.storm.generated.TopologyTaskHbInfo;
import backtype.storm.utils.Utils;
import com.alibaba.jstorm.cache.JStormCache;
import com.alibaba.jstorm.callback.ClusterStateCallback;
import com.alibaba.jstorm.callback.RunnableCallback;
import com.alibaba.jstorm.common.metric.QueueGauge;
import com.alibaba.jstorm.daemon.supervisor.SupervisorInfo;
import com.alibaba.jstorm.schedule.Assignment;
import com.alibaba.jstorm.schedule.AssignmentBak;
import com.alibaba.jstorm.task.TaskInfo;
import com.alibaba.jstorm.task.error.TaskError;
import com.alibaba.jstorm.task.backpressure.SourceBackpressureInfo;
import com.alibaba.jstorm.utils.JStormUtils;
import com.alibaba.jstorm.utils.PathUtils;
import com.alibaba.jstorm.utils.TimeUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.zookeeper.KeeperException.NodeExistsException;
import org.apache.zookeeper.Watcher.Event.EventType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicReference;
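/**
 * ZooKeeper-backed implementation of {@link StormClusterState}. All cluster
 * metadata (assignments, supervisor heartbeats, storm bases, task info,
 * task errors, metrics and backpressure state) lives under the well-known
 * ZK subtrees defined in {@link Cluster}.
 */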
public class StormZkClusterState implements StormClusterState {
    private static final Logger LOG = LoggerFactory.getLogger(StormZkClusterState.class);
private ClusterState cluster_state;
private ConcurrentHashMap<String, RunnableCallback> assignment_info_callback;
private AtomicReference<RunnableCallback> supervisors_callback;
private AtomicReference<RunnableCallback> assignments_callback;
private ConcurrentHashMap<String, RunnableCallback> storm_base_callback;
private AtomicReference<RunnableCallback> master_callback;
private UUID state_id;
private boolean solo;
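    /**
     * If the given spec is already a {@link ClusterState} it is reused;
     * otherwise it is taken as a config map and a private
     * {@link DistributedClusterState} is created, which is then closed when
     * this instance disconnects.
     */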
public StormZkClusterState(Object cluster_state_spec) throws Exception {
if (cluster_state_spec instanceof ClusterState) {
solo = false;
cluster_state = (ClusterState) cluster_state_spec;
} else {
solo = true;
cluster_state = new DistributedClusterState((Map) cluster_state_spec);
}
assignment_info_callback = new ConcurrentHashMap<String, RunnableCallback>();
supervisors_callback = new AtomicReference<RunnableCallback>(null);
assignments_callback = new AtomicReference<RunnableCallback>(null);
storm_base_callback = new ConcurrentHashMap<String, RunnableCallback>();
master_callback = new AtomicReference<RunnableCallback>(null);
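        // Dispatch ZK watch events: look up the callback registered for the
        // changed subtree and fire it once (callbacks are consumed when run).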
state_id = cluster_state.register(new ClusterStateCallback() {
public <T> Object execute(T... args) {
if (args == null) {
LOG.warn("Input args is null");
return null;
} else if (args.length < 2) {
LOG.warn("Input args is invalid, args length:" + args.length);
return null;
}
EventType zkEventTypes = (EventType) args[0];
String path = (String) args[1];
List<String> toks = PathUtils.tokenize_path(path);
int size = toks.size();
if (size >= 1) {
String params = null;
String root = toks.get(0);
RunnableCallback fn = null;
if (root.equals(Cluster.ASSIGNMENTS_ROOT)) {
if (size == 1) {
// set null and get the old value
fn = assignments_callback.getAndSet(null);
} else {
params = toks.get(1);
fn = assignment_info_callback.remove(params);
}
} else if (root.equals(Cluster.SUPERVISORS_ROOT)) {
fn = supervisors_callback.getAndSet(null);
} else if (root.equals(Cluster.STORMS_ROOT) && size > 1) {
params = toks.get(1);
fn = storm_base_callback.remove(params);
} else if (root.equals(Cluster.MASTER_ROOT)) {
fn = master_callback.getAndSet(null);
} else {
LOG.error("Unknown callback for subtree " + path);
}
if (fn != null) {
// FIXME How to set the args
// fn.setArgs(params, zkEventTypes, path);
fn.run();
}
}
return null;
}
});
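        // Make sure all top-level subtrees exist before anything reads or writes them.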
String[] pathlist =
JStormUtils.mk_arr(Cluster.SUPERVISORS_SUBTREE, Cluster.STORMS_SUBTREE, Cluster.ASSIGNMENTS_SUBTREE, Cluster.ASSIGNMENTS_BAK_SUBTREE,
Cluster.TASKS_SUBTREE, Cluster.TASKBEATS_SUBTREE, Cluster.TASKERRORS_SUBTREE, Cluster.METRIC_SUBTREE, Cluster.BACKPRESSURE_SUBTREE);
for (String path : pathlist) {
cluster_state.mkdirs(path);
}
}
    /**
     * @@@ TODO
     *
     * Currently the cache is only plugged into the lower ZK level. In fact,
     * some objects (Assignment/TaskInfo/StormBase) change rarely and could
     * safely be cached for a long time at this level as well.
     *
     * @param simpleCache cache to attach to the underlying ZK client
     */
public void setCache(JStormCache simpleCache) {
if (cluster_state instanceof DistributedClusterState) {
((DistributedClusterState) cluster_state).setZkCache(simpleCache);
}
}
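    // Read the node at path (registering a watcher when callback is true)
    // and deserialize it; returns null when the node has no data.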
public Object getObject(String path, boolean callback) throws Exception {
byte[] data = cluster_state.get_data(path, callback);
return Utils.maybe_deserialize(data);
}
public Object getObjectSync(String path, boolean callback) throws Exception {
byte[] data = cluster_state.get_data_sync(path, callback);
return Utils.maybe_deserialize(data);
}
public String getString(String path, boolean callback) throws Exception {
byte[] data = cluster_state.get_data(path, callback);
        return data == null ? null : new String(data);
}
public void deleteObject(String path) {
try {
cluster_state.delete_node(path);
} catch (Exception e) {
LOG.warn("Failed to delete node " + path);
}
}
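    // byte[] and String payloads are written as raw bytes; everything else
    // goes through Utils.serialize.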
public void setObject(String path, Object obj) throws Exception {
if (obj instanceof byte[]) {
cluster_state.set_data(path, (byte[]) obj);
} else if (obj instanceof String) {
cluster_state.set_data(path, ((String) obj).getBytes());
} else {
cluster_state.set_data(path, Utils.serialize(obj));
}
}
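    // Same as setObject, but writes an ephemeral node so the entry vanishes
    // automatically when the owning session dies.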
public void setTempObject(String path, Object obj) throws Exception {
if (obj instanceof byte[]) {
cluster_state.set_ephemeral_node(path, (byte[]) obj);
} else if (obj instanceof String) {
cluster_state.set_ephemeral_node(path, ((String) obj).getBytes());
} else {
cluster_state.set_ephemeral_node(path, Utils.serialize(obj));
}
}
@Override
public void disconnect() {
cluster_state.unregister(state_id);
        if (solo) {
cluster_state.close();
}
}
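    // Removes all ZK state of a topology. The assignment is deleted first;
    // with needSleep set we wait 10s so supervisors can notice the missing
    // assignment and kill their workers gracefully before the rest is removed.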
public void remove_storm(String topologyId, boolean needSleep) {
deleteObject(Cluster.assignment_path(topologyId));
// wait 10 seconds, so supervisor will kill worker smoothly
if (needSleep) {
JStormUtils.sleepMs(10000);
}
try {
deleteObject(Cluster.storm_task_root(topologyId));
teardown_heartbeats(topologyId);
teardown_task_errors(topologyId);
teardown_backpressure(topologyId);
deleteObject(Cluster.metric_path(topologyId));
} catch (Exception e) {
LOG.warn("Failed to delete task root and monitor root for" + topologyId);
}
remove_storm_base(topologyId);
}
@Override
public void remove_storm(String topologyId) throws Exception {
remove_storm(topologyId, true);
}
@Override
public void try_remove_storm(String topologyId) {
remove_storm(topologyId, false);
}
@Override
public Assignment assignment_info(String topologyId, RunnableCallback callback) throws Exception {
if (callback != null) {
assignment_info_callback.put(topologyId, callback);
}
        String assignmentPath = Cluster.assignment_path(topologyId);
        return (Assignment) getObject(assignmentPath, callback != null);
}
@Override
public List<String> assignments(RunnableCallback callback) throws Exception {
if (callback != null) {
assignments_callback.set(callback);
}
return cluster_state.get_children(Cluster.ASSIGNMENTS_SUBTREE, callback != null);
}
@Override
public void set_assignment(String topologyId, Assignment info) throws Exception {
setObject(Cluster.assignment_path(topologyId), info);
}
@Override
public AssignmentBak assignment_bak(String topologyName) throws Exception {
        String assignmentBakPath = Cluster.assignment_bak_path(topologyName);
        return (AssignmentBak) getObject(assignmentBakPath, false);
}
@Override
public void backup_assignment(String topologyName, AssignmentBak info) throws Exception {
setObject(Cluster.assignment_bak_path(topologyName), info);
}
@Override
public StormBase storm_base(String topologyId, RunnableCallback callback) throws Exception {
if (callback != null) {
storm_base_callback.put(topologyId, callback);
}
return (StormBase) getObject(Cluster.storm_path(topologyId), callback != null);
}
@Override
public void activate_storm(String topologyId, StormBase stormBase) throws Exception {
String stormPath = Cluster.storm_path(topologyId);
setObject(stormPath, stormBase);
}
@Override
public void remove_storm_base(String topologyId) {
deleteObject(Cluster.storm_path(topologyId));
}
@Override
public void update_storm(String topologyId, StormStatus newElems) throws Exception {
/**
* FIXME, maybe overwrite old callback
*/
StormBase base = this.storm_base(topologyId, null);
if (base != null) {
base.setStatus(newElems);
setObject(Cluster.storm_path(topologyId), base);
}
}
@Override
public void set_storm_monitor(String topologyId, boolean isEnable) throws Exception {
StormBase base = this.storm_base(topologyId, null);
if (base != null) {
base.setEnableMonitor(isEnable);
setObject(Cluster.storm_path(topologyId), base);
}
}
@Override
public List<String> active_storms() throws Exception {
return cluster_state.get_children(Cluster.STORMS_SUBTREE, false);
}
@Override
public void topology_heartbeat(String topologyId, TopologyTaskHbInfo info) throws Exception {
String taskPath = Cluster.taskbeat_storm_root(topologyId);
setObject(taskPath, info);
}
@Override
public TopologyTaskHbInfo topology_heartbeat(String topologyId) throws Exception {
String taskPath = Cluster.taskbeat_storm_root(topologyId);
return (TopologyTaskHbInfo) getObject(taskPath, false);
}
@Override
public List<String> heartbeat_storms() throws Exception {
return cluster_state.get_children(Cluster.TASKBEATS_SUBTREE, false);
}
@Override
public void teardown_heartbeats(String topologyId) {
try {
String taskbeatPath = Cluster.taskbeat_storm_root(topologyId);
deleteObject(taskbeatPath);
} catch (Exception e) {
LOG.warn("Could not teardown heartbeats for " + topologyId, e);
}
}
@Override
public void report_task_error(String topologyId, int taskId, Throwable error) throws Exception {
        report_task_error(topologyId, taskId, JStormUtils.getErrorInfo(error), null);
}
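    // Records a task error. If an identical error (or one sharing the given
    // tag prefix) is already stored, only its timestamp node is refreshed;
    // otherwise the oldest entries are pruned so a task keeps at most three.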
public void report_task_error(String topologyId, int taskId, String error, String tag) throws Exception {
boolean found = false;
String path = Cluster.taskerror_path(topologyId, taskId);
cluster_state.mkdirs(path);
List<Integer> children = new ArrayList<Integer>();
String timeStamp = String.valueOf(TimeUtils.current_time_secs());
String timestampPath = path + Cluster.ZK_SEPERATOR + timeStamp;
for (String str : cluster_state.get_children(path, false)) {
String errorPath = path + "/" + str;
String errorInfo = getString(errorPath, false);
if (StringUtils.isBlank(errorInfo)) {
deleteObject(errorPath);
continue;
}
if (errorInfo.equals(error)
|| (tag != null && errorInfo.startsWith(tag))) {
cluster_state.delete_node(errorPath);
cluster_state.set_data(timestampPath, error.getBytes());
found = true;
break;
}
children.add(Integer.parseInt(str));
}
        if (!found) {
Collections.sort(children);
while (children.size() >= 3) {
deleteObject(path + Cluster.ZK_SEPERATOR + children.remove(0));
}
setObject(timestampPath, error);
}
setLastErrInfo(topologyId, error, timeStamp);
}
    // The full error string is "task-id is dead on hostname:port"
    private static final String TASK_IS_DEAD = "is dead on";
private void setLastErrInfo(String topologyId, String error, String timeStamp) throws Exception {
        // Update the last-error info on the topology's task error path.
        // ZK format: map<report_duration, timestamp>. An error is shown in
        // the web UI only if it happened within its report_duration.
        // Currently "queue full" errors are kept for 3 minutes, "task is
        // dead" errors for 3 days, and all other errors for 30 minutes.
String lastErrTopoPath = Cluster.lasterror_path(topologyId);
Map<Integer, String> lastErrInfo = null;
try {
lastErrInfo = (Map<Integer, String>) getObject(lastErrTopoPath, false);
} catch (Exception e) {
LOG.error("Failed to get last error time. Remove the corrupt node for " + topologyId, e);
remove_lastErr_time(topologyId);
lastErrInfo = null;
}
if (lastErrInfo == null)
lastErrInfo = new HashMap<Integer, String>();
        // The duration key decides how long the error stays visible in the UI.
if (error.indexOf(QueueGauge.QUEUE_IS_FULL) != -1)
lastErrInfo.put(JStormUtils.MIN_1 * 3, timeStamp);
else if (error.indexOf(TASK_IS_DEAD) != -1)
lastErrInfo.put(JStormUtils.DAY_1 * 3, timeStamp);
else
lastErrInfo.put(JStormUtils.MIN_30, timeStamp);
setObject(lastErrTopoPath, lastErrInfo);
}
@Override
public void remove_task_error(String topologyId, int taskId) throws Exception {
String path = Cluster.taskerror_path(topologyId, taskId);
cluster_state.delete_node(path);
}
@Override
public Map<Integer, String> topo_lastErr_time(String topologyId) throws Exception {
String path = Cluster.lasterror_path(topologyId);
return (Map<Integer, String>) getObject(path, false);
}
@Override
public void remove_lastErr_time(String topologyId) throws Exception {
String path = Cluster.lasterror_path(topologyId);
deleteObject(path);
}
@Override
public List<String> task_error_storms() throws Exception {
return cluster_state.get_children(Cluster.TASKERRORS_SUBTREE, false);
}
@Override
public List<String> task_error_ids(String topologyId) throws Exception {
return cluster_state.get_children(Cluster.taskerror_storm_root(topologyId), false);
}
@Override
public List<String> task_error_time(String topologyId, int taskId) throws Exception {
String path = Cluster.taskerror_path(topologyId, taskId);
        if (!cluster_state.node_existed(path, false)) {
return new ArrayList<String>();
}
return cluster_state.get_children(path, false);
}
@Override
public void remove_task(String topologyId, Set<Integer> taskIds) throws Exception {
String tasksPath = Cluster.storm_task_root(topologyId);
Object data = getObject(tasksPath, false);
if (data != null) {
Map<Integer, TaskInfo> taskInfoMap = ((Map<Integer, TaskInfo>) data);
for (Integer taskId : taskIds) {
taskInfoMap.remove(taskId);
}
// update zk node of tasks
setObject(tasksPath, taskInfoMap);
}
}
@Override
public String task_error_info(String topologyId, int taskId, long timeStamp) throws Exception {
String path = Cluster.taskerror_path(topologyId, taskId);
path = path + "/" + timeStamp;
return getString(path, false);
}
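    // Loads every recorded error of a task and returns them sorted by time;
    // each child node name is the epoch-seconds timestamp of the error.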
@Override
public List<TaskError> task_errors(String topologyId, int taskId) throws Exception {
List<TaskError> errors = new ArrayList<TaskError>();
String path = Cluster.taskerror_path(topologyId, taskId);
        if (!cluster_state.node_existed(path, false)) {
return errors;
}
List<String> children = cluster_state.get_children(path, false);
for (String str : children) {
byte[] v = cluster_state.get_data(path + "/" + str, false);
if (v != null) {
TaskError error = new TaskError(new String(v), Integer.parseInt(str));
errors.add(error);
}
}
Collections.sort(errors, new Comparator<TaskError>() {
@Override
public int compare(TaskError o1, TaskError o2) {
if (o1.getTimSecs() > o2.getTimSecs()) {
return 1;
}
if (o1.getTimSecs() < o2.getTimSecs()) {
return -1;
}
return 0;
}
});
return errors;
}
@Override
public void teardown_task_errors(String topologyId) {
try {
String taskerrPath = Cluster.taskerror_storm_root(topologyId);
deleteObject(taskerrPath);
} catch (Exception e) {
LOG.error("Could not teardown errors for " + topologyId, e);
}
}
@Override
public void set_task(String topologyId, Map<Integer, TaskInfo> taskInfoMap) throws Exception {
String stormTaskPath = Cluster.storm_task_root(topologyId);
if (taskInfoMap != null) {
// reupdate zk node of tasks
setObject(stormTaskPath, taskInfoMap);
}
}
@Override
public void add_task(String topologyId, Map<Integer, TaskInfo> taskInfoMap) throws Exception {
String stormTaskPath = Cluster.storm_task_root(topologyId);
Object data = getObject(stormTaskPath, false);
if (data != null) {
((Map<Integer, TaskInfo>) data).putAll(taskInfoMap);
// reupdate zk node of tasks
setObject(stormTaskPath, data);
}
}
@Override
public List<String> task_storms() throws Exception {
return cluster_state.get_children(Cluster.TASKS_SUBTREE, false);
}
@Override
    public Set<Integer> task_ids(String stormId) throws Exception {
        String stormTaskPath = Cluster.storm_task_root(stormId);
Object data = getObject(stormTaskPath, false);
if (data == null) {
return null;
}
return ((Map<Integer, TaskInfo>) data).keySet();
}
@Override
public Set<Integer> task_ids_by_componentId(String topologyId, String componentId) throws Exception {
String stormTaskPath = Cluster.storm_task_root(topologyId);
Object data = getObject(stormTaskPath, false);
if (data == null) {
return null;
}
Map<Integer, TaskInfo> taskInfoMap = (Map<Integer, TaskInfo>) data;
Set<Integer> rtn = new HashSet<Integer>();
Set<Integer> taskIds = taskInfoMap.keySet();
for (Integer taskId : taskIds) {
TaskInfo taskInfo = taskInfoMap.get(taskId);
if (taskInfo != null) {
if (taskInfo.getComponentId().equalsIgnoreCase(componentId))
rtn.add(taskId);
}
}
return rtn;
}
@Override
public Map<Integer, TaskInfo> task_all_info(String topologyId) throws Exception {
String taskPath = Cluster.storm_task_root(topologyId);
Object data = getObject(taskPath, false);
if (data == null) {
return null;
}
return (Map<Integer, TaskInfo>) data;
}
@Override
public SupervisorInfo supervisor_info(String supervisorId) throws Exception {
String supervisorPath = Cluster.supervisor_path(supervisorId);
return (SupervisorInfo) getObject(supervisorPath, false);
}
@Override
public List<String> supervisors(RunnableCallback callback) throws Exception {
if (callback != null) {
supervisors_callback.set(callback);
}
return cluster_state.get_children(Cluster.SUPERVISORS_SUBTREE, callback != null);
}
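    // Supervisor heartbeats are ephemeral: when a supervisor process dies,
    // its node disappears from ZK automatically.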
@Override
public void supervisor_heartbeat(String supervisorId, SupervisorInfo info) throws Exception {
String supervisorPath = Cluster.supervisor_path(supervisorId);
setTempObject(supervisorPath, info);
}
@Override
public String get_leader_host() throws Exception {
        byte[] data = cluster_state.get_data(Cluster.MASTER_SUBTREE, false);
        return data == null ? null : new String(data);
}
@Override
public boolean leader_existed() throws Exception {
return cluster_state.node_existed(Cluster.MASTER_SUBTREE, false);
}
@Override
public List<String> get_nimbus_slaves() throws Exception {
return cluster_state.get_children(Cluster.NIMBUS_SLAVE_SUBTREE, false);
}
public String get_nimbus_slave_time(String host) throws Exception {
String path = Cluster.NIMBUS_SLAVE_SUBTREE + Cluster.ZK_SEPERATOR + host;
return getString(path, false);
}
@Override
public void update_nimbus_slave(String host, int time) throws Exception {
setTempObject(Cluster.NIMBUS_SLAVE_SUBTREE + Cluster.ZK_SEPERATOR + host, String.valueOf(time));
}
@Override
public void unregister_nimbus_host(String host) throws Exception {
deleteObject(Cluster.NIMBUS_SLAVE_SUBTREE + Cluster.ZK_SEPERATOR + host);
}
@Override
public void update_nimbus_detail(String hostPort, Map map) throws Exception {
cluster_state.set_ephemeral_node(Cluster.NIMBUS_SLAVE_DETAIL_SUBTREE + Cluster.ZK_SEPERATOR + hostPort, Utils.serialize(map));
}
@Override
public Map get_nimbus_detail(String hostPort, boolean watch) throws Exception {
byte[] data = cluster_state.get_data(Cluster.NIMBUS_SLAVE_DETAIL_SUBTREE + Cluster.ZK_SEPERATOR + hostPort, watch);
return (Map) Utils.maybe_deserialize(data);
}
@Override
public void unregister_nimbus_detail(String hostPort) throws Exception {
cluster_state.delete_node(Cluster.NIMBUS_SLAVE_DETAIL_SUBTREE + Cluster.ZK_SEPERATOR + hostPort);
}
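    // Tries to create the leader node for this host. If another nimbus holds
    // it, a watch is left on the existing node (so master_callback fires when
    // the current leader goes away) and false is returned.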
@Override
public boolean try_to_be_leader(String path, String host, RunnableCallback callback) throws Exception {
        if (callback != null) {
            this.master_callback.set(callback);
        }
try {
cluster_state.tryToBeLeader(path, host.getBytes());
} catch (NodeExistsException e) {
cluster_state.node_existed(path, true);
LOG.info("leader is alive");
return false;
}
return true;
}
@Override
public void set_topology_metric(String topologyId, Object metric) throws Exception {
String path = Cluster.metric_path(topologyId);
setObject(path, metric);
}
@Override
public Object get_topology_metric(String topologyId) throws Exception {
return getObject(Cluster.metric_path(topologyId), false);
}
@Override
public List<String> get_metrics() throws Exception {
return cluster_state.get_children(Cluster.METRIC_SUBTREE, false);
}
@Override
public List<String> list_dirs(String path, boolean watch) throws Exception {
        return cluster_state.get_children(path, watch);
}
@Override
public List<String> backpressureInfos() throws Exception {
return cluster_state.get_children(Cluster.BACKPRESSURE_SUBTREE, false);
}
@Override
public void set_backpressure_info(String topologyId, Map<String, SourceBackpressureInfo> sourceToBackpressureInfo) throws Exception {
String path = Cluster.backpressure_path(topologyId);
cluster_state.set_data(path, Utils.serialize(sourceToBackpressureInfo));
}
@Override
public Map<String, SourceBackpressureInfo> get_backpressure_info(String topologyId) throws Exception {
String path = Cluster.backpressure_path(topologyId);
byte[] data = cluster_state.get_data(path, false);
return (Map<String, SourceBackpressureInfo>) Utils.maybe_deserialize(data);
}
@Override
public void teardown_backpressure(String topologyId) {
try {
String backpressurePath = Cluster.backpressure_path(topologyId);
cluster_state.delete_node(backpressurePath);
} catch (Exception e) {
LOG.warn("Could not teardown backpressure info for " + topologyId, e);
}
}
}