[horus] Self-healing operational repair (#350)
diff --git a/app/horus/core/alert/dingtalk.go b/app/horus/core/alert/dingtalk.go
index e67e194..94c6e4a 100644
--- a/app/horus/core/alert/dingtalk.go
+++ b/app/horus/core/alert/dingtalk.go
@@ -55,7 +55,7 @@
func DingTalkSend(dk *config.DingTalkConfiguration, msg string) {
dtm := Message{MsgType: "text"}
dtm.Text.Content = fmt.Sprintf("%s\n"+
- "【日志:%s】", DingTalkTitle, msg)
+ "【日志:%s】", DingTalkTitle, msg)
dtm.At.AtMobiles = dk.AtMobiles
bs, err := json.Marshal(dtm)
if err != nil {
diff --git a/app/horus/core/horuser/modular.go b/app/horus/core/horuser/modular.go
index 3a34d7e..bd3445d 100644
--- a/app/horus/core/horuser/modular.go
+++ b/app/horus/core/horuser/modular.go
@@ -111,13 +111,13 @@
return
}
err = h.Cordon(nodeName, clusterName)
- res := "success"
+ res := "Success"
if err != nil {
res = fmt.Sprintf("failed:%v", err)
klog.Errorf("Cordon failed:%v", res)
}
- msg := fmt.Sprintf("【集群:%v】\n 【%s 插件 Cordon 节点:%v】\n 【结果: %v】\n 【今日操作次数:%v】",
+ msg := fmt.Sprintf("【集群:%v】\n 【%s 插件 Cordon 节点:%v】\n 【结果: %v】\n 【今日操作次数:%v】",
clusterName, moduleName, nodeName, res, len(data)+1)
klog.Infof("Attempting to send DingTalk message: %s", msg)
diff --git a/app/horus/core/horuser/recovery.go b/app/horus/core/horuser/recovery.go
index 51d33fe..5f7fd47 100644
--- a/app/horus/core/horuser/recovery.go
+++ b/app/horus/core/horuser/recovery.go
@@ -60,25 +60,29 @@
klog.Infof("clusterName:%v nodeName:%v", n.ClusterName, n.NodeName)
return
}
- ql := fmt.Sprintf(n.RecoveryQL, n.NodeName)
- vecs, err := h.InstantQuery(addr, ql, n.ClusterName, h.cc.NodeRecovery.PromQueryTimeSecond)
+
+ vecs, err := h.InstantQuery(addr, n.RecoveryQL, n.ClusterName, h.cc.NodeRecovery.PromQueryTimeSecond)
if err != nil {
- klog.Errorf("recoveryNodes instantQuery err:%v ql:%v", err, ql)
+ klog.Errorf("recoveryNodes InstantQuery err:%v ql:%v", err, n.RecoveryQL)
return
}
- if len(vecs) != 2 {
- klog.Errorf("%v", vecs)
+ if len(vecs) != 1 {
+ klog.Infof("Expected 1 result, but got: %d", len(vecs))
+ return
+ }
+ if err != nil {
+ klog.Errorf("recoveryNodes instantQuery err:%v ql:%v", err, n.RecoveryQL)
return
}
klog.Infof("recoveryNodes check success.")
err = h.UnCordon(n.NodeName, n.ClusterName)
- res := "success"
+ res := "Success"
if err != nil {
res = fmt.Sprintf("failed:%v", err)
}
msg := fmt.Sprintf("【自愈检查 %v: 恢复节点调度】【集群: %v】\n【节点: %v】【日期: %v】\n"+
- "【自愈检查 QL: %v】", res, n.ClusterName, n.NodeName, n.CreateTime, ql)
+ "【自愈检查 QL: %v】", res, n.ClusterName, n.NodeName, n.CreateTime, n.RecoveryQL)
alert.DingTalkSend(h.cc.NodeRecovery.DingTalk, msg)
pass, err := n.RecoveryMarker()
diff --git a/deploy/horus/horus.yaml b/deploy/horus/horus.yaml
index 80e02f1..dd042b0 100644
--- a/deploy/horus/horus.yaml
+++ b/deploy/horus/horus.yaml
@@ -36,10 +36,10 @@
cluster: config.1
promMultiple:
- cluster: http://192.168.15.128:32608
+ cluster: http://192.168.15.128:30201
nodeRecovery:
- enabled: false
+ enabled: true
dayNumber: 1
checkIntervalSecond: 5
promQueryTimeSecond: 10
@@ -50,20 +50,20 @@
- 15000000
customModular:
- enabled: true
+ enabled: false
cordonDailyLimit:
filesystem_readonly: 5
arp_entries: 5
checkQL:
filesystem_readonly: |-
- node_filesystem_readonly{mountpoint="/",node="%s"} == 1
+ node_filesystem_readonly{mountpoint="/"} == 1
arp_entries: |-
- node_arp_entries{device="ens160",node="master"} > 2
+ node_arp_entries{device="ens160"} > 2
recoveryQL:
filesystem_readonly: |-
- node_filesystem_readonly{mountpoint="/",node="%s"} == 0
+ node_filesystem_readonly{mountpoint="/"} == 0
arp_entries: |-
- node_arp_entries{device="ens160",node="%s"} < 2
+ node_arp_entries{device="ens160"} > 2
checkIntervalSecond: 5
promQueryTimeSecond: 10
kubeMultiple: