blob: 35c6872c21022411ea1c0bc12bf53f83af1e789c [file] [log] [blame]
package health
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
import (
"fmt"
"strconv"
"strings"
"time"
"github.com/apache/trafficcontrol/lib/go-log"
"github.com/apache/trafficcontrol/lib/go-tc"
"github.com/apache/trafficcontrol/lib/go-util"
"github.com/apache/trafficcontrol/traffic_monitor/cache"
"github.com/apache/trafficcontrol/traffic_monitor/peer"
"github.com/apache/trafficcontrol/traffic_monitor/threadsafe"
"github.com/apache/trafficcontrol/traffic_monitor/todata"
)
// GetVitals derives the health vitals from the raw system stats in
// newResult.Astats.System and stores them in newResult.Vitals:
//   - LoadAvg: the first field of ProcLoadavg.
//   - BytesIn / BytesOut: fields 0 and 8 of the numbers following the ':' in ProcNetDev.
//   - KbpsOut: computed from the BytesOut delta against prevResult, when prevResult
//     is non-nil, has a non-zero BytesOut, and is older than newResult.
//   - MaxKbpsOut: InfSpeed multiplied by 1000.
//
// If newResult already carries an error, it is logged and nothing is changed.
// If any value cannot be parsed, the error is recorded on newResult via setErr
// (which also marks it unavailable) and the function returns early.
// The mc parameter is not used by this function.
func GetVitals(newResult *cache.Result, prevResult *cache.Result, mc *tc.TrafficMonitorConfigMap) {
	if newResult.Error != nil {
		log.Errorf("cache_health.GetVitals() called with an errored Result!")
		return
	}

	// proc.loadavg -- we're using the 1 minute average (!?)
	// value looks like: "0.20 0.07 0.07 1/967 29536" (without the quotes)
	loadAverages := strings.Fields(newResult.Astats.System.ProcLoadavg)
	if len(loadAverages) == 0 {
		setErr(newResult, fmt.Errorf("Can't make sense of '%s' as a load average for %s", newResult.Astats.System.ProcLoadavg, newResult.ID))
		return
	}
	oneMinAvg, err := strconv.ParseFloat(loadAverages[0], 64)
	if err != nil {
		setErr(newResult, fmt.Errorf("Error converting load average string '%s': %v", newResult.Astats.System.ProcLoadavg, err))
		return
	}
	newResult.Vitals.LoadAvg = oneMinAvg

	// proc.net.dev -- need to compare to prevSample
	// value looks like
	// "bond0:8495786321839 31960528603 0 0 0 0 0 2349716 143283576747316 101104535041 0 0 0 0 0 0"
	// (without the quotes)
	const (
		bytesInField  = 0 // position of the received-bytes counter after the ':'
		bytesOutField = 8 // position of the transmitted-bytes counter after the ':'
	)
	parts := strings.Split(newResult.Astats.System.ProcNetDev, ":")
	if len(parts) < 2 {
		setErr(newResult, fmt.Errorf("Error parsing procnetdev: no fields found"))
		return
	}
	numbers := strings.Fields(parts[1])
	// Guard the indexing below: a truncated or empty value must produce an
	// errored result, not an index-out-of-range panic.
	if len(numbers) <= bytesOutField {
		setErr(newResult, fmt.Errorf("Error parsing procnetdev: expected at least %d fields, found %d", bytesOutField+1, len(numbers)))
		return
	}
	newResult.Vitals.BytesOut, err = strconv.ParseInt(numbers[bytesOutField], 10, 64)
	if err != nil {
		setErr(newResult, fmt.Errorf("Error converting BytesOut from procnetdev: %v", err))
		return
	}
	newResult.Vitals.BytesIn, err = strconv.ParseInt(numbers[bytesInField], 10, 64)
	if err != nil {
		setErr(newResult, fmt.Errorf("Error converting BytesIn from procnetdev: %v", err))
		return
	}

	// Without a usable previous sample (e.g. right after startup) there is
	// nothing to diff against, so KbpsOut is left untouched.
	if prevResult != nil && prevResult.Vitals.BytesOut != 0 {
		elapsedTimeInSecs := float64(newResult.Time.UnixNano()-prevResult.Time.UnixNano()) / 1000000000
		// A zero (or negative) interval would yield Inf/NaN or a meaningless
		// rate, and converting Inf/NaN to int64 is not well defined; skip it.
		if elapsedTimeInSecs > 0 {
			newResult.Vitals.KbpsOut = int64(float64((newResult.Vitals.BytesOut-prevResult.Vitals.BytesOut)*8/1000) / elapsedTimeInSecs)
		}
	}

	// inf.speed -- value looks like "10000" (without the quotes) so it is in Mbps.
	// TODO JvD: Should we really be running this code every second for every cache polled????? I don't think so.
	interfaceBandwidth := newResult.Astats.System.InfSpeed
	newResult.Vitals.MaxKbpsOut = int64(interfaceBandwidth) * 1000
}
// EvalCacheWithStatusInfo decides whether a cache should be considered
// available given its poll result and its Traffic Ops status.
//
// It returns the availability, a description built with eventDesc, and a third
// string which this function always leaves empty. The checks run in this order:
// an explicit status (invalid, admin down, offline, online) wins; otherwise a
// result error or a set System.NotAvailable flag makes the cache unavailable;
// otherwise result.Available is returned as-is.
// The mc parameter is not used by this function.
func EvalCacheWithStatusInfo(result cache.ResultInfo, mc *tc.TrafficMonitorConfigMap, status tc.CacheStatus, serverInfo tc.TrafficServer) (bool, string, string) {
	availability := UnavailableStr
	if result.Available {
		availability = AvailableStr
	}

	// Statuses which decide the outcome regardless of the polled data.
	switch status {
	case tc.CacheStatusInvalid:
		log.Errorf("Cache %v got invalid status from Traffic Ops '%v' - treating as OFFLINE\n", result.ID, serverInfo.ServerStatus)
		return false, eventDesc(status, availability+"; invalid status"), ""
	case tc.CacheStatusAdminDown:
		return false, eventDesc(status, availability), ""
	case tc.CacheStatusOffline:
		log.Errorf("Cache %v set to offline, but still polled\n", result.ID)
		return false, eventDesc(status, availability), ""
	case tc.CacheStatusOnline:
		return true, eventDesc(status, availability), ""
	}

	// Any other status: fall back to what the poll itself reported.
	if result.Error != nil {
		return false, eventDesc(status, fmt.Sprintf("%v", result.Error)), ""
	}
	if result.System.NotAvailable {
		return false, eventDesc(status, fmt.Sprintf("system.notAvailable == %v", result.System.NotAvailable)), ""
	}
	return result.Available, eventDesc(status, availability), ""
}
// Human-readable availability words used when building event descriptions.
const (
	// AvailableStr describes a cache whose result is available.
	AvailableStr = "available"
	// UnavailableStr describes a cache whose result is not available.
	UnavailableStr = "unavailable"
)
// EvalCache returns whether the given cache should be marked available, a
// string describing why, and the name of the stat which exceeded a threshold
// (empty if none did).
//
// The returned availability MAY NOT be used to directly set the cache's local
// availability, because the threshold stats may not be part of the poller which
// produced the result. Rather, if the cache was previously unavailable from a
// threshold, it must be verified that the threshold stat is in the results
// before setting the cache to available.
//
// resultStats may be nil (for example, the Health poller doesn't have Stats);
// in that case thresholds on non-computed stats are skipped.
// TODO change to return a `cache.AvailableStatus`
func EvalCache(result cache.ResultInfo, resultStats *threadsafe.ResultStatValHistory, mc *tc.TrafficMonitorConfigMap) (bool, string, string) {
	server, found := mc.TrafficServer[string(result.ID)]
	if !found {
		log.Errorf("Cache %v missing from from Traffic Ops Monitor Config - treating as OFFLINE\n", result.ID)
		return false, "ERROR - server missing in Traffic Ops monitor config", ""
	}

	status := tc.CacheStatusFromString(server.ServerStatus)
	// ONLINE caches are returned before anything else: EvalCacheWithStatusInfo
	// also reports them as available, but everything that is available after
	// that call goes through the threshold checks below, which must never
	// apply to an ONLINE cache.
	if status == tc.CacheStatusOnline {
		return true, eventDesc(status, AvailableStr), ""
	}

	profile, found := mc.Profile[server.Profile]
	if !found {
		log.Errorf("Cache %v profile %v missing from from Traffic Ops Monitor Config - treating as OFFLINE\n", result.ID, server.Profile)
		return false, "ERROR - server profile missing in Traffic Ops monitor config", ""
	}

	available, description, message := EvalCacheWithStatusInfo(result, mc, status, server)
	if !available {
		return available, description, message
	}

	computed := cache.ComputedStats()
	for stat, threshold := range profile.Parameters.Thresholds {
		var value interface{}

		compute, isComputed := computed[stat]
		switch {
		case isComputed:
			// The only computed stats which use the combined state are things
			// like isAvailable, which don't make sense to ever be thresholds,
			// so an empty value is passed.
			value = compute(result, server, profile, tc.IsAvailable{})
		case resultStats == nil:
			// This poller has no stats; the threshold can't be checked here.
			continue
		default:
			history := resultStats.Load(stat)
			if len(history) == 0 {
				continue
			}
			value = history[0].Val
		}

		number, isNumber := util.ToNumeric(value)
		if !isNumber {
			log.Errorf("health.EvalCache threshold stat %s was not a number: %v", stat, value)
			continue
		}
		if !inThreshold(threshold, number) {
			return false, eventDesc(status, exceedsThresholdMsg(stat, threshold, number)), stat
		}
	}
	return available, description, message
}
// CalcAvailability calculates the availability of each cache in results.
//
// For every result it evaluates the cache with EvalCache, stores the outcome in
// a copy of the local cache statuses and in localStates, and adds an event to
// events when the cache was previously unknown to localStates or its
// availability changed. After all results are processed, the delivery service
// state is recalculated and the updated statuses are stored back into
// localCacheStatusThreadsafe.
//
// statResultHistory may be nil, in which case stats won't be used to calculate availability.
func CalcAvailability(results []cache.Result, pollerName string, statResultHistory *threadsafe.ResultStatHistory, mc tc.TrafficMonitorConfigMap, toData todata.TOData, localCacheStatusThreadsafe threadsafe.CacheAvailableStatus, localStates peer.CRStatesThreadsafe, events ThreadsafeEvents) {
	localCacheStatuses := localCacheStatusThreadsafe.Get().Copy()
	statResults := (*threadsafe.ResultStatValHistory)(nil)
	for _, result := range results {
		if statResultHistory != nil {
			statResultsVal := statResultHistory.LoadOrStore(result.ID)
			statResults = &statResultsVal
		}
		isAvailable, whyAvailable, unavailableStat := EvalCache(cache.ToInfo(result), statResults, &mc)

		// if the cache is now Available, and was previously unavailable due to a threshold, make sure this poller contains the stat which exceeded the threshold.
		if previousStatus, hasPreviousStatus := localCacheStatuses[result.ID]; isAvailable && hasPreviousStatus && !previousStatus.Available && previousStatus.UnavailableStat != "" {
			if !result.HasStat(previousStatus.UnavailableStat) {
				// This poller can't vouch for the stat which took the cache
				// down, so leave this cache's state alone. Only this cache is
				// skipped: the remaining results must still be processed and
				// the collected statuses committed below.
				continue
			}
		}

		localCacheStatuses[result.ID] = cache.AvailableStatus{
			Available:       isAvailable,
			Status:          mc.TrafficServer[string(result.ID)].ServerStatus,
			Why:             whyAvailable,
			UnavailableStat: unavailableStat,
			Poller:          pollerName,
		} // TODO move within localStates?

		if available, ok := localStates.GetCache(result.ID); !ok || available.IsAvailable != isAvailable {
			log.Infof("Changing state for %s was: %t now: %t because %s poller: %v error: %v", result.ID, available.IsAvailable, isAvailable, whyAvailable, pollerName, result.Error)
			events.Add(Event{Time: Time(time.Now()), Description: whyAvailable + " (" + pollerName + ")", Name: string(result.ID), Hostname: string(result.ID), Type: toData.ServerTypes[result.ID].String(), Available: isAvailable})
		}

		localStates.SetCache(result.ID, tc.IsAvailable{IsAvailable: isAvailable})
	}
	calculateDeliveryServiceState(toData.DeliveryServiceServers, localStates, toData)
	localCacheStatusThreadsafe.Set(localCacheStatuses)
}
// setErr marks newResult as failed: it is flagged unavailable and err is
// recorded as its error.
func setErr(newResult *cache.Result, err error) {
	newResult.Available = false
	newResult.Error = err
}
// exceedsThresholdMsg returns a human-readable message for why the given value
// violates the threshold. It does NOT check whether the value actually violates
// the threshold; call `inThreshold` to check first.
//
// The relation printed in parentheses is the exact negation of the threshold's
// comparator, so it is a true statement whenever inThreshold returned false
// (e.g. a ">" threshold is violated when the value is "<=" the limit).
// An unrecognised comparator yields an "ERROR: Invalid Threshold" message.
func exceedsThresholdMsg(stat string, threshold tc.HealthThreshold, val float64) string {
	switch threshold.Comparator {
	case "=":
		return fmt.Sprintf("%s not equal (%.2f != %.2f)", stat, val, threshold.Val)
	case ">":
		// required val > limit, so the violation is val <= limit
		return fmt.Sprintf("%s too low (%.2f <= %.2f)", stat, val, threshold.Val)
	case "<":
		// required val < limit, so the violation is val >= limit
		return fmt.Sprintf("%s too high (%.2f >= %.2f)", stat, val, threshold.Val)
	case ">=":
		// required val >= limit, so the violation is val < limit
		return fmt.Sprintf("%s too low (%.2f < %.2f)", stat, val, threshold.Val)
	case "<=":
		// required val <= limit, so the violation is val > limit
		return fmt.Sprintf("%s too high (%.2f > %.2f)", stat, val, threshold.Val)
	default:
		return fmt.Sprintf("ERROR: Invalid Threshold: %+v", threshold)
	}
}
// inThreshold reports whether val satisfies the threshold, i.e. whether
// `val <comparator> threshold.Val` holds for the comparators "=", "<", "<=",
// ">" and ">=". An unrecognised comparator is logged and treated as satisfied.
func inThreshold(threshold tc.HealthThreshold, val float64) bool {
	limit := threshold.Val
	switch threshold.Comparator {
	case "<":
		return val < limit
	case "<=":
		return val <= limit
	case ">":
		return val > limit
	case ">=":
		return val >= limit
	case "=":
		return val == limit
	}
	log.Errorf("Invalid Threshold: %+v", threshold)
	// for safety, if a threshold somehow gets corrupted, don't start marking caches down.
	return true
}
// eventDesc builds an event description of the form "<status> - <message>".
func eventDesc(status tc.CacheStatus, message string) string {
	const layout = "%s - %s"
	return fmt.Sprintf(layout, status, message)
}
// calculateDeliveryServiceState recomputes the disabled locations of every
// delivery service currently held in `states`, based on the cache states also
// held in `states`, and writes each updated delivery service back into `states`.
//
// Delivery services which have no entry in `deliveryServiceServers` are logged
// and skipped. Note the server list handed to getDisabledLocations is read from
// `toData.DeliveryServiceServers`, not from the `deliveryServiceServers` parameter.
func calculateDeliveryServiceState(deliveryServiceServers map[tc.DeliveryServiceName][]tc.CacheName, states peer.CRStatesThreadsafe, toData todata.TOData) {
	cacheStates := states.GetCaches() // map[tc.CacheName]IsAvailable

	for name, dsState := range states.GetDeliveryServices() {
		if _, known := deliveryServiceServers[name]; known {
			dsState.DisabledLocations = getDisabledLocations(name, toData.DeliveryServiceServers[name], cacheStates, toData.ServerCachegroups)
			states.SetDeliveryService(name, dsState)
			continue
		}
		log.Infof("CRConfig does not have delivery service %s, but traffic monitor poller does; skipping\n", name)
	}
}
// getDisabledLocations returns the cachegroups in which none of the delivery
// service's caches is available. The result is never nil. Its order follows map
// iteration and is therefore unspecified.
// The deliveryService parameter is not used by this function.
func getDisabledLocations(deliveryService tc.DeliveryServiceName, deliveryServiceServers []tc.CacheName, cacheStates map[tc.CacheName]tc.IsAvailable, serverCacheGroups map[tc.CacheName]tc.CacheGroupName) []tc.CacheGroupName {
	cacheAvail := getDeliveryServiceCacheAvailability(cacheStates, deliveryServiceServers)
	cachegroupAvail := getDeliveryServiceCachegroupAvailability(cacheAvail, serverCacheGroups)

	// it's important this isn't nil, so it serialises to the JSON `[]` instead of `null`
	disabled := make([]tc.CacheGroupName, 0)
	for cachegroup, isUp := range cachegroupAvail {
		if !isUp {
			disabled = append(disabled, cachegroup)
		}
	}
	return disabled
}
// getDeliveryServiceCacheAvailability returns the subset of cacheStates which
// belongs to the given delivery service servers. A server with no entry in
// cacheStates is included with the zero tc.IsAvailable value.
func getDeliveryServiceCacheAvailability(cacheStates map[tc.CacheName]tc.IsAvailable, deliveryServiceServers []tc.CacheName) map[tc.CacheName]tc.IsAvailable {
	subset := make(map[tc.CacheName]tc.IsAvailable, len(deliveryServiceServers))
	for i := range deliveryServiceServers {
		name := deliveryServiceServers[i]
		subset[name] = cacheStates[name]
	}
	return subset
}
// getDeliveryServiceCachegroupAvailability folds per-cache availability into
// per-cachegroup availability: a cachegroup maps to true if at least one of its
// caches is available, and to false if it has caches but none is available.
// Caches missing from serverCachegroups are logged and ignored.
func getDeliveryServiceCachegroupAvailability(dsCacheStates map[tc.CacheName]tc.IsAvailable, serverCachegroups map[tc.CacheName]tc.CacheGroupName) map[tc.CacheGroupName]bool {
	byCachegroup := map[tc.CacheGroupName]bool{}
	for name, state := range dsCacheStates {
		group, known := serverCachegroups[name]
		if !known {
			log.Errorf("cache %v not found in cachegroups!\n", name)
			continue
		}

		if state.IsAvailable {
			// one available cache is enough to keep the cachegroup enabled
			byCachegroup[group] = true
		} else if _, seen := byCachegroup[group]; !seen {
			// first cache seen for this cachegroup; never overwrite an earlier true
			byCachegroup[group] = false
		}
	}
	return byCachegroup
}