blob: 8ce26311ec12ab62bf81e2304725a90824e8234d [file] [log] [blame]
package health
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
import (
"fmt"
"strings"
"time"
"github.com/apache/trafficcontrol/lib/go-log"
"github.com/apache/trafficcontrol/lib/go-tc"
"github.com/apache/trafficcontrol/lib/go-util"
"github.com/apache/trafficcontrol/traffic_monitor/cache"
"github.com/apache/trafficcontrol/traffic_monitor/config"
"github.com/apache/trafficcontrol/traffic_monitor/peer"
"github.com/apache/trafficcontrol/traffic_monitor/threadsafe"
"github.com/apache/trafficcontrol/traffic_monitor/todata"
)
// dummyCombinedState is a placeholder combined state. It is passed to the
// computed-stat functions when evaluating threshold stats, which don't use
// real combined states.
var dummyCombinedState = tc.IsAvailable{}
// Human-readable descriptions of a cache server's availability.
const (
	// AvailableStr describes the state of a cache server that is available
	// to serve traffic.
	AvailableStr = "available"

	// UnavailableStr describes the state of a cache server that is not
	// available to serve traffic.
	UnavailableStr = "unavailable"
)
// GetVitals computes the vitals used to decide a cache server's health and
// stores them on newResult: the load average, the vitals of each monitored
// interface (newResult.InterfaceVitals), and the totals across those
// interfaces (newResult.Vitals).
//
// prevResult is the previous result for the same cache server and may be nil.
// It is only used to derive the outbound rate (KbpsOut) from the change in
// BytesOut between the two results. mc supplies the server's interface
// configuration.
//
// The function logs and returns early, without computing interface vitals,
// when newResult carries an error, when mc is nil, when the server is not in
// mc, or when the server has no interfaces marked to be monitored.
//
// A rate is only computed when time advanced between the two results and the
// byte counter did not go backwards; otherwise KbpsOut is left untouched
// rather than being set to a meaningless value.
func GetVitals(newResult *cache.Result, prevResult *cache.Result, mc *tc.TrafficMonitorConfigMap) {
	if newResult.Error != nil {
		log.Errorf("cache_health.GetVitals() called with an errored Result!")
		return
	}
	if mc == nil {
		log.Errorf("TrafficMonitorConfigMap must not be nil")
		return
	}
	if newResult.InterfaceVitals == nil {
		newResult.InterfaceVitals = map[string]cache.Vitals{}
	}

	// proc.loadavg -- we're using the 1 minute average (!?)
	newResult.Vitals.LoadAvg = newResult.Statistics.Loadavg.One

	ts, exists := mc.TrafficServer[newResult.ID]
	if !exists {
		log.Errorf("cache server not found in config map for cache: %s", newResult.ID)
		return
	}
	if ts.Interfaces == nil {
		log.Warnf("no interfaces reported in config map for cache: %s", newResult.ID)
		return
	}

	var monitoredInterfaces []tc.ServerInterfaceInfo
	for _, srvrIfaceInfo := range ts.Interfaces {
		if srvrIfaceInfo.Monitor {
			monitoredInterfaces = append(monitoredInterfaces, srvrIfaceInfo)
		}
	}
	if len(monitoredInterfaces) == 0 {
		log.Warnf("no interfaces selected to be monitored for %v", newResult.ID)
		return
	}

	// Rates are bytes-delta over time-delta. Dividing by a zero or negative
	// elapsed time yields +/-Inf or a negative rate, and converting an
	// out-of-range float to int64 is implementation-dependent, so rates are
	// only computed when time actually advanced.
	elapsedTimeInSecs := 0.0
	if prevResult != nil {
		elapsedTimeInSecs = float64(newResult.Time.UnixNano()-prevResult.Time.UnixNano()) / 1000000000
	}
	canComputeRate := elapsedTimeInSecs > 0

	for _, monitoredInterface := range monitoredInterfaces {
		ifaceName := monitoredInterface.Name
		iface, exists := newResult.Interfaces()[ifaceName]
		if !exists {
			// monitored interface doesn't exist in Result interfaces, skip
			log.Warnf("monitored interface %v does not exist in cache %v", ifaceName, newResult.ID)
			continue
		}

		ifaceVitals := cache.Vitals{
			BytesIn:    iface.BytesIn,
			BytesOut:   iface.BytesOut,
			MaxKbpsOut: iface.Speed * 1000,
		}
		if canComputeRate && prevResult.InterfaceVitals != nil {
			prevBytesOut := prevResult.InterfaceVitals[ifaceName].BytesOut
			// Skip the rate when the counter went backwards (e.g. it was
			// reset): the difference would not be a real amount of traffic.
			if prevBytesOut != 0 && ifaceVitals.BytesOut >= prevBytesOut {
				ifaceVitals.KbpsOut = int64(float64((ifaceVitals.BytesOut-prevBytesOut)*8/1000) / elapsedTimeInSecs)
			}
		}
		newResult.InterfaceVitals[ifaceName] = ifaceVitals

		// Overflow possible
		newResult.Vitals.BytesOut += iface.BytesOut
		newResult.Vitals.BytesIn += iface.BytesIn
		// TODO JvD: Should we really be running this code every second for every cache polled????? I don't think so.
		newResult.Vitals.MaxKbpsOut += iface.Speed * 1000
	}

	// Same guards as the per-interface rate, applied to the totals.
	if canComputeRate && prevResult.Vitals.BytesOut != 0 && newResult.Vitals.BytesOut >= prevResult.Vitals.BytesOut {
		newResult.Vitals.KbpsOut = int64(float64((newResult.Vitals.BytesOut-prevResult.Vitals.BytesOut)*8/1000) / elapsedTimeInSecs)
	}
}
// EvalCacheWithStatusInfo decides whether the given cache server should be
// marked available, looking first at the server's configured status and then
// at the outcome of its poll.
//
// The configured status wins over polled data: tc.CacheStatusInvalid,
// tc.CacheStatusAdminDown and tc.CacheStatusOffline always yield unavailable,
// and tc.CacheStatusOnline always yields available. For any other status the
// server is unavailable if the result carries an error or reports
// Statistics.NotAvailable; otherwise result.Available is returned.
//
// serverStatus is the raw status string and is only used for logging. The
// returned values are the availability, a description built with eventDesc,
// and a third string which this function always leaves empty.
func EvalCacheWithStatusInfo(result cache.ResultInfo, mc *tc.TrafficMonitorConfigMap, status tc.CacheStatus, serverStatus string) (bool, string, string) {
	availability := UnavailableStr
	if result.Available {
		availability = AvailableStr
	}

	// Statuses that decide the outcome regardless of what was polled.
	switch status {
	case tc.CacheStatusInvalid:
		log.Errorf("Cache %v got invalid status from Traffic Ops '%v' - treating as OFFLINE\n", result.ID, serverStatus)
		return false, eventDesc(status, availability+"; invalid status"), ""
	case tc.CacheStatusAdminDown:
		return false, eventDesc(status, availability), ""
	case tc.CacheStatusOffline:
		log.Errorf("Cache %v set to offline, but still polled\n", result.ID)
		return false, eventDesc(status, availability), ""
	case tc.CacheStatusOnline:
		return true, eventDesc(status, availability), ""
	}

	// Any other status: availability comes from the poll itself.
	if result.Error != nil {
		return false, eventDesc(status, fmt.Sprintf("%v", result.Error)), ""
	}
	if result.Statistics.NotAvailable {
		return false, eventDesc(status, fmt.Sprintf("system.notAvailable == %v", result.Statistics.NotAvailable)), ""
	}
	return result.Available, eventDesc(status, availability), ""
}
// EvalInterface returns whether the given network interface should be marked
// available, and a short reason when there is something to report.
//
// infVitals holds the polled vitals keyed by interface name; inf is the
// interface's configuration. The rules are:
//   - an interface that is not monitored is always available;
//   - a monitored interface missing from infVitals is unavailable;
//   - an interface with no MaxBandwidth configured is available;
//   - otherwise the interface is unavailable when its KbpsOut is greater than
//     MaxBandwidth.
//
// A KbpsOut that is not positive never exceeds the limit. Without that check a
// negative rate would turn into a huge number when converted to uint64 for the
// comparison and falsely report the bandwidth as exceeded.
// TODO change to return a `cache.AvailableStatus`
func EvalInterface(infVitals map[string]cache.Vitals, inf tc.ServerInterfaceInfo) (bool, string) {
	if !inf.Monitor {
		return true, ""
	}

	vitals, ok := infVitals[inf.Name]
	if !ok {
		return false, "not found in polled data"
	}

	if inf.MaxBandwidth == nil {
		return true, ""
	}

	// Only convert to uint64 once the rate is known to be positive.
	if vitals.KbpsOut > 0 && uint64(vitals.KbpsOut) > *inf.MaxBandwidth {
		return false, "maximum bandwidth exceeded"
	}
	return true, ""
}
// EvalAggregate calculates the availability of a cache server from its
// configured status, its poll result and the health thresholds of its profile.
//
// resultStats may be nil; threshold stats that are not computed stats are then
// skipped. The returned values are the availability, a description of it, and
// the name of the threshold stat that made the server unavailable (empty if
// no threshold was violated).
func EvalAggregate(result cache.ResultInfo, resultStats *threadsafe.ResultStatValHistory, mc *tc.TrafficMonitorConfigMap) (bool, string, string) {
	srv, found := mc.TrafficServer[string(result.ID)]
	if !found {
		log.Errorf("Cache %v missing from from Traffic Ops Monitor Config - treating as OFFLINE\n", result.ID)
		return false, "ERROR - server missing in Traffic Ops monitor config", ""
	}

	status := tc.CacheStatusFromString(srv.ServerStatus)
	if status == tc.CacheStatusOnline {
		// EvalCacheWithStatusInfo also reports ONLINE caches as available,
		// but a true result from it is followed by the threshold checks
		// below, and thresholds must not be checked for ONLINE caches. So
		// return before getting there.
		return true, eventDesc(status, AvailableStr), ""
	}

	profile, found := mc.Profile[srv.Profile]
	if !found {
		log.Errorf("Profile '%v' for cache server '%v' missing from monitoring configuration - treating as OFFLINE", srv.Profile, result.ID)
		return false, "ERROR - server profile missing in Traffic Ops monitor config", ""
	}

	available, description, unavailableStat := EvalCacheWithStatusInfo(result, mc, status, srv.ServerStatus)
	if !available {
		return false, description, unavailableStat
	}

	computedStats := cache.ComputedStats()
	for stat, threshold := range profile.Parameters.Thresholds {
		// A threshold stat is either computed from the result, or taken from
		// the most recent entry of the stat history.
		var statVal interface{}
		compute, isComputed := computedStats[stat]
		switch {
		case isComputed:
			statVal = compute(result, srv, profile, dummyCombinedState)
		case resultStats == nil:
			continue
		default:
			history := resultStats.Load(stat)
			if len(history) == 0 {
				continue
			}
			statVal = history[0].Val
		}

		statNum, isNumeric := util.ToNumeric(statVal)
		if !isNumeric {
			log.Errorf("health.EvalCache threshold stat %s was not a number: %v", stat, statVal)
			continue
		}

		if !inThreshold(threshold, statNum) {
			return false, eventDesc(status, exceedsThresholdMsg(stat, threshold, statNum)), stat
		}
	}

	return true, description, unavailableStat
}
// getProcessAvailableTuple returns the function that reduces a per-IP-version
// availability tuple to a single availability, for the given polling protocol:
//   - config.IPv4Only: the IPv4 availability;
//   - config.IPv6Only: the IPv6 availability;
//   - config.Both: the availability of the only address the server has, or
//     either one when the server has both addresses.
//
// An unknown protocol is logged, and the returned function always reports
// unavailable.
func getProcessAvailableTuple(protocol config.PollingProtocol) func(cache.AvailableTuple, tc.TrafficServer) bool {
	switch protocol {
	case config.IPv4Only:
		return func(avail cache.AvailableTuple, _ tc.TrafficServer) bool { return avail.IPv4 }
	case config.IPv6Only:
		return func(avail cache.AvailableTuple, _ tc.TrafficServer) bool { return avail.IPv6 }
	case config.Both:
		return func(avail cache.AvailableTuple, srv tc.TrafficServer) bool {
			switch {
			case srv.IPv4() == "":
				return avail.IPv6
			case srv.IPv6() == "":
				return avail.IPv4
			default:
				return avail.IPv4 || avail.IPv6
			}
		}
	}

	// Every known protocol returned above.
	log.Errorf("received an unknown Polling Protocol: %s", protocol)
	return func(cache.AvailableTuple, tc.TrafficServer) bool { return false }
}
// CalcAvailability calculates the availability of each cache in results and
// records it.
//
// For every result it:
//   - evaluates each monitored interface of the server with EvalInterface and
//     the server as a whole with EvalAggregate;
//   - updates the availability of the IP version the result was polled over
//     (result.UsingIPv4), carrying over the other IP version's availability
//     from the previous status if there is one;
//   - reduces the two to a single availability according to protocol;
//   - writes the new state to localStates and, if the availability differs
//     from the previously recorded one, logs the change and adds an event to
//     events.
//
// After all results are processed the delivery service states are refreshed
// and the updated statuses are stored in localCacheStatusThreadsafe.
//
// pollerName identifies the poller in statuses, logs and events. mc is the
// monitoring configuration; a server missing from it is logged and evaluated
// with an empty configuration. toData supplies the server type for events.
// statResultHistory may be nil, in which case stats won't be used to calculate
// availability.
func CalcAvailability(
	results []cache.Result,
	pollerName string,
	statResultHistory *threadsafe.ResultStatHistory,
	mc tc.TrafficMonitorConfigMap,
	toData todata.TOData,
	localCacheStatusThreadsafe threadsafe.CacheAvailableStatus,
	localStates peer.CRStatesThreadsafe,
	events ThreadsafeEvents,
	protocol config.PollingProtocol,
) {
	localCacheStatuses := localCacheStatusThreadsafe.Get().Copy()
	processAvailableTuple := getProcessAvailableTuple(protocol)

	for _, result := range results {
		// Stat history of this cache; stays nil when no stats are available.
		var resultStats *threadsafe.ResultStatValHistory
		if statResultHistory != nil {
			cacheStatHistory := statResultHistory.LoadOrStore(result.ID)
			resultStats = &cacheStatHistory.Stats
		}

		serverInfo, inConfig := mc.TrafficServer[result.ID]
		if !inConfig {
			// Not fatal: evaluation continues with an empty serverInfo, and
			// EvalAggregate reports the server as unavailable.
			log.Errorf("Cache %v missing from from Traffic Ops Monitor Config - treating as OFFLINE\n", result.ID)
		}

		availStatus := cache.AvailableStatus{
			LastCheckedIPv4:    result.UsingIPv4,
			ProcessedAvailable: true,
			Poller:             pollerName,
			Status:             serverInfo.ServerStatus,
		}

		// Start optimistic for the IP version just polled; keep what was last
		// known for the other one, provided the server has such an address.
		lastStatus, hadStatus := localCacheStatuses[result.ID]
		if hadStatus {
			if result.UsingIPv4 {
				availStatus.Available.IPv4 = true
				availStatus.Available.IPv6 = serverInfo.IPv6() != "" && lastStatus.Available.IPv6
			} else {
				availStatus.Available.IPv6 = true
				availStatus.Available.IPv4 = serverInfo.IPv4() != "" && lastStatus.Available.IPv4
			}
		}

		reasons := []string{}
		resultInfo := cache.ToInfo(result)

		for _, inf := range serverInfo.Interfaces {
			if !inf.Monitor {
				continue
			}
			available, why := EvalInterface(resultInfo.InterfaceVitals, inf)
			if result.UsingIPv4 {
				availStatus.Available.IPv4 = availStatus.Available.IPv4 && available
			} else {
				availStatus.Available.IPv6 = availStatus.Available.IPv6 && available
			}
			if why != "" {
				reasons = append(reasons, inf.Name+": "+why)
			}
		}

		aggIsAvailable, aggWhyAvailable, aggUnavailableStat := EvalAggregate(resultInfo, resultStats, &mc)
		if result.UsingIPv4 {
			availStatus.Available.IPv4 = availStatus.Available.IPv4 && aggIsAvailable
		} else {
			availStatus.Available.IPv6 = availStatus.Available.IPv6 && aggIsAvailable
		}
		availStatus.ProcessedAvailable = processAvailableTuple(availStatus.Available, serverInfo)

		// The aggregate reason goes first, followed by per-interface reasons.
		if aggWhyAvailable != "" {
			reasons = append([]string{aggWhyAvailable}, reasons...)
		}
		availStatus.Why = strings.Join(reasons, "; ")
		if aggUnavailableStat != "" {
			availStatus.UnavailableStat = aggUnavailableStat
		}

		localStates.SetCache(tc.CacheName(result.ID), tc.IsAvailable{
			IsAvailable:    availStatus.ProcessedAvailable,
			Ipv4Available:  availStatus.Available.IPv4,
			Ipv6Available:  availStatus.Available.IPv6,
			DirectlyPolled: true, // we know this cache was directly polled because otherwise we wouldn't have a cache.Result for it
			Status:         availStatus.Why,
			LastPoll:       result.Time,
		})

		if available, ok := localStates.GetCache(tc.CacheName(result.ID)); !ok || available.IsAvailable != lastStatus.ProcessedAvailable {
			protocolName := "IPv4"
			if !availStatus.LastCheckedIPv4 {
				protocolName = "IPv6"
			}
			// `available` is read after the SetCache call above, so it
			// presumably already holds the new state; the previous state is
			// the one in lastStatus, and that is what "was" must report.
			log.Infof("Changing state for %s was: %t now: %t because %s poller: %v on protocol %v error: %v",
				result.ID, lastStatus.ProcessedAvailable, availStatus.ProcessedAvailable, availStatus.Why, pollerName, protocolName, result.Error)

			event := Event{
				Time:          Time(time.Now()),
				Description:   "Protocol (" + protocolName + ") " + availStatus.Why + " (" + pollerName + ") ",
				Name:          result.ID,
				Hostname:      result.ID,
				Type:          toData.ServerTypes[tc.CacheName(result.ID)].String(),
				Available:     availStatus.ProcessedAvailable,
				IPv4Available: availStatus.Available.IPv4,
				IPv6Available: availStatus.Available.IPv6,
			}
			events.Add(event)
		}

		localCacheStatuses[result.ID] = availStatus
	}

	calculateDeliveryServiceState(localStates)
	localCacheStatusThreadsafe.Set(localCacheStatuses)
}
// exceedsThresholdMsg returns a human-readable message for why the given value
// of stat violates the threshold. It does NOT check whether the value actually
// violates the threshold; call inThreshold to check first. A threshold with an
// unrecognized comparator yields an error message instead.
func exceedsThresholdMsg(stat string, threshold tc.HealthThreshold, val float64) string {
	// The comparator says what a healthy value looks like, so the message
	// describes the opposite relation.
	var format string
	switch threshold.Comparator {
	case "=":
		format = "%s not equal (%.2f != %.2f)"
	case ">":
		format = "%s too low (%.2f < %.2f)"
	case "<":
		format = "%s too high (%.2f > %.2f)"
	case ">=":
		format = "%s too low (%.2f <= %.2f)"
	case "<=":
		format = "%s too high (%.2f >= %.2f)"
	default:
		return fmt.Sprintf("ERROR: Invalid Threshold: %+v", threshold)
	}
	return fmt.Sprintf(format, stat, val, threshold.Val)
}
// inThreshold reports whether val satisfies the threshold, i.e. whether
// `val <comparator> threshold.Val` holds. A threshold with an unrecognized
// comparator is logged and treated as satisfied.
func inThreshold(threshold tc.HealthThreshold, val float64) bool {
	limit := threshold.Val
	switch threshold.Comparator {
	case "=":
		return val == limit
	case ">":
		return val > limit
	case "<":
		return val < limit
	case ">=":
		return val >= limit
	case "<=":
		return val <= limit
	}

	log.Errorf("Invalid Threshold: %+v", threshold)
	// for safety, if a threshold somehow gets corrupted, don't start marking caches down.
	return true
}
// eventDesc builds the description used for availability events and statuses,
// in the form "<status> - <message>".
func eventDesc(status tc.CacheStatus, message string) string {
	description := fmt.Sprintf("%s - %s", status, message)
	return description
}
// calculateDeliveryServiceState rewrites every delivery service state held in
// states, setting its DisabledLocations to an empty, non-nil list.
func calculateDeliveryServiceState(states peer.CRStatesThreadsafe) {
	for name, state := range states.GetDeliveryServices() {
		// NOTE: DisabledLocations is always empty, and it's important that it isn't nil, so it serialises to the JSON `[]` instead of `null`.
		// It's no longer populated due to it being an unnecessary optimization for Traffic Router, but the field is kept for compatibility.
		state.DisabledLocations = []tc.CacheGroupName{}
		states.SetDeliveryService(name, state)
	}
}