blob: 341c743a5f1afd433861b323c2233f182023b559 [file] [log] [blame]
package manager
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
import (
"fmt"
"net/url"
"os"
"strconv"
"strings"
"time"
"github.com/apache/trafficcontrol/lib/go-log"
"github.com/apache/trafficcontrol/lib/go-tc"
"github.com/apache/trafficcontrol/traffic_monitor/cache"
"github.com/apache/trafficcontrol/traffic_monitor/config"
"github.com/apache/trafficcontrol/traffic_monitor/peer"
"github.com/apache/trafficcontrol/traffic_monitor/poller"
"github.com/apache/trafficcontrol/traffic_monitor/threadsafe"
"github.com/apache/trafficcontrol/traffic_monitor/todata"
"github.com/apache/trafficcontrol/traffic_monitor/towrap"
)
type PollIntervals struct {
Health time.Duration
HealthNoKeepAlive bool
Peer time.Duration
PeerNoKeepAlive bool
Stat time.Duration
StatNoKeepAlive bool
TO time.Duration
}
// getPollIntervals reads the Traffic Ops Client monitorConfig structure, and parses and returns the health, peer, stat, and TrafficOps poll intervals
func getIntervals(monitorConfig tc.TrafficMonitorConfigMap, cfg config.Config, logMissingParams bool) (PollIntervals, error) {
intervals := PollIntervals{}
peerPollIntervalI, peerPollIntervalExists := monitorConfig.Config["peers.polling.interval"]
if !peerPollIntervalExists {
return PollIntervals{}, fmt.Errorf("Traffic Ops Monitor config missing 'peers.polling.interval', not setting config changes.\n")
}
peerPollIntervalInt, peerPollIntervalIsInt := peerPollIntervalI.(float64)
if !peerPollIntervalIsInt {
return PollIntervals{}, fmt.Errorf("Traffic Ops Monitor config 'peers.polling.interval' value '%v' type %T is not an integer, not setting config changes.\n", peerPollIntervalI, peerPollIntervalI)
}
intervals.Peer = trafficOpsPeerPollIntervalToDuration(int(peerPollIntervalInt))
statPollIntervalI, statPollIntervalExists := monitorConfig.Config["health.polling.interval"]
if !statPollIntervalExists {
return PollIntervals{}, fmt.Errorf("Traffic Ops Monitor config missing 'health.polling.interval', not setting config changes.\n")
}
statPollIntervalInt, statPollIntervalIsInt := statPollIntervalI.(float64)
if !statPollIntervalIsInt {
return PollIntervals{}, fmt.Errorf("Traffic Ops Monitor config 'health.polling.interval' value '%v' type %T is not an integer, not setting config changes.\n", statPollIntervalI, statPollIntervalI)
}
intervals.Stat = trafficOpsStatPollIntervalToDuration(int(statPollIntervalInt))
healthPollIntervalI, healthPollIntervalExists := monitorConfig.Config["heartbeat.polling.interval"]
healthPollIntervalInt, healthPollIntervalIsInt := healthPollIntervalI.(float64)
if !healthPollIntervalExists {
if logMissingParams {
log.Warnln("Traffic Ops Monitor config missing 'heartbeat.polling.interval', using health for heartbeat.")
}
healthPollIntervalInt = statPollIntervalInt
} else if !healthPollIntervalIsInt {
log.Warnf("Traffic Ops Monitor config 'heartbeat.polling.interval' value '%v' type %T is not an integer, using health for heartbeat\n", statPollIntervalI, statPollIntervalI)
healthPollIntervalInt = statPollIntervalInt
}
intervals.Health = trafficOpsHealthPollIntervalToDuration(int(healthPollIntervalInt))
toPollIntervalI, toPollIntervalExists := monitorConfig.Config["tm.polling.interval"]
toPollIntervalInt, toPollIntervalIsInt := toPollIntervalI.(float64)
intervals.TO = cfg.MonitorConfigPollingInterval
if !toPollIntervalExists {
if logMissingParams {
log.Warnf("Traffic Ops Monitor config missing 'tm.polling.interval', using config value '%v'\n", cfg.MonitorConfigPollingInterval)
}
} else if !toPollIntervalIsInt {
log.Warnf("Traffic Ops Monitor config 'tm.polling.interval' value '%v' type %T is not an integer, using config value '%v'\n", toPollIntervalI, toPollIntervalI, cfg.MonitorConfigPollingInterval)
} else {
intervals.TO = trafficOpsTOPollIntervalToDuration(int(toPollIntervalInt))
}
getNoKeepAlive := func(param string) bool {
keepAliveI, keepAliveExists := monitorConfig.Config[param]
keepAliveStr, keepAliveIsStr := keepAliveI.(string)
return keepAliveExists && keepAliveIsStr && !strings.HasPrefix(strings.ToLower(keepAliveStr), "t")
}
intervals.PeerNoKeepAlive = getNoKeepAlive("peer.polling.keepalive")
intervals.HealthNoKeepAlive = getNoKeepAlive("health.polling.keepalive")
intervals.StatNoKeepAlive = getNoKeepAlive("stat.polling.keepalive")
multiplyByRatio := func(i time.Duration) time.Duration {
return time.Duration(float64(i) * PollIntervalRatio)
}
intervals.TO = multiplyByRatio(intervals.TO)
intervals.Health = multiplyByRatio(intervals.Health)
intervals.Peer = multiplyByRatio(intervals.Peer)
intervals.Stat = multiplyByRatio(intervals.Stat)
return intervals, nil
}
// StartMonitorConfigManager runs the monitor config manager goroutine, and returns the threadsafe data which it sets.
func StartMonitorConfigManager(
monitorConfigPollChan <-chan poller.MonitorCfg,
localStates peer.CRStatesThreadsafe,
peerStates peer.CRStatesPeersThreadsafe,
statURLSubscriber chan<- poller.CachePollerConfig,
healthURLSubscriber chan<- poller.CachePollerConfig,
peerURLSubscriber chan<- poller.CachePollerConfig,
toIntervalSubscriber chan<- time.Duration,
cachesChangeSubscriber chan<- struct{},
cfg config.Config,
staticAppData config.StaticAppData,
toSession towrap.ITrafficOpsSession,
toData todata.TODataThreadsafe,
) threadsafe.TrafficMonitorConfigMap {
monitorConfig := threadsafe.NewTrafficMonitorConfigMap()
go monitorConfigListen(monitorConfig,
monitorConfigPollChan,
localStates,
peerStates,
statURLSubscriber,
healthURLSubscriber,
peerURLSubscriber,
toIntervalSubscriber,
cachesChangeSubscriber,
cfg,
staticAppData,
toSession,
toData,
)
return monitorConfig
}
const DefaultHealthConnectionTimeout = time.Second * 2
// trafficOpsHealthConnectionTimeoutToDuration takes the int from Traffic Ops, which is in milliseconds, and returns a time.Duration
// TODO change Traffic Ops Client API to a time.Duration
func trafficOpsHealthConnectionTimeoutToDuration(t int) time.Duration {
return time.Duration(t) * time.Millisecond
}
// trafficOpsPeerPollIntervalToDuration takes the int from Traffic Ops, which is in milliseconds, and returns a time.Duration
// TODO change Traffic Ops Client API to a time.Duration
func trafficOpsPeerPollIntervalToDuration(t int) time.Duration {
return time.Duration(t) * time.Millisecond
}
// trafficOpsStatPollIntervalToDuration takes the int from Traffic Ops, which is in milliseconds, and returns a time.Duration
// TODO change Traffic Ops Client API to a time.Duration
func trafficOpsStatPollIntervalToDuration(t int) time.Duration {
return time.Duration(t) * time.Millisecond
}
// trafficOpsHealthPollIntervalToDuration takes the int from Traffic Ops, which is in milliseconds, and returns a time.Duration
// TODO change Traffic Ops Client API to a time.Duration
func trafficOpsHealthPollIntervalToDuration(t int) time.Duration {
return time.Duration(t) * time.Millisecond
}
// trafficOpsTOPollIntervalToDuration takes the int from Traffic Ops, which is in milliseconds, and returns a time.Duration
// TODO change Traffic Ops Client API to a time.Duration
func trafficOpsTOPollIntervalToDuration(t int) time.Duration {
return time.Duration(t) * time.Millisecond
}
// PollIntervalRatio is the ratio of the configuration interval to poll. The configured intervals are 'target' times, so we actually poll at some small fraction less, in attempt to make the actual poll marginally less than the target.
const PollIntervalRatio = float64(0.97) // TODO make config?
// TODO timing, and determine if the case, or its internal `for`, should be put in a goroutine
// TODO determine if subscribers take action on change, and change to mutexed objects if not.
func monitorConfigListen(
monitorConfigTS threadsafe.TrafficMonitorConfigMap,
monitorConfigPollChan <-chan poller.MonitorCfg,
localStates peer.CRStatesThreadsafe,
peerStates peer.CRStatesPeersThreadsafe,
statURLSubscriber chan<- poller.CachePollerConfig,
healthURLSubscriber chan<- poller.CachePollerConfig,
peerURLSubscriber chan<- poller.CachePollerConfig,
toIntervalSubscriber chan<- time.Duration,
cachesChangeSubscriber chan<- struct{},
cfg config.Config,
staticAppData config.StaticAppData,
toSession towrap.ITrafficOpsSession,
toData todata.TODataThreadsafe,
) {
defer func() {
if err := recover(); err != nil {
log.Errorf("MonitorConfigManager panic: %v\n", err)
} else {
log.Errorf("MonitorConfigManager failed without panic\n")
}
os.Exit(1) // The Monitor can't run without a MonitorConfigManager
}()
logMissingIntervalParams := true
for pollerMonitorCfg := range monitorConfigPollChan {
monitorConfig := pollerMonitorCfg.Cfg
cdn := pollerMonitorCfg.CDN
monitorConfigTS.Set(monitorConfig)
if err := toData.Update(toSession, cdn); err != nil {
log.Errorln("Updating Traffic Ops Data: " + err.Error())
}
healthURLs := map[string]poller.PollConfig{}
statURLs := map[string]poller.PollConfig{}
peerURLs := map[string]poller.PollConfig{}
caches := map[string]string{}
intervals, err := getIntervals(monitorConfig, cfg, logMissingIntervalParams)
logMissingIntervalParams = false // only log missing parameters once
if err != nil {
log.Errorf("monitor config error getting polling intervals, can't poll: %v", err)
continue
}
for _, srv := range monitorConfig.TrafficServer {
caches[srv.HostName] = srv.ServerStatus
cacheName := tc.CacheName(srv.HostName)
srvStatus := tc.CacheStatusFromString(srv.ServerStatus)
if srvStatus == tc.CacheStatusOnline {
localStates.AddCache(cacheName, tc.IsAvailable{IsAvailable: true})
continue
}
if srvStatus == tc.CacheStatusOffline {
continue
}
// seed states with available = false until our polling cycle picks up a result
if _, exists := localStates.GetCache(cacheName); !exists {
localStates.AddCache(cacheName, tc.IsAvailable{IsAvailable: false})
}
pollURLStr := monitorConfig.Profile[srv.Profile].Parameters.HealthPollingURL
if pollURLStr == "" {
log.Errorf("monitor config server %v profile %v has no polling URL; can't poll", srv.HostName, srv.Profile)
continue
}
format := monitorConfig.Profile[srv.Profile].Parameters.HealthPollingFormat
if format == "" {
format = cache.DefaultStatsType
log.Infof("health.polling.format for '%v' is empty, using default '%v'", srv.HostName, format)
}
pollType := monitorConfig.Profile[srv.Profile].Parameters.HealthPollingType
if pollType == "" {
pollType = poller.DefaultPollerType
log.Infof("health.polling.type for '%v' is empty, using default '%v'", srv.HostName, pollType)
}
pollURLStr = createServerHealthPollURL(pollURLStr, srv)
connTimeout := trafficOpsHealthConnectionTimeoutToDuration(monitorConfig.Profile[srv.Profile].Parameters.HealthConnectionTimeout)
if connTimeout == 0 {
connTimeout = DefaultHealthConnectionTimeout
log.Warnln("profile " + srv.Profile + " health.connection.timeout Parameter is missing or zero, using default " + DefaultHealthConnectionTimeout.String())
}
healthURLs[srv.HostName] = poller.PollConfig{URL: pollURLStr, Host: srv.FQDN, Timeout: connTimeout, Format: format, PollType: pollType}
statURL := createServerStatPollURL(pollURLStr)
statURLs[srv.HostName] = poller.PollConfig{URL: statURL, Host: srv.FQDN, Timeout: connTimeout, Format: format, PollType: pollType}
}
peerSet := map[tc.TrafficMonitorName]struct{}{}
for _, srv := range monitorConfig.TrafficMonitor {
if srv.HostName == staticAppData.Hostname {
continue
}
if tc.CacheStatusFromString(srv.ServerStatus) != tc.CacheStatusOnline {
continue
}
// TODO: the URL should be config driven. -jse
url := fmt.Sprintf("http://%s:%d/publish/CrStates?raw", srv.IP, srv.Port)
peerURLs[srv.HostName] = poller.PollConfig{URL: url, Host: srv.FQDN} // TODO determine timeout.
peerSet[tc.TrafficMonitorName(srv.HostName)] = struct{}{}
}
statURLSubscriber <- poller.CachePollerConfig{Urls: statURLs, Interval: intervals.Stat, NoKeepAlive: intervals.StatNoKeepAlive}
healthURLSubscriber <- poller.CachePollerConfig{Urls: healthURLs, Interval: intervals.Health, NoKeepAlive: intervals.HealthNoKeepAlive}
peerURLSubscriber <- poller.CachePollerConfig{Urls: peerURLs, Interval: intervals.Peer, NoKeepAlive: intervals.PeerNoKeepAlive}
toIntervalSubscriber <- intervals.TO
peerStates.SetTimeout((intervals.Peer + cfg.HTTPTimeout) * 2)
peerStates.SetPeers(peerSet)
for cacheName := range localStates.GetCaches() {
if _, exists := monitorConfig.TrafficServer[string(cacheName)]; !exists {
log.Warnf("Removing %s from localStates", cacheName)
localStates.DeleteCache(cacheName)
}
}
if len(healthURLs) == 0 {
log.Errorf("No REPORTED caches exist in Traffic Ops, nothing to poll.")
}
cachesChangeSubscriber <- struct{}{}
// TODO because there are multiple writers to localStates.DeliveryService, there is a race condition, where MonitorConfig (this func) and HealthResultManager could write at the same time, and the HealthResultManager could overwrite a delivery service addition or deletion here. Probably the simplest and most performant fix would be a lock-free algorithm using atomic compare-and-swaps.
for _, ds := range monitorConfig.DeliveryService {
// since caches default to unavailable, also default DS false
if _, exists := localStates.GetDeliveryService(tc.DeliveryServiceName(ds.XMLID)); !exists {
localStates.SetDeliveryService(tc.DeliveryServiceName(ds.XMLID), tc.CRStatesDeliveryService{IsAvailable: false, DisabledLocations: []tc.CacheGroupName{}}) // important to initialize DisabledLocations, so JSON is `[]` not `null`
}
}
for ds := range localStates.GetDeliveryServices() {
if _, exists := monitorConfig.DeliveryService[string(ds)]; !exists {
localStates.DeleteDeliveryService(ds)
}
}
}
}
// createServerHealthPollURL takes the template pollingURLStr, and replaces variables with data from srv, and returns the polling URL for srv.
func createServerHealthPollURL(pollingURLStr string, srv tc.TrafficServer) string {
pollingURLStr = strings.NewReplacer(
"${hostname}", srv.IP,
"${interface_name}", srv.InterfaceName,
"application=plugin.remap", "application=system",
"application=", "application=system",
).Replace(pollingURLStr)
if strings.HasPrefix(strings.ToLower(pollingURLStr), "https") {
if srv.HTTPSPort != 0 {
pollURL, err := url.Parse(pollingURLStr)
if err != nil {
log.Warnln("profile " + srv.Profile + " cache '" + srv.FQDN + "' polling URL '" + pollingURLStr + "' failed to parse, may not be a valid URL! Using anyway, not using custom HTTPS Port " + strconv.Itoa(srv.HTTPSPort) + "!")
} else if pollURL.Port() == "" { // if there's both an HTTPS Port and a port in the polling URL, the polling URL takes precedence
pollURL.Host += ":" + strconv.Itoa(srv.HTTPSPort)
pollingURLStr = pollURL.String()
}
}
} else {
if srv.Port != 0 {
pollURL, err := url.Parse(pollingURLStr)
if err != nil {
log.Warnln("profile " + srv.Profile + " cache '" + srv.FQDN + "' polling URL '" + pollingURLStr + "' failed to parse, may not be a valid URL! Using anyway, not using custom TCP Port " + strconv.Itoa(srv.Port) + "!")
} else if pollURL.Port() == "" { // if there's both a TCP Port and a port in the polling URL, the polling URL takes precedence
pollURL.Host += ":" + strconv.Itoa(srv.Port)
pollingURLStr = pollURL.String()
}
}
}
return pollingURLStr
}
// createServerStatPollURL takes the health polling URL string, and modifies it to be the stat poll URL.
// Note this does not replace template variables with server values, healthPollURLStr must be the health URL for a given server, not a template.
func createServerStatPollURL(healthPollURLStr string) string {
return strings.NewReplacer("application=system", "application=").Replace(healthPollURLStr)
}