| /* |
| Copyright 2016 The Kubernetes Authors. |
| |
| Licensed under the Apache License, Version 2.0 (the "License"); |
| you may not use this file except in compliance with the License. |
| You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| */ |
| |
| package priorities |
| |
| import ( |
| "fmt" |
| "strings" |
| |
| "k8s.io/api/core/v1" |
| schedulerapi "k8s.io/kubernetes/pkg/scheduler/api" |
| schedulercache "k8s.io/kubernetes/pkg/scheduler/cache" |
| "k8s.io/kubernetes/pkg/util/parsers" |
| ) |
| |
// minThreshold and maxThreshold bound the range of summed image sizes that is mapped onto the score range.
// They correspond to a reasonable size range for container images compressed and stored in registries;
// the 90th percentile of images on dockerhub falls into this range.
const (
	mb           int64 = 1 << 20
	minThreshold int64 = 23 * mb
	maxThreshold int64 = 1000 * mb
)
| |
| // ImageLocalityPriorityMap is a priority function that favors nodes that already have requested pod container's images. |
| // It will detect whether the requested images are present on a node, and then calculate a score ranging from 0 to 10 |
| // based on the total size of those images. |
| // - If none of the images are present, this node will be given the lowest priority. |
| // - If some of the images are present on a node, the larger their sizes' sum, the higher the node's priority. |
| func ImageLocalityPriorityMap(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (schedulerapi.HostPriority, error) { |
| node := nodeInfo.Node() |
| if node == nil { |
| return schedulerapi.HostPriority{}, fmt.Errorf("node not found") |
| } |
| |
| var score int |
| if priorityMeta, ok := meta.(*priorityMetadata); ok { |
| score = calculatePriority(sumImageScores(nodeInfo, pod.Spec.Containers, priorityMeta.totalNumNodes)) |
| } else { |
| // if we are not able to parse priority meta data, skip this priority |
| score = 0 |
| } |
| |
| return schedulerapi.HostPriority{ |
| Host: node.Name, |
| Score: score, |
| }, nil |
| } |
| |
| // calculatePriority returns the priority of a node. Given the sumScores of requested images on the node, the node's |
| // priority is obtained by scaling the maximum priority value with a ratio proportional to the sumScores. |
| func calculatePriority(sumScores int64) int { |
| if sumScores < minThreshold { |
| sumScores = minThreshold |
| } else if sumScores > maxThreshold { |
| sumScores = maxThreshold |
| } |
| |
| return int(int64(schedulerapi.MaxPriority) * (sumScores - minThreshold) / (maxThreshold - minThreshold)) |
| } |
| |
| // sumImageScores returns the sum of image scores of all the containers that are already on the node. |
| // Each image receives a raw score of its size, scaled by scaledImageScore. The raw scores are later used to calculate |
| // the final score. Note that the init containers are not considered for it's rare for users to deploy huge init containers. |
| func sumImageScores(nodeInfo *schedulercache.NodeInfo, containers []v1.Container, totalNumNodes int) int64 { |
| var sum int64 |
| imageStates := nodeInfo.ImageStates() |
| |
| for _, container := range containers { |
| if state, ok := imageStates[normalizedImageName(container.Image)]; ok { |
| sum += scaledImageScore(state, totalNumNodes) |
| } |
| } |
| |
| return sum |
| } |
| |
| // scaledImageScore returns an adaptively scaled score for the given state of an image. |
| // The size of the image is used as the base score, scaled by a factor which considers how much nodes the image has "spread" to. |
| // This heuristic aims to mitigate the undesirable "node heating problem", i.e., pods get assigned to the same or |
| // a few nodes due to image locality. |
| func scaledImageScore(imageState *schedulercache.ImageStateSummary, totalNumNodes int) int64 { |
| spread := float64(imageState.NumNodes) / float64(totalNumNodes) |
| return int64(float64(imageState.Size) * spread) |
| } |
| |
| // normalizedImageName returns the CRI compliant name for a given image. |
| // TODO: cover the corner cases of missed matches, e.g, |
| // 1. Using Docker as runtime and docker.io/library/test:tag in pod spec, but only test:tag will present in node status |
| // 2. Using the implicit registry, i.e., test:tag or library/test:tag in pod spec but only docker.io/library/test:tag |
| // in node status; note that if users consistently use one registry format, this should not happen. |
| func normalizedImageName(name string) string { |
| if strings.LastIndex(name, ":") <= strings.LastIndex(name, "/") { |
| name = name + ":" + parsers.DefaultImageTag |
| } |
| return name |
| } |