pkg/dispatcher/dispatcher.go - yunikorn-k8shim - Git at Google

 /*
  Licensed to the Apache Software Foundation (ASF) under one
  or more contributor license agreements.  See the NOTICE file
  distributed with this work for additional information
  regarding copyright ownership.  The ASF licenses this file
  to you under the Apache License, Version 2.0 (the
  "License"); you may not use this file except in compliance
  with the License.  You may obtain a copy of the License at

      http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
 */

 package dispatcher

 import (
 	"fmt"
 	"sync"
 	"sync/atomic"
 	"time"

 	"go.uber.org/zap"

 	"github.com/apache/yunikorn-k8shim/pkg/common/events"
 	"github.com/apache/yunikorn-k8shim/pkg/conf"
 	"github.com/apache/yunikorn-k8shim/pkg/log"
 )

 var dispatcher *Dispatcher
 var once sync.Once

 type EventType int8

 const (
 	EventTypeApp EventType = iota
 	EventTypeTask
 	EventTypeNode
 	EventTypeScheduler
 	EventTypeAppStatus
 )

 var (
 	AsyncDispatchLimit         int32
 	AsyncDispatchCheckInterval = 3 * time.Second
 	DispatchTimeout            time.Duration
 	asyncDispatchCount         int32 = 0
 )

 // central dispatcher that dispatches scheduling events.
 type Dispatcher struct {
 	eventChan chan events.SchedulingEvent
 	stopChan  chan struct{}
 	handlers  map[EventType]func(interface{})
 	running   atomic.Value
 	lock      sync.RWMutex
 }

 func initDispatcher() {
 	eventChannelCapacity := conf.GetSchedulerConf().EventChannelCapacity
 	dispatcher = &Dispatcher{
 		eventChan: make(chan events.SchedulingEvent, eventChannelCapacity),
 		handlers:  make(map[EventType]func(interface{})),
 		stopChan:  make(chan struct{}),
 		running:   atomic.Value{},
 		lock:      sync.RWMutex{},
 	}
 	dispatcher.setRunning(false)
 	DispatchTimeout = conf.GetSchedulerConf().DispatchTimeout
 	AsyncDispatchLimit = int32(eventChannelCapacity / 10)
 	if AsyncDispatchLimit < 10000 {
 		AsyncDispatchLimit = 10000
 	}
 	log.Logger().Info("Init dispatcher",
 		zap.Int("EventChannelCapacity", eventChannelCapacity),
 		zap.Int32("AsyncDispatchLimit", AsyncDispatchLimit),
 		zap.Float64("DispatchTimeoutInSeconds", DispatchTimeout.Seconds()))
 }

 func RegisterEventHandler(eventType EventType, handlerFn func(interface{})) {
 	eventDispatcher := getDispatcher()
 	eventDispatcher.lock.Lock()
 	defer eventDispatcher.lock.Unlock()
 	eventDispatcher.handlers[eventType] = handlerFn
 }

 // a thread-safe way to get event handlers
 func getEventHandler(eventType EventType) func(interface{}) {
 	eventDispatcher := getDispatcher()
 	eventDispatcher.lock.RLock()
 	defer eventDispatcher.lock.RUnlock()
 	return eventDispatcher.handlers[eventType]
 }

 func getDispatcher() *Dispatcher {
 	// init the dispatcher if it hasn't yet done
 	// this is only called once
 	once.Do(initDispatcher)
 	return dispatcher
 }

 // dispatches scheduler events to actual app/task handler,
 // each app/task has its own state machine and maintain their own states.
 // currently all events share same channel, so they are dispatched
 // one by one in order.
 func Dispatch(event events.SchedulingEvent) {
 	// currently if dispatch fails, we simply log the error
 	// we may revisit this later, e.g add retry here
 	if err := getDispatcher().dispatch(event); err != nil {
 		log.Logger().Warn("failed to dispatch SchedulingEvent",
 			zap.Error(err))
 	}
 }

 func (p *Dispatcher) isRunning() bool {
 	return p.running.Load().(bool)
 }

 func (p *Dispatcher) setRunning(flag bool) {
 	p.running.Store(flag)
 }

 func (p *Dispatcher) dispatch(event events.SchedulingEvent) error {
 	if !p.isRunning() {
 		return fmt.Errorf("dispatcher is not running")
 	}
 	select {
 	case p.eventChan <- event:
 		return nil
 	default:
 		p.asyncDispatch(event)
 		return nil
 	}
 }

 // async-dispatch try to enqueue the event in every 3 seconds util timeout,
 // it's only called when event channel is full.
 func (p *Dispatcher) asyncDispatch(event events.SchedulingEvent) {
 	count := atomic.AddInt32(&asyncDispatchCount, 1)
 	log.Logger().Warn("event channel is full, transition to async-dispatch mode",
 		zap.Int32("asyncDispatchCount", count))
 	if count > AsyncDispatchLimit {
 		panic(fmt.Errorf("dispatcher exceeds async-dispatch limit"))
 	}
 	go func(beginTime time.Time, stop chan struct{}) {
 		defer atomic.AddInt32(&asyncDispatchCount, -1)
 		for p.isRunning() {
 			select {
 			case <-stop:
 				return
 			case p.eventChan <- event:
 				return
 			case <-time.After(AsyncDispatchCheckInterval):
 				elapseTime := time.Since(beginTime)
 				if elapseTime >= DispatchTimeout {
 					log.Logger().Error("dispatch timeout",
 						zap.Float64("elapseSeconds", elapseTime.Seconds()))
 					return
 				}
 				log.Logger().Warn("event channel is full, keep waiting...",
 					zap.Float64("elapseSeconds", elapseTime.Seconds()))
 			}
 		}
 	}(time.Now(), p.stopChan)
 }

 func (p *Dispatcher) drain() {
 	for len(p.eventChan) > 0 {
 		log.Logger().Info("wait dispatcher to drain",
 			zap.Int("remaining events", len(p.eventChan)))
 		time.Sleep(1 * time.Second)
 	}
 	log.Logger().Info("dispatcher is draining out")
 }

 func Start() {
 	log.Logger().Info("starting the dispatcher")
 	if getDispatcher().isRunning() {
 		log.Logger().Info("dispatcher is already running")
 		return
 	}
 	getDispatcher().stopChan = make(chan struct{})
 	go func() {
 		for {
 			select {
 			case event := <-getDispatcher().eventChan:
 				switch v := event.(type) {
 				case events.ApplicationStatusEvent:
 					getEventHandler(EventTypeAppStatus)(v)
 				case events.TaskEvent:
 					getEventHandler(EventTypeTask)(v)
 				case events.ApplicationEvent:
 					getEventHandler(EventTypeApp)(v)
 				case events.SchedulerNodeEvent:
 					getEventHandler(EventTypeNode)(v)
 				case events.SchedulerEvent:
 					getEventHandler(EventTypeScheduler)(v)
 				default:
 					log.Logger().Fatal("unsupported event",
 						zap.Any("event", v))
 				}
 			case <-getDispatcher().stopChan:
 				log.Logger().Info("shutting down event channel")
 				getDispatcher().setRunning(false)
 				return
 			}
 		}
 	}()
 	getDispatcher().setRunning(true)
 }

 // stop the dispatcher and wait at most 5 seconds gracefully
 func Stop() {
 	log.Logger().Info("stopping the dispatcher")

 	var chanClosed bool
 	select {
 	case <-getDispatcher().stopChan:
 		chanClosed = true
 	default:
 	}

 	if chanClosed {
 		if getDispatcher().isRunning() {
 			log.Logger().Info("dispatcher shutdown in progress")
 		} else {
 			log.Logger().Info("dispatcher is already stopped")
 		}
 		return
 	}

 	close(getDispatcher().stopChan)
 	maxTimeout := 5
 	for getDispatcher().isRunning() && maxTimeout > 0 {
 		log.Logger().Info("waiting for dispatcher to be stopped",
 			zap.Int("remainingSeconds", maxTimeout))
 		time.Sleep(1 * time.Second)
 		maxTimeout--
 	}
 	if getDispatcher().isRunning() {
 		log.Logger().Warn("dispatcher even processing did not stop properly")
 	} else {
 		log.Logger().Info("dispatcher stopped successfully")
 	}
 }
	/*
	Licensed to the Apache Software Foundation (ASF) under one
	or more contributor license agreements. See the NOTICE file
	distributed with this work for additional information
	regarding copyright ownership. The ASF licenses this file
	to you under the Apache License, Version 2.0 (the
	"License"); you may not use this file except in compliance
	with the License. You may obtain a copy of the License at

	http://www.apache.org/licenses/LICENSE-2.0

	Unless required by applicable law or agreed to in writing, software
	distributed under the License is distributed on an "AS IS" BASIS,
	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	See the License for the specific language governing permissions and
	limitations under the License.
	*/

	package dispatcher

	import (
	"fmt"
	"sync"
	"sync/atomic"
	"time"

	"go.uber.org/zap"

	"github.com/apache/yunikorn-k8shim/pkg/common/events"
	"github.com/apache/yunikorn-k8shim/pkg/conf"
	"github.com/apache/yunikorn-k8shim/pkg/log"
	)

	var dispatcher *Dispatcher
	var once sync.Once

	type EventType int8

	const (
	EventTypeApp EventType = iota
	EventTypeTask
	EventTypeNode
	EventTypeScheduler
	EventTypeAppStatus
	)

	var (
	AsyncDispatchLimit int32
	AsyncDispatchCheckInterval = 3 * time.Second
	DispatchTimeout time.Duration
	asyncDispatchCount int32 = 0
	)

	// central dispatcher that dispatches scheduling events.
	type Dispatcher struct {
	eventChan chan events.SchedulingEvent
	stopChan chan struct{}
	handlers map[EventType]func(interface{})
	running atomic.Value
	lock sync.RWMutex
	}

	func initDispatcher() {
	eventChannelCapacity := conf.GetSchedulerConf().EventChannelCapacity
	dispatcher = &Dispatcher{
	eventChan: make(chan events.SchedulingEvent, eventChannelCapacity),
	handlers: make(map[EventType]func(interface{})),
	stopChan: make(chan struct{}),
	running: atomic.Value{},
	lock: sync.RWMutex{},
	}
	dispatcher.setRunning(false)
	DispatchTimeout = conf.GetSchedulerConf().DispatchTimeout
	AsyncDispatchLimit = int32(eventChannelCapacity / 10)
	if AsyncDispatchLimit < 10000 {
	AsyncDispatchLimit = 10000
	}
	log.Logger().Info("Init dispatcher",
	zap.Int("EventChannelCapacity", eventChannelCapacity),
	zap.Int32("AsyncDispatchLimit", AsyncDispatchLimit),
	zap.Float64("DispatchTimeoutInSeconds", DispatchTimeout.Seconds()))
	}

	func RegisterEventHandler(eventType EventType, handlerFn func(interface{})) {
	eventDispatcher := getDispatcher()
	eventDispatcher.lock.Lock()
	defer eventDispatcher.lock.Unlock()
	eventDispatcher.handlers[eventType] = handlerFn
	}

	// a thread-safe way to get event handlers
	func getEventHandler(eventType EventType) func(interface{}) {
	eventDispatcher := getDispatcher()
	eventDispatcher.lock.RLock()
	defer eventDispatcher.lock.RUnlock()
	return eventDispatcher.handlers[eventType]
	}

	func getDispatcher() *Dispatcher {
	// init the dispatcher if it hasn't yet done
	// this is only called once
	once.Do(initDispatcher)
	return dispatcher
	}

	// dispatches scheduler events to actual app/task handler,
	// each app/task has its own state machine and maintain their own states.
	// currently all events share same channel, so they are dispatched
	// one by one in order.
	func Dispatch(event events.SchedulingEvent) {
	// currently if dispatch fails, we simply log the error
	// we may revisit this later, e.g add retry here
	if err := getDispatcher().dispatch(event); err != nil {
	log.Logger().Warn("failed to dispatch SchedulingEvent",
	zap.Error(err))
	}
	}

	func (p *Dispatcher) isRunning() bool {
	return p.running.Load().(bool)
	}

	func (p *Dispatcher) setRunning(flag bool) {
	p.running.Store(flag)
	}

	func (p *Dispatcher) dispatch(event events.SchedulingEvent) error {
	if !p.isRunning() {
	return fmt.Errorf("dispatcher is not running")
	}
	select {
	case p.eventChan <- event:
	return nil
	default:
	p.asyncDispatch(event)
	return nil
	}
	}

	// async-dispatch try to enqueue the event in every 3 seconds util timeout,
	// it's only called when event channel is full.
	func (p *Dispatcher) asyncDispatch(event events.SchedulingEvent) {
	count := atomic.AddInt32(&asyncDispatchCount, 1)
	log.Logger().Warn("event channel is full, transition to async-dispatch mode",
	zap.Int32("asyncDispatchCount", count))
	if count > AsyncDispatchLimit {
	panic(fmt.Errorf("dispatcher exceeds async-dispatch limit"))
	}
	go func(beginTime time.Time, stop chan struct{}) {
	defer atomic.AddInt32(&asyncDispatchCount, -1)
	for p.isRunning() {
	select {
	case <-stop:
	return
	case p.eventChan <- event:
	return
	case <-time.After(AsyncDispatchCheckInterval):
	elapseTime := time.Since(beginTime)
	if elapseTime >= DispatchTimeout {
	log.Logger().Error("dispatch timeout",
	zap.Float64("elapseSeconds", elapseTime.Seconds()))
	return
	}
	log.Logger().Warn("event channel is full, keep waiting...",
	zap.Float64("elapseSeconds", elapseTime.Seconds()))
	}
	}
	}(time.Now(), p.stopChan)
	}

	func (p *Dispatcher) drain() {
	for len(p.eventChan) > 0 {
	log.Logger().Info("wait dispatcher to drain",
	zap.Int("remaining events", len(p.eventChan)))
	time.Sleep(1 * time.Second)
	}
	log.Logger().Info("dispatcher is draining out")
	}

	func Start() {
	log.Logger().Info("starting the dispatcher")
	if getDispatcher().isRunning() {
	log.Logger().Info("dispatcher is already running")
	return
	}
	getDispatcher().stopChan = make(chan struct{})
	go func() {
	for {
	select {
	case event := <-getDispatcher().eventChan:
	switch v := event.(type) {
	case events.ApplicationStatusEvent:
	getEventHandler(EventTypeAppStatus)(v)
	case events.TaskEvent:
	getEventHandler(EventTypeTask)(v)
	case events.ApplicationEvent:
	getEventHandler(EventTypeApp)(v)
	case events.SchedulerNodeEvent:
	getEventHandler(EventTypeNode)(v)
	case events.SchedulerEvent:
	getEventHandler(EventTypeScheduler)(v)
	default:
	log.Logger().Fatal("unsupported event",
	zap.Any("event", v))
	}
	case <-getDispatcher().stopChan:
	log.Logger().Info("shutting down event channel")
	getDispatcher().setRunning(false)
	return
	}
	}
	}()
	getDispatcher().setRunning(true)
	}

	// stop the dispatcher and wait at most 5 seconds gracefully
	func Stop() {
	log.Logger().Info("stopping the dispatcher")

	var chanClosed bool
	select {
	case <-getDispatcher().stopChan:
	chanClosed = true
	default:
	}

	if chanClosed {
	if getDispatcher().isRunning() {
	log.Logger().Info("dispatcher shutdown in progress")
	} else {
	log.Logger().Info("dispatcher is already stopped")
	}
	return
	}

	close(getDispatcher().stopChan)
	maxTimeout := 5
	for getDispatcher().isRunning() && maxTimeout > 0 {
	log.Logger().Info("waiting for dispatcher to be stopped",
	zap.Int("remainingSeconds", maxTimeout))
	time.Sleep(1 * time.Second)
	maxTimeout--
	}
	if getDispatcher().isRunning() {
	log.Logger().Warn("dispatcher even processing did not stop properly")
	} else {
	log.Logger().Info("dispatcher stopped successfully")
	}
	}