src/main/java/org/apache/aurora/scheduler/sla/SlaAlgorithm.java - aurora - Git at Google

 /**
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.aurora.scheduler.sla;

 import java.util.Collection;
 import java.util.List;
 import java.util.Map;
 import java.util.Objects;

 import com.google.common.base.Function;
 import com.google.common.base.Functions;
 import com.google.common.base.Predicate;
 import com.google.common.base.Predicates;
 import com.google.common.collect.FluentIterable;
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.ImmutableSet;
 import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
 import com.google.common.collect.Multimaps;
 import com.google.common.collect.Ordering;
 import com.google.common.collect.Range;

 import org.apache.aurora.common.collections.Pair;
 import org.apache.aurora.gen.ScheduleStatus;
 import org.apache.aurora.scheduler.base.Tasks;
 import org.apache.aurora.scheduler.storage.entities.IJobKey;
 import org.apache.aurora.scheduler.storage.entities.IScheduledTask;
 import org.apache.aurora.scheduler.storage.entities.ITaskEvent;

 import static java.util.Objects.requireNonNull;

 import static org.apache.aurora.gen.ScheduleStatus.ASSIGNED;
 import static org.apache.aurora.gen.ScheduleStatus.PENDING;
 import static org.apache.aurora.gen.ScheduleStatus.RUNNING;
 import static org.apache.aurora.gen.ScheduleStatus.STARTING;

 /**
  * Defines an SLA algorithm to be applied to a {@link IScheduledTask}
  * set for calculating a specific SLA metric.
  */
 interface SlaAlgorithm {

   /**
    * Applies this algorithm to a set of {@link IScheduledTask} to
    * produce a named metric value over the specified time frame.
    * <p/>
    * The name the value is reported under is not supplied by this method; it is paired with
    * the algorithm by {@link AlgorithmType}.
    *
    * @param tasks Set of tasks to apply this algorithm to.
    * @param timeFrame Relevant time frame. Implementations in this file compare its endpoints
    *                  against task event timestamps, so both are presumably expressed in the
    *                  same unit (likely epoch milliseconds - confirm against callers).
    * @return Produced metric value.
    */
   Number calculate(Iterable<IScheduledTask> tasks, Range<Long> timeFrame);

   /**
    * Pre-configured SLA algorithms, each paired with the name its metric is reported under.
    * <p/>
    * Job uptime metric names are rendered from {@code JobUptime.NAME_FORMAT} using
    * {@link java.util.Locale#ROOT}, so the decimal separator in the name does not vary with
    * the JVM default locale.
    */
   enum AlgorithmType {

     JOB_UPTIME_99(99f),
     JOB_UPTIME_95(95f),
     JOB_UPTIME_90(90f),
     JOB_UPTIME_75(75f),
     JOB_UPTIME_50(50f),
     AGGREGATE_PLATFORM_UPTIME(new AggregatePlatformUptime(), "platform_uptime_percent"),
     MEDIAN_TIME_TO_ASSIGNED(new MedianAlgorithm(ASSIGNED), "mtta_ms"),
     MEDIAN_TIME_TO_STARTING(new MedianAlgorithm(STARTING), "mtts_ms"),
     MEDIAN_TIME_TO_RUNNING(new MedianAlgorithm(RUNNING), "mttr_ms");

     private final SlaAlgorithm algorithm;
     private final String name;

     /**
      * Creates a job uptime entry. The algorithm and its metric name are both derived from the
      * same percentile value so the two cannot drift apart.
      *
      * @param jobUptimePercentile Percentile handed to {@link JobUptime}.
      */
     AlgorithmType(float jobUptimePercentile) {
       this(new JobUptime(jobUptimePercentile), jobUptimeName(jobUptimePercentile));
     }

     /**
      * Creates an entry from an explicit algorithm and metric name.
      *
      * @param algorithm Algorithm instance to expose.
      * @param name Metric name the algorithm result is reported under.
      */
     AlgorithmType(SlaAlgorithm algorithm, String name) {
       this.algorithm = algorithm;
       this.name = name;
     }

     /**
      * Renders the job uptime metric name for a percentile.
      *
      * @param percentile Percentile to embed in the name.
      * @return Metric name, always formatted with '.' as the decimal separator.
      */
     private static String jobUptimeName(float percentile) {
       // Locale.ROOT keeps "%.2f" from producing e.g. "99,00" under a comma-decimal default
       // locale, which would silently rename the exported metric.
       return String.format(java.util.Locale.ROOT, JobUptime.NAME_FORMAT, percentile);
     }

     /**
      * @return The algorithm instance bound to this type.
      */
     SlaAlgorithm getAlgorithm() {
       return algorithm;
     }

     /**
      * @return The metric name bound to this type.
      */
     String getAlgorithmName() {
       return name;
     }
   }

   /**
    * Median time to status SLA algorithm.
    * <p/>
    * Reports the median time a set of tasks spent waiting to reach the configured status.
    * It is a combined metric: it reflects how scheduling performance depends on the requested
    * resources (user scope) as well as how efficient the scheduler's internal bin-packing
    * algorithm is (platform scope).
    * <p/>
    * Median time calculated as:
    * <pre>
    *    MT =  MEDIAN(Wait_times)
    * where:
    *    Wait_times - a collection of qualifying time intervals between PENDING and specified task
    *                 state. An interval is qualified if its end point is contained by the sample
    *                 time frame.
    * </pre>
    */
   final class MedianAlgorithm implements SlaAlgorithm {

     private final ScheduleStatus status;

     private MedianAlgorithm(ScheduleStatus status) {
       this.status = status;
     }

     /**
      * Collects one wait time per active task that reached the target status inside
      * {@code timeFrame} and returns the 50th percentile of those samples.
      *
      * @param tasks Tasks to sample; only those whose status is in {@code Tasks.ACTIVE_STATES}
      *              are considered.
      * @param timeFrame Time frame the target status event must fall into.
      * @return Result of {@code SlaUtil.percentile} over the collected wait times.
      * @throws IllegalArgumentException if a qualifying event is found while the recorded
      *         PENDING timestamp is still zero.
      */
     @Override
     public Number calculate(Iterable<IScheduledTask> tasks, Range<Long> timeFrame) {
       Predicate<IScheduledTask> isActive =
           Predicates.compose(Predicates.in(Tasks.ACTIVE_STATES), IScheduledTask::getStatus);

       List<Long> samples = Lists.newLinkedList();
       for (IScheduledTask candidate : tasks) {
         if (!isActive.apply(candidate)) {
           continue;
         }

         // Timestamp of the most recent PENDING event seen so far; zero means "none yet".
         long lastPending = 0;
         for (ITaskEvent transition : candidate.getTaskEvents()) {
           ScheduleStatus reached = transition.getStatus();
           if (reached == PENDING) {
             lastPending = transition.getTimestamp();
             continue;
           }
           if (reached != status || !timeFrame.contains(transition.getTimestamp())) {
             continue;
           }

           if (lastPending == 0) {
             throw new IllegalArgumentException("SLA: missing PENDING status for:"
                 + candidate.getAssignedTask().getTaskId());
           }

           // Only the first qualifying event per task contributes a sample.
           samples.add(transition.getTimestamp() - lastPending);
           break;
         }
       }

       return SlaUtil.percentile(samples, 50.0);
     }
   }

   /**
    * Job uptime SLA algorithm.
    * <p/>
    * Represents the percentage of instances considered to be in running state for
    * the specified duration relative to SLA calculation time.
    */
   final class JobUptime implements SlaAlgorithm {

     private static final String NAME_FORMAT = "job_uptime_%.2f_sec";

     private static final Predicate<IScheduledTask> IS_RUNNING =
         Predicates.compose(Predicates.in(ImmutableSet.of(RUNNING)), IScheduledTask::getStatus);

     private static final Function<IScheduledTask, ITaskEvent> TASK_TO_EVENT =
         Tasks::getLatestEvent;

     private final float percentile;

     private JobUptime(float percentile) {
       this.percentile = percentile;
     }

     /**
      * For every RUNNING task, measures the distance between the upper end of {@code timeFrame}
      * and the timestamp of the task's latest event, then reports the configured percentile of
      * those distances divided by 1000.
      *
      * @param tasks Tasks to sample; tasks that are not RUNNING are skipped.
      * @param timeFrame Time frame whose upper endpoint serves as the reference point.
      * @return Percentile value scaled down by 1000 (presumably milliseconds to seconds, in
      *         line with the {@code _sec} suffix of the metric name - confirm timestamp unit).
      */
     @Override
     public Number calculate(Iterable<IScheduledTask> tasks, final Range<Long> timeFrame) {
       FluentIterable<ITaskEvent> latestEvents =
           FluentIterable.from(tasks).filter(IS_RUNNING).transform(TASK_TO_EVENT);

       // The endpoint is read lazily per element, so an empty input never touches it.
       List<Long> elapsed = latestEvents
           .transform(latest -> timeFrame.upperEndpoint() - latest.getTimestamp())
           .toList();

       return (double) SlaUtil.percentile(elapsed, percentile) / 1000;
     }
   }

   /**
    * Aggregate Platform Uptime SLA algorithm.
    * <p/>
    * Measures how much runnable time the platform delivered to a set of task instances once
    * they had reached RUNNING. Time an instance spends out of a runnable state because of user
    * activity (e.g. newly created and waiting for host assignment, or restarted/killed by the
    * user) is excluded from the calculation.
    * <p/>
    * Aggregate platform uptime calculated as:
    * <pre>
    *    APU = SUM(Up_time) / SUM(SI - Removed_time)
    * where:
    *    Up_time - the aggregate instance UP time over the sampling interval (SI);
    *    SI - sampling interval (e.g. 1 minute);
    *    Removed_time - the aggregate instance REMOVED time over the sampling interval.
    * </pre>
    * The result is reported as a percentage.
    */
   final class AggregatePlatformUptime implements SlaAlgorithm {

     /**
      * Task platform SLA state.
      */
     enum SlaState {
       /**
        * Opens a period in which the task is not expected to be UP because of a user
        * initiated action or failure.
        * <p/>
        * Such periods are left out of the calculation entirely.
        */
       REMOVED,

       /**
        * Opens a period in which the task cannot reach the UP state for a reason that is not
        * user-related.
        * <p/>
        * Only platform-incurred transitions count. The time a newly created task (e.g. from a
        * job create/update) needs to reach UP is not platform downtime: a newly added PENDING
        * task is REMOVED, whereas a PENDING task rescheduled from LOST is DOWN. This keeps the
        * metric insensitive to user-initiated activity so it reflects system recovery
        * performance.
        */
       DOWN,

       /**
        * Opens a period in which the task is up and running from the Aurora platform
        * standpoint.
        * <p/>
        * Note: platform uptime is not necessarily real application availability, since a
        * hosted application still needs time to deploy, initialize, and start executing.
        */
       UP
     }

     /**
      * A half-open {@code [start, end)} span during which an instance stayed in one state.
      */
     private static class Interval {
       private final SlaState state;
       private final Range<Long> range;

       Interval(SlaState state, long start, long end) {
         this.range = Range.closedOpen(start, end);
         this.state = state;
       }
     }

     /**
      * Identity of a task instance: job key plus instance ID. Used as an index key.
      */
     private static class InstanceId {
       private final IJobKey jobKey;
       private final int id;

       InstanceId(IJobKey jobKey, int instanceId) {
         this.jobKey = requireNonNull(jobKey);
         this.id = instanceId;
       }

       @Override
       public boolean equals(Object o) {
         if (o instanceof InstanceId) {
           InstanceId that = (InstanceId) o;
           return Objects.equals(jobKey, that.jobKey) && Objects.equals(id, that.id);
         }
         return false;
       }

       @Override
       public int hashCode() {
         return Objects.hash(jobKey, id);
       }
     }

     private static final Function<IScheduledTask, InstanceId> TO_ID =
         AggregatePlatformUptime::instanceIdOf;

     private static final Function<ITaskEvent, Long> TASK_EVENT_TO_TIMESTAMP =
         event -> event.getTimestamp();

     /**
      * Combine all task events per given instance into the unified sorted instance history view.
      */
     private static final Function<Collection<IScheduledTask>, List<ITaskEvent>> TO_SORTED_EVENTS =
         AggregatePlatformUptime::sortedEvents;

     /**
      * Convert instance history into the {@link SlaState} based {@link Interval} list.
      */
     private static final Function<List<ITaskEvent>, List<Interval>> TASK_EVENTS_TO_INTERVALS =
         AggregatePlatformUptime::toIntervals;

     /**
      * Builds the index key for a task from its job key and instance ID.
      */
     private static InstanceId instanceIdOf(IScheduledTask task) {
       return new InstanceId(
           task.getAssignedTask().getTask().getJob(),
           task.getAssignedTask().getInstanceId());
     }

     /**
      * Merges the events of all given tasks into one list ordered by timestamp.
      */
     private static List<ITaskEvent> sortedEvents(Collection<IScheduledTask> tasks) {
       List<ITaskEvent> merged = Lists.newLinkedList();
       for (IScheduledTask task : tasks) {
         merged.addAll(task.getTaskEvents());
       }

       return Ordering.natural()
           .onResultOf(TASK_EVENT_TO_TIMESTAMP)
           .immutableSortedCopy(merged);
     }

     /**
      * Walks a timestamp-ordered event list and emits one {@link Interval} per stretch of
      * identical {@link SlaState}. The timeline starts in REMOVED at time zero and its final
      * interval is left open-ended at {@code Long.MAX_VALUE}.
      *
      * @throws IllegalArgumentException if an event carries a status not handled below.
      */
     private static List<Interval> toIntervals(List<ITaskEvent> events) {
       ImmutableList.Builder<Interval> timeline = ImmutableList.builder();
       Pair<SlaState, Long> open = Pair.of(SlaState.REMOVED, 0L);

       for (ITaskEvent event : events) {
         long at = event.getTimestamp();
         SlaState before = open.getFirst();

         // Each event either closes the open interval and starts a new one, or merely
         // continues the open interval when the resulting state is unchanged.
         switch (event.getStatus()) {
           case RUNNING:
             open = updateIntervals(at, SlaState.UP, open, timeline);
             break;

           case LOST:
           case DRAINING:
           case PREEMPTING:
             open = updateIntervals(at, SlaState.DOWN, open, timeline);
             break;

           case KILLED:
             // Counts as downtime only when it interrupts an UP period.
             if (before == SlaState.UP) {
               open = updateIntervals(at, SlaState.DOWN, open, timeline);
             }
             break;

           case PENDING:
           case ASSIGNED:
           case STARTING:
             // While recovering from a DOWN period these stay attributed to DOWN.
             if (before != SlaState.DOWN) {
               open = updateIntervals(at, SlaState.REMOVED, open, timeline);
             }
             break;

           case THROTTLED:
           case FINISHED:
           case RESTARTING:
           case FAILED:
           case KILLING:
             open = updateIntervals(at, SlaState.REMOVED, open, timeline);
             break;

           case INIT:
             // Ignore.
             break;

           default:
             throw new IllegalArgumentException("Unsupported status:" + event.getStatus());
         }
       }

       // Add the last event interval.
       timeline.add(new Interval(open.getFirst(), open.getSecond(), Long.MAX_VALUE));
       return timeline.build();
     }

     /**
      * Applies a state change at {@code timestamp}. When {@code state} differs from the open
      * interval's state, the open interval is closed at {@code timestamp}, appended to
      * {@code intervals}, and a new open interval is returned; otherwise {@code current} is
      * returned untouched.
      */
     private static Pair<SlaState, Long> updateIntervals(
         long timestamp,
         SlaState state,
         Pair<SlaState, Long> current,
         ImmutableList.Builder<Interval> intervals) {

       if (current.getFirst() != state) {
         intervals.add(new Interval(current.getFirst(), current.getSecond(), timestamp));
         return Pair.of(state, timestamp);
       }

       // Same state as the open interval - nothing to close.
       return current;
     }

     private AggregatePlatformUptime() {
       // Interface private.
     }

     /**
      * Groups tasks by instance, converts each instance history into SLA state intervals, and
      * aggregates uptime against expected time within {@code timeFrame}.
      *
      * @param tasks Tasks whose events make up the instance histories.
      * @param timeFrame Sampling interval; both endpoints are read, so it must be bounded
      *                  whenever at least one instance is present.
      * @return Uptime percentage, or 100.0 when the aggregate expected time is not positive.
      */
     @Override
     public Number calculate(Iterable<IScheduledTask> tasks, Range<Long> timeFrame) {
       // Index tasks by instance, then lazily map each group to its interval timeline
       // (events merged and sorted first, intervals derived second).
       Map<InstanceId, Collection<IScheduledTask>> tasksByInstance =
           Multimaps.index(tasks, TO_ID).asMap();
       Map<InstanceId, List<Interval>> timelines =
           Maps.transformValues(
               tasksByInstance,
               Functions.compose(TASK_EVENTS_TO_INTERVALS, TO_SORTED_EVENTS));

       long uptimeSum = 0;
       long expectedSum = 0;
       for (List<Interval> timeline : timelines.values()) {
         // Start from the full sampling interval and carve out what does not count.
         long up = elapsedFromRange(timeFrame);
         long expected = up;

         for (Interval span : timeline) {
           if (!timeFrame.isConnected(span.range)) {
             continue;
           }

           long overlap = elapsedFromRange(timeFrame.intersection(span.range));
           boolean removed = span.state == SlaState.REMOVED;
           if (removed) {
             // REMOVED time is neither expected nor delivered.
             expected -= overlap;
           }
           if (removed || span.state == SlaState.DOWN) {
             up -= overlap;
           }
         }

         uptimeSum += up;
         expectedSum += expected;
       }

       // Calculate effective platform uptime or default to 100.0 if no instances are running yet.
       if (expectedSum > 0) {
         return (double) uptimeSum * 100 / expectedSum;
       }
       return 100.0;
     }

     /**
      * Returns the distance between the upper and lower endpoints of a bounded range.
      */
     private static long elapsedFromRange(Range<Long> range) {
       long upper = range.upperEndpoint();
       long lower = range.lowerEndpoint();
       return upper - lower;
     }
   }
 }
	/**
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.aurora.scheduler.sla;

	import java.util.Collection;
	import java.util.List;
	import java.util.Map;
	import java.util.Objects;

	import com.google.common.base.Function;
	import com.google.common.base.Functions;
	import com.google.common.base.Predicate;
	import com.google.common.base.Predicates;
	import com.google.common.collect.FluentIterable;
	import com.google.common.collect.ImmutableList;
	import com.google.common.collect.ImmutableSet;
	import com.google.common.collect.Lists;
	import com.google.common.collect.Maps;
	import com.google.common.collect.Multimaps;
	import com.google.common.collect.Ordering;
	import com.google.common.collect.Range;

	import org.apache.aurora.common.collections.Pair;
	import org.apache.aurora.gen.ScheduleStatus;
	import org.apache.aurora.scheduler.base.Tasks;
	import org.apache.aurora.scheduler.storage.entities.IJobKey;
	import org.apache.aurora.scheduler.storage.entities.IScheduledTask;
	import org.apache.aurora.scheduler.storage.entities.ITaskEvent;

	import static java.util.Objects.requireNonNull;

	import static org.apache.aurora.gen.ScheduleStatus.ASSIGNED;
	import static org.apache.aurora.gen.ScheduleStatus.PENDING;
	import static org.apache.aurora.gen.ScheduleStatus.RUNNING;
	import static org.apache.aurora.gen.ScheduleStatus.STARTING;

	/**
	* Defines an SLA algorithm to be applied to a {@link IScheduledTask}
	* set for calculating a specific SLA metric.
	*/
	interface SlaAlgorithm {

	/**
	* Applies this algorithm to a set of {@link IScheduledTask} to
	* produce a named metric value over the specified time frame.
	* <p/>
	* The name the value is reported under is not supplied by this method; it is paired with
	* the algorithm by {@link AlgorithmType}.
	*
	* @param tasks Set of tasks to apply this algorithm to.
	* @param timeFrame Relevant time frame. Implementations in this file compare its endpoints
	* against task event timestamps, so both are presumably expressed in the same unit (likely
	* epoch milliseconds - confirm against callers).
	* @return Produced metric value.
	*/
	Number calculate(Iterable<IScheduledTask> tasks, Range<Long> timeFrame);

	/**
	 * Pre-configured SLA algorithms, each paired with the name its metric is reported under.
	 * <p/>
	 * Job uptime metric names are rendered from {@code JobUptime.NAME_FORMAT} using
	 * {@link java.util.Locale#ROOT}, so the decimal separator in the name does not vary with
	 * the JVM default locale.
	 */
	enum AlgorithmType {

	  JOB_UPTIME_99(99f),
	  JOB_UPTIME_95(95f),
	  JOB_UPTIME_90(90f),
	  JOB_UPTIME_75(75f),
	  JOB_UPTIME_50(50f),
	  AGGREGATE_PLATFORM_UPTIME(new AggregatePlatformUptime(), "platform_uptime_percent"),
	  MEDIAN_TIME_TO_ASSIGNED(new MedianAlgorithm(ASSIGNED), "mtta_ms"),
	  MEDIAN_TIME_TO_STARTING(new MedianAlgorithm(STARTING), "mtts_ms"),
	  MEDIAN_TIME_TO_RUNNING(new MedianAlgorithm(RUNNING), "mttr_ms");

	  private final SlaAlgorithm algorithm;
	  private final String name;

	  /**
	   * Creates a job uptime entry. The algorithm and its metric name are both derived from the
	   * same percentile value so the two cannot drift apart.
	   *
	   * @param jobUptimePercentile Percentile handed to {@link JobUptime}.
	   */
	  AlgorithmType(float jobUptimePercentile) {
	    this(new JobUptime(jobUptimePercentile), jobUptimeName(jobUptimePercentile));
	  }

	  /**
	   * Creates an entry from an explicit algorithm and metric name.
	   *
	   * @param algorithm Algorithm instance to expose.
	   * @param name Metric name the algorithm result is reported under.
	   */
	  AlgorithmType(SlaAlgorithm algorithm, String name) {
	    this.algorithm = algorithm;
	    this.name = name;
	  }

	  /**
	   * Renders the job uptime metric name for a percentile.
	   *
	   * @param percentile Percentile to embed in the name.
	   * @return Metric name, always formatted with '.' as the decimal separator.
	   */
	  private static String jobUptimeName(float percentile) {
	    // Locale.ROOT keeps "%.2f" from producing e.g. "99,00" under a comma-decimal default
	    // locale, which would silently rename the exported metric.
	    return String.format(java.util.Locale.ROOT, JobUptime.NAME_FORMAT, percentile);
	  }

	  /**
	   * @return The algorithm instance bound to this type.
	   */
	  SlaAlgorithm getAlgorithm() {
	    return algorithm;
	  }

	  /**
	   * @return The metric name bound to this type.
	   */
	  String getAlgorithmName() {
	    return name;
	  }
	}

	/**
	 * Median time to status SLA algorithm.
	 * <p/>
	 * Reports the median time a set of tasks spent waiting to reach the configured status.
	 * It is a combined metric: it reflects how scheduling performance depends on the requested
	 * resources (user scope) as well as how efficient the scheduler's internal bin-packing
	 * algorithm is (platform scope).
	 * <p/>
	 * Median time calculated as:
	 * <pre>
	 *    MT =  MEDIAN(Wait_times)
	 * where:
	 *    Wait_times - a collection of qualifying time intervals between PENDING and specified task
	 *                 state. An interval is qualified if its end point is contained by the sample
	 *                 time frame.
	 * </pre>
	 */
	final class MedianAlgorithm implements SlaAlgorithm {

	  private final ScheduleStatus status;

	  private MedianAlgorithm(ScheduleStatus status) {
	    this.status = status;
	  }

	  /**
	   * Collects one wait time per active task that reached the target status inside
	   * {@code timeFrame} and returns the 50th percentile of those samples.
	   *
	   * @param tasks Tasks to sample; only those whose status is in {@code Tasks.ACTIVE_STATES}
	   *              are considered.
	   * @param timeFrame Time frame the target status event must fall into.
	   * @return Result of {@code SlaUtil.percentile} over the collected wait times.
	   * @throws IllegalArgumentException if a qualifying event is found while the recorded
	   *         PENDING timestamp is still zero.
	   */
	  @Override
	  public Number calculate(Iterable<IScheduledTask> tasks, Range<Long> timeFrame) {
	    Predicate<IScheduledTask> isActive =
	        Predicates.compose(Predicates.in(Tasks.ACTIVE_STATES), IScheduledTask::getStatus);

	    List<Long> samples = Lists.newLinkedList();
	    for (IScheduledTask candidate : tasks) {
	      if (!isActive.apply(candidate)) {
	        continue;
	      }

	      // Timestamp of the most recent PENDING event seen so far; zero means "none yet".
	      long lastPending = 0;
	      for (ITaskEvent transition : candidate.getTaskEvents()) {
	        ScheduleStatus reached = transition.getStatus();
	        if (reached == PENDING) {
	          lastPending = transition.getTimestamp();
	          continue;
	        }
	        if (reached != status || !timeFrame.contains(transition.getTimestamp())) {
	          continue;
	        }

	        if (lastPending == 0) {
	          throw new IllegalArgumentException("SLA: missing PENDING status for:"
	              + candidate.getAssignedTask().getTaskId());
	        }

	        // Only the first qualifying event per task contributes a sample.
	        samples.add(transition.getTimestamp() - lastPending);
	        break;
	      }
	    }

	    return SlaUtil.percentile(samples, 50.0);
	  }
	}

	/**
	 * Job uptime SLA algorithm.
	 * <p/>
	 * Represents the percentage of instances considered to be in running state for
	 * the specified duration relative to SLA calculation time.
	 */
	final class JobUptime implements SlaAlgorithm {

	  private static final String NAME_FORMAT = "job_uptime_%.2f_sec";

	  private static final Predicate<IScheduledTask> IS_RUNNING =
	      Predicates.compose(Predicates.in(ImmutableSet.of(RUNNING)), IScheduledTask::getStatus);

	  private static final Function<IScheduledTask, ITaskEvent> TASK_TO_EVENT =
	      Tasks::getLatestEvent;

	  private final float percentile;

	  private JobUptime(float percentile) {
	    this.percentile = percentile;
	  }

	  /**
	   * For every RUNNING task, measures the distance between the upper end of {@code timeFrame}
	   * and the timestamp of the task's latest event, then reports the configured percentile of
	   * those distances divided by 1000.
	   *
	   * @param tasks Tasks to sample; tasks that are not RUNNING are skipped.
	   * @param timeFrame Time frame whose upper endpoint serves as the reference point.
	   * @return Percentile value scaled down by 1000 (presumably milliseconds to seconds, in
	   *         line with the {@code _sec} suffix of the metric name - confirm timestamp unit).
	   */
	  @Override
	  public Number calculate(Iterable<IScheduledTask> tasks, final Range<Long> timeFrame) {
	    FluentIterable<ITaskEvent> latestEvents =
	        FluentIterable.from(tasks).filter(IS_RUNNING).transform(TASK_TO_EVENT);

	    // The endpoint is read lazily per element, so an empty input never touches it.
	    List<Long> elapsed = latestEvents
	        .transform(latest -> timeFrame.upperEndpoint() - latest.getTimestamp())
	        .toList();

	    return (double) SlaUtil.percentile(elapsed, percentile) / 1000;
	  }
	}

	/**
	 * Aggregate Platform Uptime SLA algorithm.
	 * <p/>
	 * Measures how much runnable time the platform delivered to a set of task instances once
	 * they had reached RUNNING. Time an instance spends out of a runnable state because of user
	 * activity (e.g. newly created and waiting for host assignment, or restarted/killed by the
	 * user) is excluded from the calculation.
	 * <p/>
	 * Aggregate platform uptime calculated as:
	 * <pre>
	 *    APU = SUM(Up_time) / SUM(SI - Removed_time)
	 * where:
	 *    Up_time - the aggregate instance UP time over the sampling interval (SI);
	 *    SI - sampling interval (e.g. 1 minute);
	 *    Removed_time - the aggregate instance REMOVED time over the sampling interval.
	 * </pre>
	 * The result is reported as a percentage.
	 */
	final class AggregatePlatformUptime implements SlaAlgorithm {

	  /**
	   * Task platform SLA state.
	   */
	  enum SlaState {
	    /**
	     * Opens a period in which the task is not expected to be UP because of a user
	     * initiated action or failure.
	     * <p/>
	     * Such periods are left out of the calculation entirely.
	     */
	    REMOVED,

	    /**
	     * Opens a period in which the task cannot reach the UP state for a reason that is not
	     * user-related.
	     * <p/>
	     * Only platform-incurred transitions count. The time a newly created task (e.g. from a
	     * job create/update) needs to reach UP is not platform downtime: a newly added PENDING
	     * task is REMOVED, whereas a PENDING task rescheduled from LOST is DOWN. This keeps the
	     * metric insensitive to user-initiated activity so it reflects system recovery
	     * performance.
	     */
	    DOWN,

	    /**
	     * Opens a period in which the task is up and running from the Aurora platform
	     * standpoint.
	     * <p/>
	     * Note: platform uptime is not necessarily real application availability, since a
	     * hosted application still needs time to deploy, initialize, and start executing.
	     */
	    UP
	  }

	  /**
	   * A half-open {@code [start, end)} span during which an instance stayed in one state.
	   */
	  private static class Interval {
	    private final SlaState state;
	    private final Range<Long> range;

	    Interval(SlaState state, long start, long end) {
	      this.range = Range.closedOpen(start, end);
	      this.state = state;
	    }
	  }

	  /**
	   * Identity of a task instance: job key plus instance ID. Used as an index key.
	   */
	  private static class InstanceId {
	    private final IJobKey jobKey;
	    private final int id;

	    InstanceId(IJobKey jobKey, int instanceId) {
	      this.jobKey = requireNonNull(jobKey);
	      this.id = instanceId;
	    }

	    @Override
	    public boolean equals(Object o) {
	      if (o instanceof InstanceId) {
	        InstanceId that = (InstanceId) o;
	        return Objects.equals(jobKey, that.jobKey) && Objects.equals(id, that.id);
	      }
	      return false;
	    }

	    @Override
	    public int hashCode() {
	      return Objects.hash(jobKey, id);
	    }
	  }

	  private static final Function<IScheduledTask, InstanceId> TO_ID =
	      AggregatePlatformUptime::instanceIdOf;

	  private static final Function<ITaskEvent, Long> TASK_EVENT_TO_TIMESTAMP =
	      event -> event.getTimestamp();

	  /**
	   * Combine all task events per given instance into the unified sorted instance history view.
	   */
	  private static final Function<Collection<IScheduledTask>, List<ITaskEvent>> TO_SORTED_EVENTS =
	      AggregatePlatformUptime::sortedEvents;

	  /**
	   * Convert instance history into the {@link SlaState} based {@link Interval} list.
	   */
	  private static final Function<List<ITaskEvent>, List<Interval>> TASK_EVENTS_TO_INTERVALS =
	      AggregatePlatformUptime::toIntervals;

	  /**
	   * Builds the index key for a task from its job key and instance ID.
	   */
	  private static InstanceId instanceIdOf(IScheduledTask task) {
	    return new InstanceId(
	        task.getAssignedTask().getTask().getJob(),
	        task.getAssignedTask().getInstanceId());
	  }

	  /**
	   * Merges the events of all given tasks into one list ordered by timestamp.
	   */
	  private static List<ITaskEvent> sortedEvents(Collection<IScheduledTask> tasks) {
	    List<ITaskEvent> merged = Lists.newLinkedList();
	    for (IScheduledTask task : tasks) {
	      merged.addAll(task.getTaskEvents());
	    }

	    return Ordering.natural()
	        .onResultOf(TASK_EVENT_TO_TIMESTAMP)
	        .immutableSortedCopy(merged);
	  }

	  /**
	   * Walks a timestamp-ordered event list and emits one {@link Interval} per stretch of
	   * identical {@link SlaState}. The timeline starts in REMOVED at time zero and its final
	   * interval is left open-ended at {@code Long.MAX_VALUE}.
	   *
	   * @throws IllegalArgumentException if an event carries a status not handled below.
	   */
	  private static List<Interval> toIntervals(List<ITaskEvent> events) {
	    ImmutableList.Builder<Interval> timeline = ImmutableList.builder();
	    Pair<SlaState, Long> open = Pair.of(SlaState.REMOVED, 0L);

	    for (ITaskEvent event : events) {
	      long at = event.getTimestamp();
	      SlaState before = open.getFirst();

	      // Each event either closes the open interval and starts a new one, or merely
	      // continues the open interval when the resulting state is unchanged.
	      switch (event.getStatus()) {
	        case RUNNING:
	          open = updateIntervals(at, SlaState.UP, open, timeline);
	          break;

	        case LOST:
	        case DRAINING:
	        case PREEMPTING:
	          open = updateIntervals(at, SlaState.DOWN, open, timeline);
	          break;

	        case KILLED:
	          // Counts as downtime only when it interrupts an UP period.
	          if (before == SlaState.UP) {
	            open = updateIntervals(at, SlaState.DOWN, open, timeline);
	          }
	          break;

	        case PENDING:
	        case ASSIGNED:
	        case STARTING:
	          // While recovering from a DOWN period these stay attributed to DOWN.
	          if (before != SlaState.DOWN) {
	            open = updateIntervals(at, SlaState.REMOVED, open, timeline);
	          }
	          break;

	        case THROTTLED:
	        case FINISHED:
	        case RESTARTING:
	        case FAILED:
	        case KILLING:
	          open = updateIntervals(at, SlaState.REMOVED, open, timeline);
	          break;

	        case INIT:
	          // Ignore.
	          break;

	        default:
	          throw new IllegalArgumentException("Unsupported status:" + event.getStatus());
	      }
	    }

	    // Add the last event interval.
	    timeline.add(new Interval(open.getFirst(), open.getSecond(), Long.MAX_VALUE));
	    return timeline.build();
	  }

	  /**
	   * Applies a state change at {@code timestamp}. When {@code state} differs from the open
	   * interval's state, the open interval is closed at {@code timestamp}, appended to
	   * {@code intervals}, and a new open interval is returned; otherwise {@code current} is
	   * returned untouched.
	   */
	  private static Pair<SlaState, Long> updateIntervals(
	      long timestamp,
	      SlaState state,
	      Pair<SlaState, Long> current,
	      ImmutableList.Builder<Interval> intervals) {

	    if (current.getFirst() != state) {
	      intervals.add(new Interval(current.getFirst(), current.getSecond(), timestamp));
	      return Pair.of(state, timestamp);
	    }

	    // Same state as the open interval - nothing to close.
	    return current;
	  }

	  private AggregatePlatformUptime() {
	    // Interface private.
	  }

	  /**
	   * Groups tasks by instance, converts each instance history into SLA state intervals, and
	   * aggregates uptime against expected time within {@code timeFrame}.
	   *
	   * @param tasks Tasks whose events make up the instance histories.
	   * @param timeFrame Sampling interval; both endpoints are read, so it must be bounded
	   *                  whenever at least one instance is present.
	   * @return Uptime percentage, or 100.0 when the aggregate expected time is not positive.
	   */
	  @Override
	  public Number calculate(Iterable<IScheduledTask> tasks, Range<Long> timeFrame) {
	    // Index tasks by instance, then lazily map each group to its interval timeline
	    // (events merged and sorted first, intervals derived second).
	    Map<InstanceId, Collection<IScheduledTask>> tasksByInstance =
	        Multimaps.index(tasks, TO_ID).asMap();
	    Map<InstanceId, List<Interval>> timelines =
	        Maps.transformValues(
	            tasksByInstance,
	            Functions.compose(TASK_EVENTS_TO_INTERVALS, TO_SORTED_EVENTS));

	    long uptimeSum = 0;
	    long expectedSum = 0;
	    for (List<Interval> timeline : timelines.values()) {
	      // Start from the full sampling interval and carve out what does not count.
	      long up = elapsedFromRange(timeFrame);
	      long expected = up;

	      for (Interval span : timeline) {
	        if (!timeFrame.isConnected(span.range)) {
	          continue;
	        }

	        long overlap = elapsedFromRange(timeFrame.intersection(span.range));
	        boolean removed = span.state == SlaState.REMOVED;
	        if (removed) {
	          // REMOVED time is neither expected nor delivered.
	          expected -= overlap;
	        }
	        if (removed || span.state == SlaState.DOWN) {
	          up -= overlap;
	        }
	      }

	      uptimeSum += up;
	      expectedSum += expected;
	    }

	    // Calculate effective platform uptime or default to 100.0 if no instances are running yet.
	    if (expectedSum > 0) {
	      return (double) uptimeSum * 100 / expectedSum;
	    }
	    return 100.0;
	  }

	  /**
	   * Returns the distance between the upper and lower endpoints of a bounded range.
	   */
	  private static long elapsedFromRange(Range<Long> range) {
	    long upper = range.upperEndpoint();
	    long lower = range.lowerEndpoint();
	    return upper - lower;
	  }
	}
	}