/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
namespace java org.apache.aurora.gen
namespace py gen.apache.aurora.api
// Thrift interface definition for the Aurora scheduler.
/*
* TODO(wfarner): It would be nice if we could put some HTML tags here, regex doesn't handle it though.
* The result of an API operation. A result may only be specified when the response code is OK.
*/
enum ResponseCode {
INVALID_REQUEST = 0,
OK = 1,
ERROR = 2,
WARNING = 3,
AUTH_FAILED = 4,
/** Raised when an operation was unable to proceed due to an in-progress job update. */
JOB_UPDATING_ERROR = 5,
/** Raised when a scheduler is transiently unavailable and later retry is recommended. */
ERROR_TRANSIENT = 6
}
// Aurora executor framework name.
const string AURORA_EXECUTOR_NAME = 'AuroraExecutor'
// TODO(maxim): Remove in 0.7.0. (AURORA-749)
struct Identity {
2: string user
}
/** A single host attribute. */
struct Attribute {
1: string name
2: set<string> values
}
enum MaintenanceMode {
NONE = 1,
SCHEDULED = 2,
DRAINING = 3,
DRAINED = 4
}
/** The attributes assigned to a host. */
struct HostAttributes {
1: string host
2: set<Attribute> attributes
3: optional MaintenanceMode mode
4: optional string slaveId
}
/**
* A constraint that specifies an explicit set of values, at least one of which must be present
* on a host for a task to be scheduled there.
*/
struct ValueConstraint {
/** If true, treat this as a 'not' - to avoid specific values. */
1: bool negated
2: set<string> values
}
/**
* A constraint that specifies the maximum number of active tasks that may be scheduled
* simultaneously on a host with a matching attribute.
*/
struct LimitConstraint {
1: i32 limit
}
/** Types of constraints that may be applied to a task. */
union TaskConstraint {
1: ValueConstraint value
2: LimitConstraint limit
}
/** A constraint that defines whether a task may be scheduled on a host. */
struct Constraint {
/** Mesos slave attribute that the constraint is matched against. */
1: string name
2: TaskConstraint constraint
}
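/**
 * Illustrative sketch (not part of this definition): two common constraint shapes, written
 * against the generated Python types, assuming the standard Thrift codegen layout for the
 * py namespace declared above (gen.apache.aurora.api.ttypes). The host attribute name
 * 'rack' is hypothetical.
 *
 *   from gen.apache.aurora.api.ttypes import (
 *       Constraint, TaskConstraint, ValueConstraint, LimitConstraint)
 *
 *   # Only schedule on hosts whose 'rack' attribute is r1 or r2.
 *   pin_to_racks = Constraint(
 *       name='rack',
 *       constraint=TaskConstraint(value=ValueConstraint(negated=False, values={'r1', 'r2'})))
 *
 *   # At most one active task per distinct 'rack' value, i.e. spread across racks.
 *   spread_across_racks = Constraint(
 *       name='rack',
 *       constraint=TaskConstraint(limit=LimitConstraint(limit=1)))
 */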
struct Package {
1: string role
2: string name
3: i32 version
}
/** Arbitrary key-value metadata to be included into TaskConfig. */
struct Metadata {
1: string key
2: string value
}
/** A unique identifier for a Job. */
struct JobKey {
/** User role (Unix service account), for example "mesos" */
1: string role
/** Environment, for example "devel" */
2: string environment
/** Name, for example "labrat" */
3: string name
}
// TODO(jly): Deprecated, remove in 0.21. See AURORA-1959.
/** A unique lock key. */
union LockKey {
1: JobKey job
}
// TODO(jly): Deprecated, remove in 0.21. See AURORA-1959.
/** A generic lock struct to facilitate context specific resource/operation serialization. */
struct Lock {
/** ID of the lock - unique per storage */
1: LockKey key
/** UUID - facilitating soft lock authorization */
2: string token
/** Lock creator */
3: string user
/** Lock creation timestamp in milliseconds */
4: i64 timestampMs
/** Optional message to record with the lock */
5: optional string message
}
/** A unique identifier for the active task within a job. */
struct InstanceKey {
/** Key identifying the job. */
1: JobKey jobKey
/** Unique instance ID for the active task in a job. */
2: i32 instanceId
}
/** URI which mirrors CommandInfo.URI in the Mesos Protobuf */
struct MesosFetcherURI {
/** Where to get the resource from */
1: string value
/** Extract compressed archive after downloading */
2: optional bool extract
/** Cache value using Mesos Fetcher caching mechanism **/
3: optional bool cache
}
struct ExecutorConfig {
/** Name identifying the Executor. */
1: string name
/** Executor configuration data. */
2: string data
}
/** The mode for a volume mount */
enum Mode {
/** Read Write */
RW = 1
/** Read Only */
RO = 2
}
/** A volume mount point within a container */
struct Volume {
/** The path inside the container where the mount will be created. */
1: string containerPath
/** The path on the host that will serve as the source for the mount. */
2: string hostPath
/** The access mode */
3: Mode mode
}
/** Describes an image for use with the Mesos unified containerizer in the Docker format */
struct DockerImage {
/** The name of the image to run */
1: string name
/** The Docker tag identifying the image */
2: string tag
}
/** Describes an image for use with the Mesos unified containerizer in the AppC format */
struct AppcImage {
/** The name of the image to run */
1: string name
/** The appc image id identifying the image */
2: string imageId
}
/** Describes an image to be used with the Mesos unified containerizer */
union Image {
1: DockerImage docker
2: AppcImage appc
}
/** Describes a Mesos container; this is the default. */
struct MesosContainer {
/** the optional filesystem image to use when launching this task. */
1: optional Image image
/** the optional list of volumes to mount into the task. */
2: optional list<Volume> volumes
}
/** Describes a parameter passed to docker cli */
struct DockerParameter {
/** a parameter to pass to docker. (e.g. volume) */
1: string name
/** the value to pass to a parameter (e.g. /src/webapp:/opt/webapp) */
2: string value
}
/** Describes a docker container */
struct DockerContainer {
/** The container image to be run */
1: string image
/** Arbitrary parameters to pass to the container */
2: optional list<DockerParameter> parameters
}
/** Describes a container to be used in a task */
union Container {
1: MesosContainer mesos
2: DockerContainer docker
}
/** Describes a resource value required to run a task. */
union Resource {
1: double numCpus
2: i64 ramMb
3: i64 diskMb
4: string namedPort
5: i64 numGpus
}
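/**
 * Illustrative sketch (not part of this definition): task resources are expressed as a
 * collection of single-valued Resource entries rather than one struct carrying every field.
 * Uses the generated Python types (gen.apache.aurora.api.ttypes).
 *
 *   from gen.apache.aurora.api.ttypes import Resource
 *
 *   # 1 CPU, 1 GiB of RAM, 4 GiB of disk, and a named port called 'http'.
 *   # A list stands in for the Thrift set<Resource> field in this sketch.
 *   resources = [
 *       Resource(numCpus=1.0),
 *       Resource(ramMb=1024),
 *       Resource(diskMb=4096),
 *       Resource(namedPort='http'),
 *   ]
 */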
struct PartitionPolicy {
1: bool reschedule
2: optional i64 delaySecs
}
/** SLA requirements expressed as the percentage of instances to be RUNNING every durationSecs */
struct PercentageSlaPolicy {
/** The percentage of active instances required every `durationSecs`. */
1: double percentage
/** Minimum time duration a task needs to be `RUNNING` to be treated as active */
2: i64 durationSecs
}
/** SLA requirements expressed as the number of instances to be RUNNING every durationSecs */
struct CountSlaPolicy {
/** The number of active instances required every `durationSecs` */
1: i64 count
/** Minimum time duration a task needs to be `RUNNING` to be treated as active */
2: i64 durationSecs
}
/** SLA requirements to be delegated to an external coordinator */
struct CoordinatorSlaPolicy {
/** URL for the coordinator service that needs to be contacted for SLA checks */
1: string coordinatorUrl
/** Field in the Coordinator response json indicating if the action is allowed or not */
2: string statusKey
}
/** SLA requirements expressed in one of the many types */
union SlaPolicy {
1: PercentageSlaPolicy percentageSlaPolicy
2: CountSlaPolicy countSlaPolicy
3: CoordinatorSlaPolicy coordinatorSlaPolicy
}
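/**
 * Illustrative sketch (not part of this definition): an SLA policy requiring 95% of
 * instances to be RUNNING, counting an instance as active only after 30 minutes in RUNNING.
 * Uses the generated Python types (gen.apache.aurora.api.ttypes).
 *
 *   from gen.apache.aurora.api.ttypes import SlaPolicy, PercentageSlaPolicy
 *
 *   sla = SlaPolicy(
 *       percentageSlaPolicy=PercentageSlaPolicy(percentage=95.0, durationSecs=1800))
 */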
/** Description of the tasks contained within a job. */
struct TaskConfig {
/** The job this task belongs to. */
28: JobKey job
// TODO(maxim): Deprecated. See AURORA-749.
/** contains the role component of JobKey */
17: Identity owner
7: bool isService
11: i32 priority
13: i32 maxTaskFailures
// TODO(mnurolahzade): Deprecated. See AURORA-1708.
/** Whether this is a production task, which can preempt. */
18: optional bool production
/** Task tier type. */
30: optional string tier
/** All resources required to run a task. */
32: set<Resource> resources
20: set<Constraint> constraints
/** Resources to retrieve with Mesos Fetcher */
33: optional set<MesosFetcherURI> mesosFetcherUris
/**
* Custom links to include when displaying this task on the scheduler dashboard. Keys are anchor
* text, values are URLs. Wildcards are supported for dynamic link crafting based on host, ports,
* instance, etc.
*/
22: optional map<string, string> taskLinks
23: optional string contactEmail
/** Executor configuration */
25: optional ExecutorConfig executorConfig
/** Used to display additional details in the UI. */
27: optional set<Metadata> metadata
/** Policy for how to deal with task partitions */
34: optional PartitionPolicy partitionPolicy
/** SLA requirements to be met during maintenance */
35: optional SlaPolicy slaPolicy
// This field is deliberately placed at the end to work around a bug in the immutable wrapper
// code generator. See AURORA-1185 for details.
/** the container the task should use to execute */
29: Container container = { "mesos": {} }
}
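/**
 * Illustrative sketch (not part of this definition): a minimal service TaskConfig, using the
 * generated Python types (gen.apache.aurora.api.ttypes / constants). The role/environment/name
 * values, the tier name and the executor data payload are placeholders; the executor data
 * format is executor-specific.
 *
 *   from gen.apache.aurora.api.constants import AURORA_EXECUTOR_NAME
 *   from gen.apache.aurora.api.ttypes import TaskConfig, JobKey, ExecutorConfig, Resource
 *
 *   task = TaskConfig(
 *       job=JobKey(role='mesos', environment='devel', name='labrat'),
 *       isService=True,
 *       priority=0,
 *       maxTaskFailures=1,
 *       tier='preferred',
 *       resources=[Resource(numCpus=1.0), Resource(ramMb=1024), Resource(diskMb=4096)],
 *       executorConfig=ExecutorConfig(name=AURORA_EXECUTOR_NAME, data='<executor payload>'))
 */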
struct ResourceAggregate {
/** Aggregated resource values. */
4: set<Resource> resources
}
/** Defines the policy for launching a new cron job when one is already running. */
enum CronCollisionPolicy {
/** Kills the existing job with the colliding name, and runs the new cron job. */
KILL_EXISTING = 0,
/** Cancels execution of the new job, leaving the running job intact. */
CANCEL_NEW = 1,
/**
* DEPRECATED. For existing jobs, treated the same as CANCEL_NEW.
* createJob will reject jobs with this policy.
*/
RUN_OVERLAP = 2
}
/**
* Description of an Aurora job. One task will be scheduled for each instance within the job.
*/
struct JobConfiguration {
/**
* Key for this job. If not specified, name, owner.role, and a reasonable default environment are
* used to construct it server-side.
*/
9: JobKey key
// TODO(maxim): Deprecated. See AURORA-749.
/** Owner of this job. */
7: Identity owner
/**
* If present, the job will be handled as a cron job with this crontab-syntax schedule.
*/
4: optional string cronSchedule
/** Collision policy to use when handling overlapping cron runs. Default is KILL_EXISTING. */
5: CronCollisionPolicy cronCollisionPolicy
/** Task configuration for this job. */
6: TaskConfig taskConfig
/**
* The number of instances in the job. Generated instance IDs for tasks will be in the range
* [0, instances).
*/
8: i32 instanceCount
}
struct JobStats {
/** Number of tasks in active state for this job. */
1: i32 activeTaskCount
/** Number of tasks in finished state for this job. */
2: i32 finishedTaskCount
/** Number of failed tasks for this job. */
3: i32 failedTaskCount
/** Number of tasks in pending state for this job. */
4: i32 pendingTaskCount
}
struct JobSummary {
1: JobConfiguration job
2: JobStats stats
/** Timestamp of next cron run in ms since epoch, for a cron job */
3: optional i64 nextCronRunMs
}
/** Closed range of integers. */
struct Range {
1: i32 first
2: i32 last
}
struct ConfigGroup {
1: TaskConfig config
3: set<Range> instances
}
struct ConfigSummary {
1: JobKey key
2: set<ConfigGroup> groups
}
struct PopulateJobResult {
2: TaskConfig taskConfig
}
struct GetQuotaResult {
/** Total allocated resource quota. */
1: ResourceAggregate quota
/** Resources consumed by production jobs from a shared resource pool. */
2: optional ResourceAggregate prodSharedConsumption
/** Resources consumed by non-production jobs from a shared resource pool. */
3: optional ResourceAggregate nonProdSharedConsumption
/** Resources consumed by production jobs from a dedicated resource pool. */
4: optional ResourceAggregate prodDedicatedConsumption
/** Resources consumed by non-production jobs from a dedicated resource pool. */
5: optional ResourceAggregate nonProdDedicatedConsumption
}
/** States that a task may be in. */
enum ScheduleStatus {
// TODO(maxim): This state does not add much value. Consider dropping it completely.
/** Initial state for a task. A task will remain in this state until it has been persisted. */
INIT = 11,
/** The task will be rescheduled, but is being throttled for restarting too frequently. */
THROTTLED = 16,
/** Task is awaiting assignment to a slave. */
PENDING = 0,
/** Task has been assigned to a slave. */
ASSIGNED = 9,
/** Slave has acknowledged receipt of task and is bootstrapping the task. */
STARTING = 1,
/** The task is running on the slave. */
RUNNING = 2,
/** The task terminated with an exit code of zero. */
FINISHED = 3,
/** The task is being preempted by another task. */
PREEMPTING = 13,
/** The task is being restarted in response to a user request. */
RESTARTING = 12,
/** The task is being restarted in response to a host maintenance request. */
DRAINING = 17,
/** The task terminated with a non-zero exit code. */
FAILED = 4,
/** Execution of the task was terminated by the system. */
KILLED = 5,
/** The task is being forcibly killed. */
KILLING = 6,
/** A fault in the task environment has caused the system to believe the task no longer exists.
* This can happen, for example, when a slave process disappears.
*/
LOST = 7,
/**
* The task is currently partitioned and in an unknown state.
**/
PARTITIONED = 18
}
// States that a task may be in while still considered active.
const set<ScheduleStatus> ACTIVE_STATES = [ScheduleStatus.ASSIGNED,
ScheduleStatus.DRAINING,
ScheduleStatus.KILLING,
ScheduleStatus.PENDING,
ScheduleStatus.PREEMPTING,
ScheduleStatus.RESTARTING,
ScheduleStatus.RUNNING,
ScheduleStatus.STARTING,
ScheduleStatus.PARTITIONED,
ScheduleStatus.THROTTLED]
// States that a task may be in while associated with a slave machine and non-terminal.
const set<ScheduleStatus> SLAVE_ASSIGNED_STATES = [ScheduleStatus.ASSIGNED,
ScheduleStatus.DRAINING,
ScheduleStatus.KILLING,
ScheduleStatus.PREEMPTING,
ScheduleStatus.RESTARTING,
ScheduleStatus.RUNNING,
ScheduleStatus.PARTITIONED,
ScheduleStatus.STARTING]
// States that a task may be in while in an active sandbox.
const set<ScheduleStatus> LIVE_STATES = [ScheduleStatus.KILLING,
ScheduleStatus.PREEMPTING,
ScheduleStatus.RESTARTING,
ScheduleStatus.DRAINING,
ScheduleStatus.PARTITIONED,
ScheduleStatus.RUNNING]
// States a completed task may be in.
const set<ScheduleStatus> TERMINAL_STATES = [ScheduleStatus.FAILED,
ScheduleStatus.FINISHED,
ScheduleStatus.KILLED,
ScheduleStatus.LOST]
// Regular expressions for matching valid identifiers for job path components. All expressions
// below should accept and reject the same set of inputs.
const string GOOD_IDENTIFIER_PATTERN = "^[\\w\\-\\.]+$"
// JVM: Use with java.util.regex.Pattern#compile
const string GOOD_IDENTIFIER_PATTERN_JVM = GOOD_IDENTIFIER_PATTERN
// Python: Use with re.compile
const string GOOD_IDENTIFIER_PATTERN_PYTHON = GOOD_IDENTIFIER_PATTERN
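/**
 * Illustrative sketch: validating one job path component (role, environment or name) against
 * the pattern above from Python; the JVM flavor is used identically with
 * java.util.regex.Pattern#compile. Assumes the generated constants module
 * (gen.apache.aurora.api.constants) from the py namespace declared above.
 *
 *   import re
 *   from gen.apache.aurora.api.constants import GOOD_IDENTIFIER_PATTERN_PYTHON
 *
 *   VALID_IDENTIFIER = re.compile(GOOD_IDENTIFIER_PATTERN_PYTHON)
 *
 *   assert VALID_IDENTIFIER.match('web-service_1.canary') is not None
 *   assert VALID_IDENTIFIER.match('bad/name') is None
 */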
/** Event marking a state transition within a task's lifecycle. */
struct TaskEvent {
/** Epoch timestamp in milliseconds. */
1: i64 timestamp
/** New status of the task. */
2: ScheduleStatus status
/** Audit message that explains why a transition occurred. */
3: optional string message
/** Hostname of the scheduler machine that performed the event. */
4: optional string scheduler
}
/** A task assignment that is provided to an executor. */
struct AssignedTask {
/** The mesos task ID for this task. Guaranteed to be globally unique */
1: string taskId
/**
* The mesos slave ID that this task has been assigned to.
* This will not be populated for a PENDING task.
*/
2: string slaveId
/**
* The name of the machine that this task has been assigned to.
* This will not be populated for a PENDING task.
*/
3: string slaveHost
/** Information about how to run this task. */
4: TaskConfig task
/** Ports reserved on the machine while this task is running. */
5: map<string, i32> assignedPorts
/**
* The instance ID assigned to this task. Instance IDs must be unique and contiguous within a
* job, and will be in the range [0, N-1] (inclusive) for a job that has N instances.
*/
6: i32 instanceId
}
/** A task that has been scheduled. */
struct ScheduledTask {
/** The task that was scheduled. */
1: AssignedTask assignedTask
/** The current status of this task. */
2: ScheduleStatus status
/**
* The number of failures that this task has accumulated over the multi-generational history of
* this task.
*/
3: i32 failureCount
/**
* The number of partitions this task has accumulated over its lifetime.
*/
6: i32 timesPartitioned
/** State change history for this task. */
4: list<TaskEvent> taskEvents
/**
* The task ID of the previous generation of this task. When a task is automatically rescheduled,
* a copy of the task is created with its ancestor ID set to the previous task's task ID.
*/
5: string ancestorId
}
struct ScheduleStatusResult {
1: list<ScheduledTask> tasks
}
struct GetJobsResult {
1: set<JobConfiguration> configs
}
/**
* Contains a set of restrictions on matching tasks where all restrictions must be met
* (terms are AND'ed together).
*/
struct TaskQuery {
14: optional string role
9: optional string environment
2: optional string jobName
4: optional set<string> taskIds
5: optional set<ScheduleStatus> statuses
7: optional set<i32> instanceIds
10: optional set<string> slaveHosts
11: optional set<JobKey> jobKeys
12: optional i32 offset
13: optional i32 limit
}
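/**
 * Illustrative sketch (not part of this definition): a query for the currently-active tasks of
 * a single job, suitable for getTasksStatus below. Uses the generated Python types and the
 * ACTIVE_STATES constant defined above.
 *
 *   from gen.apache.aurora.api.constants import ACTIVE_STATES
 *   from gen.apache.aurora.api.ttypes import JobKey, TaskQuery
 *
 *   # A list stands in for the Thrift set<JobKey> field in this sketch.
 *   query = TaskQuery(
 *       jobKeys=[JobKey(role='mesos', environment='devel', name='labrat')],
 *       statuses=ACTIVE_STATES)
 */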
struct HostStatus {
1: string host
2: MaintenanceMode mode
}
struct RoleSummary {
1: string role
2: i32 jobCount
3: i32 cronJobCount
}
struct Hosts {
1: set<string> hostNames
}
struct PendingReason {
1: string taskId
2: string reason
}
/** States that a job update may be in. */
enum JobUpdateStatus {
/** Update is in progress. */
ROLLING_FORWARD = 0,
/** Update has failed and is being rolled back. */
ROLLING_BACK = 1,
/** Update has been paused while in progress. */
ROLL_FORWARD_PAUSED = 2,
/** Update has been paused during rollback. */
ROLL_BACK_PAUSED = 3,
/** Update has completed successfully. */
ROLLED_FORWARD = 4,
/** Update has failed and rolled back. */
ROLLED_BACK = 5,
/** Update was aborted. */
ABORTED = 6,
/** Unknown error during update. */
ERROR = 7,
/**
* Update failed to complete.
* This can happen if failure thresholds are met while rolling forward, but rollback is disabled,
* or if failure thresholds are met when rolling back.
*/
FAILED = 8,
/** Update has been blocked while in progress due to missing/expired pulse. */
ROLL_FORWARD_AWAITING_PULSE = 9,
/** Update has been blocked during rollback due to missing/expired pulse. */
ROLL_BACK_AWAITING_PULSE = 10
}
/** States the job update can be in while still considered active. */
const set<JobUpdateStatus> ACTIVE_JOB_UPDATE_STATES = [JobUpdateStatus.ROLLING_FORWARD,
JobUpdateStatus.ROLLING_BACK,
JobUpdateStatus.ROLL_FORWARD_PAUSED,
JobUpdateStatus.ROLL_BACK_PAUSED,
JobUpdateStatus.ROLL_FORWARD_AWAITING_PULSE,
JobUpdateStatus.ROLL_BACK_AWAITING_PULSE]
/** States the job update can be in while waiting for a pulse. */
const set<JobUpdateStatus> AWAITNG_PULSE_JOB_UPDATE_STATES = [JobUpdateStatus.ROLL_FORWARD_AWAITING_PULSE,
JobUpdateStatus.ROLL_BACK_AWAITING_PULSE]
/** Job update actions that can be applied to job instances. */
enum JobUpdateAction {
/**
* An instance was moved to the target state successfully, and declared healthy if the desired
* state did not involve deleting the instance.
*/
INSTANCE_UPDATED = 1,
/**
* An instance was rolled back because the job update did not succeed. The instance was reverted
* to the original state prior to the job update, which means that the instance was removed if
* the update added instances to the job.
*/
INSTANCE_ROLLED_BACK = 2,
/**
* An instance is being moved from the original state to the desired state.
*/
INSTANCE_UPDATING = 3,
/**
* An instance is being moved from the desired state back to the original state, because the job
* update failed.
*/
INSTANCE_ROLLING_BACK = 4,
/** An instance update was attempted but failed and was not rolled back. */
INSTANCE_UPDATE_FAILED = 5,
/** An instance rollback was attempted but failed. */
INSTANCE_ROLLBACK_FAILED = 6
}
/** Status of the coordinated update. Intended as a response to pulseJobUpdate RPC. */
enum JobUpdatePulseStatus {
/**
* Update is active. See ACTIVE_JOB_UPDATE_STATES for statuses considered active.
*/
OK = 1,
/**
* Update has reached terminal state. See TERMINAL_JOB_UPDATE_STATES for statuses
* considered terminal.
*/
FINISHED = 2
}
/** Job update key. */
struct JobUpdateKey {
/** Job being updated */
1: JobKey job
/** Update ID. */
2: string id
}
/** Limits the number of active changes being made to instances to groupSize. */
struct QueueJobUpdateStrategy {
1: i32 groupSize
}
/** Similar to Queue strategy but will not start a new group until all instances in an active
* group have finished updating.
*/
struct BatchJobUpdateStrategy {
1: i32 groupSize
}
/** Same as Batch strategy but each time an active group completes, the size of the next active
* group may change.
*/
struct VariableBatchJobUpdateStrategy {
1: list<i32> groupSizes
}
union JobUpdateStrategy {
1: QueueJobUpdateStrategy queueStrategy
2: BatchJobUpdateStrategy batchStrategy
3: VariableBatchJobUpdateStrategy varBatchStrategy
}
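/**
 * Illustrative sketch (not part of this definition): the three strategy shapes, using the
 * generated Python types. The variable-batch strategy below updates 1 instance, then 5, then
 * 10 per group.
 *
 *   from gen.apache.aurora.api.ttypes import (
 *       JobUpdateStrategy, QueueJobUpdateStrategy, BatchJobUpdateStrategy,
 *       VariableBatchJobUpdateStrategy)
 *
 *   queue     = JobUpdateStrategy(queueStrategy=QueueJobUpdateStrategy(groupSize=3))
 *   batch     = JobUpdateStrategy(batchStrategy=BatchJobUpdateStrategy(groupSize=3))
 *   var_batch = JobUpdateStrategy(
 *       varBatchStrategy=VariableBatchJobUpdateStrategy(groupSizes=[1, 5, 10]))
 */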
/** Job update thresholds and limits. **/
struct JobUpdateSettings {
/** Deprecated, please set this value inside the desired update strategy instead.
* Max number of instances being updated at any given moment.
*/
1: i32 updateGroupSize
/** Max number of instance failures to tolerate before marking instance as FAILED. */
2: i32 maxPerInstanceFailures
/** Max number of FAILED instances to tolerate before terminating the update. */
3: i32 maxFailedInstances
/** Min time to watch a RUNNING instance. */
5: i32 minWaitInInstanceRunningMs
/** If true, enables failed update rollback. */
6: bool rollbackOnFailure
/** Instance IDs to act on. All instances will be affected if this is not set. */
7: set<Range> updateOnlyTheseInstances
/** Deprecated, please set updateStrategy to the Batch strategy instead.
* If true, use updateGroupSize as strict batching boundaries, and avoid proceeding to another
* batch until the preceding batch finishes updating.
*/
8: bool waitForBatchCompletion
/**
* If set, requires external calls to the pulseJobUpdate RPC at the specified rate for the
* update to make progress. If no pulses are received within the specified interval, the update
* will block. A blocked update is unable to continue but retains its current status. It may only
* be unblocked by a fresh pulseJobUpdate call.
*/
9: optional i32 blockIfNoPulsesAfterMs
/**
* If true, updates will obey the SLA requirements of the tasks being updated. If the SLA policy
* differs between the old and new task configurations, updates will use the newest configuration.
*/
10: optional bool slaAware
/** Update strategy to be used for the update. See JobUpdateStrategy for choices. */
11: optional JobUpdateStrategy updateStrategy
}
/** Event marking a state transition in job update lifecycle. */
struct JobUpdateEvent {
/** Update status. */
1: JobUpdateStatus status
/** Epoch timestamp in milliseconds. */
2: i64 timestampMs
/** User who performed this event (if user-initiated). */
3: optional string user
/**
* Message from the user (for user-initiated transitions) or the scheduler about why the state was
* changed.
*/
4: optional string message
}
/** Event marking a state transition in job instance update lifecycle. */
struct JobInstanceUpdateEvent {
/** Job instance ID. */
1: i32 instanceId
/** Epoch timestamp in milliseconds. */
2: i64 timestampMs
/** Job update action taken on the instance. */
3: JobUpdateAction action
/** Optional message explaining the instance update event. */
4: optional string message
}
/** Maps a TaskConfig to the instance IDs associated with it. */
struct InstanceTaskConfig {
/** A TaskConfig associated with instances. */
1: TaskConfig task
/** Instances associated with the TaskConfig. */
2: set<Range> instances
}
/** Current job update state including status and created/modified timestamps. */
struct JobUpdateState {
/** Current status of the update. */
1: JobUpdateStatus status
/** Created timestamp in milliseconds. */
2: i64 createdTimestampMs
/** Last modified timestamp in milliseconds. */
3: i64 lastModifiedTimestampMs
}
/** Summary of the job update including job key, user and current state. */
struct JobUpdateSummary {
/** Unique identifier for the update. */
5: JobUpdateKey key
/** User who initiated the update. */
3: string user
/** Current job update state. */
4: JobUpdateState state
/** Update metadata supplied by the client. */
6: optional set<Metadata> metadata
}
/** Update configuration and setting details. */
struct JobUpdateInstructions {
/** Actual InstanceId -> TaskConfig mapping when the update was requested. */
1: set<InstanceTaskConfig> initialState
/** Desired configuration when the update completes. */
2: InstanceTaskConfig desiredState
/** Update specific settings. */
3: JobUpdateSettings settings
}
/** Full definition of the job update. */
struct JobUpdate {
/** Update summary. */
1: JobUpdateSummary summary
/** Update configuration. */
2: JobUpdateInstructions instructions
}
struct JobUpdateDetails {
/** Update definition. */
1: JobUpdate update
/** History for this update. */
2: list<JobUpdateEvent> updateEvents
/** History for the individual instances updated. */
3: list<JobInstanceUpdateEvent> instanceEvents
}
/** A request to update the following instances of an existing job. Used by startUpdate. */
struct JobUpdateRequest {
/** Desired TaskConfig to apply. */
1: TaskConfig taskConfig
/** Desired number of instances of the task config. */
2: i32 instanceCount
/** Update settings and limits. */
3: JobUpdateSettings settings
/** Update metadata supplied by the client issuing the JobUpdateRequest. */
4: optional set<Metadata> metadata
}
/**
* Contains a set of restrictions on matching job updates where all restrictions must be met
* (terms are AND'ed together).
*/
struct JobUpdateQuery {
/** Job role. */
2: string role
/** Unique identifier for a job update. */
8: JobUpdateKey key
/** Job key. */
3: JobKey jobKey
/** User who created the update. */
4: string user
/** Set of update statuses. */
5: set<JobUpdateStatus> updateStatuses
/** Offset to serve data from. Used by pagination. */
6: i32 offset
/** Number of records to serve. Used by pagination. */
7: i32 limit
}
struct HostMaintenanceRequest {
1: string host
2: SlaPolicy defaultSlaPolicy
3: i64 timeoutSecs
4: i64 createdTimestampMs
}
struct ListBackupsResult {
1: set<string> backups
}
struct StartMaintenanceResult {
1: set<HostStatus> statuses
}
struct DrainHostsResult {
1: set<HostStatus> statuses
}
struct QueryRecoveryResult {
1: set<ScheduledTask> tasks
}
struct MaintenanceStatusResult {
1: set<HostStatus> statuses
}
struct EndMaintenanceResult {
1: set<HostStatus> statuses
}
struct RoleSummaryResult {
1: set<RoleSummary> summaries
}
struct JobSummaryResult {
1: set<JobSummary> summaries
}
struct ConfigSummaryResult {
1: ConfigSummary summary
}
struct GetPendingReasonResult {
1: set<PendingReason> reasons
}
/** Result of the startUpdate call. */
struct StartJobUpdateResult {
/** Unique identifier for the job update. */
1: JobUpdateKey key
/** Summary of the update that is in progress for the given JobKey. */
2: optional JobUpdateSummary updateSummary
}
/** Result of the getJobUpdateSummaries call. */
struct GetJobUpdateSummariesResult {
1: list<JobUpdateSummary> updateSummaries
}
/** Result of the getJobUpdateDetails call. */
struct GetJobUpdateDetailsResult {
// TODO(zmanji): Remove this once we complete AURORA-1765
1: JobUpdateDetails details
2: list<JobUpdateDetails> detailsList
}
/** Result of the pulseJobUpdate call. */
struct PulseJobUpdateResult {
1: JobUpdatePulseStatus status
}
struct GetJobUpdateDiffResult {
/** Instance addition diff details. */
1: set<ConfigGroup> add
/** Instance removal diff details. */
2: set<ConfigGroup> remove
/** Instance update diff details. */
3: set<ConfigGroup> update
/** Instances unchanged by the update. */
4: set<ConfigGroup> unchanged
}
/** Tier information. */
struct TierConfig {
/** Name of tier. */
1: string name
/** Tier attributes. */
2: map<string, string> settings
}
/** Result of the getTierConfigs call. */
struct GetTierConfigResult {
/** Name of the default tier. */
1: string defaultTierName
/** Set of tier configurations. */
2: set<TierConfig> tiers
}
/** Information about the scheduler. */
struct ServerInfo {
1: string clusterName
/** A url prefix for job container stats. */
3: string statsUrlPrefix
}
union Result {
1: PopulateJobResult populateJobResult
3: ScheduleStatusResult scheduleStatusResult
4: GetJobsResult getJobsResult
5: GetQuotaResult getQuotaResult
6: ListBackupsResult listBackupsResult
7: StartMaintenanceResult startMaintenanceResult
8: DrainHostsResult drainHostsResult
9: QueryRecoveryResult queryRecoveryResult
10: MaintenanceStatusResult maintenanceStatusResult
11: EndMaintenanceResult endMaintenanceResult
17: RoleSummaryResult roleSummaryResult
18: JobSummaryResult jobSummaryResult
20: ConfigSummaryResult configSummaryResult
21: GetPendingReasonResult getPendingReasonResult
22: StartJobUpdateResult startJobUpdateResult
23: GetJobUpdateSummariesResult getJobUpdateSummariesResult
24: GetJobUpdateDetailsResult getJobUpdateDetailsResult
25: PulseJobUpdateResult pulseJobUpdateResult
26: GetJobUpdateDiffResult getJobUpdateDiffResult
27: GetTierConfigResult getTierConfigResult
}
struct ResponseDetail {
1: string message
}
struct Response {
1: ResponseCode responseCode
5: ServerInfo serverInfo
/** Payload from the invoked RPC. */
3: optional Result result
/**
* Messages from the server relevant to the request, such as warnings or use of deprecated
* features.
*/
6: list<ResponseDetail> details
}
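/**
 * Illustrative sketch (not part of this definition): the usual client-side pattern for
 * unpacking a Response - check responseCode, surface any details, then read the
 * call-specific member of the Result union. Uses the generated Python types.
 *
 *   from gen.apache.aurora.api.ttypes import ResponseCode
 *
 *   def unwrap(response):
 *       # Collect any messages the scheduler attached to the response.
 *       messages = [d.message for d in (response.details or [])]
 *       if response.responseCode != ResponseCode.OK:
 *           raise RuntimeError('RPC failed: %s' % '; '.join(messages))
 *       return response.result
 */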
// A service that provides all the read-only calls to the Aurora scheduler.
service ReadOnlyScheduler {
/** Returns a summary of the jobs grouped by role. */
Response getRoleSummary()
/** Returns a summary of jobs, optionally only those owned by a specific role. */
Response getJobSummary(1: string role)
/** Fetches the status of tasks. */
Response getTasksStatus(1: TaskQuery query)
/**
* Same as getTasksStatus but without the TaskConfig.ExecutorConfig data set.
* This is an interim solution until we have a better way to query TaskConfigs (AURORA-541).
*/
Response getTasksWithoutConfigs(1: TaskQuery query)
/** Returns user-friendly reasons (if available) for tasks retained in PENDING state. */
Response getPendingReason(1: TaskQuery query)
/** Fetches the configuration summary of active tasks for the specified job. */
Response getConfigSummary(1: JobKey job)
/**
* Fetches the status of jobs.
* ownerRole is optional, in which case all jobs are returned.
*/
Response getJobs(1: string ownerRole)
/** Fetches the quota allocated for a user. */
Response getQuota(1: string ownerRole)
/**
* Populates fields in a job configuration as though it were about to be run.
* This can be used to diff a configuration against running tasks.
*/
Response populateJobConfig(1: JobConfiguration description)
/** Gets job update summaries. */
Response getJobUpdateSummaries(1: JobUpdateQuery jobUpdateQuery)
/** Gets job update details. */
// TODO(zmanji): `key` is deprecated, remove this with AURORA-1765
Response getJobUpdateDetails(1: JobUpdateKey key, 2: JobUpdateQuery query)
/** Gets the diff between client (desired) and server (current) job states. */
Response getJobUpdateDiff(1: JobUpdateRequest request)
/** Gets tier configurations. */
Response getTierConfigs()
}
service AuroraSchedulerManager extends ReadOnlyScheduler {
/**
* Creates a new job. The request will be denied if a job with the provided name already exists
* in the cluster.
*/
Response createJob(1: JobConfiguration description)
/**
* Enters a job into the cron schedule, without actually starting the job.
* If the job is already present in the schedule, this will update the schedule entry with the new
* configuration.
*/
Response scheduleCronJob(1: JobConfiguration description)
/**
* Removes a job from the cron schedule. The request will be denied if the job was not previously
* scheduled with scheduleCronJob.
*/
Response descheduleCronJob(4: JobKey job)
/**
* Starts a cron job immediately. The request will be denied if the specified job does not
* exist for the role account, or the job is not a cron job.
*/
Response startCronJob(4: JobKey job)
/** Restarts a batch of shards. */
Response restartShards(5: JobKey job, 3: set<i32> shardIds)
/** Initiates a kill on tasks. */
Response killTasks(4: JobKey job, 5: set<i32> instances, 6: string message)
/**
* Adds new instances with the TaskConfig of the existing instance pointed to by the key.
*/
Response addInstances(3: InstanceKey key, 4: i32 count)
// TODO(maxim): reevaluate if it's still needed when client updater is gone (AURORA-785).
/**
* Replaces the template (configuration) for the existing cron job.
* The cron job template (configuration) must exist for the call to succeed.
*/
Response replaceCronTemplate(1: JobConfiguration config)
/** Starts update of the existing service job. */
Response startJobUpdate(
/** A description of how to change the job. */
1: JobUpdateRequest request,
/** A user-specified message to include with the induced job update state change. */
3: string message)
/**
* Pauses the specified job update. Can be resumed by the resumeJobUpdate call.
*/
Response pauseJobUpdate(
/** The update to pause. */
1: JobUpdateKey key,
/** A user-specified message to include with the induced job update state change. */
3: string message)
/** Resumes progress of a previously paused job update. */
Response resumeJobUpdate(
/** The update to resume. */
1: JobUpdateKey key,
/** A user-specified message to include with the induced job update state change. */
3: string message)
/** Permanently aborts the job update. Does not remove the update history. */
Response abortJobUpdate(
/** The update to abort. */
1: JobUpdateKey key,
/** A user-specified message to include with the induced job update state change. */
3: string message)
/**
* Rolls back the specified active job update to its initial state.
*/
Response rollbackJobUpdate(
/** The update to rollback. */
1: JobUpdateKey key,
/** A user-specified message to include with the induced job update state change. */
2: string message)
/**
* Allows progress of the job update in case blockIfNoPulsesAfterMs is specified in
* JobUpdateSettings. Unblocks progress if the update was previously blocked.
* Responds with ResponseCode.INVALID_REQUEST in case an unknown update key is specified.
*/
Response pulseJobUpdate(1: JobUpdateKey key)
}
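/**
 * Illustrative sketch (not part of this definition): constructing a client for the service
 * above with the Apache Thrift Python library. The scheduler URL, the '/api' endpoint path
 * and the JSON-over-HTTP protocol pairing are assumptions about the deployment.
 *
 *   from thrift.protocol import TJSONProtocol
 *   from thrift.transport import THttpClient
 *   from gen.apache.aurora.api import AuroraSchedulerManager
 *
 *   transport = THttpClient.THttpClient('http://scheduler.example.com:8081/api')
 *   client = AuroraSchedulerManager.Client(TJSONProtocol.TJSONProtocol(transport))
 *   transport.open()
 *
 *   response = client.getJobSummary('www-data')  # a Response, as defined above
 */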
struct ExplicitReconciliationSettings {
1: optional i32 batchSize
}
// It would be great to compose these services rather than extend, but that won't be possible until
// https://issues.apache.org/jira/browse/THRIFT-66 is resolved.
service AuroraAdmin extends AuroraSchedulerManager {
/** Assign quota to a user. This will overwrite any pre-existing quota for the user. */
Response setQuota(1: string ownerRole, 2: ResourceAggregate quota)
/**
* Forces a task into a specific state. This does not guarantee the task will enter the given
* state, as the task must still transition within the bounds of the state machine. However,
* it attempts to enter that state via the state machine.
*/
Response forceTaskState(
1: string taskId,
2: ScheduleStatus status)
/** Immediately writes a storage snapshot to disk. */
Response performBackup()
/** Lists backups that are available for recovery. */
Response listBackups()
/** Loads a backup to an in-memory storage. This must precede all other recovery operations. */
Response stageRecovery(1: string backupId)
/** Queries for tasks in a staged recovery. */
Response queryRecovery(1: TaskQuery query)
/** Deletes tasks from a staged recovery. */
Response deleteRecoveryTasks(1: TaskQuery query)
/** Commits a staged recovery, completely replacing the previous storage state. */
Response commitRecovery()
/** Unloads (aborts) a staged recovery. */
Response unloadRecovery()
/** Put the given hosts into maintenance mode. */
Response startMaintenance(1: Hosts hosts)
/** Ask scheduler to begin moving tasks scheduled on given hosts. */
Response drainHosts(1: Hosts hosts)
/** Retrieve the current maintenance states for a group of hosts. */
Response maintenanceStatus(1: Hosts hosts)
/** Set the given hosts back into serving mode. */
Response endMaintenance(1: Hosts hosts)
/**
* Ask scheduler to put hosts into DRAINING mode and move scheduled tasks off of the hosts
* such that their SLA requirements are satisfied. Uses defaultSlaPolicy for any task that does not have one set.
**/
Response slaDrainHosts(1: Hosts hosts, 2: SlaPolicy defaultSlaPolicy, 3: i64 timeoutSecs)
/** Start a storage snapshot and block until it completes. */
Response snapshot()
/** Tell scheduler to trigger an explicit task reconciliation with the given settings. */
Response triggerExplicitTaskReconciliation(1: ExplicitReconciliationSettings settings)
/** Tell scheduler to trigger an implicit task reconciliation. */
Response triggerImplicitTaskReconciliation()
/**
* Force prune any (terminal) tasks that match the query. If no statuses are supplied with the
* query, it will default to all terminal task states. If statuses are supplied, they must be
* terminal states.
*/
Response pruneTasks(1: TaskQuery query)
}
// The name of the header that should be sent to bypass leader redirection in the Scheduler.
const string BYPASS_LEADER_REDIRECT_HEADER_NAME = 'Bypass-Leader-Redirect'
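/**
 * Illustrative sketch (not part of this definition): a client addressing one scheduler
 * instance directly (e.g. for debugging) may send this header so a non-leading scheduler
 * handles the request instead of redirecting it to the leader. The endpoint path, header
 * value and use of the requests library are assumptions about the deployment.
 *
 *   import requests
 *
 *   resp = requests.post('http://scheduler-2.example.com:8081/api',
 *                        headers={'Bypass-Leader-Redirect': 'true'},
 *                        data=payload)  # payload: a Thrift-serialized request body
 */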
// The path under which a task's filesystem should be mounted when using images and the Mesos
// unified containerizer.
const string TASK_FILESYSTEM_MOUNT_POINT = 'taskfs'