blob: 64fc829ac3b58d95fc0bd074571a46518a80bbba [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include <process/metrics/counter.hpp>
#include <process/metrics/gauge.hpp>
#include <process/metrics/metrics.hpp>
#include <stout/foreach.hpp>
#include "master/master.hpp"
#include "master/metrics.hpp"
using process::metrics::Counter;
using process::metrics::Gauge;
using std::string;
namespace mesos {
namespace internal {
namespace master {
// Message counters are named with "messages_" prefix so they can
// be grouped together alphabetically in the output.
// TODO(alexandra.sava): Add metrics for registered and removed slaves.
Metrics::Metrics(const Master& master)
: uptime_secs(
"master/uptime_secs",
defer(master, &Master::_uptime_secs)),
elected(
"master/elected",
defer(master, &Master::_elected)),
slaves_connected(
"master/slaves_connected",
defer(master, &Master::_slaves_connected)),
slaves_disconnected(
"master/slaves_disconnected",
defer(master, &Master::_slaves_disconnected)),
slaves_active(
"master/slaves_active",
defer(master, &Master::_slaves_active)),
slaves_inactive(
"master/slaves_inactive",
defer(master, &Master::_slaves_inactive)),
slaves_unreachable(
"master/slaves_unreachable",
defer(master, &Master::_slaves_unreachable)),
frameworks_connected(
"master/frameworks_connected",
defer(master, &Master::_frameworks_connected)),
frameworks_disconnected(
"master/frameworks_disconnected",
defer(master, &Master::_frameworks_disconnected)),
frameworks_active(
"master/frameworks_active",
defer(master, &Master::_frameworks_active)),
frameworks_inactive(
"master/frameworks_inactive",
defer(master, &Master::_frameworks_inactive)),
outstanding_offers(
"master/outstanding_offers",
defer(master, &Master::_outstanding_offers)),
tasks_staging(
"master/tasks_staging",
defer(master, &Master::_tasks_staging)),
tasks_starting(
"master/tasks_starting",
defer(master, &Master::_tasks_starting)),
tasks_running(
"master/tasks_running",
defer(master, &Master::_tasks_running)),
tasks_unreachable(
"master/tasks_unreachable",
defer(master, &Master::_tasks_unreachable)),
tasks_killing(
"master/tasks_killing",
defer(master, &Master::_tasks_killing)),
tasks_finished(
"master/tasks_finished"),
tasks_failed(
"master/tasks_failed"),
tasks_killed(
"master/tasks_killed"),
tasks_lost(
"master/tasks_lost"),
tasks_error(
"master/tasks_error"),
tasks_dropped(
"master/tasks_dropped"),
tasks_gone(
"master/tasks_gone"),
tasks_gone_by_operator(
"master/tasks_gone_by_operator"),
dropped_messages(
"master/dropped_messages"),
messages_register_framework(
"master/messages_register_framework"),
messages_reregister_framework(
"master/messages_reregister_framework"),
messages_unregister_framework(
"master/messages_unregister_framework"),
messages_deactivate_framework(
"master/messages_deactivate_framework"),
messages_kill_task(
"master/messages_kill_task"),
messages_status_update_acknowledgement(
"master/messages_status_update_acknowledgement"),
messages_resource_request(
"master/messages_resource_request"),
messages_launch_tasks(
"master/messages_launch_tasks"),
messages_decline_offers(
"master/messages_decline_offers"),
messages_revive_offers(
"master/messages_revive_offers"),
messages_suppress_offers(
"master/messages_suppress_offers"),
messages_reconcile_tasks(
"master/messages_reconcile_tasks"),
messages_framework_to_executor(
"master/messages_framework_to_executor"),
messages_executor_to_framework(
"master/messages_executor_to_framework"),
messages_register_slave(
"master/messages_register_slave"),
messages_reregister_slave(
"master/messages_reregister_slave"),
messages_unregister_slave(
"master/messages_unregister_slave"),
messages_status_update(
"master/messages_status_update"),
messages_exited_executor(
"master/messages_exited_executor"),
messages_update_slave(
"master/messages_update_slave"),
messages_authenticate(
"master/messages_authenticate"),
valid_framework_to_executor_messages(
"master/valid_framework_to_executor_messages"),
invalid_framework_to_executor_messages(
"master/invalid_framework_to_executor_messages"),
valid_executor_to_framework_messages(
"master/valid_executor_to_framework_messages"),
invalid_executor_to_framework_messages(
"master/invalid_executor_to_framework_messages"),
valid_status_updates(
"master/valid_status_updates"),
invalid_status_updates(
"master/invalid_status_updates"),
valid_status_update_acknowledgements(
"master/valid_status_update_acknowledgements"),
invalid_status_update_acknowledgements(
"master/invalid_status_update_acknowledgements"),
recovery_slave_removals(
"master/recovery_slave_removals"),
event_queue_messages(
"master/event_queue_messages",
defer(master, &Master::_event_queue_messages)),
event_queue_dispatches(
"master/event_queue_dispatches",
defer(master, &Master::_event_queue_dispatches)),
event_queue_http_requests(
"master/event_queue_http_requests",
defer(master, &Master::_event_queue_http_requests)),
slave_registrations(
"master/slave_registrations"),
slave_reregistrations(
"master/slave_reregistrations"),
slave_removals(
"master/slave_removals"),
slave_removals_reason_unhealthy(
"master/slave_removals/reason_unhealthy"),
slave_removals_reason_unregistered(
"master/slave_removals/reason_unregistered"),
slave_removals_reason_registered(
"master/slave_removals/reason_registered"),
slave_shutdowns_scheduled(
"master/slave_shutdowns_scheduled"),
slave_shutdowns_completed(
"master/slave_shutdowns_completed"),
slave_shutdowns_canceled(
"master/slave_shutdowns_canceled"),
slave_unreachable_scheduled(
"master/slave_unreachable_scheduled"),
slave_unreachable_completed(
"master/slave_unreachable_completed"),
slave_unreachable_canceled(
"master/slave_unreachable_canceled")
{
// TODO(dhamon): Check return values of 'add'.
process::metrics::add(uptime_secs);
process::metrics::add(elected);
process::metrics::add(slaves_connected);
process::metrics::add(slaves_disconnected);
process::metrics::add(slaves_active);
process::metrics::add(slaves_inactive);
process::metrics::add(slaves_unreachable);
process::metrics::add(frameworks_connected);
process::metrics::add(frameworks_disconnected);
process::metrics::add(frameworks_active);
process::metrics::add(frameworks_inactive);
process::metrics::add(outstanding_offers);
process::metrics::add(tasks_staging);
process::metrics::add(tasks_starting);
process::metrics::add(tasks_running);
process::metrics::add(tasks_killing);
process::metrics::add(tasks_finished);
process::metrics::add(tasks_failed);
process::metrics::add(tasks_killed);
process::metrics::add(tasks_lost);
process::metrics::add(tasks_error);
process::metrics::add(tasks_dropped);
process::metrics::add(tasks_unreachable);
process::metrics::add(tasks_gone);
process::metrics::add(tasks_gone_by_operator);
process::metrics::add(dropped_messages);
// Messages from schedulers.
process::metrics::add(messages_register_framework);
process::metrics::add(messages_reregister_framework);
process::metrics::add(messages_unregister_framework);
process::metrics::add(messages_deactivate_framework);
process::metrics::add(messages_kill_task);
process::metrics::add(messages_status_update_acknowledgement);
process::metrics::add(messages_resource_request);
process::metrics::add(messages_launch_tasks);
process::metrics::add(messages_decline_offers);
process::metrics::add(messages_revive_offers);
process::metrics::add(messages_suppress_offers);
process::metrics::add(messages_reconcile_tasks);
process::metrics::add(messages_framework_to_executor);
process::metrics::add(messages_executor_to_framework);
// Messages from slaves.
process::metrics::add(messages_register_slave);
process::metrics::add(messages_reregister_slave);
process::metrics::add(messages_unregister_slave);
process::metrics::add(messages_status_update);
process::metrics::add(messages_exited_executor);
process::metrics::add(messages_update_slave);
// Messages from both schedulers and slaves.
process::metrics::add(messages_authenticate);
process::metrics::add(valid_framework_to_executor_messages);
process::metrics::add(invalid_framework_to_executor_messages);
process::metrics::add(valid_executor_to_framework_messages);
process::metrics::add(invalid_executor_to_framework_messages);
process::metrics::add(valid_status_updates);
process::metrics::add(invalid_status_updates);
process::metrics::add(valid_status_update_acknowledgements);
process::metrics::add(invalid_status_update_acknowledgements);
process::metrics::add(recovery_slave_removals);
process::metrics::add(event_queue_messages);
process::metrics::add(event_queue_dispatches);
process::metrics::add(event_queue_http_requests);
process::metrics::add(slave_registrations);
process::metrics::add(slave_reregistrations);
process::metrics::add(slave_removals);
process::metrics::add(slave_removals_reason_unhealthy);
process::metrics::add(slave_removals_reason_unregistered);
process::metrics::add(slave_removals_reason_registered);
process::metrics::add(slave_shutdowns_scheduled);
process::metrics::add(slave_shutdowns_completed);
process::metrics::add(slave_shutdowns_canceled);
process::metrics::add(slave_unreachable_scheduled);
process::metrics::add(slave_unreachable_completed);
process::metrics::add(slave_unreachable_canceled);
// Create resource gauges.
// TODO(dhamon): Set these up dynamically when adding a slave based on the
// resources the slave exposes.
const string resources[] = {"cpus", "gpus", "mem", "disk"};
foreach (const string& resource, resources) {
Gauge total(
"master/" + resource + "_total",
defer(master, &Master::_resources_total, resource));
Gauge used(
"master/" + resource + "_used",
defer(master, &Master::_resources_used, resource));
Gauge percent(
"master/" + resource + "_percent",
defer(master, &Master::_resources_percent, resource));
resources_total.push_back(total);
resources_used.push_back(used);
resources_percent.push_back(percent);
process::metrics::add(total);
process::metrics::add(used);
process::metrics::add(percent);
}
foreach (const string& resource, resources) {
Gauge total(
"master/" + resource + "_revocable_total",
defer(master, &Master::_resources_revocable_total, resource));
Gauge used(
"master/" + resource + "_revocable_used",
defer(master, &Master::_resources_revocable_used, resource));
Gauge percent(
"master/" + resource + "_revocable_percent",
defer(master, &Master::_resources_revocable_percent, resource));
resources_revocable_total.push_back(total);
resources_revocable_used.push_back(used);
resources_revocable_percent.push_back(percent);
process::metrics::add(total);
process::metrics::add(used);
process::metrics::add(percent);
}
}
Metrics::~Metrics()
{
// TODO(dhamon): Check return values of 'remove'.
process::metrics::remove(uptime_secs);
process::metrics::remove(elected);
process::metrics::remove(slaves_connected);
process::metrics::remove(slaves_disconnected);
process::metrics::remove(slaves_active);
process::metrics::remove(slaves_inactive);
process::metrics::remove(slaves_unreachable);
process::metrics::remove(frameworks_connected);
process::metrics::remove(frameworks_disconnected);
process::metrics::remove(frameworks_active);
process::metrics::remove(frameworks_inactive);
process::metrics::remove(outstanding_offers);
process::metrics::remove(tasks_staging);
process::metrics::remove(tasks_starting);
process::metrics::remove(tasks_running);
process::metrics::remove(tasks_killing);
process::metrics::remove(tasks_finished);
process::metrics::remove(tasks_failed);
process::metrics::remove(tasks_killed);
process::metrics::remove(tasks_lost);
process::metrics::remove(tasks_error);
process::metrics::remove(tasks_dropped);
process::metrics::remove(tasks_unreachable);
process::metrics::remove(tasks_gone);
process::metrics::remove(tasks_gone_by_operator);
process::metrics::remove(dropped_messages);
// Messages from schedulers.
process::metrics::remove(messages_register_framework);
process::metrics::remove(messages_reregister_framework);
process::metrics::remove(messages_unregister_framework);
process::metrics::remove(messages_deactivate_framework);
process::metrics::remove(messages_kill_task);
process::metrics::remove(messages_status_update_acknowledgement);
process::metrics::remove(messages_resource_request);
process::metrics::remove(messages_launch_tasks);
process::metrics::remove(messages_decline_offers);
process::metrics::remove(messages_revive_offers);
process::metrics::remove(messages_suppress_offers);
process::metrics::remove(messages_reconcile_tasks);
process::metrics::remove(messages_framework_to_executor);
process::metrics::remove(messages_executor_to_framework);
// Messages from slaves.
process::metrics::remove(messages_register_slave);
process::metrics::remove(messages_reregister_slave);
process::metrics::remove(messages_unregister_slave);
process::metrics::remove(messages_status_update);
process::metrics::remove(messages_exited_executor);
process::metrics::remove(messages_update_slave);
// Messages from both schedulers and slaves.
process::metrics::remove(messages_authenticate);
process::metrics::remove(valid_framework_to_executor_messages);
process::metrics::remove(invalid_framework_to_executor_messages);
process::metrics::remove(valid_executor_to_framework_messages);
process::metrics::remove(invalid_executor_to_framework_messages);
process::metrics::remove(valid_status_updates);
process::metrics::remove(invalid_status_updates);
process::metrics::remove(valid_status_update_acknowledgements);
process::metrics::remove(invalid_status_update_acknowledgements);
process::metrics::remove(recovery_slave_removals);
process::metrics::remove(event_queue_messages);
process::metrics::remove(event_queue_dispatches);
process::metrics::remove(event_queue_http_requests);
process::metrics::remove(slave_registrations);
process::metrics::remove(slave_reregistrations);
process::metrics::remove(slave_removals);
process::metrics::remove(slave_removals_reason_unhealthy);
process::metrics::remove(slave_removals_reason_unregistered);
process::metrics::remove(slave_removals_reason_registered);
process::metrics::remove(slave_shutdowns_scheduled);
process::metrics::remove(slave_shutdowns_completed);
process::metrics::remove(slave_shutdowns_canceled);
process::metrics::remove(slave_unreachable_scheduled);
process::metrics::remove(slave_unreachable_completed);
process::metrics::remove(slave_unreachable_canceled);
foreach (const Gauge& gauge, resources_total) {
process::metrics::remove(gauge);
}
resources_total.clear();
foreach (const Gauge& gauge, resources_used) {
process::metrics::remove(gauge);
}
resources_used.clear();
foreach (const Gauge& gauge, resources_percent) {
process::metrics::remove(gauge);
}
resources_percent.clear();
foreach (const Gauge& gauge, resources_revocable_total) {
process::metrics::remove(gauge);
}
resources_revocable_total.clear();
foreach (const Gauge& gauge, resources_revocable_used) {
process::metrics::remove(gauge);
}
resources_revocable_used.clear();
foreach (const Gauge& gauge, resources_revocable_percent) {
process::metrics::remove(gauge);
}
resources_revocable_percent.clear();
foreachvalue (const auto& source_reason, tasks_states) {
foreachvalue (const auto& reason_counter, source_reason) {
foreachvalue (const Counter& counter, reason_counter) {
process::metrics::remove(counter);
}
}
}
tasks_states.clear();
}
void Metrics::incrementTasksStates(
const TaskState& state,
const TaskStatus::Source& source,
const TaskStatus::Reason& reason)
{
if (!tasks_states.contains(state)) {
tasks_states[state] = SourcesReasons();
}
if (!tasks_states[state].contains(source)) {
tasks_states[state][source] = Reasons();
}
if (!tasks_states[state][source].contains(reason)) {
Counter counter = Counter(
"master/" +
strings::lower(TaskState_Name(state)) + "/" +
strings::lower(TaskStatus::Source_Name(source)) + "/" +
strings::lower(TaskStatus::Reason_Name(reason)));
tasks_states[state][source].put(reason, counter);
process::metrics::add(counter);
}
Counter counter = tasks_states[state][source].get(reason).get();
counter++;
}
} // namespace master {
} // namespace internal {
} // namespace mesos {