blob: 604db7f1117efd18d5de403c92fbf71d256351e7 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#ifndef IMPALA_RUNTIME_RESERVATION_TRACKER_H
#define IMPALA_RUNTIME_RESERVATION_TRACKER_H
#include <stdint.h>
#include <boost/scoped_ptr.hpp>
#include <boost/thread/locks.hpp>
#include <string>
#include <vector>
#include "common/status.h"
#include "runtime/bufferpool/reservation-tracker-counters.h"
#include "runtime/mem-tracker-types.h"
#include "util/spinlock.h"
namespace impala {
/// Forward declarations: this header only refers to these types through
/// RuntimeProfile* parameters and a boost::scoped_ptr<DummyProfile> member, so their
/// full definitions are not required here.
class DummyProfile;
class RuntimeProfile;
/// A tracker for a hierarchy of buffer pool memory reservations, denominated in bytes.
/// A hierarchy of ReservationTrackers provides a mechanism for subdividing buffer pool
/// memory and enforcing upper and lower bounds on memory usage.
///
/// The root of the tracker tree enforces a global maximum, which is distributed among its
/// children. Each tracker in the tree has a 'reservation': the total bytes of buffer pool
/// memory it is entitled to use. The reservation is inclusive of any memory that is
/// already allocated from the reservation, i.e. using a reservation to allocate memory
/// does not subtract from the reservation.
///
/// A reservation can be used directly at the tracker by calling AllocateFrom(), or
/// distributed to children of the tracker for the childrens' reservations. Each tracker
/// in the tree can use up to its reservation without checking parent trackers. To
/// increase its reservation, a tracker must use some of its parent's reservation (and
/// perhaps increase reservations all the way to the root of the tree).
///
/// Each tracker also has a maximum reservation that is enforced. E.g. if the root of the
/// tracker hierarchy is the global tracker for the Impala daemon and the next level of
/// the hierarchy is made up of per-query trackers, then the maximum reservation
/// mechanism can enforce both process-level and query-level limits on reservations.
///
/// Invariants:
/// * A tracker's reservation is at most its reservation limit: reservation <= limit
/// * A tracker's reservation is at least the sum of its children's reservations plus
///   the amount of the reservation used directly at this tracker. The difference is
///   the unused reservation:
///     child_reservations + used_reservation + unused_reservation = reservation.
///
/// Thread-safety:
/// All public ReservationTracker methods are thread-safe. If multiple threads
/// concurrently invoke methods on a ReservationTracker, each operation is applied
/// atomically to leave the ReservationTracker in a consistent state. Calling threads
/// are responsible for coordinating to avoid violating any method preconditions,
/// e.g. ensuring that there is sufficient unused reservation before calling
/// AllocateFrom().
///
/// Integration with MemTracker hierarchy:
/// TODO: we will remove MemTracker and this integration once all memory is accounted via
/// reservations.
///
/// Each ReservationTracker can optionally have a linked MemTracker. E.g. an exec
/// node's ReservationTracker can be linked with the exec node's MemTracker, so that
/// reservations are included in query memory consumption for the purposes of enforcing
/// memory limits, reporting and logging. The reservation is accounted as consumption
/// against the linked MemTracker and its ancestors because reserved memory is committed.
/// Allocating from a reservation therefore does not change the consumption reflected in
/// the MemTracker hierarchy.
///
/// MemTracker limits are only checked via the topmost link (i.e. the query-level
/// trackers): we require that no MemTrackers below this level have limits.
///
/// We require that the MemTracker hierarchy is consistent with the ReservationTracker
/// hierarchy. I.e. if a ReservationTracker is linked to a MemTracker "A", and its parent
/// is linked to a MemTracker "B", then "B" must be the parent of "A".
class ReservationTracker {
 public:
  ReservationTracker();
  virtual ~ReservationTracker();

  /// Initializes the root tracker with the given reservation limit in bytes. The initial
  /// reservation is 0.
  /// If 'profile' is not NULL, the counters defined in ReservationTrackerCounters are
  /// added to 'profile'.
  void InitRootTracker(RuntimeProfile* profile, int64_t reservation_limit);

  /// Initializes a new ReservationTracker with a parent.
  /// If 'mem_tracker' is not NULL, reservations for this ReservationTracker and its
  /// children will be counted as consumption against 'mem_tracker'.
  /// 'reservation_limit' is the maximum reservation for this tracker in bytes.
  /// 'mem_limit_mode' determines whether reservation increases are checked against the
  /// soft or hard limit of 'mem_tracker'. If 'profile' is not NULL, the counters in
  /// 'counters_' are added to 'profile'.
  void InitChildTracker(RuntimeProfile* profile, ReservationTracker* parent,
      MemTracker* mem_tracker, int64_t reservation_limit,
      MemLimit mem_limit_mode = MemLimit::SOFT);

  /// If the tracker is initialized, deregister the ReservationTracker from its parent,
  /// relinquishing all this tracker's reservation. All of the reservation must be unused
  /// and all the tracker's children must be closed before calling this method.
  /// TODO: decide on and implement policy for how far to release the reservation up
  /// the tree. Currently the reservation is released all the way to the root.
  void Close();

  /// Request to increase reservation by 'bytes'. The request is either granted in
  /// full or not at all. Uses any unused reservation on ancestors and increases
  /// ancestors' reservations if needed to fit the increased reservation.
  /// Returns true if the reservation increase is granted, or false if not granted.
  /// If the reservation is not granted, no modifications are made to the state of
  /// any ReservationTrackers and if 'error_status' is non-null, it returns an
  /// appropriate status message in it.
  bool IncreaseReservation(int64_t bytes, Status* error_status = nullptr)
      WARN_UNUSED_RESULT;

  /// Tries to ensure that 'bytes' of unused reservation is available. If not already
  /// available, tries to increase the reservation such that the unused reservation is
  /// exactly equal to 'bytes'. Uses any unused reservation on ancestors and increases
  /// ancestors' reservations if needed to fit the increased reservation.
  /// Returns true if the reservation increase was successful or not necessary. Otherwise
  /// returns false and if 'error_status' is non-null, it returns an appropriate status
  /// message in it.
  bool IncreaseReservationToFit(
      int64_t bytes, Status* error_status = nullptr) WARN_UNUSED_RESULT;

  /// Like IncreaseReservationToFit(), except 'bytes' is also allocated from
  /// the reservation on success.
  bool IncreaseReservationToFitAndAllocate(
      int64_t bytes, Status* error_status = nullptr) WARN_UNUSED_RESULT;

  /// Decrease reservation by 'bytes' on this tracker and all ancestors. This tracker's
  /// reservation must be at least 'bytes' before calling this method.
  void DecreaseReservation(int64_t bytes) {
    DecreaseReservation(bytes, /*is_child_reservation=*/false);
  }

  /// Transfer reservation from this tracker to 'other'. Both trackers must be in the
  /// same query subtree of the hierarchy. One tracker can be the ancestor of the other,
  /// or they can share a common ancestor. The subtree root must be at the query level
  /// or below so that the transfer cannot cause a MemTracker limit to be exceeded
  /// (because linked MemTrackers with limits below the query level are not supported).
  /// Returns true on success or false if the transfer would have caused a reservation
  /// limit to be exceeded.
  bool TransferReservationTo(ReservationTracker* other, int64_t bytes) WARN_UNUSED_RESULT;

  /// Allocate 'bytes' from the reservation. The tracker must have at least 'bytes'
  /// unused reservation before calling this method.
  void AllocateFrom(int64_t bytes);

  /// Release 'bytes' of previously allocated memory. The used reservation is
  /// decreased by 'bytes'. The used reservation must be at least 'bytes' before
  /// calling this method.
  void ReleaseTo(int64_t bytes);

  /// Returns the amount of the reservation in bytes. Does not acquire the internal lock.
  int64_t GetReservation();

  /// Returns the current amount of the reservation used at this tracker, not including
  /// reservations of children in bytes. Does not acquire the internal lock.
  int64_t GetUsedReservation();

  /// Returns the amount of the reservation neither used nor given to children's
  /// reservations at this tracker in bytes. Acquires the internal lock.
  int64_t GetUnusedReservation();

  /// Returns the total reservations of children in bytes. Does not acquire the
  /// internal lock.
  int64_t GetChildReservations();

  /// Support for debug actions: deny reservation increase with probability 'probability'.
  void SetDebugDenyIncreaseReservation(double probability) {
    increase_deny_probability_ = probability;
  }

  ReservationTracker* parent() const { return parent_; }

  std::string DebugString();

 private:
  /// Returns the amount of 'reservation_' that is unused, i.e. neither used directly at
  /// this tracker nor handed out to children.
  int64_t unused_reservation() const {
    return reservation_ - used_reservation_ - child_reservations_;
  }

  /// Returns the parent's memtracker if 'parent_' is non-NULL, or NULL otherwise.
  MemTracker* GetParentMemTracker() const {
    return parent_ == nullptr ? nullptr : parent_->mem_tracker_;
  }

  /// Initializes 'counters_', storing the counters in 'profile'.
  /// If 'profile' is NULL, creates a dummy profile to store the counters.
  void InitCounters(RuntimeProfile* profile, int64_t max_reservation);

  /// Internal helper for IncreaseReservation(). If 'use_existing_reservation' is true,
  /// increase by the minimum amount so that 'bytes' fits in the reservation, otherwise
  /// just increase by 'bytes'. If 'is_child_reservation' is true, also increase
  /// 'child_reservations_' by 'bytes'. If 'error_status' is not null and reservation
  /// increase fails then an appropriate status message is returned in it.
  /// 'lock_' must be held by caller.
  /// Example error message if a reservation tracker hits its limit:
  /// Failed to increase reservation by 2.12 GB because it would exceed the applicable
  /// reservation limit for the "Process" ReservationTracker: reservation_limit=2.00 GB
  /// reservation=0 used_reservation=0 child_reservations=0
  /// The top 5 queries that allocated memory under this tracker are:
  /// Query(20449659107d67ce:2e9058b500000000): Reservation=0 ReservationLimit=6.67 GB
  /// OtherMemory=0 Total=0 Peak=0
  bool IncreaseReservationInternalLocked(int64_t bytes, bool use_existing_reservation,
      bool is_child_reservation, Status* error_status = nullptr);

  /// Increase consumption on linked MemTracker to reflect an increase in reservation
  /// of 'reservation_increase'. For the topmost link, return false if this failed
  /// because it would exceed a memory limit. If there is no linked MemTracker, just
  /// returns true.
  /// TODO: remove once we account all memory via ReservationTrackers.
  bool TryConsumeFromMemTracker(int64_t reservation_increase, MemLimit mem_limit_mode);

  /// Decrease consumption on linked MemTracker to reflect a decrease in reservation of
  /// 'reservation_decrease'. If there is no linked MemTracker, does nothing.
  /// TODO: remove once we account all memory via ReservationTrackers.
  void ReleaseToMemTracker(int64_t reservation_decrease);

  /// Decrease reservation by 'bytes' on this tracker and all ancestors. This tracker's
  /// reservation must be at least 'bytes' before calling this method. If
  /// 'is_child_reservation' is true it decreases 'child_reservations_' by 'bytes'.
  void DecreaseReservation(int64_t bytes, bool is_child_reservation);

  /// Same as DecreaseReservation(), but 'lock_' must be held by caller.
  void DecreaseReservationLocked(int64_t bytes, bool is_child_reservation);

  /// Return a vector containing the trackers on the path to the root tracker. Includes
  /// the current tracker and the root tracker.
  std::vector<ReservationTracker*> FindPathToRoot();

  /// Return true if trackers in the subtree rooted at 'subtree1' precede trackers in
  /// the subtree rooted at 'subtree2' in the lock order. 'subtree1' and 'subtree2'
  /// must share the same parent.
  static bool lock_sibling_subtree_first(
      ReservationTracker* subtree1, ReservationTracker* subtree2) {
    DCHECK_EQ(subtree1->parent_, subtree2->parent_);
    // Siblings are ordered by the numeric value of their addresses.
    return reinterpret_cast<uintptr_t>(subtree1) < reinterpret_cast<uintptr_t>(subtree2);
  }

  /// Check the internal consistency of the ReservationTracker and DCHECKs if in an
  /// inconsistent state.
  /// 'lock_' must be held by caller.
  void CheckConsistency() const;

  /// Same as AllocateFrom() except 'lock_' must be held by caller.
  void AllocateFromLocked(int64_t bytes);

  /// Increase or decrease 'used_reservation_' and update profile counters accordingly.
  /// 'lock_' must be held by caller.
  void UpdateUsedReservation(int64_t delta);

  /// Increase or decrease 'reservation_' and update profile counters accordingly.
  /// 'lock_' must be held by caller.
  void UpdateReservation(int64_t delta);

  /// Support for debug actions: see SetDebugDenyIncreaseReservation() for behaviour.
  /// Declared above 'lock_', so it is not among the members that 'lock_' protects.
  double increase_deny_probability_ = 0.0;

  /// lock_ protects all below members. The lock order in a tree of ReservationTrackers is
  /// based on a post-order traversal of the tree, with children visited in order of the
  /// memory address of the ReservationTracker object. The following rules can be applied
  /// to determine the relative positions of two trackers t1 and t2 in the lock order:
  /// * If t1 is a descendant of t2, t1's lock must be acquired before t2's lock (i.e.
  ///   locks are acquired bottom-up).
  /// * If neither t1 nor t2 is a descendant of the other, they must be in subtrees
  ///   under a common ancestor. If the memory address of t1's subtree's root is less
  ///   than the memory address of t2's subtree's root, t1's lock must be acquired before
  ///   t2's lock. This check is implemented in lock_sibling_subtree_first().
  /// Since MemTracker::child_trackers_lock_ objects are acquired in a top-down lock
  /// order, if a MemTracker::child_trackers_lock_ is acquired while holding a lock_, any
  /// more calls to acquire a lock_ should not be made to avoid any deadlock that might
  /// occur due to ReservationTracker's bottom-up lock order.
  SpinLock lock_;

  /// True if the tracker is initialized.
  bool initialized_ = false;

  /// A dummy profile to hold the counters in 'counters_' in the case that no profile
  /// is provided.
  boost::scoped_ptr<DummyProfile> dummy_profile_;

  /// The RuntimeProfile counters for this tracker.
  /// All non-NULL if 'initialized_' is true.
  ReservationTrackerCounters counters_;

  /// The parent of this tracker in the hierarchy. Does not change after initialization.
  ReservationTracker* parent_ = nullptr;

  /// If non-NULL, reservations are counted as memory consumption against this tracker.
  /// Does not change after initialization. Not owned.
  /// TODO: remove once all memory is accounted via ReservationTrackers.
  MemTracker* mem_tracker_ = nullptr;

  /// Determines whether the soft or hard limit of 'mem_tracker_' is checked for
  /// reservation increases. Defaults to the same mode as InitChildTracker()'s default
  /// argument so that the member never holds an indeterminate value.
  MemLimit mem_limit_mode_ = MemLimit::SOFT;

  // The byte counters below are given explicit zero defaults so that the getters that
  // read them without taking 'lock_' never observe an indeterminate value on a tracker
  // that has been constructed but not yet initialized.

  /// The maximum reservation in bytes that this tracker can have. Can be read with an
  /// atomic load without holding lock.
  int64_t reservation_limit_ = 0;

  /// This tracker's current reservation in bytes. 'reservation_' <= 'reservation_limit_'.
  /// Can be read with an atomic load without holding lock.
  int64_t reservation_ = 0;

  /// Total reservation of children in bytes. This is included in 'reservation_'.
  /// 'used_reservation_' + 'child_reservations_' <= 'reservation_'.
  /// Can be read with an atomic load without holding lock.
  int64_t child_reservations_ = 0;

  /// The amount of the reservation currently used by this tracker in bytes.
  /// 'used_reservation_' + 'child_reservations_' <= 'reservation_'.
  /// Can be read with an atomic load without holding lock.
  int64_t used_reservation_ = 0;
};
}
#endif