be/src/runtime/bufferpool/buffer-allocator.h - impala - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 #ifndef IMPALA_RUNTIME_BUFFER_ALLOCATOR_H
 #define IMPALA_RUNTIME_BUFFER_ALLOCATOR_H

 #include <boost/scoped_ptr.hpp>

 #include "runtime/bufferpool/buffer-pool-internal.h"
 #include "runtime/bufferpool/free-list.h"
 #include "util/aligned-new.h"

 namespace impala {

 /// The internal buffer allocator used by BufferPool to allocator power-of-two sized
 /// buffers. BufferAllocator builds on top of SystemAllocator by adding caching of
 /// free buffers and clean pages where the memory is not currently in use by a client
 /// but has not yet been released to SystemAllocator.
 ///
 /// The allocator is optimised for the common case where an allocation can be served
 /// by reclaiming a buffer of the request size from the current core's arena. In this
 /// case there is no contention for locks between concurrently-running threads. If this
 /// fails, progressively more expensive approaches to allocate memory are tried until
 /// the allocation eventually success (see AllocateInternal() for details).
 ///
 /// Buffer Reservations
 /// ===================
 /// The implementation of the BufferAllocator relies on the BufferPool's reservation
 /// tracking system. The allocator is given a hard limit ('system_bytes_limit'), above
 /// which all allocations will fail. Allocations up to 'system_bytes_limit' are
 /// guaranteed to succeed unless an unexpected system error occurs (e.g. we can't allocate
 /// all of the required memory from the OS). Reservations must be set up so that the total
 /// of all reservations does not exceed 'system_bytes_limit', thus ensuring that
 /// BufferAllocator can alway find memory to fulfill reservations.
 ///
 /// +========================+
 /// | IMPLEMENTATION NOTES   |
 /// +========================+
 ///
 /// Memory
 /// ======
 /// Memory managed by BufferAllocator comes in four forms:
 /// 1. Buffers returned to the client (corresponding to a used reservation)
 /// 2. Free buffers cached in the BufferAllocator's free lists.
 /// 3. Buffers attached to clean unpinned pages in the BufferAllocator's clean page lists.
 /// 4. Bytes that are not allocated from the system: 'system_bytes_remaining_'.
 /// Together these always add up to 'system_bytes_limit', which allows BufferAllocator
 /// to always fulfill reservations via some combination of memory in forms 2, 3 or 4.
 ///
 /// The BufferAllocator code is careful not to make memory inaccessible to concurrently
 /// executing threads that are entitled to it. E.g. if one thread is entitled to allocate
 /// a 1MB buffer from the BufferAllocator's free or clean page lists but needs to release
 /// a 2MB buffer to the system to free up enough memory, it must add 1MB to
 /// 'system_bytes_remaining_' in the same critical section in which it freed the 2MB
 /// buffer. Otherwise a concurrent thread that had a reservation for 1MB of memory might
 /// not be able to find it.
 ///
 /// Arenas
 /// ======
 /// The buffer allocator's data structures are broken up into arenas, with an arena per
 /// core. Within each arena, each buffer or page is stored in a list with buffers and
 /// pages of the same size: there is a separate list for every power-of-two size. Each
 /// arena is protected by a separate lock, so in the common case where threads are able
 /// to fulfill allocations from their own arena, there will be no lock contention.
 ///
 class BufferPool::BufferAllocator {
  public:
   BufferAllocator(BufferPool* pool, MetricGroup* metrics, int64_t min_buffer_len,
       int64_t system_bytes_limit, int64_t clean_page_bytes_limit);
   ~BufferAllocator();

   /// Allocate a buffer with a power-of-two length 'len'. This function may acquire
   /// 'FreeBufferArena::lock_' and Page::lock so no locks lower in the lock acquisition
   /// order (see buffer-pool-internal.h) should be held by the caller.
   ///
   /// Always succeeds on allocating memory up to 'system_bytes_limit', unless the system
   /// is unable to give us 'system_bytes_limit' of memory or an internal bug: if all
   /// clients write out enough dirty pages to stay within their reservation, then there
   /// should always be enough free buffers and clean pages to reclaim.
   Status Allocate(ClientHandle* client, int64_t len,
       BufferPool::BufferHandle* buffer) WARN_UNUSED_RESULT;

   /// Frees 'buffer', which must be open before calling. Closes 'buffer' and updates
   /// internal state but does not release to any reservation.
   void Free(BufferPool::BufferHandle&& buffer);

   /// Adds a clean page 'page' to a clean page list. Caller must hold the page's
   /// client's lock via 'client_lock' so that moving the page between the client list and
   /// the free page list is atomic. Caller must not hold 'FreeBufferArena::lock_' or any
   /// Page::lock.
   void AddCleanPage(const boost::unique_lock<boost::mutex>& client_lock, Page* page);

   /// Removes a clean page 'page' from a clean page list and returns true, if present in
   /// one of the lists. Returns true if it was present. If 'claim_buffer' is true, the
   /// caller must have reservation for the buffer, which is returned along with the page.
   /// Otherwise the buffer is moved directly to the free buffer list. Caller must hold
   /// the page's client's lock via 'client_lock' so that moving the page between the
   /// client list and the free page list is atomic. Caller must not hold
   /// 'FreeBufferArena::lock_' or any Page::lock.
   bool RemoveCleanPage(
       const boost::unique_lock<boost::mutex>& client_lock, bool claim_buffer, Page* page);

   /// Periodically called to release free buffers back to the SystemAllocator. Releases
   /// buffers based on recent allocation patterns, trying to minimise the number of
   /// excess buffers retained in each list above the minimum required to avoid going
   /// to the system allocator.
   void Maintenance();

   /// Try to release at least 'bytes_to_free' bytes of memory to the system allocator.
   void ReleaseMemory(int64_t bytes_to_free);

   int64_t system_bytes_limit() const { return system_bytes_limit_; }

   /// Return the amount of memory currently allocated from the system.
   int64_t GetSystemBytesAllocated() const {
     return system_bytes_limit_ - system_bytes_remaining_.Load();
   }

   /// Return the total number of free buffers in the allocator.
   int64_t GetNumFreeBuffers() const;

   /// Return the total bytes of free buffers in the allocator.
   int64_t GetFreeBufferBytes() const;

   /// Return the limit on bytes of clean pages in the allocator.
   int64_t GetCleanPageBytesLimit() const;

   /// Return the total number of clean pages in the allocator.
   int64_t GetNumCleanPages() const;

   /// Return the total bytes of clean pages in the allocator.
   int64_t GetCleanPageBytes() const;

   std::string DebugString();

  protected:
   friend class BufferAllocatorTest;
   friend class BufferPoolTest;
   friend class FreeBufferArena;

   /// Test helper: gets the current size of the free list for buffers of 'len' bytes
   /// on core 'core'.
   int GetFreeListSize(int core, int64_t len);

   /// Test helper: reduce the number of scavenge attempts so backend tests can force
   /// use of the "locked" scavenging code path.
   void set_max_scavenge_attempts(int val) {
     DCHECK_GE(val, 1);
     max_scavenge_attempts_ = val;
   }

  private:
   /// Compute the maximum power-of-two buffer length that could be allocated based on the
   /// amount of memory available 'system_bytes_limit'. The value is always at least
   /// 'min_buffer_len' so that there is at least one valid buffer size.
   static int64_t CalcMaxBufferLen(int64_t min_buffer_len, int64_t system_bytes_limit);

   /// Same as Allocate() but leaves 'buffer->client_' NULL and only updates the
   /// 'sys_alloc_time' and no other counters.
   Status AllocateInternal(BufferPool::Client* client, int64_t len,
       BufferPool::BufferHandle* buffer) WARN_UNUSED_RESULT;

   /// Tries to reclaim enough memory from various sources so that the caller can allocate
   /// a buffer of 'target_bytes' from the system allocator. Scavenges buffers from the
   /// free buffer and clean page lists of all cores and frees them with
   /// 'system_allocator_'. Also tries to decrement 'system_bytes_remaining_'.
   /// 'current_core' is the index of the current CPU core. Any bytes freed in excess of
   /// 'target_bytes' are added to 'system_bytes_remaining_.' If 'slow_but_sure' is true,
   /// this function uses a slower strategy that guarantees enough memory will be found
   /// but can block progress of other threads for longer. If 'slow_but_sure' is false,
   /// then this function optimistically tries to reclaim the memory but may not reclaim
   /// 'target_bytes' of memory. Returns the number of bytes reclaimed.
   int64_t ScavengeBuffers(bool slow_but_sure, int current_core, int64_t target_bytes);

   /// Helper to free a list of buffers to the system. Returns the number of bytes freed.
   int64_t FreeToSystem(std::vector<BufferHandle>&& buffers);

   /// Compute a sum over all arenas. Does not lock the arenas.
   int64_t SumOverArenas(std::function<int64_t(FreeBufferArena* arena)> compute_fn) const;

   /// The pool that this allocator is associated with.
   BufferPool* const pool_;

   /// System allocator that is ultimately used to allocate and free buffers.
   const boost::scoped_ptr<SystemAllocator> system_allocator_;

   /// The minimum power-of-two buffer length that can be allocated.
   const int64_t min_buffer_len_;

   /// The maximum power-of-two buffer length that can be allocated. Always >=
   /// 'min_buffer_len' so that there is at least one valid buffer size.
   const int64_t max_buffer_len_;

   /// The log2 of 'min_buffer_len_'.
   const int log_min_buffer_len_;

   /// The log2 of 'max_buffer_len_'.
   const int log_max_buffer_len_;

   /// The maximum physical memory in bytes that will be allocated from the system.
   const int64_t system_bytes_limit_;

   /// The remaining number of bytes of 'system_bytes_limit_' that can be used for
   /// allocating new buffers. Must be updated atomically before a new buffer is
   /// allocated or after an existing buffer is freed with the system allocator.
   AtomicInt64 system_bytes_remaining_;

   /// The maximum bytes of clean pages that can accumulate across all arenas before
   /// they will be evicted.
   const int64_t clean_page_bytes_limit_;

   /// The number of bytes of 'clean_page_bytes_limit_' not used by clean pages. I.e.
   /// (clean_page_bytes_limit - bytes of clean pages in the BufferAllocator).
   /// 'clean_pages_bytes_limit_' is enforced by increasing this value before a
   /// clean page is added and decreasing it after a clean page is reclaimed or evicted.
   AtomicInt64 clean_page_bytes_remaining_;

   /// Free and clean pages. One arena per core.
   std::vector<std::unique_ptr<FreeBufferArena>> per_core_arenas_;

   /// Default number of times to attempt scavenging.
   static const int MAX_SCAVENGE_ATTEMPTS = 3;

   /// Number of times to attempt scavenging. Usually MAX_SCAVENGE_ATTEMPTS but can be
   /// overridden by tests. The first max_scavenge_attempts_ - 1 attempts do not lock
   /// all arenas so may fail. The final attempt locks all arenas, which is expensive
   /// but is guaranteed to succeed.
   int max_scavenge_attempts_;
 };
 }

 #endif
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.

	#ifndef IMPALA_RUNTIME_BUFFER_ALLOCATOR_H
	#define IMPALA_RUNTIME_BUFFER_ALLOCATOR_H

	#include <boost/scoped_ptr.hpp>

	#include "runtime/bufferpool/buffer-pool-internal.h"
	#include "runtime/bufferpool/free-list.h"
	#include "util/aligned-new.h"

	namespace impala {

	/// The internal buffer allocator used by BufferPool to allocator power-of-two sized
	/// buffers. BufferAllocator builds on top of SystemAllocator by adding caching of
	/// free buffers and clean pages where the memory is not currently in use by a client
	/// but has not yet been released to SystemAllocator.
	///
	/// The allocator is optimised for the common case where an allocation can be served
	/// by reclaiming a buffer of the request size from the current core's arena. In this
	/// case there is no contention for locks between concurrently-running threads. If this
	/// fails, progressively more expensive approaches to allocate memory are tried until
	/// the allocation eventually success (see AllocateInternal() for details).
	///
	/// Buffer Reservations
	/// ===================
	/// The implementation of the BufferAllocator relies on the BufferPool's reservation
	/// tracking system. The allocator is given a hard limit ('system_bytes_limit'), above
	/// which all allocations will fail. Allocations up to 'system_bytes_limit' are
	/// guaranteed to succeed unless an unexpected system error occurs (e.g. we can't allocate
	/// all of the required memory from the OS). Reservations must be set up so that the total
	/// of all reservations does not exceed 'system_bytes_limit', thus ensuring that
	/// BufferAllocator can alway find memory to fulfill reservations.
	///
	/// +========================+
	/// \| IMPLEMENTATION NOTES \|
	/// +========================+
	///
	/// Memory
	/// ======
	/// Memory managed by BufferAllocator comes in four forms:
	/// 1. Buffers returned to the client (corresponding to a used reservation)
	/// 2. Free buffers cached in the BufferAllocator's free lists.
	/// 3. Buffers attached to clean unpinned pages in the BufferAllocator's clean page lists.
	/// 4. Bytes that are not allocated from the system: 'system_bytes_remaining_'.
	/// Together these always add up to 'system_bytes_limit', which allows BufferAllocator
	/// to always fulfill reservations via some combination of memory in forms 2, 3 or 4.
	///
	/// The BufferAllocator code is careful not to make memory inaccessible to concurrently
	/// executing threads that are entitled to it. E.g. if one thread is entitled to allocate
	/// a 1MB buffer from the BufferAllocator's free or clean page lists but needs to release
	/// a 2MB buffer to the system to free up enough memory, it must add 1MB to
	/// 'system_bytes_remaining_' in the same critical section in which it freed the 2MB
	/// buffer. Otherwise a concurrent thread that had a reservation for 1MB of memory might
	/// not be able to find it.
	///
	/// Arenas
	/// ======
	/// The buffer allocator's data structures are broken up into arenas, with an arena per
	/// core. Within each arena, each buffer or page is stored in a list with buffers and
	/// pages of the same size: there is a separate list for every power-of-two size. Each
	/// arena is protected by a separate lock, so in the common case where threads are able
	/// to fulfill allocations from their own arena, there will be no lock contention.
	///
	class BufferPool::BufferAllocator {
	public:
	BufferAllocator(BufferPool* pool, MetricGroup* metrics, int64_t min_buffer_len,
	int64_t system_bytes_limit, int64_t clean_page_bytes_limit);
	~BufferAllocator();

	/// Allocate a buffer with a power-of-two length 'len'. This function may acquire
	/// 'FreeBufferArena::lock_' and Page::lock so no locks lower in the lock acquisition
	/// order (see buffer-pool-internal.h) should be held by the caller.
	///
	/// Always succeeds on allocating memory up to 'system_bytes_limit', unless the system
	/// is unable to give us 'system_bytes_limit' of memory or an internal bug: if all
	/// clients write out enough dirty pages to stay within their reservation, then there
	/// should always be enough free buffers and clean pages to reclaim.
	Status Allocate(ClientHandle* client, int64_t len,
	BufferPool::BufferHandle* buffer) WARN_UNUSED_RESULT;

	/// Frees 'buffer', which must be open before calling. Closes 'buffer' and updates
	/// internal state but does not release to any reservation.
	void Free(BufferPool::BufferHandle&& buffer);

	/// Adds a clean page 'page' to a clean page list. Caller must hold the page's
	/// client's lock via 'client_lock' so that moving the page between the client list and
	/// the free page list is atomic. Caller must not hold 'FreeBufferArena::lock_' or any
	/// Page::lock.
	void AddCleanPage(const boost::unique_lock<boost::mutex>& client_lock, Page* page);

	/// Removes a clean page 'page' from a clean page list and returns true, if present in
	/// one of the lists. Returns true if it was present. If 'claim_buffer' is true, the
	/// caller must have reservation for the buffer, which is returned along with the page.
	/// Otherwise the buffer is moved directly to the free buffer list. Caller must hold
	/// the page's client's lock via 'client_lock' so that moving the page between the
	/// client list and the free page list is atomic. Caller must not hold
	/// 'FreeBufferArena::lock_' or any Page::lock.
	bool RemoveCleanPage(
	const boost::unique_lock<boost::mutex>& client_lock, bool claim_buffer, Page* page);

	/// Periodically called to release free buffers back to the SystemAllocator. Releases
	/// buffers based on recent allocation patterns, trying to minimise the number of
	/// excess buffers retained in each list above the minimum required to avoid going
	/// to the system allocator.
	void Maintenance();

	/// Try to release at least 'bytes_to_free' bytes of memory to the system allocator.
	void ReleaseMemory(int64_t bytes_to_free);

	int64_t system_bytes_limit() const { return system_bytes_limit_; }

	/// Return the amount of memory currently allocated from the system.
	int64_t GetSystemBytesAllocated() const {
	return system_bytes_limit_ - system_bytes_remaining_.Load();
	}

	/// Return the total number of free buffers in the allocator.
	int64_t GetNumFreeBuffers() const;

	/// Return the total bytes of free buffers in the allocator.
	int64_t GetFreeBufferBytes() const;

	/// Return the limit on bytes of clean pages in the allocator.
	int64_t GetCleanPageBytesLimit() const;

	/// Return the total number of clean pages in the allocator.
	int64_t GetNumCleanPages() const;

	/// Return the total bytes of clean pages in the allocator.
	int64_t GetCleanPageBytes() const;

	std::string DebugString();

	protected:
	friend class BufferAllocatorTest;
	friend class BufferPoolTest;
	friend class FreeBufferArena;

	/// Test helper: gets the current size of the free list for buffers of 'len' bytes
	/// on core 'core'.
	int GetFreeListSize(int core, int64_t len);

	/// Test helper: reduce the number of scavenge attempts so backend tests can force
	/// use of the "locked" scavenging code path.
	void set_max_scavenge_attempts(int val) {
	DCHECK_GE(val, 1);
	max_scavenge_attempts_ = val;
	}

	private:
	/// Compute the maximum power-of-two buffer length that could be allocated based on the
	/// amount of memory available 'system_bytes_limit'. The value is always at least
	/// 'min_buffer_len' so that there is at least one valid buffer size.
	static int64_t CalcMaxBufferLen(int64_t min_buffer_len, int64_t system_bytes_limit);

	/// Same as Allocate() but leaves 'buffer->client_' NULL and only updates the
	/// 'sys_alloc_time' and no other counters.
	Status AllocateInternal(BufferPool::Client* client, int64_t len,
	BufferPool::BufferHandle* buffer) WARN_UNUSED_RESULT;

	/// Tries to reclaim enough memory from various sources so that the caller can allocate
	/// a buffer of 'target_bytes' from the system allocator. Scavenges buffers from the
	/// free buffer and clean page lists of all cores and frees them with
	/// 'system_allocator_'. Also tries to decrement 'system_bytes_remaining_'.
	/// 'current_core' is the index of the current CPU core. Any bytes freed in excess of
	/// 'target_bytes' are added to 'system_bytes_remaining_.' If 'slow_but_sure' is true,
	/// this function uses a slower strategy that guarantees enough memory will be found
	/// but can block progress of other threads for longer. If 'slow_but_sure' is false,
	/// then this function optimistically tries to reclaim the memory but may not reclaim
	/// 'target_bytes' of memory. Returns the number of bytes reclaimed.
	int64_t ScavengeBuffers(bool slow_but_sure, int current_core, int64_t target_bytes);

	/// Helper to free a list of buffers to the system. Returns the number of bytes freed.
	int64_t FreeToSystem(std::vector<BufferHandle>&& buffers);

	/// Compute a sum over all arenas. Does not lock the arenas.
	int64_t SumOverArenas(std::function<int64_t(FreeBufferArena* arena)> compute_fn) const;

	/// The pool that this allocator is associated with.
	BufferPool* const pool_;

	/// System allocator that is ultimately used to allocate and free buffers.
	const boost::scoped_ptr<SystemAllocator> system_allocator_;

	/// The minimum power-of-two buffer length that can be allocated.
	const int64_t min_buffer_len_;

	/// The maximum power-of-two buffer length that can be allocated. Always >=
	/// 'min_buffer_len' so that there is at least one valid buffer size.
	const int64_t max_buffer_len_;

	/// The log2 of 'min_buffer_len_'.
	const int log_min_buffer_len_;

	/// The log2 of 'max_buffer_len_'.
	const int log_max_buffer_len_;

	/// The maximum physical memory in bytes that will be allocated from the system.
	const int64_t system_bytes_limit_;

	/// The remaining number of bytes of 'system_bytes_limit_' that can be used for
	/// allocating new buffers. Must be updated atomically before a new buffer is
	/// allocated or after an existing buffer is freed with the system allocator.
	AtomicInt64 system_bytes_remaining_;

	/// The maximum bytes of clean pages that can accumulate across all arenas before
	/// they will be evicted.
	const int64_t clean_page_bytes_limit_;

	/// The number of bytes of 'clean_page_bytes_limit_' not used by clean pages. I.e.
	/// (clean_page_bytes_limit - bytes of clean pages in the BufferAllocator).
	/// 'clean_pages_bytes_limit_' is enforced by increasing this value before a
	/// clean page is added and decreasing it after a clean page is reclaimed or evicted.
	AtomicInt64 clean_page_bytes_remaining_;

	/// Free and clean pages. One arena per core.
	std::vector<std::unique_ptr<FreeBufferArena>> per_core_arenas_;

	/// Default number of times to attempt scavenging.
	static const int MAX_SCAVENGE_ATTEMPTS = 3;

	/// Number of times to attempt scavenging. Usually MAX_SCAVENGE_ATTEMPTS but can be
	/// overridden by tests. The first max_scavenge_attempts_ - 1 attempts do not lock
	/// all arenas so may fail. The final attempt locks all arenas, which is expensive
	/// but is guaranteed to succeed.
	int max_scavenge_attempts_;
	};
	}

	#endif