blob: 8dc462528319e02c522a0234501dd28ab52ddfc8 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "runtime/bufferpool/system-allocator.h"
#include <sys/mman.h>
#include <gperftools/malloc_extension.h>
#include "gutil/strings/substitute.h"
#include "util/bit-util.h"
#include "common/names.h"
// TODO: IMPALA-5073: this should eventually become the default once we are confident
// that it is superior to allocating via TCMalloc.
// Controls which allocation path Allocate() takes: mmap() directly from the OS, or
// posix_memalign() via TCMalloc. Free() must mirror this choice when releasing.
DEFINE_bool(mmap_buffers, false,
"(Experimental) If true, allocate buffers directly from the operating system "
"instead of with TCMalloc.");
// When enabled, buffers whose length is a multiple of the 2MB huge page size are
// madvise()d with MADV_HUGEPAGE so Linux Transparent Huge Pages can back them.
DEFINE_bool(madvise_huge_pages, true,
"(Advanced) If true, advise operating system to back large memory buffers with huge "
"pages");
namespace impala {
/// These are the page sizes on x86-64. We could parse /proc/meminfo to programmatically
/// get this, but it is unlikely to change unless we port to a different architecture.
/// Declared const: these are fixed constants and must never be mutated at runtime.
static const int64_t SMALL_PAGE_SIZE = 4LL * 1024;
static const int64_t HUGE_PAGE_SIZE = 2LL * 1024 * 1024;
SystemAllocator::SystemAllocator(int64_t min_buffer_len)
  : min_buffer_len_(min_buffer_len) {
  DCHECK(BitUtil::IsPowerOf2(min_buffer_len));
#if !defined(ADDRESS_SANITIZER) && !defined(THREAD_SANITIZER)
  // Free() assumes that aggressive decommit is enabled for TCMalloc, so verify it at
  // construction time. Initialize to 0 and check GetNumericProperty()'s return value:
  // the original code ignored the return, so a failed lookup would have compared an
  // uninitialized value.
  size_t aggressive_decommit_enabled = 0;
  bool found = MallocExtension::instance()->GetNumericProperty(
      "tcmalloc.aggressive_memory_decommit", &aggressive_decommit_enabled);
  DCHECK(found) << "tcmalloc.aggressive_memory_decommit property not found";
  CHECK_EQ(true, aggressive_decommit_enabled);
#endif
}
// Allocate a power-of-two sized buffer of 'len' bytes and open 'buffer' over it.
// Dispatches to the mmap() or TCMalloc path based on --mmap_buffers.
Status SystemAllocator::Allocate(int64_t len, BufferPool::BufferHandle* buffer) {
  DCHECK_GE(len, min_buffer_len_);
  DCHECK_LE(len, BufferPool::MAX_BUFFER_BYTES);
  DCHECK(BitUtil::IsPowerOf2(len)) << len;

  uint8_t* buffer_mem = nullptr;
  const Status alloc_status = FLAGS_mmap_buffers ? AllocateViaMMap(len, &buffer_mem)
                                                 : AllocateViaMalloc(len, &buffer_mem);
  RETURN_IF_ERROR(alloc_status);
  buffer->Open(buffer_mem, len, CpuInfo::GetCurrentCore());
  return Status::OK();
}
// Allocate 'len' bytes directly from the OS with mmap(). If 'len' is a multiple of
// the 2MB huge page size and --madvise_huge_pages is enabled, the mapping is aligned
// to a huge page boundary and advised with MADV_HUGEPAGE. On success *buffer_mem
// points to the mapping; on failure returns BUFFER_ALLOCATION_FAILED.
Status SystemAllocator::AllocateViaMMap(int64_t len, uint8_t** buffer_mem) {
int64_t map_len = len;
bool use_huge_pages = len % HUGE_PAGE_SIZE == 0 && FLAGS_madvise_huge_pages;
if (use_huge_pages) {
// Map an extra huge page so we can fix up the alignment if needed.
map_len += HUGE_PAGE_SIZE;
}
uint8_t* mem = reinterpret_cast<uint8_t*>(
mmap(nullptr, map_len, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0));
if (mem == MAP_FAILED) {
const char* error = strerror(errno);
return Status(TErrorCode::BUFFER_ALLOCATION_FAILED, len, error);
}
if (use_huge_pages) {
// mmap() may return memory that is not aligned to the huge page size. For the
// subsequent madvise() call to work well, we need to align it ourselves and
// unmap the memory on either side of the buffer that we don't need.
uintptr_t misalignment = reinterpret_cast<uintptr_t>(mem) % HUGE_PAGE_SIZE;
if (misalignment != 0) {
// Trim the unaligned prefix so 'mem' lands on a huge page boundary.
// NOTE(review): munmap() return values here are unchecked — presumably failure is
// impossible for a range we just mapped; confirm this is intentional.
uintptr_t fixup = HUGE_PAGE_SIZE - misalignment;
munmap(mem, fixup);
mem += fixup;
map_len -= fixup;
}
// Unmap the unused tail past the buffer. After any prefix trim, map_len - len is the
// leftover portion of the extra huge page mapped above (always > 0 here).
munmap(mem + len, map_len - len);
DCHECK_EQ(reinterpret_cast<uintptr_t>(mem) % HUGE_PAGE_SIZE, 0)
<< std::hex << reinterpret_cast<uintptr_t>(mem);
// Mark the buffer as a candidate for promotion to huge pages. The Linux Transparent
// Huge Pages implementation will try to back the memory with a huge page if it is
// enabled. MADV_HUGEPAGE was introduced in 2.6.38, so we similarly need to skip this
// code if we are compiling against an older kernel.
#ifdef MADV_HUGEPAGE
int rc;
// According to madvise() docs it may return EAGAIN to signal that we should retry.
do {
rc = madvise(mem, len, MADV_HUGEPAGE);
} while (rc == -1 && errno == EAGAIN);
DCHECK(rc == 0) << "madvise(MADV_HUGEPAGE) shouldn't fail" << errno;
#endif
}
*buffer_mem = mem;
return Status::OK();
}
// Allocate 'len' bytes via posix_memalign() (served by TCMalloc). The allocation is
// aligned to the page size expected to back it — the 2MB huge page size when 'len'
// is a huge-page multiple and --madvise_huge_pages is set, otherwise the 4KB small
// page size — so the buffer can be backed by whole pages. On success *buffer_mem
// points to the allocation; on failure returns BUFFER_ALLOCATION_FAILED.
Status SystemAllocator::AllocateViaMalloc(int64_t len, uint8_t** buffer_mem) {
  bool use_huge_pages = len % HUGE_PAGE_SIZE == 0 && FLAGS_madvise_huge_pages;
  // Allocate, aligned to the page size that we expect to back the memory range.
  // This ensures that it can be backed by whole pages, rather than parts of pages.
  size_t alignment = use_huge_pages ? HUGE_PAGE_SIZE : SMALL_PAGE_SIZE;
  int rc = posix_memalign(reinterpret_cast<void**>(buffer_mem), alignment, len);
#ifdef ADDRESS_SANITIZER
  // Workaround ASAN bug where posix_memalign returns 0 even when allocation fails.
  // It should instead return ENOMEM. See https://bugs.llvm.org/show_bug.cgi?id=32968.
  if (rc == 0 && *buffer_mem == nullptr && len != 0) rc = ENOMEM;
#endif
  if (rc != 0) {
    return Status(TErrorCode::BUFFER_ALLOCATION_FAILED, len,
        Substitute("posix_memalign() failed to allocate buffer: $0", GetStrErrMsg()));
  }
  if (use_huge_pages) {
#ifdef MADV_HUGEPAGE
    // According to madvise() docs it may return EAGAIN to signal that we should retry.
    do {
      rc = madvise(*buffer_mem, len, MADV_HUGEPAGE);
    } while (rc == -1 && errno == EAGAIN);
    // DCHECK_EQ for consistency with the rest of the file, and an explicit separator
    // so the errno value isn't fused onto the message text (was "...fail22").
    DCHECK_EQ(rc, 0) << "madvise(MADV_HUGEPAGE) shouldn't fail, errno=" << errno;
#endif
  }
  return Status::OK();
}
// Release the memory owned by 'buffer'. Must undo whichever allocation path was used:
// munmap() when --mmap_buffers is set, otherwise free(). Resets the handle so its
// destructor does not DCHECK on a still-open buffer.
void SystemAllocator::Free(BufferPool::BufferHandle&& buffer) {
if (FLAGS_mmap_buffers) {
int rc = munmap(buffer.data(), buffer.len());
DCHECK_EQ(rc, 0) << "Unexpected munmap() error: " << errno;
} else {
// Mirror the condition used at allocation time to decide whether MADV_HUGEPAGE was
// applied to this buffer.
bool use_huge_pages = buffer.len() % HUGE_PAGE_SIZE == 0 && FLAGS_madvise_huge_pages;
if (use_huge_pages) {
// Undo the madvise so that is isn't a candidate to be newly backed by huge pages.
// We depend on TCMalloc's "aggressive decommit" mode decommitting the physical
// huge pages with madvise(DONTNEED) when we call free(). Otherwise, this huge
// page region may be divvied up and subsequently decommitted in smaller chunks,
// which may not actually release the physical memory, causing Impala physical
// memory usage to exceed the process limit.
#ifdef MADV_NOHUGEPAGE
// According to madvise() docs it may return EAGAIN to signal that we should retry.
int rc;
do {
rc = madvise(buffer.data(), buffer.len(), MADV_NOHUGEPAGE);
} while (rc == -1 && errno == EAGAIN);
DCHECK(rc == 0) << "madvise(MADV_NOHUGEPAGE) shouldn't fail" << errno;
#endif
}
free(buffer.data());
}
buffer.Reset(); // Avoid DCHECK in ~BufferHandle().
}
}