blob: 8dc462528319e02c522a0234501dd28ab52ddfc8 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "runtime/bufferpool/system-allocator.h"
#include <sys/mman.h>
#include <gperftools/malloc_extension.h>
#include "gutil/strings/substitute.h"
#include "util/bit-util.h"
#include "common/names.h"
// TODO: IMPALA-5073: this should eventually become the default once we are confident
// that it is superior to allocating via TCMalloc.
// Controls which allocation path Allocate() takes: mmap() directly from the OS, or
// posix_memalign() via TCMalloc. Free() must mirror this choice when releasing.
DEFINE_bool(mmap_buffers, false,
"(Experimental) If true, allocate buffers directly from the operating system "
"instead of with TCMalloc.");
// When enabled, buffers whose length is a multiple of the 2MB huge page size are
// madvise()d with MADV_HUGEPAGE so Linux Transparent Huge Pages can back them.
DEFINE_bool(madvise_huge_pages, true,
"(Advanced) If true, advise operating system to back large memory buffers with huge "
"pages");
namespace impala {
/// These are the page sizes on x86-64. We could parse /proc/meminfo to programmatically
/// get this, but it is unlikely to change unless we port to a different architecture.
/// Declared const: these are fixed constants and must never be mutated at runtime.
static const int64_t SMALL_PAGE_SIZE = 4LL * 1024;
static const int64_t HUGE_PAGE_SIZE = 2LL * 1024 * 1024;
SystemAllocator::SystemAllocator(int64_t min_buffer_len)
  : min_buffer_len_(min_buffer_len) {
  DCHECK(BitUtil::IsPowerOf2(min_buffer_len));
#if !defined(ADDRESS_SANITIZER) && !defined(THREAD_SANITIZER)
  // Free() assumes that aggressive decommit is enabled for TCMalloc, so verify it at
  // construction time. Initialize to 0 and check GetNumericProperty()'s return value:
  // the original code ignored the return, so a failed lookup would have compared an
  // uninitialized value.
  size_t aggressive_decommit_enabled = 0;
  bool found = MallocExtension::instance()->GetNumericProperty(
      "tcmalloc.aggressive_memory_decommit", &aggressive_decommit_enabled);
  DCHECK(found) << "tcmalloc.aggressive_memory_decommit property not found";
  CHECK_EQ(true, aggressive_decommit_enabled);
#endif
}
// Allocate a power-of-two sized buffer of 'len' bytes and open 'buffer' over it.
// Dispatches to the mmap() or TCMalloc path based on --mmap_buffers.
Status SystemAllocator::Allocate(int64_t len, BufferPool::BufferHandle* buffer) {
  DCHECK_GE(len, min_buffer_len_);
  DCHECK_LE(len, BufferPool::MAX_BUFFER_BYTES);
  DCHECK(BitUtil::IsPowerOf2(len)) << len;

  uint8_t* buffer_mem = nullptr;
  const Status alloc_status = FLAGS_mmap_buffers ? AllocateViaMMap(len, &buffer_mem)
                                                 : AllocateViaMalloc(len, &buffer_mem);
  RETURN_IF_ERROR(alloc_status);
  buffer->Open(buffer_mem, len, CpuInfo::GetCurrentCore());
  return Status::OK();
}
// Allocate 'len' bytes directly from the OS with mmap(). If 'len' is a multiple of
// the 2MB huge page size and --madvise_huge_pages is enabled, the mapping is aligned
// to a huge page boundary and advised with MADV_HUGEPAGE. On success *buffer_mem
// points to the mapping; on failure returns BUFFER_ALLOCATION_FAILED.
Status SystemAllocator::AllocateViaMMap(int64_t len, uint8_t** buffer_mem) {
int64_t map_len = len;
bool use_huge_pages = len % HUGE_PAGE_SIZE == 0 && FLAGS_madvise_huge_pages;
if (use_huge_pages) {
// Map an extra huge page so we can fix up the alignment if needed.
map_len += HUGE_PAGE_SIZE;
}
uint8_t* mem = reinterpret_cast<uint8_t*>(
mmap(nullptr, map_len, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0));
if (mem == MAP_FAILED) {
const char* error = strerror(errno);
return Status(TErrorCode::BUFFER_ALLOCATION_FAILED, len, error);
}
if (use_huge_pages) {
// mmap() may return memory that is not aligned to the huge page size. For the
// subsequent madvise() call to work well, we need to align it ourselves and
// unmap the memory on either side of the buffer that we don't need.
uintptr_t misalignment = reinterpret_cast<uintptr_t>(mem) % HUGE_PAGE_SIZE;
if (misalignment != 0) {
// Trim the unaligned prefix so 'mem' lands on a huge page boundary.
// NOTE(review): munmap() return values here are unchecked — presumably failure is
// impossible for a range we just mapped; confirm this is intentional.
uintptr_t fixup = HUGE_PAGE_SIZE - misalignment;
munmap(mem, fixup);
mem += fixup;
map_len -= fixup;
}
// Unmap the unused tail past the buffer. After any prefix trim, map_len - len is the
// leftover portion of the extra huge page mapped above (always > 0 here).
munmap(mem + len, map_len - len);
DCHECK_EQ(reinterpret_cast<uintptr_t>(mem) % HUGE_PAGE_SIZE, 0)
<< std::hex << reinterpret_cast<uintptr_t>(mem);
// Mark the buffer as a candidate for promotion to huge pages. The Linux Transparent
// Huge Pages implementation will try to back the memory with a huge page if it is
// enabled. MADV_HUGEPAGE was introduced in 2.6.38, so we similarly need to skip this
// code if we are compiling against an older kernel.
#ifdef MADV_HUGEPAGE
int rc;
// According to madvise() docs it may return EAGAIN to signal that we should retry.
do {
rc = madvise(mem, len, MADV_HUGEPAGE);
} while (rc == -1 && errno == EAGAIN);
DCHECK(rc == 0) << "madvise(MADV_HUGEPAGE) shouldn't fail" << errno;
#endif
}
*buffer_mem = mem;
return Status::OK();
}
// Allocate 'len' bytes via posix_memalign() (served by TCMalloc). The allocation is
// aligned to the page size expected to back it — the 2MB huge page size when 'len'
// is a huge-page multiple and --madvise_huge_pages is set, otherwise the 4KB small
// page size — so the buffer can be backed by whole pages. On success *buffer_mem
// points to the allocation; on failure returns BUFFER_ALLOCATION_FAILED.
Status SystemAllocator::AllocateViaMalloc(int64_t len, uint8_t** buffer_mem) {
  bool use_huge_pages = len % HUGE_PAGE_SIZE == 0 && FLAGS_madvise_huge_pages;
  // Allocate, aligned to the page size that we expect to back the memory range.
  // This ensures that it can be backed by whole pages, rather than parts of pages.
  size_t alignment = use_huge_pages ? HUGE_PAGE_SIZE : SMALL_PAGE_SIZE;
  int rc = posix_memalign(reinterpret_cast<void**>(buffer_mem), alignment, len);
#ifdef ADDRESS_SANITIZER
  // Workaround ASAN bug where posix_memalign returns 0 even when allocation fails.
  // It should instead return ENOMEM. See https://bugs.llvm.org/show_bug.cgi?id=32968.
  if (rc == 0 && *buffer_mem == nullptr && len != 0) rc = ENOMEM;
#endif
  if (rc != 0) {
    return Status(TErrorCode::BUFFER_ALLOCATION_FAILED, len,
        Substitute("posix_memalign() failed to allocate buffer: $0", GetStrErrMsg()));
  }
  if (use_huge_pages) {
#ifdef MADV_HUGEPAGE
    // According to madvise() docs it may return EAGAIN to signal that we should retry.
    do {
      rc = madvise(*buffer_mem, len, MADV_HUGEPAGE);
    } while (rc == -1 && errno == EAGAIN);
    // DCHECK_EQ for consistency with the rest of the file, and an explicit separator
    // so the errno value isn't fused onto the message text (was "...fail22").
    DCHECK_EQ(rc, 0) << "madvise(MADV_HUGEPAGE) shouldn't fail, errno=" << errno;
#endif
  }
  return Status::OK();
}
// Release the memory owned by 'buffer'. Must undo whichever allocation path was used:
// munmap() when --mmap_buffers is set, otherwise free(). Resets the handle so its
// destructor does not DCHECK on a still-open buffer.
void SystemAllocator::Free(BufferPool::BufferHandle&& buffer) {
if (FLAGS_mmap_buffers) {
int rc = munmap(buffer.data(), buffer.len());
DCHECK_EQ(rc, 0) << "Unexpected munmap() error: " << errno;
} else {
// Mirror the condition used at allocation time to decide whether MADV_HUGEPAGE was
// applied to this buffer.
bool use_huge_pages = buffer.len() % HUGE_PAGE_SIZE == 0 && FLAGS_madvise_huge_pages;
if (use_huge_pages) {
// Undo the madvise so that is isn't a candidate to be newly backed by huge pages.
// We depend on TCMalloc's "aggressive decommit" mode decommitting the physical
// huge pages with madvise(DONTNEED) when we call free(). Otherwise, this huge
// page region may be divvied up and subsequently decommitted in smaller chunks,
// which may not actually release the physical memory, causing Impala physical
// memory usage to exceed the process limit.
#ifdef MADV_NOHUGEPAGE
// According to madvise() docs it may return EAGAIN to signal that we should retry.
int rc;
do {
rc = madvise(buffer.data(), buffer.len(), MADV_NOHUGEPAGE);
} while (rc == -1 && errno == EAGAIN);
DCHECK(rc == 0) << "madvise(MADV_NOHUGEPAGE) shouldn't fail" << errno;
#endif
}
free(buffer.data());
}
buffer.Reset(); // Avoid DCHECK in ~BufferHandle().
}
}