blob: cbadc035f90727f3ee145c713863070df13c6f0f [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <math.h>
#include <random>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <utility>
#include <vector>
#include <boost/thread.hpp>
#include "common/object-pool.h"
#include "gutil/strings/substitute.h"
#include "runtime/bufferpool/free-list.h"
#include "runtime/bufferpool/system-allocator.h"
#include "util/aligned-new.h"
#include "util/benchmark.h"
#include "util/cpu-info.h"
#include "util/mem-range.h"
#include "util/stopwatch.h"
#include "common/names.h"
using namespace impala;
using boost::thread;
using boost::thread_group;
using std::mt19937_64;
using std::uniform_int_distribution;
using strings::Substitute;
// This benchmark tests free list performance under concurrency by measuring the
// throughput of a batch of Allocate()/Free() operations that use the free list
// to cache buffers. The benchmark measures total throughput across all threads.
// The baseline is the throughput of a single thread that is not competing for any
// resources.
//
// These Allocate() and Free() calls are interleaved with work that touches the
// allocated memory, so that the benchmark exhibits a realistic level of lock
// contention.
//
// The buffer size is varied from 64KB to 256KB (at 256KB, throughput is already limited
// by memory bandwidth instead of the free lists so there is little value in collecting
// more data points) and the number of iterations ("iters") over the allocated memory is
// varied.
//
// Summary of results:
// -------------------
// In the 0 iters case, which measures pure throughput of free list operations,
// ListPerThread has much higher throughput than SharedList because there is no
// contention over locks. NoList measures performance of the underlying system
// allocator (TCMalloc). TCMalloc performs poorly on large buffers compared to
// ListPerThread because it must frequently access the shared global heap and
// contend for a single global lock to serve large allocations.
//
// Performance of all three free list approaches (SharedList, ListPerThread, NoList)
// converges for larger buffer sizes and # of iterations. Performance per core also
// drops off as core count increases, regardless of whether the threads are contending
// over locks. This suggests that typical workloads that do some non-trivial processing
// over allocated buffers >= 64KB will usually be bottlenecked by memory access instead
// of the allocator. However, the balance may shift on larger machines with more cores
// as lock contention over a global mutex would become very expensive. Thus, sharing a
// free list between a small number of cores may work reasonably in practice, but sharing
// one between all cores will likely not scale at some point.
//
/*
Machine Info: Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz
FreeLists 0 iters on 64 kb:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
(relative) (relative) (relative)
---------------------------------------------------------------------------------------------------------
1 threads SharedList 24.6 25 25.3 1X 1X 1X
2 threads SharedList 9.05 9.53 10.6 0.368X 0.381X 0.42X
4 threads SharedList 7.96 8.18 8.41 0.324X 0.327X 0.332X
8 threads SharedList 6.37 6.5 6.72 0.259X 0.26X 0.265X
1 threads ListPerThread 24.4 25 25.3 0.993X 0.998X 0.999X
2 threads ListPerThread 33.9 35.6 36.6 1.38X 1.42X 1.45X
4 threads ListPerThread 40.5 45.9 50.4 1.65X 1.83X 1.99X
8 threads ListPerThread 38.1 41.9 44.5 1.55X 1.67X 1.76X
1 threads NoList 33.1 34.5 35.1 1.35X 1.38X 1.38X
2 threads NoList 42.5 43.9 45.2 1.73X 1.75X 1.79X
4 threads NoList 34.7 36.8 39.6 1.41X 1.47X 1.56X
8 threads NoList 25.3 27.6 29.6 1.03X 1.1X 1.17X
FreeLists 0 iters on 128 kb:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
(relative) (relative) (relative)
---------------------------------------------------------------------------------------------------------
1 threads SharedList 19.9 20.2 20.7 1X 1X 1X
2 threads SharedList 8.85 9.35 10.3 0.445X 0.463X 0.496X
4 threads SharedList 7.72 7.93 8.09 0.388X 0.393X 0.39X
8 threads SharedList 6.21 6.38 6.54 0.312X 0.316X 0.315X
1 threads ListPerThread 19.8 20.2 20.7 0.997X 1X 1X
2 threads ListPerThread 30 31.3 32.2 1.51X 1.55X 1.55X
4 threads ListPerThread 34.9 36.9 40.6 1.75X 1.83X 1.96X
8 threads ListPerThread 30.6 32.1 33.1 1.54X 1.59X 1.6X
1 threads NoList 20.6 21.8 22.3 1.04X 1.08X 1.08X
2 threads NoList 29.3 30.6 32 1.47X 1.51X 1.54X
4 threads NoList 22.2 23.1 23.8 1.11X 1.14X 1.15X
8 threads NoList 14 14.3 15 0.704X 0.71X 0.722X
FreeLists 0 iters on 256 kb:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
(relative) (relative) (relative)
---------------------------------------------------------------------------------------------------------
1 threads SharedList 15.5 16.1 16.4 1X 1X 1X
2 threads SharedList 8.46 8.92 9.46 0.545X 0.555X 0.578X
4 threads SharedList 7.31 7.48 7.69 0.471X 0.465X 0.47X
8 threads SharedList 5.91 6.04 6.21 0.381X 0.376X 0.379X
1 threads ListPerThread 15.5 16 16.5 0.999X 0.995X 1.01X
2 threads ListPerThread 24.9 26 27.1 1.6X 1.62X 1.66X
4 threads ListPerThread 24.8 26.9 29.6 1.6X 1.67X 1.81X
8 threads ListPerThread 18.4 21.9 23.1 1.18X 1.36X 1.41X
1 threads NoList 14.6 15.4 15.8 0.939X 0.956X 0.967X
2 threads NoList 17.5 18.8 19.2 1.12X 1.17X 1.17X
4 threads NoList 11.7 12.2 12.6 0.756X 0.757X 0.773X
8 threads NoList 6.42 6.61 6.82 0.414X 0.411X 0.417X
FreeLists 1 iters on 64 kb:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
(relative) (relative) (relative)
---------------------------------------------------------------------------------------------------------
1 threads SharedList 1.38 1.39 1.4 1X 1X 1X
2 threads SharedList 1.77 1.81 1.84 1.28X 1.3X 1.31X
4 threads SharedList 1.86 1.96 2.03 1.35X 1.41X 1.45X
8 threads SharedList 1.55 1.68 1.77 1.13X 1.21X 1.26X
1 threads ListPerThread 1.38 1.39 1.4 1X 1X 1X
2 threads ListPerThread 2.05 2.13 2.19 1.49X 1.53X 1.56X
4 threads ListPerThread 2.07 2.2 2.32 1.51X 1.59X 1.66X
8 threads ListPerThread 1.52 1.63 1.71 1.11X 1.17X 1.22X
1 threads NoList 1.04 1.05 1.05 0.754X 0.754X 0.748X
2 threads NoList 1.57 1.58 1.61 1.14X 1.14X 1.15X
4 threads NoList 1.9 2.09 2.17 1.38X 1.5X 1.55X
8 threads NoList 1.49 1.6 1.73 1.08X 1.16X 1.24X
FreeLists 1 iters on 128 kb:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
(relative) (relative) (relative)
---------------------------------------------------------------------------------------------------------
1 threads SharedList 0.557 0.561 0.561 1X 1X 1X
2 threads SharedList 0.75 0.765 0.772 1.35X 1.36X 1.38X
4 threads SharedList 0.759 0.793 0.833 1.36X 1.41X 1.48X
8 threads SharedList 0.661 0.685 0.714 1.19X 1.22X 1.27X
1 threads ListPerThread 0.557 0.561 0.561 1X 1X 1X
2 threads ListPerThread 0.817 0.833 0.85 1.47X 1.48X 1.51X
4 threads ListPerThread 0.802 0.842 0.885 1.44X 1.5X 1.58X
8 threads ListPerThread 0.621 0.667 0.697 1.12X 1.19X 1.24X
1 threads NoList 0.4 0.404 0.404 0.719X 0.719X 0.719X
2 threads NoList 0.596 0.607 0.621 1.07X 1.08X 1.11X
4 threads NoList 0.724 0.773 0.81 1.3X 1.38X 1.44X
8 threads NoList 0.561 0.619 0.65 1.01X 1.1X 1.16X
FreeLists 1 iters on 256 kb:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
(relative) (relative) (relative)
---------------------------------------------------------------------------------------------------------
1 threads SharedList 0.25 0.252 0.255 1X 1X 1X
2 threads SharedList 0.355 0.365 0.369 1.42X 1.45X 1.45X
4 threads SharedList 0.297 0.306 0.318 1.19X 1.21X 1.25X
8 threads SharedList 0.315 0.324 0.337 1.26X 1.28X 1.32X
1 threads ListPerThread 0.252 0.255 0.255 1.01X 1.01X 1X
2 threads ListPerThread 0.355 0.358 0.381 1.42X 1.42X 1.49X
4 threads ListPerThread 0.296 0.309 0.345 1.18X 1.22X 1.35X
8 threads ListPerThread 0.291 0.297 0.304 1.17X 1.18X 1.19X
1 threads NoList 0.143 0.143 0.143 0.571X 0.566X 0.56X
2 threads NoList 0.224 0.226 0.229 0.897X 0.897X 0.897X
4 threads NoList 0.275 0.283 0.294 1.1X 1.12X 1.15X
8 threads NoList 0.278 0.288 0.297 1.11X 1.14X 1.17X
FreeLists 2 iters on 64 kb:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
(relative) (relative) (relative)
---------------------------------------------------------------------------------------------------------
1 threads SharedList 0.772 0.779 0.779 1X 1X 1X
2 threads SharedList 1.17 1.18 1.19 1.51X 1.51X 1.53X
4 threads SharedList 1.36 1.5 1.65 1.77X 1.92X 2.12X
8 threads SharedList 1.4 1.46 1.51 1.81X 1.87X 1.94X
1 threads ListPerThread 0.772 0.779 0.779 1X 1X 1X
2 threads ListPerThread 1.27 1.29 1.3 1.65X 1.65X 1.67X
4 threads ListPerThread 1.6 1.73 1.87 2.08X 2.22X 2.4X
8 threads ListPerThread 1.48 1.54 1.59 1.92X 1.98X 2.04X
1 threads NoList 0.644 0.65 0.65 0.834X 0.835X 0.835X
2 threads NoList 1.09 1.1 1.11 1.41X 1.41X 1.43X
4 threads NoList 1.45 1.63 1.82 1.88X 2.1X 2.34X
8 threads NoList 1.43 1.5 1.54 1.85X 1.92X 1.97X
FreeLists 2 iters on 128 kb:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
(relative) (relative) (relative)
---------------------------------------------------------------------------------------------------------
1 threads SharedList 0.365 0.365 0.369 1X 1X 1X
2 threads SharedList 0.587 0.593 0.604 1.61X 1.62X 1.64X
4 threads SharedList 0.636 0.667 0.701 1.74X 1.82X 1.9X
8 threads SharedList 0.582 0.644 0.655 1.59X 1.76X 1.78X
1 threads ListPerThread 0.365 0.369 0.369 1X 1.01X 1X
2 threads ListPerThread 0.586 0.591 0.621 1.6X 1.62X 1.68X
4 threads ListPerThread 0.64 0.714 0.765 1.75X 1.95X 2.07X
8 threads ListPerThread 0.571 0.613 0.65 1.56X 1.68X 1.76X
1 threads NoList 0.294 0.297 0.297 0.805X 0.813X 0.805X
2 threads NoList 0.455 0.46 0.468 1.25X 1.26X 1.27X
4 threads NoList 0.55 0.62 0.67 1.51X 1.7X 1.82X
8 threads NoList 0.519 0.539 0.566 1.42X 1.48X 1.54X
FreeLists 2 iters on 256 kb:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
(relative) (relative) (relative)
---------------------------------------------------------------------------------------------------------
1 threads SharedList 0.15 0.15 0.151 1X 1X 1X
2 threads SharedList 0.226 0.233 0.235 1.51X 1.56X 1.56X
4 threads SharedList 0.263 0.275 0.286 1.76X 1.84X 1.89X
8 threads SharedList 0.26 0.288 0.306 1.74X 1.93X 2.03X
1 threads ListPerThread 0.15 0.15 0.151 1X 1X 1X
2 threads ListPerThread 0.245 0.248 0.27 1.64X 1.66X 1.79X
4 threads ListPerThread 0.238 0.263 0.278 1.59X 1.76X 1.84X
8 threads ListPerThread 0.22 0.231 0.24 1.47X 1.54X 1.59X
1 threads NoList 0.115 0.117 0.117 0.772X 0.779X 0.772X
2 threads NoList 0.182 0.185 0.187 1.22X 1.24X 1.24X
4 threads NoList 0.226 0.243 0.255 1.51X 1.63X 1.69X
8 threads NoList 0.238 0.25 0.262 1.59X 1.67X 1.73X
FreeLists 4 iters on 64 kb:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
(relative) (relative) (relative)
---------------------------------------------------------------------------------------------------------
1 threads SharedList 0.414 0.414 0.418 1X 1X 1X
2 threads SharedList 0.714 0.721 0.728 1.72X 1.74X 1.74X
4 threads SharedList 0.883 0.965 1.1 2.13X 2.33X 2.63X
8 threads SharedList 1.09 1.11 1.14 2.62X 2.68X 2.73X
1 threads ListPerThread 0.411 0.414 0.418 0.991X 1X 1X
2 threads ListPerThread 0.75 0.759 0.765 1.81X 1.83X 1.83X
4 threads ListPerThread 0.953 1.08 1.25 2.3X 2.61X 2.99X
8 threads ListPerThread 1.03 1.09 1.12 2.48X 2.62X 2.68X
1 threads NoList 0.381 0.385 0.426 0.919X 0.928X 1.02X
2 threads NoList 0.679 0.685 0.694 1.64X 1.65X 1.66X
4 threads NoList 0.924 1.05 1.18 2.23X 2.52X 2.81X
8 threads NoList 1.06 1.11 1.15 2.55X 2.68X 2.75X
FreeLists 4 iters on 128 kb:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
(relative) (relative) (relative)
---------------------------------------------------------------------------------------------------------
1 threads SharedList 0.214 0.216 0.216 1X 1X 1X
2 threads SharedList 0.352 0.355 0.355 1.65X 1.65X 1.65X
4 threads SharedList 0.362 0.46 0.519 1.69X 2.13X 2.41X
8 threads SharedList 0.442 0.464 0.481 2.07X 2.15X 2.23X
1 threads ListPerThread 0.214 0.216 0.216 1X 1X 1X
2 threads ListPerThread 0.352 0.355 0.358 1.65X 1.65X 1.66X
4 threads ListPerThread 0.376 0.5 0.545 1.76X 2.32X 2.52X
8 threads ListPerThread 0.4 0.426 0.447 1.87X 1.97X 2.07X
1 threads NoList 0.165 0.167 0.167 0.773X 0.773X 0.773X
2 threads NoList 0.301 0.304 0.306 1.41X 1.41X 1.42X
4 threads NoList 0.345 0.434 0.482 1.62X 2.01X 2.24X
8 threads NoList 0.404 0.434 0.451 1.89X 2.01X 2.09X
FreeLists 4 iters on 256 kb:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
(relative) (relative) (relative)
---------------------------------------------------------------------------------------------------------
1 threads SharedList 0.102 0.103 0.103 1X 1X 1X
2 threads SharedList 0.162 0.167 0.168 1.59X 1.62X 1.64X
4 threads SharedList 0.202 0.222 0.238 1.98X 2.16X 2.31X
8 threads SharedList 0.204 0.222 0.24 2X 2.16X 2.33X
1 threads ListPerThread 0.102 0.102 0.103 1X 0.991X 1X
2 threads ListPerThread 0.163 0.167 0.167 1.6X 1.62X 1.62X
4 threads ListPerThread 0.188 0.204 0.22 1.85X 1.98X 2.14X
8 threads ListPerThread 0.187 0.198 0.2 1.84X 1.93X 1.95X
1 threads NoList 0.0833 0.0833 0.0833 0.818X 0.811X 0.811X
2 threads NoList 0.137 0.14 0.14 1.35X 1.36X 1.36X
4 threads NoList 0.176 0.19 0.206 1.73X 1.85X 2X
8 threads NoList 0.176 0.208 0.218 1.73X 2.02X 2.12X
*/
enum class FreeListMode {
SHARED_LIST, // One list shared between all threads.
LIST_PER_THREAD, // One list per thread.
NO_LIST, // No list - just allocate and free directly.
};
struct BenchmarkParams {
/// Number of concurrent threads.
int num_threads;
/// How to set up the free lists.
FreeListMode free_list_mode;
/// Number of iterations of work to do between freelist operations.
int work_iterations;
/// Size of allocation to make.
int64_t allocation_size;
};
/// Ensure that locks don't share cache lines.
struct LockedList : public CacheLineAligned {
SpinLock lock;
FreeList list;
};
static const char* ModeString(FreeListMode mode) {
switch (mode) {
case FreeListMode::SHARED_LIST:
return "SharedList";
case FreeListMode::LIST_PER_THREAD:
return "ListPerThread";
case FreeListMode::NO_LIST:
return "NoList";
default:
return "Invalid";
}
}
static const int NUM_OPERATIONS_PER_BATCH = 512;
// Choose a list size large enough so that most allocs should hit in the free list
// but small enough that it will sometimes fill up.
static const int MAX_LIST_ENTRIES = 64;
static const int ALLOC_OP = 0;
static const int FREE_OP = 1;
// Not a static allocation because SystemAllocator's constructor uses gperftool's
// MallocExtension instance, which may not yet be defined due to constructor order
static SystemAllocator* allocator = nullptr;
// Simulate doing some work with the buffer.
void DoWork(uint8_t* data, int64_t len) {
// Touch all the data in the allocation. This is about the minimum amount of
// work we're likely to do with a buffer in a real workload.
memset(data, 1, len);
}
/// Make an allocation and do some work on it.
void DoAlloc(const BenchmarkParams& params, LockedList* free_list,
vector<BufferPool::BufferHandle>* buffers) {
BufferPool::BufferHandle buffer;
bool got_buffer = false;
if (free_list != nullptr) {
lock_guard<SpinLock> l(free_list->lock);
got_buffer = free_list->list.PopFreeBuffer(&buffer);
}
if (!got_buffer) {
Status status = allocator->Allocate(params.allocation_size, &buffer);
if (!status.ok()) LOG(FATAL) << "Failed alloc " << status.msg().msg();
}
// Do some processing to simulate a vaguely realistic work pattern.
for (int j = 0; j < params.work_iterations; ++j) {
DoWork(buffer.data(), buffer.len());
}
buffers->emplace_back(move(buffer));
}
/// Free an allocation.
void DoFree(const BenchmarkParams& params, LockedList* free_list,
vector<BufferPool::BufferHandle>* buffers) {
if (!buffers->empty()) {
if (free_list != nullptr) {
lock_guard<SpinLock> l(free_list->lock);
FreeList* list = &free_list->list;
list->AddFreeBuffer(move(buffers->back()));
if (list->Size() > MAX_LIST_ENTRIES) {
// Discard around 1/4 of the buffers to amortise the cost of sorting.
vector<BufferHandle> buffers =
list->GetBuffersToFree(list->Size() - MAX_LIST_ENTRIES * 3 / 4);
for (BufferHandle& buffer : buffers) allocator->Free(move(buffer));
}
} else {
allocator->Free(move(buffers->back()));
}
buffers->pop_back();
}
}
/// Execute 'num_operations' on 'free_list' according to the parameters in
/// 'params'.
void FreeListBenchmarkThread(int thread_id, int num_operations,
const BenchmarkParams& params, LockedList* free_list) {
mt19937_64 rng(thread_id);
vector<BufferPool::BufferHandle> buffers;
int ops_done = 0;
while (ops_done < num_operations) {
// Execute a number of the same operation in a row.
const int64_t op = uniform_int_distribution<int64_t>(0, 1)(rng);
const int num_ops = uniform_int_distribution<int64_t>(1, 10)(rng);
for (int i = 0; i < num_ops; ++i) {
if (op == ALLOC_OP) {
DoAlloc(params, free_list, &buffers);
} else {
DCHECK_EQ(op, FREE_OP);
DoFree(params, free_list, &buffers);
}
}
ops_done += num_ops;
}
for (BufferHandle& buffer : buffers) allocator->Free(move(buffer));
}
/// Execute the benchmark with the BenchmarkParams passed via 'data'.
void FreeListBenchmark(int batch_size, void* data) {
ObjectPool pool;
const BenchmarkParams& params = *static_cast<BenchmarkParams*>(data);
thread_group threads;
vector<LockedList*> free_lists;
if (params.free_list_mode == FreeListMode::SHARED_LIST) {
free_lists.push_back(pool.Add(new LockedList));
}
for (int i = 0; i < params.num_threads; ++i) {
if (params.free_list_mode == FreeListMode::LIST_PER_THREAD) {
free_lists.push_back(pool.Add(new LockedList));
}
// Divide operations between threads.
int ops_per_thread = batch_size * NUM_OPERATIONS_PER_BATCH / params.num_threads;
auto* free_list = free_lists.empty() ? nullptr : free_lists.back();
threads.add_thread(new thread(FreeListBenchmarkThread, i,
ops_per_thread, params, free_list));
}
threads.join_all();
// Empty out all of the free lists.
for (LockedList* free_list : free_lists) {
vector<BufferHandle> buffers =
free_list->list.GetBuffersToFree(free_list->list.Size());
for (BufferHandle& buffer : buffers) allocator->Free(move(buffer));
}
}
int main(int argc, char** argv) {
CpuInfo::Init();
cout << endl << Benchmark::GetMachineInfo() << endl << endl;
allocator = new SystemAllocator(64 * 1024);
ObjectPool pool;
for (int work_iterations : {0, 1, 2, 4}) {
// Don't test allocations beyond 256KB - by that point we are purely memory-bound.
for (int allocation_size_kb : {64, 128, 256}) {
Benchmark suite(Substitute("FreeLists $0 iters on $1 kb",
work_iterations, allocation_size_kb));
for (FreeListMode free_list_mode : {FreeListMode::SHARED_LIST,
FreeListMode::LIST_PER_THREAD, FreeListMode::NO_LIST}) {
// Vary concurrency up to the # of logical cores.
for (int num_threads : {1, 2, 4, CpuInfo::num_cores()}) {
BenchmarkParams* params = pool.Add(new BenchmarkParams());
params->num_threads = num_threads;
params->free_list_mode = free_list_mode;
params->work_iterations = work_iterations;
params->allocation_size = allocation_size_kb * 1024;
suite.AddBenchmark(
Substitute("$0 threads $1", num_threads, ModeString(free_list_mode)),
FreeListBenchmark, params);
}
}
// Increase benchmark time to get more accurate result.
cout << suite.Measure(100) << endl;
}
}
}