Fix IndexManager with many small segments
IxManager_Choose_Sparse would throw when handling more than about 40
segments because of the integer overflow test in S_fibonacci.
Also, the naive recursive implementation of S_fibonacci had abysmal
performance. After all, the result was created by adding only 0s and
1s. The number of calls to S_fibonacci when computing fib(n) was
fib(n+1), resulting in more than a billion of iterations with n ~ 45.
On my (heavily loaded and low-end) test machine, Choose_Sparse could
take up to 40 seconds to complete.
On the positive side, if anyone was hit by this performance problem,
it's likely they also saw the "index n too large" exception and we
only had two reports so far.
Use a precomputed lookup table and fix the test for large values of n.
Fixes LUCY-318.
diff --git a/core/Lucy/Index/IndexManager.c b/core/Lucy/Index/IndexManager.c
index 7823499..b20b270 100644
--- a/core/Lucy/Index/IndexManager.c
+++ b/core/Lucy/Index/IndexManager.c
@@ -33,6 +33,56 @@
#include <stdlib.h>
+static const int32_t S_fibonacci[47] = {
+ 0,
+ 1,
+ 1,
+ 2,
+ 3,
+ 5,
+ 8,
+ 13,
+ 21,
+ 34,
+ 55,
+ 89,
+ 144,
+ 233,
+ 377,
+ 610,
+ 987,
+ 1597,
+ 2584,
+ 4181,
+ 6765,
+ 10946,
+ 17711,
+ 28657,
+ 46368,
+ 75025,
+ 121393,
+ 196418,
+ 317811,
+ 514229,
+ 832040,
+ 1346269,
+ 2178309,
+ 3524578,
+ 5702887,
+ 9227465,
+ 14930352,
+ 24157817,
+ 39088169,
+ 63245986,
+ 102334155,
+ 165580141,
+ 267914296,
+ 433494437,
+ 701408733,
+ 1134903170,
+ 1836311903
+};
+
IndexManager*
IxManager_new(String *host, LockFactory *lock_factory) {
IndexManager *self = (IndexManager*)Class_Make_Obj(INDEXMANAGER);
@@ -116,21 +166,6 @@
return SegReader_Doc_Count(a) - SegReader_Doc_Count(b);
}
-static uint32_t
-S_fibonacci(uint32_t n) {
- uint32_t result = 0;
- if (n > 46) {
- THROW(ERR, "input %u32 too high", n);
- }
- else if (n < 2) {
- result = n;
- }
- else {
- result = S_fibonacci(n - 1) + S_fibonacci(n - 2);
- }
- return result;
-}
-
Vector*
IxManager_Recycle_IMP(IndexManager *self, PolyReader *reader,
DeletionsWriter *del_writer, int64_t cutoff,
@@ -199,7 +234,9 @@
for (uint32_t i = 0; i < num_candidates; i++) {
uint32_t num_segs_when_done = num_candidates - threshold + 1;
total_docs += I32Arr_Get(doc_counts, i);
- if (total_docs < S_fibonacci(num_segs_when_done + 5)) {
+ uint32_t n = num_segs_when_done + 5;
+ if (n >= sizeof(S_fibonacci) / sizeof(S_fibonacci[0])
+ || total_docs < S_fibonacci[n]) {
threshold = i + 1;
}
}
diff --git a/core/Lucy/Test/Index/TestIndexManager.c b/core/Lucy/Test/Index/TestIndexManager.c
index 87b1247..7bc0661 100644
--- a/core/Lucy/Test/Index/TestIndexManager.c
+++ b/core/Lucy/Test/Index/TestIndexManager.c
@@ -52,12 +52,23 @@
DECREF(doc_counts);
}
+ {
+ size_t num_segs = 1000;
+ I32Array *doc_counts = I32Arr_new_blank(num_segs);
+ for (size_t j = 0; j < num_segs; j++) {
+ I32Arr_Set(doc_counts, j, 1);
+ }
+ uint32_t threshold = IxManager_Choose_Sparse(manager, doc_counts);
+ TEST_TRUE(runner, threshold > 500, "Many small segments");
+ DECREF(doc_counts);
+ }
+
DECREF(manager);
}
void
TestIxManager_Run_IMP(TestIndexManager *self, TestBatchRunner *runner) {
- TestBatchRunner_Plan(runner, (TestBatch*)self, 34);
+ TestBatchRunner_Plan(runner, (TestBatch*)self, 35);
test_Choose_Sparse(runner);
}