blob: 10dc7a8215b48c2c2021ab8407c856570ace9b50 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.index.sasi.disk;
import java.io.File;
import java.nio.ByteBuffer;
import java.util.*;
import java.util.concurrent.ThreadLocalRandom;
import java.util.stream.Collectors;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.cql3.Operator;
import org.apache.cassandra.db.BufferDecoratedKey;
import org.apache.cassandra.db.DecoratedKey;
import org.apache.cassandra.dht.Murmur3Partitioner;
import org.apache.cassandra.index.sasi.plan.Expression;
import org.apache.cassandra.index.sasi.utils.CombinedTerm;
import org.apache.cassandra.index.sasi.utils.CombinedTermIterator;
import org.apache.cassandra.index.sasi.utils.OnDiskIndexIterator;
import org.apache.cassandra.index.sasi.utils.RangeIterator;
import org.apache.cassandra.db.marshal.AbstractType;
import org.apache.cassandra.db.marshal.Int32Type;
import org.apache.cassandra.db.marshal.LongType;
import org.apache.cassandra.db.marshal.UTF8Type;
import org.apache.cassandra.io.util.DataOutputBuffer;
import org.apache.cassandra.utils.MurmurHash;
import org.apache.cassandra.utils.Pair;
import com.carrotsearch.hppc.LongSet;
import com.carrotsearch.hppc.cursors.LongCursor;
import com.google.common.base.Function;
import com.google.common.collect.Iterators;
import com.google.common.collect.Sets;
import junit.framework.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
public class OnDiskIndexTest
{
@BeforeClass
public static void setupDD()
{
DatabaseDescriptor.daemonInitialization();
}
@Test
public void testStringSAConstruction() throws Exception
{
Map<ByteBuffer, TokenTreeBuilder> data = new HashMap<ByteBuffer, TokenTreeBuilder>()
{{
put(UTF8Type.instance.decompose("scat"), keyBuilder(1L));
put(UTF8Type.instance.decompose("mat"), keyBuilder(2L));
put(UTF8Type.instance.decompose("fat"), keyBuilder(3L));
put(UTF8Type.instance.decompose("cat"), keyBuilder(1L, 4L));
put(UTF8Type.instance.decompose("till"), keyBuilder(2L, 6L));
put(UTF8Type.instance.decompose("bill"), keyBuilder(5L));
put(UTF8Type.instance.decompose("foo"), keyBuilder(7L));
put(UTF8Type.instance.decompose("bar"), keyBuilder(9L, 10L));
put(UTF8Type.instance.decompose("michael"), keyBuilder(11L, 12L, 1L));
put(UTF8Type.instance.decompose("am"), keyBuilder(15L));
}};
OnDiskIndexBuilder builder = new OnDiskIndexBuilder(UTF8Type.instance, UTF8Type.instance, OnDiskIndexBuilder.Mode.CONTAINS);
for (Map.Entry<ByteBuffer, TokenTreeBuilder> e : data.entrySet())
addAll(builder, e.getKey(), e.getValue());
File index = File.createTempFile("on-disk-sa-string", "db");
index.deleteOnExit();
builder.finish(index);
OnDiskIndex onDisk = new OnDiskIndex(index, UTF8Type.instance, new KeyConverter());
// first check if we can find exact matches
for (Map.Entry<ByteBuffer, TokenTreeBuilder> e : data.entrySet())
{
if (UTF8Type.instance.getString(e.getKey()).equals("cat"))
continue; // cat is embedded into scat, we'll test it in next section
Assert.assertEquals("Key was: " + UTF8Type.instance.compose(e.getKey()), convert(e.getValue()), convert(onDisk.search(expressionFor(UTF8Type.instance, e.getKey()))));
}
// check that cat returns positions for scat & cat
Assert.assertEquals(convert(1, 4), convert(onDisk.search(expressionFor("cat"))));
// random suffix queries
Assert.assertEquals(convert(9, 10), convert(onDisk.search(expressionFor("ar"))));
Assert.assertEquals(convert(1, 2, 3, 4), convert(onDisk.search(expressionFor("at"))));
Assert.assertEquals(convert(1, 11, 12), convert(onDisk.search(expressionFor("mic"))));
Assert.assertEquals(convert(1, 11, 12), convert(onDisk.search(expressionFor("ae"))));
Assert.assertEquals(convert(2, 5, 6), convert(onDisk.search(expressionFor("ll"))));
Assert.assertEquals(convert(1, 2, 5, 6, 11, 12), convert(onDisk.search(expressionFor("l"))));
Assert.assertEquals(convert(7), convert(onDisk.search(expressionFor("oo"))));
Assert.assertEquals(convert(7), convert(onDisk.search(expressionFor("o"))));
Assert.assertEquals(convert(1, 2, 3, 4, 6), convert(onDisk.search(expressionFor("t"))));
Assert.assertEquals(convert(1, 2, 11, 12), convert(onDisk.search(expressionFor("m", Operator.LIKE_PREFIX))));
Assert.assertEquals(Collections.<DecoratedKey>emptySet(), convert(onDisk.search(expressionFor("hello"))));
onDisk.close();
}
@Test
public void testIntegerSAConstruction() throws Exception
{
final Map<ByteBuffer, TokenTreeBuilder> data = new HashMap<ByteBuffer, TokenTreeBuilder>()
{{
put(Int32Type.instance.decompose(5), keyBuilder(1L));
put(Int32Type.instance.decompose(7), keyBuilder(2L));
put(Int32Type.instance.decompose(1), keyBuilder(3L));
put(Int32Type.instance.decompose(3), keyBuilder(1L, 4L));
put(Int32Type.instance.decompose(8), keyBuilder(2L, 6L));
put(Int32Type.instance.decompose(10), keyBuilder(5L));
put(Int32Type.instance.decompose(6), keyBuilder(7L));
put(Int32Type.instance.decompose(4), keyBuilder(9L, 10L));
put(Int32Type.instance.decompose(0), keyBuilder(11L, 12L, 1L));
}};
OnDiskIndexBuilder builder = new OnDiskIndexBuilder(UTF8Type.instance, Int32Type.instance, OnDiskIndexBuilder.Mode.PREFIX);
for (Map.Entry<ByteBuffer, TokenTreeBuilder> e : data.entrySet())
addAll(builder, e.getKey(), e.getValue());
File index = File.createTempFile("on-disk-sa-int", "db");
index.deleteOnExit();
builder.finish(index);
OnDiskIndex onDisk = new OnDiskIndex(index, Int32Type.instance, new KeyConverter());
for (Map.Entry<ByteBuffer, TokenTreeBuilder> e : data.entrySet())
{
Assert.assertEquals(convert(e.getValue()), convert(onDisk.search(expressionFor(Operator.EQ, Int32Type.instance, e.getKey()))));
}
List<ByteBuffer> sortedNumbers = new ArrayList<ByteBuffer>()
{{
addAll(data.keySet().stream().collect(Collectors.toList()));
}};
Collections.sort(sortedNumbers, Int32Type.instance::compare);
// test full iteration
int idx = 0;
for (OnDiskIndex.DataTerm term : onDisk)
{
ByteBuffer number = sortedNumbers.get(idx++);
Assert.assertEquals(number, term.getTerm());
Assert.assertEquals(convert(data.get(number)), convert(term.getTokens()));
}
// test partial iteration (descending)
idx = 3; // start from the 3rd element
Iterator<OnDiskIndex.DataTerm> partialIter = onDisk.iteratorAt(sortedNumbers.get(idx), OnDiskIndex.IteratorOrder.DESC, true);
while (partialIter.hasNext())
{
OnDiskIndex.DataTerm term = partialIter.next();
ByteBuffer number = sortedNumbers.get(idx++);
Assert.assertEquals(number, term.getTerm());
Assert.assertEquals(convert(data.get(number)), convert(term.getTokens()));
}
idx = 3; // start from the 3rd element exclusive
partialIter = onDisk.iteratorAt(sortedNumbers.get(idx++), OnDiskIndex.IteratorOrder.DESC, false);
while (partialIter.hasNext())
{
OnDiskIndex.DataTerm term = partialIter.next();
ByteBuffer number = sortedNumbers.get(idx++);
Assert.assertEquals(number, term.getTerm());
Assert.assertEquals(convert(data.get(number)), convert(term.getTokens()));
}
// test partial iteration (ascending)
idx = 6; // start from the 6rd element
partialIter = onDisk.iteratorAt(sortedNumbers.get(idx), OnDiskIndex.IteratorOrder.ASC, true);
while (partialIter.hasNext())
{
OnDiskIndex.DataTerm term = partialIter.next();
ByteBuffer number = sortedNumbers.get(idx--);
Assert.assertEquals(number, term.getTerm());
Assert.assertEquals(convert(data.get(number)), convert(term.getTokens()));
}
idx = 6; // start from the 6rd element exclusive
partialIter = onDisk.iteratorAt(sortedNumbers.get(idx--), OnDiskIndex.IteratorOrder.ASC, false);
while (partialIter.hasNext())
{
OnDiskIndex.DataTerm term = partialIter.next();
ByteBuffer number = sortedNumbers.get(idx--);
Assert.assertEquals(number, term.getTerm());
Assert.assertEquals(convert(data.get(number)), convert(term.getTokens()));
}
onDisk.close();
List<ByteBuffer> iterCheckNums = new ArrayList<ByteBuffer>()
{{
add(Int32Type.instance.decompose(3));
add(Int32Type.instance.decompose(9));
add(Int32Type.instance.decompose(14));
add(Int32Type.instance.decompose(42));
}};
OnDiskIndexBuilder iterTest = new OnDiskIndexBuilder(UTF8Type.instance, Int32Type.instance, OnDiskIndexBuilder.Mode.PREFIX);
for (int i = 0; i < iterCheckNums.size(); i++)
iterTest.add(iterCheckNums.get(i), keyAt((long) i), i);
File iterIndex = File.createTempFile("sa-iter", ".db");
iterIndex.deleteOnExit();
iterTest.finish(iterIndex);
onDisk = new OnDiskIndex(iterIndex, Int32Type.instance, new KeyConverter());
ByteBuffer number = Int32Type.instance.decompose(1);
Assert.assertEquals(0, Iterators.size(onDisk.iteratorAt(number, OnDiskIndex.IteratorOrder.ASC, false)));
Assert.assertEquals(0, Iterators.size(onDisk.iteratorAt(number, OnDiskIndex.IteratorOrder.ASC, true)));
Assert.assertEquals(4, Iterators.size(onDisk.iteratorAt(number, OnDiskIndex.IteratorOrder.DESC, false)));
Assert.assertEquals(4, Iterators.size(onDisk.iteratorAt(number, OnDiskIndex.IteratorOrder.DESC, true)));
number = Int32Type.instance.decompose(44);
Assert.assertEquals(4, Iterators.size(onDisk.iteratorAt(number, OnDiskIndex.IteratorOrder.ASC, false)));
Assert.assertEquals(4, Iterators.size(onDisk.iteratorAt(number, OnDiskIndex.IteratorOrder.ASC, true)));
Assert.assertEquals(0, Iterators.size(onDisk.iteratorAt(number, OnDiskIndex.IteratorOrder.DESC, false)));
Assert.assertEquals(0, Iterators.size(onDisk.iteratorAt(number, OnDiskIndex.IteratorOrder.DESC, true)));
number = Int32Type.instance.decompose(20);
Assert.assertEquals(3, Iterators.size(onDisk.iteratorAt(number, OnDiskIndex.IteratorOrder.ASC, false)));
Assert.assertEquals(3, Iterators.size(onDisk.iteratorAt(number, OnDiskIndex.IteratorOrder.ASC, true)));
Assert.assertEquals(1, Iterators.size(onDisk.iteratorAt(number, OnDiskIndex.IteratorOrder.DESC, false)));
Assert.assertEquals(1, Iterators.size(onDisk.iteratorAt(number, OnDiskIndex.IteratorOrder.DESC, true)));
number = Int32Type.instance.decompose(5);
Assert.assertEquals(1, Iterators.size(onDisk.iteratorAt(number, OnDiskIndex.IteratorOrder.ASC, false)));
Assert.assertEquals(1, Iterators.size(onDisk.iteratorAt(number, OnDiskIndex.IteratorOrder.ASC, true)));
Assert.assertEquals(3, Iterators.size(onDisk.iteratorAt(number, OnDiskIndex.IteratorOrder.DESC, false)));
Assert.assertEquals(3, Iterators.size(onDisk.iteratorAt(number, OnDiskIndex.IteratorOrder.DESC, true)));
number = Int32Type.instance.decompose(10);
Assert.assertEquals(2, Iterators.size(onDisk.iteratorAt(number, OnDiskIndex.IteratorOrder.ASC, false)));
Assert.assertEquals(2, Iterators.size(onDisk.iteratorAt(number, OnDiskIndex.IteratorOrder.ASC, true)));
Assert.assertEquals(2, Iterators.size(onDisk.iteratorAt(number, OnDiskIndex.IteratorOrder.DESC, false)));
Assert.assertEquals(2, Iterators.size(onDisk.iteratorAt(number, OnDiskIndex.IteratorOrder.DESC, true)));
onDisk.close();
}
@Test
public void testMultiSuffixMatches() throws Exception
{
OnDiskIndexBuilder builder = new OnDiskIndexBuilder(UTF8Type.instance, UTF8Type.instance, OnDiskIndexBuilder.Mode.CONTAINS)
{{
addAll(this, UTF8Type.instance.decompose("Eliza"), keyBuilder(1L, 2L));
addAll(this, UTF8Type.instance.decompose("Elizabeth"), keyBuilder(3L, 4L));
addAll(this, UTF8Type.instance.decompose("Aliza"), keyBuilder(5L, 6L));
addAll(this, UTF8Type.instance.decompose("Taylor"), keyBuilder(7L, 8L));
addAll(this, UTF8Type.instance.decompose("Pavel"), keyBuilder(9L, 10L));
}};
File index = File.createTempFile("on-disk-sa-multi-suffix-match", ".db");
index.deleteOnExit();
builder.finish(index);
OnDiskIndex onDisk = new OnDiskIndex(index, UTF8Type.instance, new KeyConverter());
Assert.assertEquals(convert(1, 2, 3, 4, 5, 6), convert(onDisk.search(expressionFor("liz"))));
Assert.assertEquals(convert(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), convert(onDisk.search(expressionFor("a"))));
Assert.assertEquals(convert(5, 6), convert(onDisk.search(expressionFor("A"))));
Assert.assertEquals(convert(1, 2, 3, 4), convert(onDisk.search(expressionFor("E"))));
Assert.assertEquals(convert(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), convert(onDisk.search(expressionFor("l"))));
Assert.assertEquals(convert(3, 4), convert(onDisk.search(expressionFor("bet"))));
Assert.assertEquals(convert(3, 4, 9, 10), convert(onDisk.search(expressionFor("e"))));
Assert.assertEquals(convert(7, 8), convert(onDisk.search(expressionFor("yl"))));
Assert.assertEquals(convert(7, 8), convert(onDisk.search(expressionFor("T"))));
Assert.assertEquals(convert(1, 2, 3, 4, 5, 6), convert(onDisk.search(expressionFor("za"))));
Assert.assertEquals(convert(3, 4), convert(onDisk.search(expressionFor("ab"))));
Assert.assertEquals(Collections.<DecoratedKey>emptySet(), convert(onDisk.search(expressionFor("Pi"))));
Assert.assertEquals(Collections.<DecoratedKey>emptySet(), convert(onDisk.search(expressionFor("ethz"))));
Assert.assertEquals(Collections.<DecoratedKey>emptySet(), convert(onDisk.search(expressionFor("liw"))));
Assert.assertEquals(Collections.<DecoratedKey>emptySet(), convert(onDisk.search(expressionFor("Taw"))));
Assert.assertEquals(Collections.<DecoratedKey>emptySet(), convert(onDisk.search(expressionFor("Av"))));
onDisk.close();
}
@Test
public void testSparseMode() throws Exception
{
OnDiskIndexBuilder builder = new OnDiskIndexBuilder(UTF8Type.instance, LongType.instance, OnDiskIndexBuilder.Mode.SPARSE);
final long start = System.currentTimeMillis();
final int numIterations = 100000;
for (long i = 0; i < numIterations; i++)
builder.add(LongType.instance.decompose(start + i), keyAt(i), i);
File index = File.createTempFile("on-disk-sa-sparse", "db");
index.deleteOnExit();
builder.finish(index);
OnDiskIndex onDisk = new OnDiskIndex(index, LongType.instance, new KeyConverter());
ThreadLocalRandom random = ThreadLocalRandom.current();
for (long step = start; step < (start + numIterations); step += 1000)
{
boolean lowerInclusive = random.nextBoolean();
boolean upperInclusive = random.nextBoolean();
long limit = random.nextLong(step, start + numIterations);
RangeIterator<Long, Token> rows = onDisk.search(expressionFor(step, lowerInclusive, limit, upperInclusive));
long lowerKey = step - start;
long upperKey = lowerKey + (limit - step);
if (!lowerInclusive)
lowerKey += 1;
if (upperInclusive)
upperKey += 1;
Set<DecoratedKey> actual = convert(rows);
for (long key = lowerKey; key < upperKey; key++)
Assert.assertTrue("key" + key + " wasn't found", actual.contains(keyAt(key)));
Assert.assertEquals((upperKey - lowerKey), actual.size());
}
// let's also explicitly test whole range search
RangeIterator<Long, Token> rows = onDisk.search(expressionFor(start, true, start + numIterations, true));
Set<DecoratedKey> actual = convert(rows);
Assert.assertEquals(numIterations, actual.size());
}
@Test
public void testNotEqualsQueryForStrings() throws Exception
{
Map<ByteBuffer, TokenTreeBuilder> data = new HashMap<ByteBuffer, TokenTreeBuilder>()
{{
put(UTF8Type.instance.decompose("Pavel"), keyBuilder(1L, 2L));
put(UTF8Type.instance.decompose("Jason"), keyBuilder(3L));
put(UTF8Type.instance.decompose("Jordan"), keyBuilder(4L));
put(UTF8Type.instance.decompose("Michael"), keyBuilder(5L, 6L));
put(UTF8Type.instance.decompose("Vijay"), keyBuilder(7L));
put(UTF8Type.instance.decompose("Travis"), keyBuilder(8L));
put(UTF8Type.instance.decompose("Aleksey"), keyBuilder(9L, 10L));
}};
OnDiskIndexBuilder builder = new OnDiskIndexBuilder(UTF8Type.instance, UTF8Type.instance, OnDiskIndexBuilder.Mode.PREFIX);
for (Map.Entry<ByteBuffer, TokenTreeBuilder> e : data.entrySet())
addAll(builder, e.getKey(), e.getValue());
File index = File.createTempFile("on-disk-sa-except-test", "db");
index.deleteOnExit();
builder.finish(index);
OnDiskIndex onDisk = new OnDiskIndex(index, UTF8Type.instance, new KeyConverter());
// test whole words first
Assert.assertEquals(convert(3, 4, 5, 6, 7, 8, 9, 10), convert(onDisk.search(expressionForNot("Aleksey", "Vijay", "Pavel"))));
Assert.assertEquals(convert(3, 4, 7, 8, 9, 10), convert(onDisk.search(expressionForNot("Aleksey", "Vijay", "Pavel", "Michael"))));
Assert.assertEquals(convert(3, 4, 7, 9, 10), convert(onDisk.search(expressionForNot("Aleksey", "Vijay", "Pavel", "Michael", "Travis"))));
// now test prefixes
Assert.assertEquals(convert(3, 4, 5, 6, 7, 8, 9, 10), convert(onDisk.search(expressionForNot("Aleksey", "Vijay", "Pav"))));
Assert.assertEquals(convert(3, 4, 7, 8, 9, 10), convert(onDisk.search(expressionForNot("Aleksey", "Vijay", "Pavel", "Mic"))));
Assert.assertEquals(convert(3, 4, 7, 9, 10), convert(onDisk.search(expressionForNot("Aleksey", "Vijay", "Pavel", "Micha", "Tr"))));
onDisk.close();
}
@Test
public void testNotEqualsQueryForNumbers() throws Exception
{
final Map<ByteBuffer, TokenTreeBuilder> data = new HashMap<ByteBuffer, TokenTreeBuilder>()
{{
put(Int32Type.instance.decompose(5), keyBuilder(1L));
put(Int32Type.instance.decompose(7), keyBuilder(2L));
put(Int32Type.instance.decompose(1), keyBuilder(3L));
put(Int32Type.instance.decompose(3), keyBuilder(1L, 4L));
put(Int32Type.instance.decompose(8), keyBuilder(8L, 6L));
put(Int32Type.instance.decompose(10), keyBuilder(5L));
put(Int32Type.instance.decompose(6), keyBuilder(7L));
put(Int32Type.instance.decompose(4), keyBuilder(9L, 10L));
put(Int32Type.instance.decompose(0), keyBuilder(11L, 12L, 1L));
}};
OnDiskIndexBuilder builder = new OnDiskIndexBuilder(UTF8Type.instance, Int32Type.instance, OnDiskIndexBuilder.Mode.PREFIX);
for (Map.Entry<ByteBuffer, TokenTreeBuilder> e : data.entrySet())
addAll(builder, e.getKey(), e.getValue());
File index = File.createTempFile("on-disk-sa-except-int-test", "db");
index.deleteOnExit();
builder.finish(index);
OnDiskIndex onDisk = new OnDiskIndex(index, Int32Type.instance, new KeyConverter());
Assert.assertEquals(convert(1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12), convert(onDisk.search(expressionForNot(0, 10, 1))));
Assert.assertEquals(convert(1, 2, 4, 5, 7, 9, 10, 11, 12), convert(onDisk.search(expressionForNot(0, 10, 1, 8))));
Assert.assertEquals(convert(1, 2, 4, 5, 7, 11, 12), convert(onDisk.search(expressionForNot(0, 10, 1, 8, 4))));
onDisk.close();
}
@Test
public void testRangeQueryWithExclusions() throws Exception
{
final long lower = 0;
final long upper = 100000;
OnDiskIndexBuilder builder = new OnDiskIndexBuilder(UTF8Type.instance, LongType.instance, OnDiskIndexBuilder.Mode.SPARSE);
for (long i = lower; i <= upper; i++)
builder.add(LongType.instance.decompose(i), keyAt(i), i);
File index = File.createTempFile("on-disk-sa-except-long-ranges", "db");
index.deleteOnExit();
builder.finish(index);
OnDiskIndex onDisk = new OnDiskIndex(index, LongType.instance, new KeyConverter());
ThreadLocalRandom random = ThreadLocalRandom.current();
// single exclusion
// let's do small range first to figure out if searchPoint works properly
validateExclusions(onDisk, lower, 50, Sets.newHashSet(42L));
// now let's do whole data set to test SPARSE searching
validateExclusions(onDisk, lower, upper, Sets.newHashSet(31337L));
// pair of exclusions which would generate a split
validateExclusions(onDisk, lower, random.nextInt(400, 800), Sets.newHashSet(42L, 154L));
validateExclusions(onDisk, lower, upper, Sets.newHashSet(31337L, 54631L));
// 3 exclusions which would generate a split and change bounds
validateExclusions(onDisk, lower, random.nextInt(400, 800), Sets.newHashSet(42L, 154L));
validateExclusions(onDisk, lower, upper, Sets.newHashSet(31337L, 54631L));
validateExclusions(onDisk, lower, random.nextLong(400, upper), Sets.newHashSet(42L, 55L));
validateExclusions(onDisk, lower, random.nextLong(400, upper), Sets.newHashSet(42L, 55L, 93L));
validateExclusions(onDisk, lower, random.nextLong(400, upper), Sets.newHashSet(42L, 55L, 93L, 205L));
Set<Long> exclusions = Sets.newHashSet(3L, 12L, 13L, 14L, 27L, 54L, 81L, 125L, 384L, 771L, 1054L, 2048L, 78834L);
// test that exclusions are properly bound by lower/upper of the expression
Assert.assertEquals(392, validateExclusions(onDisk, lower, 400, exclusions, false));
Assert.assertEquals(101, validateExclusions(onDisk, lower, 100, Sets.newHashSet(-10L, -5L, -1L), false));
validateExclusions(onDisk, lower, upper, exclusions);
Assert.assertEquals(100000, convert(onDisk.search(new Expression("", LongType.instance)
.add(Operator.NEQ, LongType.instance.decompose(100L)))).size());
Assert.assertEquals(49, convert(onDisk.search(new Expression("", LongType.instance)
.add(Operator.LT, LongType.instance.decompose(50L))
.add(Operator.NEQ, LongType.instance.decompose(10L)))).size());
Assert.assertEquals(99998, convert(onDisk.search(new Expression("", LongType.instance)
.add(Operator.GT, LongType.instance.decompose(1L))
.add(Operator.NEQ, LongType.instance.decompose(20L)))).size());
onDisk.close();
}
private void validateExclusions(OnDiskIndex sa, long lower, long upper, Set<Long> exclusions)
{
validateExclusions(sa, lower, upper, exclusions, true);
}
private int validateExclusions(OnDiskIndex sa, long lower, long upper, Set<Long> exclusions, boolean checkCount)
{
int count = 0;
for (DecoratedKey key : convert(sa.search(rangeWithExclusions(lower, true, upper, true, exclusions))))
{
String keyId = UTF8Type.instance.getString(key.getKey()).split("key")[1];
Assert.assertFalse("key" + keyId + " is present.", exclusions.contains(Long.valueOf(keyId)));
count++;
}
if (checkCount)
Assert.assertEquals(upper - (lower == 0 ? -1 : lower) - exclusions.size(), count);
return count;
}
@Test
public void testDescriptor() throws Exception
{
final Map<ByteBuffer, Pair<DecoratedKey, Long>> data = new HashMap<ByteBuffer, Pair<DecoratedKey, Long>>()
{{
put(Int32Type.instance.decompose(5), Pair.create(keyAt(1L), 1L));
}};
OnDiskIndexBuilder builder1 = new OnDiskIndexBuilder(UTF8Type.instance, Int32Type.instance, OnDiskIndexBuilder.Mode.PREFIX);
OnDiskIndexBuilder builder2 = new OnDiskIndexBuilder(UTF8Type.instance, Int32Type.instance, OnDiskIndexBuilder.Mode.PREFIX);
for (Map.Entry<ByteBuffer, Pair<DecoratedKey, Long>> e : data.entrySet())
{
DecoratedKey key = e.getValue().left;
Long position = e.getValue().right;
builder1.add(e.getKey(), key, position);
builder2.add(e.getKey(), key, position);
}
File index1 = File.createTempFile("on-disk-sa-int", "db");
File index2 = File.createTempFile("on-disk-sa-int2", "db");
index1.deleteOnExit();
index2.deleteOnExit();
builder1.finish(index1);
builder2.finish(new Descriptor(Descriptor.VERSION_AA), index2);
OnDiskIndex onDisk1 = new OnDiskIndex(index1, Int32Type.instance, new KeyConverter());
OnDiskIndex onDisk2 = new OnDiskIndex(index2, Int32Type.instance, new KeyConverter());
ByteBuffer number = Int32Type.instance.decompose(5);
Assert.assertEquals(Collections.singleton(data.get(number).left), convert(onDisk1.search(expressionFor(Operator.EQ, Int32Type.instance, number))));
Assert.assertEquals(Collections.singleton(data.get(number).left), convert(onDisk2.search(expressionFor(Operator.EQ, Int32Type.instance, number))));
Assert.assertEquals(onDisk1.descriptor.version.version, Descriptor.CURRENT_VERSION);
Assert.assertEquals(onDisk2.descriptor.version.version, Descriptor.VERSION_AA);
}
@Test
public void testSuperBlocks() throws Exception
{
Map<ByteBuffer, TokenTreeBuilder> terms = new HashMap<>();
terms.put(UTF8Type.instance.decompose("1234"), keyBuilder(1L, 2L));
terms.put(UTF8Type.instance.decompose("2345"), keyBuilder(3L, 4L));
terms.put(UTF8Type.instance.decompose("3456"), keyBuilder(5L, 6L));
terms.put(UTF8Type.instance.decompose("4567"), keyBuilder(7L, 8L));
terms.put(UTF8Type.instance.decompose("5678"), keyBuilder(9L, 10L));
OnDiskIndexBuilder builder = new OnDiskIndexBuilder(UTF8Type.instance, Int32Type.instance, OnDiskIndexBuilder.Mode.SPARSE);
for (Map.Entry<ByteBuffer, TokenTreeBuilder> entry : terms.entrySet())
addAll(builder, entry.getKey(), entry.getValue());
File index = File.createTempFile("on-disk-sa-try-superblocks", ".db");
index.deleteOnExit();
builder.finish(index);
OnDiskIndex onDisk = new OnDiskIndex(index, Int32Type.instance, new KeyConverter());
OnDiskIndex.OnDiskSuperBlock superBlock = onDisk.dataLevel.getSuperBlock(0);
Iterator<Token> iter = superBlock.iterator();
Long lastToken = null;
while (iter.hasNext())
{
Token token = iter.next();
if (lastToken != null)
Assert.assertTrue(lastToken.compareTo(token.get()) < 0);
lastToken = token.get();
}
}
@Test
public void testSuperBlockRetrieval() throws Exception
{
OnDiskIndexBuilder builder = new OnDiskIndexBuilder(UTF8Type.instance, LongType.instance, OnDiskIndexBuilder.Mode.SPARSE);
for (long i = 0; i < 100000; i++)
builder.add(LongType.instance.decompose(i), keyAt(i), i);
File index = File.createTempFile("on-disk-sa-multi-superblock-match", ".db");
index.deleteOnExit();
builder.finish(index);
OnDiskIndex onDiskIndex = new OnDiskIndex(index, LongType.instance, new KeyConverter());
testSearchRangeWithSuperBlocks(onDiskIndex, 0, 500);
testSearchRangeWithSuperBlocks(onDiskIndex, 300, 93456);
testSearchRangeWithSuperBlocks(onDiskIndex, 210, 1700);
testSearchRangeWithSuperBlocks(onDiskIndex, 530, 3200);
Random random = new Random(0xdeadbeef);
for (int i = 0; i < 100000; i += random.nextInt(1500)) // random steps with max of 1500 elements
{
for (int j = 0; j < 3; j++)
testSearchRangeWithSuperBlocks(onDiskIndex, i, ThreadLocalRandom.current().nextInt(i, 100000));
}
}
public void putAll(SortedMap<Long, LongSet> offsets, TokenTreeBuilder ttb)
{
for (Pair<Long, LongSet> entry : ttb)
offsets.put(entry.left, entry.right);
}
@Test
public void testCombiningOfThePartitionedSA() throws Exception
{
OnDiskIndexBuilder builderA = new OnDiskIndexBuilder(UTF8Type.instance, LongType.instance, OnDiskIndexBuilder.Mode.PREFIX);
OnDiskIndexBuilder builderB = new OnDiskIndexBuilder(UTF8Type.instance, LongType.instance, OnDiskIndexBuilder.Mode.PREFIX);
TreeMap<Long, TreeMap<Long, LongSet>> expected = new TreeMap<>();
for (long i = 0; i <= 100; i++)
{
TreeMap<Long, LongSet> offsets = expected.get(i);
if (offsets == null)
expected.put(i, (offsets = new TreeMap<>()));
builderA.add(LongType.instance.decompose(i), keyAt(i), i);
putAll(offsets, keyBuilder(i));
}
for (long i = 50; i < 100; i++)
{
TreeMap<Long, LongSet> offsets = expected.get(i);
if (offsets == null)
expected.put(i, (offsets = new TreeMap<>()));
long position = 100L + i;
builderB.add(LongType.instance.decompose(i), keyAt(position), position);
putAll(offsets, keyBuilder(100L + i));
}
File indexA = File.createTempFile("on-disk-sa-partition-a", ".db");
indexA.deleteOnExit();
File indexB = File.createTempFile("on-disk-sa-partition-b", ".db");
indexB.deleteOnExit();
builderA.finish(indexA);
builderB.finish(indexB);
OnDiskIndex a = new OnDiskIndex(indexA, LongType.instance, new KeyConverter());
OnDiskIndex b = new OnDiskIndex(indexB, LongType.instance, new KeyConverter());
RangeIterator<OnDiskIndex.DataTerm, CombinedTerm> union = OnDiskIndexIterator.union(a, b);
TreeMap<Long, TreeMap<Long, LongSet>> actual = new TreeMap<>();
while (union.hasNext())
{
CombinedTerm term = union.next();
Long composedTerm = LongType.instance.compose(term.getTerm());
TreeMap<Long, LongSet> offsets = actual.get(composedTerm);
if (offsets == null)
actual.put(composedTerm, (offsets = new TreeMap<>()));
putAll(offsets, term.getTokenTreeBuilder());
}
Assert.assertEquals(actual, expected);
File indexC = File.createTempFile("on-disk-sa-partition-final", ".db");
indexC.deleteOnExit();
OnDiskIndexBuilder combined = new OnDiskIndexBuilder(UTF8Type.instance, LongType.instance, OnDiskIndexBuilder.Mode.PREFIX);
combined.finish(Pair.create(keyAt(0).getKey(), keyAt(100).getKey()), indexC, new CombinedTermIterator(a, b));
OnDiskIndex c = new OnDiskIndex(indexC, LongType.instance, new KeyConverter());
union = OnDiskIndexIterator.union(c);
actual.clear();
while (union.hasNext())
{
CombinedTerm term = union.next();
Long composedTerm = LongType.instance.compose(term.getTerm());
TreeMap<Long, LongSet> offsets = actual.get(composedTerm);
if (offsets == null)
actual.put(composedTerm, (offsets = new TreeMap<>()));
putAll(offsets, term.getTokenTreeBuilder());
}
Assert.assertEquals(actual, expected);
a.close();
b.close();
}
@Test
public void testPrefixSearchWithCONTAINSMode() throws Exception
{
Map<ByteBuffer, TokenTreeBuilder> data = new HashMap<ByteBuffer, TokenTreeBuilder>()
{{
put(UTF8Type.instance.decompose("lady gaga"), keyBuilder(1L));
// Partial term for 'lady of bells'
DataOutputBuffer ladyOfBellsBuffer = new DataOutputBuffer();
ladyOfBellsBuffer.writeShort(UTF8Type.instance.decompose("lady of bells").remaining() | (1 << OnDiskIndexBuilder.IS_PARTIAL_BIT));
ladyOfBellsBuffer.write(UTF8Type.instance.decompose("lady of bells"));
put(ladyOfBellsBuffer.asNewBuffer(), keyBuilder(2L));
put(UTF8Type.instance.decompose("lady pank"), keyBuilder(3L));
}};
OnDiskIndexBuilder builder = new OnDiskIndexBuilder(UTF8Type.instance, UTF8Type.instance, OnDiskIndexBuilder.Mode.CONTAINS);
for (Map.Entry<ByteBuffer, TokenTreeBuilder> e : data.entrySet())
addAll(builder, e.getKey(), e.getValue());
File index = File.createTempFile("on-disk-sa-prefix-contains-search", "db");
index.deleteOnExit();
builder.finish(index);
OnDiskIndex onDisk = new OnDiskIndex(index, UTF8Type.instance, new KeyConverter());
// check that lady% return lady gaga (1) and lady pank (3) but not lady of bells(2)
Assert.assertEquals(convert(1, 3), convert(onDisk.search(expressionFor("lady", Operator.LIKE_PREFIX))));
onDisk.close();
}
private void testSearchRangeWithSuperBlocks(OnDiskIndex onDiskIndex, long start, long end)
{
RangeIterator<Long, Token> tokens = onDiskIndex.search(expressionFor(start, true, end, false));
// no results should be produced only if range is empty
if (tokens == null)
{
Assert.assertEquals(0, end - start);
return;
}
int keyCount = 0;
Long lastToken = null;
while (tokens.hasNext())
{
Token token = tokens.next();
Iterator<DecoratedKey> keys = token.iterator();
// each of the values should have exactly a single key
Assert.assertTrue(keys.hasNext());
keys.next();
Assert.assertFalse(keys.hasNext());
// and it's last should always smaller than current
if (lastToken != null)
Assert.assertTrue("last should be less than current", lastToken.compareTo(token.get()) < 0);
lastToken = token.get();
keyCount++;
}
Assert.assertEquals(end - start, keyCount);
}
private static DecoratedKey keyAt(long rawKey)
{
ByteBuffer key = ByteBuffer.wrap(("key" + rawKey).getBytes());
return new BufferDecoratedKey(new Murmur3Partitioner.LongToken(MurmurHash.hash2_64(key, key.position(), key.remaining(), 0)), key);
}
private static TokenTreeBuilder keyBuilder(Long... keys)
{
TokenTreeBuilder builder = new DynamicTokenTreeBuilder();
for (final Long key : keys)
{
DecoratedKey dk = keyAt(key);
builder.add((Long) dk.getToken().getTokenValue(), key);
}
return builder.finish();
}
private static Set<DecoratedKey> convert(TokenTreeBuilder offsets)
{
Set<DecoratedKey> result = new HashSet<>();
Iterator<Pair<Long, LongSet>> offsetIter = offsets.iterator();
while (offsetIter.hasNext())
{
LongSet v = offsetIter.next().right;
for (LongCursor offset : v)
result.add(keyAt(offset.value));
}
return result;
}
private static Set<DecoratedKey> convert(long... keyOffsets)
{
Set<DecoratedKey> result = new HashSet<>();
for (long offset : keyOffsets)
result.add(keyAt(offset));
return result;
}
private static Set<DecoratedKey> convert(RangeIterator<Long, Token> results)
{
if (results == null)
return Collections.emptySet();
Set<DecoratedKey> keys = new TreeSet<>(DecoratedKey.comparator);
while (results.hasNext())
{
for (DecoratedKey key : results.next())
keys.add(key);
}
return keys;
}
private static Expression expressionFor(long lower, boolean lowerInclusive, long upper, boolean upperInclusive)
{
Expression expression = new Expression("", LongType.instance);
expression.add(lowerInclusive ? Operator.GTE : Operator.GT, LongType.instance.decompose(lower));
expression.add(upperInclusive ? Operator.LTE : Operator.LT, LongType.instance.decompose(upper));
return expression;
}
private static Expression expressionFor(AbstractType<?> validator, ByteBuffer term)
{
return expressionFor(Operator.LIKE_CONTAINS, validator, term);
}
private static Expression expressionFor(Operator op, AbstractType<?> validator, ByteBuffer term)
{
Expression expression = new Expression("", validator);
expression.add(op, term);
return expression;
}
private static Expression expressionForNot(AbstractType<?> validator, ByteBuffer lower, ByteBuffer upper, Iterable<ByteBuffer> terms)
{
Expression expression = new Expression("", validator);
expression.setOp(Expression.Op.RANGE);
expression.setLower(new Expression.Bound(lower, true));
expression.setUpper(new Expression.Bound(upper, true));
for (ByteBuffer term : terms)
expression.add(Operator.NEQ, term);
return expression;
}
private static Expression expressionForNot(Integer lower, Integer upper, Integer... terms)
{
return expressionForNot(Int32Type.instance,
Int32Type.instance.decompose(lower),
Int32Type.instance.decompose(upper),
Arrays.asList(terms).stream().map(Int32Type.instance::decompose).collect(Collectors.toList()));
}
private static Expression rangeWithExclusions(long lower, boolean lowerInclusive, long upper, boolean upperInclusive, Set<Long> exclusions)
{
Expression expression = expressionFor(lower, lowerInclusive, upper, upperInclusive);
for (long e : exclusions)
expression.add(Operator.NEQ, LongType.instance.decompose(e));
return expression;
}
private static Expression expressionForNot(String lower, String upper, String... terms)
{
return expressionForNot(UTF8Type.instance,
UTF8Type.instance.decompose(lower),
UTF8Type.instance.decompose(upper),
Arrays.asList(terms).stream().map(UTF8Type.instance::decompose).collect(Collectors.toList()));
}
private static Expression expressionFor(String term)
{
return expressionFor(term, Operator.LIKE_CONTAINS);
}
private static Expression expressionFor(String term, Operator op)
{
return expressionFor(op, UTF8Type.instance, UTF8Type.instance.decompose(term));
}
private static void addAll(OnDiskIndexBuilder builder, ByteBuffer term, TokenTreeBuilder tokens)
{
for (Pair<Long, LongSet> token : tokens)
{
for (long position : token.right.toArray())
builder.add(term, keyAt(position), position);
}
}
private static class KeyConverter implements Function<Long, DecoratedKey>
{
@Override
public DecoratedKey apply(Long offset)
{
return keyAt(offset);
}
}
}