/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.cassandra.index.sasi.disk;

import java.io.File;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.*;
import org.apache.cassandra.db.DecoratedKey;
import org.apache.cassandra.index.sasi.plan.Expression.Op;
import org.apache.cassandra.index.sasi.sa.IndexedTerm;
import org.apache.cassandra.index.sasi.sa.IntegralSA;
import org.apache.cassandra.index.sasi.sa.SA;
import org.apache.cassandra.index.sasi.sa.TermIterator;
import org.apache.cassandra.index.sasi.sa.SuffixSA;
import org.apache.cassandra.db.marshal.*;
import org.apache.cassandra.io.FSWriteError;
import org.apache.cassandra.io.util.*;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.cassandra.utils.FBUtilities;
import org.apache.cassandra.utils.Pair;
import com.carrotsearch.hppc.LongArrayList;
import com.carrotsearch.hppc.LongSet;
import com.carrotsearch.hppc.ShortArrayList;
import com.google.common.annotations.VisibleForTesting;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

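/**
 * Builds a SASI on-disk index: terms added via {@link #add(ByteBuffer, DecoratedKey, long)} are
 * accumulated in memory together with their token trees, then {@code finish(...)} sorts them
 * (as suffixes for CONTAINS mode on text types, otherwise as whole terms) and writes a
 * block-aligned file made up of a header block, data blocks, and one or more pointer levels.
 *
 * <p>Illustrative usage (a sketch; the comparators, variables and path below are placeholders):
 * <pre>{@code
 * OnDiskIndexBuilder builder = new OnDiskIndexBuilder(UTF8Type.instance, UTF8Type.instance, Mode.PREFIX);
 * builder.add(termBuffer, decoratedKey, keyPosition); // hypothetical inputs
 * builder.finish(new File("/tmp/sasi-index.db"));     // hypothetical output path
 * }</pre>
 */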
public class OnDiskIndexBuilder
{
private static final Logger logger = LoggerFactory.getLogger(OnDiskIndexBuilder.class);
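/**
 * Index mode, determining which query operations the index can serve. CONTAINS additionally
 * splits text terms into suffixes (see {@code finish}); SPARSE is meant for terms that each map
 * to only a few keys (see {@code MutableDataBlock#MAX_KEYS_SPARSE}) and supports only EQ,
 * NOT_EQ and RANGE.
 */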
public enum Mode
{
PREFIX(EnumSet.of(Op.EQ, Op.MATCH, Op.PREFIX, Op.NOT_EQ, Op.RANGE)),
CONTAINS(EnumSet.of(Op.EQ, Op.MATCH, Op.CONTAINS, Op.PREFIX, Op.SUFFIX, Op.NOT_EQ)),
SPARSE(EnumSet.of(Op.EQ, Op.NOT_EQ, Op.RANGE));
Set<Op> supportedOps;
Mode(Set<Op> ops)
{
supportedOps = ops;
}
public static Mode mode(String mode)
{
return Mode.valueOf(mode.toUpperCase());
}
public boolean supports(Op op)
{
return supportedOps.contains(op);
}
}
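/** On-disk size of a single term: a fixed 4, 8 or 16 bytes, or variable-length (length-prefixed). */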
public enum TermSize
{
INT(4), LONG(8), UUID(16), VARIABLE(-1);
public final int size;
TermSize(int size)
{
this.size = size;
}
public boolean isConstant()
{
return this != VARIABLE;
}
public static TermSize of(int size)
{
switch (size)
{
case -1:
return VARIABLE;
case 4:
return INT;
case 8:
return LONG;
case 16:
return UUID;
default:
throw new IllegalStateException("unknown state: " + size);
}
}
public static TermSize sizeOf(AbstractType<?> comparator)
{
if (comparator instanceof Int32Type || comparator instanceof FloatType)
return INT;
if (comparator instanceof LongType || comparator instanceof DoubleType
|| comparator instanceof TimestampType || comparator instanceof DateType)
return LONG;
if (comparator instanceof TimeUUIDType || comparator instanceof UUIDType)
return UUID;
return VARIABLE;
}
}
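// BLOCK_SIZE: size, in bytes, to which all on-disk index blocks are aligned.
// MAX_TERM_SIZE: terms of this size or larger are rejected by add().
// SUPER_BLOCK_SIZE: number of data blocks combined into one super block (SPARSE mode only).
// IS_PARTIAL_BIT: bit of the variable-length term's length prefix used to mark partial (suffix) terms.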
public static final int BLOCK_SIZE = 4096;
public static final int MAX_TERM_SIZE = 1024;
public static final int SUPER_BLOCK_SIZE = 64;
public static final int IS_PARTIAL_BIT = 15;
private static final SequentialWriterOption WRITER_OPTION = SequentialWriterOption.newBuilder()
.bufferSize(BLOCK_SIZE)
.build();
private final List<MutableLevel<InMemoryPointerTerm>> levels = new ArrayList<>();
private MutableLevel<InMemoryDataTerm> dataLevel;
private final TermSize termSize;
private final AbstractType<?> keyComparator, termComparator;
private final Map<ByteBuffer, TokenTreeBuilder> terms;
private final Mode mode;
private final boolean marksPartials;
private ByteBuffer minKey, maxKey;
private long estimatedBytes;
public OnDiskIndexBuilder(AbstractType<?> keyComparator, AbstractType<?> comparator, Mode mode)
{
this(keyComparator, comparator, mode, true);
}
public OnDiskIndexBuilder(AbstractType<?> keyComparator, AbstractType<?> comparator, Mode mode, boolean marksPartials)
{
this.keyComparator = keyComparator;
this.termComparator = comparator;
this.terms = new HashMap<>();
this.termSize = TermSize.sizeOf(comparator);
this.mode = mode;
this.marksPartials = marksPartials;
}
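/**
 * Adds a term/key pair to the in-memory index: the key's token and position are appended to the
 * term's token tree, and the indexed key range (minKey/maxKey) is updated. Terms of
 * MAX_TERM_SIZE bytes or more are rejected with an error log entry.
 */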
public OnDiskIndexBuilder add(ByteBuffer term, DecoratedKey key, long keyPosition)
{
if (term.remaining() >= MAX_TERM_SIZE)
{
logger.error("Rejecting value (value size {}, maximum size {}).",
FBUtilities.prettyPrintMemory(term.remaining()),
FBUtilities.prettyPrintMemory(MAX_TERM_SIZE));
return this;
}
TokenTreeBuilder tokens = terms.get(term);
if (tokens == null)
{
terms.put(term, (tokens = new DynamicTokenTreeBuilder()));
// on-heap size estimates from jol
// 64 bytes for TTB + 48 bytes for TreeMap in TTB + size bytes for the term (map key)
estimatedBytes += 64 + 48 + term.remaining();
}
tokens.add((Long) key.getToken().getTokenValue(), keyPosition);
// calculate key range (based on actual key values) for current index
minKey = (minKey == null || keyComparator.compare(minKey, key.getKey()) > 0) ? key.getKey() : minKey;
maxKey = (maxKey == null || keyComparator.compare(maxKey, key.getKey()) < 0) ? key.getKey() : maxKey;
// 60 ((boolean(1)*4) + (long(8)*4) + 24) bytes for the LongOpenHashSet created when the keyPosition was added
// + 40 bytes for the TreeMap.Entry + 8 bytes for the token (key).
// in the case of hash collision for the token we may overestimate but this is extremely rare
estimatedBytes += 60 + 40 + 8;
return this;
}
public long estimatedMemoryUse()
{
return estimatedBytes;
}
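/**
 * Appends a term to the data level; if that causes a data block to flush, the returned pointer
 * term is promoted into index level 0, cascading upwards for every level whose block flushes in turn.
 */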
private void addTerm(InMemoryDataTerm term, SequentialWriter out) throws IOException
{
InMemoryPointerTerm ptr = dataLevel.add(term);
if (ptr == null)
return;
int levelIdx = 0;
for (;;)
{
MutableLevel<InMemoryPointerTerm> level = getIndexLevel(levelIdx++, out);
if ((ptr = level.add(ptr)) == null)
break;
}
}
public boolean isEmpty()
{
return terms.isEmpty();
}
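/**
 * Finishes the index using the current on-disk format version, writing the given key range and
 * term iterator to the target file.
 */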
public void finish(Pair<ByteBuffer, ByteBuffer> range, File file, TermIterator terms)
{
finish(Descriptor.CURRENT, range, file, terms);
}
/**
* Finishes the index-building process by creating and populating the index file.
*
* @param indexFile The file to write index contents to.
*
* @return true if index was written successfully, false otherwise (e.g. if index was empty).
*
* @throws FSWriteError on I/O error.
*/
public boolean finish(File indexFile) throws FSWriteError
{
return finish(Descriptor.CURRENT, indexFile);
}
@VisibleForTesting
protected boolean finish(Descriptor descriptor, File file) throws FSWriteError
{
// no terms means there is nothing to build
if (terms.isEmpty())
{
try
{
file.createNewFile();
}
catch (IOException e)
{
throw new FSWriteError(e, file);
}
return false;
}
// split terms into suffixes only when the term type is text; otherwise (even if CONTAINS is set) use terms in their original form
SA sa = ((termComparator instanceof UTF8Type || termComparator instanceof AsciiType) && mode == Mode.CONTAINS)
? new SuffixSA(termComparator, mode) : new IntegralSA(termComparator, mode);
for (Map.Entry<ByteBuffer, TokenTreeBuilder> term : terms.entrySet())
sa.add(term.getKey(), term.getValue());
finish(descriptor, Pair.create(minKey, maxKey), file, sa.finish());
return true;
}
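/**
 * Writes the index file: a header block (format version, term size, min/max term, min/max key,
 * mode, partial-marking flag), followed by block-aligned data blocks and pointer levels, then
 * each level's block-offset metadata and, last, the position of that metadata section.
 */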
@SuppressWarnings("resource")
protected void finish(Descriptor descriptor, Pair<ByteBuffer, ByteBuffer> range, File file, TermIterator terms)
{
SequentialWriter out = null;
try
{
out = new SequentialWriter(file, WRITER_OPTION);
out.writeUTF(descriptor.version.toString());
out.writeShort(termSize.size);
// min, max term (useful to find initial scan range from search expressions)
ByteBufferUtil.writeWithShortLength(terms.minTerm(), out);
ByteBufferUtil.writeWithShortLength(terms.maxTerm(), out);
// min, max keys covered by index (useful when searching across multiple indexes)
ByteBufferUtil.writeWithShortLength(range.left, out);
ByteBufferUtil.writeWithShortLength(range.right, out);
out.writeUTF(mode.toString());
out.writeBoolean(marksPartials);
out.skipBytes((int) (BLOCK_SIZE - out.position()));
dataLevel = mode == Mode.SPARSE ? new DataBuilderLevel(out, new MutableDataBlock(termComparator, mode))
: new MutableLevel<>(out, new MutableDataBlock(termComparator, mode));
while (terms.hasNext())
{
Pair<IndexedTerm, TokenTreeBuilder> term = terms.next();
addTerm(new InMemoryDataTerm(term.left, term.right), out);
}
dataLevel.finalFlush();
for (MutableLevel l : levels)
l.flush(); // flush all of the buffers
// and finally write levels index
final long levelIndexPosition = out.position();
out.writeInt(levels.size());
for (int i = levels.size() - 1; i >= 0; i--)
levels.get(i).flushMetadata();
dataLevel.flushMetadata();
out.writeLong(levelIndexPosition);
// sync contents of the output and disk,
// since it's not done implicitly on close
out.sync();
}
catch (IOException e)
{
throw new FSWriteError(e, file);
}
finally
{
FileUtils.closeQuietly(out);
}
}
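// returns the pointer level at the requested depth, creating it and any missing levels below it on demand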
private MutableLevel<InMemoryPointerTerm> getIndexLevel(int idx, SequentialWriter out)
{
if (levels.size() == 0)
levels.add(new MutableLevel<>(out, new MutableBlock<>()));
if (levels.size() - 1 < idx)
{
int toAdd = idx - (levels.size() - 1);
for (int i = 0; i < toAdd; i++)
levels.add(new MutableLevel<>(out, new MutableBlock<>()));
}
return levels.get(idx);
}
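/** Advances the writer to the next BLOCK_SIZE boundary, unless it is already aligned. */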
protected static void alignToBlock(SequentialWriter out) throws IOException
{
long endOfBlock = out.position();
if ((endOfBlock & (BLOCK_SIZE - 1)) != 0) // align on the block boundary if needed
out.skipBytes((int) (FBUtilities.align(endOfBlock, BLOCK_SIZE) - endOfBlock));
}
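/**
 * A term buffered in memory prior to serialization: fixed-size terms are written as raw bytes,
 * variable-size terms as a 2-byte length prefix (with the partial-term flag in IS_PARTIAL_BIT)
 * followed by the term bytes.
 */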
private class InMemoryTerm
{
protected final IndexedTerm term;
public InMemoryTerm(IndexedTerm term)
{
this.term = term;
}
public int serializedSize()
{
return (termSize.isConstant() ? 0 : 2) + term.getBytes().remaining();
}
public void serialize(DataOutputPlus out) throws IOException
{
if (termSize.isConstant())
{
out.write(term.getBytes());
}
else
{
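// pack the "partial term" flag into the top bit of the 16-bit length prefix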
out.writeShort(term.getBytes().remaining() | ((marksPartials && term.isPartial() ? 1 : 0) << IS_PARTIAL_BIT));
out.write(term.getBytes());
}
}
}
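/** A term in a pointer level, carrying the index of the block that contains it in the level below. */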
private class InMemoryPointerTerm extends InMemoryTerm
{
protected final int blockCnt;
public InMemoryPointerTerm(IndexedTerm term, int blockCnt)
{
super(term);
this.blockCnt = blockCnt;
}
public int serializedSize()
{
return super.serializedSize() + 4;
}
public void serialize(DataOutputPlus out) throws IOException
{
super.serialize(out);
out.writeInt(blockCnt);
}
}
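/** A term in the data level, paired with the token tree of keys it is indexed under. */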
private class InMemoryDataTerm extends InMemoryTerm
{
private final TokenTreeBuilder keys;
public InMemoryDataTerm(IndexedTerm term, TokenTreeBuilder keys)
{
super(term);
this.keys = keys;
}
}
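/**
 * One level of the index under construction: terms accumulate in an in-memory block and, when a
 * block runs out of space, it is flushed to disk and its last term is handed back so the caller
 * can promote it to the level above. Flushed block offsets are kept for the level's metadata.
 */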
private class MutableLevel<T extends InMemoryTerm>
{
private final LongArrayList blockOffsets = new LongArrayList();
protected final SequentialWriter out;
private final MutableBlock<T> inProcessBlock;
private InMemoryPointerTerm lastTerm;
public MutableLevel(SequentialWriter out, MutableBlock<T> block)
{
this.out = out;
this.inProcessBlock = block;
}
/**
* @return the last term of the just-flushed block if this addition caused a flush (so it can be promoted to the level above); otherwise null.
*/
public InMemoryPointerTerm add(T term) throws IOException
{
InMemoryPointerTerm toPromote = null;
if (!inProcessBlock.hasSpaceFor(term))
{
flush();
toPromote = lastTerm;
}
inProcessBlock.add(term);
lastTerm = new InMemoryPointerTerm(term.term, blockOffsets.size());
return toPromote;
}
public void flush() throws IOException
{
blockOffsets.add(out.position());
inProcessBlock.flushAndClear(out);
}
public void finalFlush() throws IOException
{
flush();
}
public void flushMetadata() throws IOException
{
flushMetadata(blockOffsets);
}
protected void flushMetadata(LongArrayList longArrayList) throws IOException
{
out.writeInt(longArrayList.size());
for (int i = 0; i < longArrayList.size(); i++)
out.writeLong(longArrayList.get(i));
}
}
/** Builds standard data blocks as well as super blocks; used only in SPARSE mode. */
private class DataBuilderLevel extends MutableLevel<InMemoryDataTerm>
{
private final LongArrayList superBlockOffsets = new LongArrayList();
/** count of regular data blocks written since current super block was init'd */
private int dataBlocksCnt;
private TokenTreeBuilder superBlockTree;
public DataBuilderLevel(SequentialWriter out, MutableBlock<InMemoryDataTerm> block)
{
super(out, block);
superBlockTree = new DynamicTokenTreeBuilder();
}
public InMemoryPointerTerm add(InMemoryDataTerm term) throws IOException
{
InMemoryPointerTerm ptr = super.add(term);
if (ptr != null)
{
dataBlocksCnt++;
flushSuperBlock(false);
}
superBlockTree.add(term.keys);
return ptr;
}
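// writes the combined token tree as a super block once SUPER_BLOCK_SIZE data blocks have been
// collected, or (when forced) whenever the accumulated tree is non-empty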
public void flushSuperBlock(boolean force) throws IOException
{
if (dataBlocksCnt == SUPER_BLOCK_SIZE || (force && !superBlockTree.isEmpty()))
{
superBlockOffsets.add(out.position());
superBlockTree.finish().write(out);
alignToBlock(out);
dataBlocksCnt = 0;
superBlockTree = new DynamicTokenTreeBuilder();
}
}
public void finalFlush() throws IOException
{
super.flush();
flushSuperBlock(true);
}
public void flushMetadata() throws IOException
{
super.flushMetadata();
flushMetadata(superBlockOffsets);
}
}
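/**
 * An in-memory block, capped at BLOCK_SIZE: serialized terms plus a count and per-term offsets,
 * written to disk block-aligned by flushAndClear().
 */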
private static class MutableBlock<T extends InMemoryTerm>
{
protected final DataOutputBufferFixed buffer;
protected final ShortArrayList offsets;
public MutableBlock()
{
buffer = new DataOutputBufferFixed(BLOCK_SIZE);
offsets = new ShortArrayList();
}
public final void add(T term) throws IOException
{
offsets.add((short) buffer.position());
addInternal(term);
}
protected void addInternal(T term) throws IOException
{
term.serialize(buffer);
}
public boolean hasSpaceFor(T element)
{
return sizeAfter(element) < BLOCK_SIZE;
}
protected int sizeAfter(T element)
{
return getWatermark() + 4 + element.serializedSize();
}
protected int getWatermark()
{
return 4 + offsets.size() * 2 + (int) buffer.position();
}
public void flushAndClear(SequentialWriter out) throws IOException
{
out.writeInt(offsets.size());
for (int i = 0; i < offsets.size(); i++)
out.writeShort(offsets.get(i));
out.write(buffer.buffer());
alignToBlock(out);
offsets.clear();
buffer.clear();
}
}
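/**
 * Data block for the lowest (data) level. Next to each term it stores either the term's tokens
 * inline (SPARSE mode, limited to MAX_KEYS_SPARSE per term) or an offset to the term's token
 * tree, which is written out right after the block; in SPARSE mode a combined token tree for
 * the whole block is appended as well.
 */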
private static class MutableDataBlock extends MutableBlock<InMemoryDataTerm>
{
private static final int MAX_KEYS_SPARSE = 5;
private final AbstractType<?> comparator;
private final Mode mode;
private int offset = 0;
private final List<TokenTreeBuilder> containers = new ArrayList<>();
private TokenTreeBuilder combinedIndex;
public MutableDataBlock(AbstractType<?> comparator, Mode mode)
{
this.comparator = comparator;
this.mode = mode;
this.combinedIndex = initCombinedIndex();
}
protected void addInternal(InMemoryDataTerm term) throws IOException
{
TokenTreeBuilder keys = term.keys;
if (mode == Mode.SPARSE)
{
if (keys.getTokenCount() > MAX_KEYS_SPARSE)
throw new IOException(String.format("Term - '%s' belongs to more than %d keys in %s mode, which is not allowed.",
comparator.getString(term.term.getBytes()), MAX_KEYS_SPARSE, mode.name()));
writeTerm(term, keys);
}
else
{
writeTerm(term, offset);
offset += keys.serializedSize();
containers.add(keys);
}
if (mode == Mode.SPARSE)
combinedIndex.add(keys);
}
protected int sizeAfter(InMemoryDataTerm element)
{
return super.sizeAfter(element) + ptrLength(element);
}
public void flushAndClear(SequentialWriter out) throws IOException
{
super.flushAndClear(out);
out.writeInt(mode == Mode.SPARSE ? offset : -1);
if (containers.size() > 0)
{
for (TokenTreeBuilder tokens : containers)
tokens.write(out);
}
if (mode == Mode.SPARSE && combinedIndex != null)
combinedIndex.finish().write(out);
alignToBlock(out);
containers.clear();
combinedIndex = initCombinedIndex();
offset = 0;
}
private int ptrLength(InMemoryDataTerm term)
{
return (term.keys.getTokenCount() > 5)
? 5 // 1 byte type + 4 byte offset to the tree
: 1 + (8 * (int) term.keys.getTokenCount()); // 1 byte size + n 8 byte tokens
}
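// SPARSE layout: term, then a 1-byte token count, then the tokens themselves inline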
private void writeTerm(InMemoryTerm term, TokenTreeBuilder keys) throws IOException
{
term.serialize(buffer);
buffer.writeByte((byte) keys.getTokenCount());
for (Pair<Long, LongSet> key : keys)
buffer.writeLong(key.left);
}
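// non-SPARSE layout: term, then a zero header byte and a 4-byte offset to the term's token tree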
private void writeTerm(InMemoryTerm term, int offset) throws IOException
{
term.serialize(buffer);
buffer.writeByte(0x0);
buffer.writeInt(offset);
}
private TokenTreeBuilder initCombinedIndex()
{
return mode == Mode.SPARSE ? new DynamicTokenTreeBuilder() : null;
}
}
}