package org.apache.lucene.index.values;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Comparator;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.lucene.index.values.Bytes.BytesBaseSortedSource;
import org.apache.lucene.index.values.Bytes.BytesReaderBase;
import org.apache.lucene.index.values.Bytes.BytesWriterBase;
import org.apache.lucene.index.values.FixedDerefBytesImpl.Reader.DerefBytesEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.CodecUtil;
import org.apache.lucene.util.PagedBytes;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.ByteBlockPool.Allocator;
import org.apache.lucene.util.ByteBlockPool.DirectTrackingAllocator;
import org.apache.lucene.util.BytesRefHash.TrackingDirectBytesStartArray;
import org.apache.lucene.util.packed.PackedInts;
// Stores fixed-length byte[] values deduplicated and in sorted order:
// when two docs have the same value, only one byte[] is stored, and the
// unique values are written out in Comparator order.
/**
 * @lucene.experimental
 */
class FixedSortedBytesImpl {
static final String CODEC_NAME = "FixedSortedBytes";
static final int VERSION_START = 0;
static final int VERSION_CURRENT = VERSION_START;
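/*
 * On-disk layout, as implied by the writer/reader below (inferred from
 * this code, not from a separate format spec):
 *
 * data file: codec header, an int holding the fixed value size in bytes
 * (-1 if no values were added), then each unique value exactly once,
 * "size" bytes apiece, in Comparator order.
 *
 * index file: codec header, an int holding the number of unique values,
 * then one packed int per document mapping docID -> 1 + sorted ordinal,
 * with 0 reserved for documents that have no value.
 */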
static class Writer extends BytesWriterBase {
private int size = -1;
private int[] docToEntry;
private final Comparator<BytesRef> comp;
private final BytesRefHash hash = new BytesRefHash(pool,
BytesRefHash.DEFAULT_CAPACITY, new TrackingDirectBytesStartArray(
BytesRefHash.DEFAULT_CAPACITY, bytesUsed));
public Writer(Directory dir, String id, Comparator<BytesRef> comp,
AtomicLong bytesUsed) throws IOException {
this(dir, id, comp, new DirectTrackingAllocator(ByteBlockPool.BYTE_BLOCK_SIZE, bytesUsed),
bytesUsed);
}
public Writer(Directory dir, String id, Comparator<BytesRef> comp,
Allocator allocator, AtomicLong bytesUsed) throws IOException {
super(dir, id, CODEC_NAME, VERSION_CURRENT, true,
new ByteBlockPool(allocator), bytesUsed);
docToEntry = new int[1];
// docToEntry[0] = -1;
bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT);
this.comp = comp;
}
@Override
public void add(int docID, BytesRef bytes) throws IOException {
if (bytes.length == 0)
return; // default - skip it
if (size == -1) {
size = bytes.length;
datOut.writeInt(size);
} else if (bytes.length != size) {
throw new IllegalArgumentException("expected bytes size=" + size
+ " but got " + bytes.length);
}
if (docID >= docToEntry.length) {
final int[] newArray = new int[ArrayUtil.oversize(1 + docID,
RamUsageEstimator.NUM_BYTES_INT)];
System.arraycopy(docToEntry, 0, newArray, 0, docToEntry.length);
bytesUsed.addAndGet((newArray.length - docToEntry.length)
* RamUsageEstimator.NUM_BYTES_INT);
docToEntry = newArray;
}
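// BytesRefHash.add returns the new entry id, or -(id + 1) if the value
// was already present; either way we store 1 + id so that 0 can mean
// "no value for this doc"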
int e = hash.add(bytes);
docToEntry[docID] = 1 + (e < 0 ? (-e) - 1 : e);
}
// Important that we get docCount, in case there were
// some last docs that we didn't see
@Override
public void finish(int docCount) throws IOException {
try {
if (size == -1) { // no data added
datOut.writeInt(size);
}
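// sort the unique values with the given comparator; sortedEntries[i]
// holds the hash entry id of the i'th smallest value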
final int[] sortedEntries = hash.sort(comp);
final int count = hash.size();
int[] address = new int[count];
// first dump bytes data, recording address as we go
for (int i = 0; i < count; i++) {
final int e = sortedEntries[i];
final BytesRef bytes = hash.get(e, new BytesRef());
assert bytes.length == size;
datOut.writeBytes(bytes.bytes, bytes.offset, bytes.length);
address[e] = 1 + i;
}
idxOut.writeInt(count);
// next write index
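// every per-doc entry is in [0, count] (0 = no value), so
// bitsRequired(count) bits per document are enough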
PackedInts.Writer w = PackedInts.getWriter(idxOut, docCount,
PackedInts.bitsRequired(count));
final int limit;
if (docCount > docToEntry.length) {
limit = docToEntry.length;
} else {
limit = docCount;
}
for (int i = 0; i < limit; i++) {
final int e = docToEntry[i];
if (e == 0) {
// null is encoded as zero
w.add(0);
} else {
assert e > 0 && e <= count : "index must be > 0 && <= " + count
+ " was: " + e;
w.add(address[e - 1]);
}
}
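// docs beyond the highest docID passed to add() have no value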
for (int i = limit; i < docCount; i++) {
w.add(0);
}
w.finish();
} finally {
super.finish(docCount);
bytesUsed.addAndGet((-docToEntry.length)
* RamUsageEstimator.NUM_BYTES_INT);
docToEntry = null;
hash.close();
}
}
}
public static class Reader extends BytesReaderBase {
private final int size;
public Reader(Directory dir, String id, int maxDoc) throws IOException {
super(dir, id, CODEC_NAME, VERSION_START, true);
size = datIn.readInt();
}
@Override
public org.apache.lucene.index.values.IndexDocValues.Source load()
throws IOException {
return loadSorted(null);
}
@Override
public SortedSource loadSorted(Comparator<BytesRef> comp)
throws IOException {
final IndexInput idxInput = cloneIndex();
final IndexInput datInput = cloneData();
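// skip the codec headers; the extra 4 bytes in the data file are the
// int value-size written by Writer.add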
datInput.seek(CodecUtil.headerLength(CODEC_NAME) + 4);
idxInput.seek(CodecUtil.headerLength(CODEC_NAME));
return new Source(datInput, idxInput, size, idxInput.readInt(), comp);
}
private static class Source extends BytesBaseSortedSource {
private final PackedInts.Reader index;
private final int numValues;
private final int size;
public Source(IndexInput datIn, IndexInput idxIn, int size,
int numValues, Comparator<BytesRef> comp) throws IOException {
super(datIn, idxIn, comp, new PagedBytes(PAGED_BYTES_BITS), size
* numValues);
this.size = size;
this.numValues = numValues;
index = PackedInts.getReader(idxIn);
closeIndexInput();
}
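// returns the sorted ordinal for this doc, or -1 if the doc has no
// value (stored entries are shifted by +1, see Writer.add)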
@Override
public int ord(int docID) {
return (int) index.get(docID) - 1;
}
@Override
public int getByValue(BytesRef bytes, BytesRef tmpRef) {
return binarySearch(bytes, tmpRef, 0, numValues - 1);
}
@Override
public int getValueCount() {
return numValues;
}
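// values are stored contiguously in sorted order, so the ord'th value
// starts at byte offset ord * size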
@Override
protected BytesRef deref(int ord, BytesRef bytesRef) {
return data.fillSlice(bytesRef, (ord * size), size);
}
@Override
public ValueType type() {
return ValueType.BYTES_FIXED_SORTED;
}
@Override
protected int maxDoc() {
return index.size();
}
}
@Override
public ValuesEnum getEnum(AttributeSource source) throws IOException {
// enumerate in docID order (unsorted); the on-disk layout is compatible
// with the fixed-deref format, so its enum is reused here
return new DerefBytesEnum(source, cloneData(), cloneIndex(), size);
}
@Override
public ValueType type() {
return ValueType.BYTES_FIXED_SORTED;
}
}
}