/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.blockterms;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.PagedBytes;
import org.apache.lucene.util.packed.MonotonicBlockPackedReader;

/**
* TermsIndexReader for simple indexes that record every Nth term.
*
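* <p>A minimal usage sketch (assuming a {@link SegmentReadState} {@code state};
* the field name {@code "body"} is hypothetical, and in practice the terms
* dictionary, e.g. {@code BlockTermsReader}, drives these calls):
*
* <pre class="prettyprint">
* TermsIndexReaderBase indexReader = new FixedGapTermsIndexReader(state);
* TermsIndexReaderBase.FieldIndexEnum indexEnum =
*     indexReader.getFieldEnum(state.fieldInfos.fieldInfo("body"));
* long termsDictFP = indexEnum.seek(new BytesRef("lucene"));
* // termsDictFP points into the main terms dict at the greatest indexed term
* // at or before the target; the caller scans forward from there.
* </pre>
*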
* @see FixedGapTermsIndexWriter
* @lucene.experimental
*/
public class FixedGapTermsIndexReader extends TermsIndexReaderBase {
// NOTE: long is overkill here, but this value is used in a
// number of places to multiply out the actual ord, and those
// multiplications would overflow int. Rather than casting to
// long at every call site (error prone), we store it as a long:
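// (For example, with an interval of 32, an int ord index of 67,108,864
// already multiplies out to 2^31 and overflows.)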
private final long indexInterval;
private final int packedIntsVersion;
private final int blocksize;
private final static int PAGED_BYTES_BITS = 15;
// all fields share this single logical byte[]
private final PagedBytes.Reader termBytesReader;
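// per-field index data, keyed by field name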
final HashMap<String,FieldIndexData> fields = new HashMap<>();
public FixedGapTermsIndexReader(SegmentReadState state) throws IOException {
final PagedBytes termBytes = new PagedBytes(PAGED_BYTES_BITS);
String fileName = IndexFileNames.segmentFileName(state.segmentInfo.name,
state.segmentSuffix,
FixedGapTermsIndexWriter.TERMS_INDEX_EXTENSION);
final IndexInput in = state.directory.openInput(fileName, state.context);
boolean success = false;
try {
CodecUtil.checkIndexHeader(in, FixedGapTermsIndexWriter.CODEC_NAME,
FixedGapTermsIndexWriter.VERSION_CURRENT,
FixedGapTermsIndexWriter.VERSION_CURRENT,
state.segmentInfo.getId(), state.segmentSuffix);
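// the index is small and fully loaded into RAM below, so verify the
// checksum of the entire file up front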
CodecUtil.checksumEntireFile(in);
indexInterval = in.readVInt();
if (indexInterval < 1) {
throw new CorruptIndexException("invalid indexInterval: " + indexInterval, in);
}
packedIntsVersion = in.readVInt();
blocksize = in.readVInt();
seekDir(in);
// Read directory
final int numFields = in.readVInt();
if (numFields < 0) {
throw new CorruptIndexException("invalid numFields: " + numFields, in);
}
//System.out.println("FGR: init seg=" + segment + " div=" + indexDivisor + " nF=" + numFields);
for(int i=0;i<numFields;i++) {
final int field = in.readVInt();
final long numIndexTerms = in.readVInt(); // TODO: change this to a vLong if we fix writer to support > 2B index terms
if (numIndexTerms < 0) {
throw new CorruptIndexException("invalid numIndexTerms: " + numIndexTerms, in);
}
final long termsStart = in.readVLong();
final long indexStart = in.readVLong();
final long packedIndexStart = in.readVLong();
final long packedOffsetsStart = in.readVLong();
if (packedIndexStart < indexStart) {
throw new CorruptIndexException("invalid packedIndexStart: " + packedIndexStart + " indexStart: " + indexStart + "numIndexTerms: " + numIndexTerms, in);
}
final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
FieldIndexData previous = fields.put(fieldInfo.name, new FieldIndexData(in, termBytes, indexStart, termsStart, packedIndexStart, packedOffsetsStart, numIndexTerms));
if (previous != null) {
throw new CorruptIndexException("duplicate field: " + fieldInfo.name, in);
}
}
success = true;
} finally {
if (success) {
IOUtils.close(in);
} else {
IOUtils.closeWhileHandlingException(in);
}
termBytesReader = termBytes.freeze(true);
}
}
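// Enumerates this field's in-memory index terms; supports seeking by term
// (binary search) and by ord, and reports the terms dict file pointer to
// resume scanning from.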
private class IndexEnum extends FieldIndexEnum {
private final FieldIndexData fieldIndex;
private final BytesRef term = new BytesRef();
private long ord;
public IndexEnum(FieldIndexData fieldIndex) {
this.fieldIndex = fieldIndex;
}
@Override
public BytesRef term() {
return term;
}
@Override
public long seek(BytesRef target) {
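// Binary search the indexed terms; returns the terms dict file pointer of
// the greatest indexed term at or before target (or of the first indexed
// term if target sorts before all of them), positioning term/ord to match.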
long lo = 0; // binary search
long hi = fieldIndex.numIndexTerms - 1;
while (hi >= lo) {
long mid = (lo + hi) >>> 1;
final long offset = fieldIndex.termOffsets.get(mid);
final int length = (int) (fieldIndex.termOffsets.get(1+mid) - offset);
termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);
int delta = target.compareTo(term);
if (delta < 0) {
hi = mid - 1;
} else if (delta > 0) {
lo = mid + 1;
} else {
assert mid >= 0;
ord = mid*indexInterval;
return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(mid);
}
}
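// No exact match: hi is now the index of the greatest indexed term less
// than target, or -1 if target sorts before the first indexed term.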
if (hi < 0) {
assert hi == -1;
hi = 0;
}
final long offset = fieldIndex.termOffsets.get(hi);
final int length = (int) (fieldIndex.termOffsets.get(1+hi) - offset);
termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);
ord = hi*indexInterval;
return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(hi);
}
@Override
public long next() {
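// Advance to the next indexed term; returns its terms dict file pointer,
// or -1 if this field has no further index entries.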
final long idx = 1 + (ord / indexInterval);
if (idx >= fieldIndex.numIndexTerms) {
return -1;
}
ord += indexInterval;
final long offset = fieldIndex.termOffsets.get(idx);
final int length = (int) (fieldIndex.termOffsets.get(1+idx) - offset);
termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);
return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(idx);
}
@Override
public long ord() {
return ord;
}
@Override
public long seek(long ord) {
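// Position at the greatest indexed term whose ord is at or below the
// requested ord; returns its terms dict file pointer.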
long idx = ord / indexInterval;
// caller must ensure ord is in bounds
assert idx < fieldIndex.numIndexTerms;
final long offset = fieldIndex.termOffsets.get(idx);
final int length = (int) (fieldIndex.termOffsets.get(1+idx) - offset);
termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);
this.ord = idx * indexInterval;
return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(idx);
}
}
@Override
public boolean supportsOrd() {
return true;
}
private final class FieldIndexData implements Accountable {
// where this field's terms begin in the packed byte[]
// data
final long termBytesStart;
// offset into index termBytes
final MonotonicBlockPackedReader termOffsets;
// index pointers into main terms dict
final MonotonicBlockPackedReader termsDictOffsets;
final long numIndexTerms;
final long termsStart;
public FieldIndexData(IndexInput in, PagedBytes termBytes, long indexStart, long termsStart, long packedIndexStart, long packedOffsetsStart, long numIndexTerms) throws IOException {
this.termsStart = termsStart;
termBytesStart = termBytes.getPointer();
IndexInput clone = in.clone();
clone.seek(indexStart);
this.numIndexTerms = numIndexTerms;
assert this.numIndexTerms > 0: "numIndexTerms=" + numIndexTerms;
// slurp in the images from disk:
try {
final long numTermBytes = packedIndexStart - indexStart;
termBytes.copy(clone, numTermBytes);
// records offsets into main terms dict file
termsDictOffsets = MonotonicBlockPackedReader.of(clone, packedIntsVersion, blocksize, numIndexTerms, false);
// records offsets into byte[] term data
termOffsets = MonotonicBlockPackedReader.of(clone, packedIntsVersion, blocksize, 1+numIndexTerms, false);
} finally {
clone.close();
}
}
@Override
public long ramBytesUsed() {
return ((termOffsets!=null)? termOffsets.ramBytesUsed() : 0) +
((termsDictOffsets!=null)? termsDictOffsets.ramBytesUsed() : 0);
}
@Override
public Collection<Accountable> getChildResources() {
List<Accountable> resources = new ArrayList<>();
if (termOffsets != null) {
resources.add(Accountables.namedAccountable("term lengths", termOffsets));
}
if (termsDictOffsets != null) {
resources.add(Accountables.namedAccountable("offsets", termsDictOffsets));
}
return Collections.unmodifiableList(resources);
}
@Override
public String toString() {
return "FixedGapTermIndex(indexterms=" + numIndexTerms + ")";
}
}
@Override
public FieldIndexEnum getFieldEnum(FieldInfo fieldInfo) {
return new IndexEnum(fields.get(fieldInfo.name));
}
@Override
public void close() throws IOException {}
private void seekDir(IndexInput input) throws IOException {
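// The start of the field directory is stored as a fixed-width long just
// before the codec footer; read it and seek there.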
input.seek(input.length() - CodecUtil.footerLength() - 8);
long dirOffset = input.readLong();
input.seek(dirOffset);
}
@Override
public long ramBytesUsed() {
long sizeInBytes = ((termBytesReader!=null)? termBytesReader.ramBytesUsed() : 0);
for(FieldIndexData entry : fields.values()) {
sizeInBytes += entry.ramBytesUsed();
}
return sizeInBytes;
}
@Override
public Collection<Accountable> getChildResources() {
return Accountables.namedAccountables("field", fields);
}
@Override
public String toString() {
return getClass().getSimpleName() + "(fields=" + fields.size() + ",interval=" + indexInterval + ")";
}
}