/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.blockterms;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.PagedBytes;
import org.apache.lucene.util.packed.MonotonicBlockPackedReader;
/**
* TermsIndexReader for simple every Nth terms indexes.
*
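 * <p>A minimal usage sketch (hypothetical {@code state} and {@code fieldInfo}, assumed to be
 * provided by the enclosing postings format):
 *
 * <pre>{@code
 * TermsIndexReaderBase indexReader = new FixedGapTermsIndexReader(state);
 * TermsIndexReaderBase.FieldIndexEnum indexEnum = indexReader.getFieldEnum(fieldInfo);
 * // seek returns a file pointer into the main terms dictionary near the target term
 * long termsDictFP = indexEnum.seek(new BytesRef("lucene"));
 * }</pre>
 *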
* @see FixedGapTermsIndexWriter
* @lucene.experimental
*/
public class FixedGapTermsIndexReader extends TermsIndexReaderBase {
// NOTE: long is overkill here, but we use this in a
// number of places to multiply out the actual ord, and we
// will overflow int during those multiplies. So to avoid
// having to upgrade each multiple to long in multiple
// places (error prone), we use long here:
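  // (for example, with an index interval of 128, idx * indexInterval exceeds
  // Integer.MAX_VALUE once a field has more than ~16.7M index terms)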
private final long indexInterval;
private final int packedIntsVersion;
private final int blocksize;
private static final int PAGED_BYTES_BITS = 15;
// all fields share this single logical byte[]
private final PagedBytes.Reader termBytesReader;
final HashMap<String, FieldIndexData> fields = new HashMap<>();
public FixedGapTermsIndexReader(SegmentReadState state) throws IOException {
final PagedBytes termBytes = new PagedBytes(PAGED_BYTES_BITS);
String fileName =
IndexFileNames.segmentFileName(
state.segmentInfo.name,
state.segmentSuffix,
FixedGapTermsIndexWriter.TERMS_INDEX_EXTENSION);
final IndexInput in = state.directory.openInput(fileName, state.context);
boolean success = false;
try {
CodecUtil.checkIndexHeader(
in,
FixedGapTermsIndexWriter.CODEC_NAME,
FixedGapTermsIndexWriter.VERSION_CURRENT,
FixedGapTermsIndexWriter.VERSION_CURRENT,
state.segmentInfo.getId(),
state.segmentSuffix);
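      // this index is fully loaded into heap below, so verify the file's checksum up front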
CodecUtil.checksumEntireFile(in);
indexInterval = in.readVInt();
if (indexInterval < 1) {
throw new CorruptIndexException("invalid indexInterval: " + indexInterval, in);
}
packedIntsVersion = in.readVInt();
blocksize = in.readVInt();
seekDir(in);
// Read directory
final int numFields = in.readVInt();
if (numFields < 0) {
throw new CorruptIndexException("invalid numFields: " + numFields, in);
}
// System.out.println("FGR: init seg=" + segment + " div=" + indexDivisor + " nF=" +
// numFields);
for (int i = 0; i < numFields; i++) {
final int field = in.readVInt();
        // TODO: change this to a vLong if we fix writer to support > 2B index terms
        final long numIndexTerms = in.readVInt();
if (numIndexTerms < 0) {
throw new CorruptIndexException("invalid numIndexTerms: " + numIndexTerms, in);
}
final long termsStart = in.readVLong();
final long indexStart = in.readVLong();
final long packedIndexStart = in.readVLong();
final long packedOffsetsStart = in.readVLong();
if (packedIndexStart < indexStart) {
throw new CorruptIndexException(
"invalid packedIndexStart: "
+ packedIndexStart
+ " indexStart: "
+ indexStart
+ "numIndexTerms: "
+ numIndexTerms,
in);
}
final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
FieldIndexData previous =
fields.put(
fieldInfo.name,
new FieldIndexData(
in,
termBytes,
indexStart,
termsStart,
packedIndexStart,
packedOffsetsStart,
numIndexTerms));
if (previous != null) {
throw new CorruptIndexException("duplicate field: " + fieldInfo.name, in);
}
}
success = true;
} finally {
if (success) {
IOUtils.close(in);
} else {
IOUtils.closeWhileHandlingException(in);
}
termBytesReader = termBytes.freeze(true);
}
}
private class IndexEnum extends FieldIndexEnum {
private final FieldIndexData fieldIndex;
private final BytesRef term = new BytesRef();
private long ord;
public IndexEnum(FieldIndexData fieldIndex) {
this.fieldIndex = fieldIndex;
}
@Override
public BytesRef term() {
return term;
}
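    // Binary search over this field's in-memory index terms: on an exact match, positions on
    // that index term; otherwise on the greatest index term less than the target (or the first
    // index term if the target precedes them all). Returns the matching file pointer into the
    // main terms dictionary.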
@Override
public long seek(BytesRef target) {
long lo = 0; // binary search
long hi = fieldIndex.numIndexTerms - 1;
while (hi >= lo) {
long mid = (lo + hi) >>> 1;
final long offset = fieldIndex.termOffsets.get(mid);
final int length = (int) (fieldIndex.termOffsets.get(1 + mid) - offset);
termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);
int delta = target.compareTo(term);
if (delta < 0) {
hi = mid - 1;
} else if (delta > 0) {
lo = mid + 1;
} else {
assert mid >= 0;
ord = mid * indexInterval;
return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(mid);
}
}
if (hi < 0) {
assert hi == -1;
hi = 0;
}
final long offset = fieldIndex.termOffsets.get(hi);
final int length = (int) (fieldIndex.termOffsets.get(1 + hi) - offset);
termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);
ord = hi * indexInterval;
return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(hi);
}
@Override
public long next() {
final long idx = 1 + (ord / indexInterval);
if (idx >= fieldIndex.numIndexTerms) {
return -1;
}
ord += indexInterval;
final long offset = fieldIndex.termOffsets.get(idx);
final int length = (int) (fieldIndex.termOffsets.get(1 + idx) - offset);
termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);
return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(idx);
}
@Override
public long ord() {
return ord;
}
@Override
public long seek(long ord) {
long idx = ord / indexInterval;
// caller must ensure ord is in bounds
assert idx < fieldIndex.numIndexTerms;
final long offset = fieldIndex.termOffsets.get(idx);
final int length = (int) (fieldIndex.termOffsets.get(1 + idx) - offset);
termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);
this.ord = idx * indexInterval;
return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(idx);
}
}
@Override
public boolean supportsOrd() {
return true;
}
private final class FieldIndexData implements Accountable {
// where this field's terms begin in the packed byte[]
// data
final long termBytesStart;
// offset into index termBytes
final MonotonicBlockPackedReader termOffsets;
// index pointers into main terms dict
final MonotonicBlockPackedReader termsDictOffsets;
final long numIndexTerms;
final long termsStart;
public FieldIndexData(
IndexInput in,
PagedBytes termBytes,
long indexStart,
long termsStart,
long packedIndexStart,
long packedOffsetsStart,
long numIndexTerms)
throws IOException {
this.termsStart = termsStart;
termBytesStart = termBytes.getPointer();
IndexInput clone = in.clone();
clone.seek(indexStart);
this.numIndexTerms = numIndexTerms;
assert this.numIndexTerms > 0 : "numIndexTerms=" + numIndexTerms;
// slurp in the images from disk:
try {
final long numTermBytes = packedIndexStart - indexStart;
termBytes.copy(clone, numTermBytes);
// records offsets into main terms dict file
termsDictOffsets =
MonotonicBlockPackedReader.of(
clone, packedIntsVersion, blocksize, numIndexTerms, false);
// records offsets into byte[] term data
termOffsets =
MonotonicBlockPackedReader.of(
clone, packedIntsVersion, blocksize, 1 + numIndexTerms, false);
} finally {
clone.close();
}
}
@Override
public long ramBytesUsed() {
return ((termOffsets != null) ? termOffsets.ramBytesUsed() : 0)
+ ((termsDictOffsets != null) ? termsDictOffsets.ramBytesUsed() : 0);
}
@Override
public Collection<Accountable> getChildResources() {
List<Accountable> resources = new ArrayList<>();
if (termOffsets != null) {
resources.add(Accountables.namedAccountable("term lengths", termOffsets));
}
if (termsDictOffsets != null) {
resources.add(Accountables.namedAccountable("offsets", termsDictOffsets));
}
return Collections.unmodifiableList(resources);
}
@Override
public String toString() {
return "FixedGapTermIndex(indexterms=" + numIndexTerms + ")";
}
}
@Override
public FieldIndexEnum getFieldEnum(FieldInfo fieldInfo) {
return new IndexEnum(fields.get(fieldInfo.name));
}
@Override
public void close() throws IOException {}
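  // the directory (per-field metadata) start offset is stored as a fixed 8-byte long just
  // before the codec footer; read it, then seek to the directory itself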
private void seekDir(IndexInput input) throws IOException {
input.seek(input.length() - CodecUtil.footerLength() - 8);
long dirOffset = input.readLong();
input.seek(dirOffset);
}
@Override
public long ramBytesUsed() {
long sizeInBytes = ((termBytesReader != null) ? termBytesReader.ramBytesUsed() : 0);
for (FieldIndexData entry : fields.values()) {
sizeInBytes += entry.ramBytesUsed();
}
return sizeInBytes;
}
@Override
public Collection<Accountable> getChildResources() {
return Accountables.namedAccountables("field", fields);
}
@Override
public String toString() {
return getClass().getSimpleName()
+ "(fields="
+ fields.size()
+ ",interval="
+ indexInterval
+ ")";
}
}