| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.codecs.blockterms; |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.Collection; |
| import java.util.Collections; |
| import java.util.HashMap; |
| import java.util.List; |
| import org.apache.lucene.codecs.CodecUtil; |
| import org.apache.lucene.index.CorruptIndexException; |
| import org.apache.lucene.index.FieldInfo; |
| import org.apache.lucene.index.IndexFileNames; |
| import org.apache.lucene.index.SegmentReadState; |
| import org.apache.lucene.store.IndexInput; |
| import org.apache.lucene.util.Accountable; |
| import org.apache.lucene.util.Accountables; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.IOUtils; |
| import org.apache.lucene.util.PagedBytes; |
| import org.apache.lucene.util.packed.MonotonicBlockPackedReader; |
| |
| /** |
| * TermsIndexReader for simple every Nth terms indexes. |
| * |
| * @see FixedGapTermsIndexWriter |
| * @lucene.experimental |
| */ |
| public class FixedGapTermsIndexReader extends TermsIndexReaderBase { |
| |
| // NOTE: long is overkill here, but we use this in a |
| // number of places to multiply out the actual ord, and we |
| // will overflow int during those multiplies. So to avoid |
| // having to upgrade each multiple to long in multiple |
| // places (error prone), we use long here: |
| private final long indexInterval; |
| |
| private final int packedIntsVersion; |
| private final int blocksize; |
| |
| private static final int PAGED_BYTES_BITS = 15; |
| |
| // all fields share this single logical byte[] |
| private final PagedBytes.Reader termBytesReader; |
| |
| final HashMap<String, FieldIndexData> fields = new HashMap<>(); |
| |
| public FixedGapTermsIndexReader(SegmentReadState state) throws IOException { |
| final PagedBytes termBytes = new PagedBytes(PAGED_BYTES_BITS); |
| |
| String fileName = |
| IndexFileNames.segmentFileName( |
| state.segmentInfo.name, |
| state.segmentSuffix, |
| FixedGapTermsIndexWriter.TERMS_INDEX_EXTENSION); |
| final IndexInput in = state.directory.openInput(fileName, state.context); |
| |
| boolean success = false; |
| |
| try { |
| |
| CodecUtil.checkIndexHeader( |
| in, |
| FixedGapTermsIndexWriter.CODEC_NAME, |
| FixedGapTermsIndexWriter.VERSION_CURRENT, |
| FixedGapTermsIndexWriter.VERSION_CURRENT, |
| state.segmentInfo.getId(), |
| state.segmentSuffix); |
| |
| CodecUtil.checksumEntireFile(in); |
| |
| indexInterval = in.readVInt(); |
| if (indexInterval < 1) { |
| throw new CorruptIndexException("invalid indexInterval: " + indexInterval, in); |
| } |
| packedIntsVersion = in.readVInt(); |
| blocksize = in.readVInt(); |
| |
| seekDir(in); |
| |
| // Read directory |
| final int numFields = in.readVInt(); |
| if (numFields < 0) { |
| throw new CorruptIndexException("invalid numFields: " + numFields, in); |
| } |
| // System.out.println("FGR: init seg=" + segment + " div=" + indexDivisor + " nF=" + |
| // numFields); |
| for (int i = 0; i < numFields; i++) { |
| final int field = in.readVInt(); |
| final long numIndexTerms = |
| in.readVInt(); // TODO: change this to a vLong if we fix writer to support > 2B index |
| // terms |
| if (numIndexTerms < 0) { |
| throw new CorruptIndexException("invalid numIndexTerms: " + numIndexTerms, in); |
| } |
| final long termsStart = in.readVLong(); |
| final long indexStart = in.readVLong(); |
| final long packedIndexStart = in.readVLong(); |
| final long packedOffsetsStart = in.readVLong(); |
| if (packedIndexStart < indexStart) { |
| throw new CorruptIndexException( |
| "invalid packedIndexStart: " |
| + packedIndexStart |
| + " indexStart: " |
| + indexStart |
| + "numIndexTerms: " |
| + numIndexTerms, |
| in); |
| } |
| final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field); |
| FieldIndexData previous = |
| fields.put( |
| fieldInfo.name, |
| new FieldIndexData( |
| in, |
| termBytes, |
| indexStart, |
| termsStart, |
| packedIndexStart, |
| packedOffsetsStart, |
| numIndexTerms)); |
| if (previous != null) { |
| throw new CorruptIndexException("duplicate field: " + fieldInfo.name, in); |
| } |
| } |
| success = true; |
| } finally { |
| if (success) { |
| IOUtils.close(in); |
| } else { |
| IOUtils.closeWhileHandlingException(in); |
| } |
| termBytesReader = termBytes.freeze(true); |
| } |
| } |
| |
| private class IndexEnum extends FieldIndexEnum { |
| private final FieldIndexData fieldIndex; |
| private final BytesRef term = new BytesRef(); |
| private long ord; |
| |
| public IndexEnum(FieldIndexData fieldIndex) { |
| this.fieldIndex = fieldIndex; |
| } |
| |
| @Override |
| public BytesRef term() { |
| return term; |
| } |
| |
| @Override |
| public long seek(BytesRef target) { |
| long lo = 0; // binary search |
| long hi = fieldIndex.numIndexTerms - 1; |
| |
| while (hi >= lo) { |
| long mid = (lo + hi) >>> 1; |
| |
| final long offset = fieldIndex.termOffsets.get(mid); |
| final int length = (int) (fieldIndex.termOffsets.get(1 + mid) - offset); |
| termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length); |
| |
| int delta = target.compareTo(term); |
| if (delta < 0) { |
| hi = mid - 1; |
| } else if (delta > 0) { |
| lo = mid + 1; |
| } else { |
| assert mid >= 0; |
| ord = mid * indexInterval; |
| return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(mid); |
| } |
| } |
| |
| if (hi < 0) { |
| assert hi == -1; |
| hi = 0; |
| } |
| |
| final long offset = fieldIndex.termOffsets.get(hi); |
| final int length = (int) (fieldIndex.termOffsets.get(1 + hi) - offset); |
| termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length); |
| |
| ord = hi * indexInterval; |
| return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(hi); |
| } |
| |
| @Override |
| public long next() { |
| final long idx = 1 + (ord / indexInterval); |
| if (idx >= fieldIndex.numIndexTerms) { |
| return -1; |
| } |
| ord += indexInterval; |
| |
| final long offset = fieldIndex.termOffsets.get(idx); |
| final int length = (int) (fieldIndex.termOffsets.get(1 + idx) - offset); |
| termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length); |
| return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(idx); |
| } |
| |
| @Override |
| public long ord() { |
| return ord; |
| } |
| |
| @Override |
| public long seek(long ord) { |
| long idx = ord / indexInterval; |
| // caller must ensure ord is in bounds |
| assert idx < fieldIndex.numIndexTerms; |
| final long offset = fieldIndex.termOffsets.get(idx); |
| final int length = (int) (fieldIndex.termOffsets.get(1 + idx) - offset); |
| termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length); |
| this.ord = idx * indexInterval; |
| return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(idx); |
| } |
| } |
| |
| @Override |
| public boolean supportsOrd() { |
| return true; |
| } |
| |
| private final class FieldIndexData implements Accountable { |
| // where this field's terms begin in the packed byte[] |
| // data |
| final long termBytesStart; |
| |
| // offset into index termBytes |
| final MonotonicBlockPackedReader termOffsets; |
| |
| // index pointers into main terms dict |
| final MonotonicBlockPackedReader termsDictOffsets; |
| |
| final long numIndexTerms; |
| final long termsStart; |
| |
| public FieldIndexData( |
| IndexInput in, |
| PagedBytes termBytes, |
| long indexStart, |
| long termsStart, |
| long packedIndexStart, |
| long packedOffsetsStart, |
| long numIndexTerms) |
| throws IOException { |
| |
| this.termsStart = termsStart; |
| termBytesStart = termBytes.getPointer(); |
| |
| IndexInput clone = in.clone(); |
| clone.seek(indexStart); |
| |
| this.numIndexTerms = numIndexTerms; |
| assert this.numIndexTerms > 0 : "numIndexTerms=" + numIndexTerms; |
| |
| // slurp in the images from disk: |
| |
| try { |
| final long numTermBytes = packedIndexStart - indexStart; |
| termBytes.copy(clone, numTermBytes); |
| |
| // records offsets into main terms dict file |
| termsDictOffsets = |
| MonotonicBlockPackedReader.of( |
| clone, packedIntsVersion, blocksize, numIndexTerms, false); |
| |
| // records offsets into byte[] term data |
| termOffsets = |
| MonotonicBlockPackedReader.of( |
| clone, packedIntsVersion, blocksize, 1 + numIndexTerms, false); |
| } finally { |
| clone.close(); |
| } |
| } |
| |
| @Override |
| public long ramBytesUsed() { |
| return ((termOffsets != null) ? termOffsets.ramBytesUsed() : 0) |
| + ((termsDictOffsets != null) ? termsDictOffsets.ramBytesUsed() : 0); |
| } |
| |
| @Override |
| public Collection<Accountable> getChildResources() { |
| List<Accountable> resources = new ArrayList<>(); |
| if (termOffsets != null) { |
| resources.add(Accountables.namedAccountable("term lengths", termOffsets)); |
| } |
| if (termsDictOffsets != null) { |
| resources.add(Accountables.namedAccountable("offsets", termsDictOffsets)); |
| } |
| return Collections.unmodifiableList(resources); |
| } |
| |
| @Override |
| public String toString() { |
| return "FixedGapTermIndex(indexterms=" + numIndexTerms + ")"; |
| } |
| } |
| |
| @Override |
| public FieldIndexEnum getFieldEnum(FieldInfo fieldInfo) { |
| return new IndexEnum(fields.get(fieldInfo.name)); |
| } |
| |
| @Override |
| public void close() throws IOException {} |
| |
| private void seekDir(IndexInput input) throws IOException { |
| input.seek(input.length() - CodecUtil.footerLength() - 8); |
| long dirOffset = input.readLong(); |
| input.seek(dirOffset); |
| } |
| |
| @Override |
| public long ramBytesUsed() { |
| long sizeInBytes = ((termBytesReader != null) ? termBytesReader.ramBytesUsed() : 0); |
| for (FieldIndexData entry : fields.values()) { |
| sizeInBytes += entry.ramBytesUsed(); |
| } |
| return sizeInBytes; |
| } |
| |
| @Override |
| public Collection<Accountable> getChildResources() { |
| return Accountables.namedAccountables("field", fields); |
| } |
| |
| @Override |
| public String toString() { |
| return getClass().getSimpleName() |
| + "(fields=" |
| + fields.size() |
| + ",interval=" |
| + indexInterval |
| + ")"; |
| } |
| } |