| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.codecs.blockterms; |
| |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.Collection; |
| import java.util.Collections; |
| import java.util.HashMap; |
| import java.util.List; |
| |
| import org.apache.lucene.codecs.CodecUtil; |
| import org.apache.lucene.index.CorruptIndexException; |
| import org.apache.lucene.index.FieldInfo; |
| import org.apache.lucene.index.IndexFileNames; |
| import org.apache.lucene.index.SegmentReadState; |
| import org.apache.lucene.store.IndexInput; |
| import org.apache.lucene.util.Accountable; |
| import org.apache.lucene.util.Accountables; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.IOUtils; |
| import org.apache.lucene.util.PagedBytes; |
| import org.apache.lucene.util.packed.MonotonicBlockPackedReader; |
| |
| /** |
| * TermsIndexReader for simple every Nth terms indexes. |
| * |
| * @see FixedGapTermsIndexWriter |
| * @lucene.experimental |
| */ |
public class FixedGapTermsIndexReader extends TermsIndexReaderBase {

  // NOTE: long is overkill here, but we use this in a
  // number of places to multiply out the actual ord, and we
  // will overflow int during those multiplies. So to avoid
  // having to upgrade each multiple to long in multiple
  // places (error prone), we use long here:
  private final long indexInterval;

  // version of the packed-ints encoding used by the monotonic offset readers below
  private final int packedIntsVersion;
  // block size used when decoding the MonotonicBlockPackedReader structures
  private final int blocksize;

  private final static int PAGED_BYTES_BITS = 15;

  // all fields share this single logical byte[]
  private final PagedBytes.Reader termBytesReader;

  // per-field in-RAM index state, keyed by field name
  final HashMap<String,FieldIndexData> fields = new HashMap<>();

  /**
   * Opens the segment's terms-index file, verifies its header and checksum,
   * and eagerly loads every field's index terms and offsets into RAM.
   * The underlying file is closed before this constructor returns; all
   * subsequent access is heap-only.
   *
   * @param state identifies the directory, segment and segment suffix to read
   * @throws IOException if the file cannot be read
   * @throws CorruptIndexException if any header, checksum or directory value is invalid
   */
  public FixedGapTermsIndexReader(SegmentReadState state) throws IOException {
    final PagedBytes termBytes = new PagedBytes(PAGED_BYTES_BITS);

    String fileName = IndexFileNames.segmentFileName(state.segmentInfo.name,
                                                     state.segmentSuffix,
                                                     FixedGapTermsIndexWriter.TERMS_INDEX_EXTENSION);
    final IndexInput in = state.directory.openInput(fileName, state.context);

    boolean success = false;

    try {

      CodecUtil.checkIndexHeader(in, FixedGapTermsIndexWriter.CODEC_NAME,
                                 FixedGapTermsIndexWriter.VERSION_CURRENT,
                                 FixedGapTermsIndexWriter.VERSION_CURRENT,
                                 state.segmentInfo.getId(), state.segmentSuffix);

      // Verify the whole file's checksum up front; we read the entire file
      // eagerly below anyway, so this costs little extra.
      CodecUtil.checksumEntireFile(in);

      indexInterval = in.readVInt();
      if (indexInterval < 1) {
        throw new CorruptIndexException("invalid indexInterval: " + indexInterval, in);
      }
      packedIntsVersion = in.readVInt();
      blocksize = in.readVInt();

      // Jump to the per-field directory stored near the end of the file.
      seekDir(in);

      // Read directory
      final int numFields = in.readVInt();
      if (numFields < 0) {
        throw new CorruptIndexException("invalid numFields: " + numFields, in);
      }
      //System.out.println("FGR: init seg=" + segment + " div=" + indexDivisor + " nF=" + numFields);
      for(int i=0;i<numFields;i++) {
        final int field = in.readVInt();
        final long numIndexTerms = in.readVInt(); // TODO: change this to a vLong if we fix writer to support > 2B index terms
        if (numIndexTerms < 0) {
          throw new CorruptIndexException("invalid numIndexTerms: " + numIndexTerms, in);
        }
        final long termsStart = in.readVLong();
        final long indexStart = in.readVLong();
        final long packedIndexStart = in.readVLong();
        final long packedOffsetsStart = in.readVLong();
        // The raw term bytes live in [indexStart, packedIndexStart), so the
        // packed section can never start before the index section.
        if (packedIndexStart < indexStart) {
          throw new CorruptIndexException("invalid packedIndexStart: " + packedIndexStart + " indexStart: " + indexStart + "numIndexTerms: " + numIndexTerms, in);
        }
        final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
        FieldIndexData previous = fields.put(fieldInfo.name, new FieldIndexData(in, termBytes, indexStart, termsStart, packedIndexStart, packedOffsetsStart, numIndexTerms));
        if (previous != null) {
          throw new CorruptIndexException("duplicate field: " + fieldInfo.name, in);
        }
      }
      success = true;
    } finally {
      if (success) {
        IOUtils.close(in);
      } else {
        IOUtils.closeWhileHandlingException(in);
      }
      // Freeze on both paths: termBytesReader is final and must be assigned
      // exactly once, even when the constructor is about to throw.
      termBytesReader = termBytes.freeze(true);
    }
  }

  /**
   * Enumerates one field's in-RAM index terms, supporting seeking both by
   * term bytes and by ord. The returned file pointers address the main
   * terms dictionary file, not the index file read by this class.
   */
  private class IndexEnum extends FieldIndexEnum {
    private final FieldIndexData fieldIndex;
    // current index term, filled as a slice of the shared termBytesReader
    private final BytesRef term = new BytesRef();
    // ord (in the main terms dict) of the current index term; always a
    // multiple of indexInterval
    private long ord;

    public IndexEnum(FieldIndexData fieldIndex) {
      this.fieldIndex = fieldIndex;
    }

    @Override
    public BytesRef term() {
      return term;
    }

    /**
     * Binary-searches the index terms for the floor of {@code target}
     * (the greatest index term {@code <= target}, clamped to the first
     * index term when target sorts before all of them), positions this
     * enum on it, and returns its terms-dict file pointer.
     */
    @Override
    public long seek(BytesRef target) {
      long lo = 0; // binary search
      long hi = fieldIndex.numIndexTerms - 1;

      while (hi >= lo) {
        long mid = (lo + hi) >>> 1;

        // Term mid's bytes span [offset, offset+length) in the shared byte[];
        // its length is the delta between adjacent monotonic offsets.
        final long offset = fieldIndex.termOffsets.get(mid);
        final int length = (int) (fieldIndex.termOffsets.get(1+mid) - offset);
        termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);

        int delta = target.compareTo(term);
        if (delta < 0) {
          hi = mid - 1;
        } else if (delta > 0) {
          lo = mid + 1;
        } else {
          // exact hit on an index term
          assert mid >= 0;
          ord = mid*indexInterval;
          return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(mid);
        }
      }

      // No exact match: hi now points at the floor; it is -1 only when
      // target sorts before the first index term, in which case we clamp.
      if (hi < 0) {
        assert hi == -1;
        hi = 0;
      }

      final long offset = fieldIndex.termOffsets.get(hi);
      final int length = (int) (fieldIndex.termOffsets.get(1+hi) - offset);
      termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);

      ord = hi*indexInterval;
      return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(hi);
    }

    /**
     * Advances to the next index term and returns its terms-dict file
     * pointer, or -1 if there are no more index terms.
     */
    @Override
    public long next() {
      final long idx = 1 + (ord / indexInterval);
      if (idx >= fieldIndex.numIndexTerms) {
        return -1;
      }
      ord += indexInterval;

      final long offset = fieldIndex.termOffsets.get(idx);
      final int length = (int) (fieldIndex.termOffsets.get(1+idx) - offset);
      termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);
      return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(idx);
    }

    @Override
    public long ord() {
      return ord;
    }

    /**
     * Seeks to the index term covering {@code ord} (rounding down to the
     * nearest multiple of indexInterval) and returns its terms-dict file
     * pointer. The ord must be within this field's term range.
     */
    @Override
    public long seek(long ord) {
      long idx = ord / indexInterval;
      // caller must ensure ord is in bounds
      assert idx < fieldIndex.numIndexTerms;
      final long offset = fieldIndex.termOffsets.get(idx);
      final int length = (int) (fieldIndex.termOffsets.get(1+idx) - offset);
      termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);
      this.ord = idx * indexInterval;
      return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(idx);
    }
  }

  @Override
  public boolean supportsOrd() {
    // ord-based seeking works because index terms are evenly spaced every
    // indexInterval ords
    return true;
  }

  /** Holds the loaded in-RAM index structures for a single field. */
  private final class FieldIndexData implements Accountable {
    // where this field's terms begin in the packed byte[]
    // data
    final long termBytesStart;

    // offset into index termBytes
    final MonotonicBlockPackedReader termOffsets;

    // index pointers into main terms dict
    final MonotonicBlockPackedReader termsDictOffsets;

    final long numIndexTerms;
    final long termsStart;

    /**
     * Loads one field's term bytes and both packed offset structures from
     * the index file into RAM. Reads through a clone of {@code in} so the
     * caller's directory-reading position is left untouched.
     */
    public FieldIndexData(IndexInput in, PagedBytes termBytes, long indexStart, long termsStart, long packedIndexStart, long packedOffsetsStart, long numIndexTerms) throws IOException {

      this.termsStart = termsStart;
      termBytesStart = termBytes.getPointer();

      // clone so seeking here does not disturb the shared directory reader
      IndexInput clone = in.clone();
      clone.seek(indexStart);

      this.numIndexTerms = numIndexTerms;
      assert this.numIndexTerms > 0: "numIndexTerms=" + numIndexTerms;

      // slurp in the images from disk:

      try {
        // raw term bytes occupy [indexStart, packedIndexStart)
        final long numTermBytes = packedIndexStart - indexStart;
        termBytes.copy(clone, numTermBytes);

        // records offsets into main terms dict file
        termsDictOffsets = MonotonicBlockPackedReader.of(clone, packedIntsVersion, blocksize, numIndexTerms, false);

        // records offsets into byte[] term data; one extra entry so a term's
        // length can be computed as the delta of adjacent offsets
        termOffsets = MonotonicBlockPackedReader.of(clone, packedIntsVersion, blocksize, 1+numIndexTerms, false);
      } finally {
        clone.close();
      }
    }

    @Override
    public long ramBytesUsed() {
      // null checks guard against partially-constructed instances observed
      // during accounting after a constructor failure
      return ((termOffsets!=null)? termOffsets.ramBytesUsed() : 0) +
        ((termsDictOffsets!=null)? termsDictOffsets.ramBytesUsed() : 0);
    }

    @Override
    public Collection<Accountable> getChildResources() {
      List<Accountable> resources = new ArrayList<>();
      if (termOffsets != null) {
        resources.add(Accountables.namedAccountable("term lengths", termOffsets));
      }
      if (termsDictOffsets != null) {
        resources.add(Accountables.namedAccountable("offsets", termsDictOffsets));
      }
      return Collections.unmodifiableList(resources);
    }

    @Override
    public String toString() {
      return "FixedGapTermIndex(indexterms=" + numIndexTerms + ")";
    }
  }

  @Override
  public FieldIndexEnum getFieldEnum(FieldInfo fieldInfo) {
    // NOTE(review): if the field has no entry in the map, this wraps a null
    // FieldIndexData and will NPE on first use — presumably callers only pass
    // fields that were indexed; verify against call sites.
    return new IndexEnum(fields.get(fieldInfo.name));
  }

  @Override
  public void close() throws IOException {}
  // close() is a no-op: the index input was already closed in the
  // constructor and all remaining state is heap-resident.

  /**
   * Positions {@code input} at the start of the per-field directory. The
   * directory's file pointer is stored as a fixed-width long immediately
   * before the codec footer.
   */
  private void seekDir(IndexInput input) throws IOException {
    input.seek(input.length() - CodecUtil.footerLength() - 8);
    long dirOffset = input.readLong();
    input.seek(dirOffset);
  }

  @Override
  public long ramBytesUsed() {
    // shared term bytes plus every field's packed offset structures
    long sizeInBytes = ((termBytesReader!=null)? termBytesReader.ramBytesUsed() : 0);
    for(FieldIndexData entry : fields.values()) {
      sizeInBytes += entry.ramBytesUsed();
    }
    return sizeInBytes;
  }

  @Override
  public Collection<Accountable> getChildResources() {
    return Accountables.namedAccountables("field", fields);
  }

  @Override
  public String toString() {
    return getClass().getSimpleName() + "(fields=" + fields.size() + ",interval=" + indexInterval + ")";
  }
}