lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/FixedGapTermsIndexWriter.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.codecs.blockterms;

 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.codecs.TermStats;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.IndexFileNames;
 import org.apache.lucene.index.SegmentWriteState;
 import org.apache.lucene.store.ByteBuffersDataOutput;
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.StringHelper;
 import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
 import org.apache.lucene.util.packed.PackedInts;

 /**
  * Terms index writer that selects every Nth term as an index term, and holds term bytes (mostly)
  * fully expanded in memory. This terms index supports seeking by ord. See {@link
  * VariableGapTermsIndexWriter} for a more memory efficient terms index that does not support
  * seeking by ord.
  *
  * @lucene.experimental
  */
 public class FixedGapTermsIndexWriter extends TermsIndexWriterBase {
   /** Extension of terms index file */
   static final String TERMS_INDEX_EXTENSION = "tii";

   static final String CODEC_NAME = "FixedGapTermsIndex";
   static final int VERSION_START = 4;
   static final int VERSION_CURRENT = VERSION_START;

   /** Block size given to each {@link MonotonicBlockPackedWriter}; also written to the header. */
   static final int BLOCKSIZE = 4096;

   /** Term index interval used by the single-argument constructor. */
   public static final int DEFAULT_TERM_INDEX_INTERVAL = 32;

   /** Output of the terms index file; reset to {@code null} by {@link #close()}. */
   protected IndexOutput out;

   /** Every {@code termIndexInterval}-th term of a field is reported as an index term. */
   private final int termIndexInterval;

   /** One writer per {@link #addField} call, in call order. */
   private final List<SimpleFieldWriter> fields = new ArrayList<>();

   /**
    * Creates a writer using {@link #DEFAULT_TERM_INDEX_INTERVAL}.
    *
    * @param state write state of the segment this index belongs to
    */
   public FixedGapTermsIndexWriter(SegmentWriteState state) throws IOException {
     this(state, DEFAULT_TERM_INDEX_INTERVAL);
   }

   /**
    * Creates the terms index output and writes its header: the codec index header followed by the
    * term index interval, the packed ints version and the block size.
    *
    * @param state write state of the segment this index belongs to
    * @param termIndexInterval gap between index terms; must be positive
    * @throws IllegalArgumentException if {@code termIndexInterval} is zero or negative
    */
   public FixedGapTermsIndexWriter(SegmentWriteState state, int termIndexInterval)
       throws IOException {
     if (termIndexInterval <= 0) {
       throw new IllegalArgumentException("invalid termIndexInterval: " + termIndexInterval);
     }
     this.termIndexInterval = termIndexInterval;
     out =
         state.directory.createOutput(
             IndexFileNames.segmentFileName(
                 state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION),
             state.context);
     boolean headerWritten = false;
     try {
       CodecUtil.writeIndexHeader(
           out, CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
       out.writeVInt(termIndexInterval);
       out.writeVInt(PackedInts.VERSION_CURRENT);
       out.writeVInt(BLOCKSIZE);
       headerWritten = true;
     } finally {
       // do not leave the freshly created output open when the header could not be written
       if (!headerWritten) {
         IOUtils.closeWhileHandlingException(out);
       }
     }
   }

   /** Registers a new per-field writer and returns it. */
   @Override
   public FieldWriter addField(FieldInfo field, long termsFilePointer) {
     final SimpleFieldWriter fieldWriter = new SimpleFieldWriter(field, termsFilePointer);
     fields.add(fieldWriter);
     return fieldWriter;
   }

   /**
    * Returns how many leading bytes of {@code indexedTerm} are written to the index.
    *
    * <p>NOTE: if your codec does not sort in unicode code point order, you must override this
    * method, to simply return indexedTerm.length.
    */
   protected int indexedTermPrefixLength(final BytesRef priorTerm, final BytesRef indexedTerm) {
     // With terms sorted in unicode code point order the suffix that does not distinguish
     // indexedTerm from priorTerm can be dropped, which saves RAM in the loaded terms index.
     return StringHelper.sortKeyLength(priorTerm, indexedTerm);
   }

   /** Writes the index data of a single field directly to the enclosing writer's {@code out}. */
   private class SimpleFieldWriter extends FieldWriter {
     final FieldInfo fieldInfo;
     final long indexStart;
     final long termsStart;
     int numIndexTerms;
     long packedIndexStart;
     long packedOffsetsStart;
     private long numTerms;
     private long currentOffset;

     private ByteBuffersDataOutput offsetsBuffer = ByteBuffersDataOutput.newResettableInstance();
     private MonotonicBlockPackedWriter termOffsets =
         new MonotonicBlockPackedWriter(offsetsBuffer, BLOCKSIZE);

     private ByteBuffersDataOutput addressBuffer = ByteBuffersDataOutput.newResettableInstance();
     private MonotonicBlockPackedWriter termAddresses =
         new MonotonicBlockPackedWriter(addressBuffer, BLOCKSIZE);

     private final BytesRefBuilder lastTerm = new BytesRefBuilder();

     SimpleFieldWriter(FieldInfo fieldInfo, long termsFilePointer) {
       this.fieldInfo = fieldInfo;
       this.indexStart = out.getFilePointer();
       this.termsStart = termsFilePointer;
       // Offsets are cumulative and there is one more of them than index terms (term n spans
       // offset n to offset n+1), so the sequence starts with an explicit 0.
       try {
         termOffsets.add(0L);
       } catch (IOException e) {
         throw new RuntimeException(e);
       }
     }

     /**
      * Counts the term and reports whether it is an index term. The first term, and every {@code
      * termIndexInterval}-th term after it, is an index term.
      */
     @Override
     public boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException {
       final long ordinal = numTerms++;
       if (ordinal % termIndexInterval == 0) {
         return true;
       }
       if (numTerms % termIndexInterval == 0) {
         // This is the term right before the next index term: remember it so the
         // non-distinguishing suffix of that index term can be computed.
         lastTerm.copyBytes(text);
       }
       return false;
     }

     /** Writes the prefix bytes of an index term and records its terms file address and offset. */
     @Override
     public void add(BytesRef text, TermStats stats, long termsFilePointer) throws IOException {
       // the very first index term has no prior term and is written with zero bytes
       final int prefixLength =
           numIndexTerms == 0 ? 0 : indexedTermPrefixLength(lastTerm.get(), text);

       // only the shortest prefix that differs from the prior term is written
       out.writeBytes(text.bytes, text.offset, prefixLength);

       // terms file pointer, stored as a delta from this field's termsStart
       termAddresses.add(termsFilePointer - termsStart);

       // cumulative end offset (in bytes) of the prefix just written
       assert prefixLength <= Short.MAX_VALUE;
       currentOffset += prefixLength;
       termOffsets.add(currentOffset);

       lastTerm.copyBytes(text);
       numIndexTerms++;
     }

     /** Flushes the packed term addresses, then the packed term offsets, to {@code out}. */
     @Override
     public void finish(long termsFilePointer) throws IOException {
       // packed pointers into the primary terms dict
       packedIndexStart = out.getFilePointer();
       termAddresses.finish();
       addressBuffer.copyTo(out);

       // packed offsets into the term bytes written by add()
       packedOffsetsStart = out.getFilePointer();
       termOffsets.finish();
       offsetsBuffer.copyTo(out);

       // This writer stays referenced while other fields are being written, so release the
       // buffers and packed writers instead of tying up their RAM.
       termAddresses = null;
       termOffsets = null;
       addressBuffer = null;
       offsetsBuffer = null;
     }
   }

   /**
    * Writes the directory of all fields that have at least one index term, followed by the
    * trailer and the codec footer, then closes the output. Does nothing if already closed.
    */
   @Override
   public void close() throws IOException {
     if (out == null) {
       return;
     }
     boolean success = false;
     try {
       final long dirStart = out.getFilePointer();

       int nonNullFieldCount = 0;
       for (SimpleFieldWriter fieldWriter : fields) {
         if (fieldWriter.numIndexTerms > 0) {
           nonNullFieldCount++;
         }
       }

       out.writeVInt(nonNullFieldCount);
       for (SimpleFieldWriter fieldWriter : fields) {
         if (fieldWriter.numIndexTerms <= 0) {
           // fields without index terms get no directory entry
           continue;
         }
         out.writeVInt(fieldWriter.fieldInfo.number);
         out.writeVInt(fieldWriter.numIndexTerms);
         out.writeVLong(fieldWriter.termsStart);
         out.writeVLong(fieldWriter.indexStart);
         out.writeVLong(fieldWriter.packedIndexStart);
         out.writeVLong(fieldWriter.packedOffsetsStart);
       }
       writeTrailer(dirStart);
       CodecUtil.writeFooter(out);
       success = true;
     } finally {
       if (success) {
         IOUtils.close(out);
       } else {
         IOUtils.closeWhileHandlingException(out);
       }
       out = null;
     }
   }

   /** Writes the file pointer at which the field directory starts. */
   private void writeTrailer(long dirStart) throws IOException {
     out.writeLong(dirStart);
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.codecs.blockterms;

	import java.io.IOException;
	import java.util.ArrayList;
	import java.util.List;
	import org.apache.lucene.codecs.CodecUtil;
	import org.apache.lucene.codecs.TermStats;
	import org.apache.lucene.index.FieldInfo;
	import org.apache.lucene.index.IndexFileNames;
	import org.apache.lucene.index.SegmentWriteState;
	import org.apache.lucene.store.ByteBuffersDataOutput;
	import org.apache.lucene.store.IndexOutput;
	import org.apache.lucene.util.BytesRef;
	import org.apache.lucene.util.BytesRefBuilder;
	import org.apache.lucene.util.IOUtils;
	import org.apache.lucene.util.StringHelper;
	import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
	import org.apache.lucene.util.packed.PackedInts;

	/**
	 * Terms index writer that selects every Nth term as an index term, and holds term bytes (mostly)
	 * fully expanded in memory. This terms index supports seeking by ord. See {@link
	 * VariableGapTermsIndexWriter} for a more memory efficient terms index that does not support
	 * seeking by ord.
	 *
	 * @lucene.experimental
	 */
	public class FixedGapTermsIndexWriter extends TermsIndexWriterBase {
	  /** Extension of terms index file */
	  static final String TERMS_INDEX_EXTENSION = "tii";

	  static final String CODEC_NAME = "FixedGapTermsIndex";
	  static final int VERSION_START = 4;
	  static final int VERSION_CURRENT = VERSION_START;

	  /** Block size given to each {@link MonotonicBlockPackedWriter}; also written to the header. */
	  static final int BLOCKSIZE = 4096;

	  /** Term index interval used by the single-argument constructor. */
	  public static final int DEFAULT_TERM_INDEX_INTERVAL = 32;

	  /** Output of the terms index file; reset to {@code null} by {@link #close()}. */
	  protected IndexOutput out;

	  /** Every {@code termIndexInterval}-th term of a field is reported as an index term. */
	  private final int termIndexInterval;

	  /** One writer per {@link #addField} call, in call order. */
	  private final List<SimpleFieldWriter> fields = new ArrayList<>();

	  /**
	   * Creates a writer using {@link #DEFAULT_TERM_INDEX_INTERVAL}.
	   *
	   * @param state write state of the segment this index belongs to
	   */
	  public FixedGapTermsIndexWriter(SegmentWriteState state) throws IOException {
	    this(state, DEFAULT_TERM_INDEX_INTERVAL);
	  }

	  /**
	   * Creates the terms index output and writes its header: the codec index header followed by the
	   * term index interval, the packed ints version and the block size.
	   *
	   * @param state write state of the segment this index belongs to
	   * @param termIndexInterval gap between index terms; must be positive
	   * @throws IllegalArgumentException if {@code termIndexInterval} is zero or negative
	   */
	  public FixedGapTermsIndexWriter(SegmentWriteState state, int termIndexInterval)
	      throws IOException {
	    if (termIndexInterval <= 0) {
	      throw new IllegalArgumentException("invalid termIndexInterval: " + termIndexInterval);
	    }
	    this.termIndexInterval = termIndexInterval;
	    out =
	        state.directory.createOutput(
	            IndexFileNames.segmentFileName(
	                state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION),
	            state.context);
	    boolean headerWritten = false;
	    try {
	      CodecUtil.writeIndexHeader(
	          out, CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
	      out.writeVInt(termIndexInterval);
	      out.writeVInt(PackedInts.VERSION_CURRENT);
	      out.writeVInt(BLOCKSIZE);
	      headerWritten = true;
	    } finally {
	      // do not leave the freshly created output open when the header could not be written
	      if (!headerWritten) {
	        IOUtils.closeWhileHandlingException(out);
	      }
	    }
	  }

	  /** Registers a new per-field writer and returns it. */
	  @Override
	  public FieldWriter addField(FieldInfo field, long termsFilePointer) {
	    final SimpleFieldWriter fieldWriter = new SimpleFieldWriter(field, termsFilePointer);
	    fields.add(fieldWriter);
	    return fieldWriter;
	  }

	  /**
	   * Returns how many leading bytes of {@code indexedTerm} are written to the index.
	   *
	   * <p>NOTE: if your codec does not sort in unicode code point order, you must override this
	   * method, to simply return indexedTerm.length.
	   */
	  protected int indexedTermPrefixLength(final BytesRef priorTerm, final BytesRef indexedTerm) {
	    // With terms sorted in unicode code point order the suffix that does not distinguish
	    // indexedTerm from priorTerm can be dropped, which saves RAM in the loaded terms index.
	    return StringHelper.sortKeyLength(priorTerm, indexedTerm);
	  }

	  /** Writes the index data of a single field directly to the enclosing writer's {@code out}. */
	  private class SimpleFieldWriter extends FieldWriter {
	    final FieldInfo fieldInfo;
	    final long indexStart;
	    final long termsStart;
	    int numIndexTerms;
	    long packedIndexStart;
	    long packedOffsetsStart;
	    private long numTerms;
	    private long currentOffset;

	    private ByteBuffersDataOutput offsetsBuffer = ByteBuffersDataOutput.newResettableInstance();
	    private MonotonicBlockPackedWriter termOffsets =
	        new MonotonicBlockPackedWriter(offsetsBuffer, BLOCKSIZE);

	    private ByteBuffersDataOutput addressBuffer = ByteBuffersDataOutput.newResettableInstance();
	    private MonotonicBlockPackedWriter termAddresses =
	        new MonotonicBlockPackedWriter(addressBuffer, BLOCKSIZE);

	    private final BytesRefBuilder lastTerm = new BytesRefBuilder();

	    SimpleFieldWriter(FieldInfo fieldInfo, long termsFilePointer) {
	      this.fieldInfo = fieldInfo;
	      this.indexStart = out.getFilePointer();
	      this.termsStart = termsFilePointer;
	      // Offsets are cumulative and there is one more of them than index terms (term n spans
	      // offset n to offset n+1), so the sequence starts with an explicit 0.
	      try {
	        termOffsets.add(0L);
	      } catch (IOException e) {
	        throw new RuntimeException(e);
	      }
	    }

	    /**
	     * Counts the term and reports whether it is an index term. The first term, and every {@code
	     * termIndexInterval}-th term after it, is an index term.
	     */
	    @Override
	    public boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException {
	      final long ordinal = numTerms++;
	      if (ordinal % termIndexInterval == 0) {
	        return true;
	      }
	      if (numTerms % termIndexInterval == 0) {
	        // This is the term right before the next index term: remember it so the
	        // non-distinguishing suffix of that index term can be computed.
	        lastTerm.copyBytes(text);
	      }
	      return false;
	    }

	    /** Writes the prefix bytes of an index term and records its terms file address and offset. */
	    @Override
	    public void add(BytesRef text, TermStats stats, long termsFilePointer) throws IOException {
	      // the very first index term has no prior term and is written with zero bytes
	      final int prefixLength =
	          numIndexTerms == 0 ? 0 : indexedTermPrefixLength(lastTerm.get(), text);

	      // only the shortest prefix that differs from the prior term is written
	      out.writeBytes(text.bytes, text.offset, prefixLength);

	      // terms file pointer, stored as a delta from this field's termsStart
	      termAddresses.add(termsFilePointer - termsStart);

	      // cumulative end offset (in bytes) of the prefix just written
	      assert prefixLength <= Short.MAX_VALUE;
	      currentOffset += prefixLength;
	      termOffsets.add(currentOffset);

	      lastTerm.copyBytes(text);
	      numIndexTerms++;
	    }

	    /** Flushes the packed term addresses, then the packed term offsets, to {@code out}. */
	    @Override
	    public void finish(long termsFilePointer) throws IOException {
	      // packed pointers into the primary terms dict
	      packedIndexStart = out.getFilePointer();
	      termAddresses.finish();
	      addressBuffer.copyTo(out);

	      // packed offsets into the term bytes written by add()
	      packedOffsetsStart = out.getFilePointer();
	      termOffsets.finish();
	      offsetsBuffer.copyTo(out);

	      // This writer stays referenced while other fields are being written, so release the
	      // buffers and packed writers instead of tying up their RAM.
	      termAddresses = null;
	      termOffsets = null;
	      addressBuffer = null;
	      offsetsBuffer = null;
	    }
	  }

	  /**
	   * Writes the directory of all fields that have at least one index term, followed by the
	   * trailer and the codec footer, then closes the output. Does nothing if already closed.
	   */
	  @Override
	  public void close() throws IOException {
	    if (out == null) {
	      return;
	    }
	    boolean success = false;
	    try {
	      final long dirStart = out.getFilePointer();

	      int nonNullFieldCount = 0;
	      for (SimpleFieldWriter fieldWriter : fields) {
	        if (fieldWriter.numIndexTerms > 0) {
	          nonNullFieldCount++;
	        }
	      }

	      out.writeVInt(nonNullFieldCount);
	      for (SimpleFieldWriter fieldWriter : fields) {
	        if (fieldWriter.numIndexTerms <= 0) {
	          // fields without index terms get no directory entry
	          continue;
	        }
	        out.writeVInt(fieldWriter.fieldInfo.number);
	        out.writeVInt(fieldWriter.numIndexTerms);
	        out.writeVLong(fieldWriter.termsStart);
	        out.writeVLong(fieldWriter.indexStart);
	        out.writeVLong(fieldWriter.packedIndexStart);
	        out.writeVLong(fieldWriter.packedOffsetsStart);
	      }
	      writeTrailer(dirStart);
	      CodecUtil.writeFooter(out);
	      success = true;
	    } finally {
	      if (success) {
	        IOUtils.close(out);
	      } else {
	        IOUtils.closeWhileHandlingException(out);
	      }
	      out = null;
	    }
	  }

	  /** Writes the file pointer at which the field directory starts. */
	  private void writeTrailer(long dirStart) throws IOException {
	    out.writeLong(dirStart);
	  }
	}