lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexWriter.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.codecs.blockterms;


 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;

 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.codecs.TermStats;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.IndexFileNames;
 import org.apache.lucene.index.SegmentWriteState;
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.IntsRefBuilder;
 import org.apache.lucene.util.fst.Builder;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.PositiveIntOutputs;
 import org.apache.lucene.util.fst.Util;

 /**
  * Selects index terms according to provided pluggable
  * {@link IndexTermSelector}, and stores them in a prefix trie that's
  * loaded entirely in RAM stored as an FST.  This terms
  * index only supports unsigned byte term sort order
  * (unicode codepoint order when the bytes are UTF8).
  *
  * @lucene.experimental */
 public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
   protected IndexOutput out;

   /** Extension of terms index file */
   static final String TERMS_INDEX_EXTENSION = "tiv";

   final static String CODEC_NAME = "VariableGapTermsIndex";
   final static int VERSION_START = 3;
   final static int VERSION_CURRENT = VERSION_START;

   private final List<FSTFieldWriter> fields = new ArrayList<>();

   @SuppressWarnings("unused") private final FieldInfos fieldInfos; // unread
   private final IndexTermSelector policy;

   /**
    * Hook for selecting which terms should be placed in the terms index.
    * <p>
    * {@link #newField} is called at the start of each new field, and
    * {@link #isIndexTerm} for each term in that field.
    *
    * @lucene.experimental
    */
   public static abstract class IndexTermSelector {
     /**
      * Called sequentially on every term being written,
      * returning true if this term should be indexed
      */
     public abstract boolean isIndexTerm(BytesRef term, TermStats stats);
     /**
      * Called when a new field is started.
      */
     public abstract void newField(FieldInfo fieldInfo);
   }

   /** Same policy as {@link FixedGapTermsIndexWriter} */
   public static final class EveryNTermSelector extends IndexTermSelector {
     private int count;
     private final int interval;

     public EveryNTermSelector(int interval) {
       this.interval = interval;
       // First term is first indexed term:
       count = interval;
     }

     @Override
     public boolean isIndexTerm(BytesRef term, TermStats stats) {
       if (count >= interval) {
         count = 1;
         return true;
       } else {
         count++;
         return false;
       }
     }

     @Override
     public void newField(FieldInfo fieldInfo) {
       count = interval;
     }
   }

   /** Sets an index term when docFreq &gt;= docFreqThresh, or
    *  every interval terms.  This should reduce seek time
    *  to high docFreq terms.  */
   public static final class EveryNOrDocFreqTermSelector extends IndexTermSelector {
     private int count;
     private final int docFreqThresh;
     private final int interval;

     public EveryNOrDocFreqTermSelector(int docFreqThresh, int interval) {
       this.interval = interval;
       this.docFreqThresh = docFreqThresh;

       // First term is first indexed term:
       count = interval;
     }

     @Override
     public boolean isIndexTerm(BytesRef term, TermStats stats) {
       if (stats.docFreq >= docFreqThresh || count >= interval) {
         count = 1;
         return true;
       } else {
         count++;
         return false;
       }
     }

     @Override
     public void newField(FieldInfo fieldInfo) {
       count = interval;
     }
   }

   // TODO: it'd be nice to let the FST builder prune based
   // on term count of each node (the prune1/prune2 that it
   // accepts), and build the index based on that.  This
   // should result in a more compact terms index, more like
   // a prefix trie than the other selectors, because it
   // only stores enough leading bytes to get down to N
   // terms that may complete that prefix.  It becomes
   // "deeper" when terms are dense, and "shallow" when they
   // are less dense.
   //
   // However, it's not easy to make that work this this
   // API, because that pruning doesn't immediately know on
   // seeing each term whether that term will be a seek point
   // or not.  It requires some non-causality in the API, ie
   // only on seeing some number of future terms will the
   // builder decide which past terms are seek points.
   // Somehow the API'd need to be able to return a "I don't
   // know" value, eg like a Future, which only later on is
   // flipped (frozen) to true or false.
   //
   // We could solve this with a 2-pass approach, where the
   // first pass would build an FSA (no outputs) solely to
   // determine which prefixes are the 'leaves' in the
   // pruning. The 2nd pass would then look at this prefix
   // trie to mark the seek points and build the FST mapping
   // to the true output.
   //
   // But, one downside to this approach is that it'd result
   // in uneven index term selection.  EG with prune1=10, the
   // resulting index terms could be as frequent as every 10
   // terms or as rare as every <maxArcCount> * 10 (eg 2560),
   // in the extremes.

   public VariableGapTermsIndexWriter(SegmentWriteState state, IndexTermSelector policy) throws IOException {
     final String indexFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION);
     out = state.directory.createOutput(indexFileName, state.context);
     boolean success = false;
     try {
       fieldInfos = state.fieldInfos;
       this.policy = policy;
       CodecUtil.writeIndexHeader(out, CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
       success = true;
     } finally {
       if (!success) {
         IOUtils.closeWhileHandlingException(out);
       }
     }
   }

   @Override
   public FieldWriter addField(FieldInfo field, long termsFilePointer) throws IOException {
     ////System.out.println("VGW: field=" + field.name);
     policy.newField(field);
     FSTFieldWriter writer = new FSTFieldWriter(field, termsFilePointer);
     fields.add(writer);
     return writer;
   }

   /** NOTE: if your codec does not sort in unicode code
    *  point order, you must override this method, to simply
    *  return indexedTerm.length. */
   protected int indexedTermPrefixLength(final BytesRef priorTerm, final BytesRef indexedTerm) {
     // As long as codec sorts terms in unicode codepoint
     // order, we can safely strip off the non-distinguishing
     // suffix to save RAM in the loaded terms index.
     final int idxTermOffset = indexedTerm.offset;
     final int priorTermOffset = priorTerm.offset;
     final int limit = Math.min(priorTerm.length, indexedTerm.length);
     for(int byteIdx=0;byteIdx<limit;byteIdx++) {
       if (priorTerm.bytes[priorTermOffset+byteIdx] != indexedTerm.bytes[idxTermOffset+byteIdx]) {
         return byteIdx+1;
       }
     }
     return Math.min(1+priorTerm.length, indexedTerm.length);
   }

   private class FSTFieldWriter extends FieldWriter {
     private final Builder<Long> fstBuilder;
     private final PositiveIntOutputs fstOutputs;
     private final long startTermsFilePointer;

     final FieldInfo fieldInfo;
     FST<Long> fst;
     final long indexStart;

     private final BytesRefBuilder lastTerm = new BytesRefBuilder();
     private boolean first = true;

     public FSTFieldWriter(FieldInfo fieldInfo, long termsFilePointer) throws IOException {
       this.fieldInfo = fieldInfo;
       fstOutputs = PositiveIntOutputs.getSingleton();
       fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, fstOutputs);
       indexStart = out.getFilePointer();
       ////System.out.println("VGW: field=" + fieldInfo.name);

       // Always put empty string in
       fstBuilder.add(new IntsRef(), termsFilePointer);
       startTermsFilePointer = termsFilePointer;
     }

     @Override
     public boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException {
       //System.out.println("VGW: index term=" + text.utf8ToString());
       // NOTE: we must force the first term per field to be
       // indexed, in case policy doesn't:
       if (policy.isIndexTerm(text, stats) || first) {
         first = false;
         //System.out.println("  YES");
         return true;
       } else {
         lastTerm.copyBytes(text);
         return false;
       }
     }

     private final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();

     @Override
     public void add(BytesRef text, TermStats stats, long termsFilePointer) throws IOException {
       if (text.length == 0) {
         // We already added empty string in ctor
         assert termsFilePointer == startTermsFilePointer;
         return;
       }
       final int lengthSave = text.length;
       text.length = indexedTermPrefixLength(lastTerm.get(), text);
       try {
         fstBuilder.add(Util.toIntsRef(text, scratchIntsRef), termsFilePointer);
       } finally {
         text.length = lengthSave;
       }
       lastTerm.copyBytes(text);
     }

     @Override
     public void finish(long termsFilePointer) throws IOException {
       fst = fstBuilder.finish();
       if (fst != null) {
         fst.save(out, out);
       }
     }
   }

   @Override
   public void close() throws IOException {
     if (out != null) {
       try {
         final long dirStart = out.getFilePointer();
         final int fieldCount = fields.size();

         int nonNullFieldCount = 0;
         for(int i=0;i<fieldCount;i++) {
           FSTFieldWriter field = fields.get(i);
           if (field.fst != null) {
             nonNullFieldCount++;
           }
         }

         out.writeVInt(nonNullFieldCount);
         for(int i=0;i<fieldCount;i++) {
           FSTFieldWriter field = fields.get(i);
           if (field.fst != null) {
             out.writeVInt(field.fieldInfo.number);
             out.writeVLong(field.indexStart);
           }
         }
         writeTrailer(dirStart);
         CodecUtil.writeFooter(out);
       } finally {
         out.close();
         out = null;
       }
     }
   }

   private void writeTrailer(long dirStart) throws IOException {
     out.writeLong(dirStart);
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.codecs.blockterms;


	import java.io.IOException;
	import java.util.ArrayList;
	import java.util.List;

	import org.apache.lucene.codecs.CodecUtil;
	import org.apache.lucene.codecs.TermStats;
	import org.apache.lucene.index.FieldInfo;
	import org.apache.lucene.index.FieldInfos;
	import org.apache.lucene.index.IndexFileNames;
	import org.apache.lucene.index.SegmentWriteState;
	import org.apache.lucene.store.IndexOutput;
	import org.apache.lucene.util.BytesRef;
	import org.apache.lucene.util.BytesRefBuilder;
	import org.apache.lucene.util.IOUtils;
	import org.apache.lucene.util.IntsRef;
	import org.apache.lucene.util.IntsRefBuilder;
	import org.apache.lucene.util.fst.Builder;
	import org.apache.lucene.util.fst.FST;
	import org.apache.lucene.util.fst.PositiveIntOutputs;
	import org.apache.lucene.util.fst.Util;

	/**
	* Selects index terms according to provided pluggable
	* {@link IndexTermSelector}, and stores them in a prefix trie that's
	* loaded entirely in RAM stored as an FST. This terms
	* index only supports unsigned byte term sort order
	* (unicode codepoint order when the bytes are UTF8).
	*
	* @lucene.experimental */
	public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
	protected IndexOutput out;

	/** Extension of terms index file */
	static final String TERMS_INDEX_EXTENSION = "tiv";

	final static String CODEC_NAME = "VariableGapTermsIndex";
	final static int VERSION_START = 3;
	final static int VERSION_CURRENT = VERSION_START;

	private final List<FSTFieldWriter> fields = new ArrayList<>();

	@SuppressWarnings("unused") private final FieldInfos fieldInfos; // unread
	private final IndexTermSelector policy;

	/**
	* Hook for selecting which terms should be placed in the terms index.
	* <p>
	* {@link #newField} is called at the start of each new field, and
	* {@link #isIndexTerm} for each term in that field.
	*
	* @lucene.experimental
	*/
	public static abstract class IndexTermSelector {
	/**
	* Called sequentially on every term being written,
	* returning true if this term should be indexed
	*/
	public abstract boolean isIndexTerm(BytesRef term, TermStats stats);
	/**
	* Called when a new field is started.
	*/
	public abstract void newField(FieldInfo fieldInfo);
	}

	/** Same policy as {@link FixedGapTermsIndexWriter} */
	public static final class EveryNTermSelector extends IndexTermSelector {
	private int count;
	private final int interval;

	public EveryNTermSelector(int interval) {
	this.interval = interval;
	// First term is first indexed term:
	count = interval;
	}

	@Override
	public boolean isIndexTerm(BytesRef term, TermStats stats) {
	if (count >= interval) {
	count = 1;
	return true;
	} else {
	count++;
	return false;
	}
	}

	@Override
	public void newField(FieldInfo fieldInfo) {
	count = interval;
	}
	}

	/** Sets an index term when docFreq >= docFreqThresh, or
	* every interval terms. This should reduce seek time
	* to high docFreq terms. */
	public static final class EveryNOrDocFreqTermSelector extends IndexTermSelector {
	private int count;
	private final int docFreqThresh;
	private final int interval;

	public EveryNOrDocFreqTermSelector(int docFreqThresh, int interval) {
	this.interval = interval;
	this.docFreqThresh = docFreqThresh;

	// First term is first indexed term:
	count = interval;
	}

	@Override
	public boolean isIndexTerm(BytesRef term, TermStats stats) {
	if (stats.docFreq >= docFreqThresh \|\| count >= interval) {
	count = 1;
	return true;
	} else {
	count++;
	return false;
	}
	}

	@Override
	public void newField(FieldInfo fieldInfo) {
	count = interval;
	}
	}

	// TODO: it'd be nice to let the FST builder prune based
	// on term count of each node (the prune1/prune2 that it
	// accepts), and build the index based on that. This
	// should result in a more compact terms index, more like
	// a prefix trie than the other selectors, because it
	// only stores enough leading bytes to get down to N
	// terms that may complete that prefix. It becomes
	// "deeper" when terms are dense, and "shallow" when they
	// are less dense.
	//
	// However, it's not easy to make that work this this
	// API, because that pruning doesn't immediately know on
	// seeing each term whether that term will be a seek point
	// or not. It requires some non-causality in the API, ie
	// only on seeing some number of future terms will the
	// builder decide which past terms are seek points.
	// Somehow the API'd need to be able to return a "I don't
	// know" value, eg like a Future, which only later on is
	// flipped (frozen) to true or false.
	//
	// We could solve this with a 2-pass approach, where the
	// first pass would build an FSA (no outputs) solely to
	// determine which prefixes are the 'leaves' in the
	// pruning. The 2nd pass would then look at this prefix
	// trie to mark the seek points and build the FST mapping
	// to the true output.
	//
	// But, one downside to this approach is that it'd result
	// in uneven index term selection. EG with prune1=10, the
	// resulting index terms could be as frequent as every 10
	// terms or as rare as every <maxArcCount> * 10 (eg 2560),
	// in the extremes.

	public VariableGapTermsIndexWriter(SegmentWriteState state, IndexTermSelector policy) throws IOException {
	final String indexFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION);
	out = state.directory.createOutput(indexFileName, state.context);
	boolean success = false;
	try {
	fieldInfos = state.fieldInfos;
	this.policy = policy;
	CodecUtil.writeIndexHeader(out, CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
	success = true;
	} finally {
	if (!success) {
	IOUtils.closeWhileHandlingException(out);
	}
	}
	}

	@Override
	public FieldWriter addField(FieldInfo field, long termsFilePointer) throws IOException {
	////System.out.println("VGW: field=" + field.name);
	policy.newField(field);
	FSTFieldWriter writer = new FSTFieldWriter(field, termsFilePointer);
	fields.add(writer);
	return writer;
	}

	/** NOTE: if your codec does not sort in unicode code
	* point order, you must override this method, to simply
	* return indexedTerm.length. */
	protected int indexedTermPrefixLength(final BytesRef priorTerm, final BytesRef indexedTerm) {
	// As long as codec sorts terms in unicode codepoint
	// order, we can safely strip off the non-distinguishing
	// suffix to save RAM in the loaded terms index.
	final int idxTermOffset = indexedTerm.offset;
	final int priorTermOffset = priorTerm.offset;
	final int limit = Math.min(priorTerm.length, indexedTerm.length);
	for(int byteIdx=0;byteIdx<limit;byteIdx++) {
	if (priorTerm.bytes[priorTermOffset+byteIdx] != indexedTerm.bytes[idxTermOffset+byteIdx]) {
	return byteIdx+1;
	}
	}
	return Math.min(1+priorTerm.length, indexedTerm.length);
	}

	private class FSTFieldWriter extends FieldWriter {
	private final Builder<Long> fstBuilder;
	private final PositiveIntOutputs fstOutputs;
	private final long startTermsFilePointer;

	final FieldInfo fieldInfo;
	FST<Long> fst;
	final long indexStart;

	private final BytesRefBuilder lastTerm = new BytesRefBuilder();
	private boolean first = true;

	public FSTFieldWriter(FieldInfo fieldInfo, long termsFilePointer) throws IOException {
	this.fieldInfo = fieldInfo;
	fstOutputs = PositiveIntOutputs.getSingleton();
	fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, fstOutputs);
	indexStart = out.getFilePointer();
	////System.out.println("VGW: field=" + fieldInfo.name);

	// Always put empty string in
	fstBuilder.add(new IntsRef(), termsFilePointer);
	startTermsFilePointer = termsFilePointer;
	}

	@Override
	public boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException {
	//System.out.println("VGW: index term=" + text.utf8ToString());
	// NOTE: we must force the first term per field to be
	// indexed, in case policy doesn't:
	if (policy.isIndexTerm(text, stats) \|\| first) {
	first = false;
	//System.out.println(" YES");
	return true;
	} else {
	lastTerm.copyBytes(text);
	return false;
	}
	}

	private final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();

	@Override
	public void add(BytesRef text, TermStats stats, long termsFilePointer) throws IOException {
	if (text.length == 0) {
	// We already added empty string in ctor
	assert termsFilePointer == startTermsFilePointer;
	return;
	}
	final int lengthSave = text.length;
	text.length = indexedTermPrefixLength(lastTerm.get(), text);
	try {
	fstBuilder.add(Util.toIntsRef(text, scratchIntsRef), termsFilePointer);
	} finally {
	text.length = lengthSave;
	}
	lastTerm.copyBytes(text);
	}

	@Override
	public void finish(long termsFilePointer) throws IOException {
	fst = fstBuilder.finish();
	if (fst != null) {
	fst.save(out, out);
	}
	}
	}

	@Override
	public void close() throws IOException {
	if (out != null) {
	try {
	final long dirStart = out.getFilePointer();
	final int fieldCount = fields.size();

	int nonNullFieldCount = 0;
	for(int i=0;i<fieldCount;i++) {
	FSTFieldWriter field = fields.get(i);
	if (field.fst != null) {
	nonNullFieldCount++;
	}
	}

	out.writeVInt(nonNullFieldCount);
	for(int i=0;i<fieldCount;i++) {
	FSTFieldWriter field = fields.get(i);
	if (field.fst != null) {
	out.writeVInt(field.fieldInfo.number);
	out.writeVLong(field.indexStart);
	}
	}
	writeTrailer(dirStart);
	CodecUtil.writeFooter(out);
	} finally {
	out.close();
	out = null;
	}
	}
	}

	private void writeTrailer(long dirStart) throws IOException {
	out.writeLong(dirStart);
	}
	}