/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.memory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.Util;

/**
* FST-based term dict, using metadata as FST output.
*
* <p>The FST directly holds the mapping between &lt;term, metadata&gt;.
*
 * <p>Term metadata consists of two parts: 1. term statistics: docFreq, totalTermFreq; 2. a
 * generic byte[], e.g. the file pointer to the postings list and any other information needed
 * by the postings reader.
*
* <p>File:
*
* <ul>
 *   <li><code>.tfp</code>: <a href="#Termdictionary">Term Dictionary</a>
* </ul>
*
* <p><a id="Termdictionary"></a>
*
* <h2>Term Dictionary</h2>
*
 * <p>The .tfp file contains a list of FSTs, one for each field. Each FST maps a term to its
 * corresponding statistics (e.g. docFreq) and metadata (e.g. information for the postings
 * reader, such as the file pointer to its postings list).
*
 * <p>The metadata is separated into two parts:
 *
 * <ul>
 *   <li>Term statistics: docFreq, plus totalTermFreq when frequencies are indexed.
 *   <li>Generic byte array: opaque per-term metadata encoded by the postings writer, such as
 *       file pointers into the postings files.
 * </ul>
*
* File format:
*
* <ul>
 *   <li>TermsDict(.tfp) --&gt; Header, <i>PostingsHeader</i>, FieldSummary, DirOffset
 *   <li>FieldSummary --&gt; NumFields, &lt;FieldNumber, NumTerms, SumTotalTermFreq?, SumDocFreq,
 *       DocCount, TermFST&gt;<sup>NumFields</sup>
* <li>TermFST --&gt; {@link FST FST&lt;TermData&gt;}
 *   <li>TermData --&gt; Flag, BytesSize?, Byte<sup>BytesSize</sup>?, &lt;DocFreq[Same?],
 *       (TotalTermFreq-DocFreq)&gt;?
* <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}
* <li>DirOffset --&gt; {@link DataOutput#writeLong Uint64}
 *   <li>DocFreq, BytesSize, NumFields, FieldNumber, DocCount --&gt; {@link DataOutput#writeVInt
 *       VInt}
 *   <li>TotalTermFreq, NumTerms, SumTotalTermFreq, SumDocFreq --&gt; {@link
 *       DataOutput#writeVLong VLong}
* </ul>
*
* <p>Notes:
*
* <ul>
 *   <li>The format of the PostingsHeader and the generic meta bytes is customized by the
 *       specific postings implementation: they contain arbitrary per-file data (such as
 *       parameters or versioning information) and per-term data (such as pulsed postings data).
 *   <li>The format of TermData is determined by the FST. The byte Flag indicates which parts of
 *       the metadata are present on the current arc; absent parts are simply omitted.
* </ul>
*
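 * <p>For illustration, here is a minimal sketch of how a {@link
 * org.apache.lucene.codecs.PostingsFormat} might construct this writer. The concrete postings
 * writer used below is only an assumption; any {@link PostingsWriterBase} implementation works:
 *
 * <pre class="prettyprint">
 * public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
 *   // Assumed postings implementation; swap in whichever PostingsWriterBase you use.
 *   PostingsWriterBase postingsWriter = new Lucene90PostingsWriter(state);
 *   boolean success = false;
 *   try {
 *     FieldsConsumer ret = new FSTTermsWriter(state, postingsWriter);
 *     success = true;
 *     return ret;
 *   } finally {
 *     if (!success) {
 *       // FSTTermsWriter closes its own output on failure; the postings writer is still ours.
 *       IOUtils.closeWhileHandlingException(postingsWriter);
 *     }
 *   }
 * }
 * </pre>
 *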
* @lucene.experimental
*/
public class FSTTermsWriter extends FieldsConsumer {
static final String TERMS_EXTENSION = "tfp";
static final String TERMS_CODEC_NAME = "FSTTerms";
public static final int TERMS_VERSION_START = 2;
public static final int TERMS_VERSION_CURRENT = TERMS_VERSION_START;

  final PostingsWriterBase postingsWriter;
final FieldInfos fieldInfos;
IndexOutput out;
final int maxDoc;
final List<FieldMetaData> fields = new ArrayList<>();

  public FSTTermsWriter(SegmentWriteState state, PostingsWriterBase postingsWriter)
throws IOException {
final String termsFileName =
IndexFileNames.segmentFileName(
state.segmentInfo.name, state.segmentSuffix, TERMS_EXTENSION);
this.postingsWriter = postingsWriter;
this.fieldInfos = state.fieldInfos;
this.out = state.directory.createOutput(termsFileName, state.context);
this.maxDoc = state.segmentInfo.maxDoc();
boolean success = false;
try {
CodecUtil.writeIndexHeader(
out,
TERMS_CODEC_NAME,
TERMS_VERSION_CURRENT,
state.segmentInfo.getId(),
state.segmentSuffix);
this.postingsWriter.init(out, state);
success = true;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(out);
}
}
}
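
  /**
   * Writes the start offset of the field summary as a fixed-width long, so the reader can seek
   * straight to it from the end of the file.
   */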
private void writeTrailer(IndexOutput out, long dirStart) throws IOException {
out.writeLong(dirStart);
}
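
  /**
   * Builds one FST-based term dictionary per field, delegating each term's postings to the
   * postings writer; the dictionaries themselves are persisted in {@link #close()}.
   */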
@Override
public void write(Fields fields, NormsProducer norms) throws IOException {
for (String field : fields) {
Terms terms = fields.terms(field);
if (terms == null) {
continue;
}
FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
boolean hasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
TermsEnum termsEnum = terms.iterator();
TermsWriter termsWriter = new TermsWriter(fieldInfo);
long sumTotalTermFreq = 0;
long sumDocFreq = 0;
FixedBitSet docsSeen = new FixedBitSet(maxDoc);
while (true) {
BytesRef term = termsEnum.next();
if (term == null) {
break;
}
        // Delegate this term's postings to the postings writer; a null state means the term
        // produced no postings (e.g. all of its documents were deleted).
        BlockTermState termState = postingsWriter.writeTerm(term, termsEnum, docsSeen, norms);
if (termState != null) {
termsWriter.finishTerm(term, termState);
sumTotalTermFreq += termState.totalTermFreq;
sumDocFreq += termState.docFreq;
}
}
termsWriter.finish(hasFreq ? sumTotalTermFreq : -1, sumDocFreq, docsSeen.cardinality());
}
}
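
  /**
   * Writes the field summary (per-field statistics and the term-dictionary FSTs), the directory
   * trailer and the codec footer, then closes the output and the postings writer.
   */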
@Override
public void close() throws IOException {
if (out != null) {
boolean success = false;
try {
// write field summary
final long dirStart = out.getFilePointer();
out.writeVInt(fields.size());
for (FieldMetaData field : fields) {
out.writeVInt(field.fieldInfo.number);
out.writeVLong(field.numTerms);
if (field.fieldInfo.getIndexOptions() != IndexOptions.DOCS) {
out.writeVLong(field.sumTotalTermFreq);
}
out.writeVLong(field.sumDocFreq);
out.writeVInt(field.docCount);
field.dict.save(out, out);
}
writeTrailer(out, dirStart);
CodecUtil.writeFooter(out);
success = true;
} finally {
if (success) {
IOUtils.close(out, postingsWriter);
} else {
IOUtils.closeWhileHandlingException(out, postingsWriter);
}
out = null;
}
}
}
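
  /** Per-field statistics and the compiled term-dictionary FST, buffered until the writer is closed. */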
private static class FieldMetaData {
public final FieldInfo fieldInfo;
public final long numTerms;
public final long sumTotalTermFreq;
public final long sumDocFreq;
public final int docCount;
public final FST<FSTTermOutputs.TermData> dict;

    public FieldMetaData(
FieldInfo fieldInfo,
long numTerms,
long sumTotalTermFreq,
long sumDocFreq,
int docCount,
FST<FSTTermOutputs.TermData> fst) {
this.fieldInfo = fieldInfo;
this.numTerms = numTerms;
this.sumTotalTermFreq = sumTotalTermFreq;
this.sumDocFreq = sumDocFreq;
this.docCount = docCount;
this.dict = fst;
}
}
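
  /** Builds the FST for a single field, adding one &lt;term, metadata&gt; entry per term. */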
final class TermsWriter {
private final FSTCompiler<FSTTermOutputs.TermData> fstCompiler;
private final FSTTermOutputs outputs;
private final FieldInfo fieldInfo;
private long numTerms;
private final IntsRefBuilder scratchTerm = new IntsRefBuilder();
private final ByteBuffersDataOutput metaWriter = ByteBuffersDataOutput.newResettableInstance();

    TermsWriter(FieldInfo fieldInfo) {
this.numTerms = 0;
this.fieldInfo = fieldInfo;
postingsWriter.setField(fieldInfo);
this.outputs = new FSTTermOutputs(fieldInfo);
this.fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
}

    public void finishTerm(BytesRef text, BlockTermState state) throws IOException {
      // write term metadata into the FST
final FSTTermOutputs.TermData meta = new FSTTermOutputs.TermData();
meta.bytes = null;
meta.docFreq = state.docFreq;
meta.totalTermFreq = state.totalTermFreq;
      // Have the postings writer encode this term's metadata; 'true' requests an absolute
      // (self-contained) encoding, since FST lookups are random-access.
      postingsWriter.encodeTerm(metaWriter, fieldInfo, state, true);
if (metaWriter.size() > 0) {
meta.bytes = metaWriter.toArrayCopy();
metaWriter.reset();
}
fstCompiler.add(Util.toIntsRef(text, scratchTerm), meta);
numTerms++;
}

    public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
// save FST dict
if (numTerms > 0) {
final FST<FSTTermOutputs.TermData> fst = fstCompiler.compile();
fields.add(
new FieldMetaData(fieldInfo, numTerms, sumTotalTermFreq, sumDocFreq, docCount, fst));
}
}
}
}