lucene/core/src/java/org/apache/lucene/codecs/PushPostingsWriterBase.java - lucene-solr - Git at Google

 package org.apache.lucene.codecs;

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 import java.io.IOException;

 import org.apache.lucene.index.DocsAndPositionsEnum;
 import org.apache.lucene.index.DocsEnum;
 import org.apache.lucene.index.FieldInfo.IndexOptions;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.FixedBitSet;

 /**
  * Extension of {@link PostingsWriterBase}, adding a push
  * API for writing each element of the postings.  This API
  * is somewhat analogous to an XML SAX API, while {@link
  * PostingsWriterBase} is more like an XML DOM API.
  *
  * @see PostingsReaderBase
  * @lucene.experimental
  */
 // TODO: find a better name; this defines the API that the
 // terms dict impls use to talk to a postings impl.
 // TermsDict + PostingsReader/WriterBase == PostingsConsumer/Producer
 public abstract class PushPostingsWriterBase extends PostingsWriterBase {

   // Enum instances handed back to the TermsEnum on every writeTerm call so
   // they can be recycled instead of reallocated.
   private DocsEnum docsEnum;
   private DocsAndPositionsEnum posEnum;

   // Flags passed to the TermsEnum when pulling postings; recomputed by setField.
   private int enumFlags;

   /** The {@link FieldInfo} most recently passed to {@link #setField(FieldInfo)}. */
   protected FieldInfo fieldInfo;

   /** The {@link IndexOptions} read from the current {@link #fieldInfo}. */
   protected IndexOptions indexOptions;

   /** Whether term frequencies are written for the current field. */
   protected boolean writeFreqs;

   /** Whether positions are written for the current field. */
   protected boolean writePositions;

   /** Whether payloads are written for the current field. */
   protected boolean writePayloads;

   /** Whether offsets are written for the current field. */
   protected boolean writeOffsets;

   /** Sole constructor; does nothing.  Intended to be invoked
    *  (typically implicitly) from subclass constructors. */
   protected PushPostingsWriterBase() {
   }

   /** Invoked once after startup, before any term is added.
    *  Implementations typically write a header to the
    *  supplied {@code termsOut}. */
   public abstract void init(IndexOutput termsOut) throws IOException;

   /** Creates and returns a fresh, empty term state. */
   public abstract BlockTermState newTermState() throws IOException;

   /** Begins a new term.  The matching {@link
    *  #finishTerm(BlockTermState)} call only happens when the
    *  term turns out to have at least one document. */
   public abstract void startTerm() throws IOException;

   /** Ends the current term.  On entry {@code state} carries the
    *  term's {@code docFreq} and {@code totalTermFreq}; the
    *  implementation is expected to add its own metadata to it. */
   public abstract void finishTerm(BlockTermState state) throws IOException;

   /**
    * Encodes a term's metadata into {@code longs} and {@code out}.
    * {@code absolute} selects whether the term is written in full or as a
    * delta against the latest term.  Entries of {@code longs} are usually
    * file pointers and therefore grow with each consumed term, whereas
    * {@code out} receives generic, non-monotonic bytes.
    *
    * NOTE: {@code longs} may hold "don't care" slots, e.g. a postings-list
    * pointer that is undefined for terms whose postings are inlined into the
    * terms dictionary.  For such slots the postings writer should repeat the
    * last value so that every element of {@code longs} stays monotonic.
    */
   public abstract void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState state, boolean absolute) throws IOException;

   /**
    * Switches writing to the given field: records it, derives the
    * {@code write*} booleans from its index options and payload setting,
    * and picks the enum flags used by later {@code writeTerm} calls.
    *
    * @return the fixed per-field length of the {@code long[]} metadata;
    *         this base implementation always returns 0
    */
   // TODO: better name?
   public int setField(FieldInfo fieldInfo) {
     this.fieldInfo = fieldInfo;
     indexOptions = fieldInfo.getIndexOptions();

     writeFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
     writePositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
     writeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
     writePayloads = fieldInfo.hasPayloads();

     final int flags;
     if (!writeFreqs) {
       flags = 0;
     } else if (!writePositions) {
       flags = DocsEnum.FLAG_FREQS;
     } else {
       // Positions enum: request payloads and/or offsets only when the field has them.
       int positionFlags = writePayloads ? DocsAndPositionsEnum.FLAG_PAYLOADS : 0;
       if (writeOffsets) {
         positionFlags |= DocsAndPositionsEnum.FLAG_OFFSETS;
       }
       flags = positionFlags;
     }
     enumFlags = flags;

     return 0;
   }

   /**
    * Walks every document of the current term in {@code termsEnum} and
    * pushes it through {@link #startDoc}, {@link #addPosition} and
    * {@link #finishDoc}, marking each visited document in {@code docsSeen}.
    * The {@code term} argument itself is not read here.
    *
    * @return {@code null} when the term has no documents (in which case
    *         {@link #finishTerm} is not called); otherwise a new term state
    *         holding the document count and, when frequencies are written,
    *         the summed frequency (-1 otherwise)
    */
   @Override
   public final BlockTermState writeTerm(BytesRef term, TermsEnum termsEnum, FixedBitSet docsSeen) throws IOException {
     startTerm();
     if (writePositions) {
       posEnum = termsEnum.docsAndPositions(null, posEnum, enumFlags);
       docsEnum = posEnum;
     } else {
       docsEnum = termsEnum.docs(null, docsEnum, enumFlags);
     }
     assert docsEnum != null;

     int docCount = 0;
     long freqSum = 0;
     for (int doc = docsEnum.nextDoc(); doc != DocsEnum.NO_MORE_DOCS; doc = docsEnum.nextDoc()) {
       docCount++;
       docsSeen.set(doc);

       // -1 signals "frequencies omitted" to startDoc.
       int freq = -1;
       if (writeFreqs) {
         freq = docsEnum.freq();
         freqSum += freq;
       }
       startDoc(doc, freq);

       if (writePositions) {
         pushPositions(freq);
       }

       finishDoc();
     }

     if (docCount == 0) {
       return null;
     }

     BlockTermState state = newTermState();
     state.docFreq = docCount;
     state.totalTermFreq = writeFreqs ? freqSum : -1;
     finishTerm(state);
     return state;
   }

   // Pulls 'freq' positions of the current document from posEnum and forwards each to addPosition.
   private void pushPositions(int freq) throws IOException {
     for (int remaining = freq; remaining > 0; remaining--) {
       final int position = posEnum.nextPosition();
       final BytesRef payload = writePayloads ? posEnum.getPayload() : null;
       int startOffset = -1;
       int endOffset = -1;
       if (writeOffsets) {
         startOffset = posEnum.startOffset();
         endOffset = posEnum.endOffset();
       }
       addPosition(position, payload, startOffset, endOffset);
     }
   }

   /** Adds a new document to the current term.
    * <code>freq</code> is -1 when term frequencies are omitted
    * for the field. */
   public abstract void startDoc(int docID, int freq) throws IOException;

   /** Adds one position, with its payload and start/end offset.
    *  A null payload means no payload, and so does a non-null
    *  payload of zero length.  The caller may reuse the
    *  {@link BytesRef} between calls, so the method must fully
    *  consume the payload. <code>startOffset</code> and
    *  <code>endOffset</code> are -1 when offsets are not indexed. */
   public abstract void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException;

   /** Invoked after all positions and payloads of a
    *  document have been added. */
   public abstract void finishDoc() throws IOException;
 }
	package org.apache.lucene.codecs;

	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	import java.io.IOException;

	import org.apache.lucene.index.DocsAndPositionsEnum;
	import org.apache.lucene.index.DocsEnum;
	import org.apache.lucene.index.FieldInfo.IndexOptions;
	import org.apache.lucene.index.FieldInfo;
	import org.apache.lucene.index.TermsEnum;
	import org.apache.lucene.store.DataOutput;
	import org.apache.lucene.store.IndexOutput;
	import org.apache.lucene.util.BytesRef;
	import org.apache.lucene.util.FixedBitSet;

	/**
	* Extension of {@link PostingsWriterBase}, adding a push
	* API for writing each element of the postings. This API
	* is somewhat analogous to an XML SAX API, while {@link
	* PostingsWriterBase} is more like an XML DOM API.
	*
	* @see PostingsReaderBase
	* @lucene.experimental
	*/
	// TODO: find a better name; this defines the API that the
	// terms dict impls use to talk to a postings impl.
	// TermsDict + PostingsReader/WriterBase == PostingsConsumer/Producer
	public abstract class PushPostingsWriterBase extends PostingsWriterBase {

	// Reused in writeTerm
	private DocsEnum docsEnum;
	private DocsAndPositionsEnum posEnum;
	private int enumFlags;

	/** {@link FieldInfo} of current field being written. */
	protected FieldInfo fieldInfo;

	/** {@link IndexOptions} of current field being
	written */
	protected IndexOptions indexOptions;

	/** True if the current field writes freqs. */
	protected boolean writeFreqs;

	/** True if the current field writes positions. */
	protected boolean writePositions;

	/** True if the current field writes payloads. */
	protected boolean writePayloads;

	/** True if the current field writes offsets. */
	protected boolean writeOffsets;

	/** Sole constructor. (For invocation by subclass
	* constructors, typically implicit.) */
	protected PushPostingsWriterBase() {
	}

	/** Called once after startup, before any terms have been
	* added. Implementations typically write a header to
	* the provided {@code termsOut}. */
	public abstract void init(IndexOutput termsOut) throws IOException;

	/** Return a newly created empty TermState */
	public abstract BlockTermState newTermState() throws IOException;

	/** Start a new term. Note that a matching call to {@link
	* #finishTerm(BlockTermState)} is done, only if the term has at least one
	* document. */
	public abstract void startTerm() throws IOException;

	/** Finishes the current term. The provided {@link
	* BlockTermState} contains the term's summary statistics,
	* and will holds metadata from PBF when returned */
	public abstract void finishTerm(BlockTermState state) throws IOException;

	/**
	* Encode metadata as long[] and byte[]. {@code absolute} controls whether
	* current term is delta encoded according to latest term.
	* Usually elements in {@code longs} are file pointers, so each one always
	* increases when a new term is consumed. {@code out} is used to write generic
	* bytes, which are not monotonic.
	*
	* NOTE: sometimes long[] might contain "don't care" values that are unused, e.g.
	* the pointer to postings list may not be defined for some terms but is defined
	* for others, if it is designed to inline some postings data in term dictionary.
	* In this case, the postings writer should always use the last value, so that each
	* element in metadata long[] remains monotonic.
	*/
	public abstract void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState state, boolean absolute) throws IOException;

	/**
	* Sets the current field for writing, and returns the
	* fixed length of long[] metadata (which is fixed per
	* field), called when the writing switches to another field. */
	// TODO: better name?
	public int setField(FieldInfo fieldInfo) {
	this.fieldInfo = fieldInfo;
	indexOptions = fieldInfo.getIndexOptions();

	writeFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
	writePositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
	writeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
	writePayloads = fieldInfo.hasPayloads();

	if (writeFreqs == false) {
	enumFlags = 0;
	} else if (writePositions == false) {
	enumFlags = DocsEnum.FLAG_FREQS;
	} else if (writeOffsets == false) {
	if (writePayloads) {
	enumFlags = DocsAndPositionsEnum.FLAG_PAYLOADS;
	} else {
	enumFlags = 0;
	}
	} else {
	if (writePayloads) {
	enumFlags = DocsAndPositionsEnum.FLAG_PAYLOADS \| DocsAndPositionsEnum.FLAG_OFFSETS;
	} else {
	enumFlags = DocsAndPositionsEnum.FLAG_OFFSETS;
	}
	}

	return 0;
	}

	@Override
	public final BlockTermState writeTerm(BytesRef term, TermsEnum termsEnum, FixedBitSet docsSeen) throws IOException {
	startTerm();
	if (writePositions == false) {
	docsEnum = termsEnum.docs(null, docsEnum, enumFlags);
	} else {
	posEnum = termsEnum.docsAndPositions(null, posEnum, enumFlags);
	docsEnum = posEnum;
	}
	assert docsEnum != null;

	int docFreq = 0;
	long totalTermFreq = 0;
	while (true) {
	int docID = docsEnum.nextDoc();
	if (docID == DocsEnum.NO_MORE_DOCS) {
	break;
	}
	docFreq++;
	docsSeen.set(docID);
	int freq;
	if (writeFreqs) {
	freq = docsEnum.freq();
	totalTermFreq += freq;
	} else {
	freq = -1;
	}
	startDoc(docID, freq);

	if (writePositions) {
	for(int i=0;i<freq;i++) {
	int pos = posEnum.nextPosition();
	BytesRef payload = writePayloads ? posEnum.getPayload() : null;
	int startOffset;
	int endOffset;
	if (writeOffsets) {
	startOffset = posEnum.startOffset();
	endOffset = posEnum.endOffset();
	} else {
	startOffset = -1;
	endOffset = -1;
	}
	addPosition(pos, payload, startOffset, endOffset);
	}
	}

	finishDoc();
	}

	if (docFreq == 0) {
	return null;
	} else {
	BlockTermState state = newTermState();
	state.docFreq = docFreq;
	state.totalTermFreq = writeFreqs ? totalTermFreq : -1;
	finishTerm(state);
	return state;
	}
	}

	/** Adds a new doc in this term.
	* <code>freq</code> will be -1 when term frequencies are omitted
	* for the field. */
	public abstract void startDoc(int docID, int freq) throws IOException;

	/** Add a new position & payload, and start/end offset. A
	* null payload means no payload; a non-null payload with
	* zero length also means no payload. Caller may reuse
	* the {@link BytesRef} for the payload between calls
	* (method must fully consume the payload). <code>startOffset</code>
	* and <code>endOffset</code> will be -1 when offsets are not indexed. */
	public abstract void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException;

	/** Called when we are done adding positions & payloads
	* for each doc. */
	public abstract void finishDoc() throws IOException;
	}