lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java - lucene-solr - Git at Google

 package org.apache.lucene.index;

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.index.FieldInfo.IndexOptions;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.RamUsageEstimator;

 // TODO: break into separate freq and prox writers as
 // codecs; make separate container (tii/tis/skip/*) that can
 // be configured as any number of files 1..N
 /**
  * Per-field consumer that buffers postings for the terms of one field.
  *
  * <p>Doc codes and term frequencies are written to stream 0 of the
  * {@link TermsHashPerField}; positions, payloads and offsets (when enabled)
  * are written to stream 1.  Which pieces are recorded is derived from the
  * field's {@link IndexOptions} in {@link #setIndexOptions}.
  *
  * <p>Instances order themselves by field name (see {@link #compareTo}).
  */
 final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implements Comparable<FreqProxTermsWriterPerField> {

   final FreqProxTermsWriter parent;
   final TermsHashPerField termsHashPerField;
   final FieldInfo fieldInfo;
   // Both of these are taken from termsHashPerField in the constructor:
   final DocumentsWriterPerThread.DocState docState;
   final FieldInvertState fieldState;
   // Derived from the field's IndexOptions, see setIndexOptions:
   boolean hasFreq;
   boolean hasProx;
   boolean hasOffsets;
   // Re-resolved in start(IndexableField); either may be null:
   PayloadAttribute payloadAttribute;
   OffsetAttribute offsetAttribute;
   // Running sum of fieldState.length, accumulated in finish():
   long sumTotalTermFreq;
   // Running sum of fieldState.uniqueTermCount, accumulated in finish():
   long sumDocFreq;

   // How many docs have this field:
   int docCount;

   /**
    * Creates the per-field writer, sharing the doc and field state of the
    * given {@code termsHashPerField} and deriving the freq/prox/offsets flags
    * from {@code fieldInfo}'s index options.
    */
   public FreqProxTermsWriterPerField(TermsHashPerField termsHashPerField, FreqProxTermsWriter parent, FieldInfo fieldInfo) {
     this.termsHashPerField = termsHashPerField;
     this.parent = parent;
     this.fieldInfo = fieldInfo;
     docState = termsHashPerField.docState;
     fieldState = termsHashPerField.fieldState;
     setIndexOptions(fieldInfo.getIndexOptions());
   }

   /**
    * Returns the number of streams this consumer writes: one (doc/freq only)
    * when positions are not recorded, otherwise two (doc/freq plus prox).
    */
   @Override
   int getStreamCount() {
     if (!hasProx) {
       return 1;
     } else {
       return 2;
     }
   }

   /**
    * Folds the current {@code fieldState} statistics into the running totals
    * and, if any non-empty payload was written, marks the field as storing
    * payloads.
    */
   @Override
   void finish() {
     sumDocFreq += fieldState.uniqueTermCount;
     sumTotalTermFreq += fieldState.length;
     // Only count the field when at least one token was seen:
     if (fieldState.length > 0) {
       docCount++;
     }

     if (hasPayloads) {
       fieldInfo.setStorePayloads();
     }
   }

   // Set to true by writeProx the first time a non-empty payload is written:
   boolean hasPayloads;

   /** Intentionally a no-op. */
   @Override
   void skippingLongTerm() {}

   /** Orders writers by the name of the field they handle. */
   @Override
   public int compareTo(FreqProxTermsWriterPerField other) {
     return fieldInfo.name.compareTo(other.fieldInfo.name);
   }

   /**
    * Derives {@link #hasFreq}, {@link #hasProx} and {@link #hasOffsets} from
    * the given options.  Each flag is true when {@code indexOptions} compares
    * greater than or equal to the corresponding enum constant; a
    * {@code null} value enables all three.
    */
   private void setIndexOptions(IndexOptions indexOptions) {
     if (indexOptions == null) {
       // field could later be updated with indexed=true, so set everything on
       hasFreq = hasProx = hasOffsets = true;
     } else {
       hasFreq = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
       hasProx = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
       hasOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
     }
   }

   /** Always returns {@code true}; both arguments are ignored. */
   @Override
   boolean start(IndexableField[] fields, int count) {
     return true;
   }

   /**
    * Resolves the attributes read while writing postings.  The payload
    * attribute is picked up only if the attribute source already has one; the
    * offset attribute is added to the source when offsets are recorded.
    * The argument {@code f} is not used.
    */
   @Override
   void start(IndexableField f) {
     if (fieldState.attributeSource.hasAttribute(PayloadAttribute.class)) {
       payloadAttribute = fieldState.attributeSource.getAttribute(PayloadAttribute.class);
     } else {
       payloadAttribute = null;
     }
     if (hasOffsets) {
       offsetAttribute = fieldState.attributeSource.addAttribute(OffsetAttribute.class);
     } else {
       offsetAttribute = null;
     }
   }

   /**
    * Writes one position entry for {@code termID} to stream 1.
    *
    * <p>{@code proxCode} is shifted left by one bit; the low bit is set when
    * a non-empty payload follows, in which case the payload length and bytes
    * are written right after.  Callers pass either the absolute position or
    * the delta from the term's last position.
    */
   void writeProx(final int termID, int proxCode) {
     //System.out.println("writeProx termID=" + termID + " proxCode=" + proxCode);
     assert hasProx;
     final BytesRef payload;
     if (payloadAttribute == null) {
       payload = null;
     } else {
       payload = payloadAttribute.getPayload();
     }

     if (payload != null && payload.length > 0) {
       // Low bit set: payload length + bytes follow
       termsHashPerField.writeVInt(1, (proxCode<<1)|1);
       termsHashPerField.writeVInt(1, payload.length);
       termsHashPerField.writeBytes(1, payload.bytes, payload.offset, payload.length);
       hasPayloads = true;
     } else {
       // Low bit clear: no payload for this position
       termsHashPerField.writeVInt(1, proxCode<<1);
     }

     // Remember the absolute position so the next occurrence can be delta-coded
     FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray;
     postings.lastPositions[termID] = fieldState.position;
   }

   /**
    * Writes one offset entry for {@code termID} to stream 1: the start offset
    * as a delta from the term's previously recorded start offset, followed by
    * the length ({@code endOffset - startOffset}).
    *
    * @param offsetAccum base added to the offset attribute's start and end
    *        offsets (callers pass {@code fieldState.offset})
    */
   void writeOffsets(final int termID, int offsetAccum) {
     assert hasOffsets;
     final int startOffset = offsetAccum + offsetAttribute.startOffset();
     final int endOffset = offsetAccum + offsetAttribute.endOffset();
     //System.out.println("writeOffsets termID=" + termID + " prevOffset=" + prevOffset + " startOff=" + startOffset + " endOff=" + endOffset);
     FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray;
     assert startOffset - postings.lastOffsets[termID] >= 0;
     termsHashPerField.writeVInt(1, startOffset - postings.lastOffsets[termID]);
     termsHashPerField.writeVInt(1, endOffset - startOffset);

     postings.lastOffsets[termID] = startOffset;
   }

   /**
    * Initializes the per-term state for a term not yet seen since the last
    * flush.  Nothing is written to stream 0 here; the doc code is only
    * stashed in {@code lastDocCodes} and written by a later {@link #addTerm}
    * once the term shows up in a different document.
    */
   @Override
   void newTerm(final int termID) {
     // First time we're seeing this term since the last
     // flush
     assert docState.testPoint("FreqProxTermsWriterPerField.newTerm start");

     FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray;
     postings.lastDocIDs[termID] = docState.docID;
     if (!hasFreq) {
       postings.lastDocCodes[termID] = docState.docID;
     } else {
       // Shifted to leave the low bit free; addTerm sets it when freq == 1
       postings.lastDocCodes[termID] = docState.docID << 1;
       postings.termFreqs[termID] = 1;
       if (hasProx) {
         writeProx(termID, fieldState.position);
         if (hasOffsets) {
           writeOffsets(termID, fieldState.offset);
         }
       } else {
         assert !hasOffsets;
       }
     }
     fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
     fieldState.uniqueTermCount++;
   }

   /**
    * Records another occurrence of a term that already has per-term state.
    *
    * <p>Three cases are handled: (1) frequencies are not tracked, so only a
    * change of document writes the pending doc code; (2) the term is seen for
    * the first time in the current document, so the pending doc code (and
    * frequency) of the previous document is written and the state is reset
    * for the current one; (3) the term repeats within the current document,
    * so only the frequency is bumped and the position/offset is appended.
    */
   @Override
   void addTerm(final int termID) {

     assert docState.testPoint("FreqProxTermsWriterPerField.addTerm start");

     FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray;

     assert !hasFreq || postings.termFreqs[termID] > 0;

     if (!hasFreq) {
       assert postings.termFreqs == null;
       if (docState.docID != postings.lastDocIDs[termID]) {
         assert docState.docID > postings.lastDocIDs[termID];
         // Flush the pending code of the previous doc, then stash the delta
         // to the current doc as the new pending code
         termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]);
         postings.lastDocCodes[termID] = docState.docID - postings.lastDocIDs[termID];
         postings.lastDocIDs[termID] = docState.docID;
         fieldState.uniqueTermCount++;
       }
     } else if (docState.docID != postings.lastDocIDs[termID]) {
       assert docState.docID > postings.lastDocIDs[termID]:"id: "+docState.docID + " postings ID: "+ postings.lastDocIDs[termID] + " termID: "+termID;
       // Term not yet seen in the current doc but previously
       // seen in other doc(s) since the last flush

       // Now that we know doc freq for previous doc,
       // write it & lastDocCode
       if (1 == postings.termFreqs[termID]) {
         // freq == 1 is folded into the low bit of the doc code
         termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]|1);
       } else {
         termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]);
         termsHashPerField.writeVInt(0, postings.termFreqs[termID]);
       }
       postings.termFreqs[termID] = 1;
       fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
       // The delta must be computed before lastDocIDs is overwritten below
       postings.lastDocCodes[termID] = (docState.docID - postings.lastDocIDs[termID]) << 1;
       postings.lastDocIDs[termID] = docState.docID;
       if (hasProx) {
         writeProx(termID, fieldState.position);
         if (hasOffsets) {
           // Offsets are delta-coded per document, so restart from 0
           postings.lastOffsets[termID] = 0;
           writeOffsets(termID, fieldState.offset);
         }
       } else {
         assert !hasOffsets;
       }
       fieldState.uniqueTermCount++;
     } else {
       // Term repeats within the current doc
       fieldState.maxTermFrequency = Math.max(fieldState.maxTermFrequency, ++postings.termFreqs[termID]);
       if (hasProx) {
         // Position is written as a delta from the term's last position
         writeProx(termID, fieldState.position-postings.lastPositions[termID]);
       }
       if (hasOffsets) {
         writeOffsets(termID, fieldState.offset);
       }
     }
   }

   /** Creates a postings array sized for {@code size} terms with the arrays this field's flags require. */
   @Override
   ParallelPostingsArray createPostingsArray(int size) {
     return new FreqProxPostingsArray(size, hasFreq, hasProx, hasOffsets);
   }

   /**
    * Parallel per-term arrays, indexed by termID.  {@code termFreqs},
    * {@code lastPositions} and {@code lastOffsets} are only allocated when
    * the corresponding feature is enabled and are {@code null} otherwise.
    */
   static final class FreqProxPostingsArray extends ParallelPostingsArray {
     /**
      * Allocates the arrays for {@code size} terms.  {@code writeOffsets}
      * requires {@code writeProx} (asserted).
      */
     public FreqProxPostingsArray(int size, boolean writeFreqs, boolean writeProx, boolean writeOffsets) {
       super(size);
       if (writeFreqs) {
         termFreqs = new int[size];
       }
       lastDocIDs = new int[size];
       lastDocCodes = new int[size];
       if (writeProx) {
         lastPositions = new int[size];
         if (writeOffsets) {
           lastOffsets = new int[size];
         }
       } else {
         assert !writeOffsets;
       }
       //System.out.println("PA init freqs=" + writeFreqs + " pos=" + writeProx + " offs=" + writeOffsets);
     }

     int termFreqs[];                                   // # times this term occurs in the current doc
     int lastDocIDs[];                                  // Last docID where this term occurred
     int lastDocCodes[];                                // Code for prior doc
     int lastPositions[];                               // Last position where this term occurred
     int lastOffsets[];                                 // Last startOffset where this term occurred (see writeOffsets)

     /** Creates an empty array of the given size with the same optional arrays as this one. */
     @Override
     ParallelPostingsArray newInstance(int size) {
       return new FreqProxPostingsArray(size, termFreqs != null, lastPositions != null, lastOffsets != null);
     }

     /**
      * Copies the first {@code numToCopy} entries of every allocated array
      * into {@code toArray}, which must be a {@code FreqProxPostingsArray}
      * with the same optional arrays allocated.
      */
     @Override
     void copyTo(ParallelPostingsArray toArray, int numToCopy) {
       assert toArray instanceof FreqProxPostingsArray;
       FreqProxPostingsArray to = (FreqProxPostingsArray) toArray;

       super.copyTo(toArray, numToCopy);

       System.arraycopy(lastDocIDs, 0, to.lastDocIDs, 0, numToCopy);
       System.arraycopy(lastDocCodes, 0, to.lastDocCodes, 0, numToCopy);
       if (lastPositions != null) {
         assert to.lastPositions != null;
         System.arraycopy(lastPositions, 0, to.lastPositions, 0, numToCopy);
       }
       if (lastOffsets != null) {
         assert to.lastOffsets != null;
         System.arraycopy(lastOffsets, 0, to.lastOffsets, 0, numToCopy);
       }
       if (termFreqs != null) {
         assert to.termFreqs != null;
         System.arraycopy(termFreqs, 0, to.termFreqs, 0, numToCopy);
       }
     }

     /**
      * Returns the per-term byte cost: the base cost plus one int for each
      * array allocated here (lastDocIDs and lastDocCodes are always present).
      */
     @Override
     int bytesPerPosting() {
       int bytes = ParallelPostingsArray.BYTES_PER_POSTING + 2 * RamUsageEstimator.NUM_BYTES_INT;
       if (lastPositions != null) {
         bytes += RamUsageEstimator.NUM_BYTES_INT;
       }
       if (lastOffsets != null) {
         bytes += RamUsageEstimator.NUM_BYTES_INT;
       }
       if (termFreqs != null) {
         bytes += RamUsageEstimator.NUM_BYTES_INT;
       }

       return bytes;
     }
   }

   /** Intentionally a no-op. */
   public void abort() {}

   // NOTE(review): not read or written anywhere in this class; presumably
   // used by other code in the package -- verify before removing.
   BytesRef payload;

   // Term IDs in sorted order; null until sortPostings() has been called:
   int[] sortedTermIDs;

   /** Sorts the buffered terms and caches the resulting term ID order; must be called at most once while the cache is unset (asserted). */
   void sortPostings() {
     assert sortedTermIDs == null;
     sortedTermIDs = termsHashPerField.sortPostings();
   }
 }
	package org.apache.lucene.index;

	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
	import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
	import org.apache.lucene.index.FieldInfo.IndexOptions;
	import org.apache.lucene.util.BytesRef;
	import org.apache.lucene.util.RamUsageEstimator;

	// TODO: break into separate freq and prox writers as
	// codecs; make separate container (tii/tis/skip/*) that can
	// be configured as any number of files 1..N
	final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implements Comparable<FreqProxTermsWriterPerField> {

	final FreqProxTermsWriter parent;
	final TermsHashPerField termsHashPerField;
	final FieldInfo fieldInfo;
	final DocumentsWriterPerThread.DocState docState;
	final FieldInvertState fieldState;
	boolean hasFreq;
	boolean hasProx;
	boolean hasOffsets;
	PayloadAttribute payloadAttribute;
	OffsetAttribute offsetAttribute;
	long sumTotalTermFreq;
	long sumDocFreq;

	// How many docs have this field:
	int docCount;

	public FreqProxTermsWriterPerField(TermsHashPerField termsHashPerField, FreqProxTermsWriter parent, FieldInfo fieldInfo) {
	this.termsHashPerField = termsHashPerField;
	this.parent = parent;
	this.fieldInfo = fieldInfo;
	docState = termsHashPerField.docState;
	fieldState = termsHashPerField.fieldState;
	setIndexOptions(fieldInfo.getIndexOptions());
	}

	@Override
	int getStreamCount() {
	if (!hasProx) {
	return 1;
	} else {
	return 2;
	}
	}

	@Override
	void finish() {
	sumDocFreq += fieldState.uniqueTermCount;
	sumTotalTermFreq += fieldState.length;
	if (fieldState.length > 0) {
	docCount++;
	}

	if (hasPayloads) {
	fieldInfo.setStorePayloads();
	}
	}

	boolean hasPayloads;

	@Override
	void skippingLongTerm() {}

	@Override
	public int compareTo(FreqProxTermsWriterPerField other) {
	return fieldInfo.name.compareTo(other.fieldInfo.name);
	}

	private void setIndexOptions(IndexOptions indexOptions) {
	if (indexOptions == null) {
	// field could later be updated with indexed=true, so set everything on
	hasFreq = hasProx = hasOffsets = true;
	} else {
	hasFreq = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
	hasProx = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
	hasOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
	}
	}

	@Override
	boolean start(IndexableField[] fields, int count) {
	return true;
	}

	@Override
	void start(IndexableField f) {
	if (fieldState.attributeSource.hasAttribute(PayloadAttribute.class)) {
	payloadAttribute = fieldState.attributeSource.getAttribute(PayloadAttribute.class);
	} else {
	payloadAttribute = null;
	}
	if (hasOffsets) {
	offsetAttribute = fieldState.attributeSource.addAttribute(OffsetAttribute.class);
	} else {
	offsetAttribute = null;
	}
	}

	void writeProx(final int termID, int proxCode) {
	//System.out.println("writeProx termID=" + termID + " proxCode=" + proxCode);
	assert hasProx;
	final BytesRef payload;
	if (payloadAttribute == null) {
	payload = null;
	} else {
	payload = payloadAttribute.getPayload();
	}

	if (payload != null && payload.length > 0) {
	termsHashPerField.writeVInt(1, (proxCode<<1)\|1);
	termsHashPerField.writeVInt(1, payload.length);
	termsHashPerField.writeBytes(1, payload.bytes, payload.offset, payload.length);
	hasPayloads = true;
	} else {
	termsHashPerField.writeVInt(1, proxCode<<1);
	}

	FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray;
	postings.lastPositions[termID] = fieldState.position;
	}

	void writeOffsets(final int termID, int offsetAccum) {
	assert hasOffsets;
	final int startOffset = offsetAccum + offsetAttribute.startOffset();
	final int endOffset = offsetAccum + offsetAttribute.endOffset();
	//System.out.println("writeOffsets termID=" + termID + " prevOffset=" + prevOffset + " startOff=" + startOffset + " endOff=" + endOffset);
	FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray;
	assert startOffset - postings.lastOffsets[termID] >= 0;
	termsHashPerField.writeVInt(1, startOffset - postings.lastOffsets[termID]);
	termsHashPerField.writeVInt(1, endOffset - startOffset);

	postings.lastOffsets[termID] = startOffset;
	}

	@Override
	void newTerm(final int termID) {
	// First time we're seeing this term since the last
	// flush
	assert docState.testPoint("FreqProxTermsWriterPerField.newTerm start");

	FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray;
	postings.lastDocIDs[termID] = docState.docID;
	if (!hasFreq) {
	postings.lastDocCodes[termID] = docState.docID;
	} else {
	postings.lastDocCodes[termID] = docState.docID << 1;
	postings.termFreqs[termID] = 1;
	if (hasProx) {
	writeProx(termID, fieldState.position);
	if (hasOffsets) {
	writeOffsets(termID, fieldState.offset);
	}
	} else {
	assert !hasOffsets;
	}
	}
	fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
	fieldState.uniqueTermCount++;
	}

	@Override
	void addTerm(final int termID) {

	assert docState.testPoint("FreqProxTermsWriterPerField.addTerm start");

	FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray;

	assert !hasFreq \|\| postings.termFreqs[termID] > 0;

	if (!hasFreq) {
	assert postings.termFreqs == null;
	if (docState.docID != postings.lastDocIDs[termID]) {
	assert docState.docID > postings.lastDocIDs[termID];
	termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]);
	postings.lastDocCodes[termID] = docState.docID - postings.lastDocIDs[termID];
	postings.lastDocIDs[termID] = docState.docID;
	fieldState.uniqueTermCount++;
	}
	} else if (docState.docID != postings.lastDocIDs[termID]) {
	assert docState.docID > postings.lastDocIDs[termID]:"id: "+docState.docID + " postings ID: "+ postings.lastDocIDs[termID] + " termID: "+termID;
	// Term not yet seen in the current doc but previously
	// seen in other doc(s) since the last flush

	// Now that we know doc freq for previous doc,
	// write it & lastDocCode
	if (1 == postings.termFreqs[termID]) {
	termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]\|1);
	} else {
	termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]);
	termsHashPerField.writeVInt(0, postings.termFreqs[termID]);
	}
	postings.termFreqs[termID] = 1;
	fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
	postings.lastDocCodes[termID] = (docState.docID - postings.lastDocIDs[termID]) << 1;
	postings.lastDocIDs[termID] = docState.docID;
	if (hasProx) {
	writeProx(termID, fieldState.position);
	if (hasOffsets) {
	postings.lastOffsets[termID] = 0;
	writeOffsets(termID, fieldState.offset);
	}
	} else {
	assert !hasOffsets;
	}
	fieldState.uniqueTermCount++;
	} else {
	fieldState.maxTermFrequency = Math.max(fieldState.maxTermFrequency, ++postings.termFreqs[termID]);
	if (hasProx) {
	writeProx(termID, fieldState.position-postings.lastPositions[termID]);
	}
	if (hasOffsets) {
	writeOffsets(termID, fieldState.offset);
	}
	}
	}

	@Override
	ParallelPostingsArray createPostingsArray(int size) {
	return new FreqProxPostingsArray(size, hasFreq, hasProx, hasOffsets);
	}

	static final class FreqProxPostingsArray extends ParallelPostingsArray {
	public FreqProxPostingsArray(int size, boolean writeFreqs, boolean writeProx, boolean writeOffsets) {
	super(size);
	if (writeFreqs) {
	termFreqs = new int[size];
	}
	lastDocIDs = new int[size];
	lastDocCodes = new int[size];
	if (writeProx) {
	lastPositions = new int[size];
	if (writeOffsets) {
	lastOffsets = new int[size];
	}
	} else {
	assert !writeOffsets;
	}
	//System.out.println("PA init freqs=" + writeFreqs + " pos=" + writeProx + " offs=" + writeOffsets);
	}

	int termFreqs[]; // # times this term occurs in the current doc
	int lastDocIDs[]; // Last docID where this term occurred
	int lastDocCodes[]; // Code for prior doc
	int lastPositions[]; // Last position where this term occurred
	int lastOffsets[]; // Last endOffset where this term occurred

	@Override
	ParallelPostingsArray newInstance(int size) {
	return new FreqProxPostingsArray(size, termFreqs != null, lastPositions != null, lastOffsets != null);
	}

	@Override
	void copyTo(ParallelPostingsArray toArray, int numToCopy) {
	assert toArray instanceof FreqProxPostingsArray;
	FreqProxPostingsArray to = (FreqProxPostingsArray) toArray;

	super.copyTo(toArray, numToCopy);

	System.arraycopy(lastDocIDs, 0, to.lastDocIDs, 0, numToCopy);
	System.arraycopy(lastDocCodes, 0, to.lastDocCodes, 0, numToCopy);
	if (lastPositions != null) {
	assert to.lastPositions != null;
	System.arraycopy(lastPositions, 0, to.lastPositions, 0, numToCopy);
	}
	if (lastOffsets != null) {
	assert to.lastOffsets != null;
	System.arraycopy(lastOffsets, 0, to.lastOffsets, 0, numToCopy);
	}
	if (termFreqs != null) {
	assert to.termFreqs != null;
	System.arraycopy(termFreqs, 0, to.termFreqs, 0, numToCopy);
	}
	}

	@Override
	int bytesPerPosting() {
	int bytes = ParallelPostingsArray.BYTES_PER_POSTING + 2 * RamUsageEstimator.NUM_BYTES_INT;
	if (lastPositions != null) {
	bytes += RamUsageEstimator.NUM_BYTES_INT;
	}
	if (lastOffsets != null) {
	bytes += RamUsageEstimator.NUM_BYTES_INT;
	}
	if (termFreqs != null) {
	bytes += RamUsageEstimator.NUM_BYTES_INT;
	}

	return bytes;
	}
	}

	public void abort() {}

	BytesRef payload;

	int[] sortedTermIDs;

	void sortPostings() {
	assert sortedTermIDs == null;
	sortedTermIDs = termsHashPerField.sortPostings();
	}
	}