lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermOutputs.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.codecs.memory;


 import java.io.IOException;
 import java.util.Arrays;

 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.util.Accountable;
 import org.apache.lucene.util.RamUsageEstimator;
 import org.apache.lucene.util.fst.Outputs;

 /**
  * An FST {@link Outputs} implementation for
  * {@link FSTTermsWriter}.
  *
  * @lucene.experimental
  */

 // NOTE: outputs should be per-field, since
 // longsSize is fixed for each field
 class FSTTermOutputs extends Outputs<FSTTermOutputs.TermData> {
   private final static TermData NO_OUTPUT = new TermData();
   //private static boolean TEST = false;
   private final boolean hasPos;

   /**
    * Represents the metadata for one term.
    * On an FST, only long[] part is 'shared' and pushed towards root.
    * byte[] and term stats will be kept on deeper arcs.
    */
   static class TermData implements Accountable {
     private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(TermData.class);
     byte[] bytes;
     int docFreq;
     long totalTermFreq;
     TermData() {
       this.bytes = null;
       this.docFreq = 0;
       this.totalTermFreq = -1;
     }
     TermData(byte[] bytes, int docFreq, long totalTermFreq) {
       this.bytes = bytes;
       this.docFreq = docFreq;
       this.totalTermFreq = totalTermFreq;
     }

     @Override
     public long ramBytesUsed() {
       long ramBytesUsed = BASE_RAM_BYTES_USED;
       if (bytes != null) {
         ramBytesUsed += RamUsageEstimator.sizeOf(bytes);
       }
       return ramBytesUsed;
     }

     // NOTE: actually, FST nodes are seldom
     // identical when outputs on their arcs
     // aren't NO_OUTPUTs.
     @Override
     public int hashCode() {
       int hash = 0;
       if (bytes != null) {
         final int end = bytes.length;
         for (int i = 0; i < end; i++) {
           hash += bytes[i];
         }
       }
       hash += docFreq + totalTermFreq;
       return hash;
     }

     @Override
     public String toString() {
       return "FSTTermOutputs$TermData bytes=" + Arrays.toString(bytes) + " docFreq=" + docFreq + " totalTermFreq=" + totalTermFreq;
     }

     @Override
     public boolean equals(Object other_) {
       if (other_ == this) {
         return true;
       } else if (!(other_ instanceof FSTTermOutputs.TermData)) {
         return false;
       }
       TermData other = (TermData) other_;
       return statsEqual(this, other) &&
              bytesEqual(this, other);

     }
   }

   protected FSTTermOutputs(FieldInfo fieldInfo) {
     this.hasPos = fieldInfo.getIndexOptions() != IndexOptions.DOCS;
   }

   @Override
   public long ramBytesUsed(TermData output) {
     return output.ramBytesUsed();
   }

   @Override
   //
   // The return value will be the smaller one, when these two are
   // 'comparable', i.e.
   // 1. every value in t1 is not larger than in t2, or
   // 2. every value in t1 is not smaller than t2.
   //
   public TermData common(TermData t1, TermData t2) {
     //if (TEST) System.out.print("common("+t1+", "+t2+") = ");
     if (t1 == NO_OUTPUT || t2 == NO_OUTPUT) {
       //if (TEST) System.out.println("ret:"+NO_OUTPUT);
       return NO_OUTPUT;
     }

     TermData ret;

     if (statsEqual(t1, t2) && bytesEqual(t1, t2)) {
       ret = t1;
     } else {
       ret = NO_OUTPUT;
     }
     //if (TEST) System.out.println("ret:"+ret);
     return ret;
   }

   @Override
   public TermData subtract(TermData t1, TermData t2) {
     //if (TEST) System.out.print("subtract("+t1+", "+t2+") = ");
     if (t2 == NO_OUTPUT) {
       //if (TEST) System.out.println("ret:"+t1);
       return t1;
     }

     TermData ret;
     if (statsEqual(t1, t2) && bytesEqual(t1, t2)) {
       ret = NO_OUTPUT;
     } else {
       ret = new TermData(t1.bytes, t1.docFreq, t1.totalTermFreq);
     }
     //if (TEST) System.out.println("ret:"+ret);
     return ret;
   }

   // TODO: if we refactor a 'addSelf(TermData other)',
   // we can gain about 5~7% for fuzzy queries, however this also
   // means we are putting too much stress on FST Outputs decoding?
   @Override
   public TermData add(TermData t1, TermData t2) {
     //if (TEST) System.out.print("add("+t1+", "+t2+") = ");
     if (t1 == NO_OUTPUT) {
       //if (TEST) System.out.println("ret:"+t2);
       return t2;
     } else if (t2 == NO_OUTPUT) {
       //if (TEST) System.out.println("ret:"+t1);
       return t1;
     }

     TermData ret;
     if (t2.bytes != null || t2.docFreq > 0) {
       ret = new TermData(t2.bytes, t2.docFreq, t2.totalTermFreq);
     } else {
       ret = new TermData(t1.bytes, t1.docFreq, t1.totalTermFreq);
     }
     //if (TEST) System.out.println("ret:"+ret);
     return ret;
   }

   @Override
   public void write(TermData data, DataOutput out) throws IOException {
     assert hasPos || data.totalTermFreq == -1;
     int bit0 = ((data.bytes == null || data.bytes.length == 0) ? 0 : 1);
     int bit1 = ((data.docFreq == 0)  ? 0 : 1) << 1;
     int bits = bit0 | bit1;
     if (bit0 > 0) {  // determine extra length
       if (data.bytes.length < 32) {
         bits |= (data.bytes.length << 2);
         out.writeByte((byte)bits);
       } else {
         out.writeByte((byte)bits);
         out.writeVInt(data.bytes.length);
       }
     } else {
       out.writeByte((byte)bits);
     }
     if (bit0 > 0) {  // bytes exists
       out.writeBytes(data.bytes, 0, data.bytes.length);
     }
     if (bit1 > 0) {  // stats exist
       if (hasPos) {
         if (data.docFreq == data.totalTermFreq) {
           out.writeVInt((data.docFreq << 1) | 1);
         } else {
           out.writeVInt((data.docFreq << 1));
           out.writeVLong(data.totalTermFreq - data.docFreq);
         }
       } else {
         out.writeVInt(data.docFreq);
       }
     }
   }

   @Override
   public TermData read(DataInput in) throws IOException {
     byte[] bytes = null;
     int docFreq = 0;
     long totalTermFreq = -1;
     int bits = in.readByte() & 0xff;
     int bit0 = bits & 1;
     int bit1 = bits & 2;
     int bytesSize = (bits >>> 2);
     if (bit0 > 0 && bytesSize == 0) {  // determine extra length
       bytesSize = in.readVInt();
     }
     if (bit0 > 0) {  // bytes exists
       bytes = new byte[bytesSize];
       in.readBytes(bytes, 0, bytesSize);
     }
     if (bit1 > 0) {  // stats exist
       int code = in.readVInt();
       if (hasPos) {
         totalTermFreq = docFreq = code >>> 1;
         if ((code & 1) == 0) {
           totalTermFreq += in.readVLong();
         }
       } else {
         docFreq = code;
       }
     }
     return new TermData(bytes, docFreq, totalTermFreq);
   }


   @Override
   public void skipOutput(DataInput in) throws IOException {
     int bits = in.readByte() & 0xff;
     int bit0 = bits & 1;
     int bit1 = bits & 2;
     int bytesSize = (bits >>> 2);
     if (bit0 > 0 && bytesSize == 0) {  // determine extra length
       bytesSize = in.readVInt();
     }
     if (bit0 > 0) {  // bytes exists
       in.skipBytes(bytesSize);
     }
     if (bit1 > 0) {  // stats exist
       int code = in.readVInt();
       if (hasPos && (code & 1) == 0) {
         in.readVLong();
       }
     }
   }

   @Override
   public TermData getNoOutput() {
     return NO_OUTPUT;
   }

   @Override
   public String outputToString(TermData data) {
     return data.toString();
   }

   static boolean statsEqual(final TermData t1, final TermData t2) {
     return t1.docFreq == t2.docFreq && t1.totalTermFreq == t2.totalTermFreq;
   }
   static boolean bytesEqual(final TermData t1, final TermData t2) {
     if (t1.bytes == null && t2.bytes == null) {
       return true;
     }
     return t1.bytes != null && t2.bytes != null && Arrays.equals(t1.bytes, t2.bytes);
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.codecs.memory;


	import java.io.IOException;
	import java.util.Arrays;

	import org.apache.lucene.index.FieldInfo;
	import org.apache.lucene.index.IndexOptions;
	import org.apache.lucene.store.DataInput;
	import org.apache.lucene.store.DataOutput;
	import org.apache.lucene.util.Accountable;
	import org.apache.lucene.util.RamUsageEstimator;
	import org.apache.lucene.util.fst.Outputs;

	/**
	* An FST {@link Outputs} implementation for
	* {@link FSTTermsWriter}.
	*
	* @lucene.experimental
	*/

	// NOTE: outputs should be per-field, since
	// longsSize is fixed for each field
	class FSTTermOutputs extends Outputs<FSTTermOutputs.TermData> {
	private final static TermData NO_OUTPUT = new TermData();
	//private static boolean TEST = false;
	private final boolean hasPos;

	/**
	* Represents the metadata for one term.
	* On an FST, only long[] part is 'shared' and pushed towards root.
	* byte[] and term stats will be kept on deeper arcs.
	*/
	static class TermData implements Accountable {
	private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(TermData.class);
	byte[] bytes;
	int docFreq;
	long totalTermFreq;
	TermData() {
	this.bytes = null;
	this.docFreq = 0;
	this.totalTermFreq = -1;
	}
	TermData(byte[] bytes, int docFreq, long totalTermFreq) {
	this.bytes = bytes;
	this.docFreq = docFreq;
	this.totalTermFreq = totalTermFreq;
	}

	@Override
	public long ramBytesUsed() {
	long ramBytesUsed = BASE_RAM_BYTES_USED;
	if (bytes != null) {
	ramBytesUsed += RamUsageEstimator.sizeOf(bytes);
	}
	return ramBytesUsed;
	}

	// NOTE: actually, FST nodes are seldom
	// identical when outputs on their arcs
	// aren't NO_OUTPUTs.
	@Override
	public int hashCode() {
	int hash = 0;
	if (bytes != null) {
	final int end = bytes.length;
	for (int i = 0; i < end; i++) {
	hash += bytes[i];
	}
	}
	hash += docFreq + totalTermFreq;
	return hash;
	}

	@Override
	public String toString() {
	return "FSTTermOutputs$TermData bytes=" + Arrays.toString(bytes) + " docFreq=" + docFreq + " totalTermFreq=" + totalTermFreq;
	}

	@Override
	public boolean equals(Object other_) {
	if (other_ == this) {
	return true;
	} else if (!(other_ instanceof FSTTermOutputs.TermData)) {
	return false;
	}
	TermData other = (TermData) other_;
	return statsEqual(this, other) &&
	bytesEqual(this, other);

	}
	}

	protected FSTTermOutputs(FieldInfo fieldInfo) {
	this.hasPos = fieldInfo.getIndexOptions() != IndexOptions.DOCS;
	}

	@Override
	public long ramBytesUsed(TermData output) {
	return output.ramBytesUsed();
	}

	@Override
	//
	// The return value will be the smaller one, when these two are
	// 'comparable', i.e.
	// 1. every value in t1 is not larger than in t2, or
	// 2. every value in t1 is not smaller than t2.
	//
	public TermData common(TermData t1, TermData t2) {
	//if (TEST) System.out.print("common("+t1+", "+t2+") = ");
	if (t1 == NO_OUTPUT \|\| t2 == NO_OUTPUT) {
	//if (TEST) System.out.println("ret:"+NO_OUTPUT);
	return NO_OUTPUT;
	}

	TermData ret;

	if (statsEqual(t1, t2) && bytesEqual(t1, t2)) {
	ret = t1;
	} else {
	ret = NO_OUTPUT;
	}
	//if (TEST) System.out.println("ret:"+ret);
	return ret;
	}

	@Override
	public TermData subtract(TermData t1, TermData t2) {
	//if (TEST) System.out.print("subtract("+t1+", "+t2+") = ");
	if (t2 == NO_OUTPUT) {
	//if (TEST) System.out.println("ret:"+t1);
	return t1;
	}

	TermData ret;
	if (statsEqual(t1, t2) && bytesEqual(t1, t2)) {
	ret = NO_OUTPUT;
	} else {
	ret = new TermData(t1.bytes, t1.docFreq, t1.totalTermFreq);
	}
	//if (TEST) System.out.println("ret:"+ret);
	return ret;
	}

	// TODO: if we refactor a 'addSelf(TermData other)',
	// we can gain about 5~7% for fuzzy queries, however this also
	// means we are putting too much stress on FST Outputs decoding?
	@Override
	public TermData add(TermData t1, TermData t2) {
	//if (TEST) System.out.print("add("+t1+", "+t2+") = ");
	if (t1 == NO_OUTPUT) {
	//if (TEST) System.out.println("ret:"+t2);
	return t2;
	} else if (t2 == NO_OUTPUT) {
	//if (TEST) System.out.println("ret:"+t1);
	return t1;
	}

	TermData ret;
	if (t2.bytes != null \|\| t2.docFreq > 0) {
	ret = new TermData(t2.bytes, t2.docFreq, t2.totalTermFreq);
	} else {
	ret = new TermData(t1.bytes, t1.docFreq, t1.totalTermFreq);
	}
	//if (TEST) System.out.println("ret:"+ret);
	return ret;
	}

	@Override
	public void write(TermData data, DataOutput out) throws IOException {
	assert hasPos \|\| data.totalTermFreq == -1;
	int bit0 = ((data.bytes == null \|\| data.bytes.length == 0) ? 0 : 1);
	int bit1 = ((data.docFreq == 0) ? 0 : 1) << 1;
	int bits = bit0 \| bit1;
	if (bit0 > 0) { // determine extra length
	if (data.bytes.length < 32) {
	bits \|= (data.bytes.length << 2);
	out.writeByte((byte)bits);
	} else {
	out.writeByte((byte)bits);
	out.writeVInt(data.bytes.length);
	}
	} else {
	out.writeByte((byte)bits);
	}
	if (bit0 > 0) { // bytes exists
	out.writeBytes(data.bytes, 0, data.bytes.length);
	}
	if (bit1 > 0) { // stats exist
	if (hasPos) {
	if (data.docFreq == data.totalTermFreq) {
	out.writeVInt((data.docFreq << 1) \| 1);
	} else {
	out.writeVInt((data.docFreq << 1));
	out.writeVLong(data.totalTermFreq - data.docFreq);
	}
	} else {
	out.writeVInt(data.docFreq);
	}
	}
	}

	@Override
	public TermData read(DataInput in) throws IOException {
	byte[] bytes = null;
	int docFreq = 0;
	long totalTermFreq = -1;
	int bits = in.readByte() & 0xff;
	int bit0 = bits & 1;
	int bit1 = bits & 2;
	int bytesSize = (bits >>> 2);
	if (bit0 > 0 && bytesSize == 0) { // determine extra length
	bytesSize = in.readVInt();
	}
	if (bit0 > 0) { // bytes exists
	bytes = new byte[bytesSize];
	in.readBytes(bytes, 0, bytesSize);
	}
	if (bit1 > 0) { // stats exist
	int code = in.readVInt();
	if (hasPos) {
	totalTermFreq = docFreq = code >>> 1;
	if ((code & 1) == 0) {
	totalTermFreq += in.readVLong();
	}
	} else {
	docFreq = code;
	}
	}
	return new TermData(bytes, docFreq, totalTermFreq);
	}


	@Override
	public void skipOutput(DataInput in) throws IOException {
	int bits = in.readByte() & 0xff;
	int bit0 = bits & 1;
	int bit1 = bits & 2;
	int bytesSize = (bits >>> 2);
	if (bit0 > 0 && bytesSize == 0) { // determine extra length
	bytesSize = in.readVInt();
	}
	if (bit0 > 0) { // bytes exists
	in.skipBytes(bytesSize);
	}
	if (bit1 > 0) { // stats exist
	int code = in.readVInt();
	if (hasPos && (code & 1) == 0) {
	in.readVLong();
	}
	}
	}

	@Override
	public TermData getNoOutput() {
	return NO_OUTPUT;
	}

	@Override
	public String outputToString(TermData data) {
	return data.toString();
	}

	static boolean statsEqual(final TermData t1, final TermData t2) {
	return t1.docFreq == t2.docFreq && t1.totalTermFreq == t2.totalTermFreq;
	}
	static boolean bytesEqual(final TermData t1, final TermData t2) {
	if (t1.bytes == null && t2.bytes == null) {
	return true;
	}
	return t1.bytes != null && t2.bytes != null && Arrays.equals(t1.bytes, t2.bytes);
	}
	}