blob: efb74c3562d071990e11eec21315115d56d6e7e8 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.uniformsplit;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.StringHelper;
/**
* Term of a block line.
* <p>
* Contains the term bytes and the minimal distinguishing prefix (MDP) length
* of this term.
* <p>
* The MDP is the minimal prefix that distinguishes a term from its immediate
* previous term (terms are alphabetically sorted).
* <p>
* The incremental encoding suffix is the suffix starting at the last byte of
* the MDP (inclusive).
* <p>
* Example:
* For the block
* <pre>
* client
* color
* company
* companies
* </pre>
* "color" - MDP is "co" - incremental encoding suffix is "olor".
* <br>
* "company" - MDP is "com" - incremental encoding suffix is "mpany".
* <br>
* "companies" - MDP is "compani" - incremental encoding suffix is "ies".
*
* @lucene.experimental
*/
public class TermBytes implements Accountable {
private static final long BASE_RAM_USAGE = RamUsageEstimator.shallowSizeOfInstance(TermBytes.class);
protected int mdpLength;
protected BytesRef term;
public TermBytes(int mdpLength, BytesRef term) {
reset(mdpLength, term);
}
public TermBytes reset(int mdpLength, BytesRef term) {
assert term.length > 0 && mdpLength > 0 || term.length == 0 && mdpLength == 0 : "Inconsistent mdpLength=" + mdpLength + ", term.length=" + term.length;
assert term.length == 0 || mdpLength <= term.length : "Too large mdpLength=" + mdpLength + ", term.length=" + term.length;
assert term.offset == 0;
this.mdpLength = mdpLength;
this.term = term;
return this;
}
/**
* @return This term MDP length.
* @see TermBytes
*/
public int getMdpLength() {
return mdpLength;
}
/**
* @return This term bytes.
*/
public BytesRef getTerm() {
return term;
}
/**
* @return The offset of this term incremental encoding suffix.
* @see TermBytes
*/
public int getSuffixOffset() {
return Math.max(mdpLength - 1, 0);
}
/**
* @return The length of this term incremental encoding suffix.
* @see TermBytes
*/
public int getSuffixLength() {
return term.length - getSuffixOffset();
}
/**
* Computes the length of the minimal distinguishing prefix (MDP) between
* a current term and its previous term (terms are alphabetically sorted).
* <p>
* Example: If previous="car" and current="cartridge", then MDP length is
* 4. It is the length of the minimal prefix distinguishing "cartridge" from
* "car", that is, the length of "cart".
*
* @see TermBytes
*/
public static int computeMdpLength(BytesRef previousTerm, BytesRef currentTerm) {
int mdpLength = previousTerm == null ? 1 : StringHelper.sortKeyLength(previousTerm, currentTerm);
return Math.min(mdpLength, currentTerm.length);
}
@Override
public long ramBytesUsed() {
return BASE_RAM_USAGE + RamUsageUtil.ramBytesUsed(term);
}
}