| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.solr.legacy; |
| |
| |
| import java.io.IOException; |
| |
| import org.apache.lucene.index.FilterLeafReader; |
| import org.apache.lucene.index.FilteredTermsEnum; |
| import org.apache.lucene.index.Terms; |
| import org.apache.lucene.index.TermsEnum; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.BytesRefBuilder; |
| |
| /** |
| * This is a helper class to generate prefix-encoded representations for numerical values |
| * and supplies converters to represent float/double values as sortable integers/longs. |
| * |
| * <p>To quickly execute range queries in Apache Lucene, a range is divided recursively |
| * into multiple intervals for searching: The center of the range is searched only with |
| * the lowest possible precision in the trie, while the boundaries are matched |
| * more exactly. This reduces the number of terms dramatically. |
| * |
| * <p>This class generates terms to achieve this: First the numerical integer values need to |
| * be converted to bytes. For that integer values (32 bit or 64 bit) are made unsigned |
| * and the bits are converted to ASCII chars with each 7 bit. The resulting byte[] is |
| * sortable like the original integer value (even using UTF-8 sort order). Each value is also |
| * prefixed (in the first char) by the <code>shift</code> value (number of bits removed) used |
| * during encoding. |
| * |
| * <p>For easy usage, the trie algorithm is implemented for indexing inside |
| * {@link org.apache.solr.legacy.LegacyNumericTokenStream} that can index <code>int</code>, <code>long</code>, |
| * <code>float</code>, and <code>double</code>. For querying, |
| * {@link org.apache.solr.legacy.LegacyNumericRangeQuery} implements the query part |
| * for the same data types. |
| * |
| * @lucene.internal |
| * |
| * @deprecated Please use {@link org.apache.lucene.index.PointValues} instead. |
| * |
| * @since 2.9, API changed non backwards-compliant in 4.0 |
| */ |
| |
| @Deprecated |
| public final class LegacyNumericUtils { |
| |
| private LegacyNumericUtils() {} // no instance! |
| |
| /** |
| * The default precision step used by {@link org.apache.solr.legacy.LegacyLongField}, |
| * {@link org.apache.solr.legacy.LegacyDoubleField}, {@link org.apache.solr.legacy.LegacyNumericTokenStream}, {@link |
| * org.apache.solr.legacy.LegacyNumericRangeQuery}. |
| */ |
| public static final int PRECISION_STEP_DEFAULT = 16; |
| |
| /** |
| * The default precision step used by {@link org.apache.solr.legacy.LegacyIntField} and |
| * {@link org.apache.solr.legacy.LegacyFloatField}. |
| */ |
| public static final int PRECISION_STEP_DEFAULT_32 = 8; |
| |
| /** |
| * Longs are stored at lower precision by shifting off lower bits. The shift count is |
| * stored as <code>SHIFT_START_LONG+shift</code> in the first byte |
| */ |
| public static final byte SHIFT_START_LONG = 0x20; |
| |
| /** |
| * The maximum term length (used for <code>byte[]</code> buffer size) |
| * for encoding <code>long</code> values. |
| * @see #longToPrefixCoded |
| */ |
| public static final int BUF_SIZE_LONG = 63/7 + 2; |
| |
| /** |
| * Integers are stored at lower precision by shifting off lower bits. The shift count is |
| * stored as <code>SHIFT_START_INT+shift</code> in the first byte |
| */ |
| public static final byte SHIFT_START_INT = 0x60; |
| |
| /** |
| * The maximum term length (used for <code>byte[]</code> buffer size) |
| * for encoding <code>int</code> values. |
| * @see #intToPrefixCoded |
| */ |
| public static final int BUF_SIZE_INT = 31/7 + 2; |
| |
| /** |
| * Returns prefix coded bits after reducing the precision by <code>shift</code> bits. |
| * This is method is used by {@link org.apache.solr.legacy.LegacyNumericTokenStream}. |
| * After encoding, {@code bytes.offset} will always be 0. |
| * @param val the numeric value |
| * @param shift how many bits to strip from the right |
| * @param bytes will contain the encoded value |
| */ |
| public static void longToPrefixCoded(final long val, final int shift, final BytesRefBuilder bytes) { |
| // ensure shift is 0..63 |
| if ((shift & ~0x3f) != 0) { |
| throw new IllegalArgumentException("Illegal shift value, must be 0..63; got shift=" + shift); |
| } |
| int nChars = (((63-shift)*37)>>8) + 1; // i/7 is the same as (i*37)>>8 for i in 0..63 |
| bytes.setLength(nChars+1); // one extra for the byte that contains the shift info |
| bytes.grow(BUF_SIZE_LONG); |
| bytes.setByteAt(0, (byte)(SHIFT_START_LONG + shift)); |
| long sortableBits = val ^ 0x8000000000000000L; |
| sortableBits >>>= shift; |
| while (nChars > 0) { |
| // Store 7 bits per byte for compatibility |
| // with UTF-8 encoding of terms |
| bytes.setByteAt(nChars--, (byte)(sortableBits & 0x7f)); |
| sortableBits >>>= 7; |
| } |
| } |
| |
| /** |
| * Returns prefix coded bits after reducing the precision by <code>shift</code> bits. |
| * This is method is used by {@link org.apache.solr.legacy.LegacyNumericTokenStream}. |
| * After encoding, {@code bytes.offset} will always be 0. |
| * @param val the numeric value |
| * @param shift how many bits to strip from the right |
| * @param bytes will contain the encoded value |
| */ |
| public static void intToPrefixCoded(final int val, final int shift, final BytesRefBuilder bytes) { |
| // ensure shift is 0..31 |
| if ((shift & ~0x1f) != 0) { |
| throw new IllegalArgumentException("Illegal shift value, must be 0..31; got shift=" + shift); |
| } |
| int nChars = (((31-shift)*37)>>8) + 1; // i/7 is the same as (i*37)>>8 for i in 0..63 |
| bytes.setLength(nChars+1); // one extra for the byte that contains the shift info |
| bytes.grow(LegacyNumericUtils.BUF_SIZE_LONG); // use the max |
| bytes.setByteAt(0, (byte)(SHIFT_START_INT + shift)); |
| int sortableBits = val ^ 0x80000000; |
| sortableBits >>>= shift; |
| while (nChars > 0) { |
| // Store 7 bits per byte for compatibility |
| // with UTF-8 encoding of terms |
| bytes.setByteAt(nChars--, (byte)(sortableBits & 0x7f)); |
| sortableBits >>>= 7; |
| } |
| } |
| |
| |
| /** |
| * Returns the shift value from a prefix encoded {@code long}. |
| * @throws NumberFormatException if the supplied {@link BytesRef} is |
| * not correctly prefix encoded. |
| */ |
| public static int getPrefixCodedLongShift(final BytesRef val) { |
| final int shift = val.bytes[val.offset] - SHIFT_START_LONG; |
| if (shift > 63 || shift < 0) |
| throw new NumberFormatException("Invalid shift value (" + shift + ") in prefixCoded bytes (is encoded value really an INT?)"); |
| return shift; |
| } |
| |
| /** |
| * Returns the shift value from a prefix encoded {@code int}. |
| * @throws NumberFormatException if the supplied {@link BytesRef} is |
| * not correctly prefix encoded. |
| */ |
| public static int getPrefixCodedIntShift(final BytesRef val) { |
| final int shift = val.bytes[val.offset] - SHIFT_START_INT; |
| if (shift > 31 || shift < 0) |
| throw new NumberFormatException("Invalid shift value in prefixCoded bytes (is encoded value really an INT?)"); |
| return shift; |
| } |
| |
| /** |
| * Returns a long from prefixCoded bytes. |
| * Rightmost bits will be zero for lower precision codes. |
| * This method can be used to decode a term's value. |
| * @throws NumberFormatException if the supplied {@link BytesRef} is |
| * not correctly prefix encoded. |
| * @see #longToPrefixCoded |
| */ |
| public static long prefixCodedToLong(final BytesRef val) { |
| long sortableBits = 0L; |
| for (int i=val.offset+1, limit=val.offset+val.length; i<limit; i++) { |
| sortableBits <<= 7; |
| final byte b = val.bytes[i]; |
| if (b < 0) { |
| throw new NumberFormatException( |
| "Invalid prefixCoded numerical value representation (byte "+ |
| Integer.toHexString(b&0xff)+" at position "+(i-val.offset)+" is invalid)" |
| ); |
| } |
| sortableBits |= b; |
| } |
| return (sortableBits << getPrefixCodedLongShift(val)) ^ 0x8000000000000000L; |
| } |
| |
| /** |
| * Returns an int from prefixCoded bytes. |
| * Rightmost bits will be zero for lower precision codes. |
| * This method can be used to decode a term's value. |
| * @throws NumberFormatException if the supplied {@link BytesRef} is |
| * not correctly prefix encoded. |
| * @see #intToPrefixCoded |
| */ |
| public static int prefixCodedToInt(final BytesRef val) { |
| int sortableBits = 0; |
| for (int i=val.offset+1, limit=val.offset+val.length; i<limit; i++) { |
| sortableBits <<= 7; |
| final byte b = val.bytes[i]; |
| if (b < 0) { |
| throw new NumberFormatException( |
| "Invalid prefixCoded numerical value representation (byte "+ |
| Integer.toHexString(b&0xff)+" at position "+(i-val.offset)+" is invalid)" |
| ); |
| } |
| sortableBits |= b; |
| } |
| return (sortableBits << getPrefixCodedIntShift(val)) ^ 0x80000000; |
| } |
| |
| /** |
| * Splits a long range recursively. |
| * You may implement a builder that adds clauses to a |
| * {@link org.apache.lucene.search.BooleanQuery} for each call to its |
| * {@link LongRangeBuilder#addRange(BytesRef,BytesRef)} |
| * method. |
| * <p>This method is used by {@link org.apache.solr.legacy.LegacyNumericRangeQuery}. |
| */ |
| public static void splitLongRange(final LongRangeBuilder builder, |
| final int precisionStep, final long minBound, final long maxBound |
| ) { |
| splitRange(builder, 64, precisionStep, minBound, maxBound); |
| } |
| |
| /** |
| * Splits an int range recursively. |
| * You may implement a builder that adds clauses to a |
| * {@link org.apache.lucene.search.BooleanQuery} for each call to its |
| * {@link IntRangeBuilder#addRange(BytesRef,BytesRef)} |
| * method. |
| * <p>This method is used by {@link org.apache.solr.legacy.LegacyNumericRangeQuery}. |
| */ |
| public static void splitIntRange(final IntRangeBuilder builder, |
| final int precisionStep, final int minBound, final int maxBound |
| ) { |
| splitRange(builder, 32, precisionStep, minBound, maxBound); |
| } |
| |
| /** This helper does the splitting for both 32 and 64 bit. */ |
| private static void splitRange( |
| final Object builder, final int valSize, |
| final int precisionStep, long minBound, long maxBound |
| ) { |
| if (precisionStep < 1) |
| throw new IllegalArgumentException("precisionStep must be >=1"); |
| if (minBound > maxBound) return; |
| for (int shift=0; ; shift += precisionStep) { |
| // calculate new bounds for inner precision |
| final long diff = 1L << (shift+precisionStep), |
| mask = ((1L<<precisionStep) - 1L) << shift; |
| final boolean |
| hasLower = (minBound & mask) != 0L, |
| hasUpper = (maxBound & mask) != mask; |
| final long |
| nextMinBound = (hasLower ? (minBound + diff) : minBound) & ~mask, |
| nextMaxBound = (hasUpper ? (maxBound - diff) : maxBound) & ~mask; |
| final boolean |
| lowerWrapped = nextMinBound < minBound, |
| upperWrapped = nextMaxBound > maxBound; |
| |
| if (shift+precisionStep>=valSize || nextMinBound>nextMaxBound || lowerWrapped || upperWrapped) { |
| // We are in the lowest precision or the next precision is not available. |
| addRange(builder, valSize, minBound, maxBound, shift); |
| // exit the split recursion loop |
| break; |
| } |
| |
| if (hasLower) |
| addRange(builder, valSize, minBound, minBound | mask, shift); |
| if (hasUpper) |
| addRange(builder, valSize, maxBound & ~mask, maxBound, shift); |
| |
| // recurse to next precision |
| minBound = nextMinBound; |
| maxBound = nextMaxBound; |
| } |
| } |
| |
| /** Helper that delegates to correct range builder */ |
| private static void addRange( |
| final Object builder, final int valSize, |
| long minBound, long maxBound, |
| final int shift |
| ) { |
| // for the max bound set all lower bits (that were shifted away): |
| // this is important for testing or other usages of the splitted range |
| // (e.g. to reconstruct the full range). The prefixEncoding will remove |
| // the bits anyway, so they do not hurt! |
| maxBound |= (1L << shift) - 1L; |
| // delegate to correct range builder |
| switch(valSize) { |
| case 64: |
| ((LongRangeBuilder)builder).addRange(minBound, maxBound, shift); |
| break; |
| case 32: |
| ((IntRangeBuilder)builder).addRange((int)minBound, (int)maxBound, shift); |
| break; |
| default: |
| // Should not happen! |
| throw new IllegalArgumentException("valSize must be 32 or 64."); |
| } |
| } |
| |
| /** |
| * Callback for {@link #splitLongRange}. |
| * You need to overwrite only one of the methods. |
| * @lucene.internal |
| * @since 2.9, API changed non backwards-compliant in 4.0 |
| */ |
| public static abstract class LongRangeBuilder { |
| |
| /** |
| * Overwrite this method, if you like to receive the already prefix encoded range bounds. |
| * You can directly build classical (inclusive) range queries from them. |
| */ |
| public void addRange(BytesRef minPrefixCoded, BytesRef maxPrefixCoded) { |
| throw new UnsupportedOperationException(); |
| } |
| |
| /** |
| * Overwrite this method, if you like to receive the raw long range bounds. |
| * You can use this for e.g. debugging purposes (print out range bounds). |
| */ |
| public void addRange(final long min, final long max, final int shift) { |
| final BytesRefBuilder minBytes = new BytesRefBuilder(), maxBytes = new BytesRefBuilder(); |
| longToPrefixCoded(min, shift, minBytes); |
| longToPrefixCoded(max, shift, maxBytes); |
| addRange(minBytes.get(), maxBytes.get()); |
| } |
| |
| } |
| |
| /** |
| * Callback for {@link #splitIntRange}. |
| * You need to overwrite only one of the methods. |
| * @lucene.internal |
| * @since 2.9, API changed non backwards-compliant in 4.0 |
| */ |
| public static abstract class IntRangeBuilder { |
| |
| /** |
| * Overwrite this method, if you like to receive the already prefix encoded range bounds. |
| * You can directly build classical range (inclusive) queries from them. |
| */ |
| public void addRange(BytesRef minPrefixCoded, BytesRef maxPrefixCoded) { |
| throw new UnsupportedOperationException(); |
| } |
| |
| /** |
| * Overwrite this method, if you like to receive the raw int range bounds. |
| * You can use this for e.g. debugging purposes (print out range bounds). |
| */ |
| public void addRange(final int min, final int max, final int shift) { |
| final BytesRefBuilder minBytes = new BytesRefBuilder(), maxBytes = new BytesRefBuilder(); |
| intToPrefixCoded(min, shift, minBytes); |
| intToPrefixCoded(max, shift, maxBytes); |
| addRange(minBytes.get(), maxBytes.get()); |
| } |
| |
| } |
| |
| /** |
| * Filters the given {@link TermsEnum} by accepting only prefix coded 64 bit |
| * terms with a shift value of <tt>0</tt>. |
| * |
| * @param termsEnum |
| * the terms enum to filter |
| * @return a filtered {@link TermsEnum} that only returns prefix coded 64 bit |
| * terms with a shift value of <tt>0</tt>. |
| */ |
| public static TermsEnum filterPrefixCodedLongs(TermsEnum termsEnum) { |
| return new SeekingNumericFilteredTermsEnum(termsEnum) { |
| |
| @Override |
| protected AcceptStatus accept(BytesRef term) { |
| return LegacyNumericUtils.getPrefixCodedLongShift(term) == 0 ? AcceptStatus.YES : AcceptStatus.END; |
| } |
| }; |
| } |
| |
| /** |
| * Filters the given {@link TermsEnum} by accepting only prefix coded 32 bit |
| * terms with a shift value of <tt>0</tt>. |
| * |
| * @param termsEnum |
| * the terms enum to filter |
| * @return a filtered {@link TermsEnum} that only returns prefix coded 32 bit |
| * terms with a shift value of <tt>0</tt>. |
| */ |
| public static TermsEnum filterPrefixCodedInts(TermsEnum termsEnum) { |
| return new SeekingNumericFilteredTermsEnum(termsEnum) { |
| |
| @Override |
| protected AcceptStatus accept(BytesRef term) { |
| return LegacyNumericUtils.getPrefixCodedIntShift(term) == 0 ? AcceptStatus.YES : AcceptStatus.END; |
| } |
| }; |
| } |
| |
| /** Just like FilteredTermsEnum, except it adds a limited |
| * seekCeil implementation that only works with {@link |
| * #filterPrefixCodedInts} and {@link |
| * #filterPrefixCodedLongs}. */ |
| private static abstract class SeekingNumericFilteredTermsEnum extends FilteredTermsEnum { |
| public SeekingNumericFilteredTermsEnum(final TermsEnum tenum) { |
| super(tenum, false); |
| } |
| |
| @Override |
| @SuppressWarnings("fallthrough") |
| public SeekStatus seekCeil(BytesRef term) throws IOException { |
| |
| // NOTE: This is not general!! It only handles YES |
| // and END, because that's all we need for the numeric |
| // case here |
| |
| SeekStatus status = tenum.seekCeil(term); |
| if (status == SeekStatus.END) { |
| return SeekStatus.END; |
| } |
| |
| actualTerm = tenum.term(); |
| |
| if (accept(actualTerm) == AcceptStatus.YES) { |
| return status; |
| } else { |
| return SeekStatus.END; |
| } |
| } |
| } |
| |
| private static Terms intTerms(Terms terms) { |
| return new FilterLeafReader.FilterTerms(terms) { |
| @Override |
| public TermsEnum iterator() throws IOException { |
| return filterPrefixCodedInts(in.iterator()); |
| } |
| }; |
| } |
| |
| private static Terms longTerms(Terms terms) { |
| return new FilterLeafReader.FilterTerms(terms) { |
| @Override |
| public TermsEnum iterator() throws IOException { |
| return filterPrefixCodedLongs(in.iterator()); |
| } |
| }; |
| } |
| |
| /** |
| * Returns the minimum int value indexed into this |
| * numeric field or null if no terms exist. |
| */ |
| public static Integer getMinInt(Terms terms) throws IOException { |
| // All shift=0 terms are sorted first, so we don't need |
| // to filter the incoming terms; we can just get the |
| // min: |
| BytesRef min = terms.getMin(); |
| return (min != null) ? LegacyNumericUtils.prefixCodedToInt(min) : null; |
| } |
| |
| /** |
| * Returns the maximum int value indexed into this |
| * numeric field or null if no terms exist. |
| */ |
| public static Integer getMaxInt(Terms terms) throws IOException { |
| BytesRef max = intTerms(terms).getMax(); |
| return (max != null) ? LegacyNumericUtils.prefixCodedToInt(max) : null; |
| } |
| |
| /** |
| * Returns the minimum long value indexed into this |
| * numeric field or null if no terms exist. |
| */ |
| public static Long getMinLong(Terms terms) throws IOException { |
| // All shift=0 terms are sorted first, so we don't need |
| // to filter the incoming terms; we can just get the |
| // min: |
| BytesRef min = terms.getMin(); |
| return (min != null) ? LegacyNumericUtils.prefixCodedToLong(min) : null; |
| } |
| |
| /** |
| * Returns the maximum long value indexed into this |
| * numeric field or null if no terms exist. |
| */ |
| public static Long getMaxLong(Terms terms) throws IOException { |
| BytesRef max = longTerms(terms).getMax(); |
| return (max != null) ? LegacyNumericUtils.prefixCodedToLong(max) : null; |
| } |
| |
| } |