| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| package org.apache.datasketches.sampling; |
| |
| import org.apache.datasketches.SketchesArgumentException; |
| import org.apache.datasketches.Util; |
| |
| /** |
| * This class provides a compact representation of reservoir size by encoding it into a |
| * fixed-point 16-bit value. |
| * <p>The value itself is a fractional power of 2, with 5 bits of exponent and 11 bits of |
| * mantissa. The exponent allows a choice of anywhere from 0-30, and there are 2048 possible |
| * reservoir size values within each octave. Because reservoir size must be an integer, this |
| * means that some sizes below 2048 may have multiple valid encodings.</p> |
| * |
| * <p>Reservoir sizes can be specified exactly to 4096, and may be off by up to 0.03% thereafter. |
| * The value returned is always at least as large as the requested size, but may be larger. |
| * </p> |
| * |
| * <p>NOTE: Numerical instability may cause an off-by-one error on reservoir size, causing a |
| * slight increase in storage over the optimal value.</p> |
| * |
| * @author Jon Malkin |
| */ |
| final class ReservoirSize { |
| /** |
| * Number of bins per power of two. |
| */ |
| static final int BINS_PER_OCTAVE = 2048; |
| |
| /** |
| * Precomputed inverse values for efficiency |
| */ |
| private static final double INV_BINS_PER_OCTAVE = 1.0 / BINS_PER_OCTAVE; |
| private static final double INV_LN_2 = 1.0 / Math.log(2.0); |
| |
| /** |
| * Values for encoding/decoding |
| */ |
| private static final int EXPONENT_MASK = 0x1F; |
| private static final int EXPONENT_SHIFT = 11; |
| private static final int INDEX_MASK = 0x07FF; |
| private static final int OUTPUT_MASK = 0xFFFF; |
| private static final int MAX_ABS_VALUE = 2146959360; |
| private static final int MAX_ENC_VALUE = 0xF7FF; // p=30, i=2047 |
| |
| private ReservoirSize() {} |
| |
| /** |
| * Given target reservoir size k, computes the smallest representable reservoir size that can |
| * hold k entries and returns it in a 16-bit fixed-point format as a <code>short</code>. |
| * |
| * @param k target reservoir size |
| * @return reservoir size as 16-bit encoded value |
| */ |
| public static short computeSize(final int k) { |
| if ((k < 1) || (k > MAX_ABS_VALUE)) { |
| throw new SketchesArgumentException("Can only encode strictly positive sketch sizes " |
| + "less than " + MAX_ABS_VALUE + ", found: " + k); |
| } |
| |
| final int p = Util.toLog2(Util.floorPowerOf2(k), "computeSize: p"); |
| |
| // because of floor() + 1 below, need to check power of 2 here |
| if (Util.isPowerOf2(k)) { |
| return (short) (((p & EXPONENT_MASK) << EXPONENT_SHIFT) & OUTPUT_MASK); |
| } |
| |
| // mantissa is scalar in range [1,2); can reconstruct k as m * 2^p |
| final double m = Math.pow(2.0, (Math.log(k) * INV_LN_2) - p); |
| |
| // Convert to index offset: ceil(m * BPO) - BPO |
| // Typically in range range 0-(BINS_PER_OCTAVE-1) (but see note below) |
| final int i = ((int) Math.floor(m * BINS_PER_OCTAVE) - BINS_PER_OCTAVE) + 1; |
| |
| // Due to ceiling, possible to overflow BINS_PER_OCTAVE |
| // E.g., if BPO = 2048 then for k=32767 we have p=14. Except that 32767 > decodeValue |
| // (p=14, i=2047)=32756, so we encode and return p+1 |
| if (i == BINS_PER_OCTAVE) { |
| return (short) ((((p + 1) & EXPONENT_MASK) << EXPONENT_SHIFT) & OUTPUT_MASK); |
| } |
| |
| return (short) (((p & EXPONENT_MASK) << EXPONENT_SHIFT) | ((i & INDEX_MASK) & OUTPUT_MASK)); |
| } |
| |
| /** |
| * Decodes the 16-bit reservoir size value into an int. |
| * |
| * @param encodedSize Encoded 16-bit value |
| * @return int represented by <code>encodedSize</code> |
| */ |
| public static int decodeValue(final short encodedSize) { |
| final int value = encodedSize & 0xFFFF; |
| |
| if (value > MAX_ENC_VALUE) { |
| throw new SketchesArgumentException("Maximum valid encoded value is " |
| + Integer.toHexString(MAX_ENC_VALUE) + ", found: " + value); |
| } |
| |
| final int p = (value >>> EXPONENT_SHIFT) & EXPONENT_MASK; |
| final int i = value & INDEX_MASK; |
| |
| return (int) ((1 << p) * ((i * INV_BINS_PER_OCTAVE) + 1.0)); |
| } |
| } |