blob: 76f8af3fd826af87646aa4464a8dc1b31a5f30f3 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.statistics.inference;
/**
* Computes double-double floating-point operations.
*
* <p>A double-double is an unevaluated sum of two IEEE double precision numbers capable
* of representing at least 106 bits of significand.
*
* <p>This implementation performs all computations using the explicit high and low parts
* of the double-double numbers. Results are written to a provided double-double instance
* which is returned for convenience. This allows the same double-double instance to be an
* argument and the result of a computation. The double-double class is mutable but can
* only be modified by methods within this class. All static methods in this class will
* not allocate objects so minimising memory allocation of intermediate numbers during
* computation.
*
* <p>Note: This is not a full double-double implementation. Only the methods required
* within this package for extended precision computations have been implemented.
*
* <p>References:
* <ol>
* <li>
* Dekker, T.J. (1971)
* <a href="https://doi.org/10.1007/BF01397083">
* A floating-point technique for extending the available precision</a>
* Numerische Mathematik, 18:224–242.
* <li>
* Shewchuk, J.R. (1997)
* <a href="https://www-2.cs.cmu.edu/afs/cs/project/quake/public/papers/robust-arithmetic.ps">
* Arbitrary Precision Floating-Point Arithmetic</a>.
* <li>
* Hide, Y, Li, X.S. and Bailey, D.H. (2008)
* <a href="https://www.davidhbailey.com/dhbpapers/qd.pdf">
* Library for Double-Double and Quad-Double Arithmetic</a>.
* </ol>
*
* @since 1.1
*/
final class DD {
// Caveat:
//
// The code below uses many additions/subtractions that may
// appear redundant. However, they should NOT be simplified, as they
// do use IEEE754 floating point arithmetic rounding properties.
//
// Algorithms are based on computing the product or sum of two values x and y in
// extended precision. The standard result is stored using a double (high part z) and
// the round-off error (or low part zz) is stored in a second double, e.g:
// x * y = (z, zz); z + zz = x * y
// x + y = (z, zz); z + zz = x + y
//
// The building blocks for double-double arithmetic are:
//
// Fast-Two-Sum: Addition of two doubles (ordered |x| > |y|) to a double-double
// Two-Sum: Addition of two doubles (unordered) to a double-double
// Two-Prod: Multiplication of two doubles to a double-double
//
// These are used to create functions operating on double and double-double numbers.
//
// To sum multiple (z, zz) results ideally the parts are sorted in order of
// non-decreasing magnitude and summed. This is exact if each number's most significant
// bit is below the least significant bit of the next (i.e. does not
// overlap). Creating non-overlapping parts requires a rebalancing
// of adjacent pairs using a summation z + zz = (z1, zz1) iteratively through the parts
// (see Shewchuk (1997) Grow-Expansion and Expansion-Sum [2]).
//
// Accurate summation of an expansion (more than one double value) to a double-double
// performs a two-sum through the expansion e (length m).
// The single pass with two-sum ensures that the final term e_m is a good approximation
// for e: |e - e_m| < ulp(e_m); and the sum of the parts to
// e_(m-1) is within 1 ULP of the round-off ulp(|e - e_m|).
// These final two terms create the double-double result using two-sum.
/**
* The multiplier used to split the double value into high and low parts. From
* Dekker (1971): "The constant should be chosen equal to 2^(p - p/2) + 1,
* where p is the number of binary digits in the mantissa". Here p is 53
* and the multiplier is {@code 2^27 + 1}.
*/
private static final double MULTIPLIER = 1.0 + 0x1.0p27;
/**
* The upper limit above which a number may overflow during the split into a high part.
* Assuming the multiplier is above 2^27 and the maximum exponent is 1023 then a safe
* limit is a value with an exponent of (1023 - 27) = 2^996.
* 996 is the value obtained from {@code Math.getExponent(Double.MAX_VALUE / MULTIPLIER)}.
*/
private static final double SAFE_UPPER = 0x1.0p996;
/** The scale to use when down-scaling during a split into a high part.
* This must be smaller than the inverse of the multiplier and a power of 2 for exact scaling. */
private static final double DOWN_SCALE = 0x1.0p-30;
/** The scale to use when re-scaling during a split into a high part.
* This is the inverse of {@link #DOWN_SCALE}. */
private static final double UP_SCALE = 0x1.0p30;
/** The mask to extract the raw 11-bit exponent.
* The value must be shifted 52-bits to remove the mantissa bits. */
private static final int EXP_MASK = 0x7ff;
/** The value 2046 converted for use if using {@link Integer#compareUnsigned(int, int)}.
* This requires adding {@link Integer#MIN_VALUE} to 2046. */
private static final int CMP_UNSIGNED_2046 = Integer.MIN_VALUE + 2046;
/** The value -1 converted for use if using {@link Integer#compareUnsigned(int, int)}.
* This requires adding {@link Integer#MIN_VALUE} to -1. */
private static final int CMP_UNSIGNED_MINUS_1 = Integer.MIN_VALUE - 1;
/** The value 1022 converted for use if using {@link Integer#compareUnsigned(int, int)}.
* This requires adding {@link Integer#MIN_VALUE} to 1022. */
private static final int CMP_UNSIGNED_1022 = Integer.MIN_VALUE + 1022;
/** 2^512. */
private static final double TWO_POW_512 = 0x1.0p512;
/** 2^-512. */
private static final double TWO_POW_M512 = 0x1.0p-512;
/** Mask to remove the sign bit from a long. */
private static final long UNSIGN_MASK = 0x7fff_ffff_ffff_ffffL;
/** Mask to extract the 52-bit mantissa from a long representation of a double. */
private static final long MANTISSA_MASK = 0x000f_ffff_ffff_ffffL;
/** Exponent offset in IEEE754 representation. */
private static final int EXPONENT_OFFSET = 1023;
/** 0.5. */
private static final double HALF = 0.5;
/** The limit for safe multiplication of {@code x*y}, assuming values above 1.
* Used to maintain positive values during the power computation. */
private static final double SAFE_MULTIPLY = 0x1.0p500;
/** Error message when the input is not a normalized double: {@code x != x + xx}. */
private static final String NOT_NOMALIZED = "Input is not a normalized double-double";
/** The high part of the double-double number. */
private double hi;
/** The low part of the double-double number. */
private double lo;
/**
* Create a double-double. The value is zero.
*/
private DD() {
// Do nothing
}
/**
* Copy constructor.
*
* @param source Source to copy.
*/
private DD(DD source) {
hi = source.hi;
lo = source.lo;
}
/**
* Return a copy of this number.
*
* @return the copy
*/
DD copy() {
return new DD(this);
}
/**
* Creates the a new instance of a double-double number. The value is zero.
*
* @return the double-double
*/
static DD create() {
return new DD();
}
/**
* Creates the double-double number as the value {@code (x, 0)}.
*
* @param x Value.
* @return the double-double
*/
static DD create(double x) {
final DD z = new DD();
z.hi = x;
return z;
}
/**
* Creates the double-double number as the sum of two values {@code (x, xx) = a + b}.
* The number will be normalised such that {@code |x| > epsilon * |xx|}.
* The arguments are not required to be ordered by magnitude,
* i.e. the result is commutative {@code (x, xx) = a + b == b + a}.
*
* @param a First part of sum.
* @param b Second part of sum.
* @return the double-double
*/
static DD create(double a, double b) {
return twoSum(a, b, new DD());
}
/**
* Gets the high part of the double-double number.
*
* @return the high part
*/
double hi() {
return hi;
}
/**
* Gets the low part of the double-double number.
*
* @return the low part
*/
double lo() {
return lo;
}
/**
* Get the value as a double.
*
* @return the value converted to a double
*/
double doubleValue() {
return hi + lo;
}
/**
* Set the double-double value to {@code (x, xx)}.
*
* @param x High part of x.
* @param xx Low part of x.
* @return a reference to {@code this}
*/
private DD set(double x, double xx) {
hi = x;
lo = xx;
return this;
}
/**
* Sets the double-double number as the value {@code (x, 0)}.
*
* @param x High part.
* @param z Number (result).
* @return the double-double
*/
static DD set(double x, DD z) {
z.set(x, 0);
return z;
}
/**
* Sets the double-double number as the value {@code (x, xx)}.
* The number must be normalised such that {@code x == x + xx}.
*
* @param x High part.
* @param xx Low part.
* @param z Number (result).
* @return the double-double
*/
static DD set(double x, double xx, DD z) {
assert x == x + xx : NOT_NOMALIZED;
z.set(x, xx);
return z;
}
/**
* Compute the sum of two numbers {@code a} and {@code b} using
* Dekker's two-sum algorithm. The values are required to be ordered by magnitude:
* {@code |a| >= |b|}.
*
* <p>If {@code a} is zero and {@code b} is non-zero the returned value is {@code (b, 0)}.
*
* @param a First part of sum.
* @param b Second part of sum.
* @param s Sum (result).
* @return the sum
* @see #fastTwoDiff(double, double, DD)
* @see <a href="http://www-2.cs.cmu.edu/afs/cs/project/quake/public/papers/robust-arithmetic.ps">
* Shewchuk (1997) Theorum 6</a>
*/
static DD fastTwoSum(double a, double b, DD s) {
// (x, xx) = a + b
// bVirtual = x - a
// xx = b - bVirtual
final double x = a + b;
s.lo = b - (x - a);
s.hi = x;
return s;
}
/**
* Compute the round-off of the sum of two numbers {@code a} and {@code b} using
* Dekker's two-sum algorithm. The values are required to be ordered by magnitude:
* {@code |a| >= |b|}.
*
* <p>If {@code a} is zero and {@code b} is non-zero the returned value is zero.
*
* @param a First part of sum.
* @param b Second part of sum.
* @param x Sum.
* @return the sum round-off
* @see #fastTwoSum(double, double, DD)
*/
private static double fastTwoSumLow(double a, double b, double x) {
return b - (x - a);
}
/**
* Compute the difference of two numbers {@code a} and {@code b} using
* Dekker's two-sum algorithm. The values are required to be ordered by magnitude:
* {@code |a| >= |b|}.
*
* <p>Computes the same results as {@link #fastTwoSum(double, double, DD) fastTwoSum(a, -b)}.
*
* @param a Minuend.
* @param b Subtrahend.
* @param d Difference (result).
* @return the difference
* @see #fastTwoSum(double, double, DD)
* @see <a href="http://www-2.cs.cmu.edu/afs/cs/project/quake/public/papers/robust-arithmetic.ps">
* Shewchuk (1997) Theorum 6</a>
*/
static DD fastTwoDiff(double a, double b, DD d) {
// (x, xx) = a - b
// bVirtual = a - x
// xx = bVirtual - b
final double x = a - b;
d.lo = (a - x) - b;
d.hi = x;
return d;
}
/**
* Compute the sum of two numbers {@code a} and {@code b} using
* Knuth's two-sum algorithm. The values are not required to be ordered by magnitude,
* i.e. the result is commutative {@code s = a + b == b + a}.
*
* @param a First part of sum.
* @param b Second part of sum.
* @param s Sum (result).
* @return the sum
* @see #twoDiff(double, double, DD)
* @see <a href="http://www-2.cs.cmu.edu/afs/cs/project/quake/public/papers/robust-arithmetic.ps">
* Shewchuk (1997) Theorum 7</a>
*/
static DD twoSum(double a, double b, DD s) {
// (x, xx) = a + b
// bVirtual = x - a
// aVirtual = x - bVirtual
// bRoundoff = b - bVirtual
// aRoundoff = a - aVirtual
// xx = aRoundoff + bRoundoff
final double x = a + b;
final double bVirtual = x - a;
s.lo = (a - (x - bVirtual)) + (b - bVirtual);
s.hi = x;
return s;
}
/**
* Compute the round-off of the sum of two numbers {@code a} and {@code b} using
* Knuth two-sum algorithm. The values are not required to be ordered by magnitude,
* i.e. the result is commutative {@code s = a + b == b + a}.
*
* @param a First part of sum.
* @param b Second part of sum.
* @param x Sum.
* @return the sum round-off
* @see #twoSum(double, double, DD)
*/
private static double twoSumLow(double a, double b, double x) {
final double bVirtual = x - a;
return (a - (x - bVirtual)) + (b - bVirtual);
}
/**
* Compute the difference of two numbers {@code a} and {@code b} using
* Knuth's two-sum algorithm. The values are not required to be ordered by magnitude
*
* <p>Computes the same results as {@link #twoSum(double, double, DD) twoSum(a, -b)}.
*
* @param a Minuend.
* @param b Subtrahend.
* @param d Difference (result).
* @return the difference
* @see #twoSum(double, double, DD)
*/
static DD twoDiff(double a, double b, DD d) {
// (x, xx) = a - b
// bVirtual = a - x
// aVirtual = x + bVirtual
// bRoundoff = b - bVirtual
// aRoundoff = a - aVirtual
// xx = aRoundoff - bRoundoff
final double x = a - b;
final double bVirtual = a - x;
d.lo = (a - (x + bVirtual)) - (b - bVirtual);
d.hi = x;
return d;
}
/**
* Compute the double-double number {@code (z,zz)} for the exact
* product of {@code x} and {@code y}.
*
* <p>The method is written to be functionally similar to using a fused multiply add (FMA)
* operation to compute the low part, for example JDK 9's Math.fma function:
* <pre>
* double x = ...;
* double y = ...;
* double xy = x * y;
* double low = Math.fma(x, y, -xy);
* </pre>
*
* <p>This creates the following special cases:
*
* <ul>
* <li>If {@code x * y} is sub-normal or zero then the low part is 0.0.
* <li>If {@code x * y} is infinite or NaN then the low part is NaN.
* </ul>
*
* @param x First factor.
* @param y Second factor.
* @param z Product (result).
* @return the product
*/
static DD twoProd(double x, double y, DD z) {
final double xy = x * y;
z.hi = xy;
// If the number is sub-normal, inf or nan there is no round-off.
if (isNotNormal(xy)) {
// Returns 0.0 for sub-normal xy, otherwise NaN for inf/nan:
z.lo = xy - xy;
return z;
}
// The result xy is finite and normal.
// Use Dekker's mul12 algorithm that splits the values into high and low parts.
// Dekker's split using multiplication will overflow if the value is within 2^27
// of double max value. It can also produce 26-bit approximations that are larger
// than the input numbers for the high part causing overflow in hx * hy when
// x * y does not overflow. So we must scale down big numbers.
// We only have to scale the largest number as we know the product does not overflow
// (if one is too big then the other cannot be).
// We also scale if the product is close to overflow to avoid intermediate overflow.
// This could be done at a higher limit (e.g. Math.abs(xy) > Double.MAX_VALUE / 4)
// but is included here to have a single low probability branch condition.
// Add the absolute inputs for a single comparison. The sum will not be more than
// 3-fold higher than any component.
final double a = Math.abs(x);
final double b = Math.abs(y);
if (a + b + Math.abs(xy) >= SAFE_UPPER) {
// Only required to scale the largest number as x*y does not overflow.
if (a > b) {
z.lo = productLowUnscaled(x * DOWN_SCALE, y, xy * DOWN_SCALE) * UP_SCALE;
} else {
z.lo = productLowUnscaled(x, y * DOWN_SCALE, xy * DOWN_SCALE) * UP_SCALE;
}
} else {
// No scaling required. This is the expected branch for a finite product.
z.lo = productLowUnscaled(x, y, xy);
}
return z;
}
/**
* Compute the low part of the double length number {@code (z,zz)} for the exact
* product of {@code x} and {@code y} using Dekker's mult12 algorithm. The standard
* precision product {@code x*y} must be provided. The numbers {@code x} and {@code y}
* are split into high and low parts using Dekker's algorithm.
*
* <p>Warning: This method does not perform scaling in Dekker's split and large
* finite numbers can create NaN results.
*
* @param x First factor.
* @param y Second factor.
* @param xy Product of the factors (x * y).
* @return the low part of the product double length number
* @see #highPartUnscaled(double)
*/
private static double productLowUnscaled(double x, double y, double xy) {
// Split the numbers using Dekker's algorithm without scaling
final double hx = highPartUnscaled(x);
final double lx = x - hx;
final double hy = highPartUnscaled(y);
final double ly = y - hy;
// Compute the multiply low part:
// err1 = xy - hx * hy
// err2 = err1 - lx * hy
// err3 = err2 - hx * ly
// low = lx * ly - err3
return lx * ly - (((xy - hx * hy) - lx * hy) - hx * ly);
}
/**
* Compute the low part of the double length number {@code (z,zz)} for the exact
* product of {@code x} and {@code y} using Dekker's mult12 algorithm. The standard
* precision product {@code x*y}, and the high and low parts of the factors must be
* provided.
*
* @param hx High-part of first factor.
* @param lx Low-part of first factor.
* @param hy High-part of second factor.
* @param ly Low-part of second factor.
* @param xy Product of the factors (x * y).
* @return the low part of the product double length number
* @see #productLowUnscaled(double, double, double)
*/
private static double productLowUnscaled(double hx, double lx, double hy, double ly, double xy) {
return lx * ly - (((xy - hx * hy) - lx * hy) - hx * ly);
}
/**
* Compute the low part of the double length number {@code (z,zz)} for the exact
* square of {@code x} using Dekker's mult12 algorithm. The standard
* precision square {@code x*x} must be provided. The number {@code x}
* is split into high and low parts using Dekker's algorithm.
*
* <p>Warning: This method does not perform scaling in Dekker's split and large
* finite numbers can create NaN results.
*
* @param x Factor.
* @param x2 Square of the factor (x * x).
* @return the low part of the square double length number
* @see #highPartUnscaled(double)
* @see #productLowUnscaled(double, double, double)
*/
private static double squareLowUnscaled(double x, double x2) {
// See productLowUnscaled
final double hx = highPartUnscaled(x);
final double lx = x - hx;
return lx * lx - ((x2 - hx * hx) - 2 * lx * hx);
}
/**
* Compute the low part of the double length number {@code (z,zz)} for the exact
* square of {@code x} using Dekker's mult12 algorithm. The standard
* precision square {@code x*x}, and the high and low parts of the factors must be
* provided.
*
* @param hx High-part of factor.
* @param lx Low-part of factor.
* @param x2 Square of the factor (x * x).
* @return the low part of the square double length number
* @see #squareLowUnscaled(double, double)
*/
private static double squareLowUnscaled(double hx, double lx, double x2) {
return lx * lx - ((x2 - hx * hx) - 2 * lx * hx);
}
/**
* Implement Dekker's method to split a value into two parts. Multiplying by (2^s + 1) creates
* a big value from which to derive the two split parts.
* <pre>
* c = (2^s + 1) * a
* a_big = c - a
* a_hi = c - a_big
* a_lo = a - a_hi
* a = a_hi + a_lo
* </pre>
*
* <p>The multiplicand allows a p-bit value to be split into
* (p-s)-bit value {@code a_hi} and a non-overlapping (s-1)-bit value {@code a_lo}.
* Combined they have (p-1) bits of significand but the sign bit of {@code a_lo}
* contains a bit of information. The constant is chosen so that s is ceil(p/2) where
* the precision p for a double is 53-bits (1-bit of the mantissa is assumed to be
* 1 for a non sub-normal number) and s is 27.
*
* <p>This conversion does not use scaling and the result of overflow is NaN. Overflow
* may occur when the exponent of the input value is above 996.
*
* <p>Splitting a NaN or infinite value will return NaN.
*
* @param value Value.
* @return the high part of the value.
* @see Math#getExponent(double)
*/
private static double highPartUnscaled(double value) {
final double c = MULTIPLIER * value;
return c - (c - value);
}
/**
* Checks if the number is not normal. This is functionally equivalent to:
* <pre>{@code
* final double abs = Math.abs(a);
* return (abs <= Double.MIN_NORMAL || !(abs <= Double.MAX_VALUE));
* }</pre>
*
* @param a The value.
* @return true if the value is not normal
*/
private static boolean isNotNormal(double a) {
// Sub-normal numbers have a biased exponent of 0.
// Inf/NaN numbers have a biased exponent of 2047.
// Catch both cases by extracting the raw exponent, subtracting 1
// and compare unsigned (so 0 underflows to a unsigned large value).
final int baisedExponent = ((int) (Double.doubleToRawLongBits(a) >>> 52)) & EXP_MASK;
// Pre-compute the additions used by Integer.compareUnsigned
return baisedExponent + CMP_UNSIGNED_MINUS_1 >= CMP_UNSIGNED_2046;
}
/**
* Compute the sum of {@code (x, xx)} and {@code y}.
*
* <p>This computes the same result as
* {@link #fastAdd(double, double, double, double, DD) add(x, xx, y, 0, s)}.
*
* @param x High part of x.
* @param xx Low part of x.
* @param y y.
* @param s Sum (result).
* @return the sum
* @see #fastAdd(double, double, double, double, DD)
*/
static DD fastAdd(double x, double xx, double y, DD s) {
// (s0, s1) = x + y
twoSum(x, y, s);
// Note: if x + y cancel to a non-zero result then s.hi is >= 1 ulp of x.
// This is larger than xx so fast-two-sum can be used.
return fastTwoSum(s.hi, s.lo + xx, s);
}
/**
* Compute the sum of {@code (x, xx)} and {@code y}.
*
* <p>This computes the same result as
* {@link #add(double, double, double, double, DD) add(x, xx, y, 0, s)}.
*
* @param x High part of x.
* @param xx Low part of x.
* @param y y.
* @param s Sum (result).
* @return the sum
* @see #add(double, double, double, double, DD)
*/
static DD add(double x, double xx, double y, DD s) {
// Grow expansion (Schewchuk): (x, xx) + y -> (s0, s1, s2)
twoSum(xx, y, s);
double s2 = s.lo;
twoSum(x, s.hi, s);
final double s1 = s.lo;
final double s0 = s.hi;
// Compress (Schewchuk Fig. 15): (s0, s1, s2) -> (s0, s1)
fastTwoSum(s1, s2, s);
s2 = s.lo;
fastTwoSum(s0, s.hi, s);
// Here (s0, s1) = s
// e = exact 159-bit result
// |e - s0| <= ulp(s0)
// |s1 + s2| <= ulp(e - s0)
return fastTwoSum(s.hi, s2 + s.lo, s);
}
/**
* Compute the sum of {@code (x, xx)} and {@code (y, yy)}.
*
* <p>The result is within the error bound {@code 4 eps^2} with {@code eps = 2^-53}.
*
* @param x High part of x.
* @param xx Low part of x.
* @param y High part of y.
* @param yy Low part of y.
* @param s Sum (result).
* @return the sum
* @see #add(double, double, double, double, DD)
*/
static DD fastAdd(double x, double xx, double y, double yy, DD s) {
// Sum parts and save
// (p, pp) = x + y
twoSum(x, y, s);
final double p = s.hi;
final double pp = s.lo;
// (q, qq) = xx + yy
twoSum(xx, yy, s);
final double q = s.hi;
final double qq = s.lo;
// result = p + q
// |pp| is >= 1 ulp of max(|x|, |y|)
// |q| is >= 1 ulp of max(|xx|, |yy|)
fastTwoSum(p, pp + q, s);
return fastTwoSum(s.hi, s.lo + qq, s);
}
/**
* Compute the sum of {@code (x, xx)} and {@code (y, yy)}.
*
* <p>The high-part of the result is within 1 ulp of the true sum {@code e}.
* The low-part of the result is within 1 ulp of the result of the high-part
* subtracted from the true sum {@code e - hi}.
*
* @param x High part of x.
* @param xx Low part of x.
* @param y High part of y.
* @param yy Low part of y.
* @param s Sum (result).
* @return the sum
* @see #fastAdd(double, double, double, double, DD)
*/
static DD add(double x, double xx, double y, double yy, DD s) {
// Expansion sum (Schewchuk Fig 7): (x, xx) + (x, yy) -> (s0, s1, s2, s3)
twoSum(xx, yy, s);
double s3 = s.lo;
twoSum(x, s.hi, s);
// (s0, s1, s2) == (s.hi, s.lo, s3)
double s0 = s.hi;
twoSum(s.lo, y, s);
double s2 = s.lo;
twoSum(s0, s.hi, s);
// s1 = s.lo
s0 = s.hi;
// Compress (Schewchuk Fig. 15) (s0, s1, s2, s3) -> (s0, s1)
fastTwoSum(s.lo, s2, s);
final double s1 = s.hi;
fastTwoSum(s.lo, s3, s);
// s2 = s.hi
s3 = s.lo;
fastTwoSum(s1, s.hi, s);
s2 = s.lo;
fastTwoSum(s0, s.hi, s);
// Here (s0, s1) = s
// e = exact 212-bit result
// |e - s0| <= ulp(s0)
// |s1 + s2 + s3| <= ulp(e - s0) (Sum magnitudes small to high)
return fastTwoSum(s.hi, s3 + s2 + s.lo, s);
}
/**
* Compute the multiplication product of {@code (x, xx)} and {@code y}.
*
* <p>This computes the same result as
* {@link #multiply(double, double, double, double, DD) multiply(x, xx, y, 0, s)}.
*
* @param x High part of x.
* @param xx Low part of x.
* @param y High part of y.
* @param p Product (result).
* @return the product
* @see #multiply(double, double, double, double, DD)
*/
static DD multiply(double x, double xx, double y, DD p) {
// Dekker mul2 with yy=0
// (Alternative: Scale expansion (Schewchuk Fig 13))
twoProd(x, y, p);
// Save 2 FLOPS compared to multiply(x, xx, y, 0).
// This is reused in divide to save more FLOPS so worth the optimisation.
return fastTwoSum(p.hi, p.lo + xx * y, p);
}
/**
* Compute the multiplication product of {@code (x, xx)} and {@code (y, yy)}.
*
* <p>The result is within the error bound {@code 16 eps^2} with {@code eps = 2^-53}.
*
* @param x High part of x.
* @param xx Low part of x.
* @param y High part of y.
* @param yy Low part of y.
* @param p Product (result).
* @return the product
*/
static DD multiply(double x, double xx, double y, double yy, DD p) {
// Dekker mul2
// (Alternative: Scale expansion (Schewchuk Fig 13))
twoProd(x, y, p);
return fastTwoSum(p.hi, p.lo + (x * yy + xx * y), p);
}
/**
* Compute the multiplication product of {@code (x, xx)} and {@code (y, yy)}.
*
* <p>This computes the same result as
* {@link #multiply(double, double, double, double, DD) multiply(x, xx, y, yy,
* p)} without checks for overflow of intermediates. An exception is if either
* argument is a signed zero; in this case the sign of the zero result may be
* different.
*
* <p>Use when the magnitude of both factors is below {@code 2^511} and the
* product will be finite.
*
* @param x High part of x.
* @param xx Low part of x.
* @param y High part of y.
* @param yy Low part of y.
* @param p Product (result).
* @return the product
* @see #multiply(double, double, double, double, DD)
*/
static DD uncheckedMultiply(double x, double xx, double y, double yy, DD p) {
// Dekker mul2
final double hi = x * y;
final double lo = productLowUnscaled(x, y, hi);
return fastTwoSum(hi, lo + (x * yy + xx * y), p);
}
/**
* Compute the division of {@code x} by {@code y}.
* If {@code y = 0} the result is undefined.
*
* @param x x.
* @param y y.
* @param q Quotient (result).
* @return the quotient
*/
static DD divide(double x, double y, DD q) {
// Long division
// quotient q0 = x / y
final double q0 = x / y;
// remainder r = x - q0 * y
twoProd(q0, y, q);
final double p1 = q.lo;
twoDiff(x, q.hi, q);
q.lo -= p1;
// correction term q1 = r0 / y
final double q1 = q.doubleValue() / y;
return fastTwoSum(q0, q1, q);
}
/**
* Compute the division of {@code x} by {@code y}.
* If {@code y = 0} the result is undefined.
*
* <p>This computes the same result as {@link #divide(double, double, DD) divide(x, y, q)}
* without checks for intermediate overflow. Use when the magnitude of
* both {@code x/y} and {@code y} are below {@code 2^996}.
*
* @param x x.
* @param y y.
* @param q Quotient (result).
* @return the quotient
*/
static DD uncheckedDivide(double x, double y, DD q) {
// Long division
// quotient q0 = x / y
final double q0 = x / y;
// remainder r = x - q0 * y
final double p0 = q0 * y;
final double p1 = productLowUnscaled(q0, y, p0);
twoDiff(x, p0, q);
q.lo -= p1;
// correction term q1 = r0 / y
final double q1 = q.doubleValue() / y;
return fastTwoSum(q0, q1, q);
}
/**
* Compute the division of {@code (x, xx)} by {@code y}.
* If {@code y = 0} the result is undefined.
*
* @param x High part of x.
* @param xx Low part of x.
* @param y High part of y.
* @param yy Low part of y.
* @param q Quotient (result).
* @return the quotient
*/
static DD divide(double x, double xx, double y, double yy, DD q) {
// Long division
// quotient q0 = x / y
final double q0 = x / y;
// remainder r0 = x - q0 * y
multiply(y, yy, q0, q);
add(-q.hi, -q.lo, x, xx, q);
final double r = q.hi;
final double rr = q.lo;
// next quotient q1 = r0 / y
final double q1 = r / y;
// remainder r1 = r0 - q1 * y
multiply(y, yy, q1, q);
add(-q.hi, -q.lo, r, rr, q);
// next quotient q2 = r1 / y
final double q2 = q.hi / y;
// Collect (q0, q1, q2)
fastTwoSum(q0, q1, q);
return twoSum(q.hi, q.lo + q2, q);
}
/**
* Compute the inverse of {@code (y, yy)}.
* If {@code y = 0} the result is undefined.
*
* @param y High part of y.
* @param yy Low part of y.
* @param q Quotient 1 / y (result).
* @return the inverse
*/
static DD inverse(double y, double yy, DD q) {
// As per divide using (x, xx) = (1, 0)
// quotient q0 = x / y
final double q0 = 1 / y;
// remainder r0 = x - q0 * y
multiply(y, yy, q0, q);
// This add saves 2 twoSum and 2 fastTwoSum (18 FLOPS)
add(-q.hi, -q.lo, 1, q);
final double r = q.hi;
final double rr = q.lo;
// next quotient q1 = r0 / y
final double q1 = r / y;
// remainder r1 = r0 - q1 * y
multiply(y, yy, q1, q);
add(-q.hi, -q.lo, r, rr, q);
// next quotient q2 = r1 / y
final double q2 = q.hi / y;
// Collect (q0, q1, q2)
fastTwoSum(q0, q1, q);
return twoSum(q.hi, q.lo + q2, q);
}
/**
* Multiply the floating-point number {@code x} by an integral power of two.
*
* <p>This performs the same result as:
* <pre>
* hi = Math.scalb(x, exp);
* lo = Math.scalb(x, exp);
* </pre>
*
* <p>The implementation computes this with a single multiplication if {@code exp}
* is in {@code [-1022, 1023]}. Otherwise the pair {@code (x, xx)} are scaled by
* repeated multiplication by power-of-two factors without any loss of precision.
*
* <p>This is named using the equivalent function in the standard C math.h library.
*
* @param x High part of x.
* @param xx Low part of x.
* @param exp Power of two scale factor.
* @param r Result.
* @return the result
* @see Math#scalb(double, int)
* @see #frexp(double, double, DD)
* @see <a href="https://www.cplusplus.com/reference/cmath/ldexp/">C math.h ldexp</a>
*/
static DD ldexp(double x, double xx, int exp, DD r) {
// Handle scaling when 2^n can be represented with a single normal number
// n >= -1022 && n <= 1023
// Using unsigned compare => n + 1022 <= 1023 + 1022
if (exp + CMP_UNSIGNED_1022 < CMP_UNSIGNED_2046) {
final double s = twoPow(exp);
r.hi = x * s;
r.lo = xx * s;
return r;
}
// Scale by multiples of 2^512 (largest representable power of 2).
// Scaling requires max 5 multiplications to under/overflow any normal value.
// Break this down into e.g.: 2^512^(exp / 512) * 2^(exp % 512)
// Number of multiples n = exp / 512 : exp >>> 9
// Remainder m = exp % 512 : exp & 511 (exp must be positive)
int n;
int m;
double p;
if (exp < 0) {
// Downscaling
// (Note: Using an unsigned shift handles negation of min value: -2^31)
n = -exp >>> 9;
// m = exp % 512
m = -(-exp & 511);
p = TWO_POW_M512;
} else {
// Upscaling
n = exp >>> 9;
m = exp & 511;
p = TWO_POW_512;
}
// Multiply by the remainder scaling factor first. The remaining multiplications
// are either 2^512 or 2^-512.
// Down-scaling to sub-normal will use the final multiplication into a sub-normal result.
// Note here that n >= 1 as the n in [-1022, 1023] case has been handled.
// Handle n : 1, 2, 3, 4, 5
if (n >= 5) {
// n >= 5 will be over/underflow. Use an extreme scale factor.
// Do not use +/- infinity as this creates NaN if x = 0.
// p -> 2^1023 or 2^-1025
p *= p * 0.5;
r.hi = x * p * p * p;
r.lo = xx * p * p * p;
return r;
}
final double s = twoPow(m);
if (n == 4) {
r.hi = x * s * p * p * p * p;
r.lo = xx * s * p * p * p * p;
} else if (n == 3) {
r.hi = x * s * p * p * p;
r.lo = xx * s * p * p * p;
} else if (n == 2) {
r.hi = x * s * p * p;
r.lo = xx * s * p * p;
} else {
// n = 1. Occurs only if exp = -1023.
r.hi = x * s * p;
r.lo = xx * s * p;
}
return r;
}
/**
* Create a double with the value {@code 2^n}.
*
* <p>Warning: Do not call with {@code n = -1023}. This will create zero.
*
* @param n Exponent (in the range [-1022, 1023]).
* @return the double
*/
private static double twoPow(int n) {
return Double.longBitsToDouble(((long) (n + 1023)) << 52);
}
/**
* Convert floating-point number {@code x} to fractional {@code f} and integral
* {@code 2^exp} components.
* <pre>
* x = f * 2^exp
* </pre>
*
* <p>The combined fractional part (f, ff) is in the range {@code [0.5, 1)}.
*
* <p>Special cases:
* <ul>
* <li>If {@code x} is zero, then the normalized fraction is zero and the exponent is zero.
* <li>If {@code x} is NaN, then the normalized fraction is NaN and the exponent is unspecified.
* <li>If {@code x} is infinite, then the normalized fraction is infinite and the exponent is unspecified.
* <li>If high-part {@code x} is an exact power of 2 and the low-part {@code xx} has an opposite
* signed non-zero magnitude then fraction high-part {@code f} will be {@code +/-1} such that
* the double-double number is in the range {@code [0.5, 1)}.
* </ul>
*
* <p>This is named using the equivalent function in the standard C math.h library.
*
* <p>Note: This method returns the exponent to avoid using an {@code int[] exp} argument
* to save the result.
*
* @param x High part of x.
* @param xx Low part of x.
* @param f Fraction part.
* @return Power of two scale factor (integral exponent).
* @see Math#getExponent(double)
* @see #ldexp(double, double, int, DD)
* @see <a href="https://www.cplusplus.com/reference/cmath/frexp/">C math.h frexp</a>
*/
static int frexp(double x, double xx, DD f) {
int exp = getScale(x);
// Handle non-scalable numbers
if (exp == Double.MAX_EXPONENT + 1) {
// Returns +/-0.0, inf or nan
f.hi = x;
// Maintain the fractional part unchanged.
// Do not change the fractional part of inf/nan, and assume
// |xx| < |x| thus if x == 0 then xx == 0 (otherwise the double-double is invalid)
f.lo = xx;
// Unspecified for NaN/inf so just return zero
return 0;
}
// The scale will create the fraction in [1, 2) so increase by 1 for [0.5, 1)
exp += 1;
ldexp(x, xx, -exp, f);
// Return |(hi, lo)| = (1, -eps) if required.
// f.hi * f.lo < 0 detects sign change unless the product underflows.
// Handle extreme case of |f.lo| being min value by doubling f.hi to 1.
if (Math.abs(f.hi) == HALF && 2 * f.hi * f.lo < 0) {
f.hi *= 2;
f.lo *= 2;
exp -= 1;
}
return exp;
}
/**
* Returns a scale suitable for use with {@link Math#scalb(double, int)} to normalise
* the number to the interval {@code [1, 2)}.
*
* <p>In contrast to {@link Math#getExponent(double)} this handles
* sub-normal numbers by computing the number of leading zeros in the mantissa
* and shifting the unbiased exponent. The result is that for all finite, non-zero,
* numbers, the magnitude of {@code scalb(x, -getScale(x))} is
* always in the range {@code [1, 2)}.
*
* <p>This method is a functional equivalent of the c function ilogb(double).
*
* <p>The result is to be used to scale a number using {@link Math#scalb(double, int)}.
* Hence the special case of a zero argument is handled using the return value for NaN
* as zero cannot be scaled. This is different from {@link Math#getExponent(double)}.
*
* <p>Special cases:
* <ul>
* <li>If the argument is NaN or infinite, then the result is {@link Double#MAX_EXPONENT} + 1.
* <li>If the argument is zero, then the result is {@link Double#MAX_EXPONENT} + 1.
* </ul>
*
* @param a Value.
* @return The unbiased exponent of the value to be used for scaling, or 1024 for 0, NaN or Inf
* @see Math#getExponent(double)
* @see Math#scalb(double, int)
* @see <a href="https://www.cplusplus.com/reference/cmath/ilogb/">ilogb</a>
*/
private static int getScale(double a) {
// Only interested in the exponent and mantissa so remove the sign bit
final long bits = Double.doubleToRawLongBits(a) & UNSIGN_MASK;
// Get the unbiased exponent
int exp = ((int) (bits >>> 52)) - EXPONENT_OFFSET;
// No case to distinguish nan/inf (exp == 1024).
// Handle sub-normal numbers
if (exp == Double.MIN_EXPONENT - 1) {
// Special case for zero, return as nan/inf to indicate scaling is not possible
if (bits == 0) {
return Double.MAX_EXPONENT + 1;
}
// A sub-normal number has an exponent below -1022. The amount below
// is defined by the number of shifts of the most significant bit in
// the mantissa that is required to get a 1 at position 53 (i.e. as
// if it were a normal number with assumed leading bit)
final long mantissa = bits & MANTISSA_MASK;
exp -= Long.numberOfLeadingZeros(mantissa << 12);
}
return exp;
}
/**
* Compute the number {@code x} raised to the power {@code n}.
*
* <p>The value is returned as fractional {@code f} and integral
* {@code 2^exp} components.
* <pre>
* (x+xx)^n = (f+ff) * 2^exp
* </pre>
*
* <p>The combined fractional part (f, ff) is in the range {@code [0.5, 1)}.
*
* <p>Special cases:
* <ul>
* <li>If {@code (x, xx)} is zero the high part of the fractional part is
* computed using {@link Math#pow(double, double) Math.pow(x, n)} and the exponent is 0.
* <li>If {@code n = 0} the fractional part is 0.5 and the exponent is 1.
* <li>If {@code (x, xx)} is an exact power of 2 the fractional part is 0.5 and the exponent
* is the power of 2 minus 1.
* <li>If the result high-part is an exact power of 2 and the low-part has an opposite
* signed non-zero magnitude then the fraction high-part {@code f} will be {@code +/-1} such that
* the double-double number is in the range {@code [0.5, 1)}.
* <p>If the argument is not finite then a fractional representation is not possible.
* In this case the fraction and the scale factor is undefined.
* </ul>
*
* <p>Note: This method returns the exponent to avoid using an {@code long[] exp} argument
* to save the result.
*
* <p>The computed result is approximately {@code 16 * (n - 1) * eps} of the exact result
* where eps is {@code 2^-106}.
*
* @param x High part of x.
* @param xx Low part of x.
* @param n Power.
* @param f Fraction part.
* @return Power of two scale factor (integral exponent).
* @see #frexp(double, double, DD)
*/
static long fastPowScaled(double x, double xx, int n, DD f) {
// Edge cases.
if (n == 0) {
f.set(0.5, 0);
return 1;
}
// IEEE result for non-finite or zero
if (!Double.isFinite(x) || x == 0) {
f.set(Math.pow(x, n), 0);
return 0;
}
// Here the number is non-zero finite
assert x == x + xx : NOT_NOMALIZED;
long b = frexp(x, xx, f);
// Handle exact powers of 2
if (Math.abs(f.hi) == HALF && f.lo == 0) {
// (f * 2^b)^n = (2f)^n * 2^(b-1)^n
// Use Math.pow to create the sign.
// Note the result must be scaled to the fractional representation
// by multiplication by 0.5 and addition of 1 to the exponent.
f.hi = 0.5 * Math.pow(2 * f.hi, n);
// Propagate sign change (x*f.hi) to the zero
f.lo = Math.copySign(0.0, x * f.hi * xx);
return 1 + (b - 1) * n;
}
if (n < 0) {
b = computeFastPowScaled(b, f.hi, f.lo, -n, f);
// Result is a non-zero fraction part so inversion is safe
inverse(f.hi, f.lo, f);
// Rescale to [0.5, 1.0]
return -b + frexp(f.hi, f.lo, f);
}
return computeFastPowScaled(b, f.hi, f.lo, n, f);
}
/**
* Compute the number {@code x} (non-zero finite) raised to the power {@code n}.
*
* <p>The input power is treated as an unsigned integer. Thus the negative value
* {@link Integer#MIN_VALUE} is 2^31.
*
* @param exp Integral component 2^exp of x.
* @param x Fractional high part of x.
* @param xx Fractional low part of x.
* @param n Power (in [2, 2^31]).
* @param f Fraction part.
* @return Power of two scale factor (integral exponent).
*/
private static long computeFastPowScaled(long exp, double x, double xx, int n, DD f) {
// Compute the power by multiplication (keeping track of the scale):
// 13 = 1101
// x^13 = x^8 * x^4 * x^1
// = ((x^2 * x)^2)^2 * x
// 21 = 10101
// x^21 = x^16 * x^4 * x^1
// = (((x^2)^2 * x)^2)^2 * x
// 1. Find highest set bit in n (assume n != 0)
// 2. Initialise result as x
// 3. For remaining bits (0 or 1) below the highest set bit:
// - square the current result
// - if the current bit is 1 then multiply by x
// In this scheme the factors to multiply by x can be pre-computed.
// Scale the input in [0.5, 1) to be above 1. Represented as 2^be * b.
final long be = exp - 1;
final double b0 = x * 2;
final double b1 = xx * 2;
// Split b
final double b0h = highPartUnscaled(b0);
final double b0l = b0 - b0h;
// Initialise the result as x^1. Represented as 2^fe * f.
long fe = be;
double f0 = b0;
double f1 = b1;
double u;
double v;
double w;
// Shift the highest set bit off the top.
// Any remaining bits are detected in the sign bit.
final int shift = Integer.numberOfLeadingZeros(n) + 1;
int bits = n << shift;
// Multiplication is done without using DD.multiply as the arguments
// are always finite and the product will not overflow. The square can be optimised.
// Process remaining bits below highest set bit.
for (int i = 32 - shift; i != 0; i--, bits <<= 1) {
// Square the result
// Inline multiply(f0, f1, f0, f1, f), adapted for squaring
fe <<= 1;
u = f0 * f0;
v = squareLowUnscaled(f0, u);
// Inline fastTwoSum(hi, lo + (2 * f0 * f1), f)
w = v + (2 * f0 * f1);
f0 = u + w;
f1 = fastTwoSumLow(u, w, f0);
// Rescale
if (Math.abs(f0) > SAFE_MULTIPLY) {
// Scale back to the [1, 2) range. As safe multiply is 2^500
// the exponent should be < 1001 so the twoPow scaling factor is supported.
final int e = Math.getExponent(f0);
final double s = twoPow(-e);
fe += e;
f0 *= s;
f1 *= s;
}
if (bits < 0) {
// Multiply by b
fe += be;
// Inline multiply(f0, f1, b0, b1, f)
u = highPartUnscaled(f0);
v = f0 - u;
w = f0 * b0;
v = productLowUnscaled(u, v, b0h, b0l, w);
// Inline fastTwoSum(w, v + (f0 * b1 + f1 * b0), f)
u = v + (f0 * b1 + f1 * b0);
f0 = w + u;
f1 = fastTwoSumLow(w, u, f0);
// Avoid rescale as x2 is in [1, 2)
}
}
return fe + frexp(f0, f1, f);
}
/**
* Compute the number {@code x} raised to the power {@code n}.
*
* <p>The value is returned as fractional {@code f} and integral
* {@code 2^exp} components.
* <pre>
* (x+xx)^n = (f+ff) * 2^exp
* </pre>
*
* <p>The combined fractional part (f, ff) is in the range {@code [0.5, 1)}.
*
* <p>Special cases:
* <ul>
* <li>If {@code (x, xx)} is zero the high part of the fractional part is
* computed using {@link Math#pow(double, double) Math.pow(x, n)} and the exponent is 0.
* <li>If {@code n = 0} the fractional part is 0.5 and the exponent is 1.
* <li>If {@code (x, xx)} is an exact power of 2 the fractional part is 0.5 and the exponent
* is the power of 2 minus 1.
* <li>If the result high-part is an exact power of 2 and the low-part has an opposite
* signed non-zero magnitude then the fraction high-part {@code f} will be {@code +/-1} such that
* the double-double number is in the range {@code [0.5, 1)}.
* <p>If the argument is not finite then a fractional representation is not possible.
* In this case the fraction and the scale factor is undefined.
* </ul>
*
* <p>Note: This method returns the exponent to avoid using an {@code long[] exp} argument
* to save the result.
*
* <p>The computed result is within 1 ULP of the exact result where ULP is {@code 2^-106}.
*
* @param x High part of x.
* @param xx Low part of x.
* @param n Power.
* @param f Fraction part.
* @return Power of two scale factor (integral exponent).
* @see #frexp(double, double, DD)
*/
static long powScaled(double x, double xx, int n, DD f) {
// Edge cases.
if (n == 0) {
f.set(0.5, 0);
return 1;
}
// IEEE result for non-finite or zero
if (!Double.isFinite(x) || x == 0) {
f.set(Math.pow(x, n), 0);
return 0;
}
// Here the number is non-zero finite
assert x == x + xx : NOT_NOMALIZED;
final long b = frexp(x, xx, f);
// Handle exact powers of 2
if (Math.abs(f.hi) == HALF && f.lo == 0) {
// (f * 2^b)^n = (2f)^n * 2^(b-1)^n
// Use Math.pow to create the sign.
// Note the result must be scaled to the fractional representation
// by multiplication by 0.5 and addition of 1 to the exponent.
f.hi = 0.5 * Math.pow(2 * f.hi, n);
// Propagate sign change (x*f.hi) to the zero
f.lo = Math.copySign(0.0, x * f.hi * xx);
return 1 + (b - 1) * n;
}
return computePowScaled(b, f.hi, f.lo, n, f);
}
/**
* Compute the number {@code x} (non-zero finite) raised to the power {@code n}.
*
* <p>Performs the computation in triple-length precision. If the input power is
* negative the result is computed using the absolute value of {@code n} and then
* inverted by dividing into 1.
*
* @param exp Integral component 2^exp of x.
* @param x Fractional high part of x.
* @param xx Fractional low part of x.
* @param n Power (in [2, 2^31]).
* @param f Fraction part.
* @return Power of two scale factor (integral exponent).
*/
private static long computePowScaled(long exp, double x, double xx, int n, DD f) {
// Same as computePowScaled using a triple-double intermediate.
// triple-double multiplication:
// (a0, a1, a2) * (b0, b1, b2)
// a x b ~ a0b0 O(1) term
// + a0b1 + a1b0 O(eps) terms
// + a0b2 + a1b1 + a2b0 O(eps^2) terms
// + a1b2 + a2b1 O(eps^3) terms
// + a2b2 O(eps^4) term (not required for the first 159 bits)
// Higher terms require two-prod if the round-off is <= O(eps^3).
// (pij,qij) = two-prod(ai, bj); pij = O(eps^i+j); qij = O(eps^i+j+1)
// p00 O(1)
// p01, p10, q00 O(eps)
// p02, p11, p20, q01, q10 O(eps^2)
// p12, p21, q02, q11, q20 O(eps^3)
// Sum terms of the same order. Carry round-off to lower order:
// s0 = p00 Order(1)
// Sum (p01, p10, q00) -> (s1, r2, r3a) Order(eps)
// Sum (p02, p11, p20, q01, q10, r2) -> (s2, r3b) Order(eps^2)
// Sum (p12, p21, q02, q11, q20, r3a, r3b) -> s3 Order(eps^3)
//
// Simplifies for (b0, b1):
// Sum (p01, p10, q00) -> (s1, r2, r3a) Order(eps)
// Sum (p11, p20, q01, q10, r2) -> (s2, r3b) Order(eps^2)
// Sum (p21, q11, q20, r3a, r3b) -> s3 Order(eps^3)
//
// Simplifies for the square:
// Sum (2 * p01, q00) -> (s1, r2) Order(eps)
// Sum (2 * p02, 2 * q01, p11, r2) -> (s2, r3b) Order(eps^2)
// Sum (2 * p12, 2 * q02, q11, r3b) -> s3 Order(eps^3)
// Scale the input in [0.5, 1) to be above 1. Represented as 2^be * b.
final long be = exp - 1;
final double b0 = x * 2;
final double b1 = xx * 2;
// Split b
final double b0h = highPartUnscaled(b0);
final double b0l = b0 - b0h;
final double b1h = highPartUnscaled(b1);
final double b1l = b1 - b1h;
// Initialise the result as x^1. Represented as 2^fe * f.
long fe = be;
double f0 = b0;
double f1 = b1;
double f2 = 0;
// Shift the highest set bit off the top.
// Any remaining bits are detected in the sign bit.
final int an = Math.abs(n);
final int shift = Integer.numberOfLeadingZeros(an) + 1;
int bits = an << shift;
// Multiplication is done inline with some triple precision helper routines.
// Process remaining bits below highest set bit.
for (int i = 32 - shift; i != 0; i--, bits <<= 1) {
// Square the result
fe <<= 1;
double a0h = highPartUnscaled(f0);
double a0l = f0 - a0h;
double a1h = highPartUnscaled(f1);
double a1l = f1 - a1h;
double a2h = highPartUnscaled(f2);
double a2l = f2 - a2h;
double p00 = f0 * f0;
double q00 = squareLowUnscaled(a0h, a0l, p00);
double p01 = f0 * f1;
double q01 = productLowUnscaled(a0h, a0l, a1h, a1l, p01);
final double p02 = f0 * f2;
final double q02 = productLowUnscaled(a0h, a0l, a2h, a2l, p02);
double p11 = f1 * f1;
double q11 = squareLowUnscaled(a1h, a1l, p11);
final double p12 = f1 * f2;
double s0 = p00;
// Sum (2 * p01, q00) -> (s1, r2) Order(eps)
double s1 = 2 * p01 + q00;
double r2 = twoSumLow(2 * p01, q00, s1);
// Sum (2 * p02, 2 * q01, p11, r2) -> (s2, r3b) Order(eps^2)
double s2 = p02 + q01;
double r3b = twoSumLow(p02, q01, s2);
double u = p11 + r2;
double v = twoSumLow(p11, r2, u);
fastAdd(2 * s2, 2 * r3b, u, v, f);
s2 = f.hi;
r3b = f.lo;
// Sum (2 * p12, 2 * q02, q11, r3b) -> s3 Order(eps^3)
double s3 = 2 * (p12 + q02) + q11 + r3b;
f0 = norm3(s0, s1, s2, s3, f);
f1 = f.hi;
f2 = f.lo;
// Rescale
if (Math.abs(f0) > SAFE_MULTIPLY) {
// Scale back to the [1, 2) range. As safe multiply is 2^500
// the exponent should be < 1001 so the twoPow scaling factor is supported.
final int e = Math.getExponent(f0);
final double s = twoPow(-e);
fe += e;
f0 *= s;
f1 *= s;
f2 *= s;
}
if (bits < 0) {
// Multiply by b
fe += be;
a0h = highPartUnscaled(f0);
a0l = f0 - a0h;
a1h = highPartUnscaled(f1);
a1l = f1 - a1h;
a2h = highPartUnscaled(f2);
a2l = f2 - a2h;
p00 = f0 * b0;
q00 = productLowUnscaled(a0h, a0l, b0h, b0l, p00);
p01 = f0 * b1;
q01 = productLowUnscaled(a0h, a0l, b1h, b1l, p01);
final double p10 = f1 * b0;
final double q10 = productLowUnscaled(a1h, a1l, b0h, b0l, p10);
p11 = f1 * b1;
q11 = productLowUnscaled(a1h, a1l, b1h, b1l, p11);
final double p20 = f2 * b0;
final double q20 = productLowUnscaled(a2h, a2l, b0h, b0l, p20);
final double p21 = f2 * b1;
s0 = p00;
// Sum (p01, p10, q00) -> (s1, r2, r3a) Order(eps)
u = p01 + p10;
v = twoSumLow(p01, p10, u);
s1 = q00 + u;
final double w = twoSumLow(q00, u, s1);
r2 = v + w;
final double r3a = twoSumLow(v, w, r2);
// Sum (p11, p20, q01, q10, r2) -> (s2, r3b) Order(eps^2)
s2 = p11 + p20;
r3b = twoSumLow(p11, p20, s2);
u = q01 + q10;
v = twoSumLow(q01, q10, u);
fastAdd(s2, r3b, u, v, f);
s2 = f.hi + r2;
r3b = twoSumLow(f.hi, r2, s2);
// Sum (p21, q11, q20, r3a, r3b) -> s3 Order(eps^3)
s3 = p21 + q11 + q20 + r3a + r3b;
f0 = norm3(s0, s1, s2, s3, f);
f1 = f.hi;
f2 = f.lo;
// Avoid rescale as x2 is in [1, 2)
}
}
// Ensure (f0, f1) are 1 ulp exact
final double u = f1 + f2;
DD.fastTwoSum(f0, u, f);
// If the power is negative, invert in triple precision
if (n < 0) {
// Require the round-off
final double v = fastTwoSumLow(f1, f2, u);
// Result is in approximately [1, 2^501] so inversion is safe.
inverse3(f.hi, f.lo, v, f);
// Rescale to [0.5, 1.0]
return -fe + frexp(f.hi, f.lo, f);
}
return fe + frexp(f.hi, f.lo, f);
}
/**
* Normalize (s0, s1, s2, s3) to (s0, s1, s2).
*
* @param s0 High part of s.
* @param s1 Second part of s.
* @param s2 Third part of s.
* @param s3 Fourth part of s.
* @param s12 Output parts (s1, s2)
* @return s0
*/
private static double norm3(double s0, double s1, double s2, double s3, DD s12) {
double q;
// Compress (Schewchuk Fig. 15) (s0, s1, s2, s3) -> (g0, g1, g2, g3)
final double g0 = s0 + s1;
q = fastTwoSumLow(s0, s1, g0);
final double g1 = q + s2;
q = fastTwoSumLow(q, s2, g1);
final double g2 = q + s3;
final double g3 = fastTwoSumLow(q, s3, g2);
// (g0, g1, g2, g3) -> (h0, h1, h2, h3), returned as (h0, h1, h2 + h3)
q = g1 + g2;
s12.lo = fastTwoSumLow(g1, g2, q) + g3;
final double h0 = g0 + q;
s12.hi = fastTwoSumLow(g0, q, h0);
return h0;
}
/**
* Compute the inverse of {@code (y, yy, yyy)}.
* If {@code y = 0} the result is undefined.
*
* <p>This is special routine used in {@link #powScaled(double, double, int, DD)}
* to invert the triple precision result.
*
* @param y First part of y.
* @param yy Second part of y.
* @param yyy Third part of y.
* @param q Quotient 1 / y (result).
* @return the inverse
*/
private static DD inverse3(double y, double yy, double yyy, DD q) {
// Long division (1, 0, 0) / (y, yy, yyy)
double r;
double rr;
double rrr;
double t;
// quotient q0 = x / y
final double q0 = 1 / y;
// remainder r0 = x - q0 * y
t = multiply3(y, yy, yyy, q0, q);
r = add3(-t, -q.hi, -q.lo, 1, q);
rr = q.hi;
rrr = q.lo;
// next quotient q1 = r0 / y
final double q1 = r / y;
// remainder r1 = r0 - q1 * y
t = multiply3(y, yy, yyy, q1, q);
r = add3(-t, -q.hi, -q.lo, r, rr, rrr, q);
rr = q.hi;
rrr = q.lo;
// next quotient q2 = r1 / y
final double q2 = r / y;
// remainder r2 = r1 - q2 * y
t = multiply3(y, yy, yyy, q2, q);
r = add3(-t, -q.hi, -q.lo, r, rr, rrr, q);
// next quotient q3 = r2 / y
final double q3 = r / y;
// Collect (q0, q1, q2, q3) to (s0, s1, s2)
t = norm3(q0, q1, q2, q3, q);
// Reduce to (s0, s1)
return fastTwoSum(t, q.hi + q.lo, q);
}
/**
* Compute the multiplication product of {@code (a0,a1,a2)} and {@code b}.
*
* @param a0 High part of a.
* @param a1 Second part of a.
* @param a2 Third part of a.
* @param b Factor.
* @param s12 Output parts (s1, s2)
* @return s0
*/
private static double multiply3(double a0, double a1, double a2, double b, DD s12) {
// Triple-Double x Double
// a x b ~ a0b O(1) term
// + a1b O(eps) terms
// + a2b O(eps^2) terms
// Higher terms require two-prod if the round-off is <= O(eps^2).
// (pij,qij) = two-prod(ai, bj); pij = O(eps^i+j); qij = O(eps^i+j+1)
// p00 O(1)
// p10, q00 O(eps)
// p20, q10 O(eps^2)
// |a2| < |eps^2 a0| => |a2 * b| < eps^2 |a0 * b| and q20 < eps^3 |a0 * b|
//
// Sum terms of the same order. Carry round-off to lower order:
// s0 = p00 Order(1)
// Sum (p10, q00) -> (s1, r1) Order(eps)
// Sum (p20, q10, r1) -> (s2, s3) Order(eps^2)
final double a0h = highPartUnscaled(a0);
final double a0l = a0 - a0h;
final double a1h = highPartUnscaled(a1);
final double a1l = a1 - a1h;
final double b0h = highPartUnscaled(b);
final double b0l = b - b0h;
final double p00 = a0 * b;
final double q00 = productLowUnscaled(a0h, a0l, b0h, b0l, p00);
final double p10 = a1 * b;
final double q10 = productLowUnscaled(a1h, a1l, b0h, b0l, p10);
final double p20 = a2 * b;
// Sum (p10, q00) -> (s1, r1) Order(eps)
final double s1 = p10 + q00;
final double r1 = twoSumLow(p10, q00, s1);
// Sum (p20, q10, r1) -> (s2, s3) Order(eps^2)
double u = p20 + q10;
final double v = twoSumLow(p20, q10, u);
final double s2 = u + r1;
u = twoSumLow(u, r1, s2);
return norm3(p00, s1, s2, v + u, s12);
}
/**
* Compute the sum of {@code (a0,a1,a2)} and {@code b}.
*
* @param a0 High part of a.
* @param a1 Second part of a.
* @param a2 Third part of a.
* @param b Addend.
* @param s12 Output parts (s1, s2)
* @return s0
*/
private static double add3(double a0, double a1, double a2, double b, DD s12) {
// Hide et al (2008) Fig.5: Quad-Double + Double without final a3.
double u;
double v;
final double s0 = a0 + b;
u = twoSumLow(a0, b, s0);
final double s1 = a1 + u;
v = twoSumLow(a1, u, s1);
final double s2 = a2 + v;
u = twoSumLow(a2, v, s2);
return norm3(s0, s1, s2, u, s12);
}
/**
* Compute the sum of {@code (a0,a1,a2)} and {@code (b0,b1,b2))}.
* It is assumed the absolute magnitudes of a and b are equal and the sign
* of a and b are opposite.
*
* @param a0 High part of a.
* @param a1 Second part of a.
* @param a2 Third part of a.
* @param b0 High part of b.
* @param b1 Second part of b.
* @param b2 Third part of b.
* @param s12 Output parts (s1, s2)
* @return s0
*/
private static double add3(double a0, double a1, double a2, double b0, double b1, double b2, DD s12) {
// Hide et al (2008) Fig.6: Quad-Double + Quad-Double without final a3, b3.
double u;
double v;
// a0 + b0 -> (s0, r1)
final double s0 = a0 + b0;
final double r1 = twoSumLow(a0, b0, s0);
// a1 + b1 + r1 -> (s1, r2, r3)
u = a1 + b1;
v = twoSumLow(a1, b1, u);
final double s1 = r1 + u;
u = twoSumLow(r1, u, s1);
final double r2 = v + u;
final double r3 = twoSumLow(v, u, r2);
// (a2 + b2 + r2) + r3 -> (s2, s3)
u = a2 + b2;
v = twoSumLow(a2, b2, u);
final double s2 = r2 + u;
u = twoSumLow(r2, u, s2);
final double s3 = v + u + r3;
return norm3(s0, s1, s2, s3, s12);
}
}