blob: b4d5632f8947ea5dc32091063de6c9675a41dfe1 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* @author Intel, Pavel A. Ozhdikhin
*
*/
#ifndef _OPTARITHMETIC_H
#define _OPTARITHMETIC_H
namespace Jitrino {
/*
* Implemented algorithms are similar to the ones described in
* [T.Granlund and P.L.Montgomery. Division by Invariant Integers using
* Multiplication. PLDI, 1994]
*/
template <typename inttype, int width> inline int popcount(inttype x);
template <>
inline
int popcount<I_32, 32> (I_32 x)
{
#ifdef _USE_ITANIUM_INTRINSICS_
U_32 y = x; // avoid sign extension
__m64 z = _m_from_int(int(y));
__m64 r = __m64_popcnt(z);
return _m_to_int(r);
#else
x = x - ((x >>1) & 0x55555555);
x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
x = (x + (x >> 4)) & 0x0f0f0f0f;
x = x + (x >> 8);
x = x + (x >> 16);
return x & 0x0000003F;
#endif
}
template <>
inline
int popcount<int64, 64> (int64 x)
{
#ifdef _USE_ITANIUM_INTRINSICS_
__m64 y = _m_fron_int(int(x));
__m64 r = __m64_popcnt(y);
return _m_to_int(r);
#else
int64 tmp5 = 0x55555555;
tmp5 = tmp5 | (tmp5 << 32);
x = x - ((x >>1) & tmp5);
int64 tmp3 = 0x33333333;
tmp3 = tmp3 | (tmp3 << 32);
x = (x & tmp3) + ((x >> 2) & tmp3);
int64 tmp0f = 0x0f0f0f0f;
tmp0f = tmp0f | (tmp0f << 32);
x = (x + (x >> 4)) & tmp0f;
x = x + (x >> 8);
x = x + (x >> 16);
x = x + (x >> 32);
return int(x & 0x0000007F);
#endif
}
template <typename inttype>
bool isPowerOf2(inttype d) {
if (d < 0) {
d = -d;
}
// turn trailing 0s into 1s, others become 0
inttype justrightzeros = (~d) & (d - 1);
inttype bitandleftbits = ~justrightzeros;
// rightmost 1-bit and trailing 0s:
inttype bitandrightzeros = d ^ (d - 1);
inttype justrightbit = bitandrightzeros & bitandleftbits;
if (d == justrightbit) {
return true;
} else {
return false;
}
}
template <typename inttype, typename uinttype,
int width>
void
getMagic(inttype d, inttype *magicNum, inttype *shiftBy)
{
// d must not be -1, 0, 1
assert((d < -1) || (d > 1));
const uinttype hiBitSet = (uinttype)1 << (width-1);
uinttype ad = abs(d); // ad = |d|
uinttype t = hiBitSet + (((uinttype)d) >> (width-1));
uinttype anc = t - 1 - (t%ad); // anc = |nc|
// initial value, we try values from [width-1, 2*width]
uinttype p = width-1; // power of 2 to divide by
// these are maintained incrementally in loop as p is changed
uinttype q1 = hiBitSet / anc; // q1 = 2**p/|nc|
uinttype r1 = hiBitSet - q1*anc; // r1 = rem(2**p, |nc|)
uinttype q2 = hiBitSet / ad; // q2 = 2**p /|d|
uinttype r2 = hiBitSet - q2*ad; // r2 = rem(2**p, |d|)
uinttype delta = 0;
do {
p = p + 1;
// increment q1, r1, q2, r2
q1 = q1 << 1;
r1 = r1 << 1;
q2 = q2 << 1;
r2 = r2 << 1;
// r1 overflows into q1
if (r1 >= anc) {
q1 = q1 + 1;
r1 = r1 - anc;
}
// r2 overflows into q2
if (r2 >= ad) {
q2 = q2 + 1;
r2 = r2 - ad;
}
delta = ad - r2;
} while ((q1 < delta) || ((q1 == delta) && (r1 == 0)));
inttype mag1 = q2 + 1;
*magicNum = (d < 0) ? -mag1 : mag1;
*shiftBy = p - width;
}
// isolate leftmost 1
template <typename inttype>
inline
inttype leftmost1(inttype x);
template <>
inline
I_32 leftmost1<I_32>(I_32 x)
{
if (x == 1) return 1;
if (x == 0) return 0;
x = x | (x >> 1);
x = x | (x >> 2);
x = x | (x >> 4);
x = x | (x >> 8);
x = x | (x >> 16);
x = x >> 1;
x = x + 1;
return x;
}
template <>
inline
int64 leftmost1<int64>(int64 x)
{
if (x == 1) return 1;
if (x == 0) return 0;
x = x | (x >> 1);
x = x | (x >> 2);
x = x | (x >> 4);
x = x | (x >> 8);
x = x | (x >> 16);
x = x | (x >> 32);
x = x >> 1;
x = x + 1;
return x;
}
// number of leading (leftmost) zeros
template <typename inttype, int width>
inline
int nlz(inttype x);
template <>
inline
int nlz<I_32, 32>(I_32 x)
{
x = x | (x >> 1);
x = x | (x >> 2);
x = x | (x >> 4);
x = x | (x >> 8);
x = x | (x >> 16);
return popcount<I_32, 32>(~x);
}
template <>
inline
int nlz<int64, 64>(int64 x)
{
x = x | (x >> 1);
x = x | (x >> 2);
x = x | (x >> 4);
x = x | (x >> 8);
x = x | (x >> 16);
x = x | (x >> 32);
return popcount<int64, 64>(~x);
}
// number of trailing (rightmost) zeros
template <typename inttype, int width>
inline
int ntz(inttype x)
{
inttype trailingzeromask = (~x) & (x - 1);
return popcount<inttype, width>(trailingzeromask);
}
template <typename inttype, int width>
inline
int whichPowerOf2(inttype d) {
bool isNegative = ((d < 0) && (d != -d));
if (isNegative) {
d = -d;
}
int i = ntz<inttype, width>(d);
if (isNegative)
return -i;
else
return i;
}
template <typename inttype>
inline
int shifttomakeOdd(inttype d, inttype &newd)
{
assert(d != 0);
int i = 0;
while ((d & 1) == 0) {
++i;
d = d >> 1;
}
newd = d;
return i;
}
template <typename uinttype>
inline uinttype isqrt_iter(uinttype n, uinttype approx)
{
uinttype q = n / approx;
uinttype newapprox = (approx + q) / 2;
while (q != newapprox) {
if (newapprox == approx) return approx;
approx = newapprox;
q = n / approx;
newapprox = (approx + q) / 2;
}
return newapprox;
}
template <typename uinttype> inline uinttype isqrt(uinttype n);
template <> inline uint64 isqrt(uint64 n) { return isqrt_iter(n, (uint64) 65536); }
template <> inline U_32 isqrt(U_32 n) { return isqrt_iter(n, (U_32) 256); }
template <> inline uint16 isqrt(uint16 n) { return isqrt_iter(n, (uint16) 16); }
template <> inline U_8 isqrt(U_8 n) { return isqrt_iter(n, (U_8) 4); }
} //namespace Jitrino
#endif // _OPTARITHMETIC_H