/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
/* origin: boost/simd/arch/common/scalar/function/rem_pio2.hpp */
/*
 * ====================================================
 * copyright 2016 NumScale SAS
 *
 * Distributed under the Boost Software License, Version 1.0.
 * (See copy at http://boost.org/LICENSE_1_0.txt)
 * ====================================================
 */

/*
 * ONCE0 expands to a `while (0)` loop condition.
 * Presumably it is meant to terminate `do { ... } ONCE0` statement macros
 * (no use is visible in this part of the file).
 *
 * With MSVC the condition is wrapped in pragmas that disable warning 4127
 * ("conditional expression is constant") just around the `while (0)` and
 * restore the previous warning state afterwards.
 */
#if defined(_MSC_VER)
#define ONCE0                                   \
    __pragma(warning(push))                     \
    __pragma(warning(disable : 4127)) while (0) \
    __pragma(warning(pop)) /**/
#else
#define ONCE0 while (0)
#endif
/* * ==================================================== * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. * * Developed at SunPro, a Sun Microsystems, Inc. business. * Permission to use, copy, modify, and distribute this * software is freely granted, provided that this notice * is preserved. * ====================================================
*/
/*
 * Byte-order detection: defines XSIMD_LITTLE_ENDIAN when the target is known
 * to be little endian. The macro is left undefined otherwise.
 */
#if defined(__GNUC__) && defined(__BYTE_ORDER__)
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define XSIMD_LITTLE_ENDIAN
#endif
#elif defined(_WIN32)
// We can safely assume that Windows is always little endian
#define XSIMD_LITTLE_ENDIAN
#elif defined(i386) || defined(i486) || defined(intel) || defined(x86) || defined(i86pc) || defined(__alpha) || defined(__osf__)
#define XSIMD_LITTLE_ENDIAN
#endif
/*
 * __kernel_rem_pio2(x,y,e0,nx,prec,ipio2)
 * double x[],y[]; int e0,nx,prec; int ipio2[];
 *
 * __kernel_rem_pio2 returns the last three binary digits of N with
 *      y = x - N*pi/2
 * so that |y| < pi/2.
 *
 * The method is to compute the integer (mod 8) and fraction parts of
 * (2/pi)*x without doing the full multiplication. In general we
 * skip the part of the product that is known to be a huge integer
 * (more accurately, = 0 mod 8). Thus the number of operations is
 * independent of the exponent of the input.
 *
 * (2/pi) is represented by an array of 24-bit integers in ipio2[].
 *
 * Input parameters:
 *      x[]     The input value (must be positive) is broken into nx
 *              pieces of 24-bit integers in double precision format.
 *              x[i] will be the i-th 24 bit of x. The scaled exponent
 *              of x[0] is given in input parameter e0 (i.e., x[0]*2^e0
 *              matches x up to 24 bits).
 *
 *              Example of breaking a double positive z into x[0]+x[1]+x[2]:
 *                      e0 = ilogb(z)-23
 *                      z  = scalbn(z,-e0)
 *              for i = 0,1,2
 *                      x[i] = floor(z)
 *                      z    = (z-x[i])*2**24
 *
 *
 *      y[]     output result in an array of double precision numbers.
 *              The dimension of y[] is:
 *                      24-bit  precision       1
 *                      53-bit  precision       2
 *                      64-bit  precision       2
 *                      113-bit precision       3
 *              The actual value is the sum of them. Thus for 113-bit
 *              precision, one may have to do something like:
 *
 *              long double t,w,r_head, r_tail;
 *              t = (long double)y[2] + (long double)y[1];
 *              w = (long double)y[0];
 *              r_head = t+w;
 *              r_tail = w - (r_head - t);
 *
 *      e0      The exponent of x[0]
 *
 *      nx      dimension of x[]
 *
 *      prec    an integer indicating the precision:
 *                      0       24  bits (single)
 *                      1       53  bits (double)
 *                      2       64  bits (extended)
 *                      3       113 bits (quad)
 *
 *      ipio2[]
 *              integer array, contains the (24*i)-th to (24*i+23)-th
 *              bit of 2/pi after binary point. The corresponding
 *              floating value is
 *
 *                      ipio2[i] * 2^(-24(i+1)).
 *
 * External function:
 *      double scalbn(), floor();
 *
 *
 * Here is the description of some local variables:
 *
 *      jk      jk+1 is the initial number of terms of ipio2[] needed
 *              in the computation. The recommended value is 2,3,4,
 *              6 for single, double, extended, and quad.
 *
 *      jz      local integer variable indicating the number of
 *              terms of ipio2[] used.
 *
 *      jx      nx - 1
 *
 *      jv      index for pointing to the suitable ipio2[] for the
 *              computation. In general, we want
 *                      ( 2^e0*x[0] * ipio2[jv-1]*2^(-24jv) )/8
 *              to be an integer. Thus
 *                      e0-3-24*jv >= 0 or (e0-3)/24 >= jv
 *              Hence jv = max(0,(e0-3)/24).
 *
 *      jp      jp+1 is the number of terms in PIo2[] needed, jp = jk.
 *
 *      q[]     double array with integral value, representing the
 *              24-bits chunk of the product of x and 2/pi.
 *
 *      q0      the corresponding exponent of q[0]. Note that the
 *              exponent for q[i] would be q0-24*i.
 *
 *      PIo2[]  double precision array, obtained by cutting pi/2
 *              into 24 bits chunks.
 *
 *      f[]     ipio2[] in floating point
 *
 *      iq[]    integer array by breaking up q[] in 24-bits chunk.
 *
 *      fq[]    final product of x*(2/pi) in fq[0],..,fq[jk]
 *
 *      ih      integer. If >0 it indicates q[] is >= 0.5, hence
 *              it also indicates the *sign* of the result.
 *
 */
/*
 * Kernel of the pi/2 argument reduction.
 *
 * Multiplies the 24-bit pieces of the input by the 24-bit pieces of 2/pi,
 * extracts the integer part n and the fraction of the product, multiplies the
 * fraction by pi/2 (PIo2[]) and stores the result in y[].
 *
 *   x      input split into nx pieces, indexed x[0] .. x[nx-1]
 *   y      output array; receives 1 value for prec 0, 2 values for prec 1 or 2,
 *          and 3 values for prec 3 (the result is their sum)
 *   e0     scaled exponent of x[0]
 *   nx     number of valid entries in x[]
 *   prec   precision selector, 0..3 (also selects the entry of init_jk)
 *   ipio2  table of 24-bit chunks of 2/pi
 *
 * Returns n & 7, i.e. the low three bits of the multiple of pi/2 removed.
 */
XSIMD_INLINE int32_t __kernel_rem_pio2(double* x, double* y, int32_t e0, int32_t nx, int32_t prec, const int32_t* ipio2) noexcept
{
    static const int32_t init_jk[] = { 2, 3, 4, 6 }; /* initial value for jk */

    /*
     * NOTE(review): the declarations of PIo2[], zero, one, two24, twon24, of
     * the work arrays f[], q[], fq[], iq[] and of the scalar locals, as well as
     * the initialisation of jk, jp, jx, jv and q0, are not present in this view
     * of the file. They have to precede the code below -- confirm against the
     * complete source.
     */

    /* set up f[0] to f[jx+jk] where f[jx+jk] = ipio2[jv+jk] */
    j = jv - jx;
    m = jx + jk;
    for (i = 0; i <= m; i++, j++)
        f[i] = (j < 0) ? zero : (double)ipio2[j];

    /* compute q[0],q[1],...q[jk] */
    for (i = 0; i <= jk; i++)
    {
        for (j = 0, fw = 0.0; j <= jx; j++)
            fw += x[j] * f[jx + i - j];
        q[i] = fw;
    }

    jz = jk;

recompute:
    /* distill q[] into iq[] reversingly */
    for (i = 0, j = jz, z = q[jz]; j > 0; i++, j--)
    {
        fw = (double)((int32_t)(twon24 * z));
        iq[i] = (int32_t)(z - two24 * fw);
        z = q[j - 1] + fw;
    }

    /* compute n */
    z = std::scalbn(z, q0); /* actual value of z */
    z -= 8.0 * std::floor(z * 0.125); /* trim off integer >= 8 */
    n = (int32_t)z;
    z -= (double)n;
    ih = 0;
    if (q0 > 0)
    { /* need iq[jz-1] to determine n */
        i = (iq[jz - 1] >> (24 - q0));
        n += i;
        iq[jz - 1] -= i << (24 - q0);
        ih = iq[jz - 1] >> (23 - q0);
    }
    else if (q0 == 0)
        ih = iq[jz - 1] >> 23;
    else if (z >= 0.5)
        ih = 2;

    if (ih > 0)
    { /* q > 0.5 */
        n += 1;
        carry = 0;
        for (i = 0; i < jz; i++)
        { /* compute 1-q */
            j = iq[i];
            if (carry == 0)
            {
                if (j != 0)
                {
                    carry = 1;
                    iq[i] = 0x1000000 - j;
                }
            }
            else
                iq[i] = 0xffffff - j;
        }
        if (q0 > 0)
        { /* rare case: chance is 1 in 12 */
            switch (q0)
            {
            case 1:
                iq[jz - 1] &= 0x7fffff;
                break;
            case 2:
                iq[jz - 1] &= 0x3fffff;
                break;
            }
        }
        if (ih == 2)
        {
            z = one - z;
            if (carry != 0)
                z -= std::scalbn(one, q0);
        }
    }

    /* check if recomputation is needed */
    if (z == zero)
    {
        j = 0;
        for (i = jz - 1; i >= jk; i--)
            j |= iq[i];
        if (j == 0)
        { /* need recomputation */
            for (k = 1; iq[jk - k] == 0; k++)
                ; /* k = no. of terms needed */

            for (i = jz + 1; i <= jz + k; i++)
            { /* add q[jz+1] to q[jz+k] */
                f[jx + i] = (double)ipio2[jv + i];
                for (j = 0, fw = 0.0; j <= jx; j++)
                    fw += x[j] * f[jx + i - j];
                q[i] = fw;
            }
            jz += k;
            goto recompute;
        }
    }

    /* chop off zero terms */
    if (z == 0.0)
    {
        jz -= 1;
        q0 -= 24;
        while (iq[jz] == 0)
        {
            jz--;
            q0 -= 24;
        }
    }
    else
    { /* break z into 24-bit if necessary */
        z = std::scalbn(z, -q0);
        if (z >= two24)
        {
            fw = (double)((int32_t)(twon24 * z));
            iq[jz] = (int32_t)(z - two24 * fw);
            jz += 1;
            q0 += 24;
            iq[jz] = (int32_t)fw;
        }
        else
            iq[jz] = (int32_t)z;
    }

    /* convert integer "bit" chunk to floating-point value */
    fw = std::scalbn(one, q0);
    for (i = jz; i >= 0; i--)
    {
        q[i] = fw * (double)iq[i];
        fw *= twon24;
    }

    /* compute PIo2[0,...,jp]*q[jz,...,0] */
    for (i = jz; i >= 0; i--)
    {
        for (fw = 0.0, k = 0; k <= jp && k <= jz - i; k++)
            fw += PIo2[k] * q[i + k];
        fq[jz - i] = fw;
    }

    /* compress fq[] into y[]; every output is negated when ih != 0 */
    switch (prec)
    {
    case 0:
        fw = 0.0;
        for (i = jz; i >= 0; i--)
            fw += fq[i];
        y[0] = (ih == 0) ? fw : -fw;
        break;
    case 1:
    case 2:
        fw = 0.0;
        for (i = jz; i >= 0; i--)
            fw += fq[i];
        y[0] = (ih == 0) ? fw : -fw;
        /* y[1] collects what is left of the sum once y[0] is taken out */
        fw = fq[0] - fw;
        for (i = 1; i <= jz; i++)
            fw += fq[i];
        y[1] = (ih == 0) ? fw : -fw;
        break;
    case 3: /* painful */
        for (i = jz; i > 0; i--)
        {
            fw = fq[i - 1] + fq[i];
            fq[i] += fq[i - 1] - fw;
            fq[i - 1] = fw;
        }
        for (i = jz; i > 1; i--)
        {
            fw = fq[i - 1] + fq[i];
            fq[i] += fq[i - 1] - fw;
            fq[i - 1] = fw;
        }
        for (fw = 0.0, i = jz; i >= 2; i--)
            fw += fq[i];
        if (ih == 0)
        {
            y[0] = fq[0];
            y[1] = fq[1];
            y[2] = fw;
        }
        else
        {
            y[0] = -fq[0];
            y[1] = -fq[1];
            y[2] = -fw;
        }
    }
    return n & 7;
}
/*
 * invpio2:  53 bits of 2/pi
 * pio2_1:   first  33 bits of pi/2
 * pio2_1t:  pi/2 - pio2_1
 * pio2_2:   second 33 bits of pi/2
 * pio2_2t:  pi/2 - (pio2_1+pio2_2)
 * pio2_3:   third  33 bits of pi/2
 * pio2_3t:  pi/2 - (pio2_1+pio2_2+pio2_3)
 */
/*
 * Body of the double-precision reduction of x modulo pi/2: the reduced
 * argument is stored as the pair y[0], y[1] and the multiple of pi/2 that was
 * removed is returned (negated for negative x).
 *
 * NOTE(review): the function signature, the constants referenced below (half,
 * zero, two24, invpio2, pio2_1 ... pio2_3t, npio2_hw[], two_over_pi[]) and the
 * GET_/SET_ word macros are defined outside this view. The comments here
 * describe only what the visible statements do -- confirm against the full
 * file.
 */
    double z = 0., w, t, r, fn;
    double tx[3];
    std::int32_t e0, i, j, nx, n, ix, hx;
    std::uint32_t low;
    GET_HIGH_WORD(hx, x); /* high word of x */
    ix = hx & 0x7fffffff; /* high word with its top bit masked off */
    if (ix <= 0x3fe921fb) /* |x| ~<= pi/4 , no need for reduction */
    {
        y[0] = x;
        y[1] = 0;
        return 0;
    }
    if (ix < 0x4002d97c)
    { /* |x| < 3pi/4, special case with n=+-1 */
        if (hx > 0)
        {
            z = x - pio2_1;
            if (ix != 0x3ff921fb)
            { /* 33+53 bit pi is good enough */
                y[0] = z - pio2_1t;
                y[1] = (z - y[0]) - pio2_1t;
            }
            else
            { /* near pi/2, use 33+33+53 bit pi */
                z -= pio2_2;
                y[0] = z - pio2_2t;
                y[1] = (z - y[0]) - pio2_2t;
            }
            return 1;
        }
        else
        { /* negative x: same steps with pi/2 added instead of subtracted */
            z = x + pio2_1;
            if (ix != 0x3ff921fb)
            { /* 33+53 bit pi is good enough */
                y[0] = z + pio2_1t;
                y[1] = (z - y[0]) + pio2_1t;
            }
            else
            { /* near pi/2, use 33+33+53 bit pi */
                z += pio2_2;
                y[0] = z + pio2_2t;
                y[1] = (z - y[0]) + pio2_2t;
            }

            return -1;
        }
    }
    if (ix <= 0x413921fb)
    { /* |x| ~<= 2^19*(pi/2), medium size */
        t = std::fabs(x);
        /* n = |x| * invpio2 + half, truncated to an integer */
        n = (std::int32_t)(t * invpio2 + half);
        fn = (double)n;
        r = t - fn * pio2_1;
        w = fn * pio2_1t; /* 1st round good to 85 bit */
        /* NOTE(review): npio2_hw[] is not visible here; presumably it holds
           the high words of the first multiples of pi/2 -- confirm. */
        if ((n < 32) && (n > 0) && (ix != npio2_hw[n - 1]))
        {
            y[0] = r - w; /* quick check no cancellation */
        }
        else
        {
            std::uint32_t high;
            j = ix >> 20; /* ix shifted down by the 20 low bits of the high word */
            y[0] = r - w;
            GET_HIGH_WORD(high, y[0]);
            /* i = difference between j and the same 11-bit field of y[0] */
            i = j - static_cast<int32_t>((high >> 20) & 0x7ff);
            if (i > 16)
            { /* 2nd iteration needed, good to 118 */
                t = r;
                w = fn * pio2_2;
                r = t - w;
                w = fn * pio2_2t - ((t - r) - w);
                y[0] = r - w;
                GET_HIGH_WORD(high, y[0]);
                i = j - static_cast<int32_t>((high >> 20) & 0x7ff);
                if (i > 49)
                { /* 3rd iteration need, 151 bits acc */
                    t = r; /* will cover all possible cases */
                    w = fn * pio2_3;
                    r = t - w;
                    w = fn * pio2_3t - ((t - r) - w);
                    y[0] = r - w;
                }
            }
        }
        /* y[1] is what remains of r - w after y[0] has been taken out */
        y[1] = (r - y[0]) - w;
        if (hx < 0)
        {
            y[0] = -y[0];
            y[1] = -y[1];
            return -n;
        }
        else
            return n;
    }
    /*
     * all other (large) arguments
     */
    if (ix >= 0x7ff00000)
    { /* x is inf or NaN */
        y[0] = y[1] = x - x;
        return 0;
    }
    /* set z = scalbn(|x|,ilogb(x)-23) */
    GET_LOW_WORD(low, x);
    SET_LOW_WORD(z, low);
    e0 = (ix >> 20) - 1046; /* e0 = ilogb(z)-23; */
    SET_HIGH_WORD(z, static_cast<uint32_t>(ix - (e0 << 20)));
    /* peel the integer part off z twice, scaling the remainder by two24 each
       time; tx[2] receives whatever is left */
    for (i = 0; i < 2; i++)
    {
        tx[i] = (double)((std::int32_t)(z));
        z = (z - tx[i]) * two24;
    }
    tx[2] = z;
    nx = 3;
    while (tx[nx - 1] == zero)
        nx--; /* skip zero term */
    /* the kernel is called with prec = 2, for which it fills y[0] and y[1] */
    n = __kernel_rem_pio2(tx, y, e0, nx, 2, two_over_pi);
    if (hx < 0)
    {
        y[0] = -y[0];
        y[1] = -y[1];
        return -n;
    }
    return n;
}
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.