/* Include file for internal GNU MP types and definitions.
THE CONTENTS OF THIS FILE ARE FOR INTERNAL USE AND ARE ALMOST CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES IN FUTURE GNU MP RELEASES.
Copyright 1991-2018, 2021, 2022 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version.
or
* the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version.
or both in parallel, as here.
The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received copies of the GNU General Public License and the GNU Lesser General Public License along with the GNU MP Library. If not,
see https://www.gnu.org/licenses/. */
/* __GMP_DECLSPEC must be given on any global data that will be accessed from outside libgmp, meaning from the test or development programs, or from libgmpxx. Failing to do this will result in an incorrect address being used for the accesses. On functions __GMP_DECLSPEC makes calls from outside libgmp more efficient, but they'll still work fine without
it. */
#ifndef __GMP_IMPL_H__ #define __GMP_IMPL_H__
#ifdefined _CRAY #include <intrinsics.h> /* for _popcnt */ #endif
/* For INT_MAX, etc. We used to avoid it because of a bug (on solaris, gcc 2.95 under -mcpu=ultrasparc in ABI=32 ends up getting wrong values (the ABI=64 values)), but it should be safe now.
On Cray vector systems, however, we need the system limits.h since sizes of signed and unsigned types can differ there, depending on compiler options (eg. -hnofastmd), making our SHRT_MAX etc expressions fail. For reference, int can be 46 or 64 bits, whereas uint is always 64 bits; and
short can be 24, 32, 46 or 64 bits, and different for ushort. */
#include <limits.h>
/* For fat.h and other fat binary stuff. No need for __GMP_ATTRIBUTE_PURE or __GMP_NOTHROW, since functions declared this way are only used to set function pointers in __gmpn_cpuvec,
they're not called directly. */ #define DECL_add_n(name) \
__GMP_DECLSPEC mp_limb_t name (mp_ptr, mp_srcptr, mp_srcptr, mp_size_t) #define DECL_addlsh1_n(name) \
DECL_add_n (name) #define DECL_addlsh2_n(name) \
DECL_add_n (name) #define DECL_addmul_1(name) \
__GMP_DECLSPEC mp_limb_t name (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t) #define DECL_addmul_2(name) \
__GMP_DECLSPEC mp_limb_t name (mp_ptr, mp_srcptr, mp_size_t, mp_srcptr) #define DECL_bdiv_dbm1c(name) \
__GMP_DECLSPEC mp_limb_t name (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t) #define DECL_cnd_add_n(name) \
__GMP_DECLSPEC mp_limb_t name (mp_limb_t, mp_ptr, mp_srcptr, mp_srcptr, mp_size_t) #define DECL_cnd_sub_n(name) \
__GMP_DECLSPEC mp_limb_t name (mp_limb_t, mp_ptr, mp_srcptr, mp_srcptr, mp_size_t) #define DECL_com(name) \
__GMP_DECLSPEC void name (mp_ptr, mp_srcptr, mp_size_t) #define DECL_copyd(name) \
__GMP_DECLSPEC void name (mp_ptr, mp_srcptr, mp_size_t) #define DECL_copyi(name) \
DECL_copyd (name) #define DECL_divexact_1(name) \
__GMP_DECLSPEC void name (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t) #define DECL_divexact_by3c(name) \
__GMP_DECLSPEC mp_limb_t name (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t) #define DECL_divrem_1(name) \
__GMP_DECLSPEC mp_limb_t name (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t) #define DECL_gcd_11(name) \
__GMP_DECLSPEC mp_limb_t name (mp_limb_t, mp_limb_t) #define DECL_lshift(name) \
__GMP_DECLSPEC mp_limb_t name (mp_ptr, mp_srcptr, mp_size_t, unsigned) #define DECL_lshiftc(name) \
DECL_lshift (name) #define DECL_mod_1(name) \
__GMP_DECLSPEC mp_limb_t name (mp_srcptr, mp_size_t, mp_limb_t) #define DECL_mod_1_1p(name) \
__GMP_DECLSPEC mp_limb_t name (mp_srcptr, mp_size_t, mp_limb_t, const mp_limb_t []) #define DECL_mod_1_1p_cps(name) \
__GMP_DECLSPEC void name (mp_limb_t cps[], mp_limb_t b) #define DECL_mod_1s_2p(name) \
DECL_mod_1_1p (name) #define DECL_mod_1s_2p_cps(name) \
DECL_mod_1_1p_cps (name) #define DECL_mod_1s_4p(name) \
DECL_mod_1_1p (name) #define DECL_mod_1s_4p_cps(name) \
DECL_mod_1_1p_cps (name) #define DECL_mod_34lsub1(name) \
__GMP_DECLSPEC mp_limb_t name (mp_srcptr, mp_size_t) #define DECL_modexact_1c_odd(name) \
__GMP_DECLSPEC mp_limb_t name (mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t) #define DECL_mul_1(name) \
DECL_addmul_1 (name) #define DECL_mul_basecase(name) \
__GMP_DECLSPEC void name (mp_ptr, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t) #define DECL_mullo_basecase(name) \
__GMP_DECLSPEC void name (mp_ptr, mp_srcptr, mp_srcptr, mp_size_t) #define DECL_preinv_divrem_1(name) \
__GMP_DECLSPEC mp_limb_t name (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, int) #define DECL_preinv_mod_1(name) \
__GMP_DECLSPEC mp_limb_t name (mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t) #define DECL_redc_1(name) \
__GMP_DECLSPEC mp_limb_t name (mp_ptr, mp_ptr, mp_srcptr, mp_size_t, mp_limb_t) #define DECL_redc_2(name) \
__GMP_DECLSPEC mp_limb_t name (mp_ptr, mp_ptr, mp_srcptr, mp_size_t, mp_srcptr) #define DECL_rshift(name) \
DECL_lshift (name) #define DECL_sqr_basecase(name) \
__GMP_DECLSPEC void name (mp_ptr, mp_srcptr, mp_size_t) #define DECL_sub_n(name) \
DECL_add_n (name) #define DECL_sublsh1_n(name) \
DECL_add_n (name) #define DECL_submul_1(name) \
DECL_addmul_1 (name)
#if HAVE_INTTYPES_H /* for uint_least32_t */ # include <inttypes.h> #endif /* On some platforms inttypes.h exists but is incomplete
and we still need stdint.h. */ #if HAVE_STDINT_H # include <stdint.h> #endif
#ifdef __cplusplus #include <cstring> /* for strlen */ #include <string> /* for std::string */ #endif
#ifndef WANT_TMP_DEBUG /* for TMP_ALLOC_LIMBS_2 and others */ #define WANT_TMP_DEBUG 0 #endif
/* The following tries to get a good version of alloca. The tests are adapted from autoconf AC_FUNC_ALLOCA, with a couple of additions. Whether this succeeds is tested by GMP_FUNC_ALLOCA and HAVE_ALLOCA will be setup appropriately.
ifndef alloca - a cpp define might already exist. glibc <stdlib.h> includes <alloca.h> which uses GCC __builtin_alloca. HP cc +Olibcalls adds a #define of alloca to __builtin_alloca.
_AIX pragma - IBM compilers need a #pragma in "each module that needs to use alloca". Pragma indented to protect pre-ANSI cpp's. _IBMR2 was used in past versions of GMP, retained still in case it matters.
The autoconf manual says this pragma needs to be at the start of a C file, apart from comments and preprocessor directives. Is that true? xlc on aix 4.xxx doesn't seem to mind it being after prototypes etc from gmp.h.
*/
/* if not provided by gmp-mparam.h */ #ifndef GMP_LIMB_BYTES #define GMP_LIMB_BYTES SIZEOF_MP_LIMB_T #endif #ifndef GMP_LIMB_BITS #define GMP_LIMB_BITS (8 * SIZEOF_MP_LIMB_T) #endif
#define BITS_PER_ULONG (8 * SIZEOF_UNSIGNED_LONG)
/* gmp_uint_least32_t is an unsigned integer type with at least 32 bits. */ #if HAVE_UINT_LEAST32_T typedef uint_least32_t gmp_uint_least32_t; #else #if SIZEOF_UNSIGNED_SHORT >= 4 typedefunsignedshort gmp_uint_least32_t; #else #if SIZEOF_UNSIGNED >= 4 typedefunsigned gmp_uint_least32_t; #else typedefunsignedlong gmp_uint_least32_t; #endif #endif #endif
/* gmp_intptr_t, for pointer to integer casts */ #if HAVE_INTPTR_T typedef intptr_t gmp_intptr_t; #else/* fallback */ typedef size_t gmp_intptr_t; #endif
/* pre-inverse types for truncating division and modulo */ typedefstruct {mp_limb_t inv32;} gmp_pi1_t; typedefstruct {mp_limb_t inv21, inv32, inv53;} gmp_pi2_t;
/* "const" basically means a function does nothing but examine its arguments and give a return value, it doesn't read or write any memory (neither global nor pointed to by arguments), and has no other side-effects. This is more restrictive than "pure". See info node "(gcc)Function Attributes". __GMP_NO_ATTRIBUTE_CONST_PURE lets tune/common.c etc turn
this off when trying to write timing loops. */ #if HAVE_ATTRIBUTE_CONST && ! defined (__GMP_NO_ATTRIBUTE_CONST_PURE) #define ATTRIBUTE_CONST __attribute__ ((const)) #else #define ATTRIBUTE_CONST #endif
/* "malloc" means a function behaves like malloc in that the pointer it
returns doesn't alias anything. */ #if HAVE_ATTRIBUTE_MALLOC #define ATTRIBUTE_MALLOC __attribute__ ((malloc)) #else #define ATTRIBUTE_MALLOC #endif
/* va_copy is standard in C99, and gcc provides __va_copy when in strict C89 mode. Falling back to a memcpy will give maximum portability, since it
works no matter whether va_list is a pointer, struct or array. */ #if ! defined (va_copy) && defined (__va_copy) #define va_copy(dst,src) __va_copy(dst,src) #endif #if ! defined (va_copy) #define va_copy(dst,src) \ do { memcpy (&(dst), &(src), sizeof (va_list)); } while (0) #endif
/* HAVE_HOST_CPU_alpha_CIX is 1 on an alpha with the CIX instructions
(ie. ctlz, ctpop, cttz). */ #if HAVE_HOST_CPU_alphaev67 || HAVE_HOST_CPU_alphaev68 \
|| HAVE_HOST_CPU_alphaev7 #define HAVE_HOST_CPU_alpha_CIX 1 #endif
Small allocations should use TMP_SALLOC, big allocations should use TMP_BALLOC. Allocations that might be small or big should use TMP_ALLOC.
Functions that use just TMP_SALLOC should use TMP_SDECL, TMP_SMARK, and TMP_SFREE.
TMP_DECL just declares a variable, but might be empty and so must be last in a list of variables. TMP_MARK must be done before any TMP_ALLOC. TMP_ALLOC(0) is not allowed. TMP_FREE doesn't need to be done if a
TMP_MARK was made, but then no TMP_ALLOCs. */
/* The alignment in bytes, used for TMP_ALLOCed blocks, when alloca or
__gmp_allocate_func doesn't already determine it. */ union tmp_align_t {
mp_limb_t l; double d; char *p;
}; #define __TMP_ALIGN sizeof (union tmp_align_t)
/* Return "a" rounded upwards to a multiple of "m", if it isn't already. "a" must be an unsigned type. This is designed for use with a compile-time constant "m". The POW2 case is expected to be usual, and gcc 3.0 and up recognises "(-(8*n))%8" or the like is always zero, which means the rounding up in
the WANT_TMP_NOTREENTRANT version of TMP_ALLOC below will be a noop. */ #define ROUND_UP_MULTIPLE(a,m) \
(POW2_P(m) ? (a) + (-(a))%(m) \
: (a)+(m)-1 - (((a)+(m)-1) % (m)))
/* It's more efficient to allocate one block than many. This is certainly true of the malloc methods, but it can even be true of alloca if that involves copying a chunk of stack (various RISCs), or a call to a stack bounds check (mingw). In any case, when debugging keep separate blocks
so a redzoning malloc debugger can protect each individually. */ #define TMP_ALLOC_LIMBS_2(xp,xsize, yp,ysize) \ do { \ if (WANT_TMP_DEBUG) \
{ \
(xp) = TMP_ALLOC_LIMBS (xsize); \
(yp) = TMP_ALLOC_LIMBS (ysize); \
} \ else \
{ \
(xp) = TMP_ALLOC_LIMBS ((xsize) + (ysize)); \
(yp) = (xp) + (xsize); \
} \
} while (0) #define TMP_ALLOC_LIMBS_3(xp,xsize, yp,ysize, zp,zsize) \ do { \ if (WANT_TMP_DEBUG) \
{ \
(xp) = TMP_ALLOC_LIMBS (xsize); \
(yp) = TMP_ALLOC_LIMBS (ysize); \
(zp) = TMP_ALLOC_LIMBS (zsize); \
} \ else \
{ \
(xp) = TMP_ALLOC_LIMBS ((xsize) + (ysize) + (zsize)); \
(yp) = (xp) + (xsize); \
(zp) = (yp) + (ysize); \
} \
} while (0)
/* n-1 inverts any low zeros and the lowest one bit. If n&(n-1) leaves zero then that lowest one bit must have been the only bit set. n==0 will
return true though, so avoid that. */ #define POW2_P(n) (((n) & ((n) - 1)) == 0)
/* Must cast ULONG_MAX etc to unsigned long etc, since they might not be unsigned on a K&R compiler. In particular the HP-UX 10 bundled K&R cc
treats the plain decimal values in <limits.h> as signed. */ #define ULONG_HIGHBIT (ULONG_MAX ^ ((unsignedlong) ULONG_MAX >> 1)) #define UINT_HIGHBIT (UINT_MAX ^ ((unsigned) UINT_MAX >> 1)) #define USHRT_HIGHBIT (USHRT_MAX ^ ((unsignedshort) USHRT_MAX >> 1)) #define GMP_LIMB_HIGHBIT (MP_LIMB_T_MAX ^ (MP_LIMB_T_MAX >> 1))
/* Dummy for non-gcc, code involving it will go dead. */ #if ! defined (__GNUC__) || __GNUC__ < 2 #define __builtin_constant_p(x) 0 #endif
/* In gcc 2.96 and up on i386, tail calls are optimized to jumps if the stack usage is compatible. __attribute__ ((regparm (N))) helps by putting leading parameters in registers, avoiding extra stack.
regparm cannot be used with calls going through the PLT, because the binding code there may clobber the registers (%eax, %edx, %ecx) used for the regparm parameters. Calls to local (ie. static) functions could still use this, if we cared to differentiate locals and globals.
On athlon-unknown-freebsd4.9 with gcc 3.3.3, regparm cannot be used with -p or -pg profiling, since that version of gcc doesn't realize the .mcount calls will clobber the parameter registers. Other systems are ok, like debian with glibc 2.3.2 (mcount doesn't clobber), but we don't bother to try to detect this. regparm is only an optimization so we just
disable it when profiling (profiling being a slowdown anyway). */
/* Macros for altering parameter order according to regparm usage. */ #if USE_LEADING_REGPARM #define REGPARM_2_1(a,b,x) x,a,b #define REGPARM_3_1(a,b,c,x) x,a,b,c #define REGPARM_ATTR(n) __attribute__ ((regparm (n))) #else #define REGPARM_2_1(a,b,x) a,b,x #define REGPARM_3_1(a,b,c,x) a,b,c,x #define REGPARM_ATTR(n) #endif
/* ASM_L gives a local label for a gcc asm block, for use when temporary local labels like "1:" might not be available, which is the case for instance on the x86s (the SCO assembler doesn't support them).
The label generated is made unique by including "%=" which is a unique number for each insn. This ensures the same name can be used in multiple asm blocks, perhaps via a macro. Since jumps between asm blocks are not allowed there's no need for a label to be usable outside a single
block. */
#define ASM_L(name) LSYM_PREFIX "asm_%=_"#name
#ifdefined (__GNUC__) && HAVE_HOST_CPU_FAMILY_x86 #if 0 /* FIXME: Check that these actually improve things. FIXME: Need a cld after each std. FIXME: Can't have inputs in clobbered registers, must describe them as
dummy outputs, and add volatile. */ #define MPN_COPY_INCR(DST, SRC, N) \
__asm__ ("cld\n\trep\n\tmovsl" : : \ "D" (DST), "S" (SRC), "c" (N) : \ "cx", "di", "si", "memory") #define MPN_COPY_DECR(DST, SRC, N) \
__asm__ ("std\n\trep\n\tmovsl" : : \ "D" ((DST) + (N) - 1), "S" ((SRC) + (N) - 1), "c" (N) : \ "cx", "di", "si", "memory") #endif #endif
#ifndef mpn_addmul_2 /* if not done with cpuvec in a fat binary */ #define mpn_addmul_2 __MPN(addmul_2)
__GMP_DECLSPEC mp_limb_t mpn_addmul_2 (mp_ptr, mp_srcptr, mp_size_t, mp_srcptr); #endif
/* Alternative entry point in mpn_addmul_2 for the benefit of mpn_sqr_basecase. */ #define mpn_addmul_2s __MPN(addmul_2s)
__GMP_DECLSPEC mp_limb_t mpn_addmul_2s (mp_ptr, mp_srcptr, mp_size_t, mp_srcptr);
/* Override mpn_addlsh1_n, mpn_addlsh2_n, mpn_sublsh1_n, etc with mpn_addlsh_n, etc when !HAVE_NATIVE the former but HAVE_NATIVE_ the latter. Similarly, override foo_ip1 functions with foo. We then lie and say these macros represent native functions, but leave a trace by using the value 2 rather
than 1. */
#ifndef mpn_lshiftc /* if not done with cpuvec in a fat binary */ #define mpn_lshiftc __MPN(lshiftc)
__GMP_DECLSPEC mp_limb_t mpn_lshiftc (mp_ptr, mp_srcptr, mp_size_t, unsignedint); #endif
#ifndef mpn_mul_basecase /* if not done with cpuvec in a fat binary */ #define mpn_mul_basecase __MPN(mul_basecase)
__GMP_DECLSPEC void mpn_mul_basecase (mp_ptr, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t); #endif
#ifndef mpn_mullo_basecase /* if not done with cpuvec in a fat binary */ #define mpn_mullo_basecase __MPN(mullo_basecase)
__GMP_DECLSPEC void mpn_mullo_basecase (mp_ptr, mp_srcptr, mp_srcptr, mp_size_t); #endif
#ifndef mpn_sqr_basecase /* if not done with cpuvec in a fat binary */ #define mpn_sqr_basecase __MPN(sqr_basecase)
__GMP_DECLSPEC void mpn_sqr_basecase (mp_ptr, mp_srcptr, mp_size_t); #endif
#ifndef mpn_redc_1 /* if not done with cpuvec in a fat binary */ #define mpn_redc_1 __MPN(redc_1)
__GMP_DECLSPEC mp_limb_t mpn_redc_1 (mp_ptr, mp_ptr, mp_srcptr, mp_size_t, mp_limb_t); #endif
#ifndef mpn_redc_2 /* if not done with cpuvec in a fat binary */ #define mpn_redc_2 __MPN(redc_2)
__GMP_DECLSPEC mp_limb_t mpn_redc_2 (mp_ptr, mp_ptr, mp_srcptr, mp_size_t, mp_srcptr); #endif
#ifndef mpn_mod_1_1p_cps /* if not done with cpuvec in a fat binary */ #define mpn_mod_1_1p_cps __MPN(mod_1_1p_cps)
__GMP_DECLSPEC void mpn_mod_1_1p_cps (mp_limb_t [4], mp_limb_t); #endif #ifndef mpn_mod_1_1p /* if not done with cpuvec in a fat binary */ #define mpn_mod_1_1p __MPN(mod_1_1p)
__GMP_DECLSPEC mp_limb_t mpn_mod_1_1p (mp_srcptr, mp_size_t, mp_limb_t, const mp_limb_t [4]) __GMP_ATTRIBUTE_PURE; #endif
#ifndef mpn_mod_1s_2p_cps /* if not done with cpuvec in a fat binary */ #define mpn_mod_1s_2p_cps __MPN(mod_1s_2p_cps)
__GMP_DECLSPEC void mpn_mod_1s_2p_cps (mp_limb_t [5], mp_limb_t); #endif #ifndef mpn_mod_1s_2p /* if not done with cpuvec in a fat binary */ #define mpn_mod_1s_2p __MPN(mod_1s_2p)
__GMP_DECLSPEC mp_limb_t mpn_mod_1s_2p (mp_srcptr, mp_size_t, mp_limb_t, const mp_limb_t [5]) __GMP_ATTRIBUTE_PURE; #endif
#ifndef mpn_mod_1s_3p_cps /* if not done with cpuvec in a fat binary */ #define mpn_mod_1s_3p_cps __MPN(mod_1s_3p_cps)
__GMP_DECLSPEC void mpn_mod_1s_3p_cps (mp_limb_t [6], mp_limb_t); #endif #ifndef mpn_mod_1s_3p /* if not done with cpuvec in a fat binary */ #define mpn_mod_1s_3p __MPN(mod_1s_3p)
__GMP_DECLSPEC mp_limb_t mpn_mod_1s_3p (mp_srcptr, mp_size_t, mp_limb_t, const mp_limb_t [6]) __GMP_ATTRIBUTE_PURE; #endif
#ifndef mpn_mod_1s_4p_cps /* if not done with cpuvec in a fat binary */ #define mpn_mod_1s_4p_cps __MPN(mod_1s_4p_cps)
__GMP_DECLSPEC void mpn_mod_1s_4p_cps (mp_limb_t [7], mp_limb_t); #endif #ifndef mpn_mod_1s_4p /* if not done with cpuvec in a fat binary */ #define mpn_mod_1s_4p __MPN(mod_1s_4p)
__GMP_DECLSPEC mp_limb_t mpn_mod_1s_4p (mp_srcptr, mp_size_t, mp_limb_t, const mp_limb_t [7]) __GMP_ATTRIBUTE_PURE; #endif
/* Macro to obtain a void pointer to the function pointers structure. */ #define RNG_FNPTR(rstate) ((rstate)->_mp_algdata._mp_lc)
/* Macro to obtain a pointer to the generator's state.
When used as a lvalue the rvalue needs to be cast to mp_ptr. */ #define RNG_STATE(rstate) ((rstate)->_mp_seed->_mp_d)
/* Write a given number of random bits to rp. */ #define _gmp_rand(rp, state, bits) \ do { \
gmp_randstate_ptr __rstate = (state); \
(*((gmp_randfnptr_t *) RNG_FNPTR (__rstate))->randget_fn) \
(__rstate, rp, bits); \
} while (0)
/* __gmp_rands is the global state for the old-style random functions, and is also used in the test programs (hence the __GMP_DECLSPEC).
There's no seeding here, so mpz_random etc will generate the same sequence every time. This is not unlike the C library random functions if you don't seed them, so perhaps it's acceptable. Digging up a seed from /dev/random or the like would work on many systems, but might encourage a false confidence, since it'd be pretty much impossible to do something that would work reliably everywhere. In any case the new style functions are recommended to applications which care about randomness, so
the old functions aren't too important. */
/* this is used by the test programs, to free memory */ #define RANDS_CLEAR() \ do { \ if (__gmp_rands_initialized) \
{ \
__gmp_rands_initialized = 0; \
gmp_randclear (__gmp_rands); \
} \
} while (0)
/* For a threshold between algorithms A and B, size>=thresh is where B should be used. Special value MP_SIZE_T_MAX means only ever use A, or value 0 means only ever use B. The tests for these special values will be compile-time constants, so the compiler should be able to eliminate
the code for the unwanted algorithm. */
/* The minimal supported value for Toom22 depends also on Toom32 and
Toom42 implementations. */ #define MPN_TOOM22_MUL_MINSIZE 6 #define MPN_TOOM2_SQR_MINSIZE 4
#ifndef mpn_bdiv_dbm1c /* if not done with cpuvec in a fat binary */ #define mpn_bdiv_dbm1c __MPN(bdiv_dbm1c)
__GMP_DECLSPEC mp_limb_t mpn_bdiv_dbm1c (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t); #endif
#define mpn_bsqrtinv __MPN(bsqrtinv)
__GMP_DECLSPEC int mpn_bsqrtinv (mp_ptr, mp_srcptr, mp_bitcnt_t, mp_ptr);
#ifdefined (_CRAY) #define MPN_COPY_INCR(dst, src, n) \ do { \ int __i; /* Faster on some Crays with plain int */ \
_Pragma ("_CRI ivdep"); \ for (__i = 0; __i < (n); __i++) \
(dst)[__i] = (src)[__i]; \
} while (0) #endif
/* used by test programs, hence __GMP_DECLSPEC */ #ifndef mpn_copyi /* if not done with cpuvec in a fat binary */ #define mpn_copyi __MPN(copyi)
__GMP_DECLSPEC void mpn_copyi (mp_ptr, mp_srcptr, mp_size_t); #endif
#ifdefined (_CRAY) #define MPN_COPY_DECR(dst, src, n) \ do { \ int __i; /* Faster on some Crays with plain int */ \
_Pragma ("_CRI ivdep"); \ for (__i = (n) - 1; __i >= 0; __i--) \
(dst)[__i] = (src)[__i]; \
} while (0) #endif
/* used by test programs, hence __GMP_DECLSPEC */ #ifndef mpn_copyd /* if not done with cpuvec in a fat binary */ #define mpn_copyd __MPN(copyd)
__GMP_DECLSPEC void mpn_copyd (mp_ptr, mp_srcptr, mp_size_t); #endif
For power and powerpc we want an inline stu/bdnz loop for zeroing. On ppc630 for instance this is optimal since it can sustain only 1 store per cycle.
gcc 2.95.x (for powerpc64 -maix64, or powerpc32) doesn't recognise the "for" loop in the generic code below can become stu/bdnz. The do/while here helps it get to that. The same caveat about plain -mpowerpc64 mode applies here as to __GMPN_COPY_INCR in gmp.h.
xlc 3.1 already generates stu/bdnz from the generic C, and does so from this loop too.
Enhancement: GLIBC does some trickery with dcbz to zero whole cache lines at a time. MPN_ZERO isn't all that important in GMP, so it might be more trouble than it's worth to do the same, though perhaps a call to memset
would be good when on a GNU system. */
#if HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc #define MPN_FILL(dst, n, f) \ do { \
mp_ptr __dst = (dst) - 1; \
mp_size_t __n = (n); \
ASSERT (__n > 0); \ do \
*++__dst = (f); \ while (--__n); \
} while (0) #endif
#ifndef MPN_FILL #define MPN_FILL(dst, n, f) \ do { \
mp_ptr __dst = (dst); \
mp_size_t __n = (n); \
ASSERT (__n > 0); \ do \
*__dst++ = (f); \ while (--__n); \
} while (0) #endif
#define MPN_ZERO(dst, n) \ do { \
ASSERT ((n) >= 0); \ if ((n) != 0) \
MPN_FILL (dst, n, CNST_LIMB (0)); \
} while (0)
/* On the x86s repe/scasl doesn't seem useful, since it takes many cycles to start up and would need to strip a lot of zeros before it'd be faster than a simple cmpl loop. Here are some times in cycles for std/repe/scasl/cld and cld/repe/scasl (the latter would be for stripping low zeros).
/* Strip least significant zero limbs from {ptr,size} by incrementing ptr and decrementing size. low should be ptr[0], and will be the new ptr[0] on returning. The number in {ptr,size} must be non-zero, ie. size!=0 and
somewhere a non-zero limb. */ #define MPN_STRIP_LOW_ZEROS_NOT_ZERO(ptr, size, low) \ do { \
ASSERT ((size) >= 1); \
ASSERT ((low) == (ptr)[0]); \
\ while ((low) == 0) \
{ \
(size)--; \
ASSERT ((size) >= 1); \
(ptr)++; \
(low) = *(ptr); \
} \
} while (0)
/* Initialize X of type mpz_t with space for NLIMBS limbs. X should be a temporary variable; it will be automatically cleared out at function return. We use __x here to make it possible to accept both mpz_ptr and
mpz_t arguments. */ #define MPZ_TMP_INIT(X, NLIMBS) \ do { \
mpz_ptr __x = (X); \
ASSERT ((NLIMBS) >= 1); \
__x->_mp_alloc = (NLIMBS); \
__x->_mp_d = TMP_ALLOC_LIMBS (NLIMBS); \
} while (0)
#if WANT_ASSERT staticinlinevoid *
_mpz_newalloc (mpz_ptr z, mp_size_t n)
{ void * res = _mpz_realloc(z,n); /* If we are checking the code, force a random change to limbs. */
((mp_ptr) res)[0] = ~ ((mp_ptr) res)[ALLOC (z) - 1]; return res;
} #else #define _mpz_newalloc _mpz_realloc #endif /* Realloc for an mpz_t WHAT if it has less than NEEDED limbs. */ #define MPZ_REALLOC(z,n) (UNLIKELY ((n) > ALLOC(z)) \
? (mp_ptr) _mpz_realloc(z,n) \
: PTR(z)) #define MPZ_NEWALLOC(z,n) (UNLIKELY ((n) > ALLOC(z)) \
? (mp_ptr) _mpz_newalloc(z,n) \
: PTR(z))
/* MPN_FIB2_SIZE(n) is the size in limbs required by mpn_fib2_ui for fp and f1p.
From Knuth vol 1 section 1.2.8, F[n] = phi^n/sqrt(5) rounded to the nearest integer, where phi=(1+sqrt(5))/2 is the golden ratio. So the number of bits required is n*log_2((1+sqrt(5))/2) = n*0.6942419.
The multiplier used is 23/32=0.71875 for efficient calculation on CPUs without good floating point. There's +2 for rounding up, and a further +2 since at the last step x limbs are doubled into a 2x+1 limb region whereas the actual F[2k] value might be only 2x-1 limbs.
Note that a division is done first, since on a 32-bit system it's at least conceivable to go right up to n==ULONG_MAX. (F[2^32-1] would be about 380Mbytes, plus temporary workspace of about 1.2Gbytes here and whatever a multiply of two 190Mbyte numbers takes.)
Enhancement: When GMP_NUMB_BITS is not a power of 2 the division could be
worked into the multiplier. */
/* n^log <= GMP_NUMB_MAX, a limb can store log factors less than n */ staticinlineunsigned
log_n_max (mp_limb_t n)
{ unsigned log; for (log = 8; n > __gmp_limbroots_table[log - 1]; log--); return log;
}
#define SIEVESIZE 512 /* FIXME: Allow gmp_init_primesieve to choose */ typedefstruct
{ unsignedlong d; /* current index in s[] */ unsignedlong s0; /* number corresponding to s[0] */ unsignedlong sqrt_s0; /* misnomer for sqrt(s[SIEVESIZE-1]) */ unsignedchar s[SIEVESIZE + 1]; /* sieve table */
} gmp_primesieve_t;
/* MUL_TOOM22_THRESHOLD_LIMIT is the maximum for MUL_TOOM22_THRESHOLD. In a normal build MUL_TOOM22_THRESHOLD is a constant and we use that. In a fat binary or tune program build MUL_TOOM22_THRESHOLD is a variable and a
separate hard limit will have been defined. Similarly for TOOM3. */ #ifndef MUL_TOOM22_THRESHOLD_LIMIT #define MUL_TOOM22_THRESHOLD_LIMIT MUL_TOOM22_THRESHOLD #endif #ifndef MUL_TOOM33_THRESHOLD_LIMIT #define MUL_TOOM33_THRESHOLD_LIMIT MUL_TOOM33_THRESHOLD #endif #ifndef MULLO_BASECASE_THRESHOLD_LIMIT #define MULLO_BASECASE_THRESHOLD_LIMIT MULLO_BASECASE_THRESHOLD #endif #ifndef SQRLO_BASECASE_THRESHOLD_LIMIT #define SQRLO_BASECASE_THRESHOLD_LIMIT SQRLO_BASECASE_THRESHOLD #endif #ifndef SQRLO_DC_THRESHOLD_LIMIT #define SQRLO_DC_THRESHOLD_LIMIT SQRLO_DC_THRESHOLD #endif
/* SQR_BASECASE_THRESHOLD is where mpn_sqr_basecase should take over from mpn_mul_basecase. Default is to use mpn_sqr_basecase from 0. (Note that we certainly always want it if there's a native assembler mpn_sqr_basecase.)
If it turns out that mpn_toom2_sqr becomes faster than mpn_mul_basecase before mpn_sqr_basecase does, then SQR_BASECASE_THRESHOLD is the toom2 threshold and SQR_TOOM2_THRESHOLD is 0. This oddity arises more or less because SQR_TOOM2_THRESHOLD represents the size up to which mpn_sqr_basecase
should be used, and that may be never. */
#ifndef SQR_BASECASE_THRESHOLD #define SQR_BASECASE_THRESHOLD 0 /* never use mpn_mul_basecase */ #endif
/* First k to use for an FFT modF multiply. A modF FFT is an order log(2^k)/log(2^(k-1)) algorithm, so k=3 is merely 1.5 like karatsuba,
whereas k=4 is 1.33 which is faster than toom3 at 1.485. */ #define FFT_FIRST_K 4
/* Threshold at which FFT should be used to do a modF NxN -> N multiply. */ #ifndef MUL_FFT_MODF_THRESHOLD #define MUL_FFT_MODF_THRESHOLD (MUL_TOOM33_THRESHOLD * 3) #endif #ifndef SQR_FFT_MODF_THRESHOLD #define SQR_FFT_MODF_THRESHOLD (SQR_TOOM3_THRESHOLD * 3) #endif
/* Threshold at which FFT should be used to do an NxN -> 2N multiply. This will be a size where FFT is using k=7 or k=8, since an FFT-k used for an NxN->2N multiply and not recursing into itself is an order log(2^k)/log(2^(k-2)) algorithm, so it'll be at least k=7 at 1.39 which
is the first better than toom3. */ #ifndef MUL_FFT_THRESHOLD #define MUL_FFT_THRESHOLD (MUL_FFT_MODF_THRESHOLD * 10) #endif #ifndef SQR_FFT_THRESHOLD #define SQR_FFT_THRESHOLD (SQR_FFT_MODF_THRESHOLD * 10) #endif
/* Return non-zero if xp,xsize and yp,ysize overlap. If xp+xsize<=yp there's no overlap, or if yp+ysize<=xp there's no
overlap. If both these are false, there's an overlap. */ #define MPN_OVERLAP_P(xp, xsize, yp, ysize) \
((xp) + (xsize) > (yp) && (yp) + (ysize) > (xp)) #define MEM_OVERLAP_P(xp, xsize, yp, ysize) \
( (char *) (xp) + (xsize) > (char *) (yp) \
&& (char *) (yp) + (ysize) > (char *) (xp))
/* Return non-zero if xp,xsize and yp,ysize are either identical or not
overlapping. Return zero if they're partially overlapping. */ #define MPN_SAME_OR_SEPARATE_P(xp, yp, size) \
MPN_SAME_OR_SEPARATE2_P(xp, size, yp, size) #define MPN_SAME_OR_SEPARATE2_P(xp, xsize, yp, ysize) \
((xp) == (yp) || ! MPN_OVERLAP_P (xp, xsize, yp, ysize))
/* Return non-zero if dst,dsize and src,ssize are either identical or overlapping in a way suitable for an incrementing/decrementing algorithm.
Return zero if they're partially overlapping in an unsuitable fashion. */ #define MPN_SAME_OR_INCR2_P(dst, dsize, src, ssize) \
((dst) <= (src) || ! MPN_OVERLAP_P (dst, dsize, src, ssize)) #define MPN_SAME_OR_INCR_P(dst, src, size) \
MPN_SAME_OR_INCR2_P(dst, size, src, size) #define MPN_SAME_OR_DECR2_P(dst, dsize, src, ssize) \
((dst) >= (src) || ! MPN_OVERLAP_P (dst, dsize, src, ssize)) #define MPN_SAME_OR_DECR_P(dst, src, size) \
MPN_SAME_OR_DECR2_P(dst, size, src, size)
/* ASSERT() is a private assertion checking scheme, similar to <assert.h>. ASSERT() does the check only if WANT_ASSERT is selected, ASSERT_ALWAYS() does it always. Generally assertions are meant for development, but
might help when looking for a problem later too. */
#define ASSERT_ALWAYS(expr) \ do { \ if (UNLIKELY (!(expr))) \
ASSERT_FAIL (expr); \
} while (0)
#if WANT_ASSERT #define ASSERT(expr) ASSERT_ALWAYS (expr) #else #define ASSERT(expr) do {} while (0) #endif
/* ASSERT_CARRY checks the expression is non-zero, and ASSERT_NOCARRY checks that it's zero. In both cases if assertion checking is disabled the expression is still evaluated. These macros are meant for use with routines like mpn_add_n() where the return value represents a carry or whatever that should or shouldn't occur in some context. For example,
ASSERT_NOCARRY (mpn_add_n (rp, s1p, s2p, size)); */ #if WANT_ASSERT #define ASSERT_CARRY(expr) ASSERT_ALWAYS ((expr) != 0) #define ASSERT_NOCARRY(expr) ASSERT_ALWAYS ((expr) == 0) #else #define ASSERT_CARRY(expr) (expr) #define ASSERT_NOCARRY(expr) (expr) #endif
/* ASSERT_CODE includes code when assertion checking is wanted. This is the
same as writing "#if WANT_ASSERT", but more compact. */ #if WANT_ASSERT #define ASSERT_CODE(expr) expr #else #define ASSERT_CODE(expr) #endif
/* Test that an mpq_t is in fully canonical form. This can be used as protection on routines like mpq_equal which give wrong results on
non-canonical inputs. */ #if WANT_ASSERT #define ASSERT_MPQ_CANONICAL(q) \ do { \
ASSERT (q->_mp_den._mp_size > 0); \ if (q->_mp_num._mp_size == 0) \
{ \ /* zero should be 0/1 */ \
ASSERT (mpz_cmp_ui (mpq_denref(q), 1L) == 0); \
} \ else \
{ \ /* no common factors */ \
mpz_t __g; \
mpz_init (__g); \
mpz_gcd (__g, mpq_numref(q), mpq_denref(q)); \
ASSERT (mpz_cmp_ui (__g, 1) == 0); \
mpz_clear (__g); \
} \
} while (0) #else #define ASSERT_MPQ_CANONICAL(q) do {} while (0) #endif
/* Check that the nail parts are zero. */ #define ASSERT_ALWAYS_LIMB(limb) \ do { \
mp_limb_t __nail = (limb) & GMP_NAIL_MASK; \
ASSERT_ALWAYS (__nail == 0); \
} while (0) #define ASSERT_ALWAYS_MPN(ptr, size) \ do { \ /* let whole loop go dead when no nails */ \ if (GMP_NAIL_BITS != 0) \
{ \
mp_size_t __i; \ for (__i = 0; __i < (size); __i++) \
ASSERT_ALWAYS_LIMB ((ptr)[__i]); \
} \
} while (0) #if WANT_ASSERT #define ASSERT_LIMB(limb) ASSERT_ALWAYS_LIMB (limb) #define ASSERT_MPN(ptr, size) ASSERT_ALWAYS_MPN (ptr, size) #else #define ASSERT_LIMB(limb) do {} while (0) #define ASSERT_MPN(ptr, size) do {} while (0) #endif
/* Assert that an mpn region {ptr,size} is zero, or non-zero.
size==0 is allowed, and in that case {ptr,size} considered to be zero. */ #if WANT_ASSERT #define ASSERT_MPN_ZERO_P(ptr,size) \ do { \
mp_size_t __i; \
ASSERT ((size) >= 0); \ for (__i = 0; __i < (size); __i++) \
ASSERT ((ptr)[__i] == 0); \
} while (0) #define ASSERT_MPN_NONZERO_P(ptr,size) \ do { \
mp_size_t __i; \ int __nonzero = 0; \
ASSERT ((size) >= 0); \ for (__i = 0; __i < (size); __i++) \ if ((ptr)[__i] != 0) \
{ \
__nonzero = 1; \ break; \
} \
ASSERT (__nonzero); \
} while (0) #else #define ASSERT_MPN_ZERO_P(ptr,size) do {} while (0) #define ASSERT_MPN_NONZERO_P(ptr,size) do {} while (0) #endif
/* ADDC_LIMB sets w=x+y and cout to 0 or 1 for a carry from that addition. */ #if GMP_NAIL_BITS == 0 #define ADDC_LIMB(cout, w, x, y) \ do { \
mp_limb_t __x = (x); \
mp_limb_t __y = (y); \
mp_limb_t __w = __x + __y; \
(w) = __w; \
(cout) = __w < __x; \
} while (0) #else #define ADDC_LIMB(cout, w, x, y) \ do { \
mp_limb_t __w; \
ASSERT_LIMB (x); \
ASSERT_LIMB (y); \
__w = (x) + (y); \
(w) = __w & GMP_NUMB_MASK; \
(cout) = __w >> GMP_NUMB_BITS; \
} while (0) #endif
/* SUBC_LIMB sets w=x-y and cout to 0 or 1 for a borrow from that
subtract. */ #if GMP_NAIL_BITS == 0 #define SUBC_LIMB(cout, w, x, y) \ do { \
mp_limb_t __x = (x); \
mp_limb_t __y = (y); \
mp_limb_t __w = __x - __y; \
(w) = __w; \
(cout) = __w > __x; \
} while (0) #else #define SUBC_LIMB(cout, w, x, y) \ do { \
mp_limb_t __w = (x) - (y); \
(w) = __w & GMP_NUMB_MASK; \
(cout) = __w >> (GMP_LIMB_BITS-1); \
} while (0) #endif
/* MPN_INCR_U does {ptr,size} += n, MPN_DECR_U does {ptr,size} -= n, both expecting no carry (or borrow) from that.
The size parameter is only for the benefit of assertion checking. In a normal build it's unused and the carry/borrow is just propagated as far as it needs to go.
On random data, usually only one or two limbs of {ptr,size} get updated, so there's no need for any sophisticated looping, just something compact and sensible.
FIXME: Switch all code from mpn_{incr,decr}_u to MPN_{INCR,DECR}_U, declaring their operand sizes, then remove the former. This is purely
for the benefit of assertion checking. */
#ifdefined (__GNUC__) && GMP_NAIL_BITS == 0 && ! defined (NO_ASM) \
&& (defined(HAVE_HOST_CPU_FAMILY_x86) || defined(HAVE_HOST_CPU_FAMILY_x86_64)) \
&& ! WANT_ASSERT /* Better flags handling than the generic C gives on i386, saving a few
bytes of code and maybe a cycle or two. */
/* Structure for conversion between internal binary format and strings. */ struct bases
{ /* Number of digits in the conversion base that always fits in an mp_limb_t. For example, for base 10 on a machine where an mp_limb_t has 32 bits this
is 9, since 10**9 is the largest number that fits into an mp_limb_t. */ int chars_per_limb;
/* base**chars_per_limb, i.e. the biggest number that fits a word, built by factors of base. Exception: For 2, 4, 8, etc, big_base is log2(base),
i.e. the number of bits used to represent each digit in the base. */
mp_limb_t big_base;
/* A GMP_LIMB_BITS bit approximation to 1/big_base, represented as a fixed-point number. Instead of dividing by big_base an application can
choose to multiply by big_base_inverted. */
mp_limb_t big_base_inverted;
};
/* Compute the number of digits in base for nbits bits, making sure the result is never too small. The two variants of the macro implement the same
function; the GT2 variant below works just for bases > 2. */ #define DIGITS_IN_BASE_FROM_BITS(res, nbits, b) \ do { \
mp_limb_t _ph, _dummy; \
size_t _nbits = (nbits); \
umul_ppmm (_ph, _dummy, mp_bases[b].logb2, _nbits); \
_ph += (_dummy + _nbits < _dummy); \
res = _ph + 1; \
} while (0) #define DIGITS_IN_BASEGT2_FROM_BITS(res, nbits, b) \ do { \
mp_limb_t _ph, _dummy; \
size_t _nbits = (nbits); \
umul_ppmm (_ph, _dummy, mp_bases[b].logb2 + 1, _nbits); \
res = _ph + 1; \
} while (0)
/* For power of 2 bases this is exact. For other bases the result is either exact or one too big.
To be exact always it'd be necessary to examine all the limbs of the operand, since numbers like 100..000 and 99...999 generally differ only in the lowest limb. It'd be possible to examine just a couple of high limbs to increase the probability of being exact, but that doesn't seem
worth bothering with. */
#define MPN_SIZEINBASE(result, ptr, size, base) \ do { \ int __lb_base, __cnt; \
size_t __totbits; \
\
ASSERT ((size) >= 0); \
ASSERT ((base) >= 2); \
ASSERT ((base) < numberof (mp_bases)); \
\ /* Special case for X == 0. */ \ if ((size) == 0) \
(result) = 1; \ else \
{ \ /* Calculate the total number of significant bits of X. */ \
count_leading_zeros (__cnt, (ptr)[(size)-1]); \
__totbits = (size_t) (size) * GMP_NUMB_BITS - (__cnt - GMP_NAIL_BITS);\
\ if (POW2_P (base)) \
{ \
__lb_base = mp_bases[base].big_base; \
(result) = (__totbits + __lb_base - 1) / __lb_base; \
} \ else \
{ \
DIGITS_IN_BASEGT2_FROM_BITS (result, __totbits, base); \
} \
} \
} while (0)
/* bit count to limb count, rounding up */ #define BITS_TO_LIMBS(n) (((n) + (GMP_NUMB_BITS - 1)) / GMP_NUMB_BITS)
/* MPN_SET_UI sets an mpn (ptr, cnt) to given ui. MPZ_FAKE_UI creates fake mpz_t from ui. The zp argument must have room for LIMBS_PER_ULONG limbs
in both cases (LIMBS_PER_ULONG is also defined here.) */ #if BITS_PER_ULONG <= GMP_NUMB_BITS /* need one limb per ulong */
/* LIMB_HIGHBIT_TO_MASK(n) examines the high bit of a limb value and turns 1 or 0 there into a limb 0xFF..FF or 0 respectively.
On most CPUs this is just an arithmetic right shift by GMP_LIMB_BITS-1, but C99 doesn't guarantee signed right shifts are arithmetic, so we have a little compile-time test and a fallback to a "? :" form. The latter is necessary for instance on Cray vector systems.
Recent versions of gcc (eg. 3.3) will in fact optimize a "? :" like this to an arithmetic right shift anyway, but it's good to get the desired shift on past versions too (in particular since an important use of
LIMB_HIGHBIT_TO_MASK is in udiv_qrnnd_preinv). */
/* Use a library function for invert_limb, if available. */ #define mpn_invert_limb __MPN(invert_limb)
__GMP_DECLSPEC mp_limb_t mpn_invert_limb (mp_limb_t) ATTRIBUTE_CONST; #if ! defined (invert_limb) && HAVE_NATIVE_mpn_invert_limb #define invert_limb(invxl,xl) \ do { \
(invxl) = mpn_invert_limb (xl); \
} while (0) #endif
#ifndef mpn_preinv_divrem_1 /* if not done with cpuvec in a fat binary */ #define mpn_preinv_divrem_1 __MPN(preinv_divrem_1)
__GMP_DECLSPEC mp_limb_t mpn_preinv_divrem_1 (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, int); #endif
/* USE_PREINV_DIVREM_1 is whether to use mpn_preinv_divrem_1, as opposed to the plain mpn_divrem_1. The default is yes, since the few CISC chips where
preinv is not good have defines saying so. */ #ifndef USE_PREINV_DIVREM_1 #define USE_PREINV_DIVREM_1 1 #endif
/* This selection may seem backwards. The reason mpn_mod_1 typically takes
over for larger sizes is that it uses the mod_1_1 function. */ #define MPN_MOD_OR_PREINV_MOD_1(src,size,divisor,inverse) \
(BELOW_THRESHOLD (size, PREINV_MOD_1_TO_MOD_1_THRESHOLD) \
? mpn_preinv_mod_1 (src, size, divisor, inverse) \
: mpn_mod_1 (src, size, divisor))
#ifndef mpn_mod_34lsub1 /* if not done with cpuvec in a fat binary */ #define mpn_mod_34lsub1 __MPN(mod_34lsub1)
__GMP_DECLSPEC mp_limb_t mpn_mod_34lsub1 (mp_srcptr, mp_size_t) __GMP_ATTRIBUTE_PURE; #endif
/* DIVEXACT_1_THRESHOLD is at what size to use mpn_divexact_1, as opposed to plain mpn_divrem_1. Likewise BMOD_1_TO_MOD_1_THRESHOLD for mpn_modexact_1_odd against plain mpn_mod_1. On most CPUs divexact and modexact are faster at all sizes, so the defaults are 0. Those CPUs
where this is not right have a tuned threshold. */ #ifndef DIVEXACT_1_THRESHOLD #define DIVEXACT_1_THRESHOLD 0 #endif #ifndef BMOD_1_TO_MOD_1_THRESHOLD #define BMOD_1_TO_MOD_1_THRESHOLD 10 #endif
#define MPN_DIVREM_OR_DIVEXACT_1(rp, up, n, d) \ do { \ if (BELOW_THRESHOLD (n, DIVEXACT_1_THRESHOLD)) \
ASSERT_NOCARRY (mpn_divrem_1 (rp, (mp_size_t) 0, up, n, d)); \ else \
{ \
ASSERT (mpn_mod_1 (up, n, d) == 0); \
mpn_divexact_1 (rp, up, n, d); \
} \
} while (0)
#ifndef mpn_modexact_1c_odd /* if not done with cpuvec in a fat binary */ #define mpn_modexact_1c_odd __MPN(modexact_1c_odd)
__GMP_DECLSPEC mp_limb_t mpn_modexact_1c_odd (mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t) __GMP_ATTRIBUTE_PURE; #endif
/* binvert_limb() sets inv to the multiplicative inverse of n modulo 2^GMP_NUMB_BITS, ie. satisfying inv*n == 1 mod 2^GMP_NUMB_BITS. n must be odd (otherwise such an inverse doesn't exist).
This is not to be confused with invert_limb(), which is completely different.
The table lookup gives an inverse with the low 8 bits valid, and each multiply step doubles the number of bits. See Jebelean "An algorithm for exact division" end of section 4 (reference in gmp.texi).
Possible enhancement: Could use UHWtype until the last step, if half-size multiplies are faster (might help under _LONG_LONG_LIMB).
Alternative: As noted in Granlund and Montgomery "Division by Invariant Integers using Multiplication" (reference in gmp.texi), n itself gives a 3-bit inverse immediately, and could be used instead of a table lookup. A 4-bit inverse can be obtained effectively from xoring bits 1 and 2 into
bit 3, for instance with (((n + 2) & 4) << 1) ^ n. */
/* Multiplicative inverse of 3, modulo 2^GMP_NUMB_BITS. Eg. 0xAAAAAAAB for 32 bits, 0xAAAAAAAAAAAAAAAB for 64 bits. GMP_NUMB_MAX/3*2+1 is right when GMP_NUMB_BITS is even, but when it's odd
we need to start from GMP_NUMB_MAX>>1. */ #define MODLIMB_INVERSE_3 (((GMP_NUMB_MAX >> (GMP_NUMB_BITS % 2)) / 3) * 2 + 1)
/* ceil(GMP_NUMB_MAX/3) and ceil(2*GMP_NUMB_MAX/3).
These expressions work because GMP_NUMB_MAX%3 != 0 for all GMP_NUMB_BITS. */ #define GMP_NUMB_CEIL_MAX_DIV3 (GMP_NUMB_MAX / 3 + 1) #define GMP_NUMB_CEIL_2MAX_DIV3 ((GMP_NUMB_MAX>>1) / 3 + 1 + GMP_NUMB_HIGHBIT)
/* Set r to -a mod d. a>=d is allowed. Can give r>d. All should be limbs.
It's not clear whether this is the best way to do this calculation. Anything congruent to -a would be fine for the one limb congruence
tests. */
/* A bit mask of all the least significant zero bits of n, or -1 if n==0. */ #define LOW_ZEROS_MASK(n) (((n) & -(n)) - 1)
/* ULONG_PARITY sets "p" to 1 if there's an odd number of 1 bits in "n", or to 0 if there's an even number. "n" should be an unsigned long and "p"
an int. */
#ifdefined (__GNUC__) && ! defined (NO_ASM) && HAVE_HOST_CPU_alpha_CIX #define ULONG_PARITY(p, n) \ do { \ int __p; \
__asm__ ("ctpop %1, %0" : "=r" (__p) : "r" (n)); \
(p) = __p & 1; \
} while (0) #endif
#ifdefined (__GNUC__) && ! defined (__INTEL_COMPILER) \
&& ! defined (NO_ASM) && defined (__ia64) /* unsigned long is either 32 or 64 bits depending on the ABI, zero extend
to a 64 bit unsigned long long for popcnt */ #define ULONG_PARITY(p, n) \ do { \ unsignedlonglong __n = (unsignedlong) (n); \ int __p; \
__asm__ ("popcnt %0 = %1" : "=r" (__p) : "r" (__n)); \
(p) = __p & 1; \
} while (0) #endif
#if ! defined (ULONG_PARITY) #define ULONG_PARITY(p, n) \ do { \ unsignedlong __n = (n); \ int __p = 0; \ do \
{ \
__p ^= 0x96696996L >> (__n & 0x1F); \
__n >>= 5; \
} \ while (__n != 0); \
\
(p) = __p & 1; \
} while (0) #endif
/* 3 cycles on 604 or 750 since shifts and rlwimi's can pair. gcc (as of version 3.1 at least) doesn't seem to know how to generate rlwimi for
anything other than bit-fields, so use "asm". */ #ifdefined (__GNUC__) && ! defined (NO_ASM) \
&& HAVE_HOST_CPU_FAMILY_powerpc && GMP_LIMB_BITS == 32 #define BSWAP_LIMB(dst, src) \ do { \
mp_limb_t __bswapl_src = (src); \
mp_limb_t __tmp1 = __bswapl_src >> 24; /* low byte */ \
mp_limb_t __tmp2 = __bswapl_src << 24; /* high byte */ \
__asm__ ("rlwimi %0, %2, 24, 16, 23"/* 2nd low */ \
: "=r" (__tmp1) : "0" (__tmp1), "r" (__bswapl_src)); \
__asm__ ("rlwimi %0, %2, 8, 8, 15"/* 3nd high */ \
: "=r" (__tmp2) : "0" (__tmp2), "r" (__bswapl_src)); \
(dst) = __tmp1 | __tmp2; /* whole */ \
} while (0) #endif
/* bswap is available on i486 and up and is fast. A combination rorw $8 / roll $16 / rorw $8 is used in glibc for plain i386 (and in the linux kernel with xchgb instead of rorw), but this is not done here, because i386 means generic x86 and mixing word and dword operations will cause
partial register stalls on P6 chips. */ #ifdefined (__GNUC__) && ! defined (NO_ASM) \
&& HAVE_HOST_CPU_FAMILY_x86 && ! HAVE_HOST_CPU_i386 \
&& GMP_LIMB_BITS == 32 #define BSWAP_LIMB(dst, src) \ do { \
__asm__ ("bswap %0" : "=r" (dst) : "0" (src)); \
} while (0) #endif
#ifdefined (__GNUC__) && ! defined (NO_ASM) \
&& defined (__amd64__) && GMP_LIMB_BITS == 64 #define BSWAP_LIMB(dst, src) \ do { \
__asm__ ("bswap %q0" : "=r" (dst) : "0" (src)); \
} while (0) #endif
#ifdefined (__GNUC__) && ! defined (__INTEL_COMPILER) \
&& ! defined (NO_ASM) && defined (__ia64) && GMP_LIMB_BITS == 64 #define BSWAP_LIMB(dst, src) \ do { \
__asm__ ("mux1 %0 = %1, @rev" : "=r" (dst) : "r" (src)); \
} while (0) #endif
/* Define ieee_double_extract and _GMP_IEEE_FLOATS.
Bit field packing is "implementation defined" according to C99, which leaves us at the compiler's mercy here. For some systems packing is defined in the ABI (eg. x86). In any case so far it seems universal that little endian systems pack from low to high, and big endian from high to low within the given type.
Within the fields we rely on the integer endianness being the same as the float endianness, this is true everywhere we know of and it'd be a fairly
strange system that did anything else. */
/* Use (4.0 * ...) instead of (2.0 * ...) to work around buggy compilers
that don't convert ulong->double correctly (eg. SunOS 4 native cc). */ #define MP_BASE_AS_DOUBLE (4.0 * ((mp_limb_t) 1 << (GMP_NUMB_BITS - 2))) /* Maximum number of limbs it will take to store any `double'.
We assume doubles have 53 mantissa bits. */ #define LIMBS_PER_DOUBLE ((53 + GMP_NUMB_BITS - 2) / GMP_NUMB_BITS + 1)
__GMP_DECLSPEC int __gmp_extract_double (mp_ptr, double);
/* DOUBLE_NAN_INF_ACTION executes code a_nan if x is a NaN, or executes a_inf if x is an infinity. Both are considered unlikely values, for
branch prediction. */
#if HAVE_DOUBLE_VAX_D || HAVE_DOUBLE_VAX_G || HAVE_DOUBLE_CRAY_CFP /* no nans or infs in these formats */ #define DOUBLE_NAN_INF_ACTION(x, a_nan, a_inf) \ do { } while (0) #endif
#ifndef DOUBLE_NAN_INF_ACTION /* Unknown format, try something generic. NaN should be "unordered", so x!=x.
Inf should be bigger than DBL_MAX. */ #define DOUBLE_NAN_INF_ACTION(x, a_nan, a_inf) \ do { \
{ \ if (UNLIKELY ((x) != (x))) \
{ a_nan; } \ elseif (UNLIKELY ((x) > DBL_MAX || (x) < -DBL_MAX)) \
{ a_inf; } \
} \
} while (0) #endif
/* On m68k, x86 and amd64, gcc (and maybe other compilers) can hold doubles in the coprocessor, which means a bigger exponent range than normal, and depending on the rounding mode, a bigger mantissa than normal. (See "Disappointments" in the gcc manual.) FORCE_DOUBLE stores and fetches "d" through memory to force any rounding and overflows to occur.
On amd64, and on x86s with SSE2, gcc (depending on options) uses the xmm registers, where there's no such extra precision and no need for the FORCE_DOUBLE. We don't bother to detect this since the present uses for FORCE_DOUBLE are only in test programs and default generic C code.
Not quite sure that an "automatic volatile" will use memory, but it does in gcc. An asm("":"=m"(d):"0"(d)) can't be used to trick gcc, since apparently matching operands like "0" are only allowed on a register output. gcc 3.4 warns about this, though in fact it and past versions
seem to put the operand through memory as hoped. */
#if (HAVE_HOST_CPU_FAMILY_m68k || HAVE_HOST_CPU_FAMILY_x86 \
|| defined (__amd64__)) #define FORCE_DOUBLE(d) \ do { volatiledouble __gmp_force = (d); (d) = __gmp_force; } while (0) #else #define FORCE_DOUBLE(d) do { } while (0) #endif
/* BIT1 means a result value in bit 1 (second least significant bit), with a zero bit representing +1 and a one bit representing -1. Bits other than bit 1 are garbage. These are meant to be kept in "int"s, and casts are used to ensure the expressions are "int"s even if a and/or b might be other types.
JACOBI_TWOS_U_BIT1 and JACOBI_RECIP_UU_BIT1 are used in mpn_jacobi_base and their speed is important. Expressions are used rather than conditionals to accumulate sign changes, which effectively means XORs
instead of conditional JUMPs. */
/* (a/0), with a signed; is 1 if a=+/-1, 0 otherwise */ #define JACOBI_S0(a) (((a) == 1) | ((a) == -1))
/* (a/0), with a unsigned; is 1 if a=+/-1, 0 otherwise */ #define JACOBI_U0(a) ((a) == 1)
/* FIXME: JACOBI_LS0 and JACOBI_0LS are the same, so delete one and
come up with a better name. */
/* (a/0), with a given by low and size;
is 1 if a=+/-1, 0 otherwise */ #define JACOBI_LS0(alow,asize) \
(((asize) == 1 || (asize) == -1) && (alow) == 1)
/* (a/0), with a an mpz_t;
fetch of low limb always valid, even if size is zero */ #define JACOBI_Z0(a) JACOBI_LS0 (PTR(a)[0], SIZ(a))
/* (0/b), with b unsigned; is 1 if b=1, 0 otherwise */ #define JACOBI_0U(b) ((b) == 1)
/* (0/b), with b unsigned; is 1 if b=+/-1, 0 otherwise */ #define JACOBI_0S(b) ((b) == 1 || (b) == -1)
/* (0/b), with b given by low and size; is 1 if b=+/-1, 0 otherwise */ #define JACOBI_0LS(blow,bsize) \
(((bsize) == 1 || (bsize) == -1) && (blow) == 1)
/* Convert a bit1 to +1 or -1. */ #define JACOBI_BIT1_TO_PN(result_bit1) \
(1 - ((int) (result_bit1) & 2))
/* (2/b), with b unsigned and odd; is (-1)^((b^2-1)/8) which is 1 if b==1,7mod8 or -1 if b==3,5mod8 and
hence obtained from (b>>1)^b */ #define JACOBI_TWO_U_BIT1(b) \
((int) (((b) >> 1) ^ (b)))
/* (2/b)^twos, with b unsigned and odd */ #define JACOBI_TWOS_U_BIT1(twos, b) \
((int) ((twos) << 1) & JACOBI_TWO_U_BIT1 (b))
/* (2/b)^twos, with b unsigned and odd */ #define JACOBI_TWOS_U(twos, b) \
(JACOBI_BIT1_TO_PN (JACOBI_TWOS_U_BIT1 (twos, b)))
/* (-1/b), with b odd (signed or unsigned);
is (-1)^((b-1)/2) */ #define JACOBI_N1B_BIT1(b) \
((int) (b))
/* (a/b) effect due to sign of a: signed/unsigned, b odd;
is (-1/b) if a<0, or +1 if a>=0 */ #define JACOBI_ASGN_SU_BIT1(a, b) \
((((a) < 0) << 1) & JACOBI_N1B_BIT1(b))
/* (a/b) effect due to sign of b: signed/signed;
is -1 if a and b both negative, +1 otherwise */ #define JACOBI_BSGN_SS_BIT1(a, b) \
((((a)<0) & ((b)<0)) << 1)
/* (a/b) effect due to sign of b: signed/mpz;
is -1 if a and b both negative, +1 otherwise */ #define JACOBI_BSGN_SZ_BIT1(a, b) \
JACOBI_BSGN_SS_BIT1 (a, SIZ(b))
/* (a/b) effect due to sign of b: mpz/signed;
is -1 if a and b both negative, +1 otherwise */ #define JACOBI_BSGN_ZS_BIT1(a, b) \
JACOBI_BSGN_SZ_BIT1 (b, a)
/* (a/b) reciprocity to switch to (b/a), a,b both unsigned and odd; is (-1)^((a-1)*(b-1)/4), which means +1 if either a,b==1mod4, or -1 if both a,b==3mod4, achieved in bit 1 by a&b. No ASSERT()s about a,b odd because this is used in a couple of places with only bit 1 of a or b
valid. */ #define JACOBI_RECIP_UU_BIT1(a, b) \
((int) ((a) & (b)))
/* Strip low zero limbs from {b_ptr,b_size} by incrementing b_ptr and decrementing b_size. b_low should be b_ptr[0] on entry, and will be updated for the new b_ptr. result_bit1 is updated according to the
factors of 2 stripped, as per (a/2). */ #define JACOBI_STRIP_LOW_ZEROS(result_bit1, a, b_ptr, b_size, b_low) \ do { \
ASSERT ((b_size) >= 1); \
ASSERT ((b_low) == (b_ptr)[0]); \
\ while (UNLIKELY ((b_low) == 0)) \
{ \
(b_size)--; \
ASSERT ((b_size) >= 1); \
(b_ptr)++; \
(b_low) = *(b_ptr); \
\
ASSERT (((a) & 1) != 0); \ if ((GMP_NUMB_BITS % 2) == 1) \
(result_bit1) ^= JACOBI_TWO_U_BIT1(a); \
} \
} while (0)
/* Set a_rem to {a_ptr,a_size} reduced modulo b, either using mod_1 or modexact_1_odd, but in either case leaving a_rem<b. b must be odd and unsigned. modexact_1_odd effectively calculates -a mod b, and result_bit1 is adjusted for the factor of -1.
The way mpn_modexact_1_odd sometimes bases its remainder on a_size and sometimes on a_size-1 means if GMP_NUMB_BITS is odd we can't know what factor to introduce into result_bit1, so for that case use mpn_mod_1 unconditionally.
FIXME: mpn_modexact_1_odd is more efficient, so some way to get it used for odd GMP_NUMB_BITS would be good. Perhaps it could mung its result,
or not skip a divide step, or something. */
staticinlineint
mpn_jacobi_finish (unsigned bits)
{ /* (a, b) = (1,0) or (0,1) */
ASSERT ( (bits & 14) == 0);
return 1-2*(bits & 1);
}
staticinlineunsigned
mpn_jacobi_update (unsigned bits, unsigned denominator, unsigned q)
{ /* FIXME: Could halve table size by not including the e bit in the * index, and instead xor when updating. Then the lookup would be * like * * bits ^= table[((bits & 30) << 2) + (denominator << 2) + q];
*/
/* For almost all calls, denominator is constant and quite often q is constant too. So use addition rather than or, so the compiler can put the constant part can into the offset of an indexed addressing instruction.
With constant denominator, the below table lookup is compiled to
C q in %edx, constant denominator = 1 movzbl table+4(%edx,%eax,8), %eax
One could maintain the state preshifted 3 bits, to save a shift here, but at least on x86, that's no real saving.
*/ return jacobi_table[(bits << 3) + (denominator << 2) + q];
}
/* Extract one numb, shifting count bits left ________ ________ |___xh___||___xl___| |____r____| >count <
The count includes any nail bits, so it should work fine if count is computed using count_leading_zeros. If GMP_NAIL_BITS > 0, all of xh, xl and r include nail bits. Must have 0 < count < GMP_LIMB_BITS.
FIXME: Omit masking with GMP_NUMB_MASK, and let callers do that for those calls where the count high bits of xh may be non-zero.
*/
/* The matrix non-negative M = (u, u'; v,v') keeps track of the reduction (a;b) = M (alpha; beta) where alpha, beta are smaller than a, b. The determinant must always be one, so that M has an inverse (v', -u'; -v, u). Elements always fit in GMP_NUMB_BITS - 1
bits. */ struct hgcd_matrix1
{
mp_limb_t u[2][2];
};
/* Definitions for mpn_set_str and mpn_get_str */ struct powers
{
mp_ptr p; /* actual power value */
mp_size_t n; /* # of limbs at p */
mp_size_t shift; /* weight of lowest limb, in limb base B */
size_t digits_in_base; /* number of corresponding digits */ int base;
}; typedefstruct powers powers_t; #define mpn_str_powtab_alloc(n) ((n) + 2 * GMP_LIMB_BITS) /* FIXME: This can perhaps be trimmed */ #define mpn_dc_set_str_itch(n) ((n) + GMP_LIMB_BITS) #define mpn_dc_get_str_itch(n) ((n) + GMP_LIMB_BITS)
/* Compute the number of base-b digits corresponding to nlimbs limbs, rounding
down. */ #define DIGITS_IN_BASE_PER_LIMB(res, nlimbs, b) \ do { \
mp_limb_t _ph, _dummy; \
umul_ppmm (_ph, _dummy, \
mp_bases[b].logb2, GMP_NUMB_BITS * (mp_limb_t) (nlimbs));\
res = _ph; \
} while (0)
/* Compute the number of limbs corresponding to ndigits base-b digits, rounding
up. */ #define LIMBS_PER_DIGIT_IN_BASE(res, ndigits, b) \ do { \
mp_limb_t _ph, _dummy; \
umul_ppmm (_ph, _dummy, mp_bases[b].log2b, (mp_limb_t) (ndigits)); \
res = 8 * _ph / GMP_NUMB_BITS + 2; \
} while (0)
/* Set n to the number of significant digits an mpf of the given _mp_prec field, in the given base. This is a rounded up value, designed to ensure there's enough digits to reproduce all the guaranteed part of the value.
There are prec many limbs, but the high might be only "1" so forget it and just count prec-1 limbs into chars. +1 rounds that upwards, and a further +1 is because the limbs usually won't fall on digit boundaries.
FIXME: If base is a power of 2 and the bits per digit divides GMP_LIMB_BITS then the +2 is unnecessary. This happens always for
base==2, and in base==16 with the current 32 or 64 bit limb sizes. */
#define MPF_SIGNIFICANT_DIGITS(n, base, prec) \ do { \
size_t rawn; \
ASSERT (base >= 2 && base < numberof (mp_bases)); \
DIGITS_IN_BASE_PER_LIMB (rawn, (prec) - 1, base); \
n = rawn + 2; \
} while (0)
/* Decimal point string, from the current C locale. Needs <langinfo.h> for nl_langinfo and constants, preferably with _GNU_SOURCE defined to get DECIMAL_POINT from glibc, and needs <locale.h> for localeconv, each under their respective #if HAVE_FOO_H.
GLIBC recommends nl_langinfo because getting only one facet can be
faster, apparently. */
/* DECIMAL_POINT seems to need _GNU_SOURCE defined to get it from glibc. */ #if HAVE_NL_LANGINFO && defined (DECIMAL_POINT) #define GMP_DECIMAL_POINT (nl_langinfo (DECIMAL_POINT)) #endif /* RADIXCHAR is deprecated, still in unix98 or some such. */ #if HAVE_NL_LANGINFO && defined (RADIXCHAR) && ! defined (GMP_DECIMAL_POINT) #define GMP_DECIMAL_POINT (nl_langinfo (RADIXCHAR)) #endif /* localeconv is slower since it returns all locale stuff */ #if HAVE_LOCALECONV && ! defined (GMP_DECIMAL_POINT) #define GMP_DECIMAL_POINT (localeconv()->decimal_point) #endif #if ! defined (GMP_DECIMAL_POINT) #define GMP_DECIMAL_POINT (".") #endif
struct doprnt_params_t { int base; /* negative for upper case */ int conv; /* choices above */ constchar *expfmt; /* exponent format */ int exptimes4; /* exponent multiply by 4 */ char fill; /* character */ int justify; /* choices above */ int prec; /* prec field, or -1 for all digits */ int showbase; /* choices above */ int showpoint; /* if radix point always shown */ int showtrailing; /* if trailing zeros wanted */ char sign; /* '+', ' ', or '\0' */ int width; /* width field */
};
/* "buf" is a __gmp_allocate_func block of "alloc" many bytes. The first "size" of these have been written. "alloc > size" is maintained, so there's room to store a '\0' at the end. "result" is where the
application wants the final block pointer. */ struct gmp_asprintf_t { char **result; char *buf;
size_t size;
size_t alloc;
};
/* If a realloc is necessary, use twice the size actually required, so as to
avoid repeated small reallocs. */ #define GMP_ASPRINTF_T_NEED(d, n) \ do { \
size_t alloc, newsize, newalloc; \
ASSERT ((d)->alloc >= (d)->size + 1); \
\
alloc = (d)->alloc; \
newsize = (d)->size + (n); \ if (alloc <= newsize) \
{ \
newalloc = 2*newsize; \
(d)->alloc = newalloc; \
(d)->buf = __GMP_REALLOCATE_FUNC_TYPE ((d)->buf, \
alloc, newalloc, char); \
} \
} while (0)
__GMP_DECLSPEC int __gmp_asprintf_memory (struct gmp_asprintf_t *, constchar *, size_t);
__GMP_DECLSPEC int __gmp_asprintf_reps (struct gmp_asprintf_t *, int, int);
__GMP_DECLSPEC int __gmp_asprintf_final (struct gmp_asprintf_t *);
/* buf is where to write the next output, and size is how much space is left there. If the application passed size==0 then that's what we'll have
here, and nothing at all should be written. */ struct gmp_snprintf_t { char *buf;
size_t size;
};
/* Add the bytes printed by the call to the total retval, or bail out on an
error. */ #define DOPRNT_ACCUMULATE(call) \ do { \ int __ret; \
__ret = call; \ if (__ret == -1) \ goto error; \
retval += __ret; \
} while (0) #define DOPRNT_ACCUMULATE_FUN(fun, params) \ do { \
ASSERT ((fun) != NULL); \
DOPRNT_ACCUMULATE ((*(fun)) params); \
} while (0)
/* Enhancement: The "mod" and "gcd_1" functions below could have __GMP_ATTRIBUTE_PURE, but currently (gcc 3.3) that's not supported on function pointers, only actual functions. It probably doesn't make much difference to the gmp code, since hopefully we arrange calls so there's
no great need for the compiler to move things around. */
#if WANT_FAT_BINARY && (HAVE_HOST_CPU_FAMILY_x86 || HAVE_HOST_CPU_FAMILY_x86_64) /* NOTE: The function pointers in this struct are also in CPUVEC_FUNCS_LIST in mpn/x86/x86-defs.m4 and mpn/x86_64/x86_64-defs.m4. Be sure to update
those when changing here. */ struct cpuvec_t {
DECL_add_n ((*add_n));
DECL_addlsh1_n ((*addlsh1_n));
DECL_addlsh2_n ((*addlsh2_n));
DECL_addmul_1 ((*addmul_1));
DECL_addmul_2 ((*addmul_2));
DECL_bdiv_dbm1c ((*bdiv_dbm1c));
DECL_cnd_add_n ((*cnd_add_n));
DECL_cnd_sub_n ((*cnd_sub_n));
DECL_com ((*com));
DECL_copyd ((*copyd));
DECL_copyi ((*copyi));
DECL_divexact_1 ((*divexact_1));
DECL_divrem_1 ((*divrem_1));
DECL_gcd_11 ((*gcd_11));
DECL_lshift ((*lshift));
DECL_lshiftc ((*lshiftc));
DECL_mod_1 ((*mod_1));
DECL_mod_1_1p ((*mod_1_1p));
DECL_mod_1_1p_cps ((*mod_1_1p_cps));
DECL_mod_1s_2p ((*mod_1s_2p));
DECL_mod_1s_2p_cps ((*mod_1s_2p_cps));
DECL_mod_1s_4p ((*mod_1s_4p));
DECL_mod_1s_4p_cps ((*mod_1s_4p_cps));
DECL_mod_34lsub1 ((*mod_34lsub1));
DECL_modexact_1c_odd ((*modexact_1c_odd));
DECL_mul_1 ((*mul_1));
DECL_mul_basecase ((*mul_basecase));
DECL_mullo_basecase ((*mullo_basecase));
DECL_preinv_divrem_1 ((*preinv_divrem_1));
DECL_preinv_mod_1 ((*preinv_mod_1));
DECL_redc_1 ((*redc_1));
DECL_redc_2 ((*redc_2));
DECL_rshift ((*rshift));
DECL_sqr_basecase ((*sqr_basecase));
DECL_sub_n ((*sub_n));
DECL_sublsh1_n ((*sublsh1_n));
DECL_submul_1 ((*submul_1));
mp_size_t mul_toom22_threshold;
mp_size_t mul_toom33_threshold;
mp_size_t sqr_toom2_threshold;
mp_size_t sqr_toom3_threshold;
mp_size_t bmod_1_to_mod_1_threshold;
};
__GMP_DECLSPEC externstruct cpuvec_t __gmpn_cpuvec;
__GMP_DECLSPEC externint __gmpn_cpuvec_initialized; #endif/* x86 fat binary */
__GMP_DECLSPEC void __gmpn_cpuvec_init (void);
/* Get a threshold "field" from __gmpn_cpuvec, running __gmpn_cpuvec_init()
if that hasn't yet been done (to establish the right values). */ #define CPUVEC_THRESHOLD(field) \
((LIKELY (__gmpn_cpuvec_initialized) ? 0 : (__gmpn_cpuvec_init (), 0)), \
__gmpn_cpuvec.field)
#if TUNE_PROGRAM_BUILD /* Some extras wanted when recompiling some .c files for use by the tune program. Not part of a normal build.
It's necessary to keep these thresholds as #defines (just to an identically named variable), since various defaults are established based on #ifdef in the .c files. For some this is not so (the defaults are
instead established above), but all are done this way for consistency. */
/* A native mpn_sqr_basecase is not tuned and SQR_BASECASE_THRESHOLD should
remain as zero (always use it). */ #if ! HAVE_NATIVE_mpn_sqr_basecase #undef SQR_BASECASE_THRESHOLD #define SQR_BASECASE_THRESHOLD sqr_basecase_threshold extern mp_size_t sqr_basecase_threshold; #endif
#undef FFT_TABLE_ATTRS #define FFT_TABLE_ATTRS extern mp_size_t mpn_fft_table[2][MPN_FFT_TABLE_SIZE]; #define FFT_TABLE3_SIZE 2000 /* generous space for tuning */ externstruct fft_table_nk mpn_fft_table3[2][FFT_TABLE3_SIZE];
/* Sizes the tune program tests up to, used in a couple of recompilations. */ #undef MUL_TOOM22_THRESHOLD_LIMIT #undef MUL_TOOM33_THRESHOLD_LIMIT #undef MULLO_BASECASE_THRESHOLD_LIMIT #undef SQRLO_BASECASE_THRESHOLD_LIMIT #undef SQRLO_DC_THRESHOLD_LIMIT #undef SQR_TOOM3_THRESHOLD_LIMIT #define SQR_TOOM2_MAX_GENERIC 200 #define MUL_TOOM22_THRESHOLD_LIMIT 700 #define MUL_TOOM33_THRESHOLD_LIMIT 700 #define SQR_TOOM3_THRESHOLD_LIMIT 400 #define MUL_TOOM44_THRESHOLD_LIMIT 1000 #define SQR_TOOM4_THRESHOLD_LIMIT 1000 #define MUL_TOOM6H_THRESHOLD_LIMIT 1100 #define SQR_TOOM6_THRESHOLD_LIMIT 1100 #define MUL_TOOM8H_THRESHOLD_LIMIT 1200 #define SQR_TOOM8_THRESHOLD_LIMIT 1200 #define MULLO_BASECASE_THRESHOLD_LIMIT 200 #define SQRLO_BASECASE_THRESHOLD_LIMIT 200 #define SQRLO_DC_THRESHOLD_LIMIT 400 #define GET_STR_THRESHOLD_LIMIT 150 #define FAC_DSC_THRESHOLD_LIMIT 2048
#endif/* TUNE_PROGRAM_BUILD */
#ifdefined (__cplusplus)
} #endif
/* FIXME: Make these itch functions less conservative. Also consider making them dependent on just 'an', and compute the allocation directly from 'an'
instead of via n. */
/* toom22/toom2: Scratch need is 2*(an + k), k is the recursion depth. k is ths smallest k such that ceil(an/2^k) < MUL_TOOM22_THRESHOLD. which implies that k = bitsize of floor ((an-1)/(MUL_TOOM22_THRESHOLD-1)) = 1 + floor (log_2 (floor ((an-1)/(MUL_TOOM22_THRESHOLD-1))))
*/ #define mpn_toom22_mul_itch(an, bn) \
(2 * ((an) + GMP_NUMB_BITS)) #define mpn_toom2_sqr_itch(an) \
(2 * ((an) + GMP_NUMB_BITS))
/* toom33/toom3: Scratch need is 5an/2 + 10k, k is the recursion depth. We use 3an + C, so that we can use a smaller constant.
*/ #define mpn_toom33_mul_itch(an, bn) \
(3 * (an) + GMP_NUMB_BITS) #define mpn_toom3_sqr_itch(an) \
(3 * (an) + GMP_NUMB_BITS)
/* toom33/toom3: Scratch need is 8an/3 + 13k, k is the recursion depth. We use 3an + C, so that we can use a smaller constant.
*/ #define mpn_toom44_mul_itch(an, bn) \
(3 * (an) + GMP_NUMB_BITS) #define mpn_toom4_sqr_itch(an) \
(3 * (an) + GMP_NUMB_BITS)
/* A little helper for a null-terminated __gmp_allocate_func string. The destructor ensures it's freed even if an exception is thrown. The len field is needed by the destructor, and can be used by anyone else to avoid a second strlen pass over the data.
Since our input is a C string, using strlen is correct. Perhaps it'd be more C++-ish style to use std::char_traits<char>::length, but char_traits
isn't available in gcc 2.95.4. */
¤ Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.0.98Bemerkung:
(vorverarbeitet am 2026-04-26)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.