/* Create tuned thresholds for various algorithms.
Copyright 1999-2003, 2005, 2006, 2008-2017 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version.
or
* the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version.
or both in parallel, as here.
The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received copies of the GNU General Public License and the GNU Lesser General Public License along with the GNU MP Library. If not,
see https://www.gnu.org/licenses/. */
/* Usage: tuneup [-t] [-t] [-p precision]
-t turns on some diagnostic traces, a second -t turns on more traces.
Notes:
The code here isn't a vision of loveliness, mainly because it's subject to ongoing changes according to new things wanting to be tuned, and practical requirements of systems tested.
Sometimes running the program twice produces slightly different results. This is probably because there's so little separating algorithms near their crossover, and on that basis it should make little or no difference to the final speed of the relevant routines, but nothing has been done to check that carefully.
Algorithm:
The thresholds are determined as follows. A crossover may not be a single size but rather a range where it oscillates between method A or method B faster. If the threshold is set making B used where A is faster (or vice versa) that's bad. Badness is the percentage time lost and total badness is the sum of this over all sizes measured. The threshold is set to minimize total badness.
Suppose, as sizes increase, method B becomes faster than method A. The effect of the rule is that, as you look at increasing sizes, isolated points where B is faster are ignored, but when it's consistently faster, or faster on balance, then the threshold is set there. The same result is obtained thinking in the other direction of A becoming faster at smaller sizes.
In practice the thresholds tend to be chosen to bring on the next algorithm fairly quickly.
This rule is attractive because it's got a basis in reason and is fairly easy to implement, but no work has been done to actually compare it in absolute terms to other possibilities.
Implementation:
In a normal library build the thresholds are constants. To tune them selected objects are recompiled with the thresholds as global variables instead. #define TUNE_PROGRAM_BUILD does this, with help from code at the end of gmp-impl.h, and rules in tune/Makefile.am.
MUL_TOOM22_THRESHOLD for example uses a recompiled mpn_mul_n. The threshold is set to "size+1" to avoid karatsuba, or to "size" to use one level, but recurse into the basecase.
MUL_TOOM33_THRESHOLD makes use of the tuned MUL_TOOM22_THRESHOLD value. Other routines in turn will make use of both of those. Naturally the dependants must be tuned first.
In a couple of cases, like DIVEXACT_1_THRESHOLD, there's no recompiling, just a threshold based on comparing two routines (mpn_divrem_1 and mpn_divexact_1), and no further use of the value determined.
Flags like USE_PREINV_MOD_1 or JACOBI_BASE_METHOD are even simpler, being just comparisons between certain routines on representative data.
Shortcuts are applied when native (assembler) versions of routines exist. For instance a native mpn_sqr_basecase is assumed to be always faster than mpn_mul_basecase, with no measuring.
No attempt is made to tune within assembler routines, for instance DIVREM_1_NORM_THRESHOLD. An assembler mpn_divrem_1 is expected to be written and tuned all by hand. Assembler routines that might have hard limits are recompiled though, to make them accept a bigger range of sizes than normal, eg. mpn_sqr_basecase to compare against mpn_toom2_sqr.
Limitations:
The FFTs aren't subject to the same badness rule as the other thresholds, so each k is probably being brought on a touch early. This isn't likely to make a difference, and the simpler probing means fewer tests.
struct dat_t {
mp_size_t size; double d;
} *dat = NULL; int ndat = 0; int allocdat = 0;
/* This is not defined if mpn_sqr_basecase doesn't declare a limit. In that
case use zero here, which for params.max_size means no limit. */ #ifndef TUNE_SQR_TOOM2_MAX #define TUNE_SQR_TOOM2_MAX 0 #endif
struct param_t { constchar *name;
speed_function_t function;
speed_function_t function2; double step_factor; /* how much to step relatively */ int step; /* how much to step absolutely */ double function_fudge; /* multiplier for "function" speeds */ int stop_since_change; double stop_factor;
mp_size_t min_size; int min_is_always;
mp_size_t max_size;
mp_size_t check_size;
mp_size_t size_extra;
#define DATA_HIGH_LT_R 1 #define DATA_HIGH_GE_R 2 int data_high;
int noprint;
};
/* These are normally undefined when false, which suits "#if" fine.
But give them zero values so they can be used in plain C "if"s. */ #ifndef UDIV_PREINV_ALWAYS #define UDIV_PREINV_ALWAYS 0 #endif #ifndef HAVE_NATIVE_mpn_divexact_1 #define HAVE_NATIVE_mpn_divexact_1 0 #endif #ifndef HAVE_NATIVE_mpn_div_qr_1n_pi1 #define HAVE_NATIVE_mpn_div_qr_1n_pi1 0 #endif #ifndef HAVE_NATIVE_mpn_divrem_1 #define HAVE_NATIVE_mpn_divrem_1 0 #endif #ifndef HAVE_NATIVE_mpn_divrem_2 #define HAVE_NATIVE_mpn_divrem_2 0 #endif #ifndef HAVE_NATIVE_mpn_mod_1 #define HAVE_NATIVE_mpn_mod_1 0 #endif #ifndef HAVE_NATIVE_mpn_mod_1_1p #define HAVE_NATIVE_mpn_mod_1_1p 0 #endif #ifndef HAVE_NATIVE_mpn_modexact_1_odd #define HAVE_NATIVE_mpn_modexact_1_odd 0 #endif #ifndef HAVE_NATIVE_mpn_preinv_divrem_1 #define HAVE_NATIVE_mpn_preinv_divrem_1 0 #endif #ifndef HAVE_NATIVE_mpn_preinv_mod_1 #define HAVE_NATIVE_mpn_preinv_mod_1 0 #endif #ifndef HAVE_NATIVE_mpn_sqr_basecase #define HAVE_NATIVE_mpn_sqr_basecase 0 #endif
mp_limb_t
randlimb_half (void)
{
mp_limb_t n;
mpn_random (&n, 1);
n &= GMP_NUMB_HALFMASK;
n += (n==0); return n;
}
/* Add an entry to the end of the dat[] array, reallocing to make it bigger
if necessary. */ void
add_dat (mp_size_t size, double d)
{ #define ALLOCDAT_STEP 500
/* Return the threshold size based on the data accumulated. */
mp_size_t
analyze_dat (int final)
{ double x, min_x; int j, min_j;
/* If the threshold is set at dat[0].size, any positive values are bad. */
x = 0.0; for (j = 0; j < ndat; j++) if (dat[j].d > 0.0)
x += dat[j].d;
if (option_trace >= 2 && final)
{
printf ("\n");
printf ("x is the sum of the badness from setting thresh at given size\n");
printf (" (minimum x is sought)\n");
printf ("size=%ld first x=%.4f\n", (long) dat[j].size, x);
}
min_x = x;
min_j = 0;
/* When stepping to the next dat[j].size, positive values are no longer bad (so subtracted), negative values become bad (so add the absolute
value, meaning subtract). */ for (j = 0; j < ndat; x -= dat[j].d, j++)
{ if (option_trace >= 2 && final)
printf ("size=%ld x=%.4f\n", (long) dat[j].size, x);
if (x < min_x)
{
min_x = x;
min_j = j;
}
}
return min_j;
}
/* Measuring for recompiled mpn/generic/div_qr_1.c,
* mpn/generic/divrem_1.c, mpn/generic/mod_1.c and mpz/fac_ui.c */
*threshold = s.size;
t2 = tuneup_measure (param->function2, param, &s); if (t1 == -1.0 || t2 == -1.0)
{
printf ("Oops, can't run both functions at size %ld\n",
(long) s.size);
abort ();
}
t1 *= param->function_fudge;
/* ask that t2 is at least 4% below t1 */ if (t1 < t2*1.04)
{ if (option_trace)
printf ("function2 never enough faster: t1=%.9f t2=%.9f\n", t1, t2);
*threshold = MP_SIZE_T_MAX; if (! param->noprint)
print_define (param->name, *threshold); return;
}
if (option_trace >= 2)
printf ("function2 enough faster at size=%ld: t1=%.9f t2=%.9f\n",
(long) s.size, t1, t2);
}
if (! param->noprint || option_trace)
print_define_start (param->name);
/* FIXME: check minimum size requirements are met, possibly by just checking for the -1 returns from the speed functions.
*/
/* using method A at this size */
*threshold = s.size+1;
ti = tuneup_measure (param->function, param, &s); if (ti == -1.0)
abort ();
ti *= param->function_fudge;
/* using method B at this size */
*threshold = s.size;
tiplus1 = tuneup_measure (param->function2, param, &s); if (tiplus1 == -1.0)
abort ();
/* Calculate the fraction by which the one or the other routine is
slower. */ if (tiplus1 >= ti)
d = (tiplus1 - ti) / tiplus1; /* negative */ else
d = (tiplus1 - ti) / ti; /* positive */
/* Stop if the last time method i was faster was more than a
certain number of measurements ago. */ #define STOP_SINCE_POSITIVE 200 if (d >= 0)
since_positive = 0; else if (++since_positive > STOP_SINCE_POSITIVE)
{ if (option_trace >= 1)
printf ("stopped due to since_positive (%d)\n",
STOP_SINCE_POSITIVE); break;
}
/* Stop if method A has become slower by a certain factor. */ if (ti >= tiplus1 * param->stop_factor)
{ if (option_trace >= 1)
printf ("stopped due to ti >= tiplus1 * factor (%.1f)\n",
param->stop_factor); break;
}
/* Stop if the threshold implied hasn't changed in a certain number of measurements. (It's this condition that usually
stops the loop.) */ if (thresh_idx != new_thresh_idx)
since_thresh_change = 0, thresh_idx = new_thresh_idx; else if (++since_thresh_change > param->stop_since_change)
{ if (option_trace >= 1)
printf ("stopped due to since_thresh_change (%d)\n",
param->stop_since_change); break;
}
/* Stop if the threshold implied is more than a certain number of
measurements ago. */ #define STOP_SINCE_AFTER 500 if (ndat - thresh_idx > STOP_SINCE_AFTER)
{ if (option_trace >= 1)
printf ("stopped due to ndat - thresh_idx > amount (%d)\n",
STOP_SINCE_AFTER); break;
}
/* Stop when the size limit is reached before the end of the crossover, but only show this as an error for >= the default max size. FIXME: Maybe should make it a param choice whether this is
an error. */ if (s.size >= param->max_size && param->max_size >= DEFAULT_MAX_SIZE)
{
fprintf (stderr, "%s\n", param->name);
fprintf (stderr, "sizes %ld to %ld total %d measurements\n",
(long) dat[0].size, (long) dat[ndat-1].size, ndat);
fprintf (stderr, " max size reached before end of crossover\n"); break;
}
}
if (option_trace >= 1)
printf ("sizes %ld to %ld total %d measurements\n",
(long) dat[0].size, (long) dat[ndat-1].size, ndat);
*threshold = dat[analyze_dat (1)].size;
if (param->min_is_always)
{ if (*threshold == param->min_size)
*threshold = 0;
}
if (! param->noprint || option_trace)
print_define_end (param->name, *threshold);
}
/* Time N different FUNCTIONS with the same parameters and size, to select the fastest. Since *_METHOD defines start numbering from one, if functions[i] is fastest, the value of the define is i+1. Also output a comment with speedup compared to the next fastest function. The NAME argument is used only for trace output.
Returns the index of the fastest function.
*/ int
one_method (int n, speed_function_t *functions, constchar *name, constchar *define, conststruct param_t *param)
{ double *t; int i; int method; int method_runner_up;
TMP_DECL;
TMP_MARK;
t = (double*) TMP_ALLOC (n * sizeof (*t));
for (i = 0; i < n; i++)
{
t[i] = tuneup_measure (functions[i], param, &s); if (option_trace >= 1)
printf ("size=%ld, %s, method %d %.9f\n",
(long) s.size, name, i + 1, t[i]); if (t[i] == -1.0)
{
printf ("Oops, can't measure all %s methods\n", name);
abort ();
}
}
method = 0; for (i = 1; i < n; i++) if (t[i] < t[method])
method = i;
method_runner_up = (method == 0); for (i = 0; i < n; i++) if (i != method && t[i] < t[method_runner_up])
method_runner_up = i;
/* Special probing for the fft thresholds. The size restrictions on the FFTs mean the graph of time vs size has a step effect. See this for example using
The current approach is to compare routines at the midpoint of relevant steps. Arguably a more sophisticated system of threshold data is wanted
if this step effect remains. */
/* mpn_mul_fft requires pl a multiple of 2^k limbs, but with N=pl*BIT_PER_MP_LIMB it internally also pads out so N/2^k is a multiple
of 2^(k-1) bits. */
#if 1
{ /* Use plain one() mechanism, for some reasonable initial values of k. The advantage is that we don't depend on mpn_fft_table3, which can therefore
leave it completely uninitialized. */
staticstruct param_t param;
mp_size_t thres, best_thres; int best_k; char buf[20];
/* Compare mpn_mul_1 to whatever fast exact single-limb division we have. This is currently mpn_divexact_1, but will become mpn_bdiv_1_qr_pi2 or somesuch.
This is used in get_str and set_str. */ void
relspeed_div_1_vs_mul_1 (void)
{ const size_t max_opsize = 100;
mp_size_t n; long j;
mp_limb_t rp[max_opsize];
mp_limb_t ap[max_opsize]; double multime, divtime;
mpn_random (ap, max_opsize);
multime = 0; for (n = max_opsize; n > 1; n--)
{
mpn_mul_1 (rp, ap, n, MP_BASES_BIG_BASE_10);
speed_starttime (); for (j = speed_precision; j != 0 ; j--)
mpn_mul_1 (rp, ap, n, MP_BASES_BIG_BASE_10);
multime += speed_endtime () / n;
}
divtime = 0; for (n = max_opsize; n > 1; n--)
{ /* Make input divisible for good measure. */
ap[n - 1] = mpn_mul_1 (ap, ap, n - 1, MP_BASES_BIG_BASE_10);
/* Start karatsuba from 4, since the Cray t90 ieee code is much faster at 2,
giving wrong results. */ void
tune_mul_n (void)
{ staticstruct param_t param;
mp_size_t next_toom_start; int something_changed;
param.function = speed_mpn_mul_n;
param.name = "MUL_TOOM22_THRESHOLD";
param.min_size = MAX (4, MPN_TOOM22_MUL_MINSIZE);
param.max_size = MUL_TOOM22_THRESHOLD_LIMIT-1;
one (&mul_toom22_threshold, ¶m);
param.noprint = 1;
/* Threshold sequence loop. Disable functions that would be used in a very
narrow range, re-measuring things when that happens. */
something_changed = 1; while (something_changed)
{
something_changed = 0;
next_toom_start = mul_toom22_threshold;
if (mul_toom33_threshold != 0)
{
param.name = "MUL_TOOM33_THRESHOLD";
param.min_size = MAX (next_toom_start, MPN_TOOM33_MUL_MINSIZE);
param.max_size = MUL_TOOM33_THRESHOLD_LIMIT-1;
one (&mul_toom33_threshold, ¶m);
/* Use ratio 5/6 when measuring, the middle of the range 2/3 to 1. */
param.function = speed_mpn_toom43_for_toom54_mul;
param.function2 = speed_mpn_toom54_for_toom43_mul;
param.name = "MUL_TOOM43_TO_TOOM54_THRESHOLD";
param.min_size = MPN_TOOM54_MUL_MINSIZE * 6 / 5;
one (&thres, ¶m);
mul_toom43_to_toom54_threshold = thres * 5 / 6;
print_define ("MUL_TOOM43_TO_TOOM54_THRESHOLD", mul_toom43_to_toom54_threshold);
}
/* Start the basecase from 3, since 1 is a special case, and if mul_basecase is faster only at size==2 then we don't want to bother with extra code
just for that. Start karatsuba from 4 same as MUL above. */
if (! HAVE_NATIVE_mpn_sqr_basecase
&& sqr_toom2_threshold < sqr_basecase_threshold)
{ /* Karatsuba becomes faster than mul_basecase before sqr_basecase does. Arrange for the expression "BELOW_THRESHOLD (un, SQR_TOOM2_THRESHOLD))" which selects mpn_sqr_basecase in mpn_sqr to be false, by setting SQR_TOOM2_THRESHOLD to zero, making
SQR_BASECASE_THRESHOLD the toom2 threshold. */
/* Threshold sequence loop. Disable functions that would be used in a very
narrow range, re-measuring things when that happens. */
something_changed = 1; while (something_changed)
{
something_changed = 0;
next_toom_start = MAX (sqr_toom2_threshold, sqr_basecase_threshold);
sqr_toom3_threshold = SQR_TOOM3_THRESHOLD_LIMIT;
param.name = "SQR_TOOM3_THRESHOLD";
param.min_size = MAX (next_toom_start, MPN_TOOM3_SQR_MINSIZE);
param.max_size = SQR_TOOM3_THRESHOLD_LIMIT-1;
one (&sqr_toom3_threshold, ¶m);
next_toom_start = MAX (next_toom_start, sqr_toom3_threshold);
if (sqr_toom4_threshold != 0)
{
param.name = "SQR_TOOM4_THRESHOLD";
sqr_toom4_threshold = SQR_TOOM4_THRESHOLD_LIMIT;
param.min_size = MAX (next_toom_start, MPN_TOOM4_SQR_MINSIZE);
param.max_size = SQR_TOOM4_THRESHOLD_LIMIT-1;
one (&sqr_toom4_threshold, ¶m);
/* In tune_powm_sec we compute the table used by the win_size function. The cutoff points are in exponent bits, disregarding other operand sizes. It is not possible to use the one framework since it currently uses a granularity of full limbs.
*/
/* This win_size replaces the variant in the powm code, allowing us to
control k in the k-ary algorithms. */ int winsize; int
win_size (mp_bitcnt_t eb)
{ return winsize;
}
/* How about taking the M operand size into account?
An operation R=powm(B,E,N) will take time O(log(E)*M(log(N))) (assuming B = O(M)).
Using k-ary and no sliding window, the precomputation will need time O(2^(k-1)*M(log(N))) and the main computation will need O(log(E)*S(N)) + O(log(E)/k*M(N)), for the squarings, multiplications, respectively.
An operation R=powm_sec(B,E,N) will take time like powm.
Using k-ary, the precomputation will need time O(2^k*M(log(N))) and the main computation will need O(log(E)*S(N)) + O(log(E)/k*M(N)) + O(log(E)/k*2^k*log(N)), for the squarings, multiplications, and full
table reads, respectively. */
printf ("#define POWM_SEC_TABLE ");
/* For nbits == 1, we should always use k == 1, so no need to tune that. Starting with nbits == 2 also ensure that nbits always is
larger than the windowsize k+1. */ for (nbits = 2; nbits <= n_max * GMP_NUMB_BITS; )
{
n = (nbits - 1) / GMP_NUMB_BITS + 1;
/* Generate E such that sliding-window for k and k+1 works equally
well/poorly (but sliding is not used in powm_sec, of course). */ for (i = 0; i < n; i++)
ep[i] = ~CNST_LIMB(0);
winsize = k; for (i = 0; i < n_measurements; i++)
{
speed_starttime ();
mpn_sec_powm (rp, bp, n, ep, nbits, mp, n, tp);
ttab[i] = speed_endtime ();
}
tk = median (ttab, n_measurements);
winsize = k + 1;
speed_starttime (); for (i = 0; i < n_measurements; i++)
{
speed_starttime ();
mpn_sec_powm (rp, bp, n, ep, nbits, mp, n, tp);
ttab[i] = speed_endtime ();
}
tkp1 = median (ttab, n_measurements); /* printf ("testing: %ld, %d", nbits, k, ep[n-1]); printf (" %10.5f %10.5f\n", tk, tkp1);
*/ if (tkp1 < tk)
{ if (possible_nbits_cutoff)
{ /* Two consecutive sizes indicate k increase, obey. */
/* Must always have x[k] >= k */
ASSERT_ALWAYS (possible_nbits_cutoff >= k);
if (k > 1)
printf (",");
printf ("%ld", (long) possible_nbits_cutoff);
k++;
possible_nbits_cutoff = 0;
} else
{ /* One measurement indicate k increase, save nbits for further
consideration. */ /* The new larger k gets used for sizes > the cutoff value, hence the cutoff should be one less than the
smallest size where it gives a speedup. */
possible_nbits_cutoff = nbits - 1;
}
} else
possible_nbits_cutoff = 0;
/* size_extra==1 reflects the fact that with high<divisor one division is always skipped. Forcing high<divisor while testing ensures consistency while stepping through sizes, ie. that size-1 divides will be done each time.
min_size==2 and min_is_always are used so that if plain division is only better at size==1 then don't bother including that code just for that
case, instead go with preinv always and get a size saving. */
void
tune_divrem_1 (void)
{ /* plain version by default */
tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1;
/* No support for tuning native assembler code, do that by hand and put the results in the .asm file, there's no need for such thresholds to
appear in gmp-mparam.h. */ if (HAVE_NATIVE_mpn_divrem_1) return;
if (GMP_NAIL_BITS != 0)
{
print_define_remark ("DIVREM_1_NORM_THRESHOLD", MP_SIZE_T_MAX, "no preinv with nails");
print_define_remark ("DIVREM_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX, "no preinv with nails"); return;
}
/* Tune for the integer part of mpn_divrem_1. This will very possibly be a bit out for the fractional part, but that's too bad, the integer part
is more important. */
{ staticstruct param_t param;
param.name = "DIVREM_1_NORM_THRESHOLD";
DIV_1_PARAMS;
s.r = randlimb_norm ();
param.function = speed_mpn_divrem_1_tune;
one (&divrem_1_norm_threshold, ¶m);
}
{ staticstruct param_t param;
param.name = "DIVREM_1_UNNORM_THRESHOLD";
DIV_1_PARAMS;
s.r = randlimb_half ();
param.function = speed_mpn_divrem_1_tune;
one (&divrem_1_unnorm_threshold, ¶m);
}
}
void
tune_mod_1 (void)
{ /* No support for tuning native assembler code, do that by hand and put the results in the .asm file, there's no need for such thresholds to
appear in gmp-mparam.h. */ if (HAVE_NATIVE_mpn_mod_1) return;
if (GMP_NAIL_BITS != 0)
{
print_define_remark ("MOD_1_NORM_THRESHOLD", MP_SIZE_T_MAX, "no preinv with nails");
print_define_remark ("MOD_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX, "no preinv with nails"); return;
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.