author    | Torbjörn Granlund <tg@gmplib.org>  | 2012-09-16 22:27:20 +0200
committer | Jim Meyering <meyering@redhat.com> | 2012-10-04 21:34:24 +0200
commit    | a38890b27fe296c582d1266cb5b3b02f186132dc (patch)
tree      | 728a4c1f16eabddcf3caae0e7b225385dc0925a1
parent    | d836ea73400d66ecfa5ecf1287b983976342c4c4 (diff)
download  | coreutils-a38890b27fe296c582d1266cb5b3b02f186132dc.tar.xz
factor: new much-improved implementation; not yet integrated
* src/factor-ng.c: New file, from nt-factor.
* src/longlong.h: New file.
* NEWS (Improvements): Mention the upcoming improvements.
Co-authored-by: Niels Möller
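
The speed claim in the NEWS entry below rests largely on doing trial division without hardware division: each small prime p is stored with its inverse modulo the word base B and with lim = (B-1)/p, so a single multiplication gives both the divisibility test and the quotient. The following is a minimal standalone sketch of that trick for 64-bit words, not the patch's exact code (the patch seeds the inverse from binvert_table; this sketch seeds it with p itself):

```c
/* Illustrative sketch of the divisibility test used in factor-ng.c:
   for odd p, with binv = p^-1 (mod 2^64) and lim = (2^64-1)/p, the
   product t*binv (mod 2^64) is <= lim exactly when p divides t, and
   in that case it equals t/p.  */
#include <stdint.h>
#include <stdio.h>

/* Compute p^-1 mod 2^64 by Newton iteration; p must be odd.  */
static uint64_t
binvert (uint64_t p)
{
  uint64_t inv = p;              /* correct to 3 bits, since p*p == 1 (mod 8) */
  for (int i = 0; i < 5; i++)    /* 3 -> 6 -> 12 -> 24 -> 48 -> 96 bits */
    inv = inv * (2 - inv * p);
  return inv;
}

/* Divide out every factor p from *t, counting them, with no division
   in the loop.  */
static unsigned int
divide_out (uint64_t *t, uint64_t p)
{
  uint64_t binv = binvert (p);
  uint64_t lim = UINT64_MAX / p;
  unsigned int count = 0;
  for (;;)
    {
      uint64_t q = *t * binv;    /* the quotient if p | *t, otherwise > lim */
      if (q > lim)
        break;
      *t = q;
      count++;
    }
  return count;
}

int
main (void)
{
  uint64_t t = 2012 * 2012;      /* 2^4 * 503^2 */
  printf ("multiplicity of 503: %u\n", divide_out (&t, 503));   /* prints 2 */
  printf ("cofactor: %llu\n", (unsigned long long) t);          /* prints 16 */
  return 0;
}
```

The patch applies the same identity eight primes at a time (the DIVBLOCK macro) to keep the inner loop at a few cycles per prime.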
-rw-r--r-- | NEWS            |    7
-rw-r--r-- | src/factor-ng.c | 2388
-rw-r--r-- | src/longlong.h  | 2156
3 files changed, 4551 insertions, 0 deletions
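
The prime tests and Pollard rho code in the patch keep residues in Montgomery ("redc") form, i.e. multiplied by the word base B modulo n, so each modular multiplication needs only word multiplications and a conditional correction (the patch's mulredc and redcify). A rough illustration follows, assuming a compiler with the unsigned __int128 extension (GCC/Clang on 64-bit targets) to stand in for the patch's umul_ppmm; the names mirror the patch's, but the bodies are simplified:

```c
#include <stdint.h>
#include <stdio.h>

/* ni = n^-1 mod 2^64, n odd (same Newton iteration as in the sketch above). */
static uint64_t
binvert (uint64_t n)
{
  uint64_t inv = n;
  for (int i = 0; i < 5; i++)
    inv = inv * (2 - inv * n);
  return inv;
}

/* Montgomery product: a*b / 2^64 mod n, for a, b < n already in redc form. */
static uint64_t
mulredc (uint64_t a, uint64_t b, uint64_t n, uint64_t ni)
{
  unsigned __int128 r = (unsigned __int128) a * b;
  uint64_t rl = (uint64_t) r, rh = (uint64_t) (r >> 64);
  uint64_t q = rl * ni;                              /* q*n == rl (mod 2^64) */
  uint64_t th = (uint64_t) (((unsigned __int128) q * n) >> 64);
  uint64_t x = rh - th;                              /* (r - q*n) / 2^64 */
  return rh < th ? x + n : x;                        /* fold the borrow back */
}

/* Enter redc form: x * 2^64 mod n. */
static uint64_t
redcify (uint64_t x, uint64_t n)
{
  return (uint64_t) (((unsigned __int128) x << 64) % n);
}

int
main (void)
{
  uint64_t n = 1000000007;                 /* any odd modulus */
  uint64_t ni = binvert (n);
  uint64_t a = redcify (123456789, n);
  uint64_t b = redcify (987654321, n);
  uint64_t c = mulredc (a, b, n, ni);      /* product, still in redc form */
  /* Leaving redc form is one mulredc by plain 1; prints a*b mod n. */
  printf ("%llu\n", (unsigned long long) mulredc (c, 1, n, ni));
  return 0;
}
```

In the patch, mulredc2 extends the same reduction to two-word moduli, which is why the most significant bit of the modulus must be clear there.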
@@ -50,6 +50,13 @@ GNU coreutils NEWS -*- outline -*- ** Improvements + factor's core has been rewritten for speed and increased range. + It can now factor numbers up to 2^128, even without GMP support. + Its speed is from a few times better (for small numbers) to over + 10,000 times better (just below 2^64). The new code also runs a + deterministic primality test for each prime factor, not just a + probabilistic test. + seq is now up to 70 times faster than it was in coreutils-8.19 and prior, but only with non-negative whole numbers, an increment of 1, and no format-changing options. diff --git a/src/factor-ng.c b/src/factor-ng.c new file mode 100644 index 000000000..776aaa653 --- /dev/null +++ b/src/factor-ng.c @@ -0,0 +1,2388 @@ +/* Factoring of uintmax_t numbers. + + Contributed to the GNU project by Torbjörn Granlund and Niels Möller + Contains code from GNU MP. + +Copyright 1995, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2005, 2009, 2012 +Free Software Foundation, Inc. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; either version 3 of the License, or (at your option) any later +version. + +This program is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program. If not, see http://www.gnu.org/licenses/. */ + +/* Efficiently factor numbers that fit in one or two words (word = uintmax_t), + or, with GMP, numbers of any size. + + Code organisation: + + There are several variants of many functions, for handling one word, two + words, and GMP's mpz_t type. If the one-word variant is called foo, the + two-word variant will be foo2, and the one for mpz_t will be mp_foo. In + some cases, the plain function variants will handle both one-word and + two-word numbers, evidenced by function arguments. + + The factoring code for two words will fall into the code for one word when + progress allows that. + + Using GMP is optional. Define HAVE_GMP to make this code include GMP + factoring code. The GMP factoring code is based on GMP's demos/factorize.c + (last synched 2012-09-07). The GMP-based factoring code will stay in GMP + factoring code even if numbers get small enough for using the two-word + code. + + Algorithm: + + (1) Perform trial division using a small primes table, but without hardware + division since the primes table store inverses modulo the word base. + (The GMP variant of this code doesn't make use of the precomputed + inverses, but instead relies on GMP for fast divisibility testing.) + (2) Check the nature of any non-factored part using Miller-Rabin for + detecting composites, and Lucas for detecting primes. + (3) Factor any remaining composite part using the Pollard-Brent rho + algorithm or the SQUFOF algorithm, checking status of found factors + again using Miller-Rabin and Lucas. + + We prefer using Hensel norm in the divisions, not the more familiar + Euclidian norm, since the former leads to much faster code. In the + Pollard-Brent rho code and the the prime testing code, we use Montgomery's + trick of multiplying all n-residues by the word base, allowing cheap Hensel + reductions mod n. + + Improvements: + + * Use modular inverses also for exact division in the Lucas code, and + elsewhere. 
A problem is to locate the inverses not from an index, but + from a prime. We might instead compute the inverse on-the-fly. + + * Tune trial division table size (not forgetting that this is a standalone + program where the table will be read from disk for each invocation). + + * Implement less naive powm, using k-ary exponentiation for k = 3 or + perhaps k = 4. + + * Try to speed trial division code for single uintmax_t numbers, i.e., the + code using DIVBLOCK. It currently runs at 2 cycles per prime (Intel SBR, + IBR), 3 cycles per prime (AMD Stars) and 5 cycles per prime (AMD BD) when + using gcc 4.6 and 4.7. Some software pipelining should help; 1, 2, and 4 + respectively cycles ought to be possible. + + * The redcify function could be vastly improved by using (plain Euclidian) + pre-inversion (such as GMP's invert_limb) and udiv_qrnnd_preinv (from + GMP's gmp-impl.h). The redcify2 function could be vastly improved using + similar methoods. These functions currently dominate run time when using + the -w option. +*/ + +#include <config.h> + +#include <stdlib.h> +#include <stdio.h> +#include <inttypes.h> +#include <errno.h> +#include <assert.h> +#include <ctype.h> +#include <string.h> /* for memmove */ +#include <unistd.h> /* for getopt */ +#include <stdbool.h> + +#include "system.h" + +#if HAVE_GMP +# include <gmp.h> +#endif + +#ifndef STAT_SQUFOF +# define STAT_SQUFOF 0 +#endif + +#ifndef USE_LONGLONG_H +# define USE_LONGLONG_H 1 +#endif + +#if USE_LONGLONG_H + +/* Make definitions for longlong.h to make it do what it can do for us */ +# define W_TYPE_SIZE 64 /* bitcount for uintmax_t */ +# define UWtype uintmax_t +# define UHWtype unsigned long int +# undef UDWtype +# if HAVE_ATTRIBUTE_MODE +typedef unsigned int UQItype __attribute__ ((mode (QI))); +typedef int SItype __attribute__ ((mode (SI))); +typedef unsigned int USItype __attribute__ ((mode (SI))); +typedef int DItype __attribute__ ((mode (DI))); +typedef unsigned int UDItype __attribute__ ((mode (DI))); +# else +typedef unsigned char UQItype; +typedef long SItype; +typedef unsigned long int USItype; +# if HAVE_LONG_LONG +typedef long long int DItype; +typedef unsigned long long int UDItype; +# else /* Assume `long' gives us a wide enough type. Needed for hppa2.0w. 
*/ +typedef long int DItype; +typedef unsigned long int UDItype; +# endif +# endif +# define LONGLONG_STANDALONE /* Don't require GMP's longlong.h mdep files */ +# define ASSERT(x) /* FIXME make longlong.h really standalone */ +# define __GMP_DECLSPEC /* FIXME make longlong.h really standalone */ +# define __clz_tab factor_clz_tab /* Rename to avoid glibc collision */ +# ifndef __GMP_GNUC_PREREQ +# define __GMP_GNUC_PREREQ(a,b) 1 +# endif +# if _ARCH_PPC +# define HAVE_HOST_CPU_FAMILY_powerpc 1 +# endif +# include "longlong.h" +# ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB +const unsigned char factor_clz_tab[129] = +{ + 1,2,3,3,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 9 +}; +# endif + +#else /* not USE_LONGLONG_H */ + +# define W_TYPE_SIZE (8 * sizeof (uintmax_t)) +# define __ll_B ((uintmax_t) 1 << (W_TYPE_SIZE / 2)) +# define __ll_lowpart(t) ((uintmax_t) (t) & (__ll_B - 1)) +# define __ll_highpart(t) ((uintmax_t) (t) >> (W_TYPE_SIZE / 2)) + +#endif + +enum alg_type { ALG_POLLARD_RHO = 1, ALG_SQUFOF = 2 }; + +static enum alg_type alg; + +/* 2*3*5*7*11...*101 is 128 bits, and has 26 prime factors */ +#define MAX_NFACTS 26 + +struct factors +{ + uintmax_t plarge[2]; /* Can have a single large factor */ + uintmax_t p[MAX_NFACTS]; + unsigned char e[MAX_NFACTS]; + unsigned char nfactors; +}; + +#if HAVE_GMP +struct mp_factors +{ + mpz_t *p; + unsigned long int *e; + unsigned long int nfactors; +}; +#endif + +static void factor (uintmax_t, uintmax_t, struct factors *); + +#ifndef umul_ppmm +# define umul_ppmm(w1, w0, u, v) \ + do { \ + uintmax_t __x0, __x1, __x2, __x3; \ + unsigned long int __ul, __vl, __uh, __vh; \ + uintmax_t __u = (u), __v = (v); \ + \ + __ul = __ll_lowpart (__u); \ + __uh = __ll_highpart (__u); \ + __vl = __ll_lowpart (__v); \ + __vh = __ll_highpart (__v); \ + \ + __x0 = (uintmax_t) __ul * __vl; \ + __x1 = (uintmax_t) __ul * __vh; \ + __x2 = (uintmax_t) __uh * __vl; \ + __x3 = (uintmax_t) __uh * __vh; \ + \ + __x1 += __ll_highpart (__x0);/* this can't give carry */ \ + __x1 += __x2; /* but this indeed can */ \ + if (__x1 < __x2) /* did we get it? */ \ + __x3 += __ll_B; /* yes, add it in the proper pos. */ \ + \ + (w1) = __x3 + __ll_highpart (__x1); \ + (w0) = (__x1 << W_TYPE_SIZE / 2) + __ll_lowpart (__x0); \ + } while (0) +#endif + +#if !defined(udiv_qrnnd) || defined(UDIV_NEEDS_NORMALIZATION) +/* Define our own, not needing normalization. This function is + currently not performance critical, so keep it simple. Similar to + the mod macro below. 
*/ +# undef udiv_qrnnd +# define udiv_qrnnd(q, r, n1, n0, d) \ + do { \ + uintmax_t __d1, __d0, __q, __r1, __r0; \ + \ + assert ((n1) < (d)); \ + __d1 = (d); __d0 = 0; \ + __r1 = (n1); __r0 = (n0); \ + __q = 0; \ + for (unsigned int __i = W_TYPE_SIZE; __i > 0; __i--) \ + { \ + rsh2 (__d1, __d0, __d1, __d0, 1); \ + __q <<= 1; \ + if (ge2 (__r1, __r0, __d1, __d0)) \ + { \ + __q++; \ + sub_ddmmss (__r1, __r0, __r1, __r0, __d1, __d0); \ + } \ + } \ + (r) = __r0; \ + (q) = __q; \ + } while (0) +#endif + +#if !defined (add_ssaaaa) +# define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + do { \ + uintmax_t _add_x; \ + _add_x = (al) + (bl); \ + (sh) = (ah) + (bh) + (_add_x < (al)); \ + (sl) = _add_x; \ + } while (0) +#endif + +#define rsh2(rh, rl, ah, al, cnt) \ + do { \ + (rl) = ((ah) << (W_TYPE_SIZE - (cnt))) | ((al) >> (cnt)); \ + (rh) = (ah) >> (cnt); \ + } while (0) + +#define lsh2(rh, rl, ah, al, cnt) \ + do { \ + (rh) = ((ah) << cnt) | ((al) >> (W_TYPE_SIZE - (cnt))); \ + (rl) = (al) << (cnt); \ + } while (0) + +#define ge2(ah, al, bh, bl) \ + ((ah) > (bh) || ((ah) == (bh) && (al) >= (bl))) + +#define gt2(ah, al, bh, bl) \ + ((ah) > (bh) || ((ah) == (bh) && (al) > (bl))) + +#ifndef sub_ddmmss +# define sub_ddmmss(rh, rl, ah, al, bh, bl) \ + do { \ + uintmax_t _cy; \ + _cy = (al) < (bl); \ + (rl) = (al) - (bl); \ + (rh) = (ah) - (bh) - _cy; \ + } while (0) +#endif + +#ifndef count_leading_zeros +# define count_leading_zeros(count, x) do { \ + uintmax_t __clz_x = (x); \ + unsigned int __clz_c; \ + for (__clz_c = 0; \ + (__clz_x & ((uintmax_t) 0xff << (W_TYPE_SIZE - 8))) == 0; \ + __clz_c += 8) \ + __clz_x <<= 8; \ + for (; (intmax_t)__clz_x >= 0; __clz_c++) \ + __clz_x <<= 1; \ + (count) = __clz_c; \ + } while (0) +#endif + +#ifndef count_trailing_zeros +# define count_trailing_zeros(count, x) do { \ + uintmax_t __ctz_x = (x); \ + unsigned int __ctz_c = 0; \ + while ((__ctz_x & 1) == 0) \ + { \ + __ctz_x >>= 1; \ + __ctz_c++; \ + } \ + (count) = __ctz_c; \ + } while (0) +#endif + +/* Requires that a < n and b <= n */ +#define submod(r,a,b,n) \ + do { \ + uintmax_t _t = - (uintmax_t) (a < b); \ + (r) = ((n) & _t) + (a) - (b); \ + } while (0) + +#define addmod(r,a,b,n) \ + submod((r), (a), ((n) - (b)), (n)) + +/* Modular two-word addition and subtraction. For performance reasons, the + most significant bit of n1 must be clear. The destination variables must be + distinct from the mod operand. */ +#define addmod2(r1, r0, a1, a0, b1, b0, n1, n0) \ + do { \ + add_ssaaaa ((r1), (r0), (a1), (a0), (b1), (b0)); \ + if (ge2 ((r1), (r0), (n1), (n0))) \ + sub_ddmmss ((r1), (r0), (r1), (r0), (n1), (n0)); \ + } while (0) +#define submod2(r1, r0, a1, a0, b1, b0, n1, n0) \ + do { \ + sub_ddmmss ((r1), (r0), (a1), (a0), (b1), (b0)); \ + if ((intmax_t) (r1) < 0) \ + add_ssaaaa ((r1), (r0), (r1), (r0), (n1), (n0)); \ + } while (0) + +#define HIGHBIT_TO_MASK(x) \ + (((intmax_t)-1 >> 1) < 0 \ + ? (uintmax_t)((intmax_t)(x) >> (W_TYPE_SIZE - 1)) \ + : ((x) & ((uintmax_t) 1 << (W_TYPE_SIZE - 1)) \ + ? UINTMAX_MAX : (uintmax_t) 0)) + +/* Compute r = a mod d, where r = <*t1,retval>, a = <a1,a0>, d = <d1,d0>. + Requires that d1 != 0. 
*/ +static uintmax_t +mod2 (uintmax_t *r1, uintmax_t a1, uintmax_t a0, uintmax_t d1, uintmax_t d0) +{ + int cntd, cnta; + + assert (d1 != 0); + + if (a1 == 0) + { + *r1 = 0; + return a0; + } + + count_leading_zeros (cntd, d1); + count_leading_zeros (cnta, a1); + int cnt = cntd - cnta; + lsh2 (d1, d0, d1, d0, cnt); + for (int i = 0; i < cnt; i++) + { + if (ge2 (a1, a0, d1, d0)) + sub_ddmmss (a1, a0, a1, a0, d1, d0); + rsh2 (d1, d0, d1, d0, 1); + } + + *r1 = a1; + return a0; +} + +static uintmax_t _GL_ATTRIBUTE_CONST +gcd_odd (uintmax_t a, uintmax_t b) +{ + if ( (b & 1) == 0) + { + uintmax_t t = b; + b = a; + a = t; + } + if (a == 0) + return b; + + /* Take out least significant one bit, to make room for sign */ + b >>= 1; + + for (;;) + { + uintmax_t t; + uintmax_t bgta; + + while ((a & 1) == 0) + a >>= 1; + a >>= 1; + + t = a - b; + if (t == 0) + return (a << 1) + 1; + + bgta = HIGHBIT_TO_MASK (t); + + /* b <-- min (a, b) */ + b += (bgta & t); + + /* a <-- |a - b| */ + a = (t ^ bgta) - bgta; + } +} + +static uintmax_t +gcd2_odd (uintmax_t *r1, uintmax_t a1, uintmax_t a0, uintmax_t b1, uintmax_t b0) +{ + while ((a0 & 1) == 0) + rsh2 (a1, a0, a1, a0, 1); + while ((b0 & 1) == 0) + rsh2 (b1, b0, b1, b0, 1); + + for (;;) + { + if ((b1 | a1) == 0) + { + *r1 = 0; + return gcd_odd (b0, a0); + } + + if (gt2 (a1, a0, b1, b0)) + { + sub_ddmmss (a1, a0, a1, a0, b1, b0); + do + rsh2 (a1, a0, a1, a0, 1); + while ((a0 & 1) == 0); + } + else if (gt2 (b1, b0, a1, a0)) + { + sub_ddmmss (b1, b0, b1, b0, a1, a0); + do + rsh2 (b1, b0, b1, b0, 1); + while ((b0 & 1) == 0); + } + else + break; + } + + *r1 = a1; + return a0; +} + +static void +factor_insert_multiplicity (struct factors *factors, + uintmax_t prime, unsigned int m) +{ + unsigned int nfactors = factors->nfactors; + uintmax_t *p = factors->p; + unsigned char *e = factors->e; + + /* Locate position for insert new or increment e. */ + int i; + for (i = nfactors - 1; i >= 0; i--) + { + if (p[i] <= prime) + break; + } + + if (i < 0 || p[i] != prime) + { + for (int j = nfactors - 1; j > i; j--) + { + p[j + 1] = p[j]; + e[j + 1] = e[j]; + } + p[i + 1] = prime; + e[i + 1] = m; + factors->nfactors = nfactors + 1; + } + else + { + e[i] += m; + } +} + +#define factor_insert(f, p) factor_insert_multiplicity(f, p, 1) + +static void +factor_insert_large (struct factors *factors, + uintmax_t p1, uintmax_t p0) +{ + if (p1 > 0) + { + assert (factors->plarge[1] == 0); + factors->plarge[0] = p0; + factors->plarge[1] = p1; + } + else + factor_insert (factors, p0); +} + +#if HAVE_GMP +static void mp_factor (mpz_t, struct mp_factors *); + +static void +mp_factor_init (struct mp_factors *factors) +{ + factors->p = malloc (1); + factors->e = malloc (1); + factors->nfactors = 0; +} + +static void +mp_factor_clear (struct mp_factors *factors) +{ + for (unsigned int i = 0; i < factors->nfactors; i++) + mpz_clear (factors->p[i]); + + free (factors->p); + free (factors->e); +} + +static void +mp_factor_insert (struct mp_factors *factors, mpz_t prime) +{ + unsigned long int nfactors = factors->nfactors; + mpz_t *p = factors->p; + unsigned long int *e = factors->e; + long i; + + /* Locate position for insert new or increment e. 
*/ + for (i = nfactors - 1; i >= 0; i--) + { + if (mpz_cmp (p[i], prime) <= 0) + break; + } + + if (i < 0 || mpz_cmp (p[i], prime) != 0) + { + p = realloc (p, (nfactors + 1) * sizeof p[0]); + e = realloc (e, (nfactors + 1) * sizeof e[0]); + + mpz_init (p[nfactors]); + for (long j = nfactors - 1; j > i; j--) + { + mpz_set (p[j + 1], p[j]); + e[j + 1] = e[j]; + } + mpz_set (p[i + 1], prime); + e[i + 1] = 1; + + factors->p = p; + factors->e = e; + factors->nfactors = nfactors + 1; + } + else + { + e[i] += 1; + } +} + +static void +mp_factor_insert_ui (struct mp_factors *factors, unsigned long int prime) +{ + mpz_t pz; + + mpz_init_set_ui (pz, prime); + mp_factor_insert (factors, pz); + mpz_clear (pz); +} +#endif /* HAVE_GMP */ + + +#define P(a,b,c,d) a, +static const unsigned char primes_diff[] = { +#include "primes.h" +0,0,0,0,0,0,0 /* 7 sentinels for 8-way loop */ +}; +#undef P + +#define PRIMES_PTAB_ENTRIES (sizeof(primes_diff) / sizeof(primes_diff[0]) - 8 + 1) + +#define P(a,b,c,d) b, +static const unsigned char primes_diff8[] = { +#include "primes.h" +0,0,0,0,0,0,0 /* 7 sentinels for 8-way loop */ +}; +#undef P + +struct primes_dtab +{ + uintmax_t binv, lim; +}; + +#define P(a,b,c,d) {c,d}, +static const struct primes_dtab primes_dtab[] = { +#include "primes.h" +{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0} /* 7 sentinels for 8-way loop */ +}; +#undef P + +/* This flag is honoured just in the GMP code. */ +static int flag_verbose = 0; + +/* Prove primality or run probabilistic tests. */ +static int flag_prove_primality = 1; + +/* Number of Miller-Rabin tests to run when not proving primality. */ +#define MR_REPS 25 + +#define LIKELY(cond) __builtin_expect ((cond) != 0, 1) +#define UNLIKELY(cond) __builtin_expect ((cond) != 0, 0) + +static void +factor_insert_refind (struct factors *factors, uintmax_t p, unsigned int i, + unsigned int off) +{ + for (unsigned int j = 0; j < off; j++) + p += primes_diff[i + j]; + factor_insert (factors, p); +} + +/* Trial division with odd primes uses the following trick. + + Let p be an odd prime, and B = 2^{W_TYPE_SIZE}. For simplicity, + consider the case t < B (this is the second loop below). + + From our tables we get + + binv = p^{-1} (mod B) + lim = floor ( (B-1) / p ). + + First assume that t is a multiple of p, t = q * p. Then 0 <= q <= + lim (and all quotients in this range occur for some t). + + Then t = q * p is true also (mod B), and p is invertible we get + + q = t * binv (mod B). + + Next, assume that t is *not* divisible by p. Since multiplication + by binv (mod B) is a one-to-one mapping, + + t * binv (mod B) > lim, + + because all the smaller values are already taken. + + This can be summed up by saying that the function + + q(t) = binv * t (mod B) + + is a permutation of the range 0 <= t < B, with the curious property + that it maps the multiples of p onto the range 0 <= q <= lim, in + order, and the non-multiples of p onto the range lim < q < B. 
+ */ + +static uintmax_t +factor_using_division (uintmax_t *t1p, uintmax_t t1, uintmax_t t0, + struct factors *factors) +{ + if (t0 % 2 == 0) + { + unsigned int cnt; + + if (t0 == 0) + { + count_trailing_zeros (cnt, t1); + t0 = t1 >> cnt; + t1 = 0; + cnt += W_TYPE_SIZE; + } + else + { + count_trailing_zeros (cnt, t0); + rsh2 (t1, t0, t1, t0, cnt); + } + + factor_insert_multiplicity (factors, 2, cnt); + } + + uintmax_t p = 3; + unsigned int i; + for (i = 0; t1 > 0 && i < PRIMES_PTAB_ENTRIES; i++) + { + for (;;) + { + uintmax_t q1, q0, hi, lo; + + q0 = t0 * primes_dtab[i].binv; + umul_ppmm (hi, lo, q0, p); + if (hi > t1) + break; + hi = t1 - hi; + q1 = hi * primes_dtab[i].binv; + if (LIKELY (q1 > primes_dtab[i].lim)) + break; + t1 = q1; t0 = q0; + factor_insert (factors, p); + } + p += primes_diff[i + 1]; + } + if (t1p) + *t1p = t1; + +#define DIVBLOCK(I) \ + do { \ + for (;;) \ + { \ + q = t0 * pd[I].binv; \ + if (LIKELY (q > pd[I].lim)) \ + break; \ + t0 = q; \ + factor_insert_refind (factors, p, i + 1, I); \ + } \ + } while (0) + + for (; i < PRIMES_PTAB_ENTRIES; i += 8) + { + uintmax_t q; + const struct primes_dtab *pd = &primes_dtab[i]; + DIVBLOCK (0); + DIVBLOCK (1); + DIVBLOCK (2); + DIVBLOCK (3); + DIVBLOCK (4); + DIVBLOCK (5); + DIVBLOCK (6); + DIVBLOCK (7); + + p += primes_diff8[i]; + if (p * p > t0) + break; + } + + return t0; +} + +#if HAVE_GMP +static void +mp_factor_using_division (mpz_t t, struct mp_factors *factors) +{ + mpz_t q; + unsigned long int p; + + if (flag_verbose > 0) + { + printf ("[trial division] "); + } + + mpz_init (q); + + p = mpz_scan1 (t, 0); + mpz_div_2exp (t, t, p); + while (p) + { + mp_factor_insert_ui (factors, 2); + --p; + } + + p = 3; + for (unsigned int i = 1; i <= PRIMES_PTAB_ENTRIES;) + { + if (! mpz_divisible_ui_p (t, p)) + { + p += primes_diff[i++]; + if (mpz_cmp_ui (t, p * p) < 0) + break; + } + else + { + mpz_tdiv_q_ui (t, t, p); + mp_factor_insert_ui (factors, p); + } + } + + mpz_clear (q); +} +#endif + +/* Entry i contains (2i+1)^(-1) mod 2^8. */ +static const unsigned char binvert_table[128] = +{ + 0x01, 0xAB, 0xCD, 0xB7, 0x39, 0xA3, 0xC5, 0xEF, + 0xF1, 0x1B, 0x3D, 0xA7, 0x29, 0x13, 0x35, 0xDF, + 0xE1, 0x8B, 0xAD, 0x97, 0x19, 0x83, 0xA5, 0xCF, + 0xD1, 0xFB, 0x1D, 0x87, 0x09, 0xF3, 0x15, 0xBF, + 0xC1, 0x6B, 0x8D, 0x77, 0xF9, 0x63, 0x85, 0xAF, + 0xB1, 0xDB, 0xFD, 0x67, 0xE9, 0xD3, 0xF5, 0x9F, + 0xA1, 0x4B, 0x6D, 0x57, 0xD9, 0x43, 0x65, 0x8F, + 0x91, 0xBB, 0xDD, 0x47, 0xC9, 0xB3, 0xD5, 0x7F, + 0x81, 0x2B, 0x4D, 0x37, 0xB9, 0x23, 0x45, 0x6F, + 0x71, 0x9B, 0xBD, 0x27, 0xA9, 0x93, 0xB5, 0x5F, + 0x61, 0x0B, 0x2D, 0x17, 0x99, 0x03, 0x25, 0x4F, + 0x51, 0x7B, 0x9D, 0x07, 0x89, 0x73, 0x95, 0x3F, + 0x41, 0xEB, 0x0D, 0xF7, 0x79, 0xE3, 0x05, 0x2F, + 0x31, 0x5B, 0x7D, 0xE7, 0x69, 0x53, 0x75, 0x1F, + 0x21, 0xCB, 0xED, 0xD7, 0x59, 0xC3, 0xE5, 0x0F, + 0x11, 0x3B, 0x5D, 0xC7, 0x49, 0x33, 0x55, 0xFF +}; + +/* Compute n^(-1) mod B, using a Newton iteration. */ +#define binv(inv,n) \ + do { \ + uintmax_t __n = (n); \ + uintmax_t __inv; \ + \ + __inv = binvert_table[(__n / 2) & 0x7F]; /* 8 */ \ + if (W_TYPE_SIZE > 8) __inv = 2 * __inv - __inv * __inv * __n; \ + if (W_TYPE_SIZE > 16) __inv = 2 * __inv - __inv * __inv * __n; \ + if (W_TYPE_SIZE > 32) __inv = 2 * __inv - __inv * __inv * __n; \ + \ + if (W_TYPE_SIZE > 64) \ + { \ + int __invbits = 64; \ + do { \ + __inv = 2 * __inv - __inv * __inv * __n; \ + __invbits *= 2; \ + } while (__invbits < W_TYPE_SIZE); \ + } \ + \ + (inv) = __inv; \ + } while (0) + +/* q = u / d, assuming d|u. 
*/ +#define divexact_21(q1, q0, u1, u0, d) \ + do { \ + uintmax_t _di, _q0; \ + binv (_di, (d)); \ + _q0 = (u0) * _di; \ + if ((u1) >= (d)) \ + { \ + uintmax_t _p1, _p0; \ + umul_ppmm (_p1, _p0, _q0, d); \ + (q1) = ((u1) - _p1) * _di; \ + (q0) = _q0; \ + } \ + else \ + { \ + (q0) = _q0; \ + (q1) = 0; \ + } \ + } while(0) + +/* x B (mod n). */ +#define redcify(r_prim, r, n) \ + do { \ + uintmax_t _redcify_q; \ + udiv_qrnnd (_redcify_q, r_prim, r, 0, n); \ + } while (0) + +/* x B^2 (mod n). Requires x > 0, n1 < B/2 */ +#define redcify2(r1, r0, x, n1, n0) \ + do { \ + uintmax_t _r1, _r0, _i; \ + if ((x) < (n1)) \ + { \ + _r1 = (x); _r0 = 0; \ + _i = W_TYPE_SIZE; \ + } \ + else \ + { \ + _r1 = 0; _r0 = (x); \ + _i = 2*W_TYPE_SIZE; \ + } \ + while (_i-- > 0) \ + { \ + lsh2 (_r1, _r0, _r1, _r0, 1); \ + if (ge2 (_r1, _r0, (n1), (n0))) \ + sub_ddmmss (_r1, _r0, _r1, _r0, (n1), (n0)); \ + } \ + (r1) = _r1; \ + (r0) = _r0; \ + } while (0) + +/* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B. + Both a and b must be in redc form, the result will be in redc form too. */ +static inline uintmax_t +mulredc (uintmax_t a, uintmax_t b, uintmax_t m, uintmax_t mi) +{ + uintmax_t rh, rl, q, th, tl, xh; + + umul_ppmm (rh, rl, a, b); + q = rl * mi; + umul_ppmm (th, tl, q, m); + xh = rh - th; + if (rh < th) + xh += m; + + return xh; +} + +/* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B. + Both a and b must be in redc form, the result will be in redc form too. + For performance reasons, the most significant bit of m must be clear. */ +static uintmax_t +mulredc2 (uintmax_t *r1p, + uintmax_t a1, uintmax_t a0, uintmax_t b1, uintmax_t b0, + uintmax_t m1, uintmax_t m0, uintmax_t mi) +{ + uintmax_t r1, r0, q, p1, p0, t1, t0, s1, s0; + mi = -mi; + assert ( (a1 >> (W_TYPE_SIZE - 1)) == 0); + assert ( (b1 >> (W_TYPE_SIZE - 1)) == 0); + assert ( (m1 >> (W_TYPE_SIZE - 1)) == 0); + + /* First compute a0 * <b1, b0> B^{-1} + +-----+ + |a0 b0| + +--+--+--+ + |a0 b1| + +--+--+--+ + |q0 m0| + +--+--+--+ + |q0 m1| + -+--+--+--+ + |r1|r0| 0| + +--+--+--+ + */ + umul_ppmm (t1, t0, a0, b0); + umul_ppmm (r1, r0, a0, b1); + q = mi * t0; + umul_ppmm (p1, p0, q, m0); + umul_ppmm (s1, s0, q, m1); + r0 += (t0 != 0); /* Carry */ + add_ssaaaa (r1, r0, r1, r0, 0, p1); + add_ssaaaa (r1, r0, r1, r0, 0, t1); + add_ssaaaa (r1, r0, r1, r0, s1, s0); + + /* Next, (a1 * <b1, b0> + <r1, r0> B^{-1} + +-----+ + |a1 b0| + +--+--+ + |r1|r0| + +--+--+--+ + |a1 b1| + +--+--+--+ + |q1 m0| + +--+--+--+ + |q1 m1| + -+--+--+--+ + |r1|r0| 0| + +--+--+--+ + */ + umul_ppmm (t1, t0, a1, b0); + umul_ppmm (s1, s0, a1, b1); + add_ssaaaa (t1, t0, t1, t0, 0, r0); + q = mi * t0; + add_ssaaaa (r1, r0, s1, s0, 0, r1); + umul_ppmm (p1, p0, q, m0); + umul_ppmm (s1, s0, q, m1); + r0 += (t0 != 0); /* Carry */ + add_ssaaaa (r1, r0, r1, r0, 0, p1); + add_ssaaaa (r1, r0, r1, r0, 0, t1); + add_ssaaaa (r1, r0, r1, r0, s1, s0); + + if (ge2 (r1, r0, m1, m0)) + sub_ddmmss (r1, r0, r1, r0, m1, m0); + + *r1p = r1; + return r0; +} + +static uintmax_t _GL_ATTRIBUTE_CONST +powm (uintmax_t b, uintmax_t e, uintmax_t n, uintmax_t ni, uintmax_t one) +{ + uintmax_t y = one; + + if (e & 1) + y = b; + + while (e != 0) + { + b = mulredc (b, b, n, ni); + e >>= 1; + + if (e & 1) + y = mulredc (y, b, n, ni); + } + + return y; +} + +static uintmax_t +powm2 (uintmax_t *r1m, + const uintmax_t *bp, const uintmax_t *ep, const uintmax_t *np, + uintmax_t ni, const uintmax_t *one) +{ + uintmax_t r1, r0, b1, b0, n1, n0; + unsigned int i; + uintmax_t e; + + 
b0 = bp[0]; + b1 = bp[1]; + n0 = np[0]; + n1 = np[1]; + + r0 = one[0]; + r1 = one[1]; + + for (e = ep[0], i = W_TYPE_SIZE; i > 0; i--, e >>= 1) + { + if (e & 1) + { + r0 = mulredc2 (r1m, r1, r0, b1, b0, n1, n0, ni); + r1 = *r1m; + } + b0 = mulredc2 (r1m, b1, b0, b1, b0, n1, n0, ni); + b1 = *r1m; + } + for (e = ep[1]; e > 0; e >>= 1) + { + if (e & 1) + { + r0 = mulredc2 (r1m, r1, r0, b1, b0, n1, n0, ni); + r1 = *r1m; + } + b0 = mulredc2 (r1m, b1, b0, b1, b0, n1, n0, ni); + b1 = *r1m; + } + *r1m = r1; + return r0; +} + +static bool _GL_ATTRIBUTE_CONST +millerrabin (uintmax_t n, uintmax_t ni, uintmax_t b, uintmax_t q, + unsigned int k, uintmax_t one) +{ + uintmax_t y = powm (b, q, n, ni, one); + + uintmax_t nm1 = n - one; /* -1, but in redc representation. */ + + if (y == one || y == nm1) + return true; + + for (unsigned int i = 1; i < k; i++) + { + y = mulredc (y, y, n, ni); + + if (y == nm1) + return true; + if (y == one) + return false; + } + return false; +} + +static bool +millerrabin2 (const uintmax_t *np, uintmax_t ni, const uintmax_t *bp, + const uintmax_t *qp, unsigned int k, const uintmax_t *one) +{ + uintmax_t y1, y0, nm1_1, nm1_0, r1m; + + y0 = powm2 (&r1m, bp, qp, np, ni, one); + y1 = r1m; + + if (y0 == one[0] && y1 == one[1]) + return true; + + sub_ddmmss (nm1_1, nm1_0, np[1], np[0], one[1], one[0]); + + if (y0 == nm1_0 && y1 == nm1_1) + return true; + + for (unsigned int i = 1; i < k; i++) + { + y0 = mulredc2 (&r1m, y1, y0, y1, y0, np[1], np[0], ni); + y1 = r1m; + + if (y0 == nm1_0 && y1 == nm1_1) + return true; + if (y0 == one[0] && y1 == one[1]) + return false; + } + return false; +} + +#if HAVE_GMP +static bool +mp_millerrabin (mpz_srcptr n, mpz_srcptr nm1, mpz_ptr x, mpz_ptr y, + mpz_srcptr q, unsigned long int k) +{ + mpz_powm (y, x, q, n); + + if (mpz_cmp_ui (y, 1) == 0 || mpz_cmp (y, nm1) == 0) + return true; + + for (unsigned long int i = 1; i < k; i++) + { + mpz_powm_ui (y, y, 2, n); + if (mpz_cmp (y, nm1) == 0) + return true; + if (mpz_cmp_ui (y, 1) == 0) + return false; + } + return false; +} +#endif + +/* Lucas' prime test. The number of iterations vary greatly, up to a few dozen + have been observed. The average seem to be about 2. */ +static bool +prime_p (uintmax_t n) +{ + int k; + bool is_prime; + uintmax_t a_prim, one, ni; + struct factors factors; + + if (n <= 1) + return false; + + /* We have already casted out small primes. */ + if (n < (uintmax_t) FIRST_OMITTED_PRIME * FIRST_OMITTED_PRIME) + return true; + + /* Precomputation for Miller-Rabin. */ + uintmax_t q = n - 1; + for (k = 0; (q & 1) == 0; k++) + q >>= 1; + + uintmax_t a = 2; + binv (ni, n); /* ni <- 1/n mod B */ + redcify (one, 1, n); + addmod (a_prim, one, one, n); /* i.e., redcify a = 2 */ + + /* Perform a Miller-Rabin test, finds most composites quickly. */ + if (!millerrabin (n, ni, a_prim, q, k, one)) + return false; + + if (flag_prove_primality) + { + /* Factor n-1 for Lucas. */ + factor (0, n - 1, &factors); + } + + /* Loop until Lucas proves our number prime, or Miller-Rabin proves our + number composite. */ + for (unsigned int r = 0; r < PRIMES_PTAB_ENTRIES; r++) + { + if (flag_prove_primality) + { + is_prime = true; + for (unsigned int i = 0; i < factors.nfactors && is_prime; i++) + { + is_prime = powm (a_prim, (n - 1) / factors.p[i], n, ni, one) != one; + } + } + else + { + /* After enough Miller-Rabin runs, be content. */ + is_prime = (r == MR_REPS - 1); + } + + if (is_prime) + return true; + + a += primes_diff[r]; /* Establish new base. 
*/ + + /* The following is equivalent to redcify (a_prim, a, n). It runs faster + on most processors, since it avoids udiv_qrnnd. If we go down the + udiv_qrnnd_preinv path, this code should be replaced. */ + { + uintmax_t dummy, s1, s0; + umul_ppmm (s1, s0, one, a); + if (LIKELY (s1 == 0)) + a_prim = s0 % n; + else + udiv_qrnnd (dummy, a_prim, s1, s0, n); + } + + if (!millerrabin (n, ni, a_prim, q, k, one)) + return false; + } + + fprintf (stderr, "Lucas prime test failure. This should not happen\n"); + abort (); +} + +static bool +prime2_p (uintmax_t n1, uintmax_t n0) +{ + uintmax_t q[2], nm1[2]; + uintmax_t a_prim[2]; + uintmax_t one[2]; + uintmax_t na[2]; + uintmax_t ni; + unsigned int k; + struct factors factors; + + if (n1 == 0) + return prime_p (n0); + + nm1[1] = n1 - (n0 == 0); + nm1[0] = n0 - 1; + if (nm1[0] == 0) + { + count_trailing_zeros (k, nm1[1]); + + q[0] = nm1[1] >> k; + q[1] = 0; + k += W_TYPE_SIZE; + } + else + { + count_trailing_zeros (k, nm1[0]); + rsh2 (q[1], q[0], nm1[1], nm1[0], k); + } + + uintmax_t a = 2; + binv (ni, n0); + redcify2 (one[1], one[0], 1, n1, n0); + addmod2 (a_prim[1], a_prim[0], one[1], one[0], one[1], one[0], n1, n0); + + /* FIXME: Use scalars or pointers in arguments? Some consistency needed. */ + na[0] = n0; + na[1] = n1; + + if (!millerrabin2 (na, ni, a_prim, q, k, one)) + return false; + + if (flag_prove_primality) + { + /* Factor n-1 for Lucas. */ + factor (nm1[1], nm1[0], &factors); + } + + /* Loop until Lucas proves our number prime, or Miller-Rabin proves our + number composite. */ + for (unsigned int r = 0; r < PRIMES_PTAB_ENTRIES; r++) + { + bool is_prime; + uintmax_t e[2], y[2]; + + if (flag_prove_primality) + { + is_prime = true; + if (factors.plarge[1]) + { + uintmax_t pi; + binv (pi, factors.plarge[0]); + e[0] = pi * nm1[0]; + e[1] = 0; + y[0] = powm2 (&y[1], a_prim, e, na, ni, one); + is_prime = (y[0] != one[0] || y[1] != one[1]); + } + for (unsigned int i = 0; i < factors.nfactors && is_prime; i++) + { + /* FIXME: We always have the factor 2. Do we really need to handle it + here? We have done the same powering as part of millerrabin. */ + if (factors.p[i] == 2) + rsh2 (e[1], e[0], nm1[1], nm1[0], 1); + else + divexact_21 (e[1], e[0], nm1[1], nm1[0], factors.p[i]); + y[0] = powm2 (&y[1], a_prim, e, na, ni, one); + is_prime = (y[0] != one[0] || y[1] != one[1]); + } + } + else + { + /* After enough Miller-Rabin runs, be content. */ + is_prime = (r == MR_REPS - 1); + } + + if (is_prime) + return true; + + a += primes_diff[r]; /* Establish new base. */ + redcify2 (a_prim[1], a_prim[0], a, n1, n0); + + if (!millerrabin2 (na, ni, a_prim, q, k, one)) + return false; + } + + fprintf (stderr, "Lucas prime test failure. This should not happen\n"); + abort (); +} + +#if HAVE_GMP +static bool +mp_prime_p (mpz_t n) +{ + bool is_prime; + mpz_t q, a, nm1, tmp; + struct mp_factors factors; + + if (mpz_cmp_ui (n, 1) <= 0) + return false; + + /* We have already casted out small primes. */ + if (mpz_cmp_ui (n, (long) FIRST_OMITTED_PRIME * FIRST_OMITTED_PRIME) < 0) + return true; + + mpz_inits (q, a, nm1, tmp, NULL); + + /* Precomputation for Miller-Rabin. */ + mpz_sub_ui (nm1, n, 1); + + /* Find q and k, where q is odd and n = 1 + 2**k * q. */ + unsigned long int k = mpz_scan1 (nm1, 0); + mpz_tdiv_q_2exp (q, nm1, k); + + mpz_set_ui (a, 2); + + /* Perform a Miller-Rabin test, finds most composites quickly. 
*/ + if (!mp_millerrabin (n, nm1, a, tmp, q, k)) + { + is_prime = false; + goto ret2; + } + + if (flag_prove_primality) + { + /* Factor n-1 for Lucas. */ + mpz_set (tmp, nm1); + mp_factor (tmp, &factors); + } + + /* Loop until Lucas proves our number prime, or Miller-Rabin proves our + number composite. */ + for (unsigned int r = 0; r < PRIMES_PTAB_ENTRIES; r++) + { + if (flag_prove_primality) + { + is_prime = true; + for (unsigned long int i = 0; i < factors.nfactors && is_prime; i++) + { + mpz_divexact (tmp, nm1, factors.p[i]); + mpz_powm (tmp, a, tmp, n); + is_prime = mpz_cmp_ui (tmp, 1) != 0; + } + } + else + { + /* After enough Miller-Rabin runs, be content. */ + is_prime = (r == MR_REPS - 1); + } + + if (is_prime) + goto ret1; + + mpz_add_ui (a, a, primes_diff[r]); /* Establish new base. */ + + if (!mp_millerrabin (n, nm1, a, tmp, q, k)) + { + is_prime = false; + goto ret1; + } + } + + fprintf (stderr, "Lucas prime test failure. This should not happen\n"); + abort (); + + ret1: + if (flag_prove_primality) + mp_factor_clear (&factors); + ret2: + mpz_clears (q, a, nm1, tmp, NULL); + + return is_prime; +} +#endif + +static void +factor_using_pollard_rho (uintmax_t n, unsigned long int a, + struct factors *factors) +{ + uintmax_t x, z, y, P, t, ni, g; + + unsigned long int k = 1; + unsigned long int l = 1; + + redcify (P, 1, n); + addmod (x, P, P, n); /* i.e., redcify(2) */ + y = z = x; + + while (n != 1) + { + assert (a < n); + + binv (ni, n); /* FIXME: when could we use old 'ni' value? */ + + for (;;) + { + do + { + x = mulredc (x, x, n, ni); + addmod (x, x, a, n); + + submod (t, z, x, n); + P = mulredc (P, t, n, ni); + + if (k % 32 == 1) + { + if (gcd_odd (P, n) != 1) + goto factor_found; + y = x; + } + } + while (--k != 0); + + z = x; + k = l; + l = 2 * l; + for (unsigned long int i = 0; i < k; i++) + { + x = mulredc (x, x, n, ni); + addmod (x, x, a, n); + } + y = x; + } + + factor_found: + do + { + y = mulredc (y, y, n, ni); + addmod (y, y, a, n); + + submod (t, z, y, n); + g = gcd_odd (t, n); + } + while (g == 1); + + n = n / g; + + if (!prime_p (g)) + factor_using_pollard_rho (g, a + 1, factors); + else + factor_insert (factors, g); + + if (prime_p (n)) + { + factor_insert (factors, n); + break; + } + + x = x % n; + z = z % n; + y = y % n; + } +} + +static void +factor_using_pollard_rho2 (uintmax_t n1, uintmax_t n0, unsigned long int a, + struct factors *factors) +{ + uintmax_t x1, x0, z1, z0, y1, y0, P1, P0, t1, t0, ni, g1, g0, r1m; + + unsigned long int k = 1; + unsigned long int l = 1; + + redcify2 (P1, P0, 1, n1, n0); + addmod2 (x1, x0, P1, P0, P1, P0, n1, n0); /* i.e., redcify(2) */ + y1 = z1 = x1; + y0 = z0 = x0; + + while (n1 != 0 || n0 != 1) + { + binv (ni, n0); + + for (;;) + { + do + { + x0 = mulredc2 (&r1m, x1, x0, x1, x0, n1, n0, ni); + x1 = r1m; + addmod2 (x1, x0, x1, x0, 0, a, n1, n0); + + submod2 (t1, t0, z1, z0, x1, x0, n1, n0); + P0 = mulredc2 (&r1m, P1, P0, t1, t0, n1, n0, ni); + P1 = r1m; + + if (k % 32 == 1) + { + g0 = gcd2_odd (&g1, P1, P0, n1, n0); + if (g1 != 0 || g0 != 1) + goto factor_found; + y1 = x1; y0 = x0; + } + } + while (--k != 0); + + z1 = x1; z0 = x0; + k = l; + l = 2 * l; + for (unsigned long int i = 0; i < k; i++) + { + x0 = mulredc2 (&r1m, x1, x0, x1, x0, n1, n0, ni); + x1 = r1m; + addmod2 (x1, x0, x1, x0, 0, a, n1, n0); + } + y1 = x1; y0 = x0; + } + + factor_found: + do + { + y0 = mulredc2 (&r1m, y1, y0, y1, y0, n1, n0, ni); + y1 = r1m; + addmod2 (y1, y0, y1, y0, 0, a, n1, n0); + + submod2 (t1, t0, z1, z0, y1, y0, n1, n0); + g0 = gcd2_odd 
(&g1, t1, t0, n1, n0); + } + while (g1 == 0 && g0 == 1); + + if (g1 == 0) + { + /* The found factor is one word. */ + divexact_21 (n1, n0, n1, n0, g0); /* n = n / g */ + + if (!prime_p (g0)) + factor_using_pollard_rho (g0, a + 1, factors); + else + factor_insert (factors, g0); + } + else + { + /* The found factor is two words. This is highly unlikely, thus hard + to trigger. Please be careful before you change this code! */ + uintmax_t ginv; + + binv (ginv, g0); /* Compute n = n / g. Since the result will */ + n0 = ginv * n0; /* fit one word, we can compute the quotient */ + n1 = 0; /* modulo B, ignoring the high divisor word. */ + + if (!prime2_p (g1, g0)) + factor_using_pollard_rho2 (g1, g0, a + 1, factors); + else + factor_insert_large (factors, g1, g0); + } + + if (n1 == 0) + { + if (prime_p (n0)) + { + factor_insert (factors, n0); + break; + } + + factor_using_pollard_rho (n0, a, factors); + return; + } + + if (prime2_p (n1, n0)) + { + factor_insert_large (factors, n1, n0); + break; + } + + x0 = mod2 (&x1, x1, x0, n1, n0); + z0 = mod2 (&z1, z1, z0, n1, n0); + y0 = mod2 (&y1, y1, y0, n1, n0); + } +} + +#if HAVE_GMP +static void +mp_factor_using_pollard_rho (mpz_t n, unsigned long int a, + struct mp_factors *factors) +{ + mpz_t x, z, y, P; + mpz_t t, t2; + unsigned long long int k, l; + + if (flag_verbose > 0) + { + printf ("[pollard-rho (%lu)] ", a); + } + + mpz_inits (t, t2, NULL); + mpz_init_set_si (y, 2); + mpz_init_set_si (x, 2); + mpz_init_set_si (z, 2); + mpz_init_set_ui (P, 1); + k = 1; + l = 1; + + while (mpz_cmp_ui (n, 1) != 0) + { + for (;;) + { + do + { + mpz_mul (t, x, x); + mpz_mod (x, t, n); + mpz_add_ui (x, x, a); + + mpz_sub (t, z, x); + mpz_mul (t2, P, t); + mpz_mod (P, t2, n); + + if (k % 32 == 1) + { + mpz_gcd (t, P, n); + if (mpz_cmp_ui (t, 1) != 0) + goto factor_found; + mpz_set (y, x); + } + } + while (--k != 0); + + mpz_set (z, x); + k = l; + l = 2 * l; + for (unsigned long long int i = 0; i < k; i++) + { + mpz_mul (t, x, x); + mpz_mod (x, t, n); + mpz_add_ui (x, x, a); + } + mpz_set (y, x); + } + + factor_found: + do + { + mpz_mul (t, y, y); + mpz_mod (y, t, n); + mpz_add_ui (y, y, a); + + mpz_sub (t, z, y); + mpz_gcd (t, t, n); + } + while (mpz_cmp_ui (t, 1) == 0); + + mpz_divexact (n, n, t); /* divide by t, before t is overwritten */ + + if (!mp_prime_p (t)) + { + if (flag_verbose > 0) + { + printf ("[composite factor--restarting pollard-rho] "); + } + mp_factor_using_pollard_rho (t, a + 1, factors); + } + else + { + mp_factor_insert (factors, t); + } + + if (mp_prime_p (n)) + { + mp_factor_insert (factors, n); + break; + } + + mpz_mod (x, x, n); + mpz_mod (z, z, n); + mpz_mod (y, y, n); + } + + mpz_clears (P, t2, t, z, x, y, NULL); +} +#endif + +/* FIXME: Maybe better to use an iteration converging to 1/sqrt(n)? If + algorithm is replaced, consider also returning the remainder. */ +static uintmax_t _GL_ATTRIBUTE_CONST +isqrt (uintmax_t n) +{ + uintmax_t x; + unsigned c; + if (n == 0) + return 0; + + count_leading_zeros (c, n); + + /* Make x > sqrt(n). This will be invariant through the loop. */ + x = (uintmax_t) 1 << ((W_TYPE_SIZE + 1 - c) / 2); + + for (;;) + { + uintmax_t y = (x + n/x) / 2; + if (y >= x) + return x; + + x = y; + } +} + +static uintmax_t _GL_ATTRIBUTE_CONST +isqrt2 (uintmax_t nh, uintmax_t nl) +{ + unsigned int shift; + uintmax_t x; + + /* Ensures the remainder fits in an uintmax_t. 
*/ + assert (nh < ((uintmax_t) 1 << (W_TYPE_SIZE - 2))); + + if (nh == 0) + return isqrt (nl); + + count_leading_zeros (shift, nh); + shift &= ~1; + + /* Make x > sqrt(n) */ + x = isqrt ( (nh << shift) + (nl >> (W_TYPE_SIZE - shift))) + 1; + x <<= (W_TYPE_SIZE - shift) / 2; + + /* Do we need more than one iteration? */ + for (;;) + { + uintmax_t q, r, y; + udiv_qrnnd (q, r, nh, nl, x); + y = (x + q) / 2; + + if (y >= x) + { + uintmax_t hi, lo; + umul_ppmm (hi, lo, x + 1, x + 1); + assert (gt2 (hi, lo, nh, nl)); + + umul_ppmm (hi, lo, x, x); + assert (ge2 (nh, nl, hi, lo)); + sub_ddmmss (hi, lo, nh, nl, hi, lo); + assert (hi == 0); + + return x; + } + + x = y; + } +} + +/* MAGIC[N] has a bit i set iff i is a quadratic residue mod N. */ +#define MAGIC64 ((uint64_t) 0x0202021202030213ULL) +#define MAGIC63 ((uint64_t) 0x0402483012450293ULL) +#define MAGIC65 ((uint64_t) 0x218a019866014613ULL) +#define MAGIC11 0x23b + +/* Returns the square root if the input is a square, otherwise 0. */ +static uintmax_t _GL_ATTRIBUTE_CONST +is_square (uintmax_t x) +{ + /* Uses the tests suggested by Cohen. Excludes 99% of the non-squares before + computing the square root. */ + if (((MAGIC64 >> (x & 63)) & 1) + && ((MAGIC63 >> (x % 63)) & 1) + /* Both 0 and 64 are squares mod (65) */ + && ((MAGIC65 >> ((x % 65) & 63)) & 1) + && ((MAGIC11 >> (x % 11) & 1))) + { + uintmax_t r = isqrt (x); + if (r*r == x) + return r; + } + return 0; +} + +static const unsigned short invtab[] = + { + 0x1fc, 0x1f8, 0x1f4, 0x1f0, 0x1ec, 0x1e9, 0x1e5, 0x1e1, + 0x1de, 0x1da, 0x1d7, 0x1d4, 0x1d0, 0x1cd, 0x1ca, 0x1c7, + 0x1c3, 0x1c0, 0x1bd, 0x1ba, 0x1b7, 0x1b4, 0x1b2, 0x1af, + 0x1ac, 0x1a9, 0x1a6, 0x1a4, 0x1a1, 0x19e, 0x19c, 0x199, + 0x197, 0x194, 0x192, 0x18f, 0x18d, 0x18a, 0x188, 0x186, + 0x183, 0x181, 0x17f, 0x17d, 0x17a, 0x178, 0x176, 0x174, + 0x172, 0x170, 0x16e, 0x16c, 0x16a, 0x168, 0x166, 0x164, + 0x162, 0x160, 0x15e, 0x15c, 0x15a, 0x158, 0x157, 0x155, + 0x153, 0x151, 0x150, 0x14e, 0x14c, 0x14a, 0x149, 0x147, + 0x146, 0x144, 0x142, 0x141, 0x13f, 0x13e, 0x13c, 0x13b, + 0x139, 0x138, 0x136, 0x135, 0x133, 0x132, 0x130, 0x12f, + 0x12e, 0x12c, 0x12b, 0x129, 0x128, 0x127, 0x125, 0x124, + 0x123, 0x121, 0x120, 0x11f, 0x11e, 0x11c, 0x11b, 0x11a, + 0x119, 0x118, 0x116, 0x115, 0x114, 0x113, 0x112, 0x111, + 0x10f, 0x10e, 0x10d, 0x10c, 0x10b, 0x10a, 0x109, 0x108, + 0x107, 0x106, 0x105, 0x104, 0x103, 0x102, 0x101, 0x100, + }; + +/* Compute q = [u/d], r = u mod d. Avoids slow hardware division for the case + that q < 0x40; here it instead uses a table of (Euclidian) inverses. */ +#define div_smallq(q, r, u, d) \ + do { \ + if (0 && (u) / 0x40 < (d)) \ + { \ + int _cnt; \ + uintmax_t _dinv, _mask, _q, _r; \ + count_leading_zeros (_cnt, (d)); \ + \ + _dinv = invtab[((d) >> (W_TYPE_SIZE - 8 - _cnt)) \ + - (1 << (8 - 1))]; \ + \ + _r = (u); \ + _q = _r * _dinv >> (W_TYPE_SIZE + 8 - _cnt); \ + _r -= _q*(d); \ + \ + _mask = -(uintmax_t) (_r >= (d)); \ + (r) = _r - (_mask & (d)); \ + (q) = _q - _mask; \ + assert ( (q) * (d) + (r) == u); \ + } \ + else \ + { \ + uintmax_t _q = (u) / (d); \ + (r) = (u) - _q * (d); \ + (q) = _q; \ + } \ + } while (0) + +/* Notes: Example N = 22117019. After first phase we find Q1 = 6314, Q + = 3025, P = 1737, representing F_{18} = (-6314, 2* 1737, 3025), + with 3025 = 55^2. + + Constructing the square root, we get Q1 = 55, Q = 8653, P = 4652, + representing G_0 = (-55, 2*4652, 8653). 
+ + In the notation of the paper: + + S_{-1} = 55, S_0 = 8653, R_0 = 4652 + + Put + + t_0 = floor([q_0 + R_0] / S0) = 1 + R_1 = t_0 * S_0 - R_0 = 4001 + S_1 = S_{-1} +t_0 (R_0 - R_1) = 706 +*/ + +/* Multipliers, in order of efficiency: + 0.7268 3*5*7*11 = 1155 = 3 (mod 4) + 0.7317 3*5*7 = 105 = 1 + 0.7820 3*5*11 = 165 = 1 + 0.7872 3*5 = 15 = 3 + 0.8101 3*7*11 = 231 = 3 + 0.8155 3*7 = 21 = 1 + 0.8284 5*7*11 = 385 = 1 + 0.8339 5*7 = 35 = 3 + 0.8716 3*11 = 33 = 1 + 0.8774 3 = 3 = 3 + 0.8913 5*11 = 55 = 3 + 0.8972 5 = 5 = 1 + 0.9233 7*11 = 77 = 1 + 0.9295 7 = 7 = 3 + 0.9934 11 = 11 = 3 +*/ +#define QUEUE_SIZE 50 + +#if STAT_SQUFOF +# define Q_FREQ_SIZE 50 +/* Element 0 keeps the total */ +static unsigned int q_freq[Q_FREQ_SIZE + 1]; +# define MIN(a,b) ((a) < (b) ? (a) : (b)) +#endif + + +static void +factor_using_squfof (uintmax_t n1, uintmax_t n0, struct factors *factors) +{ + /* Uses algorithm and notation from + + SQUARE FORM FACTORIZATION + JASON E. GOWER AND SAMUEL S. WAGSTAFF, JR. + + http://homes.cerias.purdue.edu/~ssw/squfof.pdf + */ + + static const unsigned int multipliers_1[] = + { /* = 1 (mod 4) */ + 105, 165, 21, 385, 33, 5, 77, 1, 0 + }; + static const unsigned int multipliers_3[] = + { /* = 3 (mod 4) */ + 1155, 15, 231, 35, 3, 55, 7, 11, 0 + }; + + uintmax_t S; + const unsigned int *m; + + struct { uintmax_t Q; uintmax_t P; } queue[QUEUE_SIZE]; + + S = isqrt2 (n1, n0); + + if (n0 == S * S) + { + uintmax_t p1, p0; + + umul_ppmm (p1, p0, S, S); + assert (p0 == n0); + + if (n1 == p1) + { + if (prime_p (S)) + factor_insert_multiplicity (factors, S, 2); + else + { + struct factors f; + + f.nfactors = 0; + factor_using_squfof (0, S, &f); + /* Duplicate the new factors */ + for (unsigned int i = 0; i < f.nfactors; i++) + factor_insert_multiplicity (factors, f.p[i], 2*f.e[i]); + } + return; + } + } + + /* Select multipliers so we always get n * mu = 3 (mod 4) */ + for (m = (n0 % 4 == 1) ? multipliers_3 : multipliers_1; + *m; m++) + { + uintmax_t S, Dh, Dl, Q1, Q, P, L, L1, B; + unsigned int i; + unsigned int mu = *m; + unsigned int qpos = 0; + + assert (mu * n0 % 4 == 3); + + /* In the notation of the paper, with mu * n == 3 (mod 4), we + get \Delta = 4 mu * n, and the paper's \mu is 2 mu. As far as + I understand it, the necessary bound is 4 \mu^3 < n, or 32 + mu^3 < n. + + However, this seems insufficient: With n = 37243139 and mu = + 105, we get a trivial factor, from the square 38809 = 197^2, + without any corresponding Q earlier in the iteration. + + Requiring 64 mu^3 < n seems sufficient. */ + if (n1 == 0) + { + if ((uintmax_t) mu*mu*mu >= n0 / 64) + continue; + } + else + { + if (n1 > ((uintmax_t) 1 << (W_TYPE_SIZE - 2)) / mu) + continue; + } + umul_ppmm (Dh, Dl, n0, mu); + Dh += n1 * mu; + + assert (Dl % 4 != 1); + + S = isqrt2 (Dh, Dl); + + Q1 = 1; + P = S; + + /* Square root remainder fits in one word, so ignore high part. */ + Q = Dl - P*P; + /* FIXME: When can this differ from floor(sqrt(2 sqrt(D)))? */ + L = isqrt (2*S); + B = 2*L; + L1 = mu * 2 * L; + + /* The form is (+/- Q1, 2P, -/+ Q), of discriminant 4 (P^2 + Q Q1) = + 4 D. 
*/ + + for (i = 0; i <= B; i++) + { + uintmax_t q, P1, t, r; + + div_smallq (q, r, S+P, Q); + P1 = S - r; /* P1 = q*Q - P */ + +#if STAT_SQUFOF + assert (q > 0); + q_freq[0]++; + q_freq[MIN(q, Q_FREQ_SIZE)]++; +#endif + + if (Q <= L1) + { + uintmax_t g = Q; + + if ( (Q & 1) == 0) + g /= 2; + + g /= gcd_odd (g, mu); + + if (g <= L) + { + if (qpos >= QUEUE_SIZE) + { + fprintf (stderr, "squfof queue overflow.\n"); + exit (EXIT_FAILURE); + } + queue[qpos].Q = g; + queue[qpos].P = P % g; + qpos++; + } + } + + /* I think the difference can be either sign, but mod + 2^W_TYPE_SIZE arithmetic should be fine. */ + t = Q1 + q * (P - P1); + Q1 = Q; + Q = t; + P = P1; + + if ( (i & 1) == 0) + { + uintmax_t r = is_square (Q); + if (r) + { + unsigned int j; + + for (j = 0; j < qpos; j++) + { + if (queue[j].Q == r) + { + if (r == 1) + /* Traversed entire cycle. */ + goto next_multiplier; + + /* Need the absolute value for divisibility test. */ + if (P >= queue[j].P) + t = P - queue[j].P; + else + t = queue[j].P - P; + if (t % r == 0) + { + /* Delete entries up to and including entry + j, which matched. */ + memmove (queue, queue + j + 1, + (qpos - j - 1) * sizeof (queue[0])); + qpos -= (j + 1); + } + goto next_i; + } + } + + /* We have found a square form, which should give a + factor. */ + Q1 = r; + assert (S >= P); /* What signs are possible? */ + P += r * ((S - P) / r); + + /* Note: Paper says (N - P*P) / Q1, that seems incorrect + for the case D = 2N. */ + /* Compute Q = (D - P*P) / Q1, but we need double + precision. */ + { + uintmax_t hi, lo, rem; + umul_ppmm (hi, lo, P, P); + sub_ddmmss (hi, lo, Dh, Dl, hi, lo); + udiv_qrnnd (Q, rem, hi, lo, Q1); + assert (rem == 0); + } + + for (;;) + { + uintmax_t r; + + /* Note: There appears to by a typo in the paper, + Step 4a in the algorithm description says q <-- + floor([S+P]/\hat Q), but looking at the equations + in Sec. 3.1, it should be q <-- floor([S+P] / Q). + (In this code, \hat Q is Q1). 
*/ + div_smallq (q, r, S+P, Q); + P1 = S - r; /* P1 = q*Q - P */ + +#if STAT_SQUFOF + q_freq[0]++; + q_freq[MIN(q, Q_FREQ_SIZE)]++; +#endif + if (P == P1) + break; + t = Q1 + q * (P - P1); + Q1 = Q; + Q = t; + P = P1; + } + + if ( (Q & 1) == 0) + Q /= 2; + Q /= gcd_odd (Q, mu); + + assert (Q > 1 && (n1 || Q < n0)); + + if (prime_p (Q)) + factor_insert (factors, Q); + else + factor_using_squfof (0, Q, factors); + + divexact_21 (n1, n0, n1, n0, Q); + + if (prime2_p (n1, n0)) + factor_insert_large (factors, n1, n0); + else + { + if (n1 == 0) + factor_using_pollard_rho (n0, 1, factors); + else + factor_using_squfof (n1, n0, factors); + } + + return; + } + } + next_i: + ; + } + next_multiplier: + ; + } + fprintf (stderr, "squfof failed.\n"); + exit (EXIT_FAILURE); +} + +static void +factor (uintmax_t t1, uintmax_t t0, struct factors *factors) +{ + factors->nfactors = 0; + factors->plarge[1] = 0; + + if (t1 == 0 && t0 < 2) + return; + + t0 = factor_using_division (&t1, t1, t0, factors); + + if (t1 == 0) + { + if (t0 != 1) + { + if (prime_p (t0)) + factor_insert (factors, t0); + else if (alg == ALG_POLLARD_RHO) + factor_using_pollard_rho (t0, 1, factors); + else + factor_using_squfof (0, t0, factors); + } + } + else + { + if (prime2_p (t1, t0)) + factor_insert_large (factors, t1, t0); + else if (alg == ALG_POLLARD_RHO) + factor_using_pollard_rho2 (t1, t0, 1, factors); + else + factor_using_squfof (t1, t0, factors); + } +} + +#if HAVE_GMP +static void +mp_factor (mpz_t t, struct mp_factors *factors) +{ + mp_factor_init (factors); + + if (mpz_sgn (t) != 0) + { + mp_factor_using_division (t, factors); + + if (mpz_cmp_ui (t, 1) != 0) + { + if (flag_verbose > 0) + { + printf ("[is number prime?] "); + } + if (mp_prime_p (t)) + mp_factor_insert (factors, t); + else + mp_factor_using_pollard_rho (t, 1, factors); + } + } +} +#endif + +static int +strto2uintmax (uintmax_t *hip, uintmax_t *lop, const char *s) +{ + int errcode; + unsigned int lo_carry; + uintmax_t hi, lo; + + errcode = -1; + + hi = lo = 0; + for (;;) + { + unsigned int c = *s++; + if (c == 0) + break; + + if (UNLIKELY (c < '0' || c > '9')) + { + errcode = -1; + break; + } + c -= '0'; + + errcode = 0; /* we've seen at least one valid digit */ + + if (UNLIKELY (hi > ~(uintmax_t)0 / 10)) + { + errcode = -1; /* overflow */ + break; + } + hi = 10 * hi; + + lo_carry = (lo >> (W_TYPE_SIZE - 3)) + (lo >> (W_TYPE_SIZE - 1)); + lo_carry += 10 * lo < 2 * lo; + + lo = 10 * lo; + lo += c; + + lo_carry += lo < c; + hi += lo_carry; + if (UNLIKELY (hi < lo_carry)) + { + errcode = -1; /* overflow */ + break; + } + } + + *hip = hi; + *lop = lo; + + return errcode; +} + +static void +print_uintmaxes (uintmax_t t1, uintmax_t t0) +{ + uintmax_t q, r; + + if (t1 == 0) + printf ("%ju", t0); + else + { + /* Use very plain code here since it seems hard to write fast code + without assuming a specific word size. */ + q = t1 / 1000000000; + r = t1 % 1000000000; + udiv_qrnnd (t0, r, r, t0, 1000000000); + print_uintmaxes (q, t0); + printf ("%09u", (int) r); + } +} + +static void +factor_one (const char *input) +{ + uintmax_t t1, t0; + int errcode; + + /* Try converting the number to one or two words. If it fails, use GMP or + print an error message. The 2nd condition checks that the most + significant bit of the two-word number is clear, in a typesize neutral + way. 
*/ + errcode = strto2uintmax (&t1, &t0, input); + if (errcode == 0 && ((t1 << 1) >> 1) == t1) + { + struct factors factors; + + print_uintmaxes (t1, t0); + printf (":"); + + factor (t1, t0, &factors); + + for (unsigned int j = 0; j < factors.nfactors; j++) + for (unsigned int k = 0; k < factors.e[j]; k++) + printf (" %ju", factors.p[j]); + + if (factors.plarge[1]) + { + printf (" "); + print_uintmaxes (factors.plarge[1], factors.plarge[0]); + } + puts (""); + } + else + { +#if HAVE_GMP + mpz_t t; + struct mp_factors factors; + + mpz_init_set_str (t, input, 10); + + gmp_printf ("%Zd:", t); + mp_factor (t, &factors); + + for (unsigned int j = 0; j < factors.nfactors; j++) + for (unsigned int k = 0; k < factors.e[j]; k++) + gmp_printf (" %Zd", factors.p[j]); + + mp_factor_clear (&factors); + mpz_clear (t); + puts (""); +#else + fprintf (stderr, "error: number %s not parsable or too large\n", input); + exit (EXIT_FAILURE); +#endif + } +} + +struct inbuf +{ + char *buf; + size_t alloc; +}; + +/* Read white-space delimited items. Return 1 on success, 0 on EOF. + Exit on I/O errors. */ +int +read_item (struct inbuf *bufstruct) +{ + int c; + size_t i; + char *buf = bufstruct->buf; + + do + c = getchar_unlocked (); + while (isspace (c)); + + for (i = 0; !isspace(c); i++) + { + if (c < 0) + { + if (ferror (stdin)) + { + fprintf (stderr, "read error on stdin: %s\n", strerror(errno)); + exit (EXIT_FAILURE); + } + if (i == 0) + return 0; + else + break; + } + + if (UNLIKELY (bufstruct->alloc <= i + 1)) /* +1 byte for terminating NUL */ + { + bufstruct->alloc = bufstruct->alloc * 5 / 4 + 1; + bufstruct->buf = realloc (bufstruct->buf, bufstruct->alloc); + buf = bufstruct->buf; + } + + buf[i] = c; + c = getchar_unlocked (); + } + + buf[i] = '\0'; + return 1; +} + +int +main (int argc, char *argv[]) +{ + int c; + + alg = ALG_POLLARD_RHO; /* Default to Pollard rho */ + + while ( (c = getopt(argc, argv, "svw")) != -1) + switch (c) + { + case 's': + alg = ALG_SQUFOF; + break; + case 'v': + flag_verbose = 1; + break; + case 'w': + flag_prove_primality = 0; + break; + case '?': + printf ("Usage: %s [-s] number ...\n", argv[0]); + return EXIT_FAILURE; + } +#if STAT_SQUFOF + if (alg == ALG_SQUFOF) + memset (q_freq, 0, sizeof (q_freq)); +#endif + + if (optind < argc) + for (int i = optind; i < argc; i++) + factor_one (argv[i]); + else + { + struct inbuf bufstruct; + bufstruct.alloc = 50; /* enough unless HAVE_GMP */ + bufstruct.buf = malloc (bufstruct.alloc); + while (read_item (&bufstruct)) + factor_one (bufstruct.buf); + } + +#if STAT_SQUFOF + if (alg == ALG_SQUFOF && q_freq[0] > 0) + { + double acc_f; + printf ("q freq. cum. freq.(total: %d)\n", q_freq[0]); + for (unsigned int i = 1, acc_f = 0.0; i <= Q_FREQ_SIZE; i++) + { + double f = (double) q_freq[i] / q_freq[0]; + acc_f += f; + printf ("%s%d %.2f%% %.2f%%\n", i == Q_FREQ_SIZE ? ">=" : "", i, + 100.0 * f, 100.0 * acc_f); + } + } +#endif + + exit (EXIT_SUCCESS); +} diff --git a/src/longlong.h b/src/longlong.h new file mode 100644 index 000000000..84e32ee46 --- /dev/null +++ b/src/longlong.h @@ -0,0 +1,2156 @@ +/* longlong.h -- definitions for mixed size 32/64 bit arithmetic. + +Copyright 1991, 1992, 1993, 1994, 1996, 1997, 1999, 2000, 2001, 2002, 2003, +2004, 2005, 2007, 2008, 2009, 2011 Free Software Foundation, Inc. 
+ +This file is free software; you can redistribute it and/or modify it under the +terms of the GNU Lesser General Public License as published by the Free +Software Foundation; either version 3 of the License, or (at your option) any +later version. + +This file is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +details. + +You should have received a copy of the GNU Lesser General Public License +along with this file. If not, see http://www.gnu.org/licenses/. */ + +/* You have to define the following before including this file: + + UWtype -- An unsigned type, default type for operations (typically a "word") + UHWtype -- An unsigned type, at least half the size of UWtype + UDWtype -- An unsigned type, at least twice as large a UWtype + W_TYPE_SIZE -- size in bits of UWtype + + SItype, USItype -- Signed and unsigned 32 bit types + DItype, UDItype -- Signed and unsigned 64 bit types + + On a 32 bit machine UWtype should typically be USItype; + on a 64 bit machine, UWtype should typically be UDItype. + + Optionally, define: + + LONGLONG_STANDALONE -- Avoid code that needs machine-dependent support files + NO_ASM -- Disable inline asm + + + CAUTION! Using this version of longlong.h outside of GMP is not safe. You + need to include gmp.h and gmp-impl.h, or certain things might not work as + expected. +*/ + +#define __BITS4 (W_TYPE_SIZE / 4) +#define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2)) +#define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1)) +#define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2)) + +/* This is used to make sure no undesirable sharing between different libraries + that use this file takes place. */ +#ifndef __MPN +#define __MPN(x) __##x +#endif + +/* Define auxiliary asm macros. + + 1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two + UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype + word product in HIGH_PROD and LOW_PROD. + + 2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a + UDWtype product. This is just a variant of umul_ppmm. + + 3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator, + denominator) divides a UDWtype, composed by the UWtype integers + HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient + in QUOTIENT and the remainder in REMAINDER. HIGH_NUMERATOR must be less + than DENOMINATOR for correct operation. If, in addition, the most + significant bit of DENOMINATOR must be 1, then the pre-processor symbol + UDIV_NEEDS_NORMALIZATION is defined to 1. + + 4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator, + denominator). Like udiv_qrnnd but the numbers are signed. The quotient + is rounded towards 0. + + 5) count_leading_zeros(count, x) counts the number of zero-bits from the + msb to the first non-zero bit in the UWtype X. This is the number of + steps X needs to be shifted left to set the msb. Undefined for X == 0, + unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value. + + 6) count_trailing_zeros(count, x) like count_leading_zeros, but counts + from the least significant end. + + 7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1, + high_addend_2, low_addend_2) adds two UWtype integers, composed by + HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2 + respectively. The result is placed in HIGH_SUM and LOW_SUM. Overflow + (i.e. 
carry out) is not stored anywhere, and is lost. + + 8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend, + high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers, + composed by HIGH_MINUEND_1 and LOW_MINUEND_1, and HIGH_SUBTRAHEND_2 and + LOW_SUBTRAHEND_2 respectively. The result is placed in HIGH_DIFFERENCE + and LOW_DIFFERENCE. Overflow (i.e. carry out) is not stored anywhere, + and is lost. + + If any of these macros are left undefined for a particular CPU, + C macros are used. + + + Notes: + + For add_ssaaaa the two high and two low addends can both commute, but + unfortunately gcc only supports one "%" commutative in each asm block. + This has always been so but is only documented in recent versions + (eg. pre-release 3.3). Having two or more "%"s can cause an internal + compiler error in certain rare circumstances. + + Apparently it was only the last "%" that was ever actually respected, so + the code has been updated to leave just that. Clearly there's a free + choice whether high or low should get it, if there's a reason to favour + one over the other. Also obviously when the constraints on the two + operands are identical there's no benefit to the reloader in any "%" at + all. + + */ + +/* The CPUs come in alphabetical order below. + + Please add support for more CPUs here, or improve the current support + for the CPUs below! */ + + +/* count_leading_zeros_gcc_clz is count_leading_zeros implemented with gcc + 3.4 __builtin_clzl or __builtin_clzll, according to our limb size. + Similarly count_trailing_zeros_gcc_ctz using __builtin_ctzl or + __builtin_ctzll. + + These builtins are only used when we check what code comes out, on some + chips they're merely libgcc calls, where we will instead want an inline + in that case (either asm or generic C). + + These builtins are better than an asm block of the same insn, since an + asm block doesn't give gcc any information about scheduling or resource + usage. We keep an asm block for use on prior versions of gcc though. + + For reference, __builtin_ffs existed in gcc prior to __builtin_clz, but + it's not used (for count_leading_zeros) because it generally gives extra + code to ensure the result is 0 when the input is 0, which we don't need + or want. */ + +#ifdef _LONG_LONG_LIMB +#define count_leading_zeros_gcc_clz(count,x) \ + do { \ + ASSERT ((x) != 0); \ + (count) = __builtin_clzll (x); \ + } while (0) +#else +#define count_leading_zeros_gcc_clz(count,x) \ + do { \ + ASSERT ((x) != 0); \ + (count) = __builtin_clzl (x); \ + } while (0) +#endif + +#ifdef _LONG_LONG_LIMB +#define count_trailing_zeros_gcc_ctz(count,x) \ + do { \ + ASSERT ((x) != 0); \ + (count) = __builtin_ctzll (x); \ + } while (0) +#else +#define count_trailing_zeros_gcc_ctz(count,x) \ + do { \ + ASSERT ((x) != 0); \ + (count) = __builtin_ctzl (x); \ + } while (0) +#endif + + +/* FIXME: The macros using external routines like __MPN(count_leading_zeros) + don't need to be under !NO_ASM */ +#if ! defined (NO_ASM) + +#if defined (__alpha) && W_TYPE_SIZE == 64 +/* Most alpha-based machines, except Cray systems. 
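As a rough, stand-alone illustration of the contract described above (this is not part of the patch), the snippet below exercises count_leading_zeros and count_trailing_zeros using the same GCC builtins that count_leading_zeros_gcc_clz and count_trailing_zeros_gcc_ctz wrap. The 64-bit word size, the uint64_t typedef and the test value are assumptions made only for this sketch; as documented above, both operations are undefined for a zero argument, and the leading-zero count is exactly the normalization shift wanted when UDIV_NEEDS_NORMALIZATION is set.

    /* Stand-alone sketch, not part of the patch.  Assumes a 64-bit word and
       a compiler providing __builtin_clzll / __builtin_ctzll.  */
    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t UWtype;
    #define W_TYPE_SIZE 64

    #define count_leading_zeros(count, x)  ((count) = __builtin_clzll (x))  /* x != 0 */
    #define count_trailing_zeros(count, x) ((count) = __builtin_ctzll (x))  /* x != 0 */

    int
    main (void)
    {
      UWtype d = 100;               /* 0b1100100, msb is bit 6 */
      int cnt;

      /* Number of left shifts needed to set the msb, i.e. the divisor
         normalization shift used before udiv_qrnnd when
         UDIV_NEEDS_NORMALIZATION is 1.  */
      count_leading_zeros (cnt, d);
      assert (cnt == 57);
      assert (((d << cnt) >> (W_TYPE_SIZE - 1)) == 1);

      /* Zero bits counted from the least significant end.  */
      count_trailing_zeros (cnt, d);
      assert (cnt == 2);
      assert (((d >> cnt) & 1) == 1);

      puts ("clz/ctz contract holds for this example");
      return 0;
    }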
*/ +#if defined (__GNUC__) +#if __GMP_GNUC_PREREQ (3,3) +#define umul_ppmm(ph, pl, m0, m1) \ + do { \ + UDItype __m0 = (m0), __m1 = (m1); \ + (ph) = __builtin_alpha_umulh (__m0, __m1); \ + (pl) = __m0 * __m1; \ + } while (0) +#else +#define umul_ppmm(ph, pl, m0, m1) \ + do { \ + UDItype __m0 = (m0), __m1 = (m1); \ + __asm__ ("umulh %r1,%2,%0" \ + : "=r" (ph) \ + : "%rJ" (m0), "rI" (m1)); \ + (pl) = __m0 * __m1; \ + } while (0) +#endif +#define UMUL_TIME 18 +#else /* ! __GNUC__ */ +#include <machine/builtins.h> +#define umul_ppmm(ph, pl, m0, m1) \ + do { \ + UDItype __m0 = (m0), __m1 = (m1); \ + (ph) = __UMULH (m0, m1); \ + (pl) = __m0 * __m1; \ + } while (0) +#endif +#ifndef LONGLONG_STANDALONE +#define udiv_qrnnd(q, r, n1, n0, d) \ + do { UWtype __di; \ + __di = __MPN(invert_limb) (d); \ + udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \ + } while (0) +#define UDIV_PREINV_ALWAYS 1 +#define UDIV_NEEDS_NORMALIZATION 1 +#define UDIV_TIME 220 +#endif /* LONGLONG_STANDALONE */ + +/* clz_tab is required in all configurations, since mpn/alpha/cntlz.asm + always goes into libgmp.so, even when not actually used. */ +#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB + +#if defined (__GNUC__) && HAVE_HOST_CPU_alpha_CIX +#define count_leading_zeros(COUNT,X) \ + __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X)) +#define count_trailing_zeros(COUNT,X) \ + __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X)) +#endif /* clz/ctz using cix */ + +#if ! defined (count_leading_zeros) \ + && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE) +/* ALPHA_CMPBGE_0 gives "cmpbge $31,src,dst", ie. test src bytes == 0. + "$31" is written explicitly in the asm, since an "r" constraint won't + select reg 31. There seems no need to worry about "r31" syntax for cray, + since gcc itself (pre-release 3.4) emits just $31 in various places. */ +#define ALPHA_CMPBGE_0(dst, src) \ + do { asm ("cmpbge $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0) +/* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts + them, locating the highest non-zero byte. A second __clz_tab lookup + counts the leading zero bits in that byte, giving the result. */ +#define count_leading_zeros(count, x) \ + do { \ + UWtype __clz__b, __clz__c, __clz__x = (x); \ + ALPHA_CMPBGE_0 (__clz__b, __clz__x); /* zero bytes */ \ + __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F]; /* 8 to 1 byte */ \ + __clz__b = __clz__b * 8 - 7; /* 57 to 1 shift */ \ + __clz__x >>= __clz__b; \ + __clz__c = __clz_tab [__clz__x]; /* 8 to 1 bit */ \ + __clz__b = 65 - __clz__b; \ + (count) = __clz__b - __clz__c; \ + } while (0) +#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB +#endif /* clz using cmpbge */ + +#if ! defined (count_leading_zeros) && ! 
defined (LONGLONG_STANDALONE) +#if HAVE_ATTRIBUTE_CONST +long __MPN(count_leading_zeros) (UDItype) __attribute__ ((const)); +#else +long __MPN(count_leading_zeros) (UDItype); +#endif +#define count_leading_zeros(count, x) \ + ((count) = __MPN(count_leading_zeros) (x)) +#endif /* clz using mpn */ +#endif /* __alpha */ + +#if defined (__AVR) && W_TYPE_SIZE == 8 +#define umul_ppmm(ph, pl, m0, m1) \ + do { \ + unsigned short __p = (unsigned short) (m0) * (m1); \ + (ph) = __p >> 8; \ + (pl) = __p; \ + } while (0) +#endif /* AVR */ + +#if defined (_CRAY) && W_TYPE_SIZE == 64 +#include <intrinsics.h> +#define UDIV_PREINV_ALWAYS 1 +#define UDIV_NEEDS_NORMALIZATION 1 +#define UDIV_TIME 220 +long __MPN(count_leading_zeros) (UDItype); +#define count_leading_zeros(count, x) \ + ((count) = _leadz ((UWtype) (x))) +#if defined (_CRAYIEEE) /* I.e., Cray T90/ieee, T3D, and T3E */ +#define umul_ppmm(ph, pl, m0, m1) \ + do { \ + UDItype __m0 = (m0), __m1 = (m1); \ + (ph) = _int_mult_upper (m0, m1); \ + (pl) = __m0 * __m1; \ + } while (0) +#ifndef LONGLONG_STANDALONE +#define udiv_qrnnd(q, r, n1, n0, d) \ + do { UWtype __di; \ + __di = __MPN(invert_limb) (d); \ + udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \ + } while (0) +#endif /* LONGLONG_STANDALONE */ +#endif /* _CRAYIEEE */ +#endif /* _CRAY */ + +#if defined (__ia64) && W_TYPE_SIZE == 64 +/* This form encourages gcc (pre-release 3.4 at least) to emit predicated + "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency. The generic + code using "al<bl" arithmetically comes out making an actual 0 or 1 in a + register, which takes an extra cycle. */ +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + do { \ + UWtype __x; \ + __x = (al) - (bl); \ + if ((al) < (bl)) \ + (sh) = (ah) - (bh) - 1; \ + else \ + (sh) = (ah) - (bh); \ + (sl) = __x; \ + } while (0) +#if defined (__GNUC__) && ! defined (__INTEL_COMPILER) +/* Do both product parts in assembly, since that gives better code with + all gcc versions. Some callers will just use the upper part, and in + that situation we waste an instruction, but not any cycles. 
*/ +#define umul_ppmm(ph, pl, m0, m1) \ + __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0" \ + : "=&f" (ph), "=f" (pl) \ + : "f" (m0), "f" (m1)) +#define UMUL_TIME 14 +#define count_leading_zeros(count, x) \ + do { \ + UWtype _x = (x), _y, _a, _c; \ + __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x)); \ + __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y)); \ + _c = (_a - 1) << 3; \ + _x >>= _c; \ + if (_x >= 1 << 4) \ + _x >>= 4, _c += 4; \ + if (_x >= 1 << 2) \ + _x >>= 2, _c += 2; \ + _c += _x >> 1; \ + (count) = W_TYPE_SIZE - 1 - _c; \ + } while (0) +/* similar to what gcc does for __builtin_ffs, but 0 based rather than 1 + based, and we don't need a special case for x==0 here */ +#define count_trailing_zeros(count, x) \ + do { \ + UWtype __ctz_x = (x); \ + __asm__ ("popcnt %0 = %1" \ + : "=r" (count) \ + : "r" ((__ctz_x-1) & ~__ctz_x)); \ + } while (0) +#endif +#if defined (__INTEL_COMPILER) +#include <ia64intrin.h> +#define umul_ppmm(ph, pl, m0, m1) \ + do { \ + UWtype _m0 = (m0), _m1 = (m1); \ + ph = _m64_xmahu (_m0, _m1, 0); \ + pl = _m0 * _m1; \ + } while (0) +#endif +#ifndef LONGLONG_STANDALONE +#define udiv_qrnnd(q, r, n1, n0, d) \ + do { UWtype __di; \ + __di = __MPN(invert_limb) (d); \ + udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \ + } while (0) +#define UDIV_PREINV_ALWAYS 1 +#define UDIV_NEEDS_NORMALIZATION 1 +#endif +#define UDIV_TIME 220 +#endif + + +#if defined (__GNUC__) + +/* We sometimes need to clobber "cc" with gcc2, but that would not be + understood by gcc1. Use cpp to avoid major code duplication. */ +#if __GNUC__ < 2 +#define __CLOBBER_CC +#define __AND_CLOBBER_CC +#else /* __GNUC__ >= 2 */ +#define __CLOBBER_CC : "cc" +#define __AND_CLOBBER_CC , "cc" +#endif /* __GNUC__ < 2 */ + +#if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32 +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3" \ + : "=r" (sh), "=&r" (sl) \ + : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl)) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3" \ + : "=r" (sh), "=&r" (sl) \ + : "r" (ah), "rI" (bh), "r" (al), "rI" (bl)) +#define umul_ppmm(xh, xl, m0, m1) \ + do { \ + USItype __m0 = (m0), __m1 = (m1); \ + __asm__ ("multiplu %0,%1,%2" \ + : "=r" (xl) \ + : "r" (__m0), "r" (__m1)); \ + __asm__ ("multmu %0,%1,%2" \ + : "=r" (xh) \ + : "r" (__m0), "r" (__m1)); \ + } while (0) +#define udiv_qrnnd(q, r, n1, n0, d) \ + __asm__ ("dividu %0,%3,%4" \ + : "=r" (q), "=q" (r) \ + : "1" (n1), "r" (n0), "r" (d)) +#define count_leading_zeros(count, x) \ + __asm__ ("clz %0,%1" \ + : "=r" (count) \ + : "r" (x)) +#define COUNT_LEADING_ZEROS_0 32 +#endif /* __a29k__ */ + +#if defined (__arc__) +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3" \ + : "=r" (sh), \ + "=&r" (sl) \ + : "r" ((USItype) (ah)), \ + "rIJ" ((USItype) (bh)), \ + "%r" ((USItype) (al)), \ + "rIJ" ((USItype) (bl))) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3" \ + : "=r" (sh), \ + "=&r" (sl) \ + : "r" ((USItype) (ah)), \ + "rIJ" ((USItype) (bh)), \ + "r" ((USItype) (al)), \ + "rIJ" ((USItype) (bl))) +#endif + +#if defined (__arm__) && W_TYPE_SIZE == 32 +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3" \ + : "=r" (sh), "=&r" (sl) \ + : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + do { \ + if (__builtin_constant_p (al)) \ + { \ + if (__builtin_constant_p 
(ah)) \ + __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \ + : "=r" (sh), "=&r" (sl) \ + : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \ + else \ + __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3" \ + : "=r" (sh), "=&r" (sl) \ + : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \ + } \ + else if (__builtin_constant_p (ah)) \ + { \ + if (__builtin_constant_p (bl)) \ + __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2" \ + : "=r" (sh), "=&r" (sl) \ + : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \ + else \ + __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \ + : "=r" (sh), "=&r" (sl) \ + : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \ + } \ + else if (__builtin_constant_p (bl)) \ + { \ + if (__builtin_constant_p (bh)) \ + __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \ + : "=r" (sh), "=&r" (sl) \ + : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \ + else \ + __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2" \ + : "=r" (sh), "=&r" (sl) \ + : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \ + } \ + else /* only bh might be a constant */ \ + __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \ + : "=r" (sh), "=&r" (sl) \ + : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC);\ + } while (0) +#if 1 || defined (__arm_m__) /* `M' series has widening multiply support */ +#define umul_ppmm(xh, xl, a, b) \ + __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b)) +#define UMUL_TIME 5 +#define smul_ppmm(xh, xl, a, b) \ + __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b)) +#ifndef LONGLONG_STANDALONE +#define udiv_qrnnd(q, r, n1, n0, d) \ + do { UWtype __di; \ + __di = __MPN(invert_limb) (d); \ + udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \ + } while (0) +#define UDIV_PREINV_ALWAYS 1 +#define UDIV_NEEDS_NORMALIZATION 1 +#define UDIV_TIME 70 +#endif /* LONGLONG_STANDALONE */ +#else +#define umul_ppmm(xh, xl, a, b) \ + __asm__ ("%@ Inlined umul_ppmm\n" \ +" mov %|r0, %2, lsr #16\n" \ +" mov %|r2, %3, lsr #16\n" \ +" bic %|r1, %2, %|r0, lsl #16\n" \ +" bic %|r2, %3, %|r2, lsl #16\n" \ +" mul %1, %|r1, %|r2\n" \ +" mul %|r2, %|r0, %|r2\n" \ +" mul %|r1, %0, %|r1\n" \ +" mul %0, %|r0, %0\n" \ +" adds %|r1, %|r2, %|r1\n" \ +" addcs %0, %0, #65536\n" \ +" adds %1, %1, %|r1, lsl #16\n" \ +" adc %0, %0, %|r1, lsr #16" \ + : "=&r" (xh), "=r" (xl) \ + : "r" (a), "r" (b) \ + : "r0", "r1", "r2") +#define UMUL_TIME 20 +#ifndef LONGLONG_STANDALONE +#define udiv_qrnnd(q, r, n1, n0, d) \ + do { UWtype __r; \ + (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \ + (r) = __r; \ + } while (0) +extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype); +#define UDIV_TIME 200 +#endif /* LONGLONG_STANDALONE */ +#endif +/* This is a bizarre test, but GCC doesn't define useful common symbol. 
*/ +#if defined (__ARM_ARCH_5__) || defined (__ARM_ARCH_5T__) || \ + defined (__ARM_ARCH_5E__) || defined (__ARM_ARCH_5TE__)|| \ + defined (__ARM_ARCH_6__) || defined (__ARM_ARCH_6J__) || \ + defined (__ARM_ARCH_6K__) || defined (__ARM_ARCH_6Z__) || \ + defined (__ARM_ARCH_6ZK__)|| defined (__ARM_ARCH_6T2__)|| \ + defined (__ARM_ARCH_6M__) || defined (__ARM_ARCH_7__) || \ + defined (__ARM_ARCH_7A__) || defined (__ARM_ARCH_7R__) || \ + defined (__ARM_ARCH_7M__) || defined (__ARM_ARCH_7EM__) +#define count_leading_zeros(count, x) \ + __asm__ ("clz\t%0, %1" : "=r" (count) : "r" (x)) +#define COUNT_LEADING_ZEROS_0 32 +#endif +#endif /* __arm__ */ + +#if defined (__aarch64__) && W_TYPE_SIZE == 64 +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3" \ + : "=r" (sh), "=&r" (sl) \ + : "r" (ah), "rZ" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + do { \ + if (__builtin_constant_p (bl)) \ + { \ + __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \ + : "=r" (sh), "=&r" (sl) \ + : "r" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \ + } \ + else /* only bh might be a constant */ \ + __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \ + : "=r" (sh), "=&r" (sl) \ + : "r" (ah), "rZ" (bh), "r" (al), "rI" (bl) __CLOBBER_CC);\ + } while (0) +#define umul_ppmm(ph, pl, m0, m1) \ + do { \ + UDItype __m0 = (m0), __m1 = (m1); \ + __asm__ ("umulh\t%0, %1, %2" : "=r" (ph) : "r" (m0), "r" (m1)); \ + (pl) = __m0 * __m1; \ + } while (0) +#define count_leading_zeros(count, x) \ + __asm__ ("clz\t%0, %1" : "=r" (count) : "r" (x)) +#define COUNT_LEADING_ZEROS_0 64 +#endif /* __aarch64__ */ + +#if defined (__clipper__) && W_TYPE_SIZE == 32 +#define umul_ppmm(w1, w0, u, v) \ + ({union {UDItype __ll; \ + struct {USItype __l, __h;} __i; \ + } __x; \ + __asm__ ("mulwux %2,%0" \ + : "=r" (__x.__ll) \ + : "%0" ((USItype)(u)), "r" ((USItype)(v))); \ + (w1) = __x.__i.__h; (w0) = __x.__i.__l;}) +#define smul_ppmm(w1, w0, u, v) \ + ({union {DItype __ll; \ + struct {SItype __l, __h;} __i; \ + } __x; \ + __asm__ ("mulwx %2,%0" \ + : "=r" (__x.__ll) \ + : "%0" ((SItype)(u)), "r" ((SItype)(v))); \ + (w1) = __x.__i.__h; (w0) = __x.__i.__l;}) +#define __umulsidi3(u, v) \ + ({UDItype __w; \ + __asm__ ("mulwux %2,%0" \ + : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v))); \ + __w; }) +#endif /* __clipper__ */ + +/* Fujitsu vector computers. 
*/ +#if defined (__uxp__) && W_TYPE_SIZE == 32 +#define umul_ppmm(ph, pl, u, v) \ + do { \ + union {UDItype __ll; \ + struct {USItype __h, __l;} __i; \ + } __x; \ + __asm__ ("mult.lu %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));\ + (ph) = __x.__i.__h; \ + (pl) = __x.__i.__l; \ + } while (0) +#define smul_ppmm(ph, pl, u, v) \ + do { \ + union {UDItype __ll; \ + struct {USItype __h, __l;} __i; \ + } __x; \ + __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v)); \ + (ph) = __x.__i.__h; \ + (pl) = __x.__i.__l; \ + } while (0) +#endif + +#if defined (__gmicro__) && W_TYPE_SIZE == 32 +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ("add.w %5,%1\n\taddx %3,%0" \ + : "=g" (sh), "=&g" (sl) \ + : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ + "%1" ((USItype)(al)), "g" ((USItype)(bl))) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + __asm__ ("sub.w %5,%1\n\tsubx %3,%0" \ + : "=g" (sh), "=&g" (sl) \ + : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ + "1" ((USItype)(al)), "g" ((USItype)(bl))) +#define umul_ppmm(ph, pl, m0, m1) \ + __asm__ ("mulx %3,%0,%1" \ + : "=g" (ph), "=r" (pl) \ + : "%0" ((USItype)(m0)), "g" ((USItype)(m1))) +#define udiv_qrnnd(q, r, nh, nl, d) \ + __asm__ ("divx %4,%0,%1" \ + : "=g" (q), "=r" (r) \ + : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d))) +#define count_leading_zeros(count, x) \ + __asm__ ("bsch/1 %1,%0" \ + : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0)) +#endif + +#if defined (__hppa) && W_TYPE_SIZE == 32 +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ("add%I5 %5,%r4,%1\n\taddc %r2,%r3,%0" \ + : "=r" (sh), "=&r" (sl) \ + : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl)) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + __asm__ ("sub%I4 %4,%r5,%1\n\tsubb %r2,%r3,%0" \ + : "=r" (sh), "=&r" (sl) \ + : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl)) +#if defined (_PA_RISC1_1) +#define umul_ppmm(wh, wl, u, v) \ + do { \ + union {UDItype __ll; \ + struct {USItype __h, __l;} __i; \ + } __x; \ + __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v)); \ + (wh) = __x.__i.__h; \ + (wl) = __x.__i.__l; \ + } while (0) +#define UMUL_TIME 8 +#define UDIV_TIME 60 +#else +#define UMUL_TIME 40 +#define UDIV_TIME 80 +#endif +#define count_leading_zeros(count, x) \ + do { \ + USItype __tmp; \ + __asm__ ( \ + "ldi 1,%0\n" \ +" extru,= %1,15,16,%%r0 ; Bits 31..16 zero?\n" \ +" extru,tr %1,15,16,%1 ; No. Shift down, skip add.\n" \ +" ldo 16(%0),%0 ; Yes. Perform add.\n" \ +" extru,= %1,23,8,%%r0 ; Bits 15..8 zero?\n" \ +" extru,tr %1,23,8,%1 ; No. Shift down, skip add.\n" \ +" ldo 8(%0),%0 ; Yes. Perform add.\n" \ +" extru,= %1,27,4,%%r0 ; Bits 7..4 zero?\n" \ +" extru,tr %1,27,4,%1 ; No. Shift down, skip add.\n" \ +" ldo 4(%0),%0 ; Yes. Perform add.\n" \ +" extru,= %1,29,2,%%r0 ; Bits 3..2 zero?\n" \ +" extru,tr %1,29,2,%1 ; No. Shift down, skip add.\n" \ +" ldo 2(%0),%0 ; Yes. Perform add.\n" \ +" extru %1,30,1,%1 ; Extract bit 1.\n" \ +" sub %0,%1,%0 ; Subtract it.\n" \ + : "=r" (count), "=r" (__tmp) : "1" (x)); \ + } while (0) +#endif /* hppa */ + +/* These macros are for ABI=2.0w. In ABI=2.0n they can't be used, since GCC + (3.2) puts longlong into two adjacent 32-bit registers. Presumably this + is just a case of no direct support for 2.0n but treating it like 1.0. */ +#if defined (__hppa) && W_TYPE_SIZE == 64 && ! 
defined (_LONG_LONG_LIMB) +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ("add%I5 %5,%r4,%1\n\tadd,dc %r2,%r3,%0" \ + : "=r" (sh), "=&r" (sl) \ + : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl)) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + __asm__ ("sub%I4 %4,%r5,%1\n\tsub,db %r2,%r3,%0" \ + : "=r" (sh), "=&r" (sl) \ + : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl)) +#endif /* hppa */ + +#if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32 +#if defined (__zarch__) || defined (HAVE_HOST_CPU_s390_zarch) +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + do { \ +/* if (__builtin_constant_p (bl)) \ + __asm__ ("alfi\t%1,%o5\n\talcr\t%0,%3" \ + : "=r" (sh), "=&r" (sl) \ + : "0" (ah), "r" (bh), "%1" (al), "n" (bl) __CLOBBER_CC);\ + else \ +*/ __asm__ ("alr\t%1,%5\n\talcr\t%0,%3" \ + : "=r" (sh), "=&r" (sl) \ + : "0" (ah), "r" (bh), "%1" (al), "r" (bl)__CLOBBER_CC); \ + } while (0) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + do { \ +/* if (__builtin_constant_p (bl)) \ + __asm__ ("slfi\t%1,%o5\n\tslbr\t%0,%3" \ + : "=r" (sh), "=&r" (sl) \ + : "0" (ah), "r" (bh), "1" (al), "n" (bl) __CLOBBER_CC); \ + else \ +*/ __asm__ ("slr\t%1,%5\n\tslbr\t%0,%3" \ + : "=r" (sh), "=&r" (sl) \ + : "0" (ah), "r" (bh), "1" (al), "r" (bl) __CLOBBER_CC); \ + } while (0) +#if __GMP_GNUC_PREREQ (4,5) +#define umul_ppmm(xh, xl, m0, m1) \ + do { \ + union {UDItype __ll; \ + struct {USItype __h, __l;} __i; \ + } __x; \ + __x.__ll = (UDItype) (m0) * (UDItype) (m1); \ + (xh) = __x.__i.__h; (xl) = __x.__i.__l; \ + } while (0) +#else +#if 0 +/* FIXME: this fails if gcc knows about the 64-bit registers. Use only + with a new enough processor pretending we have 32-bit registers. */ +#define umul_ppmm(xh, xl, m0, m1) \ + do { \ + union {UDItype __ll; \ + struct {USItype __h, __l;} __i; \ + } __x; \ + __asm__ ("mlr\t%0,%2" \ + : "=r" (__x.__ll) \ + : "%0" (m0), "r" (m1)); \ + (xh) = __x.__i.__h; (xl) = __x.__i.__l; \ + } while (0) +#else +#define umul_ppmm(xh, xl, m0, m1) \ + do { \ + /* When we have 64-bit regs and gcc is aware of that, we cannot simply use + DImode for the product, since that would be allocated to a single 64-bit + register, whereas mlr uses the low 32-bits of an even-odd register pair. + */ \ + register USItype __r0 __asm__ ("0"); \ + register USItype __r1 __asm__ ("1") = (m0); \ + __asm__ ("mlr\t%0,%3" \ + : "=r" (__r0), "=r" (__r1) \ + : "r" (__r1), "r" (m1)); \ + (xh) = __r0; (xl) = __r1; \ + } while (0) +#endif /* if 0 */ +#endif +#if 0 +/* FIXME: this fails if gcc knows about the 64-bit registers. Use only + with a new enough processor pretending we have 32-bit registers. */ +#define udiv_qrnnd(q, r, n1, n0, d) \ + do { \ + union {UDItype __ll; \ + struct {USItype __h, __l;} __i; \ + } __x; \ + __x.__i.__h = n1; __x.__i.__l = n0; \ + __asm__ ("dlr\t%0,%2" \ + : "=r" (__x.__ll) \ + : "0" (__x.__ll), "r" (d)); \ + (q) = __x.__i.__l; (r) = __x.__i.__h; \ + } while (0) +#else +#define udiv_qrnnd(q, r, n1, n0, d) \ + do { \ + register USItype __r0 __asm__ ("0") = (n1); \ + register USItype __r1 __asm__ ("1") = (n0); \ + __asm__ ("dlr\t%0,%4" \ + : "=r" (__r0), "=r" (__r1) \ + : "r" (__r0), "r" (__r1), "r" (d)); \ + (q) = __r1; (r) = __r0; \ + } while (0) +#endif /* if 0 */ +#else /* if __zarch__ */ +/* FIXME: this fails if gcc knows about the 64-bit registers. 
*/ +#define smul_ppmm(xh, xl, m0, m1) \ + do { \ + union {DItype __ll; \ + struct {USItype __h, __l;} __i; \ + } __x; \ + __asm__ ("mr\t%0,%2" \ + : "=r" (__x.__ll) \ + : "%0" (m0), "r" (m1)); \ + (xh) = __x.__i.__h; (xl) = __x.__i.__l; \ + } while (0) +/* FIXME: this fails if gcc knows about the 64-bit registers. */ +#define sdiv_qrnnd(q, r, n1, n0, d) \ + do { \ + union {DItype __ll; \ + struct {USItype __h, __l;} __i; \ + } __x; \ + __x.__i.__h = n1; __x.__i.__l = n0; \ + __asm__ ("dr\t%0,%2" \ + : "=r" (__x.__ll) \ + : "0" (__x.__ll), "r" (d)); \ + (q) = __x.__i.__l; (r) = __x.__i.__h; \ + } while (0) +#endif /* if __zarch__ */ +#endif + +#if defined (__s390x__) && W_TYPE_SIZE == 64 +/* We need to cast operands with register constraints, otherwise their types + will be assumed to be SImode by gcc. For these machines, such operations + will insert a value into the low 32 bits, and leave the high 32 bits with + garbage. */ +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + do { \ + __asm__ ("algr\t%1,%5\n\talcgr\t%0,%3" \ + : "=r" (sh), "=&r" (sl) \ + : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)), \ + "%1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \ + } while (0) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + do { \ + __asm__ ("slgr\t%1,%5\n\tslbgr\t%0,%3" \ + : "=r" (sh), "=&r" (sl) \ + : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)), \ + "1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \ + } while (0) +#define umul_ppmm(xh, xl, m0, m1) \ + do { \ + union {unsigned int __attribute__ ((mode(TI))) __ll; \ + struct {UDItype __h, __l;} __i; \ + } __x; \ + __asm__ ("mlgr\t%0,%2" \ + : "=r" (__x.__ll) \ + : "%0" ((UDItype)(m0)), "r" ((UDItype)(m1))); \ + (xh) = __x.__i.__h; (xl) = __x.__i.__l; \ + } while (0) +#define udiv_qrnnd(q, r, n1, n0, d) \ + do { \ + union {unsigned int __attribute__ ((mode(TI))) __ll; \ + struct {UDItype __h, __l;} __i; \ + } __x; \ + __x.__i.__h = n1; __x.__i.__l = n0; \ + __asm__ ("dlgr\t%0,%2" \ + : "=r" (__x.__ll) \ + : "0" (__x.__ll), "r" ((UDItype)(d))); \ + (q) = __x.__i.__l; (r) = __x.__i.__h; \ + } while (0) +#if 0 /* FIXME: Enable for z10 (?) */ +#define count_leading_zeros(cnt, x) \ + do { \ + union {unsigned int __attribute__ ((mode(TI))) __ll; \ + struct {UDItype __h, __l;} __i; \ + } __clr_cnt; \ + __asm__ ("flogr\t%0,%1" \ + : "=r" (__clr_cnt.__ll) \ + : "r" (x) __CLOBBER_CC); \ + (cnt) = __clr_cnt.__i.__h; \ + } while (0) +#endif +#endif + +#if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32 +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ("addl %5,%k1\n\tadcl %3,%k0" \ + : "=r" (sh), "=&r" (sl) \ + : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ + "%1" ((USItype)(al)), "g" ((USItype)(bl))) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + __asm__ ("subl %5,%k1\n\tsbbl %3,%k0" \ + : "=r" (sh), "=&r" (sl) \ + : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ + "1" ((USItype)(al)), "g" ((USItype)(bl))) +#define umul_ppmm(w1, w0, u, v) \ + __asm__ ("mull %3" \ + : "=a" (w0), "=d" (w1) \ + : "%0" ((USItype)(u)), "rm" ((USItype)(v))) +#define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\ + __asm__ ("divl %4" /* stringification in K&R C */ \ + : "=a" (q), "=d" (r) \ + : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx))) + +#if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx +/* Pentium bsrl takes between 10 and 72 cycles depending where the most + significant 1 bit is, hence the use of the following alternatives. 
bsfl + is slow too, between 18 and 42 depending where the least significant 1 + bit is, so let the generic count_trailing_zeros below make use of the + count_leading_zeros here too. */ + +#if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE) +/* The following should be a fixed 14 or 15 cycles, but possibly plus an L1 + cache miss reading from __clz_tab. For P55 it's favoured over the float + below so as to avoid mixing MMX and x87, since the penalty for switching + between the two is about 100 cycles. + + The asm block sets __shift to -3 if the high 24 bits are clear, -2 for + 16, -1 for 8, or 0 otherwise. This could be written equivalently as + follows, but as of gcc 2.95.2 it results in conditional jumps. + + __shift = -(__n < 0x1000000); + __shift -= (__n < 0x10000); + __shift -= (__n < 0x100); + + The middle two sbbl and cmpl's pair, and with luck something gcc + generates might pair with the first cmpl and the last sbbl. The "32+1" + constant could be folded into __clz_tab[], but it doesn't seem worth + making a different table just for that. */ + +#define count_leading_zeros(c,n) \ + do { \ + USItype __n = (n); \ + USItype __shift; \ + __asm__ ("cmpl $0x1000000, %1\n" \ + "sbbl %0, %0\n" \ + "cmpl $0x10000, %1\n" \ + "sbbl $0, %0\n" \ + "cmpl $0x100, %1\n" \ + "sbbl $0, %0\n" \ + : "=&r" (__shift) : "r" (__n)); \ + __shift = __shift*8 + 24 + 1; \ + (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift]; \ + } while (0) +#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB +#define COUNT_LEADING_ZEROS_0 31 /* n==0 indistinguishable from n==1 */ + +#else /* ! pentiummmx || LONGLONG_STANDALONE */ +/* The following should be a fixed 14 cycles or so. Some scheduling + opportunities should be available between the float load/store too. This + sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is + apparently suggested by the Intel optimizing manual (don't know exactly + where). gcc 2.95 or up will be best for this, so the "double" is + correctly aligned on the stack. */ +#define count_leading_zeros(c,n) \ + do { \ + union { \ + double d; \ + unsigned a[2]; \ + } __u; \ + ASSERT ((n) != 0); \ + __u.d = (UWtype) (n); \ + (c) = 0x3FF + 31 - (__u.a[1] >> 20); \ + } while (0) +#define COUNT_LEADING_ZEROS_0 (0x3FF + 31) +#endif /* pentiummx */ + +#else /* ! pentium */ + +#if __GMP_GNUC_PREREQ (3,4) /* using bsrl */ +#define count_leading_zeros(count,x) count_leading_zeros_gcc_clz(count,x) +#endif /* gcc clz */ + +/* On P6, gcc prior to 3.0 generates a partial register stall for + __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former + being 1 code byte smaller. "31-__cbtmp" is a workaround, probably at the + cost of one extra instruction. Do this for "i386" too, since that means + generic x86. */ +#if ! 
defined (count_leading_zeros) && __GNUC__ < 3 \ + && (HAVE_HOST_CPU_i386 \ + || HAVE_HOST_CPU_i686 \ + || HAVE_HOST_CPU_pentiumpro \ + || HAVE_HOST_CPU_pentium2 \ + || HAVE_HOST_CPU_pentium3) +#define count_leading_zeros(count, x) \ + do { \ + USItype __cbtmp; \ + ASSERT ((x) != 0); \ + __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \ + (count) = 31 - __cbtmp; \ + } while (0) +#endif /* gcc<3 asm bsrl */ + +#ifndef count_leading_zeros +#define count_leading_zeros(count, x) \ + do { \ + USItype __cbtmp; \ + ASSERT ((x) != 0); \ + __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \ + (count) = __cbtmp ^ 31; \ + } while (0) +#endif /* asm bsrl */ + +#if __GMP_GNUC_PREREQ (3,4) /* using bsfl */ +#define count_trailing_zeros(count,x) count_trailing_zeros_gcc_ctz(count,x) +#endif /* gcc ctz */ + +#ifndef count_trailing_zeros +#define count_trailing_zeros(count, x) \ + do { \ + ASSERT ((x) != 0); \ + __asm__ ("bsfl %1,%k0" : "=r" (count) : "rm" ((USItype)(x))); \ + } while (0) +#endif /* asm bsfl */ + +#endif /* ! pentium */ + +#ifndef UMUL_TIME +#define UMUL_TIME 10 +#endif +#ifndef UDIV_TIME +#define UDIV_TIME 40 +#endif +#endif /* 80x86 */ + +#if defined (__amd64__) && W_TYPE_SIZE == 64 +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ("addq %5,%q1\n\tadcq %3,%q0" \ + : "=r" (sh), "=&r" (sl) \ + : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \ + "%1" ((UDItype)(al)), "rme" ((UDItype)(bl))) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + __asm__ ("subq %5,%q1\n\tsbbq %3,%q0" \ + : "=r" (sh), "=&r" (sl) \ + : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \ + "1" ((UDItype)(al)), "rme" ((UDItype)(bl))) +#define umul_ppmm(w1, w0, u, v) \ + __asm__ ("mulq %3" \ + : "=a" (w0), "=d" (w1) \ + : "%0" ((UDItype)(u)), "rm" ((UDItype)(v))) +#define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\ + __asm__ ("divq %4" /* stringification in K&R C */ \ + : "=a" (q), "=d" (r) \ + : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx))) +/* bsrq destination must be a 64-bit register, hence UDItype for __cbtmp. */ +#define count_leading_zeros(count, x) \ + do { \ + UDItype __cbtmp; \ + ASSERT ((x) != 0); \ + __asm__ ("bsrq %1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x))); \ + (count) = __cbtmp ^ 63; \ + } while (0) +/* bsfq destination must be a 64-bit register, "%q0" forces this in case + count is only an int. 
*/ +#define count_trailing_zeros(count, x) \ + do { \ + ASSERT ((x) != 0); \ + __asm__ ("bsfq %1,%q0" : "=r" (count) : "rm" ((UDItype)(x))); \ + } while (0) +#endif /* x86_64 */ + +#if defined (__i860__) && W_TYPE_SIZE == 32 +#define rshift_rhlc(r,h,l,c) \ + __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0" \ + "=r" (r) : "r" (h), "r" (l), "rn" (c)) +#endif /* i860 */ + +#if defined (__i960__) && W_TYPE_SIZE == 32 +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0" \ + : "=r" (sh), "=&r" (sl) \ + : "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl)) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0" \ + : "=r" (sh), "=&r" (sl) \ + : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl)) +#define umul_ppmm(w1, w0, u, v) \ + ({union {UDItype __ll; \ + struct {USItype __l, __h;} __i; \ + } __x; \ + __asm__ ("emul %2,%1,%0" \ + : "=d" (__x.__ll) : "%dI" (u), "dI" (v)); \ + (w1) = __x.__i.__h; (w0) = __x.__i.__l;}) +#define __umulsidi3(u, v) \ + ({UDItype __w; \ + __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v)); \ + __w; }) +#define udiv_qrnnd(q, r, nh, nl, d) \ + do { \ + union {UDItype __ll; \ + struct {USItype __l, __h;} __i; \ + } __nn; \ + __nn.__i.__h = (nh); __nn.__i.__l = (nl); \ + __asm__ ("ediv %d,%n,%0" \ + : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d)); \ + (r) = __rq.__i.__l; (q) = __rq.__i.__h; \ + } while (0) +#define count_leading_zeros(count, x) \ + do { \ + USItype __cbtmp; \ + __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x)); \ + (count) = __cbtmp ^ 31; \ + } while (0) +#define COUNT_LEADING_ZEROS_0 (-32) /* sic */ +#if defined (__i960mx) /* what is the proper symbol to test??? */ +#define rshift_rhlc(r,h,l,c) \ + do { \ + union {UDItype __ll; \ + struct {USItype __l, __h;} __i; \ + } __nn; \ + __nn.__i.__h = (h); __nn.__i.__l = (l); \ + __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c)); \ + } +#endif /* i960mx */ +#endif /* i960 */ + +#if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \ + || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \ + || defined (__mc5307__)) && W_TYPE_SIZE == 32 +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0" \ + : "=d" (sh), "=&d" (sl) \ + : "0" ((USItype)(ah)), "d" ((USItype)(bh)), \ + "%1" ((USItype)(al)), "g" ((USItype)(bl))) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0" \ + : "=d" (sh), "=&d" (sl) \ + : "0" ((USItype)(ah)), "d" ((USItype)(bh)), \ + "1" ((USItype)(al)), "g" ((USItype)(bl))) +/* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r. 
*/ +#if defined (__mc68020__) || defined(mc68020) \ + || defined (__mc68030__) || defined (mc68030) \ + || defined (__mc68040__) || defined (mc68040) \ + || defined (__mcpu32__) || defined (mcpu32) \ + || defined (__NeXT__) +#define umul_ppmm(w1, w0, u, v) \ + __asm__ ("mulu%.l %3,%1:%0" \ + : "=d" (w0), "=d" (w1) \ + : "%0" ((USItype)(u)), "dmi" ((USItype)(v))) +#define UMUL_TIME 45 +#define udiv_qrnnd(q, r, n1, n0, d) \ + __asm__ ("divu%.l %4,%1:%0" \ + : "=d" (q), "=d" (r) \ + : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d))) +#define UDIV_TIME 90 +#define sdiv_qrnnd(q, r, n1, n0, d) \ + __asm__ ("divs%.l %4,%1:%0" \ + : "=d" (q), "=d" (r) \ + : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d))) +#else /* for other 68k family members use 16x16->32 multiplication */ +#define umul_ppmm(xh, xl, a, b) \ + do { USItype __umul_tmp1, __umul_tmp2; \ + __asm__ ("| Inlined umul_ppmm\n" \ +" move%.l %5,%3\n" \ +" move%.l %2,%0\n" \ +" move%.w %3,%1\n" \ +" swap %3\n" \ +" swap %0\n" \ +" mulu%.w %2,%1\n" \ +" mulu%.w %3,%0\n" \ +" mulu%.w %2,%3\n" \ +" swap %2\n" \ +" mulu%.w %5,%2\n" \ +" add%.l %3,%2\n" \ +" jcc 1f\n" \ +" add%.l %#0x10000,%0\n" \ +"1: move%.l %2,%3\n" \ +" clr%.w %2\n" \ +" swap %2\n" \ +" swap %3\n" \ +" clr%.w %3\n" \ +" add%.l %3,%1\n" \ +" addx%.l %2,%0\n" \ +" | End inlined umul_ppmm" \ + : "=&d" (xh), "=&d" (xl), \ + "=d" (__umul_tmp1), "=&d" (__umul_tmp2) \ + : "%2" ((USItype)(a)), "d" ((USItype)(b))); \ + } while (0) +#define UMUL_TIME 100 +#define UDIV_TIME 400 +#endif /* not mc68020 */ +/* The '020, '030, '040 and '060 have bitfield insns. + GCC 3.4 defines __mc68020__ when in CPU32 mode, check for __mcpu32__ to + exclude bfffo on that chip (bitfield insns not available). */ +#if (defined (__mc68020__) || defined (mc68020) \ + || defined (__mc68030__) || defined (mc68030) \ + || defined (__mc68040__) || defined (mc68040) \ + || defined (__mc68060__) || defined (mc68060) \ + || defined (__NeXT__)) \ + && ! 
defined (__mcpu32__) +#define count_leading_zeros(count, x) \ + __asm__ ("bfffo %1{%b2:%b2},%0" \ + : "=d" (count) \ + : "od" ((USItype) (x)), "n" (0)) +#define COUNT_LEADING_ZEROS_0 32 +#endif +#endif /* mc68000 */ + +#if defined (__m88000__) && W_TYPE_SIZE == 32 +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3" \ + : "=r" (sh), "=&r" (sl) \ + : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl)) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3" \ + : "=r" (sh), "=&r" (sl) \ + : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl)) +#define count_leading_zeros(count, x) \ + do { \ + USItype __cbtmp; \ + __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x)); \ + (count) = __cbtmp ^ 31; \ + } while (0) +#define COUNT_LEADING_ZEROS_0 63 /* sic */ +#if defined (__m88110__) +#define umul_ppmm(wh, wl, u, v) \ + do { \ + union {UDItype __ll; \ + struct {USItype __h, __l;} __i; \ + } __x; \ + __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v)); \ + (wh) = __x.__i.__h; \ + (wl) = __x.__i.__l; \ + } while (0) +#define udiv_qrnnd(q, r, n1, n0, d) \ + ({union {UDItype __ll; \ + struct {USItype __h, __l;} __i; \ + } __x, __q; \ + __x.__i.__h = (n1); __x.__i.__l = (n0); \ + __asm__ ("divu.d %0,%1,%2" \ + : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d)); \ + (r) = (n0) - __q.__l * (d); (q) = __q.__l; }) +#define UMUL_TIME 5 +#define UDIV_TIME 25 +#else +#define UMUL_TIME 17 +#define UDIV_TIME 150 +#endif /* __m88110__ */ +#endif /* __m88000__ */ + +#if defined (__mips) && W_TYPE_SIZE == 32 +#if __GMP_GNUC_PREREQ (4,4) +#define umul_ppmm(w1, w0, u, v) \ + do { \ + UDItype __ll = (UDItype)(u) * (v); \ + w1 = __ll >> 32; \ + w0 = __ll; \ + } while (0) +#endif +#if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) +#define umul_ppmm(w1, w0, u, v) \ + __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v)) +#endif +#if !defined (umul_ppmm) +#define umul_ppmm(w1, w0, u, v) \ + __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1" \ + : "=d" (w0), "=d" (w1) : "d" (u), "d" (v)) +#endif +#define UMUL_TIME 10 +#define UDIV_TIME 100 +#endif /* __mips */ + +#if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64 +#if __GMP_GNUC_PREREQ (4,4) +#define umul_ppmm(w1, w0, u, v) \ + do { \ + typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \ + __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \ + w1 = __ll >> 64; \ + w0 = __ll; \ + } while (0) +#endif +#if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) +#define umul_ppmm(w1, w0, u, v) \ + __asm__ ("dmultu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v)) +#endif +#if !defined (umul_ppmm) +#define umul_ppmm(w1, w0, u, v) \ + __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1" \ + : "=d" (w0), "=d" (w1) : "d" (u), "d" (v)) +#endif +#define UMUL_TIME 20 +#define UDIV_TIME 140 +#endif /* __mips */ + +#if defined (__mmix__) && W_TYPE_SIZE == 64 +#define umul_ppmm(w1, w0, u, v) \ + __asm__ ("MULU %0,%2,%3" : "=r" (w0), "=z" (w1) : "r" (u), "r" (v)) +#endif + +#if defined (__ns32000__) && W_TYPE_SIZE == 32 +#define umul_ppmm(w1, w0, u, v) \ + ({union {UDItype __ll; \ + struct {USItype __l, __h;} __i; \ + } __x; \ + __asm__ ("meid %2,%0" \ + : "=g" (__x.__ll) \ + : "%0" ((USItype)(u)), "g" ((USItype)(v))); \ + (w1) = __x.__i.__h; (w0) = __x.__i.__l;}) +#define __umulsidi3(u, v) \ + ({UDItype __w; \ + __asm__ ("meid %2,%0" \ + : "=g" (__w) \ + : "%0" ((USItype)(u)), "g" ((USItype)(v))); \ + __w; }) +#define udiv_qrnnd(q, r, n1, n0, d) \ + ({union {UDItype __ll; \ + struct {USItype __l, 
__h;} __i; \ + } __x; \ + __x.__i.__h = (n1); __x.__i.__l = (n0); \ + __asm__ ("deid %2,%0" \ + : "=g" (__x.__ll) \ + : "0" (__x.__ll), "g" ((USItype)(d))); \ + (r) = __x.__i.__l; (q) = __x.__i.__h; }) +#define count_trailing_zeros(count,x) \ + do { \ + __asm__ ("ffsd %2,%0" \ + : "=r" (count) \ + : "0" ((USItype) 0), "r" ((USItype) (x))); \ + } while (0) +#endif /* __ns32000__ */ + +/* In the past we had a block of various #defines tested + _ARCH_PPC - AIX + _ARCH_PWR - AIX + __powerpc__ - gcc + __POWERPC__ - BEOS + __ppc__ - Darwin + PPC - old gcc, GNU/Linux, SysV + The plain PPC test was not good for vxWorks, since PPC is defined on all + CPUs there (eg. m68k too), as a constant one is expected to compare + CPU_FAMILY against. + + At any rate, this was pretty unattractive and a bit fragile. The use of + HAVE_HOST_CPU_FAMILY is designed to cut through it all and be sure of + getting the desired effect. + + ENHANCE-ME: We should test _IBMR2 here when we add assembly support for + the system vendor compilers. (Is that vendor compilers with inline asm, + or what?) */ + +#if (HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc) \ + && W_TYPE_SIZE == 32 +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + do { \ + if (__builtin_constant_p (bh) && (bh) == 0) \ + __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2" \ + : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\ + else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \ + __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2" \ + : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\ + else \ + __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3" \ + : "=r" (sh), "=&r" (sl) \ + : "r" (ah), "r" (bh), "%r" (al), "rI" (bl)); \ + } while (0) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + do { \ + if (__builtin_constant_p (ah) && (ah) == 0) \ + __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2" \ + : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\ + else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0) \ + __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2" \ + : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\ + else if (__builtin_constant_p (bh) && (bh) == 0) \ + __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2" \ + : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\ + else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \ + __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2" \ + : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\ + else \ + __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2" \ + : "=r" (sh), "=&r" (sl) \ + : "r" (ah), "r" (bh), "rI" (al), "r" (bl)); \ + } while (0) +#define count_leading_zeros(count, x) \ + __asm__ ("{cntlz|cntlzw} %0,%1" : "=r" (count) : "r" (x)) +#define COUNT_LEADING_ZEROS_0 32 +#if HAVE_HOST_CPU_FAMILY_powerpc +#if __GMP_GNUC_PREREQ (4,4) +#define umul_ppmm(w1, w0, u, v) \ + do { \ + UDItype __ll = (UDItype)(u) * (v); \ + w1 = __ll >> 32; \ + w0 = __ll; \ + } while (0) +#endif +#if !defined (umul_ppmm) +#define umul_ppmm(ph, pl, m0, m1) \ + do { \ + USItype __m0 = (m0), __m1 = (m1); \ + __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \ + (pl) = __m0 * __m1; \ + } while (0) +#endif +#define UMUL_TIME 15 +#define smul_ppmm(ph, pl, m0, m1) \ + do { \ + SItype __m0 = (m0), __m1 = (m1); \ + __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \ + (pl) = __m0 * __m1; \ + } while (0) +#define SMUL_TIME 14 +#define UDIV_TIME 120 +#else +#define UMUL_TIME 8 +#define 
smul_ppmm(xh, xl, m0, m1) \ + __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1)) +#define SMUL_TIME 4 +#define sdiv_qrnnd(q, r, nh, nl, d) \ + __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d)) +#define UDIV_TIME 100 +#endif +#endif /* 32-bit POWER architecture variants. */ + +/* We should test _IBMR2 here when we add assembly support for the system + vendor compilers. */ +#if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64 +#if !defined (_LONG_LONG_LIMB) +/* _LONG_LONG_LIMB is ABI=mode32 where adde operates on 32-bit values. So + use adde etc only when not _LONG_LONG_LIMB. */ +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + do { \ + if (__builtin_constant_p (bh) && (bh) == 0) \ + __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2" \ + : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\ + else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \ + __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2" \ + : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\ + else \ + __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3" \ + : "=r" (sh), "=&r" (sl) \ + : "r" (ah), "r" (bh), "%r" (al), "rI" (bl)); \ + } while (0) +/* We use "*rI" for the constant operand here, since with just "I", gcc barfs. + This might seem strange, but gcc folds away the dead code late. */ +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + do { \ + if (__builtin_constant_p (bl) && bl > -0x8000 && bl <= 0x8000) { \ + if (__builtin_constant_p (ah) && (ah) == 0) \ + __asm__ ("{ai|addic} %1,%3,%4\n\t{sfze|subfze} %0,%2" \ + : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "*rI" (-bl)); \ + else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0) \ + __asm__ ("{ai|addic} %1,%3,%4\n\t{sfme|subfme} %0,%2" \ + : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "*rI" (-bl)); \ + else if (__builtin_constant_p (bh) && (bh) == 0) \ + __asm__ ("{ai|addic} %1,%3,%4\n\t{ame|addme} %0,%2" \ + : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "*rI" (-bl)); \ + else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \ + __asm__ ("{ai|addic} %1,%3,%4\n\t{aze|addze} %0,%2" \ + : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "*rI" (-bl)); \ + else \ + __asm__ ("{ai|addic} %1,%4,%5\n\t{sfe|subfe} %0,%3,%2" \ + : "=r" (sh), "=&r" (sl) \ + : "r" (ah), "r" (bh), "rI" (al), "*rI" (-bl)); \ + } else { \ + if (__builtin_constant_p (ah) && (ah) == 0) \ + __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2" \ + : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl)); \ + else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0) \ + __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2" \ + : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl)); \ + else if (__builtin_constant_p (bh) && (bh) == 0) \ + __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2" \ + : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl)); \ + else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \ + __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2" \ + : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl)); \ + else \ + __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2" \ + : "=r" (sh), "=&r" (sl) \ + : "r" (ah), "r" (bh), "rI" (al), "r" (bl)); \ + } \ + } while (0) +#endif /* ! _LONG_LONG_LIMB */ +#define count_leading_zeros(count, x) \ + __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x)) +#define COUNT_LEADING_ZEROS_0 64 +#if 0 && __GMP_GNUC_PREREQ (4,4) /* Disable, this results in libcalls! 
*/ +#define umul_ppmm(w1, w0, u, v) \ + do { \ + typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \ + __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \ + w1 = __ll >> 64; \ + w0 = __ll; \ + } while (0) +#endif +#if !defined (umul_ppmm) +#define umul_ppmm(ph, pl, m0, m1) \ + do { \ + UDItype __m0 = (m0), __m1 = (m1); \ + __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \ + (pl) = __m0 * __m1; \ + } while (0) +#endif +#define UMUL_TIME 15 +#define smul_ppmm(ph, pl, m0, m1) \ + do { \ + DItype __m0 = (m0), __m1 = (m1); \ + __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \ + (pl) = __m0 * __m1; \ + } while (0) +#define SMUL_TIME 14 /* ??? */ +#define UDIV_TIME 120 /* ??? */ +#endif /* 64-bit PowerPC. */ + +#if defined (__pyr__) && W_TYPE_SIZE == 32 +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ("addw %5,%1\n\taddwc %3,%0" \ + : "=r" (sh), "=&r" (sl) \ + : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ + "%1" ((USItype)(al)), "g" ((USItype)(bl))) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + __asm__ ("subw %5,%1\n\tsubwb %3,%0" \ + : "=r" (sh), "=&r" (sl) \ + : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ + "1" ((USItype)(al)), "g" ((USItype)(bl))) +/* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP. */ +#define umul_ppmm(w1, w0, u, v) \ + ({union {UDItype __ll; \ + struct {USItype __h, __l;} __i; \ + } __x; \ + __asm__ ("movw %1,%R0\n\tuemul %2,%0" \ + : "=&r" (__x.__ll) \ + : "g" ((USItype) (u)), "g" ((USItype)(v))); \ + (w1) = __x.__i.__h; (w0) = __x.__i.__l;}) +#endif /* __pyr__ */ + +#if defined (__ibm032__) /* RT/ROMP */ && W_TYPE_SIZE == 32 +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ("a %1,%5\n\tae %0,%3" \ + : "=r" (sh), "=&r" (sl) \ + : "0" ((USItype)(ah)), "r" ((USItype)(bh)), \ + "%1" ((USItype)(al)), "r" ((USItype)(bl))) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + __asm__ ("s %1,%5\n\tse %0,%3" \ + : "=r" (sh), "=&r" (sl) \ + : "0" ((USItype)(ah)), "r" ((USItype)(bh)), \ + "1" ((USItype)(al)), "r" ((USItype)(bl))) +#define smul_ppmm(ph, pl, m0, m1) \ + __asm__ ( \ + "s r2,r2\n" \ +" mts r10,%2\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" cas %0,r2,r0\n" \ +" mfs r10,%1" \ + : "=r" (ph), "=r" (pl) \ + : "%r" ((USItype)(m0)), "r" ((USItype)(m1)) \ + : "r2") +#define UMUL_TIME 20 +#define UDIV_TIME 200 +#define count_leading_zeros(count, x) \ + do { \ + if ((x) >= 0x10000) \ + __asm__ ("clz %0,%1" \ + : "=r" (count) : "r" ((USItype)(x) >> 16)); \ + else \ + { \ + __asm__ ("clz %0,%1" \ + : "=r" (count) : "r" ((USItype)(x))); \ + (count) += 16; \ + } \ + } while (0) +#endif /* RT/ROMP */ + +#if (defined (__SH2__) || defined (__SH3__) || defined (__SH4__)) && W_TYPE_SIZE == 32 +#define umul_ppmm(w1, w0, u, v) \ + __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0" \ + : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach") +#define UMUL_TIME 5 +#endif + +#if defined (__sparc__) && W_TYPE_SIZE == 32 +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0" \ + : "=r" (sh), "=&r" (sl) \ + : "rJ" (ah), "rI" (bh),"%rJ" (al), "rI" (bl) \ + __CLOBBER_CC) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0" \ + : "=r" (sh), "=&r" (sl) \ + : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl) \ + __CLOBBER_CC) +/* FIXME: When 
gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h + doesn't define anything to indicate that to us, it only sets __sparcv8. */ +#if defined (__sparc_v9__) || defined (__sparcv9) +/* Perhaps we should use floating-point operations here? */ +#if 0 +/* Triggers a bug making mpz/tests/t-gcd.c fail. + Perhaps we simply need explicitly zero-extend the inputs? */ +#define umul_ppmm(w1, w0, u, v) \ + __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" : \ + "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1") +#else +/* Use v8 umul until above bug is fixed. */ +#define umul_ppmm(w1, w0, u, v) \ + __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v)) +#endif +/* Use a plain v8 divide for v9. */ +#define udiv_qrnnd(q, r, n1, n0, d) \ + do { \ + USItype __q; \ + __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \ + : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \ + (r) = (n0) - __q * (d); \ + (q) = __q; \ + } while (0) +#else +#if defined (__sparc_v8__) /* gcc normal */ \ + || defined (__sparcv8) /* gcc solaris */ \ + || HAVE_HOST_CPU_supersparc +/* Don't match immediate range because, 1) it is not often useful, + 2) the 'I' flag thinks of the range as a 13 bit signed interval, + while we want to match a 13 bit interval, sign extended to 32 bits, + but INTERPRETED AS UNSIGNED. */ +#define umul_ppmm(w1, w0, u, v) \ + __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v)) +#define UMUL_TIME 5 + +#if HAVE_HOST_CPU_supersparc +#define UDIV_TIME 60 /* SuperSPARC timing */ +#else +/* Don't use this on SuperSPARC because its udiv only handles 53 bit + dividends and will trap to the kernel for the rest. */ +#define udiv_qrnnd(q, r, n1, n0, d) \ + do { \ + USItype __q; \ + __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \ + : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \ + (r) = (n0) - __q * (d); \ + (q) = __q; \ + } while (0) +#define UDIV_TIME 25 +#endif /* HAVE_HOST_CPU_supersparc */ + +#else /* ! __sparc_v8__ */ +#if defined (__sparclite__) +/* This has hardware multiply but not divide. It also has two additional + instructions scan (ffs from high bit) and divscc. */ +#define umul_ppmm(w1, w0, u, v) \ + __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v)) +#define UMUL_TIME 5 +#define udiv_qrnnd(q, r, n1, n0, d) \ + __asm__ ("! Inlined udiv_qrnnd\n" \ +" wr %%g0,%2,%%y ! Not a delayed write for sparclite\n" \ +" tst %%g0\n" \ +" divscc %3,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%0\n" \ +" rd %%y,%1\n" \ +" bl,a 1f\n" \ +" add %1,%4,%1\n" \ +"1: ! 
End of inline udiv_qrnnd" \ + : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d) \ + : "%g1" __AND_CLOBBER_CC) +#define UDIV_TIME 37 +#define count_leading_zeros(count, x) \ + __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x)) +/* Early sparclites return 63 for an argument of 0, but they warn that future + implementations might change this. Therefore, leave COUNT_LEADING_ZEROS_0 + undefined. */ +#endif /* __sparclite__ */ +#endif /* __sparc_v8__ */ +#endif /* __sparc_v9__ */ +/* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd. */ +#ifndef umul_ppmm +#define umul_ppmm(w1, w0, u, v) \ + __asm__ ("! Inlined umul_ppmm\n" \ +" wr %%g0,%2,%%y ! SPARC has 0-3 delay insn after a wr\n" \ +" sra %3,31,%%g2 ! Don't move this insn\n" \ +" and %2,%%g2,%%g2 ! Don't move this insn\n" \ +" andcc %%g0,0,%%g1 ! Don't move this insn\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,0,%%g1\n" \ +" add %%g1,%%g2,%0\n" \ +" rd %%y,%1" \ + : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v) \ + : "%g1", "%g2" __AND_CLOBBER_CC) +#define UMUL_TIME 39 /* 39 instructions */ +#endif +#ifndef udiv_qrnnd +#ifndef LONGLONG_STANDALONE +#define udiv_qrnnd(q, r, n1, n0, d) \ + do { UWtype __r; \ + (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \ + (r) = __r; \ + } while (0) +extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype); +#ifndef UDIV_TIME +#define UDIV_TIME 140 +#endif +#endif /* LONGLONG_STANDALONE */ +#endif /* udiv_qrnnd */ +#endif /* __sparc__ */ + +#if defined (__sparc__) && W_TYPE_SIZE == 64 +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ( \ + "addcc %r4,%5,%1\n" \ + " addccc %r6,%7,%%g0\n" \ + " addc %r2,%3,%0" \ + : "=r" (sh), "=&r" (sl) \ + : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl), \ + "%rJ" ((al) >> 32), "rI" ((bl) >> 32) \ + __CLOBBER_CC) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + __asm__ ( \ + "subcc %r4,%5,%1\n" \ + " subccc %r6,%7,%%g0\n" \ + " subc %r2,%3,%0" \ + : "=r" (sh), "=&r" (sl) \ + : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl), \ + "rJ" ((al) >> 32), "rI" ((bl) >> 32) \ + __CLOBBER_CC) +#endif + +#if defined (__vax__) && W_TYPE_SIZE == 32 +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ("addl2 %5,%1\n\tadwc %3,%0" \ + : "=g" (sh), "=&g" (sl) \ + : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ + "%1" ((USItype)(al)), "g" ((USItype)(bl))) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + __asm__ ("subl2 %5,%1\n\tsbwc %3,%0" \ + : "=g" (sh), "=&g" (sl) \ + : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \ + "1" ((USItype)(al)), "g" ((USItype)(bl))) +#define smul_ppmm(xh, xl, m0, m1) \ + do { \ + union {UDItype __ll; \ + struct {USItype __l, __h;} __i; \ + } __x; \ + USItype __m0 = (m0), __m1 = (m1); \ + __asm__ 
("emul %1,%2,$0,%0" \ + : "=g" (__x.__ll) : "g" (__m0), "g" (__m1)); \ + (xh) = __x.__i.__h; (xl) = __x.__i.__l; \ + } while (0) +#define sdiv_qrnnd(q, r, n1, n0, d) \ + do { \ + union {DItype __ll; \ + struct {SItype __l, __h;} __i; \ + } __x; \ + __x.__i.__h = n1; __x.__i.__l = n0; \ + __asm__ ("ediv %3,%2,%0,%1" \ + : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d)); \ + } while (0) +#if 0 +/* FIXME: This instruction appears to be unimplemented on some systems (vax + 8800 maybe). */ +#define count_trailing_zeros(count,x) \ + do { \ + __asm__ ("ffs 0, 31, %1, %0" \ + : "=g" (count) \ + : "g" ((USItype) (x))); \ + } while (0) +#endif +#endif /* __vax__ */ + +#if defined (__z8000__) && W_TYPE_SIZE == 16 +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ("add %H1,%H5\n\tadc %H0,%H3" \ + : "=r" (sh), "=&r" (sl) \ + : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \ + "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl))) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + __asm__ ("sub %H1,%H5\n\tsbc %H0,%H3" \ + : "=r" (sh), "=&r" (sl) \ + : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \ + "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl))) +#define umul_ppmm(xh, xl, m0, m1) \ + do { \ + union {long int __ll; \ + struct {unsigned int __h, __l;} __i; \ + } __x; \ + unsigned int __m0 = (m0), __m1 = (m1); \ + __asm__ ("mult %S0,%H3" \ + : "=r" (__x.__i.__h), "=r" (__x.__i.__l) \ + : "%1" (m0), "rQR" (m1)); \ + (xh) = __x.__i.__h; (xl) = __x.__i.__l; \ + (xh) += ((((signed int) __m0 >> 15) & __m1) \ + + (((signed int) __m1 >> 15) & __m0)); \ + } while (0) +#endif /* __z8000__ */ + +#endif /* __GNUC__ */ + +#endif /* NO_ASM */ + + +/* FIXME: "sidi" here is highly doubtful, should sometimes be "diti". */ +#if !defined (umul_ppmm) && defined (__umulsidi3) +#define umul_ppmm(ph, pl, m0, m1) \ + { \ + UDWtype __ll = __umulsidi3 (m0, m1); \ + ph = (UWtype) (__ll >> W_TYPE_SIZE); \ + pl = (UWtype) __ll; \ + } +#endif + +#if !defined (__umulsidi3) +#define __umulsidi3(u, v) \ + ({UWtype __hi, __lo; \ + umul_ppmm (__hi, __lo, u, v); \ + ((UDWtype) __hi << W_TYPE_SIZE) | __lo; }) +#endif + + +/* Use mpn_umul_ppmm or mpn_udiv_qrnnd functions, if they exist. The "_r" + forms have "reversed" arguments, meaning the pointer is last, which + sometimes allows better parameter passing, in particular on 64-bit + hppa. */ + +#define mpn_umul_ppmm __MPN(umul_ppmm) +extern UWtype mpn_umul_ppmm (UWtype *, UWtype, UWtype); + +#if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm \ + && ! defined (LONGLONG_STANDALONE) +#define umul_ppmm(wh, wl, u, v) \ + do { \ + UWtype __umul_ppmm__p0; \ + (wh) = mpn_umul_ppmm (&__umul_ppmm__p0, (UWtype) (u), (UWtype) (v)); \ + (wl) = __umul_ppmm__p0; \ + } while (0) +#endif + +#define mpn_umul_ppmm_r __MPN(umul_ppmm_r) +extern UWtype mpn_umul_ppmm_r (UWtype, UWtype, UWtype *); + +#if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm_r \ + && ! defined (LONGLONG_STANDALONE) +#define umul_ppmm(wh, wl, u, v) \ + do { \ + UWtype __umul_ppmm__p0; \ + (wh) = mpn_umul_ppmm_r ((UWtype) (u), (UWtype) (v), &__umul_ppmm__p0); \ + (wl) = __umul_ppmm__p0; \ + } while (0) +#endif + +#define mpn_udiv_qrnnd __MPN(udiv_qrnnd) +extern UWtype mpn_udiv_qrnnd (UWtype *, UWtype, UWtype, UWtype); + +#if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd \ + && ! 
defined (LONGLONG_STANDALONE) +#define udiv_qrnnd(q, r, n1, n0, d) \ + do { \ + UWtype __udiv_qrnnd__r; \ + (q) = mpn_udiv_qrnnd (&__udiv_qrnnd__r, \ + (UWtype) (n1), (UWtype) (n0), (UWtype) d); \ + (r) = __udiv_qrnnd__r; \ + } while (0) +#endif + +#define mpn_udiv_qrnnd_r __MPN(udiv_qrnnd_r) +extern UWtype mpn_udiv_qrnnd_r (UWtype, UWtype, UWtype, UWtype *); + +#if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd_r \ + && ! defined (LONGLONG_STANDALONE) +#define udiv_qrnnd(q, r, n1, n0, d) \ + do { \ + UWtype __udiv_qrnnd__r; \ + (q) = mpn_udiv_qrnnd_r ((UWtype) (n1), (UWtype) (n0), (UWtype) d, \ + &__udiv_qrnnd__r); \ + (r) = __udiv_qrnnd__r; \ + } while (0) +#endif + + +/* If this machine has no inline assembler, use C macros. */ + +#if !defined (add_ssaaaa) +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + do { \ + UWtype __x; \ + __x = (al) + (bl); \ + (sh) = (ah) + (bh) + (__x < (al)); \ + (sl) = __x; \ + } while (0) +#endif + +#if !defined (sub_ddmmss) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + do { \ + UWtype __x; \ + __x = (al) - (bl); \ + (sh) = (ah) - (bh) - ((al) < (bl)); \ + (sl) = __x; \ + } while (0) +#endif + +/* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of + smul_ppmm. */ +#if !defined (umul_ppmm) && defined (smul_ppmm) +#define umul_ppmm(w1, w0, u, v) \ + do { \ + UWtype __w1; \ + UWtype __xm0 = (u), __xm1 = (v); \ + smul_ppmm (__w1, w0, __xm0, __xm1); \ + (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \ + + (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \ + } while (0) +#endif + +/* If we still don't have umul_ppmm, define it using plain C. + + For reference, when this code is used for squaring (ie. u and v identical + expressions), gcc recognises __x1 and __x2 are the same and generates 3 + multiplies, not 4. The subsequent additions could be optimized a bit, + but the only place GMP currently uses such a square is mpn_sqr_basecase, + and chips obliged to use this generic C umul will have plenty of worse + performance problems than a couple of extra instructions on the diagonal + of sqr_basecase. */ + +#if !defined (umul_ppmm) +#define umul_ppmm(w1, w0, u, v) \ + do { \ + UWtype __x0, __x1, __x2, __x3; \ + UHWtype __ul, __vl, __uh, __vh; \ + UWtype __u = (u), __v = (v); \ + \ + __ul = __ll_lowpart (__u); \ + __uh = __ll_highpart (__u); \ + __vl = __ll_lowpart (__v); \ + __vh = __ll_highpart (__v); \ + \ + __x0 = (UWtype) __ul * __vl; \ + __x1 = (UWtype) __ul * __vh; \ + __x2 = (UWtype) __uh * __vl; \ + __x3 = (UWtype) __uh * __vh; \ + \ + __x1 += __ll_highpart (__x0);/* this can't give carry */ \ + __x1 += __x2; /* but this indeed can */ \ + if (__x1 < __x2) /* did we get it? */ \ + __x3 += __ll_B; /* yes, add it in the proper pos. */ \ + \ + (w1) = __x3 + __ll_highpart (__x1); \ + (w0) = (__x1 << W_TYPE_SIZE/2) + __ll_lowpart (__x0); \ + } while (0) +#endif + +/* If we don't have smul_ppmm, define it using umul_ppmm (which surely will + exist in one form or another. */ +#if !defined (smul_ppmm) +#define smul_ppmm(w1, w0, u, v) \ + do { \ + UWtype __w1; \ + UWtype __xm0 = (u), __xm1 = (v); \ + umul_ppmm (__w1, w0, __xm0, __xm1); \ + (w1) = __w1 - (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \ + - (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \ + } while (0) +#endif + +/* Define this unconditionally, so it can be used for debugging. 
*/ +#define __udiv_qrnnd_c(q, r, n1, n0, d) \ + do { \ + UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m; \ + \ + ASSERT ((d) != 0); \ + ASSERT ((n1) < (d)); \ + \ + __d1 = __ll_highpart (d); \ + __d0 = __ll_lowpart (d); \ + \ + __q1 = (n1) / __d1; \ + __r1 = (n1) - __q1 * __d1; \ + __m = __q1 * __d0; \ + __r1 = __r1 * __ll_B | __ll_highpart (n0); \ + if (__r1 < __m) \ + { \ + __q1--, __r1 += (d); \ + if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\ + if (__r1 < __m) \ + __q1--, __r1 += (d); \ + } \ + __r1 -= __m; \ + \ + __q0 = __r1 / __d1; \ + __r0 = __r1 - __q0 * __d1; \ + __m = __q0 * __d0; \ + __r0 = __r0 * __ll_B | __ll_lowpart (n0); \ + if (__r0 < __m) \ + { \ + __q0--, __r0 += (d); \ + if (__r0 >= (d)) \ + if (__r0 < __m) \ + __q0--, __r0 += (d); \ + } \ + __r0 -= __m; \ + \ + (q) = __q1 * __ll_B | __q0; \ + (r) = __r0; \ + } while (0) + +/* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through + __udiv_w_sdiv (defined in libgcc or elsewhere). */ +#if !defined (udiv_qrnnd) && defined (sdiv_qrnnd) +#define udiv_qrnnd(q, r, nh, nl, d) \ + do { \ + UWtype __r; \ + (q) = __MPN(udiv_w_sdiv) (&__r, nh, nl, d); \ + (r) = __r; \ + } while (0) +__GMP_DECLSPEC UWtype __MPN(udiv_w_sdiv) (UWtype *, UWtype, UWtype, UWtype); +#endif + +/* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c. */ +#if !defined (udiv_qrnnd) +#define UDIV_NEEDS_NORMALIZATION 1 +#define udiv_qrnnd __udiv_qrnnd_c +#endif + +#if !defined (count_leading_zeros) +#define count_leading_zeros(count, x) \ + do { \ + UWtype __xr = (x); \ + UWtype __a; \ + \ + if (W_TYPE_SIZE == 32) \ + { \ + __a = __xr < ((UWtype) 1 << 2*__BITS4) \ + ? (__xr < ((UWtype) 1 << __BITS4) ? 1 : __BITS4 + 1) \ + : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 + 1 \ + : 3*__BITS4 + 1); \ + } \ + else \ + { \ + for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8) \ + if (((__xr >> __a) & 0xff) != 0) \ + break; \ + ++__a; \ + } \ + \ + (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a]; \ + } while (0) +/* This version gives a well-defined value for zero. */ +#define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1) +#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB +#define COUNT_LEADING_ZEROS_SLOW +#endif + +/* clz_tab needed by mpn/x86/pentium/mod_1.asm in a fat binary */ +#if HAVE_HOST_CPU_FAMILY_x86 && WANT_FAT_BINARY +#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB +#endif + +#ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB +extern const unsigned char __GMP_DECLSPEC __clz_tab[129]; +#endif + +#if !defined (count_trailing_zeros) +#if !defined (COUNT_LEADING_ZEROS_SLOW) +/* Define count_trailing_zeros using an asm count_leading_zeros. */ +#define count_trailing_zeros(count, x) \ + do { \ + UWtype __ctz_x = (x); \ + UWtype __ctz_c; \ + ASSERT (__ctz_x != 0); \ + count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x); \ + (count) = W_TYPE_SIZE - 1 - __ctz_c; \ + } while (0) +#else +/* Define count_trailing_zeros in plain C, assuming small counts are common. + We use clz_tab without ado, since the C count_leading_zeros above will have + pulled it in. 
*/ +#define count_trailing_zeros(count, x) \ + do { \ + UWtype __ctz_x = (x); \ + int __ctz_c; \ + \ + if (LIKELY ((__ctz_x & 0xff) != 0)) \ + (count) = __clz_tab[__ctz_x & -__ctz_x] - 2; \ + else \ + { \ + for (__ctz_c = 8 - 2; __ctz_c < W_TYPE_SIZE - 2; __ctz_c += 8) \ + { \ + __ctz_x >>= 8; \ + if (LIKELY ((__ctz_x & 0xff) != 0)) \ + break; \ + } \ + \ + (count) = __ctz_c + __clz_tab[__ctz_x & -__ctz_x]; \ + } \ + } while (0) +#endif +#endif + +#ifndef UDIV_NEEDS_NORMALIZATION +#define UDIV_NEEDS_NORMALIZATION 0 +#endif + +/* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, and + that hence the latter should always be used. */ +#ifndef UDIV_PREINV_ALWAYS +#define UDIV_PREINV_ALWAYS 0 +#endif + +/* Give defaults for UMUL_TIME and UDIV_TIME. */ +#ifndef UMUL_TIME +#define UMUL_TIME 1 +#endif + +#ifndef UDIV_TIME +#define UDIV_TIME UMUL_TIME +#endif
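
The plain C add_ssaaaa and sub_ddmmss fallbacks above rely on the fact that unsigned arithmetic wraps: the low-word sum is smaller than an addend exactly when a carry occurred. A minimal standalone sketch of that idiom follows; it is an illustration only, not part of longlong.h, and it assumes a compiler providing unsigned __int128 for the reference result.

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  uint64_t ah = 0x0123456789abcdefu, al = 0xfedcba9876543210u;
  uint64_t bh = 0x0f0f0f0f0f0f0f0fu, bl = 0xf0f0f0f0f0f0f0f0u;

  /* Double-word addition with the (sum < addend) carry test.  */
  uint64_t sl = al + bl;              /* wraps modulo 2^64 on overflow */
  uint64_t sh = ah + bh + (sl < al);  /* (sl < al) is the carry out of sl */

  /* Reference result via unsigned __int128 (a GCC/clang extension).  */
  unsigned __int128 a = ((unsigned __int128) ah << 64) | al;
  unsigned __int128 b = ((unsigned __int128) bh << 64) | bl;
  unsigned __int128 s = a + b;        /* sum modulo 2^128 */

  assert (sh == (uint64_t) (s >> 64) && sl == (uint64_t) s);
  return 0;
}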
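
The generic C umul_ppmm splits each word into half-words, forms the four cross products, and folds the one possible carry out of the middle sum into the high product. The sketch below applies the same scheme to a 64-bit word; it is not part of longlong.h, umul_ppmm_generic is an illustrative name, and unsigned __int128 is assumed to be available for the self-check.

#include <assert.h>
#include <inttypes.h>
#include <stdio.h>

/* Full 128-bit product hi:lo of u and v by the half-word cross-product
   scheme of the generic umul_ppmm above (illustration only).  */
static void
umul_ppmm_generic (uint64_t *w1, uint64_t *w0, uint64_t u, uint64_t v)
{
  uint64_t ul = u & 0xffffffffu, uh = u >> 32;
  uint64_t vl = v & 0xffffffffu, vh = v >> 32;

  uint64_t x0 = ul * vl;             /* low  * low  */
  uint64_t x1 = ul * vh;             /* low  * high */
  uint64_t x2 = uh * vl;             /* high * low  */
  uint64_t x3 = uh * vh;             /* high * high */

  x1 += x0 >> 32;                    /* this cannot carry */
  x1 += x2;                          /* but this one can  */
  if (x1 < x2)                       /* did it?           */
    x3 += (uint64_t) 1 << 32;        /* yes: add the carry at its position */

  *w1 = x3 + (x1 >> 32);
  *w0 = (x1 << 32) + (x0 & 0xffffffffu);
}

int
main (void)
{
  uint64_t u = 0xfedcba9876543210u, v = 0x0123456789abcdefu, hi, lo;
  umul_ppmm_generic (&hi, &lo, u, v);

  /* Self-check against unsigned __int128 (a GCC/clang extension).  */
  unsigned __int128 p = (unsigned __int128) u * v;
  assert (hi == (uint64_t) (p >> 64) && lo == (uint64_t) p);

  printf ("%016" PRIx64 "%016" PRIx64 "\n", hi, lo);
  return 0;
}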
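
__udiv_qrnnd_c is the classic schoolbook two-by-one division: each quotient half-digit is estimated from the high halves and then adjusted downward at most twice, which is why UDIV_NEEDS_NORMALIZATION is set when it is used — the divisor must have its high bit set and n1 < d must hold. Below is a standalone transcription of the same steps at a 32-bit word size, checked against the compiler's 64-bit division; the function name udiv_qrnnd_c and the sample operands are illustrative, not part of the commit.

#include <assert.h>
#include <stdint.h>

#define LL_B        ((uint32_t) 1 << 16)            /* half-word base */
#define LOWPART(t)  ((uint32_t) (t) & (LL_B - 1))
#define HIGHPART(t) ((uint32_t) (t) >> 16)

/* Divide the two-word number n1:n0 by d, requiring n1 < d and a
   normalized d (high bit set), as in __udiv_qrnnd_c above.  */
static void
udiv_qrnnd_c (uint32_t *q, uint32_t *r, uint32_t n1, uint32_t n0, uint32_t d)
{
  uint32_t d1 = HIGHPART (d), d0 = LOWPART (d);
  uint32_t q1, q0, r1, r0, m;

  assert (d != 0 && (d & 0x80000000u) != 0 && n1 < d);

  /* First quotient half-digit: estimate from the high halves, then
     adjust at most twice.  */
  q1 = n1 / d1;
  r1 = n1 - q1 * d1;
  m = q1 * d0;
  r1 = r1 * LL_B | HIGHPART (n0);
  if (r1 < m)
    {
      q1--, r1 += d;
      if (r1 >= d)              /* i.e. no carry when adding to r1 */
        if (r1 < m)
          q1--, r1 += d;
    }
  r1 -= m;

  /* Second quotient half-digit, same scheme on the updated remainder.  */
  q0 = r1 / d1;
  r0 = r1 - q0 * d1;
  m = q0 * d0;
  r0 = r0 * LL_B | LOWPART (n0);
  if (r0 < m)
    {
      q0--, r0 += d;
      if (r0 >= d)
        if (r0 < m)
          q0--, r0 += d;
    }
  r0 -= m;

  *q = q1 * LL_B | q0;
  *r = r0;
}

int
main (void)
{
  uint32_t n1 = 0x1234u, n0 = 0x89abcdefu, d = 0x80001765u;  /* d normalized */
  uint32_t q, r;
  udiv_qrnnd_c (&q, &r, n1, n0, d);

  uint64_t n = ((uint64_t) n1 << 32) | n0;
  assert (q == n / d && r == n % d);
  return 0;
}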
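
The asm-based count_trailing_zeros fallback works because x & -x isolates the lowest set bit of a nonzero x, so its leading-zero count gives the trailing-zero count as W_TYPE_SIZE - 1 - clz; the plain C variant combines the same isolation with __clz_tab on the lowest non-zero byte. A quick check of that identity (illustration only; GCC/clang's __builtin_clzl and __builtin_ctzl stand in for the macros here):

#include <assert.h>
#include <stddef.h>

int
main (void)
{
  const int w = 8 * sizeof (unsigned long);       /* word size in bits */
  unsigned long samples[] = { 1, 2, 0x50, 0x123400, ~0UL, 1UL << (w - 1) };

  for (size_t i = 0; i < sizeof samples / sizeof samples[0]; i++)
    {
      unsigned long x = samples[i];
      unsigned long low_bit = x & -x;             /* isolates the lowest set bit */
      assert (__builtin_ctzl (x) == w - 1 - __builtin_clzl (low_bit));
    }
  return 0;
}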