/* Computes a^{1/k - 1} (mod B^n). Both a and k must be odd.

   Iterates

     r' <-- r - r * (a^{k-1} r^k - 1) / k

   If a^{k-1} r^k = 1 (mod 2^m), then a^{k-1} r'^k = 1 (mod 2^{2m}).

   Compute the update term as

     r' = r - (a^{k-1} r^{k+1} - r) / k

   where we still have cancellation of low limbs.

   rp: result, n limbs (rp and ap may not overlap — NOTE(review): not
       asserted here; confirm against callers).
   ap: input a, n limbs, ap[0] must be odd.
   k:  root exponent, odd and >= 3.
 */
void
mpn_broot_invm1 (mp_ptr rp, mp_srcptr ap, mp_size_t n, mp_limb_t k)
{
  /* Precision doubles each step, so at most ceil(log2(n))+1 sizes are
     recorded; GMP_LIMB_BITS * 2 entries is comfortably enough.  */
  mp_size_t sizes[GMP_LIMB_BITS * 2];
  mp_ptr akm1, tp, rnp, ep;
  mp_limb_t a0, r0, km1, kp1h, kinv;
  mp_size_t rn;
  unsigned i;

  TMP_DECL;

  ASSERT (n > 0);
  ASSERT (ap[0] & 1);
  ASSERT (k & 1);
  ASSERT (k >= 3);

  TMP_MARK;

  /* akm1 gets n limbs for a^{k-1}; the remaining 3n limbs (tp) are
     scratch for mpn_powlo.  */
  akm1 = TMP_ALLOC_LIMBS (4*n);
  tp = akm1 + n;

  km1 = k-1;
  /* FIXME: Could arrange the iteration so we don't need to compute this
     up front, computing a^{k-1} * r^k as (a r)^{k-1} * r.  Note that we
     can use wraparound also for a*r, since the low half is unchanged from
     the previous iteration.  Or possibly mulmid.  Also, a r = a^{1/k}, so
     we get that value too, for free?  */
  mpn_powlo (akm1, ap, &km1, 1, n, tp);	/* 3 n scratch space */

  a0 = ap[0];
  binvert_limb (kinv, k);		/* k^{-1} mod 2^GMP_NUMB_BITS */

  /* 4 bits: a^{1/k - 1} (mod 16):

	a % 8
	1 3 5 7
   k%4 +-------
     1 |1 1 1 1
     3 |1 9 9 1
  */
  r0 = 1 + (((k << 2) & ((a0 << 1) ^ (a0 << 2))) & 8);
  /* Lift the single-limb approximation, doubling the number of correct
     bits each step; masking the exponent is valid since r0 is correct
     mod a power of two of at least that size.  */
  r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k & 0x7f));	/* 8 bits */
  r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k & 0x7fff));	/* 16 bits */
  r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k));		/* 32 bits */
#if GMP_NUMB_BITS > 32
  {
    unsigned prec = 32;
    do
      {
	r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k));
	prec *= 2;
      }
    while (prec < GMP_NUMB_BITS);
  }
#endif

  rp[0] = r0;
  if (n == 1)
    {
      TMP_FREE;
      return;
    }

  /* For odd k, (k+1)/2 = k/2+1, and the latter avoids overflow.  */
  kp1h = k/2 + 1;

  /* FIXME: Special case for two limb iteration.  */
  rnp = TMP_ALLOC_LIMBS (2*n + 1);
  ep = rnp + n;

  /* Record the limb sizes of the precision-doubling schedule, largest
     first, then walk it backwards below.
     FIXME: Possible to do this on the fly with some bit fiddling.  */
  for (i = 0; n > 1; n = (n + 1)/2)
    sizes[i++] = n;

  rn = 1;

  while (i-- > 0)
    {
      /* Compute x^{k+1}: square to rn+? limbs, then raise to (k+1)/2
	 via powlo, truncated to the new precision sizes[i].  */
      mpn_sqr (ep, rp, rn);	/* For odd n, writes n+1 limbs in the
				   final iteration.  */
      mpn_powlo (rnp, ep, &kp1h, 1, sizes[i], tp);

      /* Multiply by a^{k-1}.  Can use wraparound; low part equals r.  */

      mpn_mullo_n (ep, rnp, akm1, sizes[i]);
      ASSERT (mpn_cmp (ep, rp, rn) == 0);

      /* Only the high sizes[i] - rn limbs change; divide them exactly
	 by k (2-adically) and negate to apply the update term.  */
      ASSERT (sizes[i] <= 2*rn);
      mpn_pi1_bdiv_q_1 (rp + rn, ep + rn, sizes[i] - rn, k, kinv, 0);
      mpn_neg (rp + rn, rp + rn, sizes[i] - rn);

      rn = sizes[i];
    }
  TMP_FREE;
}
/* Compute r such that y r^k = 1 (mod B^bn), i.e. r = y^{-1/k}
   (mod B^bn), by 2-adic Newton lifting of a 4-bit seed.  Requires k
   odd and bn > 0; y = {yp, bn} must be odd for the fixed point to
   exist (not asserted here — NOTE(review): confirm callers guarantee
   yp[0] odd).

   tp: scratch area; tp2 = tp + bn and tp3 = tp + 2*bn are carved from
   it, and tp3 is passed to mpn_powlo as scratch (presumably the caller
   provides the usual powlo scratch beyond 2*bn limbs — confirm against
   gmp-impl.h / callers).  */
void
mpn_brootinv (mp_ptr rp, mp_srcptr yp, mp_size_t bn, mp_limb_t k, mp_ptr tp)
{
  mp_ptr tp2, tp3;
  mp_limb_t kinv, k2, r0, y0;
  mp_size_t order[GMP_LIMB_BITS + 1];
  int i, d;

  ASSERT (bn > 0);
  ASSERT ((k & 1) != 0);

  tp2 = tp + bn;
  tp3 = tp + 2 * bn;
  k2 = k + 1;

  binvert_limb (kinv, k);	/* k^{-1} mod 2^GMP_NUMB_BITS */

  /* 4-bit initial approximation:

   y%16 | 1  3  5  7  9 11 13 15,
    k%4 +-------------------------+k2%4
     1  | 1 11 13  7  9  3  5 15  |  2
     3  | 1  3  5  7  9 11 13 15  |  0

  */
  y0 = yp[0];

  r0 = y0 ^ (((y0 << 1) ^ (y0 << 2)) & (k2 << 2) & 8);		/* 4 bits */
  /* Each step doubles the number of correct low bits:
     r' = (  (k+1) r - y r^{k+1}  ) / k.  Masking the exponent is valid
     since r0 is only correct to the masked precision anyway.  */
  r0 = kinv * (k2 * r0 - y0 * powlimb(r0, k2 & 0x7f));		/* 8 bits */
  r0 = kinv * (k2 * r0 - y0 * powlimb(r0, k2 & 0x7fff));	/* 16 bits */
#if GMP_NUMB_BITS > 16
  {
    unsigned prec = 16;
    do
      {
	r0 = kinv * (k2 * r0 - y0 * powlimb(r0, k2));
	prec *= 2;
      }
    while (prec < GMP_NUMB_BITS);
  }
#endif

  rp[0] = r0;
  if (bn == 1)
    return;

  /* This initialization doesn't matter for the result (any garbage is
     cancelled in the iteration), but proper initialization makes
     valgrind happier.  */
  MPN_ZERO (rp+1, bn-1);

  /* Record the precision-doubling schedule (limb counts, largest
     first), then lift from 1 limb up through each size in turn.  */
  d = 0;
  for (; bn > 1; bn = (bn + 1) >> 1)
    order[d++] = bn;

  for (i = d - 1; i >= 0; i--)
    {
      bn = order[i];

      mpn_mul_1 (tp, rp, bn, k2);	/* tp = (k+1) r */

      mpn_powlo (tp2, rp, &k2, 1, bn, tp3);	/* tp2 = r^{k+1} mod B^bn */
      mpn_mullo_n (rp, yp, tp2, bn);		/* rp = y r^{k+1} mod B^bn */

      /* r' = ((k+1) r - y r^{k+1}) / k, the division being exact
	 2-adically (bdiv).  */
      mpn_sub_n (tp2, tp, rp, bn);
      mpn_pi1_bdiv_q_1 (rp, tp2, bn, k, kinv, 0);
    }
}