/* based on gmp's mpz_import.
 * see http://gmplib.org/manual/Integer-Import-and-Export.html
 *
 * Builds the non-negative integer rop from `count` words of `size` bytes
 * each, read from the buffer `op`.
 *
 *   order  == 1 : the word at the lowest address is the most significant,
 *                 any other value: it is the least significant
 *   endian == 1 : within a word the byte at the lowest address is the most
 *                 significant,
 *          == 0 : use the byte order of the host (probed at run time),
 *                 any other value: least significant byte first
 *   nails       : number of most significant bits of every word that are
 *                 skipped (whole bytes are not read at all, the remaining
 *                 `nails % 8` bits are masked off the first byte used)
 *
 * Returns MP_OKAY on success, MP_VAL when the whole-byte part of `nails`
 * is larger than `size`, or the error code of a failing mp_mul_2d().
 */
int mp_import(mp_int* rop, size_t count, int order, size_t size, int endian, size_t nails, const void* op) {
	const unsigned char *src = (const unsigned char *)op;
	size_t odd_nails, nail_bytes, payload_bytes, i, j;
	unsigned char odd_nail_mask;
	int result;

	mp_zero(rop);

	/* endian == 0 means "native": find out what native is */
	if (endian == 0) {
		union {
			unsigned int i;
			char c[4];
		} lint;
		lint.i = 0x01020304;

		endian = (lint.c[0] == 4) ? -1 : 1;
	}

	/* mask that clears the top `nails % 8` bits of the first payload byte */
	odd_nails = (nails % 8);
	odd_nail_mask = 0xff;
	for (i = 0; i < odd_nails; ++i) {
		odd_nail_mask ^= (1 << (7 - i));
	}
	nail_bytes = nails / 8;

	/* size - nail_bytes is unsigned: without this check it would wrap
	 * around and the loop below would read far outside of op */
	if (nail_bytes > size) {
		return MP_VAL;
	}
	payload_bytes = size - nail_bytes;

	for (i = 0; i < count; ++i) {
		/* byte offset of the i-th most significant word */
		size_t word_off = ((order == 1) ? i : ((count - 1) - i)) * size;

		for (j = 0; j < payload_bytes; ++j) {
			/* offset of the j-th most significant payload byte in the word */
			size_t byte_off = (endian == 1) ? (j + nail_bytes) : (((size - 1) - j) - nail_bytes);
			unsigned char byte = src[word_off + byte_off];

			/* make room: the first byte of a word only contributes 8 - odd_nails bits */
			result = mp_mul_2d(rop, (j == 0) ? (int)(8 - odd_nails) : 8, rop);
			if (result != MP_OKAY) {
				return result;
			}

			rop->dp[0] |= (j == 0) ? (byte & odd_nail_mask) : byte;
			/* keep the freshly written digit counted until the final clamp */
			rop->used  += 1;
		}
	}

	mp_clamp(rop);

	return MP_OKAY;
}
/* single digit division (based on routine from MPI)
 *
 * Divides a by the single digit b.  The quotient is stored in c and the
 * remainder in *d; either output may be NULL if it is not wanted.
 * Returns MP_VAL for b == 0, otherwise the result of the helpers used.
 */
int mp_div_d (mp_int * a, mp_digit b, mp_int * c, mp_digit * d)
{
  mp_int  quot;
  mp_word rem;
  int     err, pos;

  if (b == 0) {
    return MP_VAL;
  }

  /* division by three has its own routine */
  if (b == 3) {
    return mp_div_3(a, c, d);
  }

  err = mp_init_size(&quot, a->used);
  if (err != MP_OKAY) {
    return err;
  }

  quot.used = a->used;
  quot.sign = a->sign;

  /* schoolbook long division, most significant digit first */
  rem = 0;
  pos = a->used;
  while (pos-- > 0) {
    mp_digit qdigit = 0;

    rem = (rem << ((mp_word)DIGIT_BIT)) | ((mp_word)a->dp[pos]);
    if (rem >= b) {
      qdigit = (mp_digit)(rem / b);
      rem    = rem % b;
    }
    quot.dp[pos] = (mp_digit)qdigit;
  }

  if (d != NULL) {
    *d = (mp_digit)rem;
  }

  if (c != NULL) {
    mp_clamp(&quot);
    mp_exch(&quot, c);
  }
  mp_clear(&quot);

  return err;
}
/* b = a/2
 *
 * Shifts the magnitude of a right by one bit and stores it, together with
 * the sign of a, in b.  Returns MP_OKAY or the error code of mp_grow().
 */
int mp_div_2 (mp_int * a, mp_int * b)
{
  int      idx, err, prev_used;
  mp_digit carry_in;

  /* make sure b can hold every digit of a */
  if (b->alloc < a->used) {
    err = mp_grow (b, a->used);
    if (err != MP_OKAY) {
      return err;
    }
  }

  prev_used = b->used;
  b->used   = a->used;

  /* walk from the top digit down, handing the bit shifted out of each
   * digit to the top of the digit below it */
  carry_in = 0;
  for (idx = b->used - 1; idx >= 0; idx--) {
    mp_digit cur = a->dp[idx];

    b->dp[idx] = (cur >> 1) | (carry_in << (DIGIT_BIT - 1));
    carry_in   = cur & 1;
  }

  /* wipe digits b was using before that are no longer part of the value */
  for (idx = b->used; idx < prev_used; idx++) {
    b->dp[idx] = 0;
  }

  b->sign = a->sign;
  mp_clamp (b);
  return MP_OKAY;
}
/* b = a/2
 *
 * Shifts the magnitude of a right by one bit and stores it, together with
 * the sign of a, in b.  Returns MP_OKAY or the error code of mp_grow().
 */
int mp_div_2(mp_int * a, mp_int * b)
{
  int       idx, err, prev_used;
  mp_digit  carry_in;
  mp_digit *src, *dst;

  /* make sure b can hold every digit of a */
  if (ALLOC(b) < USED(a)) {
    err = mp_grow (b, USED(a));
    if (err != MP_OKAY) {
      return err;
    }
  }

  prev_used = USED(b);
  SET_USED(b,USED(a));

  /* fetch the digit arrays only after a possible mp_grow() */
  src = DIGITS(a);
  dst = DIGITS(b);

  /* walk from the top digit down, handing the bit shifted out of each
   * digit to the top of the digit below it */
  carry_in = 0;
  for (idx = USED(b) - 1; idx >= 0; idx--) {
    mp_digit cur = src[idx];

    dst[idx] = (cur >> 1) | (carry_in << (DIGIT_BIT - 1));
    carry_in = cur & 1;
  }

  /* wipe digits b was using before that are no longer part of the value */
  for (idx = USED(b); idx < prev_used; idx++) {
    dst[idx] = 0;
  }

  SET_SIGN(b,SIGN(a));
  mp_clamp (b);
  return MP_OKAY;
}
/* Convert the floating point number d into the big integer a.
 *
 * |d| is divided by 2**DIGIT_BIT until it is no larger than three digits,
 * those top three digits are extracted, and the result is shifted back up
 * by the number of digits that were divided away.  The fractional part is
 * dropped by the final digit cast and the sign of d is applied at the end.
 *
 * NaN and +/-Inf have no integer value; a is left at zero for them.  (An
 * infinite input would otherwise never leave the scaling loop, and casting
 * NaN to mp_digit is undefined.)
 *
 * NOTE(review): the return codes of mp_grow / mp_mul_2d / mp_neg are not
 * checked; this function has no way to report a failure to its caller.
 * NOTE(review): the (unsigned long) casts assume unsigned long is wide
 * enough for a full digit -- confirm for targets with a 32-bit long.
 */
static void from_num(MVMnum64 d, mp_int *a) {
    MVMnum64 d_digit = pow(2, DIGIT_BIT);
    MVMnum64 da      = fabs(d);
    MVMnum64 upper;
    MVMnum64 lower;
    MVMnum64 lowest;
    MVMnum64 rest;
    int      digits  = 0;

    mp_zero(a);

    /* nothing sensible to store for NaN / infinity */
    if (!isfinite(d)) {
        return;
    }

    /* scale down until only three digits remain, counting the digits removed */
    while (da > d_digit * d_digit * d_digit) {
        da /= d_digit;
        digits++;
    }
    mp_grow(a, digits + 3);

    /* populate the top 3 digits */
    upper  = da / (d_digit*d_digit);
    rest   = fmod(da, d_digit*d_digit);
    lower  = rest / d_digit;
    lowest = fmod(rest,d_digit );
    if (upper >= 1) {
        mp_set_long(a, (unsigned long) upper);
        mp_mul_2d(a, DIGIT_BIT , a);
        DIGIT(a, 0) = (mp_digit) lower;
        mp_mul_2d(a, DIGIT_BIT , a);
    } else {
        if (lower >= 1) {
            mp_set_long(a, (unsigned long) lower);
            mp_mul_2d(a, DIGIT_BIT , a);
            a->used = 2;
        } else {
            a->used = 1;
        }
    }
    DIGIT(a, 0) = (mp_digit) lowest;

    /* shift the rest */
    mp_mul_2d(a, DIGIT_BIT * digits, a);
    if (d < 0)
        mp_neg(a, a);
    mp_clamp(a);
    mp_shrink(a);
}
/* reads an unsigned char array, assumes the msb is stored first [big endian]
 *
 * a receives the value of the c bytes starting at b.  Returns MP_OKAY or
 * the error code of a failing mp_mul_2d().
 */
int mp_read_unsigned_bin (mp_int * a, unsigned char *b, int c)
{
  int err;

  mp_zero (a);

  for (; c > 0; c--, b++) {
    /* make room for the next byte */
    err = mp_mul_2d (a, 8, a);
    if (err != MP_OKAY) {
      return err;
    }

    if (DIGIT_BIT == 7) {
      /* a byte does not fit into one 7-bit digit: split it over two */
      a->dp[0]  = (*b & MP_MASK);
      a->dp[1] |= ((*b >> 7U) & 1);
      a->used  += 2;
    } else {
      a->dp[0] |= *b;
      a->used  += 1;
    }
  }

  mp_clamp (a);
  return MP_OKAY;
}
/* Taken from mp_set_long, but portably accepts a 64-bit number.
 *
 * Stores b in a, feeding it in four bits at a time starting with the most
 * significant nibble.  Returns MP_OKAY or the error code of mp_mul_2d().
 */
int MVM_bigint_mp_set_uint64(mp_int * a, MVMuint64 b) {
    const int nibbles   = (int)(sizeof(MVMuint64) * 2);
    const int top_shift = (int)(sizeof(MVMuint64) * 8 - 4);
    int remaining, err;

    mp_zero (a);

    for (remaining = nibbles; remaining > 0; remaining--) {
        /* open up four bits at the bottom of a */
        err = mp_mul_2d (a, 4, a);
        if (err != MP_OKAY) {
            return err;
        }

        /* drop the current top nibble of the source into them */
        a->dp[0] |= (b >> top_shift) & 15;

        /* bring the next nibble of the source to the top */
        b <<= 4;

        /* keep the low digit counted so it is not clamped away */
        a->used += 1;
    }

    mp_clamp(a);
    return MP_OKAY;
}
/* c = |a| * |b| using Karatsuba Multiplication using
 * three half size multiplications
 *
 * Let B represent the radix [e.g. 2**DIGIT_BIT] and
 * let n represent half of the number of digits in
 * the min(a,b)
 *
 * a = a1 * B**n + a0
 * b = b1 * B**n + b0
 *
 * Then, a * b =>
 *   a1b1 * B**2n + ((a1 + a0)(b1 + b0) - (a0b0 + a1b1)) * B**n + a0b0
 *
 * Note that a1b1 and a0b0 are used twice and only need to be
 * computed once.  So in total three half size (half # of
 * digit) multiplications are performed, a0b0, a1b1 and
 * (a1+a0)(b1+b0)
 *
 * Note that a multiplication of half the digits requires
 * 1/4th the number of single precision multiplications so in
 * total after one call 25% of the single precision multiplications
 * are saved.  Note also that the call to mp_mul can end up back
 * in this function if the a0, a1, b0, or b1 are above the threshold.
 * This is known as divide-and-conquer and leads to the famous
 * O(N**lg(3)) or O(N**1.584) work which is asymptotically lower than
 * the standard O(N**2) that the baseline/comba methods use.
 * Generally though the overhead of this method doesn't pay off
 * until a certain size (N ~ 80) is reached.
 *
 * In the code below the local variable B holds the split point n (a digit
 * count), x0/x1 are the low/high parts of a and y0/y1 those of b.
 *
 * Returns MP_OKAY on success and MP_MEM if any step fails (the code of the
 * failing helper is not passed on).
 */
int mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
{
  mp_int  x0, x1, y0, y1, t1, x0y0, x1y1;
  int     B, err;

  /* default the return code to an error */
  err = MP_MEM;

  /* min # of digits */
  B = MIN (a->used, b->used);

  /* now divide in two */
  B = B >> 1;

  /* init the temps that receive the split operands; each failure jumps to
   * the label that clears only what has been initialised so far */
  if (mp_init_size (&x0, B) != MP_OKAY)
    goto ERR;
  if (mp_init_size (&x1, a->used - B) != MP_OKAY)
    goto X0;
  if (mp_init_size (&y0, B) != MP_OKAY)
    goto X1;
  if (mp_init_size (&y1, b->used - B) != MP_OKAY)
    goto Y0;

  /* init temps */
  if (mp_init_size (&t1, B * 2) != MP_OKAY)
    goto Y1;
  if (mp_init_size (&x0y0, B * 2) != MP_OKAY)
    goto T1;
  if (mp_init_size (&x1y1, B * 2) != MP_OKAY)
    goto X0Y0;

  /* now shift the digits */
  x0.used = y0.used = B;
  x1.used = a->used - B;
  y1.used = b->used - B;

  {
    register int x;
    register mp_digit *tmpa, *tmpb, *tmpx, *tmpy;

    /* we copy the digits directly instead of using higher level functions
     * since we also need to shift the digits
     */
    tmpa = a->dp;
    tmpb = b->dp;

    /* low B digits of a and b go to x0 and y0 */
    tmpx = x0.dp;
    tmpy = y0.dp;
    for (x = 0; x < B; x++) {
      *tmpx++ = *tmpa++;
      *tmpy++ = *tmpb++;
    }

    /* remaining high digits of a go to x1 */
    tmpx = x1.dp;
    for (x = B; x < a->used; x++) {
      *tmpx++ = *tmpa++;
    }

    /* remaining high digits of b go to y1 */
    tmpy = y1.dp;
    for (x = B; x < b->used; x++) {
      *tmpy++ = *tmpb++;
    }
  }

  /* only need to clamp the lower words since by definition the
   * upper words x1/y1 must have a known number of digits
   */
  mp_clamp (&x0);
  mp_clamp (&y0);

  /* now calc the products x0y0 and x1y1 */
  /* once x1+x0 has been formed below, x0 is no longer needed as an
   * operand and is reused as the scratch value "t2" */
  if (mp_mul (&x0, &y0, &x0y0) != MP_OKAY)
    goto X1Y1;          /* x0y0 = x0*y0 */
  if (mp_mul (&x1, &y1, &x1y1) != MP_OKAY)
    goto X1Y1;          /* x1y1 = x1*y1 */

  /* now calc x1+x0 and y1+y0 */
  if (s_mp_add (&x1, &x0, &t1) != MP_OKAY)
    goto X1Y1;          /* t1 = x1 + x0 */
  if (s_mp_add (&y1, &y0, &x0) != MP_OKAY)
    goto X1Y1;          /* t2 = y1 + y0 */
  if (mp_mul (&t1, &x0, &t1) != MP_OKAY)
    goto X1Y1;          /* t1 = (x1 + x0) * (y1 + y0) */

  /* subtract both outer products from the middle product */
  if (mp_add (&x0y0, &x1y1, &x0) != MP_OKAY)
    goto X1Y1;          /* t2 = x0y0 + x1y1 */
  if (s_mp_sub (&t1, &x0, &t1) != MP_OKAY)
    goto X1Y1;          /* t1 = (x1+x0)*(y1+y0) - (x1y1 + x0y0) */

  /* shift by B digits */
  if (mp_lshd (&t1, B) != MP_OKAY)
    goto X1Y1;          /* t1 = ((x1+x0)*(y1+y0) - (x1y1 + x0y0)) << B digits */
  if (mp_lshd (&x1y1, B * 2) != MP_OKAY)
    goto X1Y1;          /* x1y1 = x1y1 << 2*B digits */

  if (mp_add (&x0y0, &t1, &t1) != MP_OKAY)
    goto X1Y1;          /* t1 = x0y0 + t1 */
  if (mp_add (&t1, &x1y1, c) != MP_OKAY)
    goto X1Y1;          /* c = x0y0 + t1 + x1y1 */

  /* Algorithm succeeded set the return code to MP_OKAY */
  err = MP_OKAY;

  /* cleanup: execution falls through every label below the entry point */
X1Y1:mp_clear (&x1y1);
X0Y0:mp_clear (&x0y0);
T1:mp_clear (&t1);
Y1:mp_clear (&y1);
Y0:mp_clear (&y0);
X1:mp_clear (&x1);
X0:mp_clear (&x0);
ERR:
  return err;
}
/* single digit division (based on routine from MPI)
 *
 * Divides a by the single digit b.  The quotient is stored in c and the
 * remainder in *d; either output may be NULL if it is not wanted.
 * Returns MP_VAL for b == 0, otherwise the result of the helpers used.
 */
int mp_div_d (mp_int * a, mp_digit b, mp_int * c, mp_digit * d)
{
  mp_int  quot;
  mp_word rem;
  int     err, pos;

  /* cannot divide by zero */
  if (b == 0) {
    return MP_VAL;
  }

  /* trivial cases: divisor one or dividend zero, remainder is zero */
  if (b == 1 || mp_iszero(a) == 1) {
    if (d != NULL) {
      *d = 0;
    }
    return (c != NULL) ? mp_copy(a, c) : MP_OKAY;
  }

  /* power of two: mask for the remainder, shift for the quotient */
  if (s_is_power_of_two(b, &pos) == 1) {
    if (d != NULL) {
      *d = a->dp[0] & ((((mp_digit)1)<<pos) - 1);
    }
    return (c != NULL) ? mp_div_2d(a, pos, c, NULL) : MP_OKAY;
  }

#ifdef BN_MP_DIV_3_C
  /* division by three has its own routine */
  if (b == 3) {
    return mp_div_3(a, c, d);
  }
#endif

  /* general case: schoolbook long division */
  err = mp_init_size(&quot, a->used);
  if (err != MP_OKAY) {
    return err;
  }

  quot.used = a->used;
  quot.sign = a->sign;

  rem = 0;
  pos = a->used;
  while (pos-- > 0) {
    mp_digit qdigit = 0;

    rem = (rem << ((mp_word)DIGIT_BIT)) | ((mp_word)a->dp[pos]);
    if (rem >= b) {
      qdigit = (mp_digit)(rem / b);
      rem   -= ((mp_word)qdigit) * ((mp_word)b);
    }
    quot.dp[pos] = (mp_digit)qdigit;
  }

  if (d != NULL) {
    *d = (mp_digit)rem;
  }

  if (c != NULL) {
    mp_clamp(&quot);
    mp_exch(&quot, c);
  }
  mp_clear(&quot);

  return err;
}
/* single digit addition */ int mp_add_d (mp_int * a, mp_digit b, mp_int * c) { int res, ix, oldused; mp_digit *tmpa, *tmpc, mu; /* grow c as required */ if (c->alloc < a->used + 1) { if ((res = mp_grow(c, a->used + 1)) != MP_OKAY) { return res; } } /* if a is negative and |a| >= b, call c = |a| - b */ if (a->sign == MP_NEG && (a->used > 1 || a->dp[0] >= b)) { /* temporarily fix sign of a */ a->sign = MP_ZPOS; /* c = |a| - b */ res = mp_sub_d(a, b, c); /* fix sign */ a->sign = c->sign = MP_NEG; /* clamp */ mp_clamp(c); return res; } /* old number of used digits in c */ oldused = c->used; /* sign always positive */ c->sign = MP_ZPOS; /* source alias */ tmpa = a->dp; /* destination alias */ tmpc = c->dp; /* if a is positive */ if (a->sign == MP_ZPOS) { /* add digit, after this we're propagating * the carry. */ *tmpc = *tmpa++ + b; mu = *tmpc >> DIGIT_BIT; *tmpc++ &= MP_MASK; /* now handle rest of the digits */ for (ix = 1; ix < a->used; ix++) { *tmpc = *tmpa++ + mu; mu = *tmpc >> DIGIT_BIT; *tmpc++ &= MP_MASK; } /* set final carry */ ix++; *tmpc++ = mu; /* setup size */ c->used = a->used + 1; } else {
/* integer signed division.
 * c*b + d == a [e.g. a/b, c=quotient, d=remainder]
 * HAC pp.598 Algorithm 14.20
 *
 * Note that the description in HAC is horribly
 * incomplete.  For example, it doesn't consider
 * the case where digits are removed from 'x' in
 * the inner loop.  It also doesn't consider the
 * case that y has fewer than three digits, etc..
 *
 * The overall algorithm is as described as
 * 14.20 from HAC but fixed to treat these cases.
 *
 * c and/or d may be NULL when the quotient or the remainder is not wanted.
 * Returns MP_VAL when b is zero, MP_OKAY on success, otherwise the error
 * code of the helper that failed.
 *
 * NOTE(review): MPA and MPST are macros defined elsewhere; they appear to
 * thread an extra leading argument through the calls -- confirm.
 */
int mp_div MPA(mp_int * a, mp_int * b, mp_int * c, mp_int * d)
{
  mp_int  q, x, y, t1, t2;
  int     res, n, t, i, norm, neg;

  /* is divisor zero ? */
  if (mp_iszero (b) == 1) {
    return MP_VAL;
  }

  /* if |a| < |b| then q=0, r = a */
  if (mp_cmp_mag (a, b) == MP_LT) {
    if (d != NULL) {
      res = mp_copy (MPST, a, d);
    } else {
      res = MP_OKAY;
    }
    if (c != NULL) {
      mp_zero (c);
    }
    return res;
  }

  /* q is given two digits more than a and all of them are counted as used
   * so that quotient digits can be stored by index below */
  if ((res = mp_init_size (&q, a->used + 2)) != MP_OKAY) {
    return res;
  }
  q.used = a->used + 2;

  if ((res = mp_init (&t1)) != MP_OKAY) {
    goto LBL_Q;
  }

  if ((res = mp_init (&t2)) != MP_OKAY) {
    goto LBL_T1;
  }

  /* x and y are working copies of a and b */
  if ((res = mp_init_copy (MPST, &x, a)) != MP_OKAY) {
    goto LBL_T2;
  }

  if ((res = mp_init_copy (MPST, &y, b)) != MP_OKAY) {
    goto LBL_X;
  }

  /* fix the sign: remember the sign of the quotient, then work on magnitudes */
  neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
  x.sign = y.sign = MP_ZPOS;

  /* normalize both x and y by the same shift so that the top digit of y
   * has its second highest bit (bit DIGIT_BIT-2) set, i.e. is at least a
   * quarter of the radix b == 2**DIGIT_BIT; the shift is undone on the
   * remainder at the end */
  norm = mp_count_bits(&y) % DIGIT_BIT;
  if (norm < (int)(DIGIT_BIT-1)) {
     norm = (DIGIT_BIT-1) - norm;
     if ((res = mp_mul_2d (MPST, &x, norm, &x)) != MP_OKAY) {
       goto LBL_Y;
     }
     if ((res = mp_mul_2d (MPST, &y, norm, &y)) != MP_OKAY) {
       goto LBL_Y;
     }
  } else {
     norm = 0;
  }

  /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
  n = x.used - 1;
  t = y.used - 1;

  /* while (x >= y*b**n-t) do { q[n-t] += 1; x -= y*b**{n-t} } */
  if ((res = mp_lshd (MPST, &y, n - t)) != MP_OKAY) { /* y = y*b**{n-t} */
    goto LBL_Y;
  }

  while (mp_cmp (&x, &y) != MP_LT) {
    ++(q.dp[n - t]);
    if ((res = mp_sub (MPST, &x, &y, &x)) != MP_OKAY) {
      goto LBL_Y;
    }
  }

  /* reset y by shifting it back down */
  mp_rshd (&y, n - t);

  /* step 3. for i from n down to (t + 1) */
  for (i = n; i >= (t + 1); i--) {
    /* x may have lost digits in an earlier round */
    if (i > x.used) {
      continue;
    }

    /* step 3.1 if xi == yt then set q{i-t-1} to b-1,
     * otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */
    if (x.dp[i] == y.dp[t]) {
      q.dp[i - t - 1] = ((((mp_digit)1) << DIGIT_BIT) - 1);
    } else {
      mp_word tmp;
      tmp = ((mp_word) x.dp[i]) << ((mp_word) DIGIT_BIT);
      tmp |= ((mp_word) x.dp[i - 1]);
      tmp /= ((mp_word) y.dp[t]);
      /* the estimate must fit into a single digit */
      if (tmp > (mp_word) MP_MASK)
        tmp = MP_MASK;
      q.dp[i - t - 1] = (mp_digit) (tmp & (mp_word) (MP_MASK));
    }

    /* while (q{i-t-1} * (yt * b + y{t-1})) >
             xi * b**2 + xi-1 * b + xi-2

       do q{i-t-1} -= 1;
    */
    /* the estimate is bumped by one first because the do-loop below
     * decrements it before the first comparison */
    q.dp[i - t - 1] = (q.dp[i - t - 1] + 1) & MP_MASK;
    do {
      q.dp[i - t - 1] = (q.dp[i - t - 1] - 1) & MP_MASK;

      /* find left hand: t1 = q{i-t-1} * (yt * b + y{t-1}) */
      mp_zero (&t1);
      t1.dp[0] = (t - 1 < 0) ? 0 : y.dp[t - 1];
      t1.dp[1] = y.dp[t];
      t1.used = 2;
      if ((res = mp_mul_d (MPST, &t1, q.dp[i - t - 1], &t1)) != MP_OKAY) {
        goto LBL_Y;
      }

      /* find right hand: t2 = xi * b**2 + x{i-1} * b + x{i-2} */
      t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i - 2];
      t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
      t2.dp[2] = x.dp[i];
      t2.used = 3;
    } while (mp_cmp_mag(&t1, &t2) == MP_GT);

    /* step 3.3 x = x - q{i-t-1} * y * b**{i-t-1} */
    if ((res = mp_mul_d (MPST, &y, q.dp[i - t - 1], &t1)) != MP_OKAY) {
      goto LBL_Y;
    }

    if ((res = mp_lshd (MPST, &t1, i - t - 1)) != MP_OKAY) {
      goto LBL_Y;
    }

    if ((res = mp_sub (MPST, &x, &t1, &x)) != MP_OKAY) {
      goto LBL_Y;
    }

    /* if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; } */
    if (x.sign == MP_NEG) {
      if ((res = mp_copy (MPST, &y, &t1)) != MP_OKAY) {
        goto LBL_Y;
      }
      if ((res = mp_lshd (MPST, &t1, i - t - 1)) != MP_OKAY) {
        goto LBL_Y;
      }
      if ((res = mp_add (MPST, &x, &t1, &x)) != MP_OKAY) {
        goto LBL_Y;
      }

      q.dp[i - t - 1] = (q.dp[i - t - 1] - 1UL) & MP_MASK;
    }
  }

  /* now q is the quotient and x is the remainder
   * [which we have to normalize]
   */

  /* get sign before writing to c: a->sign is read here, ahead of the copy
   * into c below (presumably because c may be the same object as a) */
  x.sign = x.used == 0 ? MP_ZPOS : a->sign;

  if (c != NULL) {
    mp_clamp (&q);
    /* NOTE(review): the result of mp_managed_copy is not checked */
    mp_managed_copy (MPST, &q, c);
    c->sign = neg;
  }

  if (d != NULL) {
    /* undo the normalisation shift on the remainder */
    /* NOTE(review): the results of mp_div_2d and mp_managed_copy are not
     * checked, so a failure here still returns MP_OKAY */
    mp_div_2d (MPST, &x, norm, &x, NULL);
    mp_managed_copy (MPST, &x, d);
  }

  res = MP_OKAY;

  /* cleanup: execution falls through every label below the entry point */
LBL_Y:mp_clear (&y);
LBL_X:mp_clear (&x);
LBL_T2:mp_clear (&t2);
LBL_T1:mp_clear (&t1);
LBL_Q:mp_clear (&q);
  return res;
}
/* computes xR**-1 (mod n) in place via Montgomery Reduction
 *
 * This is an optimized implementation of montgomery_reduce
 * which uses the comba method to quickly calculate the columns of the
 * reduction.
 *
 * Based on Algorithm 14.32 on pp.601 of HAC.
 *
 * x   : value to reduce, overwritten with the result
 * n   : the modulus
 * rho : per-modulus constant multiplied into each low word to obtain mu
 *       (NOTE(review): presumably -1/n mod b as in HAC 14.32 -- confirm
 *       against the caller)
 *
 * Returns MP_OKAY, or the error code of mp_grow() / s_mp_sub().
 *
 * NOTE(review): W is indexed up to n->used * 2 + 1 and x->used - 1 without
 * a check against MP_WARRAY in this function; the caller presumably
 * guarantees that both fit -- confirm.
 */
int fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
{
  int     ix, res, olduse;
  mp_word W[MP_WARRAY] = { 0 };

  /* get old used count */
  olduse = x->used;

  /* grow x as required */
  if (x->alloc < n->used + 1) {
    if ((res = mp_grow (x, n->used + 1)) != MP_OKAY) {
      return res;
    }
  }

  /* first we have to get the digits of the input into
   * an array of double precision words W[...]
   */
  {
    register mp_word *_W;
    register mp_digit *tmpx;

    /* alias for the W[] array */
    _W   = W;

    /* alias for the digits of x */
    tmpx = x->dp;

    /* copy the digits of x into W[0..x->used-1] */
    for (ix = 0; ix < x->used; ix++) {
      *_W++ = *tmpx++;
    }

    /* zero the high words of W[x->used..n->used*2] */
    for (; ix < n->used * 2 + 1; ix++) {
      *_W++ = 0;
    }
  }

  /* now we proceed to zero successive digits
   * from the least significant upwards
   */
  for (ix = 0; ix < n->used; ix++) {
    /* mu = xi * rho mod b
     *
     * We avoid a double precision multiplication (which isn't required)
     * by casting the value down to a mp_digit.  Note this requires
     * that W[ix-1] have the carry cleared (see after the inner loop)
     */
    register mp_digit mu;
    mu = (mp_digit) (((W[ix] & MP_MASK) * rho) & MP_MASK);

    /* x = x + mu * n * b**i
     *
     * This is computed in place and on the fly.  The multiplication
     * by b**i is handled by offsetting which columns the results
     * are added to.
     *
     * Note the comba method normally doesn't handle carries in the
     * inner loop.  In this case we fix the carry from the previous
     * column since the Montgomery reduction requires digits of the
     * result (so far) [see above] to work.  This is
     * handled by fixing up one carry after the inner loop.  The
     * carry fixups are done in order so after these loops the
     * first n->used words of W[] have the carries fixed
     */
    {
      register int iy;
      register mp_digit *tmpn;
      register mp_word *_W;

      /* alias for the digits of the modulus */
      tmpn = n->dp;

      /* Alias for the columns set by an offset of ix */
      _W = W + ix;

      /* inner loop */
      for (iy = 0; iy < n->used; iy++) {
          *_W++ += ((mp_word)mu) * ((mp_word)*tmpn++);
      }
    }

    /* now fix carry for next digit, W[ix+1] */
    W[ix + 1] += W[ix] >> ((mp_word) DIGIT_BIT);
  }

  /* now we have to propagate the carries and
   * shift the words downward [all those least
   * significant digits we zeroed].
   */
  {
    register mp_digit *tmpx;
    register mp_word *_W, *_W1;

    /* now fix rest of carries */

    /* alias for current word */
    _W1 = W + ix;

    /* alias for next word, where the carry goes */
    _W = W + ++ix;

    for (; ix <= n->used * 2 + 1; ix++) {
      *_W++ += *_W1++ >> ((mp_word) DIGIT_BIT);
    }

    /* copy out, x = x/b**n->used
     *
     * The result is x/b**n->used but instead of converting from an
     * array of mp_word to mp_digit and then calling mp_rshd
     * we just copy them in the right order
     */

    /* alias for destination word */
    tmpx = x->dp;

    /* alias for shifted double precision result */
    _W = W + n->used;

    for (ix = 0; ix < n->used + 1; ix++) {
      *tmpx++ = (mp_digit)(*_W++ & ((mp_word) MP_MASK));
    }

    /* zero oldused digits, if the input x was larger than
     * n->used+1 we'll have to clear the digits
     */
    for (; ix < olduse; ix++) {
      *tmpx++ = 0;
    }
  }

  /* set the max used and clamp */
  x->used = n->used + 1;
  mp_clamp (x);

  /* if x >= n then x = x - n */
  if (mp_cmp_mag (x, n) != MP_LT) {
    return s_mp_sub (x, n, x);
  }
  return MP_OKAY;
}
/* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16
 *
 * The square is built in a temporary and swapped into b at the end.
 * Returns MP_OKAY or the error code of mp_init_size().
 */
int s_mp_sqr (mp_int * a, mp_int * b)
{
  mp_int   acc;
  int      err, i, j, col, ndigits;
  mp_word  wide;
  mp_digit carry, left;

  ndigits = a->used;
  err = mp_init_size (&acc, 2*ndigits + 1);
  if (err != MP_OKAY) {
    return err;
  }

  /* count every digit as used until the final clamp */
  acc.used = 2*ndigits + 1;

  for (i = 0; i < ndigits; i++) {
    left = a->dp[i];
    col  = 2*i;

    /* the square term a[i]*a[i] lands in column 2*i */
    wide = ((mp_word) acc.dp[col]) + ((mp_word) left) * ((mp_word) left);
    acc.dp[col] = (mp_digit) (wide & ((mp_word) MP_MASK));
    carry = (mp_digit)(wide >> ((mp_word) DIGIT_BIT));
    col++;

    /* each cross term a[i]*a[j], j > i, occurs twice in the square */
    for (j = i + 1; j < ndigits; j++, col++) {
      wide = ((mp_word) left) * ((mp_word) a->dp[j]);

      /* added twice rather than multiplied by two */
      wide = ((mp_word) acc.dp[col]) + wide + wide + ((mp_word) carry);

      acc.dp[col] = (mp_digit) (wide & ((mp_word) MP_MASK));
      carry = (mp_digit)(wide >> ((mp_word) DIGIT_BIT));
    }

    /* push what is left of the carry into the higher columns */
    while (carry != ((mp_digit) 0)) {
      wide = ((mp_word) acc.dp[col]) + ((mp_word) carry);
      acc.dp[col] = (mp_digit) (wide & ((mp_word) MP_MASK));
      carry = (mp_digit)(wide >> ((mp_word) DIGIT_BIT));
      col++;
    }
  }

  mp_clamp (&acc);
  mp_exch (&acc, b);
  mp_clear (&acc);
  return MP_OKAY;
}
/* low level addition, based on HAC pp.594, Algorithm 14.7
 *
 * Adds the digit arrays of a and b and stores the result in c; the signs
 * of the operands are not looked at.  Returns MP_OKAY or the error code of
 * mp_grow().
 */
int s_mp_add (mp_int * a, mp_int * b, mp_int * c)
{
  mp_int  *longer;
  int      prev_used, err, short_len, long_len, i;
  mp_digit carry, sum;

  /* work out which operand has more digits */
  if (a->used > b->used) {
    short_len = b->used;
    long_len  = a->used;
    longer    = a;
  } else {
    short_len = a->used;
    long_len  = b->used;
    longer    = b;
  }

  /* the sum needs at most one digit more than the longer operand */
  if (c->alloc < long_len + 1) {
    err = mp_grow (c, long_len + 1);
    if (err != MP_OKAY) {
      return err;
    }
  }

  prev_used = c->used;
  c->used   = long_len + 1;

  /* digits present in both operands */
  carry = 0;
  for (i = 0; i < short_len; i++) {
    sum      = a->dp[i] + b->dp[i] + carry;
    carry    = sum >> ((mp_digit)DIGIT_BIT);
    c->dp[i] = sum & MP_MASK;
  }

  /* digits present only in the longer operand */
  for (; i < long_len; i++) {
    sum      = longer->dp[i] + carry;
    carry    = sum >> ((mp_digit)DIGIT_BIT);
    c->dp[i] = sum & MP_MASK;
  }

  /* final carry becomes the top digit */
  c->dp[long_len] = carry;

  /* wipe digits c was using before that are no longer part of the value */
  for (i = c->used; i < prev_used; i++) {
    c->dp[i] = 0;
  }

  mp_clamp (c);
  return MP_OKAY;
}
int mp_toom_cook_5_mul(mp_int *a, mp_int *b, mp_int *c) { mp_int w1, w2, w3, w4, w5, w6, w7, w8, w9; mp_int tmp1, tmp2; mp_int a0, a1, a2, a3, a4; mp_int b0, b1, b2, b3, b4; int e = MP_OKAY; int B, count, sign; B = (MAX(a->used, b->used)) / 5; sign = (a->sign != b->sign) ? MP_NEG : MP_ZPOS; if (MIN(a->used, b->used) < TOOM_COOK_5_MUL_CO) { if ((e = mp_mul(a, b, c)) != MP_OKAY) { return e; } c->sign = sign; return MP_OKAY; } if ((e = mp_init_multi(&w1, &w2, &w3, &w4, &w5, &w6, &w7, &w8, &w9, &tmp1, &tmp2, //&a0, &a1, &a2, &a3, &a4, &b0, &b1, &b2, &b3, &b4, NULL)) != MP_OKAY) { goto ERR0; //goto ERR; } if ((e = mp_init_size(&a0, B)) != MP_OKAY) { goto ERRa0; } if ((e = mp_init_size(&a1, B)) != MP_OKAY) { goto ERRa1; } if ((e = mp_init_size(&a2, B)) != MP_OKAY) { goto ERRa2; } if ((e = mp_init_size(&a3, B)) != MP_OKAY) { goto ERRa3; } if ((e = mp_init_size(&a4, B)) != MP_OKAY) { goto ERRa4; } if ((e = mp_init_size(&b0, B)) != MP_OKAY) { goto ERRb0; } if ((e = mp_init_size(&b1, B)) != MP_OKAY) { goto ERRb1; } if ((e = mp_init_size(&b2, B)) != MP_OKAY) { goto ERRb2; } if ((e = mp_init_size(&b3, B)) != MP_OKAY) { goto ERRb3; } if ((e = mp_init_size(&b4, B)) != MP_OKAY) { goto ERRb4; } // A = a4*x^4 + a3*x^3 + a2*x^2 + a1*x + a0 for (count = 0; count < a->used; count++) { switch (count / B) { case 0: a0.dp[count] = a->dp[count]; a0.used++; break; case 1: a1.dp[count - B] = a->dp[count]; a1.used++; break; case 2: a2.dp[count - 2 * B] = a->dp[count]; a2.used++; break; case 3: a3.dp[count - 3 * B] = a->dp[count]; a3.used++; break; case 4: a4.dp[count - 4 * B] = a->dp[count]; a4.used++; break; default: a4.dp[count - 4 * B] = a->dp[count]; a4.used++; break; } } mp_clamp(&a0); mp_clamp(&a1); mp_clamp(&a2); mp_clamp(&a3); mp_clamp(&a4); // B = b4*x^4 + b3*x^3 + b2*x^2 + b1*x + b0 for (count = 0; count < b->used; count++) { switch (count / B) { case 0: b0.dp[count] = b->dp[count]; b0.used++; break; case 1: b1.dp[count - B] = b->dp[count]; b1.used++; break; case 2: b2.dp[count - 2 
* B] = b->dp[count]; b2.used++; break; case 3: b3.dp[count - 3 * B] = b->dp[count]; b3.used++; break; case 4: b4.dp[count - 4 * B] = b->dp[count]; b4.used++; break; default: b4.dp[count - 4 * B] = b->dp[count]; b4.used++; break; } } mp_clamp(&b0); mp_clamp(&b1); mp_clamp(&b2); mp_clamp(&b3); mp_clamp(&b4); /* if ((e = mp_mod_2d(a, DIGIT_BIT * B, &a0)) != MP_OKAY) { goto ERR; } if ((e = mp_copy(a, &a1)) != MP_OKAY) { goto ERR; } mp_rshd(&a1, B); mp_mod_2d(&a1, DIGIT_BIT * B, &a1); if ((e = mp_copy(a, &a2)) != MP_OKAY) { goto ERR; } mp_rshd(&a2, B * 2); mp_mod_2d(&a2, DIGIT_BIT * B, &a2); if ((e = mp_copy(a, &a3)) != MP_OKAY) { goto ERR; } mp_rshd(&a3, B * 3); mp_mod_2d(&a3, DIGIT_BIT * B, &a3); if ((e = mp_copy(a, &a4)) != MP_OKAY) { goto ERR; } mp_rshd(&a4, B * 4); if ((e = mp_mod_2d(b, DIGIT_BIT * B, &b0)) != MP_OKAY) { goto ERR; } if ((e = mp_copy(a, &b1)) != MP_OKAY) { goto ERR; } mp_rshd(&b1, B); mp_mod_2d(&b1, DIGIT_BIT * B, &b1); if ((e = mp_copy(b, &b2)) != MP_OKAY) { goto ERR; } mp_rshd(&b2, B * 2); mp_mod_2d(&b2, DIGIT_BIT * B, &b2); if ((e = mp_copy(b, &b3)) != MP_OKAY) { goto ERR; } mp_rshd(&b3, B * 3); mp_mod_2d(&b3, DIGIT_BIT * B, &b3); if ((e = mp_copy(b, &b4)) != MP_OKAY) { goto ERR; } mp_rshd(&b4, B * 4); */ // S1 = a4*b4 if ((e = mp_mul(&a4, &b4, &w1)) != MP_OKAY) { goto ERR; } // S9 = a0*b0 if ((e = mp_mul(&a0, &b0, &w9)) != MP_OKAY) { goto ERR; } // S2 = (a0- 2*a1 +4*a2 -8*a3 +16*a4) if ((e = mp_mul_2d(&a1, 1, &tmp1)) != MP_OKAY) { goto ERR; } // 2*a1 = tmp1 if ((e = mp_sub(&a0, &tmp1, &w2)) != MP_OKAY) { goto ERR; } // a0- 2*a1 = a0 - tmp1 = w2 if ((e = mp_mul_2d(&a2, 2, &tmp1)) != MP_OKAY) { goto ERR; } // 4*a2 = tmp1 if ((e = mp_add(&w2, &tmp1, &w2)) != MP_OKAY) { goto ERR; } // a0- 2*a1 +4*a2 = w2 + tmp1 = w2 if ((e = mp_mul_2d(&a3, 3, &tmp1)) != MP_OKAY) { goto ERR; } // 8*a3 = tmp1 if ((e = mp_sub(&w2, &tmp1, &w2)) != MP_OKAY) { goto ERR; } // a0- 2*a1 +4*a2 -8*a3 = w2 - tmp1 = w2 if ((e = mp_mul_2d(&a4, 4, &tmp1)) != MP_OKAY) { goto ERR; } 
// 16*a4 = tmp1 if ((e = mp_add(&w2, &tmp1, &w2)) != MP_OKAY) { goto ERR; } // a0- 2*a1 +4*a2 -8*a3 +16*a4 = w2 + tmp1 = w2 // * (b0- 2*b1 +4*b2 -8*b3 +16*b4) if ((e = mp_mul_2d(&b1, 1, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_sub(&b0, &tmp1, &tmp2)) != MP_OKAY) { goto ERR; } if ((e = mp_mul_2d(&b2, 2, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&tmp2, &tmp1, &tmp2)) != MP_OKAY) { goto ERR; } if ((e = mp_mul_2d(&b3, 3, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_sub(&tmp2, &tmp1, &tmp2)) != MP_OKAY) { goto ERR; } if ((e = mp_mul_2d(&b4, 4, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&tmp2, &tmp1, &tmp2)) != MP_OKAY) { goto ERR; } if ((e = mp_mul(&tmp2, &w2, &w2)) != MP_OKAY) { goto ERR; } // S5 = (a0+ 2*a1+ 4*a2+ 8*a3+ 16*a4) if ((e = mp_mul_2d(&a1, 1, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&a0, &tmp1, &w5)) != MP_OKAY) { goto ERR; } if ((e = mp_mul_2d(&a2, 2, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&w5, &tmp1, &w5)) != MP_OKAY) { goto ERR; } if ((e = mp_mul_2d(&a3, 3, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&w5, &tmp1, &w5)) != MP_OKAY) { goto ERR; } if ((e = mp_mul_2d(&a4, 4, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&w5, &tmp1, &w5)) != MP_OKAY) { goto ERR; } // *(b0+ 2*b1+ 4*b2+ 8*b3+ 16*b4) if ((e = mp_mul_2d(&b1, 1, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&b0, &tmp1, &tmp2)) != MP_OKAY) { goto ERR; } if ((e = mp_mul_2d(&b2, 2, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&tmp2, &tmp1, &tmp2)) != MP_OKAY) { goto ERR; } if ((e = mp_mul_2d(&b3, 3, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&tmp2, &tmp1, &tmp2)) != MP_OKAY) { goto ERR; } if ((e = mp_mul_2d(&b4, 4, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&tmp2, &tmp1, &tmp2)) != MP_OKAY) { goto ERR; } if ((e = mp_mul(&tmp2, &w5, &w5)) != MP_OKAY) { goto ERR; } // S3 = (a4+ 2*a3+ 4*a2+ 8*a1+ 16*a0) if ((e = mp_mul_2d(&a3, 1, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&a4, &tmp1, &w3)) != MP_OKAY) { goto ERR; } if ((e = 
mp_mul_2d(&a2, 2, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&w3, &tmp1, &w3)) != MP_OKAY) { goto ERR; } if ((e = mp_mul_2d(&a1, 3, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&w3, &tmp1, &w3)) != MP_OKAY) { goto ERR; } if ((e = mp_mul_2d(&a0, 4, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&w3, &tmp1, &w3)) != MP_OKAY) { goto ERR; } // * (b4+ 2*b3+ 4*b2+ 8*b1+ 16*b0) if ((e = mp_mul_2d(&b3, 1, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&b4, &tmp1, &tmp2)) != MP_OKAY) { goto ERR; } if ((e = mp_mul_2d(&b2, 2, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&tmp2, &tmp1, &tmp2)) != MP_OKAY) { goto ERR; } if ((e = mp_mul_2d(&b1, 3, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&tmp2, &tmp1, &tmp2)) != MP_OKAY) { goto ERR; } if ((e = mp_mul_2d(&b0, 4, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&tmp2, &tmp1, &tmp2)) != MP_OKAY) { goto ERR; } if ((e = mp_mul(&tmp2, &w3, &w3)) != MP_OKAY) { goto ERR; } // S8 = (a4- 2*a3+ 4*a2- 8*a1+ 16*a0) if ((e = mp_mul_2d(&a3, 1, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_sub(&a4, &tmp1, &w8)) != MP_OKAY) { goto ERR; } if ((e = mp_mul_2d(&a2, 2, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&w8, &tmp1, &w8)) != MP_OKAY) { goto ERR; } if ((e = mp_mul_2d(&a1, 3, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_sub(&w8, &tmp1, &w8)) != MP_OKAY) { goto ERR; } if ((e = mp_mul_2d(&a0, 4, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&w8, &tmp1, &w8)) != MP_OKAY) { goto ERR; } //* (b4- 2*b3+ 4*b2- 8*b1+ 16*b0) if ((e = mp_mul_2d(&b3, 1, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_sub(&b4, &tmp1, &tmp2)) != MP_OKAY) { goto ERR; } if ((e = mp_mul_2d(&b2, 2, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&tmp2, &tmp1, &tmp2)) != MP_OKAY) { goto ERR; } if ((e = mp_mul_2d(&b1, 3, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_sub(&tmp2, &tmp1, &tmp2)) != MP_OKAY) { goto ERR; } if ((e = mp_mul_2d(&b0, 4, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&tmp2, &tmp1, &tmp2)) != MP_OKAY) { goto ERR; 
} if ((e = mp_mul(&tmp2, &w8, &w8)) != MP_OKAY) { goto ERR; } // S4 = (a0+ 4*a1+ 16*a2+ 64*a3+ 256*a4) if ((e = mp_mul_2d(&a1, 2, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&a0, &tmp1, &w4)) != MP_OKAY) { goto ERR; } if ((e = mp_mul_2d(&a2, 4, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&w4, &tmp1, &w4)) != MP_OKAY) { goto ERR; } if ((e = mp_mul_2d(&a3, 6, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&w4, &tmp1, &w4)) != MP_OKAY) { goto ERR; } if ((e = mp_mul_2d(&a4, 8, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&w4, &tmp1, &w4)) != MP_OKAY) { goto ERR; } //* (b0+ 4*b1+ 16*b2+ 64*b3+ 256*b4) if ((e = mp_mul_2d(&b1, 2, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&b0, &tmp1, &tmp2)) != MP_OKAY) { goto ERR; } if ((e = mp_mul_2d(&b2, 4, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&tmp2, &tmp1, &tmp2)) != MP_OKAY) { goto ERR; } if ((e = mp_mul_2d(&b3, 6, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&tmp2, &tmp1, &tmp2)) != MP_OKAY) { goto ERR; } if ((e = mp_mul_2d(&b4, 8, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&tmp2, &tmp1, &tmp2)) != MP_OKAY) { goto ERR; } if ((e = mp_mul(&tmp2, &w4, &w4)) != MP_OKAY) { goto ERR; } // S6 = (a0- a1+ a2- a3 +a4) if ((e = mp_sub(&a0, &a1, &w6)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&w6, &a2, &w6)) != MP_OKAY) { goto ERR; } if ((e = mp_sub(&w6, &a3, &w6)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&w6, &a4, &w6)) != MP_OKAY) { goto ERR; } // * (b0- b1+ b2- b3+ b4) if ((e = mp_sub(&b0, &b1, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&tmp1, &b2, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_sub(&tmp1, &b3, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&tmp1, &b4, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_mul(&tmp1, &w6, &w6)) != MP_OKAY) { goto ERR; } // S7 = (a0+ a1+ a2+ a3+ a4) if ((e = mp_add(&a0, &a1, &w7)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&w7, &a2, &w7)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&w7, &a3, &w7)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&w7, 
&a4, &w7)) != MP_OKAY) { goto ERR; } // * (b0+ b1+ b2+ b3+ b4) if ((e = mp_add(&b0, &b1, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&tmp1, &b2, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&tmp1, &b3, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&tmp1, &b4, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_mul(&tmp1, &w7, &w7)) != MP_OKAY) { goto ERR; } // S6 -= S7 if ((e = mp_sub(&w6, &w7, &w6)) != MP_OKAY) { goto ERR; } // S2 -= S5 if ((e = mp_sub(&w2, &w5, &w2)) != MP_OKAY) { goto ERR; } // S4 -= S9 if ((e = mp_sub(&w4, &w9, &w4)) != MP_OKAY) { goto ERR; } // S4 -= (2^16*S1) if ((e = mp_mul_2d(&w1, 16, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_sub(&w4, &tmp1, &w4)) != MP_OKAY) { goto ERR; } // S8 -= S3 if ((e = mp_sub(&w8, &w3, &w8)) != MP_OKAY) { goto ERR; } // S6 /= 2 if ((e = mp_div_2d(&w6, 1, &w6, NULL)) != MP_OKAY) { goto ERR; } // S5 *= 2 if ((e = mp_mul_2d(&w5, 1, &w5)) != MP_OKAY) { goto ERR; } // S5 += S2 if ((e = mp_add(&w5, &w2, &w5)) != MP_OKAY) { goto ERR; } // S2 = -S2 if ((e = mp_neg(&w2, &w2)) != MP_OKAY) { goto ERR; } // S8 = -S8 if ((e = mp_neg(&w8, &w8)) != MP_OKAY) { goto ERR; } // S7 += S6 if ((e = mp_add(&w7, &w6, &w7)) != MP_OKAY) { goto ERR; } // S6 = -S6 if ((e = mp_neg(&w6, &w6)) != MP_OKAY) { goto ERR; } // S3 -= S7 if ((e = mp_sub(&w3, &w7, &w3)) != MP_OKAY) { goto ERR; } // S5 -= (512*S7) if ((e = mp_mul_2d(&w7, 9, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_sub(&w5, &tmp1, &w5)) != MP_OKAY) { goto ERR; } // S3 *= 2 if ((e = mp_mul_2d(&w3, 1, &w3)) != MP_OKAY) { goto ERR; } // S3 -= S8 if ((e = mp_sub(&w3, &w8, &w3)) != MP_OKAY) { goto ERR; } // S7 -= S1 if ((e = mp_sub(&w7, &w1, &w7)) != MP_OKAY) { goto ERR; } // S7 -= S9 if ((e = mp_sub(&w7, &w9, &w7)) != MP_OKAY) { goto ERR; } // S8 += S2 if ((e = mp_add(&w8, &w2, &w8)) != MP_OKAY) { goto ERR; } // S5 += S3 if ((e = mp_add(&w5, &w3, &w5)) != MP_OKAY) { goto ERR; } // S8 -= (80*S6) if ((e = mp_mul_d(&w6, 80, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_sub(&w8, 
&tmp1, &w8)) != MP_OKAY) { goto ERR; } // S3 -= (510*S9) if ((e = mp_mul_d(&w9, 510, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_sub(&w3, &tmp1, &w3)) != MP_OKAY) { goto ERR; } // S4 -= S2 if ((e = mp_sub(&w4, &w2, &w4)) != MP_OKAY) { goto ERR; } // S3 *= 3 if ((e = mp_mul_d(&w3, 3, &w3)) != MP_OKAY) { goto ERR; } // S3 += S5 if ((e = mp_add(&w3, &w5, &w3)) != MP_OKAY) { goto ERR; } // S8 /= 180 \\ division by 180 if ((e = mp_div_d(&w8, 180, &w8, NULL)) != MP_OKAY) { goto ERR; } // S5 += (378*S7) if ((e = mp_mul_d(&w7, 378, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&w5, &tmp1, &w5)) != MP_OKAY) { goto ERR; } // S2 /= 4 if ((e = mp_div_2d(&w2, 2, &w2, NULL)) != MP_OKAY) { goto ERR; } // S6 -= S2 if ((e = mp_sub(&w6, &w2, &w6)) != MP_OKAY) { goto ERR; } // S5 /= (-72) \\ division by -72 if ((e = mp_div_d(&w5, 72, &w5, NULL)) != MP_OKAY) { goto ERR; } if (&w5.sign == MP_ZPOS) (&w5)->sign = MP_NEG; (&w5)->sign = MP_ZPOS; // S3 /= (-360) \\ division by -360 if ((e = mp_div_d(&w3, 360, &w3, NULL)) != MP_OKAY) { goto ERR; } if (&w3.sign == MP_ZPOS) (&w3)->sign = MP_NEG; (&w3)->sign = MP_ZPOS; // S2 -= S8 if ((e = mp_sub(&w2, &w8, &w2)) != MP_OKAY) { goto ERR; } // S7 -= S3 if ((e = mp_sub(&w7, &w3, &w7)) != MP_OKAY) { goto ERR; } // S4 -= (256*S5) if ((e = mp_mul_2d(&w5, 8, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_sub(&w4, &tmp1, &w4)) != MP_OKAY) { goto ERR; } // S3 -= S5 if ((e = mp_sub(&w3, &w5, &w3)) != MP_OKAY) { goto ERR; } // S4 -= (4096*S3) if ((e = mp_mul_2d(&w3, 12, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_sub(&w4, &tmp1, &w4)) != MP_OKAY) { goto ERR; } // S4 -= (16*S7) if ((e = mp_mul_2d(&w7, 4, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_sub(&w4, &tmp1, &w4)) != MP_OKAY) { goto ERR; } // S4 += (256*S6) if ((e = mp_mul_2d(&w6, 8, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&w4, &tmp1, &w4)) != MP_OKAY) { goto ERR; } // S6 += S2 if ((e = mp_add(&w6, &w2, &w6)) != MP_OKAY) { goto ERR; } // S2 *= 180 if ((e = mp_mul_d(&w2, 180, &w2)) != 
MP_OKAY) { goto ERR; } // S2 += S4 if ((e = mp_add(&w2, &w4, &w2)) != MP_OKAY) { goto ERR; } // S2 /= 11340 \\ division by 11340 if ((e = mp_div_d(&w2, 11340, &w2, NULL)) != MP_OKAY) { goto ERR; } // S4 += (720*S6) if ((e = mp_mul_d(&w6, 720, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&w4, &tmp1, &w4)) != MP_OKAY) { goto ERR; } // S4 /= (-2160) \\ division by -2160 if ((e = mp_div_d(&w4, 2160, &w4, NULL)) != MP_OKAY) { goto ERR; } if (&w4.sign == MP_ZPOS) (&w4)->sign = MP_NEG; (&w4)->sign = MP_ZPOS; // S6 -= S4 if ((e = mp_sub(&w6, &w4, &w6)) != MP_OKAY) { goto ERR; } // S8 -= S2 if ((e = mp_sub(&w8, &w2, &w8)) != MP_OKAY) { goto ERR; } // P = S1*x^8 + S2*x^7 + S3*x^6 + S4*x^5 + S5*x^4 + S6*x^3 + S7*x^2 + S8*x + S9 if ((e = mp_copy(&w9, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_lshd(&w8, B)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&tmp1, &w8, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_lshd(&w7, 2 * B)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&tmp1, &w7, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_lshd(&w6, 3 * B)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&tmp1, &w6, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_lshd(&w5, 4 * B)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&tmp1, &w5, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_lshd(&w4, 5 * B)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&tmp1, &w4, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_lshd(&w3, 6 * B)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&tmp1, &w3, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_lshd(&w2, 7 * B)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&tmp1, &w2, &tmp1)) != MP_OKAY) { goto ERR; } if ((e = mp_lshd(&w1, 8 * B)) != MP_OKAY) { goto ERR; } if ((e = mp_add(&tmp1, &w1, c)) != MP_OKAY) { goto ERR; } // P - A*B \\ == zero c->sign = sign; ERR: ERRb4: mp_clear(&b4); ERRb3: mp_clear(&b3); ERRb2: mp_clear(&b2); ERRb1: mp_clear(&b1); ERRb0: mp_clear(&b0); ERRa4: mp_clear(&a4); ERRa3: mp_clear(&a3); ERRa2: mp_clear(&a2); ERRa1: mp_clear(&a1); ERRa0: mp_clear(&a0); ERR0: 
mp_clear_multi(&w1, &w2, &w3, &w4, &w5, &w6, &w7, &w8, &w9, &tmp1, &tmp2, // &a0, &a1, &a2, &a3, &a4, &b0, &b1, &b2, &b3, &b4, NULL); return e; }
/* Comba-style multiplier that only produces the upper columns of a * b.
 *
 * Columns digs .. (a->used + b->used - 1) of the product are accumulated
 * and written to c->dp at the same positions; c->used becomes
 * a->used + b->used before the final mp_clamp().  The running carry
 * starts at zero at column `digs`, so whatever the columns below `digs`
 * would have carried upwards is not included, and c->dp[0 .. digs-1] is
 * not written by this routine.
 *
 * The original notes say this is used by Barrett reduction, where one of
 * the multiplications only needs the high digits, and that it follows
 * Algorithm 14.12 on pp.595 of HAC (see fast_s_mul_digs for the idea).
 *
 * The column buffer is a local array of MP_WARRAY digits and is indexed
 * by the absolute column number.
 * NOTE(review): nothing here checks a->used + b->used against MP_WARRAY;
 * presumably the caller guarantees it -- confirm.
 *
 * Returns MP_OKAY, or the error reported by mp_grow().
 */
int fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
{
  mp_digit cols[MP_WARRAY];
  mp_word  acc;
  int      total, prev_used, err, col, k;

  /* make sure the destination can hold every product digit */
  total = a->used + b->used;
  if (c->alloc < total) {
    err = mp_grow (c, total);
    if (err != MP_OKAY) {
      return err;
    }
  }

  /* accumulate one column at a time, carrying the excess forward */
  acc = 0;
  for (col = digs; col < total; col++) {
    /* highest usable index into b for this column, and its partner in a */
    int bi = MIN(b->used - 1, col);
    int ai = col - bi;
    /* number of digit pairs with ai + k inside a and bi - k >= 0 */
    int terms = MIN(a->used - ai, bi + 1);

    for (k = 0; k < terms; k++) {
      acc += ((mp_word)a->dp[ai + k]) * ((mp_word)b->dp[bi - k]);
    }

    /* low part is the digit for this column ... */
    cols[col] = ((mp_digit)acc) & MP_MASK;

    /* ... and the remainder is the carry into the next one */
    acc = acc >> ((mp_word)DIGIT_BIT);
  }

  /* publish the digits into the destination */
  prev_used = c->used;
  c->used = total;

  for (col = digs; col < total; col++) {
    c->dp[col] = cols[col];
  }

  /* wipe digits left over from the previous value of c */
  for (; col < prev_used; col++) {
    c->dp[col] = 0;
  }

  mp_clamp (c);
  return MP_OKAY;
}
/* computes xR**-1 == x (mod N) via Montgomery Reduction
 *
 * x   - value to reduce; it is overwritten with the result
 * n   - the modulus
 * rho - per the original notes this must be precalculated via
 *       montgomery_setup() so that it equals -1/n0 mod b
 *
 * When the working size (2 * n->used + 1 digits) is below MP_WARRAY and
 * n->used is below the bound computed from the mp_word / DIGIT_BIT sizes,
 * the call is handed to fast_mp_montgomery_reduce() instead.  The original
 * comment notes that fewer columns than usual are acceptable here because
 * carries are fixed up inside the inner loop.
 *
 * Returns MP_OKAY, or the error from mp_grow() / s_mp_sub().
 */
int mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
{
  int       i, j, err, need;
  mp_digit  mu, carry;
  mp_digit *xd, *nd;
  mp_word   t;

  /* digits the intermediate value may occupy */
  need = n->used * 2 + 1;

  /* small enough for the comba variant? */
  if ((need < MP_WARRAY) &&
      n->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
    return fast_mp_montgomery_reduce (x, n, rho);
  }

  /* make room in x and treat it as a `need`-digit number */
  if (x->alloc < need) {
    err = mp_grow (x, need);
    if (err != MP_OKAY) {
      return err;
    }
  }
  x->used = need;

  for (i = 0; i < n->used; i++) {
    /* mu = x[i] * rho mod b; adding mu * n * b**i is meant to cancel
     * digit i of x
     */
    mu = (mp_digit) (((mp_word)x->dp[i]) * ((mp_word)rho) & MP_MASK);

    nd    = n->dp;        /* digits of the modulus           */
    xd    = x->dp + i;    /* window of x starting at digit i */
    carry = 0;

    /* x += mu * n * b**i, one digit at a time */
    for (j = 0; j < n->used; j++) {
      t = ((mp_word)mu) * ((mp_word)nd[j]) +
          ((mp_word) carry) + ((mp_word) xd[j]);

      /* upper part is carried into the next digit */
      carry = (mp_digit)(t >> ((mp_word) DIGIT_BIT));

      /* lower part is the new digit */
      xd[j] = (mp_digit)(t & ((mp_word) MP_MASK));
    }

    /* digit i of x should now be zero; ripple what is left of the carry
     * into the digits above the window
     */
    xd += j;
    while (carry != 0) {
      *xd   += carry;
      carry  = *xd >> DIGIT_BIT;
      *xd   &= MP_MASK;
      ++xd;
    }
  }

  /* The n->used least significant digits of x are now all zero, so x can
   * be shifted right by n->used digits without changing the residue:
   * x = x / b**n->used
   */
  mp_clamp (x);
  mp_rshd (x, n->used);

  /* final conditional subtraction: if x >= n then x = x - n */
  if (mp_cmp_mag (x, n) != MP_LT) {
    return s_mp_sub (x, n, x);
  }

  return MP_OKAY;
}
/* single digit subtraction: c = a - b
 *
 * a - the multi-digit operand (its sign field is flipped temporarily and
 *     then set back to MP_NEG when a is negative)
 * b - the single digit to subtract
 * c - destination; grown to a->used + 1 digits when it is smaller
 *
 * Three cases are handled:
 *   - a negative: computed as -(|a| + b) through mp_add_d()
 *   - a has at most one digit and that digit is <= b: the result is the
 *     single digit (b - a) stored with sign MP_NEG
 *   - otherwise: b is subtracted from the lowest digit and the borrow is
 *     rippled through the remaining digits of a
 *
 * Returns MP_OKAY, or the error from mp_grow() / mp_add_d().
 */
int mp_sub_d (mp_int * a, mp_digit b, mp_int * c)
{
  mp_digit *src, *dst, borrow;
  int       err, i, prev_used;

  /* make sure the destination is large enough */
  if (c->alloc < a->used + 1) {
    err = mp_grow(c, a->used + 1);
    if (err != MP_OKAY) {
      return err;
    }
  }

  /* negative a: add the magnitudes and force the result negative */
  if (a->sign == MP_NEG) {
    a->sign = MP_ZPOS;
    err     = mp_add_d(a, b, c);
    a->sign = c->sign = MP_NEG;

    mp_clamp(c);
    return err;
  }

  prev_used = c->used;
  src       = a->dp;
  dst       = c->dp;

  if (a->used == 0 || (a->used == 1 && a->dp[0] <= b)) {
    /* |a| <= b: the answer fits in one digit and carries a minus sign */
    *dst++ = (a->used == 1) ? (b - *src) : b;
    i = 1;

    c->sign = MP_NEG;
    c->used = 1;
  } else {
    /* a > b: non-negative result with as many digits as a */
    c->sign = MP_ZPOS;
    c->used = a->used;

    /* The lowest digit loses b, every later digit loses the borrow of the
     * digit before it.  The borrow is read from the top bit of the raw
     * mp_digit before the digit is masked back into range.
     */
    borrow = b;
    i = 0;
    do {
      *dst   = *src++ - borrow;
      borrow = *dst >> (sizeof(mp_digit) * CHAR_BIT - 1);
      *dst++ &= MP_MASK;
      i++;
    } while (i < a->used);
  }

  /* zero the digits that belonged to the previous value of c */
  for (; i < prev_used; i++) {
    *dst++ = 0;
  }

  mp_clamp(c);
  return MP_OKAY;
}
/* Fast (comba) multiplier
 *
 * Column-array [comba] multiplier: every digit product is first added
 * into the column of W it belongs to (column ix + iy) and the carries are
 * resolved afterwards in a single pass.  Keeping carry handling out of
 * the nested loops makes them simple and independent of one another.
 *
 * Only the lowest `digs` digits of a * b are produced, so a caller that
 * needs a half product (the original notes mention fast Barrett
 * reduction) does not pay for the upper half.
 *
 * a, b - the factors
 * c    - destination; grown to `digs` digits when smaller, c->used is set
 *        to `digs` and mp_clamp() is applied
 * digs - number of low-order output digits; a value <= 0 yields zero
 *
 * The columns live in the external scratch buffer W.
 * NOTE(review): W is only declared here (extern mp_word *W); this routine
 * assumes it holds at least `digs` words -- confirm against its
 * definition and the callers' size checks.
 *
 * Returns MP_OKAY, or the error reported by mp_grow().
 *
 * Based on Algorithm 14.12 on pp.595 of HAC.
 */
int fast_s_mp_mul_digs(mp_int * a, mp_int * b, mp_int * c, int digs)
{
  int olduse, res, pa, ix;
  extern mp_word *W;

  /* Nothing was requested: the result is zero.  Returning here keeps the
   * code below from reading W[digs - 1] with digs == 0 and from handing
   * memset() a negative count converted to an unsigned size.
   */
  if (digs <= 0) {
    for (ix = 0; ix < c->used; ix++) {
      c->dp[ix] = 0;
    }
    c->used = 0;
    mp_clamp(c);
    return MP_OKAY;
  }

  /* grow the destination as required */
  if (c->alloc < digs) {
    if ((res = mp_grow(c, digs)) != MP_OKAY) {
      return res;
    }
  }

  /* clear temp buf (the columns) */
  memset(W, 0, sizeof(mp_word) * digs);

  /* Calculate the columns.  Rows at or above `digs` cannot contribute to
   * the requested digits, so they are not visited; this also avoids
   * forming a pointer past the W + digs region.
   */
  pa = MIN(a->used, digs);
  for (ix = 0; ix < pa; ix++) {
    mp_digit  tmpx, *tmpy;
    mp_word  *col;
    int       iy, pb;

    /* left-hand digit for this row, e.g. A[ix] in A[ix] * B[iy] */
    tmpx = a->dp[ix];

    /* right-hand digits */
    tmpy = b->dp;

    /* first column this row touches */
    col = W + ix;

    /* skip products that would land at or above column `digs` */
    pb = MIN(b->used, digs - ix);

    /* each step adds one term to a distinct column and does not depend
     * on the previous step
     */
    for (iy = 0; iy < pb; iy++) {
      *col++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
    }
  }

  /* setup dest */
  olduse  = c->used;
  c->used = digs;

  {
    mp_digit *tmpc;

    /* W[] now holds the raw column sums.  Push the bits above DIGIT_BIT
     * of each column into the next one and keep the masked remainder as
     * the output digit.  The carries are fed from behind, so when the
     * loop ends the last column still has to be copied.
     */
    tmpc = c->dp;
    for (ix = 1; ix < digs; ix++) {
      /* forward the carry from the previous column */
      W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT));

      /* the previous column is final now: store its digit */
      *tmpc++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
    }

    /* fetch the last digit (digs >= 1 is guaranteed at this point) */
    *tmpc++ = (mp_digit) (W[digs - 1] & ((mp_word) MP_MASK));

    /* clear unused digits [that existed in the old copy of c] */
    for (; ix < olduse; ix++) {
      *tmpc++ = 0;
    }
  }

  mp_clamp(c);
  return MP_OKAY;
}
/* Karatsuba squaring, computes b = a*a using three half size squarings.
 *
 * With B = USED(a) / 2 the input is split as a = x1 * base**B + x0 and
 * the square is assembled as
 *
 *   a*a = x1*x1 * base**(2B)
 *       + ((x1 + x0)**2 - (x0*x0 + x1*x1)) * base**B
 *       + x0*x0
 *
 * See the comments of karatsuba_mul for details; this is the same idea
 * tuned for recursive squarings.
 *
 * Returns MP_OKAY on success.  Any failing step makes the function return
 * MP_MEM after releasing the temporaries that were already initialised.
 */
int mp_karatsuba_sqr (mp_int * a, mp_int * b)
{
  mp_int    lo, hi, mid, sum, lo_sq, hi_sq;
  mp_digit *from, *to;
  int       half, status, i;

  /* pessimistic default, replaced only when every step succeeded */
  status = MP_MEM;

  /* split point: half of the digit count, rounded down */
  half = USED(a) >> 1;

  /* the two halves of a */
  if (mp_init_size (&lo, half) != MP_OKAY) {
    goto DONE;
  }
  if (mp_init_size (&hi, USED(a) - half) != MP_OKAY) {
    goto FREE_LO;
  }

  /* scratch values */
  if (mp_init_size (&mid, USED(a) * 2) != MP_OKAY) {
    goto FREE_HI;
  }
  if (mp_init_size (&sum, USED(a) * 2) != MP_OKAY) {
    goto FREE_MID;
  }
  if (mp_init_size (&lo_sq, half * 2) != MP_OKAY) {
    goto FREE_SUM;
  }
  if (mp_init_size (&hi_sq, (USED(a) - half) * 2) != MP_OKAY) {
    goto FREE_LO_SQ;
  }

  /* copy the low digits into lo and the remaining ones into hi */
  from = DIGITS(a);

  to = DIGITS(&lo);
  for (i = 0; i < half; i++) {
    to[i] = from[i];
  }

  to = DIGITS(&hi);
  for (i = half; i < USED(a); i++) {
    to[i - half] = from[i];
  }

  SET_USED(&lo, half);
  SET_USED(&hi, USED(a) - half);

  /* only the low half is clamped here */
  mp_clamp (&lo);

  /* lo_sq = x0*x0 */
  if (mp_sqr (&lo, &lo_sq) != MP_OKAY) {
    goto FREE_ALL;
  }
  /* hi_sq = x1*x1 */
  if (mp_sqr (&hi, &hi_sq) != MP_OKAY) {
    goto FREE_ALL;
  }

  /* mid = x1 + x0 */
  if (s_mp_add (&hi, &lo, &mid) != MP_OKAY) {
    goto FREE_ALL;
  }
  /* mid = (x1 + x0)**2 */
  if (mp_sqr (&mid, &mid) != MP_OKAY) {
    goto FREE_ALL;
  }

  /* sum = x0*x0 + x1*x1 */
  if (s_mp_add (&lo_sq, &hi_sq, &sum) != MP_OKAY) {
    goto FREE_ALL;
  }
  /* mid = (x1 + x0)**2 - (x0*x0 + x1*x1) */
  if (s_mp_sub (&mid, &sum, &mid) != MP_OKAY) {
    goto FREE_ALL;
  }

  /* move the middle term up by B digits */
  if (mp_lshd (&mid, half) != MP_OKAY) {
    goto FREE_ALL;
  }
  /* move x1*x1 up by 2*B digits */
  if (mp_lshd (&hi_sq, half * 2) != MP_OKAY) {
    goto FREE_ALL;
  }

  /* mid = x0*x0 + middle term */
  if (mp_add (&lo_sq, &mid, &mid) != MP_OKAY) {
    goto FREE_ALL;
  }
  /* b = x0*x0 + middle term + shifted x1*x1 */
  if (mp_add (&mid, &hi_sq, b) != MP_OKAY) {
    goto FREE_ALL;
  }

  status = MP_OKAY;

  /* release in reverse order of initialisation */
FREE_ALL:
  mp_clear (&hi_sq);
FREE_LO_SQ:
  mp_clear (&lo_sq);
FREE_SUM:
  mp_clear (&sum);
FREE_MID:
  mp_clear (&mid);
FREE_HI:
  mp_clear (&hi);
FREE_LO:
  mp_clear (&lo);
DONE:
  return status;
}
/* Karatsuba squaring: b = a*a built from three half-size squarings.
 *
 * a is cut at B = a->used / 2 digits into a low part x0 and a high part
 * x1 (a = x1 * base**B + x0).  The result is
 *
 *   x1*x1 * base**(2B) + ((x1 + x0)**2 - (x0*x0 + x1*x1)) * base**B + x0*x0
 *
 * See the comments of karatsuba_mul for details; it is essentially the
 * same algorithm, tuned to perform recursive squarings.
 *
 * Returns MP_OKAY on success, and MP_MEM when any allocation or
 * arithmetic step fails; temporaries initialised so far are cleared
 * before returning.
 */
int mp_karatsuba_sqr(const mp_int *a, mp_int *b)
{
   mp_int low, high, cross, squares, low2, high2;
   int    split, rc, k;

   /* stays MP_MEM unless the whole computation goes through */
   rc = MP_MEM;

   /* number of digits in the low part */
   split = a->used >> 1;

   /* storage for the two parts of a */
   if (mp_init_size(&low, split) != MP_OKAY) {
      goto OUT;
   }
   if (mp_init_size(&high, a->used - split) != MP_OKAY) {
      goto CLR_LOW;
   }

   /* storage for the intermediate values */
   if (mp_init_size(&cross, a->used * 2) != MP_OKAY) {
      goto CLR_HIGH;
   }
   if (mp_init_size(&squares, a->used * 2) != MP_OKAY) {
      goto CLR_CROSS;
   }
   if (mp_init_size(&low2, split * 2) != MP_OKAY) {
      goto CLR_SQUARES;
   }
   if (mp_init_size(&high2, (a->used - split) * 2) != MP_OKAY) {
      goto CLR_LOW2;
   }

   /* distribute the digits of a over the two parts */
   for (k = 0; k < split; k++) {
      low.dp[k] = a->dp[k];
   }
   for (k = split; k < a->used; k++) {
      high.dp[k - split] = a->dp[k];
   }
   low.used  = split;
   high.used = a->used - split;

   /* only the low part is clamped here */
   mp_clamp(&low);

   /* low2 = x0*x0, high2 = x1*x1 */
   if (mp_sqr(&low, &low2) != MP_OKAY) {
      goto CLR_ALL;
   }
   if (mp_sqr(&high, &high2) != MP_OKAY) {
      goto CLR_ALL;
   }

   /* cross = (x1 + x0)**2 */
   if (s_mp_add(&high, &low, &cross) != MP_OKAY) {
      goto CLR_ALL;
   }
   if (mp_sqr(&cross, &cross) != MP_OKAY) {
      goto CLR_ALL;
   }

   /* cross = (x1 + x0)**2 - (x0*x0 + x1*x1) */
   if (s_mp_add(&low2, &high2, &squares) != MP_OKAY) {
      goto CLR_ALL;
   }
   if (s_mp_sub(&cross, &squares, &cross) != MP_OKAY) {
      goto CLR_ALL;
   }

   /* align the middle term at B digits and x1*x1 at 2*B digits */
   if (mp_lshd(&cross, split) != MP_OKAY) {
      goto CLR_ALL;
   }
   if (mp_lshd(&high2, split * 2) != MP_OKAY) {
      goto CLR_ALL;
   }

   /* b = x0*x0 + middle term + shifted x1*x1 */
   if (mp_add(&low2, &cross, &cross) != MP_OKAY) {
      goto CLR_ALL;
   }
   if (mp_add(&cross, &high2, b) != MP_OKAY) {
      goto CLR_ALL;
   }

   rc = MP_OKAY;

   /* unwind: each label clears one temporary and falls through */
CLR_ALL:
   mp_clear(&high2);
CLR_LOW2:
   mp_clear(&low2);
CLR_SQUARES:
   mp_clear(&squares);
CLR_CROSS:
   mp_clear(&cross);
CLR_HIGH:
   mp_clear(&high);
CLR_LOW:
   mp_clear(&low);
OUT:
   return rc;
}
/* low level subtraction (assumes |a| > |b|), HAC pp.595 Algorithm 14.9
 *
 * Stores the magnitude difference a - b in c.  Sign fields are not
 * touched here.  c is grown to a->used digits when it is smaller, its
 * used count becomes a->used, and mp_clamp() is applied at the end.
 *
 * Returns MP_OKAY, or the error reported by mp_grow().
 */
int s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
{
  mp_digit  borrow, *da, *db, *dc;
  int       prev_used, err, short_len, long_len, i, out;

  /* digit counts: b supplies the short operand, a the long one */
  short_len = b->used;
  long_len  = a->used;

  /* the result needs as many digits as a */
  if (c->alloc < long_len) {
    err = mp_grow (c, long_len);
    if (err != MP_OKAY) {
      return err;
    }
  }
  prev_used = c->used;
  c->used   = long_len;

  /* digit arrays, fetched after the possible reallocation above */
  da = a->dp;
  db = b->dp;
  dc = c->dp;

  borrow = 0;

  /* digits present in both operands: T[i] = A[i] - B[i] - U */
  for (i = 0; i < short_len; i++) {
    dc[i] = da[i] - db[i] - borrow;

    /* A borrow wraps the digit and so propagates all the way to the most
     * significant bit of the mp_digit; one shift recovers it without an
     * extra AND.
     */
    borrow = dc[i] >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));

    /* drop everything above the digit range */
    dc[i] &= MP_MASK;
  }

  /* digits only a has: T[i] = A[i] - U */
  for (; i < long_len; i++) {
    dc[i] = da[i] - borrow;
    borrow = dc[i] >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
    dc[i] &= MP_MASK;
  }

  /* clear digits above used (c may not have been grown above); writing
   * continues from where the loops above stopped
   */
  out = i;
  for (i = c->used; i < prev_used; i++) {
    dc[out++] = 0;
  }

  mp_clamp (c);
  return MP_OKAY;
}