// Remove every factor of two from n (n is overwritten) and track the
// parity of how many were removed.
//
// When an odd number of twos was removed and the remainder of
// (zExp(2,p) - 1) divided by 16 is 8, the sign accumulator *j is negated.
// (zExp(2,p,..) presumably yields p^2 -- confirm against zExp.)
//
// Returns 1 if an odd number of twos was removed, 0 otherwise.
//
// NOTE(review): the loop never terminates if the low limb never becomes
// odd (e.g. n == 0); callers presumably pass non-zero n -- confirm.
int z_pull_twos(z *n, int *j, z *p)
{
	int odd_twos = 0;
	z minus_one, scratch;
	fp_digit rem16;

	zInit(&minus_one);
	zInit(&scratch);

	//halve n until its low bit is set, flipping the parity each time
	while ((n->val[0] & 1) == 0)
	{
		zShiftRight(n, n, 1);
		odd_twos ^= 1;
	}

	//rem16 = (zExp(2,p) - 1) mod 16
	zExp(2, p, &scratch);
	zSub(&scratch, &zOne, &minus_one);
	rem16 = zShortDiv(&minus_one, 16, &scratch);

	//the sign only flips for an odd count of twos
	if (odd_twos && rem16 == 8)
		*j = -(*j);

	zFree(&minus_one);
	zFree(&scratch);
	return odd_twos;
}
// Convert u (binary limbs) into v, whose limbs are base-MAX_DEC_WORD
// "decimal words", least significant first.
//
// u is repeatedly divided by MAX_DEC_WORD; the remainder of the i-th
// division becomes the i-th decimal word.  The loop stops once the
// quotient is zero.  v takes the sign of u.
void zHex2Dec(z *u, z *v)
{
	z cur, quot;
	fp_digit rem = 0;
	int ulimbs = abs(u->size);
	//decimal words hold less per limb than binary ones, so reserve extra
	int reserve = (int)((double)ulimbs * 1.5);
	int nwords = 0;

	zInit(&cur);
	zInit(&quot);

	if (v->alloc < reserve)
		zGrow(v, reserve);
	zClear(v);

	if (cur.alloc < reserve)
	{
		zGrow(&cur, reserve);
		zClear(&cur);
	}

	if (quot.alloc < reserve)
	{
		zGrow(&quot, reserve);
		zClear(&quot);
	}

	zCopy(u, &cur);

	//always emits at least one word, even for u == 0
	do
	{
		rem = zShortDiv(&cur, MAX_DEC_WORD, &quot);
		v->val[nwords] = rem;
		nwords++;
		zCopy(&quot, &cur);
	} while (zCompare(&cur, &zZero) != 0);

	v->size = nwords;
	if (u->size < 0)
		v->size = -v->size;

	zFree(&cur);
	zFree(&quot);
	return;
}
// Initialise the global montyconst structure for modulus n.
// n is assumed to be odd (relatively prime to 2).
//
// Moduli of at most 16 limbs take the fp_montgomery_* path and set
// TFM_MONTY = 1.  Larger moduli set TFM_MONTY = 0 and compute:
//   montyconst.r    = radix^(n->size), a single 1 in limb n->size
//   montyconst.nhat = radix - (n^-1 mod radix), via xGCD
//   montyconst.one  = 1 converted to Montgomery form
// where radix is the two-limb value {0, 1}, i.e. one limb's worth of base.
void monty_init(z *n)
{
	z gcd, radix, neg_inv, spare;

	zInit(&montyconst.nhat);
	zInit(&montyconst.r);
	zInit(&montyconst.rhat);
	zInit(&montyconst.one);

	if (abs(n->size) <= 16)
	{
		//small modulus: delegate to the fp_montgomery helpers
		fp_montgomery_setup(n, &montyconst.nhat.val[0]);
		fp_montgomery_calc_normalization(&montyconst.r, n);
		montyconst.one.val[0] = 1;
		montyconst.one.size = 1;
		to_monty(&montyconst.one, n);
		TFM_MONTY = 1;
		return;
	}

	TFM_MONTY = 0;

	zInit(&gcd);
	zInit(&radix);
	zInit(&neg_inv);
	zInit(&spare);		//initialised and freed, not otherwise used

	//radix = {0, 1}; relies on zInit leaving val[0] == 0 -- TODO confirm
	radix.val[1] = 1;
	radix.size = 2;

	//r = radix^(n->size), which exceeds n
	if (montyconst.r.alloc < n->size + 1)
		zGrow(&montyconst.r, n->size + 1);
	zClear(&montyconst.r);
	montyconst.r.size = n->size + 1;
	montyconst.r.val[montyconst.r.size - 1] = 1;

	//nhat = -(n^-1) mod radix = radix - (n^-1 mod radix)
	xGCD(n, &radix, &montyconst.nhat, &montyconst.rhat, &gcd);
	zSub(&radix, &montyconst.nhat, &neg_inv);
	zCopy(&neg_inv, &montyconst.nhat);

	//cache the Montgomery form of 1
	zCopy(&zOne, &montyconst.one);
	to_monty(&montyconst.one, n);

	zFree(&gcd);
	zFree(&radix);
	zFree(&neg_inv);
	zFree(&spare);
	return;
}
// Convert x, in place, from normal representation to Montgomery
// representation for modulus n:  x <- (x * montyconst.r) mod n.
// Relies on montyconst.r having been set up beforehand.
void to_monty(z *x, z *n)
{
	z product, quotient;

	zInit(&product);
	zInit(&quotient);

	zMul(x, &montyconst.r, &product);
	//the remainder is written straight back into x; the quotient is discarded
	zDiv(&product, n, &quotient, x);

	zFree(&product);
	zFree(&quotient);
	return;
}
// Compute u = a^b mod nn with the right-to-left binary method, using
// Montgomery arithmetic (monty_mul / monty_sqr).
//
// a must already be in Montgomery representation; b is an ordinary
// integer.  The result u is left in Montgomery representation (the
// accumulator starts from montyconst.one).
void zmModExp(z *a, z *b, z *u, z *nn)
{
	z result, expo, base, scratch;

	zInit(&base);
	zInit(&expo);
	zInit(&result);
	zInit(&scratch);

	zCopy(&montyconst.one, &result);
	zCopy(a, &base);
	zCopy(b, &expo);

	while (!isZero(&expo))
	{
		//multiply in the current square whenever the low exponent bit is set
		if (expo.val[0] & 0x1)
		{
			monty_mul(&result, &base, &scratch, nn);
			zCopy(&scratch, &result);
		}

		zShiftRight(&expo, &expo, 1);

		//next successive square of the base
		monty_sqr(&base, &scratch, nn);
		zCopy(&scratch, &base);

		//keep the running square non-negative
		if (base.size < 0)
			base.size = -base.size;
	}

	zCopy(&result, u);

	zFree(&base);
	zFree(&expo);
	zFree(&result);
	zFree(&scratch);
	return;
}
// Exchange the values of a and b by copying through a temporary.
void swap(z *a, z *b)
{
	z held;

	zInit(&held);

	zCopy(a, &held);	//held = a
	zCopy(b, a);		//a = b
	zCopy(&held, b);	//b = old a

	zFree(&held);
	return;
}
// Return the number of decimal digits in |n|.
//
// The value is repeatedly divided by MAX_DEC_WORD, counting
// DEC_DIGIT_PER_WORD digits per division, until a single limb remains;
// that limb's digits are then counted with ndigits_1.
//
// The sign of n is ignored.  Previously a negative multi-limb n skipped
// the division loop (its size is negative, so "size > 1" was false) and
// only the digits of the lowest limb were reported.
int ndigits(z *n)
{
	int count = 0;
	z dividend, quotient;

	zInit(&dividend);
	zInit(&quotient);
	zCopy(n, &quotient);

	//work on the magnitude so the limb-count test below is meaningful
	if (quotient.size < 0)
		quotient.size = -quotient.size;

	while (quotient.size > 1)
	{
		zCopy(&quotient, &dividend);
		//the remainder is irrelevant here; only the quotient is kept
		(void)zShortDiv(&dividend, MAX_DEC_WORD, &quotient);
		count += DEC_DIGIT_PER_WORD;
	}

	count += ndigits_1(quotient.val[0]);

	zFree(&dividend);
	zFree(&quotient);
	return count;
}
// Extended Euclidean algorithm.
//
// On return g = GCD(a,b).  When the remainder sequence reaches 1, x and y
// hold cofactors with a*x + b*y = 1 before the final adjustment; a
// negative x is then replaced by b - |x| and a negative y by a - |y|, so
// both come back non-negative.  When the remainder reaches 0 instead, no
// inverses exist and x and y are set to zero.
//
// State kept across iterations:
//   (u, v, r)  previous cofactors and remainder
//   (x, y, R)  current cofactors and remainder (x and y double as U and V)
// Each step computes q = r / R and
//   t1 = u - x*q,  t2 = v - y*q,  t3 = r - R*q
// then shifts (x, y, R) into (u, v, r) and (t1, t2, t3) into (x, y, R).
//
// The type fields of x, y and g are reset to UNKNOWN on exit.
void xGCD(z *a, z *b, z *x, z *y, z *g)
{
	z prod, next_x, next_y, next_rem, quot, prev_rem, cur_rem, prev_x, prev_y;
	int a_is_smaller;

	zInit(&prod);
	zInit(&next_x);
	zInit(&next_y);
	zInit(&next_rem);
	zInit(&quot);
	zInit(&prev_rem);
	zInit(&cur_rem);
	zInit(&prev_x);
	zInit(&prev_y);

	//need to check for temp allocation

	zClear(x);
	zClear(y);

	//start with the larger operand as the "previous" remainder
	a_is_smaller = (zCompare(a, b) < 0);
	if (a_is_smaller)
	{
		prev_x.val[0] = 0;
		prev_y.val[0] = 1;
		zCopy(b, &prev_rem);
		x->val[0] = 1;
		y->val[0] = 0;
		zCopy(a, &cur_rem);
	}
	else
	{
		prev_x.val[0] = 1;
		prev_y.val[0] = 0;
		zCopy(a, &prev_rem);
		x->val[0] = 0;
		y->val[0] = 1;
		zCopy(b, &cur_rem);
	}

	for (;;)
	{
		if (zCompare(&zZero, &cur_rem) == 0)
		{
			//remainder hit zero: gcd is the previous remainder, no inverses
			zCopy(&prev_rem, g);
			zCopy(&zZero, x);
			zCopy(&zZero, y);
			break;
		}

		if (zCompare(&zOne, &cur_rem) == 0)
		{
			//remainder hit one: x and y already hold the cofactors
			zCopy(&cur_rem, g);
			break;
		}

		zCopy(&prev_rem, &prod);
		zDiv(&prod, &cur_rem, &quot, &next_rem);	//quot = r / R, next_rem = r % R

		zMul(&quot, x, &prod);				//next_x = u - U*q
		zSub(&prev_x, &prod, &next_x);

		zMul(&quot, y, &prod);				//next_y = v - V*q
		zSub(&prev_y, &prod, &next_y);

		//slide the window: current becomes previous, next becomes current
		zCopy(x, &prev_x);
		zCopy(y, &prev_y);
		zCopy(&cur_rem, &prev_rem);

		zCopy(&next_x, x);
		zCopy(&next_y, y);
		zCopy(&next_rem, &cur_rem);
	}

	//fold negative cofactors back into the non-negative range
	if (x->size < 0)
	{
		x->size = -x->size;
		zSub(b, x, x);
	}

	if (y->size < 0)
	{
		y->size = -y->size;
		zSub(a, y, y);
	}

	zFree(&prod);
	zFree(&next_x);
	zFree(&next_y);
	zFree(&next_rem);
	zFree(&quot);
	zFree(&prev_rem);
	zFree(&cur_rem);
	zFree(&prev_x);
	zFree(&prev_y);

	x->type = UNKNOWN;
	y->type = UNKNOWN;
	g->type = UNKNOWN;
	return;
}
// Return 1 if n is a perfect square, 0 otherwise.
// (thanks fenderbender @ mersenneforum.org)
//
// Cheap residue filters run first: the low 7 bits of n, then residues of
// a single big-integer modulus (63*25*11*17*19*23*31) tested against
// 63, 25, 31, 23, 19, 17 and 11.  Each filter is a bit-mask test on
// multiples of the residue; a non-zero result proves n is not a square.
// Only the survivors pay for the definitive test: take the integer
// square root, square it, and compare with n.
//
// Per the original author's notes the filters reject about 99.92% of
// non-squares; a second round using primes 13 29 37 41 43 53 could
// reject roughly 98% more and would pay off for n > 10^100 or so.
int isSquare(z *n)
{
	unsigned long probe;
	unsigned long residue;
	z root, root_sq;
	int cmp;

	//mod 128 rejection (82% per the original notes): reads bits directly
	probe = n->val[0] & 127;
	if ((probe * 0x8bc40d7d) & (probe * 0xa1e2f5d1) & 0x14020a)
		return 0;

	//the remaining filters share one (slow) big-integer modulus
	residue = zShortMod(n, (63UL*25*11*17*19*23*31));

	//residues mod 63, 75% rejection
	probe = residue % 63;
	if ((probe * 0x3d491df7) & (probe * 0xc824a9f9) & 0x10f14008)
		return 0;

	//residues mod 25, 56% rejection
	probe = residue % 25;
	if ((probe * 0x1929fc1b) & (probe * 0x4c9ea3b2) & 0x51001005)
		return 0;

	//residues mod 31, 48.4% rejection
	//this filter has a slightly different form to keep it perfect
	probe = 0xd10d829a * (residue % 31);
	if (probe & (probe + 0x672a5354) & 0x21025115)
		return 0;

	//residues mod 23, 47.8% rejection
	probe = residue % 23;
	if ((probe * 0x7bd28629) & (probe * 0xe7180889) & 0xf8300)
		return 0;

	//residues mod 19, 47.3% rejection
	probe = residue % 19;
	if ((probe * 0x1b8bead3) & (probe * 0x4d75a124) & 0x4280082b)
		return 0;

	//residues mod 17, 47.1% rejection
	probe = residue % 17;
	if ((probe * 0x6736f323) & (probe * 0x9b1d499) & 0xc0000300)
		return 0;

	//residues mod 11, 45.5% rejection
	probe = residue % 11;
	if ((probe * 0xabf1a3a7) & (probe * 0x2612bf93) & 0x45854000)
		return 0;

	//very expensive final definitive test
	zInit(&root);
	zInit(&root_sq);

	zNroot(n, &root, 2);		//root = 2nd root of n
	zSqr(&root, &root_sq);		//root_sq = root^2
	cmp = zCompare(n, &root_sq);

	zFree(&root);
	zFree(&root_sq);

	if (cmp == 0)
		return 1;
	return 0;
}
// Convert u, whose limbs are base-MAX_DEC_WORD "decimal words" (least
// significant first), into the binary-limb value v.
//
// The i-th decimal word is multiplied by MAX_DEC_WORD^i and added into
// a running sum; the sum is then trimmed of zero high limbs, given the
// sign of u, and copied to v.
void zDec2Hex(z *u, z *v)
{
	z power, term, sum;
	int i, top;
	int nwords = abs(u->size);

	zInit(&power);
	zInit(&term);
	zInit(&sum);

	if (v->alloc < nwords)
		zGrow(v, nwords);

	if (power.alloc < nwords)
	{
		zGrow(&power, nwords);
		zClear(&power);
	}

	if (term.alloc < nwords)
	{
		zGrow(&term, nwords);
		zClear(&term);
	}

	if (sum.alloc < nwords)
	{
		zGrow(&sum, nwords);
		zClear(&sum);
	}
	sum.size = nwords;

	//power holds MAX_DEC_WORD^i, starting from 1
	power.size = 1;
	power.val[0] = 1;

	for (i = 0; i < nwords; i++)
	{
		zShortMul(&power, u->val[i], &term);
		zAdd(&sum, &term, &sum);
		zShortMul(&power, MAX_DEC_WORD, &power);
	}

	//the sum may have unused high order limbs; find the real top
	top = nwords;
	while (top > 0 && sum.val[top - 1] == 0)
		top--;
	sum.size = top;

	if (u->size < 0)
		sum.size = -sum.size;

	//zero is represented with size 1
	if (sum.size == 0)
		sum.size = 1;

	zCopy(&sum, v);

	zFree(&sum);
	zFree(&power);
	zFree(&term);
	return;
}
// Render n as a decimal string into s.
//
// s->s is grown with realloc when it is too small, so the buffer may
// move; the (possibly new) pointer is stored in s->s and returned.
// On exit s->nchars is the string length plus one (it counts the
// terminating NUL).
//
// Returns s->s on success.  If the buffer must grow and realloc fails,
// returns NULL and leaves s->s pointing at the original, still valid,
// buffer (holding an empty string).
//
// Changes from the earlier version:
//  - the first word was written with sprintf(s->s,"%s...",s->s,...);
//    passing the destination as a source is undefined behaviour.  Each
//    piece is now written at an explicit offset.
//  - the realloc result is checked before the old pointer is replaced.
//  - the heap scratch buffer is gone; words are formatted in place.
//  - the cursor advances by what sprintf reports it wrote rather than a
//    separately computed digit count.
//
// Performance note kept from the original: for very long inputs the
// hex-to-decimal conversion and this printing dominate the run time.
char *z2decstr(z *n, str_t *s)
{
	z dec;
	int i, nwords;
	int pos = 0;	//index of the next character to write

	s->s[0] = '\0';
	s->nchars = 1;
	zInit(&dec);

	zHex2Dec(n, &dec);
	nwords = abs(dec.size);

	//worst case: sign + DEC_DIGIT_PER_WORD digits per word + NUL
	if (s->alloc < DEC_DIGIT_PER_WORD*nwords + 2)
	{
		char *grown = (char *)realloc(s->s,
			(DEC_DIGIT_PER_WORD*nwords + 10)*sizeof(char));

		if (grown == NULL)
		{
			zFree(&dec);
			return NULL;
		}
		s->s = grown;
		s->alloc = (DEC_DIGIT_PER_WORD*nwords + 10);
	}

	//print negative sign, if necessary
	if (n->size < 0)
		s->s[pos++] = '-';

#if BITS_PER_DIGIT == 32
	//most significant word: no zero padding
	pos += sprintf(s->s + pos, "%u", (uint32)dec.val[nwords - 1]);

	//remaining words: zero padded to a full 9 digits each
	for (i = nwords - 2; i >= 0; i--)
		pos += sprintf(s->s + pos, "%09u", (uint32)dec.val[i]);
#else
	//most significant word: no zero padding
	pos += sprintf(s->s + pos, "%" PRIu64, dec.val[nwords - 1]);

	//remaining words: zero padded to a full 19 digits each
	for (i = nwords - 2; i >= 0; i--)
		pos += sprintf(s->s + pos, "%019" PRIu64, dec.val[i]);
#endif

	s->s[pos] = '\0';
	s->nchars = pos + 1;

	zFree(&dec);
	return s->s;
}
// Compute u = a^e mod n with the left-to-right sliding window method
// (see, for instance, the Handbook of Applied Cryptography), using
// Montgomery arithmetic.
//
// a must already be in Montgomery representation; e is an ordinary
// integer; k is the window size.  u is left in Montgomery representation.
//
// Outline:
//   1. Precompute odd powers: table[i] = a^(2i+1), for i = 0 .. 2^(k-1).
//   2. A = 1, scan the exponent bits from the top.
//   3. A zero bit costs one squaring.  At a set bit, take the longest
//      window of at most k bits that ends in a set bit, square once per
//      window bit, then multiply by the table entry for that window.
void zmModExpw(z *a, z *e, z *u, z *n, int k)
{
	z *table, a_sq, scratch;
	int table_len, idx, bit, low, nbits, window, limb;
	fp_digit word;
	uint8 *bits;

	//one table slot per odd power, plus one spare
	table_len = (int)((1 << (k - 1)) + 1);
	table = (z *)malloc(table_len * sizeof(z));
	for (idx = 0; idx < table_len; idx++)
		zInit(&table[idx]);

	zInit(&a_sq);
	zInit(&scratch);

	//precomputation: table[i] = table[i-1] * a^2 = a^(2i+1)
	zCopy(a, &table[0]);
	monty_sqr(a, &a_sq, n);
	for (idx = 1; idx < table_len; idx++)
		monty_mul(&table[idx - 1], &a_sq, &table[idx], n);

	zCopy(&montyconst.one, u);

	//unpack e into one bit per byte, least significant bit first
	nbits = zBits(e);
	bits = (uint8 *)malloc(nbits * sizeof(uint8));

	//all limbs below the top one contribute every bit
	for (limb = 0; limb < e->size - 1; limb++)
	{
		word = e->val[limb];
		for (bit = 0; bit < BITS_PER_DIGIT; bit++)
		{
			bits[BITS_PER_DIGIT * limb + bit] = (uint8)(word & 0x1);
			word >>= 1;
		}
	}

	//the top limb contributes only up to its highest set bit
	word = e->val[limb];
	for (bit = 0; word != 0; bit++)
	{
		bits[BITS_PER_DIGIT * limb + bit] = (uint8)(word & 0x1);
		word >>= 1;
	}

	idx = nbits - 1;
	while (idx >= 0)
	{
		if (!bits[idx])
		{
			//zero bit: a single squaring
			monty_sqr(u, &scratch, n);
			zCopy(&scratch, u);
			idx--;
			continue;
		}

		//find the lowest set bit within k positions of idx; only search
		//when a full window fits, to stay inside the bit array
		low = idx;
		if (idx >= (k - 1))
		{
			for (bit = k - 1; bit > 0; bit--)
			{
				if (bits[idx - bit])
				{
					//longest possible window found
					low = idx - bit;
					break;
				}
			}
		}

		//value of bits[idx..low]; it is odd, so (value-1)/2 indexes the table
		window = 0;
		for (bit = idx; bit >= low; bit--)
			window = (window << 1) | bits[bit];
		window = (window - 1) / 2;

		//A = A^(2^(idx-low+1)) * table[window]
		for (bit = 0; bit < (idx - low + 1); bit++)
		{
			monty_sqr(u, &scratch, n);
			zCopy(&scratch, u);
		}
		monty_mul(u, &table[window], &scratch, n);
		zCopy(&scratch, u);

		//continue below the window
		idx = low - 1;
	}

	for (idx = 0; idx < table_len; idx++)
		zFree(&table[idx]);
	free(table);
	zFree(&a_sq);
	zFree(&scratch);
	free(bits);
	return;
}
// Montgomery reduction (Handbook of Applied Cryptography, ch. 14):
// replaces T with T * R^-1 mod n, where R = b^(n->size) and b is the
// limb radix.  Uses the precomputed montyconst.nhat = -n^-1 mod b.
//
//   1. A = T
//   2. for i = 0 .. n->size - 1:
//        ui = a_i * nhat mod b
//        A  = A + ui * n * b^i
//   3. A = A / b^(n->size)
//   4. if A > n then A = A - n
//
// When TFM_MONTY == 1 the work is delegated to fp_montgomery_reduce.
//
// Fixes relative to the earlier version:
//  - after ui*n is shifted up by i limbs, the vacated low i limbs are
//    now cleared.  They used to keep stale product limbs which, being
//    inside mtmp3.size, were added into T and could carry upward.
//  - the final shift-down only reads limbs that lie inside T->size
//    instead of running n->size limbs past it.
void zREDC(z *T, z *n)
{
	int i, j, ix, su;
	fp_digit nhat = montyconst.nhat.val[0], ui, k;
	z mtmp3;

	if (TFM_MONTY == 1)
	{
		fp_montgomery_reduce(T, n, montyconst.nhat.val[0]);
		return;
	}

	zInit(&mtmp3);

	//ui*n shifted by up to n->size - 1 limbs needs about 2*n->size limbs
	if (mtmp3.alloc < n->size * 2)
		zGrow(&mtmp3, n->size * 2);

	//T needs room for n->size + T->size limbs
	if (T->alloc < n->size + T->size)
		zGrow(T, n->size + T->size + 1);

	for (i = 0; i < n->size; i++)
	{
		//the mod b happens automatically because only the low limb of
		//the product is kept
		ui = T->val[i] * nhat;

		//short multiply: mtmp3 = ui * n
		k = 0;
		su = n->size;
		for (ix = 0; ix < su; ++ix)
			spMulAdd(n->val[ix], ui, 0, k, &mtmp3.val[ix], &k);

		//if there is still a carry, it becomes one more limb
		if (k)
		{
			mtmp3.val[su] = k;
			su++;
		}

		//drop leading zero limbs
		for (ix = su - 1; ix >= 0; --ix)
		{
			if (mtmp3.val[ix] != 0)
				break;
		}
		mtmp3.size = ix + 1;

		//mtmp3 *= b^i: move limbs up, highest first so nothing is clobbered
		for (j = mtmp3.size - 1; j >= 0; j--)
			mtmp3.val[j + i] = mtmp3.val[j];

		//the low i limbs of a value multiplied by b^i must be zero
		for (j = 0; j < i; j++)
			mtmp3.val[j] = 0;

		mtmp3.size += i;

		zAdd(T, &mtmp3, T);		//A += ui * n * b^i
	}

	//A /= b^(n->size): shift down, touching only limbs inside T->size
	for (j = 0; j + n->size < T->size; j++)
		T->val[j] = T->val[j + n->size];
	T->size -= n->size;

	if (zCompare(T, n) > 0)		//if A > n, A = A - n
		zSub(T, n, T);

	//an empty result is replaced by n
	if (T->size == 0)
		zCopy(n, T);

	zFree(&mtmp3);
	return;
}
// Montgomery multiplication with the reduction interleaved into the
// multiply: c = a * b * R^-1 mod n, one limb of a per pass.
//
// Each pass over limb a[i] does two fused steps:
//   1. sum = acc + a[i] * b
//   2. acc = (sum + m * n) / radix, where m is chosen so the low limb
//      of sum + m*n is zero (and is simply not stored)
// After n->size passes a final conditional subtraction of n gives c.
//
// Uses montyconst.nhat.val[0] as the reduction constant.
// NOTE(review): a->val[i] is read for every i < n->size, so a presumably
// has zeroed limbs up to n->size -- confirm against callers.
void monty_mul_interleaved(z *a, z *b, z *c, z *n)
{
	fp_digit n_inv = montyconst.nhat.val[0];
	fp_digit m, carry;
	int i, j;
	int nlimbs = n->size;
	int blimbs = abs(b->size);
	z acc, sum;

	zInit(&acc);
	zInit(&sum);
	zClear(&acc);
	zClear(&sum);

	for (i = 0; i < nlimbs; i++)
	{
		//truncation to one limb provides the mod-radix
		m = (acc.val[0] + a->val[i] * b->val[0]) * n_inv;

		/****** step 1: sum = acc + a[i] * b ********/

		//zero any unused limbs of acc up to the size of b, so they can be added
		for (j = acc.size; j < blimbs; j++)
			acc.val[j] = 0;

		//multiply and add across the limbs of b
		carry = 0;
		for (j = 0; j < blimbs; j++)
			spMulAdd(b->val[j], a->val[i], acc.val[j], carry, &sum.val[j], &carry);

		//keep adding if acc has more limbs than b
		for (; j < acc.size; j++)
			spAdd(acc.val[j], carry, &sum.val[j], &carry);

		sum.size = (acc.size > blimbs) ? acc.size : blimbs;

		//a leftover carry becomes a new top limb
		if (carry)
		{
			sum.val[sum.size] = carry;
			sum.size++;
			j++;
		}

		/****** step 2: acc = (sum + m * n) shifted right one limb ********/

		//zero any unused limbs of sum up to the size of n, so they can be added
		for (; j < nlimbs; j++)
			sum.val[j] = 0;

		//the first product only seeds the carry; its limb is shifted away
		carry = 0;
		spMulAdd(n->val[0], m, sum.val[0], carry, &acc.val[0], &carry);

		//remaining limbs are stored one position lower
		for (j = 1; j < nlimbs; j++)
			spMulAdd(n->val[j], m, sum.val[j], carry, &acc.val[j - 1], &carry);

		//keep adding if sum has more limbs than n
		for (; j < sum.size; j++)
			spAdd(sum.val[j], carry, &acc.val[j - 1], &carry);

		acc.size = (sum.size > nlimbs) ? sum.size - 1 : nlimbs - 1;

		//a leftover carry becomes a new top limb
		if (carry)
		{
			acc.val[acc.size] = carry;
			acc.size++;
		}
	}

	//final conditional subtraction brings the result below n
	if (zCompare(&acc, n) >= 0)
		zSub(&acc, n, c);
	else
		zCopy(&acc, c);

	zFree(&acc);
	zFree(&sum);
	return;
}
// Comba (column-wise) multiplication: C = A * B.
//
// Output limb "col" accumulates every product A[ia] * B[ib] with
// ia + ib == col in the three-limb accumulator (c0, c1, c2), stores the
// low limb, and carries the rest forward to the next column.
//
// C may alias A or B; in that case the product is built in a local
// temporary and copied into C at the end.  The sign of the result is
// negative when A and B have opposite signs.
//
// c0, c1 and c2 keep their names because the COMBA_* / MULADD macros
// presumably refer to them directly -- confirm against the macro
// definitions before renaming.
void fp_mul_comba(z *A, z *B, z *C)
{
	int col, nterms, term, ia, ib, out_len, len_a, len_b;
	fp_digit c0, c1, c2, *pa, *pb;
	z *dst;
	z loc;

	COMBA_START;
	COMBA_CLEAR;

	//the product has at most len_a + len_b limbs
	len_a = abs(A->size);
	len_b = abs(B->size);
	out_len = len_a + len_b;

	//never write into an operand that is still being read
	if (A == C || B == C)
	{
		zInit(&loc);
		dst = &loc;
	}
	else
	{
		dst = C;
	}

	if (dst->alloc < out_len)
		zGrow(dst, out_len + LIMB_BLKSZ);
	zClear(dst);

	for (col = 0; col < out_len; col++)
	{
		//starting offsets into the two operands for this column
		ib = MIN(col, len_b - 1);
		ia = col - ib;

		pa = A->val + ia;
		pb = B->val + ib;

		//number of products in this column; equivalent to
		//while (ia++ < len_a && ib-- >= 0) { ... }
		nterms = MIN(len_a - ia, ib + 1);

		//carry the previous column's overflow into this one
		COMBA_FORWARD;
		for (term = 0; term < nterms; ++term)
		{
			MULADD(*pa++, *pb--);
		}

		//store the finished limb
		COMBA_STORE(dst->val[col]);
	}
	COMBA_FINI;

	dst->size = out_len;
	if ((A->size * B->size) < 0)
		dst->size = -dst->size;
	fp_clamp(dst);

	//aliased case: hand the temporary's value to C and release it
	if (dst != C)
	{
		zCopy(dst, C);
		zFree(&loc);
	}
}
void fp_sqr_comba(z *A, z *B) { int pa, ix, iz, sA; fp_digit c0, c1, c2; z *dst; z loc; #ifdef TFM_ISO uint64 tt; #endif /* get size of output and trim */ sA = abs(A->size); pa = sA + sA; /* number of output digits to produce */ COMBA_START; CLEAR_CARRY; if (A == B) { //zClear(&atmp1); zInit(&loc); //dst = &atmp1; dst = &loc; } else { zClear(B); dst = B; } if (dst->alloc < pa) { zGrow(dst,pa + LIMB_BLKSZ); } zClear(dst); for (ix = 0; ix < pa; ix++) { int tx, ty, iy; fp_digit *tmpy, *tmpx; /* get offsets into the two bignums */ ty = MIN(sA-1, ix); tx = ix - ty; /* setup temp aliases */ tmpx = A->val + tx; tmpy = A->val + ty; /* this is the number of times the loop will iterrate, while (tx++ < a->used && ty-- >= 0) { ... } */ iy = MIN(sA-tx, ty+1); /* now for squaring tx can never equal ty * we halve the distance since they approach * at a rate of 2x and we have to round because * odd cases need to be executed */ iy = MIN(iy, (ty-tx+1)>>1); /* forward carries */ CARRY_FORWARD; /* execute loop */ for (iz = 0; iz < iy; iz++) { SQRADD2(*tmpx++, *tmpy--); } /* even columns have the square term in them */ if ((ix&1) == 0) { SQRADD(A->val[ix>>1],A->val[ix>>1]); } /* store it */ COMBA_STORE(dst->val[ix]); }