/* -- FOLDING ---------------------------------------------------------------
//
//    The performance boost comes from a technique referred to here as
//    FOLDING. Folding can be viewed as an extension of Shamir's trick, but
//    it is based on breaking the scalar multiplier of a*P down into a
//    polynomial of the form:
//
//        a*P = SUM(a_i*2^(i*w))*P    for i = 0,1,2,...n-1
//
//        a*P = SUM(a_i*P_i)
//
//        where P_i = (2^(i*w))*P
//              n   = number of folds
//              w   = bit-length of a_i
//
//    For a folding of 8, the 256-bit multiplier 'a' is chopped into 8 limbs
//    of 32 bits each (a_0, a_1,...a_7). P_0 - P_7 can be pre-calculated and
//    their 256 different combinations can be cached or hard-coded directly
//    into the code.
//    This arrangement, combined with the double-and-add approach, reduces
//    the number of EC point calculations by a factor of 8. Only 31
//    double & add operations are needed.
//
//         +---+---+---+---+---+---+- .... -+---+---+---+---+---+---+
//    a = (|255|254|253|252|251|250|        | 5 | 4 | 3 | 2 | 1 | 0 |)
//         +---+---+---+---+---+---+- .... -+---+---+---+---+---+---+
//
//                      a_i                        P_i
//          +---+---+---+ .... -+---+---+---+    ----------
//    a7 = (|255|254|253|       |226|225|224|) * (2**224)*P
//          +---+---+---+ .... -+---+---+---+
//    a6 = (|223|222|221|       |194|193|192|) * (2**192)*P
//          +---+---+---+ .... -+---+---+---+
//    a5 = (|191|190|189|       |162|161|160|) * (2**160)*P
//          +---+---+---+ .... -+---+---+---+
//    a4 = (|159|158|157|       |130|129|128|) * (2**128)*P
//          +---+---+---+ .... -+---+---+---+
//    a3 = (|127|126|125|       | 98| 97| 96|) * (2**96)*P
//          +---+---+---+ .... -+---+---+---+
//    a2 = (| 95| 94| 93|       | 66| 65| 64|) * (2**64)*P
//          +---+---+---+ .... -+---+---+---+
//    a1 = (| 63| 62| 61|       | 34| 33| 32|) * (2**32)*P
//          +---+---+---+ .... -+---+---+---+
//    a0 = (| 31| 30| 29|       | 2 | 1 | 0 |) * (2**0)*P
//          +---+---+---+ .... -+---+---+---+
//            |   |                   |   |
//            |   +--+                |   +--+
//            |      |                |      |
//            V      V     slices     V      V
//          +---+  +---+    ....    +---+  +---+
//          |255|  |254|            |225|  |224|   P7
//          +---+  +---+    ....    +---+  +---+
//          |223|  |222|            |193|  |192|   P6
//          +---+  +---+    ....    +---+  +---+
//          |191|  |190|            |161|  |160|   P5
//          +---+  +---+    ....    +---+  +---+
//          |159|  |158|            |129|  |128|   P4
//          +---+  +---+    ....    +---+  +---+
//          |127|  |126|            | 97|  | 96|   P3
//          +---+  +---+    ....    +---+  +---+
//          | 95|  | 94|            | 65|  | 64|   P2
//          +---+  +---+    ....    +---+  +---+
//          | 63|  | 62|            | 33|  | 32|   P1
//          +---+  +---+    ....    +---+  +---+
//          | 31|  | 30|            | 1 |  | 0 |   P0
//          +---+  +---+    ....    +---+  +---+
//  cut[]:    0      1      ....      30     31
//
// --------------------------------------------------------------------------
//  Return S = a*P where P is the ed25519 base point and R is random.
//
//  S   receives the result in extended coordinates (x, y, z, t).
//  sk  is the scalar 'a'; ecp_8Folds() turns it into the 32 slices cut[].
//  R   is a blinding value: the starting point is scaled by it so that the
//      projective representation is randomized (Z = 2R).
*/
void edp_BasePointMult(
    OUT Ext_POINT *S,
    IN const U_WORD *sk,
    IN const U_WORD *R)
{
    U8 cut[32];
    const PA_POINT *start;
    int slice;

    /* Each cut[] entry selects one pre-computed point of the folding table */
    ecp_8Folds(cut, sk);

    /* Recover doubled affine coordinates of the first table entry from its
    // pre-computed (Y+X, Y-X, 2d*T) form. The comments in the original code
    // give the results as 2x, 2y and 2xy; _w_di is presumably 1/d.
    */
    start = &_w_base_folding8[cut[0]];
    ecp_SubReduce(S->x, start->YpX, start->YmX);    /* (y+x) - (y-x) = 2x */
    ecp_AddReduce(S->y, start->YpX, start->YmX);    /* (y+x) + (y-x) = 2y */
    ecp_MulReduce(S->t, start->T2d, _w_di);         /* 2xy */

    /* Randomize the starting point: scale every coordinate by R */
    ecp_AddReduce(S->z, R, R);                      /* Z = 2R */
    ecp_MulReduce(S->x, S->x, R);                   /* X = 2xR */
    ecp_MulReduce(S->t, S->t, R);                   /* T = 2xyR */
    ecp_MulReduce(S->y, S->y, R);                   /* Y = 2yR */

    /* Remaining 31 slices: one doubling plus one table addition each */
    for (slice = 1; slice <= 31; slice++)
    {
        edp_DoublePoint(S);
        edp_AddAffinePoint(S, &_w_base_folding8[cut[slice]]);
    }
}
/*
// One combined Montgomery-ladder step on X/Z coordinates:
//      P = P + Q   (differential addition)
//      Q = 2Q      (doubling)
//
// Base supplies the x-coordinate used by the addition formula (xb below);
// the matching z-coordinate zb is taken to be 1.
//
//  x3 = ((x1-z1)(x2+z2) + (x1+z1)(x2-z2))^2 * zb
//  z3 = ((x1-z1)(x2+z2) - (x1+z1)(x2-z2))^2 * xb
//  x4 = (x2+z2)^2 * (x2-z2)^2
//  z4 = ((x2+z2)^2 - (x2-z2)^2) *
//       ((x2+z2)^2 + 121665*((x2+z2)^2 - (x2-z2)^2))
*/
void ecp_Mont(XZ_POINT *P, XZ_POINT *Q, IN const U_WORD *Base)
{
    U_WORD u[K_WORDS], v[K_WORDS];
    U_WORD q_dif[K_WORDS], q_sum[K_WORDS];
    U_WORD uv[K_WORDS];

    /* Sums and differences of both inputs, taken before anything is
    // overwritten */
    ecp_SubReduce(u, P->X, P->Z);           /* u = x1-z1 */
    ecp_AddReduce(v, P->X, P->Z);           /* v = x1+z1 */
    ecp_SubReduce(q_dif, Q->X, Q->Z);       /* q_dif = x2-z2 */
    ecp_AddReduce(q_sum, Q->X, Q->Z);       /* q_sum = x2+z2 */

    /* ---- P = P + Q ---- */
    ecp_MulReduce(u, u, q_sum);             /* u = (x1-z1)(x2+z2) */
    ecp_MulReduce(v, v, q_dif);             /* v = (x1+z1)(x2-z2) */
    ecp_AddReduce(uv, u, v);                /* uv = u + v */
    ecp_SubReduce(v, u, v);                 /* v = u - v */
    ecp_SqrReduce(P->X, uv);                /* x3 = (u+v)^2 */
    ecp_SqrReduce(u, v);                    /* u = (u-v)^2 */
    ecp_MulReduce(P->Z, u, Base);           /* z3 = (u-v)^2 * Base */

    /* ---- Q = 2Q, reusing q_sum and q_dif from above ---- */
    ecp_SqrReduce(u, q_sum);                /* u = (x2+z2)^2 */
    ecp_SqrReduce(v, q_dif);                /* v = (x2-z2)^2 */
    ecp_MulReduce(Q->X, u, v);              /* x4 = (x2+z2)^2 * (x2-z2)^2 */
    ecp_SubReduce(v, u, v);                 /* v = (x2+z2)^2 - (x2-z2)^2 */
    ecp_WordMulAddReduce(u, u, 121665, v);  /* u = (x2+z2)^2 + 121665*v */
    ecp_MulReduce(Q->Z, u, v);              /* z4 = v * u */
}
/*
// Montgomery doubling on X/Z coordinates: Y = X + X
//
//  x2 = (x+z)^2 * (x-z)^2
//  z2 = ((x+z)^2 - (x-z)^2) * ((x+z)^2 + ((A-2)/4)*((x+z)^2 - (x-z)^2))
//
// with the curve constant A = 486662, i.e. (A-2)/4 = 121665.
*/
void ecp_MontDouble(XZ_POINT *Y, const XZ_POINT *X)
{
    U_WORD sq_sum[K_WORDS], sq_dif[K_WORDS];

    ecp_AddReduce(sq_sum, X->X, X->Z);      /* x+z */
    ecp_SqrReduce(sq_sum, sq_sum);          /* (x+z)^2 */
    ecp_SubReduce(sq_dif, X->X, X->Z);      /* x-z */
    ecp_SqrReduce(sq_dif, sq_dif);          /* (x-z)^2 */

    ecp_MulReduce(Y->X, sq_sum, sq_dif);    /* x2 = (x+z)^2 * (x-z)^2 */

    /* From here on sq_dif holds the difference of the two squares */
    ecp_SubReduce(sq_dif, sq_sum, sq_dif);  /* (x+z)^2 - (x-z)^2 */
    /* (486662-2)/4 = 121665 */
    ecp_WordMulAddReduce(sq_sum, sq_sum, 121665, sq_dif);
    ecp_MulReduce(Y->Z, sq_sum, sq_dif);    /* z2 */
}
/*
// Convert an extended point p = (x, y, z, t) into the pre-computed form
// used by the addition routines:
//      r->YpX = y + x
//      r->YmX = y - x
//      r->Z2  = 2*z
//      r->T2d = t * _w_2d
*/
void edp_ExtPoint2PE(PE_POINT *r, const Ext_POINT *p)
{
    /* Sum and difference of the affine-like coordinates */
    ecp_AddReduce(r->YpX, p->y, p->x);
    ecp_SubReduce(r->YmX, p->y, p->x);

    /* Doubled Z and scaled T */
    ecp_AddReduce(r->Z2, p->z, p->z);
    ecp_MulReduce(r->T2d, p->t, _w_2d);
}
/*
// In-place point doubling in extended coordinates.
//
// Reference: http://eprint.iacr.org/2008/522
// Cost: 4M + 4S + 7add
// Return: P = 2*P
//
// Temporaries are named after the value they end up holding (H, F, G, E);
// some of them carry an intermediate (A, B, D) first.
*/
void edp_DoublePoint(Ext_POINT *p)
{
    U_WORD h[K_WORDS], f[K_WORDS], c2[K_WORDS], g[K_WORDS], e[K_WORDS];

    ecp_SqrReduce(h, p->x);             /* h <- A = X1^2 */
    ecp_SqrReduce(f, p->y);             /* f <- B = Y1^2 */

    ecp_SqrReduce(c2, p->z);            /* c2 <- C = 2*Z1^2 */
    ecp_AddReduce(c2, c2, c2);

    /* Negation is done by subtracting from _w_maxP */
    ecp_SubReduce(g, _w_maxP, h);       /* g <- D = -A */

    ecp_SubReduce(h, g, f);             /* h <- H = D-B */
    ecp_AddReduce(g, g, f);             /* g <- G = D+B */
    ecp_SubReduce(f, g, c2);            /* f <- F = G-C */

    /* E = (X1+Y1)^2-A-B = (X1+Y1)^2+H */
    ecp_AddReduce(e, p->x, p->y);
    ecp_SqrReduce(e, e);
    ecp_AddReduce(e, e, h);

    ecp_MulReduce(p->x, e, f);          /* X3 = E*F */
    ecp_MulReduce(p->y, h, g);          /* Y3 = H*G */
    ecp_MulReduce(p->z, g, f);          /* Z3 = G*F */
    ecp_MulReduce(p->t, e, h);          /* T3 = E*H */
}
/*
// In-place mixed addition: extended point p plus a pre-computed affine
// point q given as (Y+X, Y-X, 2d*T).
//
// Assumptions: pre-computed q, q->Z=1
// Cost: 7M + 7add
// Return: P = P + Q
//
// Temporaries are named after the value they end up holding (F, H, G);
// each of them carries an intermediate (A, B, D) first.
*/
void edp_AddAffinePoint(Ext_POINT *p, const PA_POINT *q)
{
    U_WORD f[K_WORDS], h[K_WORDS], c[K_WORDS], g[K_WORDS], e[K_WORDS];

    /* f <- A = (Y1-X1)*(Y2-X2) */
    ecp_SubReduce(f, p->y, p->x);
    ecp_MulReduce(f, f, q->YmX);

    /* h <- B = (Y1+X1)*(Y2+X2) */
    ecp_AddReduce(h, p->y, p->x);
    ecp_MulReduce(h, h, q->YpX);

    ecp_MulReduce(c, p->t, q->T2d);     /* c <- C = T1*2d*T2 */
    ecp_AddReduce(g, p->z, p->z);       /* g <- D = Z1*2*Z2 (Z2=1) */

    ecp_SubReduce(e, h, f);             /* e <- E = B-A */
    ecp_AddReduce(h, h, f);             /* h <- H = B+A */
    ecp_SubReduce(f, g, c);             /* f <- F = D-C */
    ecp_AddReduce(g, g, c);             /* g <- G = D+C */

    ecp_MulReduce(p->x, e, f);          /* X3 = E*F */
    ecp_MulReduce(p->y, h, g);          /* Y3 = H*G */
    ecp_MulReduce(p->t, e, h);          /* T3 = E*H */
    ecp_MulReduce(p->z, g, f);          /* Z3 = G*F */
}
/*
// In-place addition of the pre-computed table entry _w_base_folding8[1]
// (per the original header, the base point) to an extended point.
//
// Reference: http://eprint.iacr.org/2008/522
// Cost: 7M + 7add
// Return: P = P + BasePoint (the sum is written back into p)
//
// Temporaries are named after the value they end up holding (F, H, G);
// each of them carries an intermediate (A, B, D) first.
*/
void edp_AddBasePoint(Ext_POINT *p)
{
    U_WORD f[K_WORDS], h[K_WORDS], c[K_WORDS], g[K_WORDS], e[K_WORDS];

    /* f <- A = (Y1-X1)*(Y2-X2) */
    ecp_SubReduce(f, p->y, p->x);
    ecp_MulReduce(f, f, _w_base_folding8[1].YmX);

    /* h <- B = (Y1+X1)*(Y2+X2) */
    ecp_AddReduce(h, p->y, p->x);
    ecp_MulReduce(h, h, _w_base_folding8[1].YpX);

    /* c <- C = T1*2d*T2 */
    ecp_MulReduce(c, p->t, _w_base_folding8[1].T2d);

    ecp_AddReduce(g, p->z, p->z);       /* g <- D = 2*Z1 */

    ecp_SubReduce(e, h, f);             /* e <- E = B-A */
    ecp_AddReduce(h, h, f);             /* h <- H = B+A */
    ecp_SubReduce(f, g, c);             /* f <- F = D-C */
    ecp_AddReduce(g, g, c);             /* g <- G = D+C */

    ecp_MulReduce(p->x, e, f);          /* X3 = E*F */
    ecp_MulReduce(p->y, h, g);          /* Y3 = H*G */
    ecp_MulReduce(p->t, e, h);          /* T3 = E*H */
    ecp_MulReduce(p->z, g, f);          /* Z3 = G*F */
}
int curve25519_SelfTest(int level) { int rc = 0; U64 A[4], B[4], C[4]; U8 a[32], b[32], c[32], d[32]; ecp_AddReduce(A, _w_I, _w_P); ECP_MOD(A); if (ecp_Cmp(A, _w_I) != 0) { rc++; printf("assert I+p == I mod p FAILED!!\n"); ecp_PrintHexWords("A_1", A, 4); } if (ecp_Cmp(_w_I, _w_P) >= 0) { rc++; printf("assert I < P FAILED!!\n"); } if (ecp_Cmp(_w_P, _w_I) <= 0) { rc++; printf("assert P > I FAILED!!\n"); } ecp_MulReduce(B, _w_I, _w_D); ECP_MOD(B); if (ecp_Cmp(B, _w_IxD) != 0) { rc++; printf("assert I*D FAILED!!\n"); ecp_PrintHexWords("A_2", B, 4); } // assert I*I == p-1 ecp_MulMod(A, _w_I, _w_I); if (ecp_Cmp(A, _w_Pm1) != 0) { rc++; printf("assert mul(I,I) == p-1 FAILED!!\n"); ecp_PrintHexWords("A_3", A, 4); } // assert I**2 == p-1 ecp_SqrReduce(B, _w_I); ECP_MOD(B); if (ecp_Cmp(B, _w_Pm1) != 0) { rc++; printf("assert square(I) == p-1 FAILED!!\n"); ecp_PrintHexWords("B_4", B, 4); } // assert (-I)*(-I) == p-1 ecp_Sub(B, _w_P, _w_I); ecp_MulMod(A, B, B); if (ecp_Cmp(A, _w_Pm1) != 0) { rc++; printf("assert mul(-I,-I) == p-1 FAILED!!\n"); ecp_PrintHexWords("A_5", A, 4); ecp_PrintHexWords("B_5", B, 4); } ecp_SetValue(A, 50153); ecp_Inverse(B, A); ecp_MulMod(A, A, B); if (ecp_Cmp(A, _w_One) != 0) { rc++; printf("invmod FAILED!!\n"); ecp_PrintHexWords("inv_50153", B, 4); ecp_PrintHexWords("expected_1", A, 4); } // assert expmod(d,(p-1)/2,p) == p-1 ecp_ExpMod(A, _w_D, _b_Pm1d2, 32); if (ecp_Cmp(A, _w_Pm1) != 0) { rc++; printf("assert expmod(d,(p-1)/2,p) == p-1 FAILED!!\n"); ecp_PrintHexWords("A_3", A, 4); } ecp_CalculateY(a, ecp_BasePoint); ecp_BytesToWords(A, a); if (ecp_Cmp(A, _w_Gy) != 0) { rc++; printf("assert clacY(Base) == Base.y FAILED!!\n"); ecp_PrintHexBytes("Calculated_Base.y", a, 32); } ecp_PointMultiply(a, ecp_BasePoint, _b_Om1, 32); if (memcmp(a, ecp_BasePoint, 32) != 0) { rc++; printf("assert (l-1).Base == Base FAILED!!\n"); ecp_PrintHexBytes("A_5", a, 32); } ecp_PointMultiply(a, ecp_BasePoint, _b_O, 32); ecp_BytesToWords(A, a); if (!ecp_IsZero(A)) { rc++; 
printf("assert l.Base == 0 FAILED!!\n"); ecp_PrintHexBytes("A_6", a, 32); } // Key generation ecp_PointMultiply(a, ecp_BasePoint, pk1, 32); ecp_PrintHexBytes("PublicKey1", a, 32); ecp_PointMultiply(b, ecp_BasePoint, pk2, 32); ecp_PrintHexBytes("PublicKey2", b, 32); // ECDH - key exchange ecp_PointMultiply(c, b, pk1, 32); ecp_PrintHexBytes("SharedKey1", c, 32); ecp_PointMultiply(d, a, pk2, 32); ecp_PrintHexBytes("SharedKey2", d, 32); if (memcmp(c, d, 32) != 0) { rc++; printf("ECDH key exchange FAILED!!\n"); } memset(a, 0x44, 32); // our secret key ecp_PointMultiply(b, ecp_BasePoint, a, 32); // public key ecp_PointMultiply(c, b, _b_k1, 32); ecp_PointMultiply(d, c, _b_k2, 32); if (memcmp(d, b, 32) != 0) { rc++; printf("assert k1.k2.D == D FAILED!!\n"); ecp_PrintHexBytes("D", d, 4); ecp_PrintHexBytes("C", c, 4); ecp_PrintHexBytes("A", a, 4); } ecp_BytesToWords(A, _b_k1); ecp_BytesToWords(B, _b_k2); eco_InvModBPO(C, A); if (ecp_Cmp(C, B) != 0) { rc++; printf("assert 1/k1 == k2 mod BPO FAILED!!\n"); ecp_PrintHexWords("Calc", C, 4); ecp_PrintHexWords("Expt", B, 4); } eco_MulMod(C, A, B); if (ecp_Cmp(C, _w_One) != 0) { rc++; printf("assert k1*k2 == 1 mod BPO FAILED!!\n"); ecp_PrintHexWords("Calc", C, 4); } return rc; }
/*
// X25519-style scalar multiplication on X/Z (Montgomery) coordinates.
//
// PublicKey  receives the resulting x-coordinate; 32 zero bytes are written
//            when the scalar is zero (presumably 32 bytes in the normal
//            case too, via ecp_WordsToBytes - confirm against its
//            definition).
// BasePoint  x-coordinate of the point to multiply, as a byte array.
// SecretKey  the scalar K in a little-endian byte array.
// len        number of bytes in SecretKey.
//
// The scalar is scanned from its most significant byte and bit downwards.
// Once the first set bit is found, every remaining bit is consumed by one
// ECP_MONT() step.
// NOTE(review): ECP_MONT is defined outside this view; it presumably stores
// bit n of k in j and calls ecp_Mont(PP[j], QP[j], X). Confirm before
// touching the locals j, k, PP, QP or X.
*/
void ecp_PointMultiply(
    OUT U8 *PublicKey,
    IN const U8 *BasePoint,
    IN const U8 *SecretKey,
    IN int len)
{
    int i, j, k;
    U_WORD X[K_WORDS];
    XZ_POINT P, Q, *PP[2], *QP[2];

    ecp_BytesToWords(X, BasePoint);

    /* 1: P = (2k+1)G, Q = (2k+2)G */
    /* 0: Q = (2k+1)G, P = (2k)G */

    /* Find first non-zero bit, starting at the top byte of the scalar */
    while (len-- > 0)
    {
        k = SecretKey[len];
        /* Walk the byte from its most significant bit (0x80) downwards */
        for (i = 0; i < 8; i++, k <<= 1)
        {
            /* P = kG, Q = (k+1)G */
            if (k & 0x80)
            {
                /* We have first non-zero bit
                // This is always bit 254 for keys created according to the spec.
                // Start with randomized base point
                */

                /* Blinded projective form of the input: X-coordinate is
                // X*Z over Z, so it still represents X */
                ecp_Add(P.Z, X, edp_custom_blinding.zr);    /* P.Z = random */
                ecp_MulReduce(P.X, X, P.Z);
                ecp_MontDouble(&Q, &P);                     /* Q = 2P */

                /* Selection tables indexed by the current scalar bit */
                PP[1] = &P; PP[0] = &Q;
                QP[1] = &Q; QP[0] = &P;

                /* Everything we reference in the below loop are on the stack
                // and already touched (cached)
                */

                /* Remaining bits of the byte that held the first set bit */
                while (++i < 8) { k <<= 1; ECP_MONT(7); }

                /* All lower bytes, eight ladder steps each */
                while (len > 0)
                {
                    k = SecretKey[--len];
                    ECP_MONT(7);
                    ECP_MONT(6);
                    ECP_MONT(5);
                    ECP_MONT(4);
                    ECP_MONT(3);
                    ECP_MONT(2);
                    ECP_MONT(1);
                    ECP_MONT(0);
                }

                /* Back to affine: x = P.X / P.Z (Q.Z is reused as scratch
                // for the inverse) */
                ecp_Inverse(Q.Z, P.Z);
                ecp_MulMod(X, P.X, Q.Z);
                ecp_WordsToBytes(PublicKey, X);
                return;
            }
        }
    }
    /* K is 0 */
    mem_fill(PublicKey, 0, 32);
}