void point_add(struct affine_point *p1, const struct affine_point *p2, const struct domain_params *dp) { if (! point_is_zero(p2)) { if (! point_is_zero(p1)) { if (! gcry_mpi_cmp(p1->x, p2->x)) { if (! gcry_mpi_cmp(p1->y, p2->y)) point_double(p1, dp); else point_load_zero(p1); } else { gcry_mpi_t t; t = gcry_mpi_snew(0); gcry_mpi_subm(t, p1->y, p2->y, dp->m); gcry_mpi_subm(p1->y, p1->x, p2->x, dp->m); gcry_mpi_invm(p1->y, p1->y, dp->m); gcry_mpi_mulm(p1->y, t, p1->y, dp->m); gcry_mpi_mulm(t, p1->y, p1->y, dp->m); gcry_mpi_addm(p1->x, p1->x, p2->x, dp->m); gcry_mpi_subm(p1->x, t, p1->x, dp->m); gcry_mpi_subm(t, p2->x, p1->x, dp->m); gcry_mpi_mulm(p1->y, p1->y, t, dp->m); gcry_mpi_subm(p1->y, p1->y, p2->y, dp->m); gcry_mpi_release(t); } } else point_set(p1, p2); } }
// ec_GFp_nistp256_dbl writes the doubling of |a| to |r|. The coordinates are
// converted with fe_from_generic, doubled in place by point_double, and
// converted back with fe_to_generic. |group| is not used.
static void ec_GFp_nistp256_dbl(const EC_GROUP *group, EC_RAW_POINT *r,
                                const EC_RAW_POINT *a) {
  // coords[0], coords[1], coords[2] hold X, Y and Z respectively.
  fe coords[3];

  fe_from_generic(coords[0], &a->X);
  fe_from_generic(coords[1], &a->Y);
  fe_from_generic(coords[2], &a->Z);

  point_double(coords[0], coords[1], coords[2],
               coords[0], coords[1], coords[2]);

  fe_to_generic(&r->X, coords[0]);
  fe_to_generic(&r->Y, coords[1]);
  fe_to_generic(&r->Z, coords[2]);
}
static void point_add(struct point *r, struct point *p, struct point *q) { u8 s[20], t[20], u[20]; u8 *px, *py, *qx, *qy, *rx, *ry; struct point pp, qq; pp = *p; qq = *q; px = pp.x; py = pp.y; qx = qq.x; qy = qq.y; rx = r->x; ry = r->y; if (point_is_zero(&pp)) { elt_copy(rx, qx); elt_copy(ry, qy); return; } if (point_is_zero(&qq)) { elt_copy(rx, px); elt_copy(ry, py); return; } elt_sub(u, qx, px); if (elt_is_zero(u)) { elt_sub(u, qy, py); if (elt_is_zero(u)) point_double(r, &pp); else point_zero(r); return; } elt_inv(t, u); // t = 1/(qx-px) elt_sub(u, qy, py); // u = qy-py elt_mul(s, t, u); // s = (qy-py)/(qx-px) elt_square(rx, s); // rx = s*s elt_add(t, px, qx); // t = px+qx elt_sub(rx, rx, t); // rx = s*s - (px+qx) elt_sub(t, px, rx); // t = -(rx-px) elt_mul(ry, s, t); // ry = -s*(rx-px) elt_sub(ry, ry, py); // ry = -s*(rx-px) - py }
/*
 * d = a * b by double-and-add.
 *
 * a is a bignum read as 21 bytes, a[0] first, and within each byte from the
 * top bit (0x80) down to the bottom bit. For every one of the 168 bits d is
 * doubled, and b is added when the bit is set.
 */
static void point_mul(struct point *d, u8 *a, struct point *b)
{
	u32 bit;

	point_zero(d);

	for (bit = 0; bit < 21 * 8; bit++) {
		/* Mask for the current bit, counted from the top of its byte. */
		u8 mask = (u8)(0x80 >> (bit % 8));

		point_double(d, d);
		if ((a[bit / 8] & mask) != 0)
			point_add(d, d, b);
	}
}
struct affine_point pointmul(const struct affine_point *p, const gcry_mpi_t exp, const struct domain_params *dp) { struct affine_point r = point_new(); int n = gcry_mpi_get_nbits(exp); while (n) { point_double(&r, dp); if (gcry_mpi_test_bit(exp, --n)) point_add(&r, p, dp); } assert(point_on_curve(&r, dp)); return r; }
// ec_GFp_nistp256_points_mul writes g_scalar*G + p_scalar*p to |r| via
// batch_mul. A NULL scalar is treated as 0 and a NULL point as the point at
// infinity, i.e. that term contributes nothing to the linear combination.
// The |p| term is used only when both |p| and |p_scalar| are non-NULL.
// |group| is not used.
static void ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_RAW_POINT *r,
                                       const EC_SCALAR *g_scalar,
                                       const EC_RAW_POINT *p,
                                       const EC_SCALAR *p_scalar) {
  // Table of multiples of |p|: entry j holds j*|p|. It is filled in only
  // when the |p| term is present.
  fe p_pre_comp[17][3];
  fe x_out, y_out, z_out;

  const int have_p_term = (p != NULL) && (p_scalar != NULL);
  const uint8_t *p_bytes = NULL;

  if (have_p_term) {
    // Entry 0 stays all-zero; entry 1 is |p| itself.
    OPENSSL_memset(&p_pre_comp, 0, sizeof(p_pre_comp));
    fe_from_generic(p_pre_comp[1][0], &p->X);
    fe_from_generic(p_pre_comp[1][1], &p->Y);
    fe_from_generic(p_pre_comp[1][2], &p->Z);

    for (size_t j = 2; j <= 16; ++j) {
      fe *dst = p_pre_comp[j];
      if ((j & 1) == 0) {
        // Even multiple: double entry j/2.
        const fe *half = p_pre_comp[j / 2];
        point_double(dst[0], dst[1], dst[2], half[0], half[1], half[2]);
      } else {
        // Odd multiple: entry 1 plus entry j-1, as a non-mixed addition.
        const fe *one = p_pre_comp[1];
        const fe *prev = p_pre_comp[j - 1];
        point_add(dst[0], dst[1], dst[2], one[0], one[1], one[2], 0,
                  prev[0], prev[1], prev[2]);
      }
    }

    p_bytes = p_scalar->bytes;
  }

  const uint8_t *g_bytes = NULL;
  if (g_scalar != NULL) {
    g_bytes = g_scalar->bytes;
  }

  batch_mul(x_out, y_out, z_out, p_bytes, g_bytes,
            (const fe (*) [3])p_pre_comp);

  fe_to_generic(&r->X, x_out);
  fe_to_generic(&r->Y, y_out);
  fe_to_generic(&r->Z, z_out);
}
static void ec_GFp_nistp256_point_mul_public(const EC_GROUP *group, EC_RAW_POINT *r, const EC_SCALAR *g_scalar, const EC_RAW_POINT *p, const EC_SCALAR *p_scalar) { #define P256_WSIZE_PUBLIC 4 // Precompute multiples of |p|. p_pre_comp[i] is (2*i+1) * |p|. fe p_pre_comp[1 << (P256_WSIZE_PUBLIC-1)][3]; fe_from_generic(p_pre_comp[0][0], &p->X); fe_from_generic(p_pre_comp[0][1], &p->Y); fe_from_generic(p_pre_comp[0][2], &p->Z); fe p2[3]; point_double(p2[0], p2[1], p2[2], p_pre_comp[0][0], p_pre_comp[0][1], p_pre_comp[0][2]); for (size_t i = 1; i < OPENSSL_ARRAY_SIZE(p_pre_comp); i++) { point_add(p_pre_comp[i][0], p_pre_comp[i][1], p_pre_comp[i][2], p_pre_comp[i - 1][0], p_pre_comp[i - 1][1], p_pre_comp[i - 1][2], 0 /* not mixed */, p2[0], p2[1], p2[2]); } // Set up the coefficients for |p_scalar|. int8_t p_wNAF[257]; ec_compute_wNAF(group, p_wNAF, p_scalar, 256, P256_WSIZE_PUBLIC); // Set |ret| to the point at infinity. int skip = 1; // Save some point operations. fe ret[3] = {{0},{0},{0}}; for (int i = 256; i >= 0; i--) { if (!skip) { point_double(ret[0], ret[1], ret[2], ret[0], ret[1], ret[2]); } // For the |g_scalar|, we use the precomputed table without the // constant-time lookup. if (i <= 31) { // First, look 32 bits upwards. uint64_t bits = get_bit(g_scalar->bytes, i + 224) << 3; bits |= get_bit(g_scalar->bytes, i + 160) << 2; bits |= get_bit(g_scalar->bytes, i + 96) << 1; bits |= get_bit(g_scalar->bytes, i + 32); point_add(ret[0], ret[1], ret[2], ret[0], ret[1], ret[2], 1 /* mixed */, g_pre_comp[1][bits][0], g_pre_comp[1][bits][1], g_pre_comp[1][bits][2]); skip = 0; // Second, look at the current position. 
bits = get_bit(g_scalar->bytes, i + 192) << 3; bits |= get_bit(g_scalar->bytes, i + 128) << 2; bits |= get_bit(g_scalar->bytes, i + 64) << 1; bits |= get_bit(g_scalar->bytes, i); point_add(ret[0], ret[1], ret[2], ret[0], ret[1], ret[2], 1 /* mixed */, g_pre_comp[0][bits][0], g_pre_comp[0][bits][1], g_pre_comp[0][bits][2]); } int digit = p_wNAF[i]; if (digit != 0) { assert(digit & 1); int idx = digit < 0 ? (-digit) >> 1 : digit >> 1; fe *y = &p_pre_comp[idx][1], tmp; if (digit < 0) { fe_opp(tmp, p_pre_comp[idx][1]); y = &tmp; } if (!skip) { point_add(ret[0], ret[1], ret[2], ret[0], ret[1], ret[2], 0 /* not mixed */, p_pre_comp[idx][0], *y, p_pre_comp[idx][2]); } else { fe_copy(ret[0], p_pre_comp[idx][0]); fe_copy(ret[1], *y); fe_copy(ret[2], p_pre_comp[idx][2]); skip = 0; } } }
// Interleaved point multiplication using precomputed point multiples: The
// small point multiples 0*P, 1*P, ..., 16*P (17 table entries) are in
// p_pre_comp, the scalar in p_scalar, if non-NULL. If g_scalar is non-NULL, we
// also add this multiple of the generator, using certain (large) precomputed
// multiples in g_pre_comp.
// Output point (X, Y, Z) is stored in x_out, y_out, z_out.
//
// When p_scalar is NULL, p_pre_comp is never read and the loop runs only over
// the last 32 rounds (i = 31 .. 0). If both scalars are NULL, no addition is
// performed and the output is the all-zero initial value of nq.
static void batch_mul(fe x_out, fe y_out, fe z_out, const uint8_t *p_scalar,
                      const uint8_t *g_scalar, const fe p_pre_comp[17][3]) {
  // set nq to the point at infinity
  fe nq[3] = {{0},{0},{0}}, ftmp, tmp[3];
  uint64_t bits;
  uint8_t sign, digit;

  // Loop over both scalars msb-to-lsb, interleaving additions of multiples
  // of the generator (two in each of the last 32 rounds) and additions of p
  // (every 5th round).

  // While |skip| is set, nq is still the point at infinity: doubling is
  // omitted and the first addition is replaced by a plain copy.
  int skip = 1;  // save two point operations in the first round
  size_t i = p_scalar != NULL ? 255 : 31;
  for (;;) {
    // double
    if (!skip) {
      point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
    }

    // add multiples of the generator
    if (g_scalar != NULL && i <= 31) {
      // first, look 32 bits upwards
      // (4-bit index built from scalar bits i+224, i+160, i+96, i+32)
      bits = get_bit(g_scalar, i + 224) << 3;
      bits |= get_bit(g_scalar, i + 160) << 2;
      bits |= get_bit(g_scalar, i + 96) << 1;
      bits |= get_bit(g_scalar, i + 32);
      // select the point to add, in constant time
      select_point(bits, 16, g_pre_comp[1], tmp);

      if (!skip) {
        point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 1 /* mixed */,
                  tmp[0], tmp[1], tmp[2]);
      } else {
        fe_copy(nq[0], tmp[0]);
        fe_copy(nq[1], tmp[1]);
        fe_copy(nq[2], tmp[2]);
        skip = 0;
      }

      // second, look at the current position
      // (4-bit index built from scalar bits i+192, i+128, i+64, i)
      bits = get_bit(g_scalar, i + 192) << 3;
      bits |= get_bit(g_scalar, i + 128) << 2;
      bits |= get_bit(g_scalar, i + 64) << 1;
      bits |= get_bit(g_scalar, i);
      // select the point to add, in constant time
      select_point(bits, 16, g_pre_comp[0], tmp);
      point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 1 /* mixed */, tmp[0],
                tmp[1], tmp[2]);
    }

    // do other additions every 5 doublings
    if (p_scalar != NULL && i % 5 == 0) {
      // Six-bit window over scalar bits i+4 .. i-1. NOTE(review): this relies
      // on get_bit returning 0 for out-of-range positions (i + 4 > 255, and
      // i - 1 wrapping when i == 0) -- confirm against get_bit.
      bits = get_bit(p_scalar, i + 4) << 5;
      bits |= get_bit(p_scalar, i + 3) << 4;
      bits |= get_bit(p_scalar, i + 2) << 3;
      bits |= get_bit(p_scalar, i + 1) << 2;
      bits |= get_bit(p_scalar, i) << 1;
      bits |= get_bit(p_scalar, i - 1);
      ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);

      // select the point to add or subtract, in constant time.
      select_point(digit, 17, p_pre_comp, tmp);
      fe_opp(ftmp, tmp[1]);  // (X, -Y, Z) is the negative point.
      // Presumably keeps Y when |sign| is zero and takes -Y otherwise.
      fe_cmovznz(tmp[1], sign, tmp[1], ftmp);

      if (!skip) {
        point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 0 /* not mixed */,
                  tmp[0], tmp[1], tmp[2]);
      } else {
        fe_copy(nq[0], tmp[0]);
        fe_copy(nq[1], tmp[1]);
        fe_copy(nq[2], tmp[2]);
        skip = 0;
      }
    }

    // |i| is unsigned, so test for the last round before decrementing.
    if (i == 0) {
      break;
    }
    --i;
  }
  fe_copy(x_out, nq[0]);
  fe_copy(y_out, nq[1]);
  fe_copy(z_out, nq[2]);
}
// point_add calcuates (x1, y1, z1) + (x2, y2, z2) // // The method is taken from: // http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl, // adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity). // // Coq transcription and correctness proof: // <https://github.com/mit-plv/fiat-crypto/blob/79f8b5f39ed609339f0233098dee1a3c4e6b3080/src/Curves/Weierstrass/Jacobian.v#L135> // <https://github.com/mit-plv/fiat-crypto/blob/79f8b5f39ed609339f0233098dee1a3c4e6b3080/src/Curves/Weierstrass/Jacobian.v#L205> // // This function includes a branch for checking whether the two input points // are equal, (while not equal to the point at infinity). This case never // happens during single point multiplication, so there is no timing leak for // ECDH or ECDSA signing. static void point_add(fe x3, fe y3, fe z3, const fe x1, const fe y1, const fe z1, const int mixed, const fe x2, const fe y2, const fe z2) { fe x_out, y_out, z_out; limb_t z1nz = fe_nz(z1); limb_t z2nz = fe_nz(z2); // z1z1 = z1z1 = z1**2 fe z1z1; fe_sqr(z1z1, z1); fe u1, s1, two_z1z2; if (!mixed) { // z2z2 = z2**2 fe z2z2; fe_sqr(z2z2, z2); // u1 = x1*z2z2 fe_mul(u1, x1, z2z2); // two_z1z2 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2 fe_add(two_z1z2, z1, z2); fe_sqr(two_z1z2, two_z1z2); fe_sub(two_z1z2, two_z1z2, z1z1); fe_sub(two_z1z2, two_z1z2, z2z2); // s1 = y1 * z2**3 fe_mul(s1, z2, z2z2); fe_mul(s1, s1, y1); } else { // We'll assume z2 = 1 (special case z2 = 0 is handled later). 
// u1 = x1*z2z2 fe_copy(u1, x1); // two_z1z2 = 2z1z2 fe_add(two_z1z2, z1, z1); // s1 = y1 * z2**3 fe_copy(s1, y1); } // u2 = x2*z1z1 fe u2; fe_mul(u2, x2, z1z1); // h = u2 - u1 fe h; fe_sub(h, u2, u1); limb_t xneq = fe_nz(h); // z_out = two_z1z2 * h fe_mul(z_out, h, two_z1z2); // z1z1z1 = z1 * z1z1 fe z1z1z1; fe_mul(z1z1z1, z1, z1z1); // s2 = y2 * z1**3 fe s2; fe_mul(s2, y2, z1z1z1); // r = (s2 - s1)*2 fe r; fe_sub(r, s2, s1); fe_add(r, r, r); limb_t yneq = fe_nz(r); if (!xneq && !yneq && z1nz && z2nz) { point_double(x3, y3, z3, x1, y1, z1); return; } // I = (2h)**2 fe i; fe_add(i, h, h); fe_sqr(i, i); // J = h * I fe j; fe_mul(j, h, i); // V = U1 * I fe v; fe_mul(v, u1, i); // x_out = r**2 - J - 2V fe_sqr(x_out, r); fe_sub(x_out, x_out, j); fe_sub(x_out, x_out, v); fe_sub(x_out, x_out, v); // y_out = r(V-x_out) - 2 * s1 * J fe_sub(y_out, v, x_out); fe_mul(y_out, y_out, r); fe s1j; fe_mul(s1j, s1, j); fe_sub(y_out, y_out, s1j); fe_sub(y_out, y_out, s1j); fe_cmovznz(x_out, z1nz, x2, x_out); fe_cmovznz(x3, z2nz, x1, x_out); fe_cmovznz(y_out, z1nz, y2, y_out); fe_cmovznz(y3, z2nz, y1, y_out); fe_cmovznz(z_out, z1nz, z2, z_out); fe_cmovznz(z3, z2nz, z1, z_out); }