/* Bring x[.][offset .. offset + ntt_size - 1] back out of NTT form, in place,
   for each of the mpzspm->sp_num small primes: run the inverse transform,
   scale every entry by 1/ntt_size and, when monic_pos is non-zero, subtract 1
   from the entry at index monic_pos % ntt_size. */
void
mpzspv_from_ntt (mpzspv_t x, spv_size_t offset, spv_size_t ntt_size,
                 spv_size_t monic_pos, mpzspm_t mpzspm)
{
  spv_size_t log2_size;
  unsigned int k;

  ASSERT (mpzspv_verify (x, offset, ntt_size, mpzspm));

  log2_size = ceil_log_2 (ntt_size);

  for (k = 0; k < mpzspm->sp_num; k++)
    {
      spm_t prime = mpzspm->spm[k];
      spv_t coeffs = x[k] + offset;

      /* Inverse transform of this prime's residues. */
      spv_ntt_gfp_dit (coeffs, log2_size, prime);

      /* With p = prime->sp, the value p - (p - 1) / ntt_size is the inverse
         of ntt_size modulo p; multiplying by it divides by the transform
         length. */
      spv_mul_sp (coeffs, coeffs, prime->sp - (prime->sp - 1) / ntt_size,
                  ntt_size, prime->sp, prime->mul_c);

      if (monic_pos != 0)
        {
          const spv_size_t pos = monic_pos % ntt_size;

          coeffs[pos] = sp_sub (coeffs[pos], 1, prime->sp);
        }
    }
}
/* Put the len coefficients x[.][offset .. offset + len - 1] into NTT form of
   length ntt_size, in place, for each small prime of mpzspm.

   If len > ntt_size, every further ntt_size-long chunk is added onto the
   first ntt_size entries; if len < ntt_size, the tail is zero-filled.  When
   monic is non-zero, 1 is added to the entry at index len % ntt_size before
   the forward transform is applied. */
void
mpzspv_to_ntt (mpzspv_t x, spv_size_t offset, spv_size_t len,
               spv_size_t ntt_size, int monic, mpzspm_t mpzspm)
{
  spv_size_t chunk, log2_size;
  unsigned int k;

  ASSERT (mpzspv_verify (x, offset, len, mpzspm));
  ASSERT (mpzspv_verify (x, offset + ntt_size, 0, mpzspm));

  log2_size = ceil_log_2 (ntt_size);

  for (k = 0; k < mpzspm->sp_num; k++)
    {
      spm_t prime = mpzspm->spm[k];
      spv_t coeffs = x[k] + offset;

      if (len > ntt_size)
        {
          /* Wrap the input around so that it fits the transform length. */
          for (chunk = ntt_size; chunk < len; chunk += ntt_size)
            spv_add (coeffs, coeffs, coeffs + chunk, ntt_size, prime->sp);
        }
      else if (len < ntt_size)
        {
          /* Pad with zeros up to the transform length. */
          spv_set_zero (coeffs + len, ntt_size - len);
        }

      if (monic != 0)
        {
          const spv_size_t pos = len % ntt_size;

          coeffs[pos] = sp_add (coeffs[pos], 1, prime->sp);
        }

      spv_ntt_gfp_dif (coeffs, log2_size, prime);
    }
}
/* Multiply dft by the coefficients stored in dct, one small prime at a time.
   The bits of steps select which stages run:
     NTT_MUL_STEP_FFT1  forward transform of dft[.] (length len)
     NTT_MUL_STEP_MUL   point-wise product with dct[.]
     NTT_MUL_STEP_IFFT  inverse transform of dft[.] and division by len
   The result overwrites dft; dct is only read. */
void
mpzspv_mul_by_dct (mpzspv_t dft, const mpzspv_t dct, const spv_size_t len,
                   const mpzspm_t mpzspm, const int steps)
{
  int p;
  spv_size_t log2_len = ceil_log_2 (len);

#ifdef _OPENMP
#pragma omp parallel private(p)
  {
#pragma omp for
#endif
    for (p = 0; p < (int) (mpzspm->sp_num); p++)
      {
        const spm_t prime = mpzspm->spm[p];
        const spv_t vec = dft[p];
        const spv_t coeff = dct[p];
        unsigned long even, pair_sum;

        /* Stage 1: forward DFT of dft[p] */
        if (steps & NTT_MUL_STEP_FFT1)
          spv_ntt_gfp_dif (vec, log2_len, prime);

        /* Stage 2: point-wise product */
        if (steps & NTT_MUL_STEP_MUL)
          {
            /* Entries 0 and 1 have no partner: they use dct[p][0] and
               dct[p][len / 2] respectively. */
            vec[0] = sp_mul (vec[0], coeff[0], prime->sp, prime->mul_c);
            vec[1] = sp_mul (vec[1], coeff[len / 2UL], prime->sp,
                             prime->mul_c);

            /* Every even index is paired with the odd index
               pair_sum - even, and both entries are multiplied by the same
               coefficient dct[p][even / 2].  pair_sum starts at 5 and is
               replaced by 2 * pair_sum + 1 each time even + even / 2
               exceeds it. */
            pair_sum = 5UL;
            for (even = 2UL; even < len; even += 2UL)
              {
                unsigned long odd;

                if (even + even / 2UL > pair_sum)
                  pair_sum = 2UL * pair_sum + 1;
                odd = pair_sum - even;

                vec[even] = sp_mul (vec[even], coeff[even / 2UL], prime->sp,
                                    prime->mul_c);
                vec[odd] = sp_mul (vec[odd], coeff[even / 2UL], prime->sp,
                                   prime->mul_c);
              }
          }

        /* Stage 3: inverse transform of dft[p] */
        if (steps & NTT_MUL_STEP_IFFT)
          {
            spv_ntt_gfp_dit (vec, log2_len, prime);

            /* Divide by the transform length.
               FIXME: scale the DCT of h instead */
            spv_mul_sp (vec, vec, prime->sp - (prime->sp - 1) / len, len,
                        prime->sp, prime->mul_c);
          }
      }
#ifdef _OPENMP
  }
#endif
}
/* * malloc - allocates a block with requested size * (implementation issue: adjust size, extend size) * */ void *malloc(size_t size) { size_t asize = 0; size_t extendsize; char* bp; #ifdef TWEAK int k; #endif dbg1("[IN ] : malloc() - malloc(%ld)\n", size); checkheap(); if (!mm_initialized) mm_init(); /* Ignore spurious requests */ if (size <= 0) return NULL; #ifdef TWEAK /* * [Optimization for binary*.rep trace files] * if requested size is larger than 2^4=16 bytes, and close to power of two * (when the difference between requested size and its closest larger power of two * is smaller than one eighth of closest power of two) round up to power of two */ k = ceil_log_2(size); if( k >= 4 && ((1 << k) - size) <= (unsigned)(1 << (k -3))) { size = 1 << k; } dbg1("intermediate size : %ld\n", size); #endif #ifdef PUTBUG if (size <= MINBLKSIZE) { asize = MINBLKSIZE + WSIZE; } else { asize = DSIZE * ((size + DSIZE-1)/DSIZE); } #endif #ifdef ASIZE1 /* Adjust block size to include overhead and alignment reqs */ /* Block should at least have space for Header, Footer, Pred, Succ Pointers */ if (size <= MINBLKSIZE) { asize = MINBLKSIZE + WSIZE; } else { asize = DSIZE * ((size + WSIZE + DSIZE-1)/DSIZE); } #endif #ifdef ASIZE2 if (size <= 2*DSIZE+OVERHEAD) { asize = 2*DSIZE+OVERHEAD+OVERHEAD; } else { /* round up to the nearest order of 16 */ asize = 2*DSIZE * ((size + (OVERHEAD - WSIZE) + 2*DSIZE-1)/(2*DSIZE)); } #endif #ifdef ASIZE3 if (size <= DDSIZE){ asize = DDSIZE + DSIZE; } else { asize = DDSIZE*((size + (DSIZE - WSIZE) + (DDSIZE-1)) / DDSIZE); } #endif dbg("malloc: adjusted size %ld, class %ld\n", asize, get_list_num(asize)); /* Search the free list for a fit */ if ((bp = find_fit(asize)) != NULL) { place(bp, asize); dbg1("[OUT] : malloc() - found fit\n"); return bp; } /* No fit found. 
Get more memory and place the block */ extendsize = MAX(asize, CHUNKSIZE); if((bp = extend_heap(extendsize/WSIZE)) == NULL) return NULL; place(bp, asize); dbg1("[OUT] : malloc() - found fit failed, extended the heap\n"); return bp; }
/* Square, in place and separately for each small prime of mpzspm, the data
   held in dft[.], using a weighted transform of length
   len = 2 * 2^ceil_log_2(n).

   On entry the first n entries of each dft[j] are read as input
   coefficients.  They are mirrored into the top of the vector
   (spv[len - i] for 0 < i < n) and weighted with powers of a primitive 3rd
   root of unity, then the vector is transformed, squared point-wise,
   transformed back, un-weighted and scaled by 1/len.  A final pass separates
   the entries that wrapped around in the length-len product.

   NOTE(review): judging by the name and the comments below, the input is
   presumably a reciprocal (symmetric) polynomial given by n coefficients and
   the output its square -- confirm against the callers.
   NOTE(review): the un-weighting loop bound 2 * n - 3 would wrap if
   spv_size_t is unsigned and n < 2 -- confirm that callers guarantee
   n >= 2.

   The asserts require max_ntt_size to be divisible by 3 and by len, and len
   not to be divisible by 3. */
void
mpzspv_sqr_reciprocal (mpzspv_t dft, const spv_size_t n,
                       const mpzspm_t mpzspm)
{
  const spv_size_t log2_n = ceil_log_2 (n);
  const spv_size_t len = ((spv_size_t) 2) << log2_n;
  const spv_size_t log2_len = 1 + log2_n;
  int j;

  ASSERT(mpzspm->max_ntt_size % 3UL == 0UL);
  ASSERT(len % 3UL != 0UL);
  ASSERT(mpzspm->max_ntt_size % len == 0UL);

#ifdef _OPENMP
#pragma omp parallel
  {
#pragma omp for
#endif
    for (j = 0; j < (int) (mpzspm->sp_num); j++)
      {
        const spm_t spm = mpzspm->spm[j];
        const spv_t spv = dft[j];
        sp_t w1, w2, invlen;
        const sp_t sp = spm->sp, mul_c = spm->mul_c;
        spv_size_t i;

        /* Zero out NTT elements [n .. len-n] (len - 2*n + 1 entries) */
        spv_set_sp (spv + n, (sp_t) 0, len - 2*n + 1);

#ifdef TRACE_ntt_sqr_reciprocal
        if (j == 0)
          {
            printf ("ntt_sqr_reciprocal: NTT vector mod %lu\n", sp);
            ntt_print_vec ("ntt_sqr_reciprocal: before weighting:", spv, len);
          }
#endif

        /* Compute the root for the weight signal, a primitive 3rd root of
           unity: w1 = prim_root^(max_ntt_size / 3) */
        w1 = sp_pow (spm->prim_root, mpzspm->max_ntt_size / 3UL, sp, mul_c);
        /* Compute its inverse: w2 = 1/w1 = inv_prim_root^(max_ntt_size / 3) */
        w2 = sp_pow (spm->inv_prim_root, mpzspm->max_ntt_size / 3UL, sp, mul_c);
#ifdef TRACE_ntt_sqr_reciprocal
        if (j == 0)
          printf ("w1 = %lu ,w2 = %lu\n", w1, w2);
#endif
        /* Sanity checks: w1 and w2 are mutually inverse, different from 1,
           and their cubes are 1 */
        ASSERT(sp_mul(w1, w2, sp, mul_c) == (sp_t) 1);
        ASSERT(w1 != (sp_t) 1);
        ASSERT(sp_pow (w1, 3UL, sp, mul_c) == (sp_t) 1);
        ASSERT(w2 != (sp_t) 1);
        ASSERT(sp_pow (w2, 3UL, sp, mul_c) == (sp_t) 1);

        /* Fill NTT elements spv[len-n+1 .. len-1] with coefficients and
           apply weight signal to spv[i] and spv[len-i] for 0 <= i < n.
           Use the fact that w^i + w^{-i} = -1 if i != 0 (mod 3), so the
           mirrored entry t * w^{-i} can be obtained as -(t + t * w^i)
           without a second multiplication.
           The loop handles three indices per pass: i (weight 1, plain copy),
           i + 1 (weight w1) and i + 2 (weight w1^2 = w2). */
        for (i = 0; i + 2 < n; i += 3)
          {
            sp_t t, u;

            /* Index 0 has no mirror image */
            if (i > 0)
              spv[len - i] = spv[i];

            t = spv[i + 1];
            u = sp_mul (t, w1, sp, mul_c);
            spv[i + 1] = u;
            spv[len - i - 1] = sp_neg (sp_add (t, u, sp), sp);

            t = spv[i + 2];
            u = sp_mul (t, w2, sp, mul_c);
            spv[i + 2] = u;
            spv[len - i - 2] = sp_neg (sp_add (t, u, sp), sp);
          }
        /* Leftover entries when n is not a multiple of 3: at most one with
           weight 1 and one with weight w1 */
        if (i < n && i > 0)
          {
            spv[len - i] = spv[i];
          }
        if (i + 1 < n)
          {
            sp_t t, u;
            t = spv[i + 1];
            u = sp_mul (t, w1, sp, mul_c);
            spv[i + 1] = u;
            spv[len - i - 1] = sp_neg (sp_add (t, u, sp), sp);
          }

#ifdef TRACE_ntt_sqr_reciprocal
        if (j == 0)
          ntt_print_vec ("ntt_sqr_reciprocal: after weighting:", spv, len);
#endif

        /* Forward DFT of dft[j] */
        spv_ntt_gfp_dif (spv, log2_len, spm);

#ifdef TRACE_ntt_sqr_reciprocal
        if (j == 0)
          ntt_print_vec ("ntt_sqr_reciprocal: after forward transform:",
                         spv, len);
#endif

        /* Square the transformed vector point-wise */
        spv_pwmul (spv, spv, spv, len, sp, mul_c);

#ifdef TRACE_ntt_sqr_reciprocal
        if (j == 0)
          ntt_print_vec ("ntt_sqr_reciprocal: after point-wise squaring:",
                         spv, len);
#endif

        /* Inverse transform of dft[j] */
        spv_ntt_gfp_dit (spv, log2_len, spm);

#ifdef TRACE_ntt_sqr_reciprocal
        if (j == 0)
          ntt_print_vec ("ntt_sqr_reciprocal: after inverse transform:",
                         spv, len);
#endif

        /* Un-weight and divide by transform length.  The 1/len factor is
           folded into w1 and w2, so each entry needs one multiplication:
           index = 0 (mod 3) by invlen, = 1 (mod 3) by invlen/w (w2),
           = 2 (mod 3) by invlen/w^2 = invlen*w (w1). */
        invlen = sp - (sp - (sp_t) 1) / len; /* invlen = 1/len (mod sp) */
        w1 = sp_mul (invlen, w1, sp, mul_c);
        w2 = sp_mul (invlen, w2, sp, mul_c);
        for (i = 0; i < 2 * n - 3; i += 3)
          {
            spv[i] = sp_mul (spv[i], invlen, sp, mul_c);
            spv[i + 1] = sp_mul (spv[i + 1], w2, sp, mul_c);
            spv[i + 2] = sp_mul (spv[i + 2], w1, sp, mul_c);
          }
        /* Leftover entries up to index 2*n - 2 */
        if (i < 2 * n - 1)
          spv[i] = sp_mul (spv[i], invlen, sp, mul_c);
        if (i < 2 * n - 2)
          spv[i + 1] = sp_mul (spv[i + 1], w2, sp, mul_c);

#ifdef TRACE_ntt_sqr_reciprocal
        if (j == 0)
          ntt_print_vec ("ntt_sqr_reciprocal: after un-weighting:", spv, len);
#endif

        /* Separate the coefficients of R in the wrapped-around product. */

        /* Set w1 = cuberoot(1)^l, with l = len, where cuberoot(1) is the same
           primitive 3rd root of unity we used for the weight signal */
        w1 = sp_pow (spm->prim_root, mpzspm->max_ntt_size / 3UL, sp, mul_c);
        w1 = sp_pow (w1, len % 3UL, sp, mul_c);
        /* Set w2 = 1/(w1 - 1/w1). Incidentally, w2 = 1/sqrt(-3) */
        w2 = sp_inv (w1, sp, mul_c);
        w2 = sp_sub (w1, w2, sp);
        w2 = sp_inv (w2, sp, mul_c);
#ifdef TRACE_ntt_sqr_reciprocal
        if (j == 0)
          printf ("For separating: w1 = %lu, w2 = %lu\n", w1, w2);
#endif

        for (i = len - (2*n - 2); i <= len / 2; i++)
          {
            sp_t t, u;
            /* On entry (with l = len, w^l = w1):
                 spv[i]   = s_i + w^{-l} s_{l-i}
                 spv[l-i] = s_{l-i} + w^{-l} s_i */
            t = sp_mul (spv[i], w1, sp, mul_c); /* t = w^l s_i + s_{l-i} */
            t = sp_sub (t, spv[len - i], sp);   /* t = (w^l - w^{-l}) s_i */
            t = sp_mul (t, w2, sp, mul_c);      /* t = s_i */

            u = sp_sub (spv[i], t, sp);         /* u = w^{-l} s_{l-i} */
            u = sp_mul (u, w1, sp, mul_c);      /* u = s_{l-i} */
            spv[i] = t;
            spv[len - i] = u;
            /* At the midpoint i = len / 2 both values describe the same
               entry and must agree */
            ASSERT(i < len / 2 || t == u);
          }

#ifdef TRACE_ntt_sqr_reciprocal
        if (j == 0)
          ntt_print_vec ("ntt_sqr_reciprocal: after un-wrapping:", spv, len);
#endif
      }
#ifdef _OPENMP
  }
#endif
}
void mpzspv_to_dct1 (mpzspv_t dct, const mpzspv_t spv, const spv_size_t spvlen, const spv_size_t dctlen, mpzspv_t tmp, const mpzspm_t mpzspm) { const spv_size_t l = 2 * (dctlen - 1); /* Length for the DFT */ const spv_size_t log2_l = ceil_log_2 (l); int j; #ifdef _OPENMP #pragma omp parallel private(j) { #pragma omp for #endif for (j = 0; j < (int) mpzspm->sp_num; j++) { const spm_t spm = mpzspm->spm[j]; spv_size_t i; /* Make a symmetric copy of spv in tmp. I.e. with spv = [3, 2, 1], spvlen = 3, dctlen = 5 (hence l = 8), we want tmp = [3, 2, 1, 0, 0, 0, 1, 2] */ spv_set (tmp[j], spv[j], spvlen); spv_rev (tmp[j] + l - spvlen + 1, spv[j] + 1, spvlen - 1); /* Now we have [3, 2, 1, ?, ?, ?, 1, 2]. Fill the ?'s with zeros. */ spv_set_sp (tmp[j] + spvlen, (sp_t) 0, l - 2 * spvlen + 1); #if 0 printf ("mpzspv_to_dct1: tmp[%d] = [", j); for (i = 0; i < l; i++) printf ("%lu, ", tmp[j][i]); printf ("]\n"); #endif spv_ntt_gfp_dif (tmp[j], log2_l, spm); #if 0 printf ("mpzspv_to_dct1: tmp[%d] = [", j); for (i = 0; i < l; i++) printf ("%lu, ", tmp[j][i]); printf ("]\n"); #endif /* The forward transform is scrambled. We want elements [0 ... l/2] of the unscrabled data, that is all the coefficients with the most significant bit in the index (in log2(l) word size) unset, plus the element at index l/2. By scrambling, these map to the elements with even index, plus the element at index 1. The elements with scrambled index 2*i are stored in h[i], the element with scrambled index 1 is stored in h[params->l] */ #ifdef WANT_ASSERT /* Test that the coefficients are symmetric (if they were unscrambled) and that our algorithm for finding identical coefficients in the scrambled data works */ { spv_size_t m = 5; for (i = 2; i < l; i += 2L) { /* This works, but why? 
*/ if (i + i / 2L > m) m = 2L * m + 1L; ASSERT (tmp[j][i] == tmp[j][m - i]); #if 0 printf ("mpzspv_to_dct1: DFT[%lu] == DFT[%lu]\n", i, m - i); #endif } } #endif /* Copy coefficients to dct buffer */ for (i = 0; i < l / 2; i++) dct[j][i] = tmp[j][i * 2]; dct[j][l / 2] = tmp[j][1]; } #ifdef _OPENMP } #endif }
void mpzspv_mul_ntt (mpzspv_t r, const spv_size_t offsetr, mpzspv_t x, const spv_size_t offsetx, const spv_size_t lenx, mpzspv_t y, const spv_size_t offsety, const spv_size_t leny, const spv_size_t ntt_size, const int monic, const spv_size_t monic_pos, mpzspm_t mpzspm, const int steps) { spv_size_t log2_ntt_size; int i; ASSERT (mpzspv_verify (x, offsetx, lenx, mpzspm)); ASSERT (mpzspv_verify (y, offsety, leny, mpzspm)); ASSERT (mpzspv_verify (x, offsetx + ntt_size, 0, mpzspm)); ASSERT (mpzspv_verify (y, offsety + ntt_size, 0, mpzspm)); ASSERT (mpzspv_verify (r, offsetr + ntt_size, 0, mpzspm)); log2_ntt_size = ceil_log_2 (ntt_size); /* Need parallelization at higher level (e.g., handling a branch of the product tree in one thread) to make this worthwhile for ECM */ #define MPZSPV_MUL_NTT_OPENMP 0 #if defined(_OPENMP) && MPZSPV_MUL_NTT_OPENMP #pragma omp parallel if (ntt_size > 16384) { #pragma omp for #endif for (i = 0; i < (int) mpzspm->sp_num; i++) { spv_size_t j; spm_t spm = mpzspm->spm[i]; spv_t spvr = r[i] + offsetr; spv_t spvx = x[i] + offsetx; spv_t spvy = y[i] + offsety; if ((steps & NTT_MUL_STEP_FFT1) != 0) { if (ntt_size < lenx) { for (j = ntt_size; j < lenx; j += ntt_size) spv_add (spvx, spvx, spvx + j, ntt_size, spm->sp); } if (ntt_size > lenx) spv_set_zero (spvx + lenx, ntt_size - lenx); if (monic) spvx[lenx % ntt_size] = sp_add (spvx[lenx % ntt_size], 1, spm->sp); spv_ntt_gfp_dif (spvx, log2_ntt_size, spm); } if ((steps & NTT_MUL_STEP_FFT2) != 0) { if (ntt_size < leny) { for (j = ntt_size; j < leny; j += ntt_size) spv_add (spvy, spvy, spvy + j, ntt_size, spm->sp); } if (ntt_size > leny) spv_set_zero (spvy + leny, ntt_size - leny); if (monic) spvy[leny % ntt_size] = sp_add (spvy[leny % ntt_size], 1, spm->sp); spv_ntt_gfp_dif (spvy, log2_ntt_size, spm); } if ((steps & NTT_MUL_STEP_MUL) != 0) { spv_pwmul (spvr, spvx, spvy, ntt_size, spm->sp, spm->mul_c); } if ((steps & NTT_MUL_STEP_IFFT) != 0) { ASSERT (sizeof (mp_limb_t) >= sizeof (sp_t)); 
spv_ntt_gfp_dit (spvr, log2_ntt_size, spm); /* spm->sp - (spm->sp - 1) / ntt_size is the inverse of ntt_size */ spv_mul_sp (spvr, spvr, spm->sp - (spm->sp - 1) / ntt_size, ntt_size, spm->sp, spm->mul_c); if (monic_pos) spvr[monic_pos % ntt_size] = sp_sub (spvr[monic_pos % ntt_size], 1, spm->sp); } } #if defined(_OPENMP) && MPZSPV_MUL_NTT_OPENMP } #endif }