// Builds the per-modulus tables for the single-precision modulus NewP.
//
// NewP    - the modulus; must be > 1 and have at most NTL_SP_NBITS bits.
// maxroot - non-negative cap applied to MaxRoot; it also enters the size
//           bound used to decide how many FFT primes are needed.
//
// Raises LogicError for a negative maxroot, for NewP <= 1, or when more
// than four FFT primes would be required, and ResourceError when NewP has
// too many bits.
zz_pInfoT::zz_pInfoT(long NewP, long maxroot)
{
   if (maxroot < 0)
      LogicError("zz_pContext: maxroot may not be negative");

   if (NewP <= 1)
      LogicError("zz_pContext: p must be > 1");

   if (NumBits(NewP) > NTL_SP_NBITS)
      ResourceError("zz_pContext: modulus too big");

   p = NewP;
   pinv = 1/double(p);
   p_info = 0;

   // bound = p^2 * 2^(maxroot + NTL_FFTFudge)
   ZZ bigP;
   conv(bigP, p);

   ZZ bound;
   sqr(bound, bigP);
   LeftShift(bound, bound, maxroot+NTL_FFTFudge);

   // Take FFT primes 0, 1, 2, ... until their product exceeds the bound.
   ZZ primeProd;
   set(primeProd);

   long count;
   long prime;
   for (count = 0; primeProd <= bound; count++) {
      UseFFTPrime(count);
      prime = GetFFTPrime(count);
      mul(primeProd, primeProd, prime);
   }

   if (count > 4)
      LogicError("zz_pInit: too many primes");

   NumPrimes = count;
   PrimeCnt = count;

   // MaxRoot is derived from the last prime taken, then capped by maxroot.
   MaxRoot = CalcMaxRoot(prime);
   if (maxroot < MaxRoot)
      MaxRoot = maxroot;

   // MinusMModP = (-M) mod p, where M is the product of the chosen primes.
   ZZ negProd;
   negate(negProd, primeProd);
   MinusMModP = rem(negProd, p);

   CoeffModP.SetLength(count);
   x.SetLength(count);
   u.SetLength(count);

   // Per-prime CRT data: with q the idx-th prime and inv = (M/q)^{-1} mod q,
   // store u = inv, x = inv/q as a double, and CoeffModP = (M/q) mod p
   // (premultiplied by inv when NTL_zz_p_QUICK_CRT is set).
   ZZ cofactor;
   for (long idx = 0; idx < count; idx++) {
      prime = GetFFTPrime(idx);

      div(cofactor, primeProd, prime);

      long inv = rem(cofactor, prime);
      inv = InvMod(inv, prime);

      if (NTL_zz_p_QUICK_CRT)
         mul(cofactor, cofactor, inv);

      CoeffModP[idx] = rem(cofactor, p);
      x[idx] = ((double) inv)/((double) prime);
      u[idx] = inv;
   }
}
// Multiplies prod by FFT primes 0, 1, 2, ... until NumBits(prod) exceeds
// bound, and returns how many primes were used.
//
// prod is updated in place and is NOT reset here: the caller supplies its
// starting value. A zz_pBak is saved on entry and restored before returning.
//
// NOTE(review): a prod that starts at zero would presumably never grow past
// bound -- confirm that callers always pass a positive value.
long GetPrimeNumber(long bound, ZZ &prod)
{
   zz_pBak bak;
   bak.save();

   long nprimes = 0;
   while (NumBits(prod) <= bound) {
      UseFFTPrime(nprimes);
      mul(prod, prod, GetFFTPrime(nprimes));
      nprimes++;
   }

   bak.restore();
   return nprimes;
}
// Makes sure ZZ_pInfo->FFTInfo has been built (through the Lazy builder)
// and installs a freshly made ZZ_pTmpSpaceT into ZZ_pTmpSpace.
//
// When this call is the one that builds FFTInfo, it chooses the FFT primes,
// fills in the CRT/remainder tables, and hands the object to the builder.
// Otherwise it only fetches temporary space from the existing FFTInfo.
void ZZ_p::DoInstall()
{
   SmartPtr<ZZ_pTmpSpaceT> tmps = 0;

   do { // NOTE: thread safe lazy init
      Lazy<ZZ_pFFTInfoT>::Builder builder(ZZ_pInfo->FFTInfo);
      // Nothing to build in this call: fall through to the fetch-only
      // path after the do-block (tmps is still null there).
      if (!builder()) break;

      UniquePtr<ZZ_pFFTInfoT> FFTInfo;
      FFTInfo.make();

      ZZ B, M, M1, M2, M3;
      long n, i;
      long q, t;
      mulmod_t qinv;

      // B = p^2 * 2^(NTL_FFTMaxRoot + NTL_FFTFudge)
      sqr(B, ZZ_pInfo->p);

      LeftShift(B, B, NTL_FFTMaxRoot+NTL_FFTFudge);

      // FIXME: the following is quadratic time...would
      // be nice to get a faster solution...
      // One could estimate the # of primes by summing logs,
      // then multiply using a tree-based multiply, then
      // adjust up or down...

      // Assuming IEEE floating point, the worst case estimate
      // for error guarantees a correct answer +/- 1 for
      // numprimes up to 2^25...for sure we won't be
      // using that many primes...we can certainly put in
      // a sanity check, though.

      // If I want a more accurate summation (without using Kahan,
      // which has some portability issues), I could represent
      // numbers as x = a + f, where a is integer and f is the fractional
      // part. Summing in this representation introduces an *absolute*
      // error of 2 epsilon n, which is just as good as Kahan
      // for this application.

      // same strategy could also be used in the ZZX HomMul routine,
      // if we ever want to make that subquadratic

      // Take FFT primes 0, 1, 2, ... until their product M exceeds B.
      set(M);
      n = 0;
      while (M <= B) {
         UseFFTPrime(n);
         q = GetFFTPrime(n);
         n++;
         mul(M, M, q);
      }

      FFTInfo->NumPrimes = n;
      // MaxRoot is derived from the last prime taken.
      FFTInfo->MaxRoot = CalcMaxRoot(q);

      double fn = double(n);

      // Reject prime counts for which 8*n*(n+48) exceeds
      // NTL_FDOUBLE_PRECISION.
      if (8.0*fn*(fn+48) > NTL_FDOUBLE_PRECISION)
         ResourceError("modulus too big");

      // QuickCRT is enabled only under the tighter bound obtained by
      // dividing NTL_FDOUBLE_PRECISION by NTL_SP_BOUND.
      if (8.0*fn*(fn+48) <= NTL_FDOUBLE_PRECISION/double(NTL_SP_BOUND))
         FFTInfo->QuickCRT = true;
      else
         FFTInfo->QuickCRT = false;

      // FIXME: some of this stuff does not need to be initialized
      // at all if FFTInfo->crt_struct.special()

      FFTInfo->x.SetLength(n);
      FFTInfo->u.SetLength(n);
      FFTInfo->uqinv.SetLength(n);

      FFTInfo->rem_struct.init(n, ZZ_pInfo->p, GetFFTPrime);

      FFTInfo->crt_struct.init(n, ZZ_pInfo->p, GetFFTPrime);

      if (!FFTInfo->crt_struct.special()) {
         ZZ qq, rr;

         // M = qq*p + rr; MinusMModP = (-rr) mod p.
         DivRem(qq, rr, M, ZZ_pInfo->p);

         NegateMod(FFTInfo->MinusMModP, rr, ZZ_pInfo->p);

         for (i = 0; i < n; i++) {
            q = GetFFTPrime(i);
            qinv = GetFFTPrimeInv(i);

            long tt = rem(qq, q);

            // M2 = (p*tt + rr)/q, built from the small pieces qq mod q
            // and rr instead of reducing the full quotient M/q.
            mul(M2, ZZ_pInfo->p, tt);
            add(M2, M2, rr);
            div(M2, M2, q); // = (M/q) rem p

            // t = ((M/q) mod q)^{-1} mod q
            div(M1, M, q);
            t = rem(M1, q);
            t = InvMod(t, q);

            // CRT coefficient for prime i: (M2 * t) mod p.
            mul(M3, M2, t);
            rem(M3, M3, ZZ_pInfo->p);

            FFTInfo->crt_struct.insert(i, M3);

            FFTInfo->x[i] = ((double) t)/((double) q);
            FFTInfo->u[i] = t;
            FFTInfo->uqinv[i] = PrepMulModPrecon(FFTInfo->u[i], q, qinv);
         }
      }

      // Fetch the temporary vectors from the local FFTInfo before it is
      // handed over to the builder.
      tmps = MakeSmart<ZZ_pTmpSpaceT>();
      tmps->crt_tmp_vec.fetch(FFTInfo->crt_struct);
      tmps->rem_tmp_vec.fetch(FFTInfo->rem_struct);

      builder.move(FFTInfo);
   } while (0);

   // tmps is still null only when the do-block was left through the break
   // above; in that case fetch from the FFTInfo stored in ZZ_pInfo.
   if (!tmps) {
      const ZZ_pFFTInfoT *FFTInfo = ZZ_pInfo->FFTInfo.get();
      tmps = MakeSmart<ZZ_pTmpSpaceT>();
      tmps->crt_tmp_vec.fetch(FFTInfo->crt_struct);
      tmps->rem_tmp_vec.fetch(FFTInfo->rem_struct);
   }

   ZZ_pTmpSpace = tmps;
}
int main() { #ifdef NTL_SPMM_ULL if (sizeof(NTL_ULL_TYPE) < 2*sizeof(long)) { printf("999999999999999 "); print_flag(); return 0; } #endif long n, k; n = 200; k = 10*NTL_ZZ_NBITS; ZZ p; RandomLen(p, k); ZZ_p::init(p); // initialization ZZ_pX f, g, h, r1, r2, r3; random(g, n); // g = random polynomial of degree < n random(h, n); // h = " " random(f, n); // f = " " SetCoeff(f, n); // Sets coefficient of X^n to 1 // For doing arithmetic mod f quickly, one must pre-compute // some information. ZZ_pXModulus F; build(F, f); PlainMul(r1, g, h); // this uses classical arithmetic PlainRem(r1, r1, f); MulMod(r2, g, h, F); // this uses the FFT MulMod(r3, g, h, f); // uses FFT, but slower // compare the results... if (r1 != r2) { printf("999999999999999 "); print_flag(); return 0; } else if (r1 != r3) { printf("999999999999999 "); print_flag(); return 0; } double t; long i, j; long iter; const int nprimes = 30; const long L = 12; const long N = 1L << L; long r; for (r = 0; r < nprimes; r++) UseFFTPrime(r); vec_long aa[nprimes], AA[nprimes]; for (r = 0; r < nprimes; r++) { aa[r].SetLength(N); AA[r].SetLength(N); for (i = 0; i < N; i++) aa[r][i] = RandomBnd(GetFFTPrime(r)); FFTFwd(AA[r].elts(), aa[r].elts(), L, r); FFTRev1(AA[r].elts(), AA[r].elts(), L, r); } iter = 1; do { t = GetTime(); for (j = 0; j < iter; j++) { for (r = 0; r < nprimes; r++) { long *AAp = AA[r].elts(); long *aap = aa[r].elts(); long q = GetFFTPrime(r); mulmod_t qinv = GetFFTPrimeInv(r); FFTFwd(AAp, aap, L, r); FFTRev1(AAp, aap, L, r); for (i = 0; i < N; i++) AAp[i] = NormalizedMulMod(AAp[i], aap[i], q, qinv); } } t = GetTime() - t; iter = 2*iter; } while(t < 1); iter = iter/2; iter = long((1.5/t)*iter) + 1; double tvec[5]; long w; for (w = 0; w < 5; w++) { t = GetTime(); for (j = 0; j < iter; j++) { for (r = 0; r < nprimes; r++) { long *AAp = AA[r].elts(); long *aap = aa[r].elts(); long q = GetFFTPrime(r); mulmod_t qinv = GetFFTPrimeInv(r); FFTFwd(AAp, aap, L, r); FFTRev1(AAp, aap, L, r); for (i = 0; i < N; 
i++) AAp[i] = NormalizedMulMod(AAp[i], aap[i], q, qinv); } } t = GetTime() - t; tvec[w] = t; } t = clean_data(tvec); t = floor((t/iter)*1e13); if (t < 0 || t >= 1e15) printf("999999999999999 "); else printf("%015.0f ", t); printf(" [%ld] ", iter); print_flag(); return 0; }