zz_pInfoT::zz_pInfoT(long NewP, long maxroot) { if (maxroot < 0) LogicError("zz_pContext: maxroot may not be negative"); if (NewP <= 1) LogicError("zz_pContext: p must be > 1"); if (NumBits(NewP) > NTL_SP_NBITS) ResourceError("zz_pContext: modulus too big"); ZZ P, B, M, M1, MinusM; long n, i; long q, t; p = NewP; pinv = 1/double(p); p_info = 0; conv(P, p); sqr(B, P); LeftShift(B, B, maxroot+NTL_FFTFudge); set(M); n = 0; while (M <= B) { UseFFTPrime(n); q = GetFFTPrime(n); n++; mul(M, M, q); } if (n > 4) LogicError("zz_pInit: too many primes"); NumPrimes = n; PrimeCnt = n; MaxRoot = CalcMaxRoot(q); if (maxroot < MaxRoot) MaxRoot = maxroot; negate(MinusM, M); MinusMModP = rem(MinusM, p); CoeffModP.SetLength(n); x.SetLength(n); u.SetLength(n); for (i = 0; i < n; i++) { q = GetFFTPrime(i); div(M1, M, q); t = rem(M1, q); t = InvMod(t, q); if (NTL_zz_p_QUICK_CRT) mul(M1, M1, t); CoeffModP[i] = rem(M1, p); x[i] = ((double) t)/((double) q); u[i] = t; } }
zz_pContext::zz_pContext(INIT_FFT_TYPE, long index) { if (index < 0) LogicError("bad FFT prime index"); UseFFTPrime(index); ptr = FFTTables[index]->zz_p_context; }
long GetPrimeNumber(long bound, ZZ &prod){ long nprimes; zz_pBak bak; bak.save(); for (nprimes = 0; NumBits(prod) <= bound; nprimes++) { UseFFTPrime(nprimes); mul(prod, prod, GetFFTPrime(nprimes)); } bak.restore(); return nprimes; }
zz_pInfoT::zz_pInfoT(long Index) { ref_count = 1; index = Index; if (index < 0) Error("bad FFT prime index"); // allows non-consecutive indices...I'm not sure why while (NumFFTPrimes < index) UseFFTPrime(NumFFTPrimes); UseFFTPrime(index); p = FFTPrime[index]; pinv = FFTPrimeInv[index]; NumPrimes = 1; PrimeCnt = 0; MaxRoot = CalcMaxRoot(p); }
void ZZ_p::DoInstall() { SmartPtr<ZZ_pTmpSpaceT> tmps = 0; do { // NOTE: thread safe lazy init Lazy<ZZ_pFFTInfoT>::Builder builder(ZZ_pInfo->FFTInfo); if (!builder()) break; UniquePtr<ZZ_pFFTInfoT> FFTInfo; FFTInfo.make(); ZZ B, M, M1, M2, M3; long n, i; long q, t; mulmod_t qinv; sqr(B, ZZ_pInfo->p); LeftShift(B, B, NTL_FFTMaxRoot+NTL_FFTFudge); // FIXME: the following is quadratic time...would // be nice to get a faster solution... // One could estimate the # of primes by summing logs, // then multiply using a tree-based multiply, then // adjust up or down... // Assuming IEEE floating point, the worst case estimate // for error guarantees a correct answer +/- 1 for // numprimes up to 2^25...for sure we won't be // using that many primes...we can certainly put in // a sanity check, though. // If I want a more accuaruate summation (with using Kahan, // which has some portability issues), I could represent // numbers as x = a + f, where a is integer and f is the fractional // part. Summing in this representation introduces an *absolute* // error of 2 epsilon n, which is just as good as Kahan // for this application. // same strategy could also be used in the ZZX HomMul routine, // if we ever want to make that subquadratic set(M); n = 0; while (M <= B) { UseFFTPrime(n); q = GetFFTPrime(n); n++; mul(M, M, q); } FFTInfo->NumPrimes = n; FFTInfo->MaxRoot = CalcMaxRoot(q); double fn = double(n); if (8.0*fn*(fn+48) > NTL_FDOUBLE_PRECISION) ResourceError("modulus too big"); if (8.0*fn*(fn+48) <= NTL_FDOUBLE_PRECISION/double(NTL_SP_BOUND)) FFTInfo->QuickCRT = true; else FFTInfo->QuickCRT = false; // FIXME: some of this stuff does not need to be initialized // at all if FFTInfo->crt_struct.special() FFTInfo->x.SetLength(n); FFTInfo->u.SetLength(n); FFTInfo->uqinv.SetLength(n); FFTInfo->rem_struct.init(n, ZZ_pInfo->p, GetFFTPrime); FFTInfo->crt_struct.init(n, ZZ_pInfo->p, GetFFTPrime); if (!FFTInfo->crt_struct.special()) { ZZ qq, rr; DivRem(qq, rr, M, ZZ_pInfo->p); NegateMod(FFTInfo->MinusMModP, rr, ZZ_pInfo->p); for (i = 0; i < n; i++) { q = GetFFTPrime(i); qinv = GetFFTPrimeInv(i); long tt = rem(qq, q); mul(M2, ZZ_pInfo->p, tt); add(M2, M2, rr); div(M2, M2, q); // = (M/q) rem p div(M1, M, q); t = rem(M1, q); t = InvMod(t, q); mul(M3, M2, t); rem(M3, M3, ZZ_pInfo->p); FFTInfo->crt_struct.insert(i, M3); FFTInfo->x[i] = ((double) t)/((double) q); FFTInfo->u[i] = t; FFTInfo->uqinv[i] = PrepMulModPrecon(FFTInfo->u[i], q, qinv); } } tmps = MakeSmart<ZZ_pTmpSpaceT>(); tmps->crt_tmp_vec.fetch(FFTInfo->crt_struct); tmps->rem_tmp_vec.fetch(FFTInfo->rem_struct); builder.move(FFTInfo); } while (0); if (!tmps) { const ZZ_pFFTInfoT *FFTInfo = ZZ_pInfo->FFTInfo.get(); tmps = MakeSmart<ZZ_pTmpSpaceT>(); tmps->crt_tmp_vec.fetch(FFTInfo->crt_struct); tmps->rem_tmp_vec.fetch(FFTInfo->rem_struct); } ZZ_pTmpSpace = tmps; }
newNTL_START_IMPL zz_pInfoT::zz_pInfoT(long NewP, long maxroot) { ref_count = 1; if (maxroot < 0) Error("zz_pContext: maxroot may not be negative"); if (NewP <= 1) Error("zz_pContext: p must be > 1"); if (NumBits(NewP) > newNTL_SP_NBITS) Error("zz_pContext: modulus too big"); ZZ P, B, M, M1, MinusM; long n, i; long q, t; p = NewP; pinv = 1/double(p); index = -1; conv(P, p); sqr(B, P); LeftShift(B, B, maxroot+newNTL_FFTFudge); set(M); n = 0; while (M <= B) { UseFFTPrime(n); q = FFTPrime[n]; n++; mul(M, M, q); } if (n > 4) Error("zz_pInit: too many primes"); NumPrimes = n; PrimeCnt = n; MaxRoot = CalcMaxRoot(q); if (maxroot < MaxRoot) MaxRoot = maxroot; negate(MinusM, M); MinusMModP = rem(MinusM, p); if (!(CoeffModP = (long *) newNTL_MALLOC(n, sizeof(long), 0))) Error("out of space"); if (!(x = (double *) newNTL_MALLOC(n, sizeof(double), 0))) Error("out of space"); if (!(u = (long *) newNTL_MALLOC(n, sizeof(long), 0))) Error("out of space"); for (i = 0; i < n; i++) { q = FFTPrime[i]; div(M1, M, q); t = rem(M1, q); t = InvMod(t, q); mul(M1, M1, t); CoeffModP[i] = rem(M1, p); x[i] = ((double) t)/((double) q); u[i] = t; } }
void ZZ_pInfoT::init() { ZZ B, M, M1, M2, M3; long n, i; long q, t; initialized = 1; sqr(B, p); LeftShift(B, B, NTL_FFTMaxRoot+NTL_FFTFudge); set(M); n = 0; while (M <= B) { UseFFTPrime(n); q = FFTPrime[n]; n++; mul(M, M, q); } NumPrimes = n; MaxRoot = CalcMaxRoot(q); double fn = double(n); if (8.0*fn*(fn+32) > NTL_FDOUBLE_PRECISION) Error("modulus too big"); if (8.0*fn*(fn+32) > NTL_FDOUBLE_PRECISION/double(NTL_SP_BOUND)) QuickCRT = 0; else QuickCRT = 1; if (!(x = (double *) NTL_MALLOC(n, sizeof(double), 0))) Error("out of space"); if (!(u = (long *) NTL_MALLOC(n, sizeof(long), 0))) Error("out of space"); ZZ_p_rem_struct_init(&rem_struct, n, p, FFTPrime); ZZ_p_crt_struct_init(&crt_struct, n, p, FFTPrime); if (ZZ_p_crt_struct_special(crt_struct)) return; ZZ qq, rr; DivRem(qq, rr, M, p); NegateMod(MinusMModP, rr, p); for (i = 0; i < n; i++) { q = FFTPrime[i]; long tt = rem(qq, q); mul(M2, p, tt); add(M2, M2, rr); div(M2, M2, q); // = (M/q) rem p div(M1, M, q); t = rem(M1, q); t = InvMod(t, q); mul(M3, M2, t); rem(M3, M3, p); ZZ_p_crt_struct_insert(crt_struct, i, M3); x[i] = ((double) t)/((double) q); u[i] = t; } }
int main() { #ifdef NTL_SPMM_ULL if (sizeof(NTL_ULL_TYPE) < 2*sizeof(long)) { printf("999999999999999 "); print_flag(); return 0; } #endif long n, k; n = 200; k = 10*NTL_ZZ_NBITS; ZZ p; RandomLen(p, k); ZZ_p::init(p); // initialization ZZ_pX f, g, h, r1, r2, r3; random(g, n); // g = random polynomial of degree < n random(h, n); // h = " " random(f, n); // f = " " SetCoeff(f, n); // Sets coefficient of X^n to 1 // For doing arithmetic mod f quickly, one must pre-compute // some information. ZZ_pXModulus F; build(F, f); PlainMul(r1, g, h); // this uses classical arithmetic PlainRem(r1, r1, f); MulMod(r2, g, h, F); // this uses the FFT MulMod(r3, g, h, f); // uses FFT, but slower // compare the results... if (r1 != r2) { printf("999999999999999 "); print_flag(); return 0; } else if (r1 != r3) { printf("999999999999999 "); print_flag(); return 0; } double t; long i, j; long iter; const int nprimes = 30; const long L = 12; const long N = 1L << L; long r; for (r = 0; r < nprimes; r++) UseFFTPrime(r); vec_long aa[nprimes], AA[nprimes]; for (r = 0; r < nprimes; r++) { aa[r].SetLength(N); AA[r].SetLength(N); for (i = 0; i < N; i++) aa[r][i] = RandomBnd(GetFFTPrime(r)); FFTFwd(AA[r].elts(), aa[r].elts(), L, r); FFTRev1(AA[r].elts(), AA[r].elts(), L, r); } iter = 1; do { t = GetTime(); for (j = 0; j < iter; j++) { for (r = 0; r < nprimes; r++) { long *AAp = AA[r].elts(); long *aap = aa[r].elts(); long q = GetFFTPrime(r); mulmod_t qinv = GetFFTPrimeInv(r); FFTFwd(AAp, aap, L, r); FFTRev1(AAp, aap, L, r); for (i = 0; i < N; i++) AAp[i] = NormalizedMulMod(AAp[i], aap[i], q, qinv); } } t = GetTime() - t; iter = 2*iter; } while(t < 1); iter = iter/2; iter = long((1.5/t)*iter) + 1; double tvec[5]; long w; for (w = 0; w < 5; w++) { t = GetTime(); for (j = 0; j < iter; j++) { for (r = 0; r < nprimes; r++) { long *AAp = AA[r].elts(); long *aap = aa[r].elts(); long q = GetFFTPrime(r); mulmod_t qinv = GetFFTPrimeInv(r); FFTFwd(AAp, aap, L, r); FFTRev1(AAp, aap, L, r); for (i = 0; i < N; i++) AAp[i] = NormalizedMulMod(AAp[i], aap[i], q, qinv); } } t = GetTime() - t; tvec[w] = t; } t = clean_data(tvec); t = floor((t/iter)*1e13); if (t < 0 || t >= 1e15) printf("999999999999999 "); else printf("%015.0f ", t); printf(" [%ld] ", iter); print_flag(); return 0; }