/* Tests zn_array_invert() on a range of problems. */ int test_zn_array_invert (int quick) { int success = 1; int b, trial; size_t n; zn_mod_t mod; // first try a dense range of "small" problems for (b = 2; b <= ULONG_BITS && success; b++) for (n = 1; n <= 60 && success; n++) for (trial = 0; trial < (quick ? 1 : 10) && success; trial++) { zn_mod_init (mod, random_modulus (b, 0)); success = success && testcase_zn_array_invert (n, mod); zn_mod_clear (mod); } // now try a few larger random problems for (b = 2; b <= ULONG_BITS && success; b += (quick ? random_ulong (3) + 1 : 1)) for (trial = 0; trial < (quick ? 1 : 5) && success; trial++) { zn_mod_init (mod, random_modulus (b, 0)); n = random_ulong (quick ? 2000 : 10000) + 1; success = success && testcase_zn_array_invert (n, mod); zn_mod_clear (mod); } return success; }
/* tests zn_array_mul_fft() on a range of input cases */ int test_zn_array_mul_or_sqr_fft (int sqr, int quick) { int success = 1; int i, trial, use_scale; size_t n1, n2; zn_mod_t mod; // first try a dense range of "small" problems for (i = 0; i < num_test_bitsizes && success; i++) for (n2 = 1; n2 <= 50 && success; n2 += (quick ? 3 : 1)) for (n1 = n2; n1 <= 50 && (!sqr || n1 <= n2) && success; n1 += (quick ? 3 : 1)) for (use_scale = 0; use_scale <= 1 && success; use_scale++) for (trial = 0; trial < (quick ? 1 : 3) && success; trial++) { zn_mod_init (mod, random_modulus (test_bitsizes[i], 1)); success = success && testcase_zn_array_mul_fft (n1, n2, sqr, use_scale, mod); zn_mod_clear (mod); } // now try some random larger problems // and temporarily change the nussbaumer thresholds so we use that // code sometimes unsigned thresh; for (i = 0; i < num_test_bitsizes && success; i++) { unsigned b = test_bitsizes[i]; unsigned* c = sqr ? &(tuning_info[b].nuss_sqr_thresh) : &(tuning_info[b].nuss_mul_thresh); for (use_scale = 0; use_scale <= 1 && success; use_scale++) for (thresh = 2; thresh <= 8 && success; thresh += (quick ? 4 : 1)) { unsigned save_thresh = *c; *c = thresh; size_t t1 = random_ulong (quick ? 3000 : 10000) + 1; size_t t2 = sqr ? t1 : (random_ulong (quick ? 3000 : 10000) + 1); n1 = ZNP_MAX (t1, t2); n2 = ZNP_MIN (t1, t2); zn_mod_init (mod, random_modulus (b, 1)); success = success && testcase_zn_array_mul_fft (n1, n2, sqr, use_scale, mod); zn_mod_clear (mod); *c = save_thresh; } } return success; }
/* Tests pmfvec_fft_dc (if huge == 0) or pmfvec_fft_huge (if huge == 1) */ int test_pmfvec_fft_dc_or_huge (int huge, int quick) { int success = 1; int i; unsigned lgK, lgM, lgT; ulong z, n, t; zn_mod_t mod; for (i = 0; i < num_test_bitsizes && success; i++) for (lgK = 0; lgK < 5 && success; lgK++) for (lgT = (huge ? 1 : 0); lgT < (huge ? lgK : 1) && success; lgT++) for (lgM = lgK ? (lgK - 1) : 0; lgM < lgK + (quick ? 1 : 3) && success; lgM++) { ulong K = 1UL << lgK; ulong M = 1UL << lgM; for (t = 0; t < ZNP_MIN (2 * M / K, quick ? 2 : 1000) && success; t++) for (n = 1; n <= K && success; n++) for (z = 1; z <= K && success; z++) { zn_mod_init (mod, random_modulus (test_bitsizes[i], 1)); success = success && testcase_pmfvec_fft_dc_or_huge (lgK, lgM, lgT, n, z, t, mod); zn_mod_clear (mod); } } return success; }
/*
   Burns CPU time: performs `count` small polynomial multiplications modulo 3
   and discards the results, so the work done grows roughly linearly with
   count.

   The work is routed through zn_array_mul() so that the compiler cannot
   simply optimise the busy loop away.
*/
void
use_up_cycles (unsigned long count)
{
   while (count != 0)
   {
      unsigned long lhs[3] = {0, 1, 2};
      unsigned long rhs[3] = {0, 1, 2};
      unsigned long product[5];      // 3 + 3 - 1 output coefficients
      zn_mod_t mod;

      zn_mod_init (mod, 3);
      zn_array_mul (product, lhs, 3, rhs, 3, mod);
      zn_mod_clear (mod);

      count--;
   }
}
/*
   Runs testcase_zn_array_mul_fft_dft() for lengths n2 <= n1 <= 30 and
   lgT in 0..4, for every bit size listed in test_bitsizes[].

   quick: if nonzero, only one trial is run per configuration and the
          lengths advance by a random stride instead of by 1.

   Returns 1 if every test case reported success, 0 otherwise. Testing stops
   at the first failing case.
*/
int
test_zn_array_mul_fft_dft (int quick)
{
   const int trials = quick ? 1 : 3;
   int ok = 1;
   int idx, rep;
   unsigned lgT;
   size_t n1, n2;
   zn_mod_t mod;

   for (idx = 0; idx < num_test_bitsizes && ok; idx++)
   {
      // the random strides are drawn in the loop increments, i.e. once
      // after each pass of the corresponding loop body
      for (n2 = 1; n2 <= 30 && ok;
           n2 += (quick ? random_ulong (2) + 1 : 1))
      {
         for (n1 = n2; n1 <= 30 && ok;
              n1 += (quick ? random_ulong (2) + 1 : 1))
         {
            for (lgT = 0; lgT < 5 && ok; lgT++)
            {
               for (rep = 0; rep < trials && ok; rep++)
               {
                  zn_mod_init (mod, random_modulus (test_bitsizes[idx], 1));
                  if (!testcase_zn_array_mul_fft_dft (n1, n2, lgT, mod))
                     ok = 0;
                  zn_mod_clear (mod);
               }
            }
         }
      }
   }

   return ok;
}
void fmpz_comb_init(fmpz_comb_t comb, ulong * primes, ulong num_primes) { ulong i, j, k; comb->primes = primes; comb->num_primes = num_primes; ulong n = 0L; while (num_primes > (1L<<n)) n++; comb->n = n; ulong num; // create zn_poly modulus information comb->mod = (zn_mod_t *) flint_heap_alloc_bytes(sizeof(zn_mod_t)*num_primes); for (ulong i = 0; i < num_primes; i++) zn_mod_init(comb->mod[i], primes[i]); if (n == 0) return; // nothing to do // allocate space for comb comb->comb = (fmpz_t **) flint_heap_alloc(n); j = (1L<<(n - 1)); ulong size = 2; mp_limb_t * ptr; for (i = 0; i < n; i++) { comb->comb[i] = (fmpz_t *) flint_heap_alloc(j); ptr = (mp_limb_t *) flint_heap_alloc((1L<<n) + j); for (k = 0; k < j; k++, ptr += (size + 1)) { comb->comb[i][k] = ptr; } j/=2; size*=2; } // allocate space for res comb->res = (fmpz_t **) flint_heap_alloc(n); j = (1L<<(n - 1)); size = 2; for (i = 0; i < n; i++) { comb->res[i] = (fmpz_t *) flint_heap_alloc(j); ptr = (mp_limb_t *) flint_heap_alloc((1L<<n) + j); for (k = 0; k < j; k++, ptr += (size + 1)) { comb->res[i][k] = ptr; } j/=2; size*=2; } // compute products of pairs of primes and place in comb for (i = 0, j = 0; i + 2 <= num_primes; i += 2, j++) { fmpz_set_ui(comb->comb[0][j], primes[i]); fmpz_mul_ui(comb->comb[0][j], comb->comb[0][j], primes[i+1]); } if (i < num_primes) // in case number of primes is odd { fmpz_set_ui(comb->comb[0][j], primes[i]); i+=2; j++; } num = (1L<<n); // set the rest of the entries on that row of the comb to 1 for (; i < num; i += 2, j++) { fmpz_set_ui(comb->comb[0][j], 1L); } // compute rest of comb by multiplying in pairs ulong log_comb = 1; num /= 2; while (num >= 2) { for (i = 0, j = 0; i < num; i += 2, j++) { fmpz_mul(comb->comb[log_comb][j], comb->comb[log_comb-1][i], comb->comb[log_comb-1][i+1]); } log_comb++; num /= 2; } // compute inverses from pairs of primes fmpz_t temp = (fmpz_t) flint_stack_alloc(2); fmpz_t temp2 = (fmpz_t) flint_stack_alloc(2); for (i = 0, j = 0; i + 2 <= num_primes; i 
+= 2, j++) { fmpz_set_ui(temp, primes[i]); fmpz_set_ui(temp2, primes[i+1]); fmpz_invert(comb->res[0][j], temp, temp2); } flint_stack_release(); //temp2 flint_stack_release(); //temp ulong log_res = 1; num = (1L<<(n - 1)); // compute remaining inverses, each level combining pairs from the level below while (log_res < n) { for (i = 0, j = 0; i < num; i += 2, j++) { fmpz_invert(comb->res[log_res][j], comb->comb[log_res-1][i], comb->comb[log_res-1][i+1]); } log_res++; num /= 2; } }
double profile_mulmid (void* arg, unsigned long count) { profile_info_struct* info = (profile_info_struct*) arg; size_t n1 = info->n1; size_t n2 = info->n2; zn_mod_t mod; zn_mod_init (mod, info->m); ulong* buf1 = (ulong*) malloc (sizeof (ulong) * n1); ulong* buf2 = (ulong*) malloc (sizeof (ulong) * n2); ulong* buf3 = (ulong*) malloc (sizeof (ulong) * (n1 - n2 + 1)); // generate random inputs size_t i; for (i = 0; i < n1; i++) buf1[i] = random_ulong (info->m); for (i = 0; i < n2; i++) buf2[i] = random_ulong (info->m); void (*target)(ulong*, const ulong*, size_t, const ulong*, size_t, int, const zn_mod_t); int redc; switch (info->algo) { case ALGO_MULMID_BEST: target = zn_array_mulmid_wrapper; break; case ALGO_MULMID_FALLBACK: target = zn_array_mulmid_fallback_wrapper; break; case ALGO_MULMID_KS1: target = zn_array_mulmid_KS1; redc = 0; break; case ALGO_MULMID_KS1_REDC: target = zn_array_mulmid_KS1; redc = 1; break; case ALGO_MULMID_KS2: target = zn_array_mulmid_KS2; redc = 0; break; case ALGO_MULMID_KS2_REDC: target = zn_array_mulmid_KS2; redc = 1; break; case ALGO_MULMID_KS3: target = zn_array_mulmid_KS3; redc = 0; break; case ALGO_MULMID_KS3_REDC: target = zn_array_mulmid_KS3; redc = 1; break; case ALGO_MULMID_KS4: target = zn_array_mulmid_KS4; redc = 0; break; case ALGO_MULMID_KS4_REDC: target = zn_array_mulmid_KS4; redc = 1; break; case ALGO_MULMID_FFT: target = zn_array_mulmid_fft_wrapper; break; default: abort (); } // warm up ulong j; for (j = 0; j < count/4; j++) target (buf3, buf1, n1, buf2, n2, redc, mod); // do the actual profile cycle_count_t t0 = get_cycle_counter (); for (j = 0; j < count; j++) target (buf3, buf1, n1, buf2, n2, redc, mod); cycle_count_t t1 = get_cycle_counter (); free (buf3); free (buf2); free (buf1); zn_mod_clear (mod); return cycle_diff (t0, t1); }