/******************************************************************
 *
 * SPLIT RADIX PRECOMPUTED AND VECTORIZED FFT MULTIPLICATION
 *
 ******************************************************************/
/* Pointwise complex product of the forward FFTs of x and y, scaled by
 * 1/CPLXDIM, then transformed back into r.
 * (a + ib)*(c + id) = (ac - bd) + i(ad + bc), computed with FMAs. */
void sr_vector_mul(ring_t *r, const ring_t *x, const ring_t *y){
  // printf("\n\n**************split-radix FAST**************\n");
  fft_vector_forward(&vctr_x,x);
  fft_vector_forward(&vctr_y,y);
  __m256d real_x,imag_x,real_y,imag_y,imag_temp,real_temp;
  /* Hoist the loop-invariant 1/CPLXDIM divisor out of the loop instead of
   * rebuilding it (in real_y) on every iteration — matches negacyc_mul. */
  const __m256d dim = _mm256_set1_pd(CPLXDIM);
  for (int i = 0; i < CPLXDIM; i+=4)
  {
    real_x = _mm256_load_pd(vctr_x.real+i);
    imag_x = _mm256_load_pd(vctr_x.imag+i);
    real_y = _mm256_load_pd(vctr_y.real+i);
    imag_y = _mm256_load_pd(vctr_y.imag+i);

    //(a + ib) * (c + id) = (ac - bd) + i(ad + bc)
    //real_temp = bd
    real_temp = _mm256_mul_pd(imag_x,imag_y);
    //imag_temp = ad
    imag_temp = _mm256_mul_pd(real_x,imag_y);
    //real_x = ac - bd
    real_x = _mm256_fmsub_pd(real_x,real_y,real_temp);
    //imag_x = bc + ad
    imag_x = _mm256_fmadd_pd(imag_x,real_y,imag_temp);
    //scale both parts by 1/CPLXDIM
    real_x = _mm256_div_pd(real_x,dim);
    imag_x = _mm256_div_pd(imag_x,dim);

    _mm256_store_pd(vctr_res.real+i,real_x);
    _mm256_store_pd(vctr_res.imag+i,imag_x);
  }
  fft_vector_backward(&vctr_res,r);
}
/******************************************************************
 *
 * NEGACYCLIC FFT LOOK UP TABLE
 *
 ******************************************************************/
/* Negacyclic multiplication: forward phi-transform both inputs, multiply
 * pointwise in the complex domain with FMAs, scale by 1/CPLXDIM, and
 * transform back into r. */
void negacyc_mul(ring_t *r, const ring_t *x, const ring_t *y)
{
  phi_forward(&vector_x,x);
  phi_forward(&vector_y,y);

  const __m256d scale = _mm256_set1_pd(CPLXDIM);

  for (int i = 0; i < CPLXDIM; i += 4)
  {
    const __m256d xr = _mm256_load_pd(vector_x.real+i);
    const __m256d xi = _mm256_load_pd(vector_x.imag+i);
    const __m256d yr = _mm256_load_pd(vector_y.real+i);
    const __m256d yi = _mm256_load_pd(vector_y.imag+i);

    /* (a + ib)(c + id) = (ac - bd) + i(ad + bc) */
    const __m256d bd = _mm256_mul_pd(xi, yi);
    const __m256d ad = _mm256_mul_pd(xr, yi);

    __m256d re = _mm256_fmsub_pd(xr, yr, bd);  /* ac - bd */
    __m256d im = _mm256_fmadd_pd(xi, yr, ad);  /* bc + ad */

    re = _mm256_div_pd(re, scale);
    im = _mm256_div_pd(im, scale);

    _mm256_store_pd(vector_res.real+i, re);
    _mm256_store_pd(vector_res.imag+i, im);
  }

  phi_backward(&vector_res,r);
  // print_cplx(&vec_res,CPLXDIM);
}
double zdotu_soa( const int N, const double* da, const double* db, const int ix, const double* dc, const double* dd, const int iy, double* res ) { __m256d ymm0; __m256d ymm1; __m256d ymm2; __m256d ymm3; __m256d ymm4 = _mm256_setzero_pd(); __m256d ymm5 = _mm256_setzero_pd(); // int ii; //#pragma unroll for(ii = 0; ii < N/4; ii++) { _mm_prefetch((const char*) da + 0x200, 1); _mm_prefetch((const char*) db + 0x200, 1); _mm_prefetch((const char*) dc + 0x200, 1); _mm_prefetch((const char*) dd + 0x200, 1); //IACA_START; // 8*4*4 = 128 bytes ymm0 = _mm256_loadu_pd(da + 4*ii); ymm1 = _mm256_loadu_pd(db + 4*ii); ymm2 = _mm256_loadu_pd(dc + 4*ii); ymm3 = _mm256_loadu_pd(dd + 4*ii); // 2*4*4 = 32 flops ymm4 = _mm256_fmsub_pd(ymm0, ymm2, _mm256_fmsub_pd(ymm1, ymm3, ymm4)); ymm5 = _mm256_fmadd_pd(ymm0, ymm3, _mm256_fmadd_pd(ymm1, ymm2, ymm5)); // flops/bute ratio = 1/4 //IACA_END } double* re = (double*)&ymm4; double* im = (double*)&ymm5; // res[0] = re[0] + re[1] + re[2] + re[3]; res[1] = im[0] + im[1] + im[2] + im[3]; }
/* Verify the hardware _mm256_fmsub_pd result against a lane-by-lane
 * scalar reference computation of a*b - c; abort on any mismatch. */
void
check_mm256_fmsub_pd (__m256d __A, __m256d __B, __m256d __C)
{
  union256d a, b, c, e;
  double expected[4];
  int lane;

  a.x = __A;
  b.x = __B;
  c.x = __C;

  e.x = _mm256_fmsub_pd (__A, __B, __C);

  for (lane = 0; lane < 4; lane++)
    expected[lane] = a.a[lane] * b.a[lane] - c.a[lane];

  if (check_union256d (e, expected))
    abort ();
}
/******************************************************************
 *
 * SPLIT RADIX PRECOMPUTED AND VECTORIZED NON RECURSIVE FFT MULTIPLICATION
 *
 ******************************************************************/
/* Pointwise complex product of the non-recursive forward FFTs of x and y,
 * scaled by 1/CPLXDIM, then transformed back into r.
 * (a + ib)*(c + id) = (ac - bd) + i(ad + bc), computed with FMAs.
 * NOTE(review): a commented-out mul/sub variant labelled "THESE ARE NOT
 * WORKING" was removed — the FMA form is algebraically identical and
 * differs only in rounding (FMA rounds once per fused op). */
void sr_vector_nonrec_mul(ring_t *r, const ring_t *x, const ring_t *y){
  fft_vector_nonrec_forward(&vec_x,x);
  fft_vector_nonrec_forward(&vec_y,y);
  __m256d real_x,imag_x,real_y,imag_y,imag_temp,real_temp;
  /* Hoist the loop-invariant 1/CPLXDIM divisor out of the loop instead of
   * rebuilding it (in real_y) on every iteration — matches negacyc_mul. */
  const __m256d dim = _mm256_set1_pd(CPLXDIM);
  for (int i = 0; i < CPLXDIM; i+=4)
  {
    real_x = _mm256_load_pd(vec_x.real+i);
    imag_x = _mm256_load_pd(vec_x.imag+i);
    real_y = _mm256_load_pd(vec_y.real+i);
    imag_y = _mm256_load_pd(vec_y.imag+i);

    //(a + ib) * (c + id) = (ac - bd) + i(ad + bc)
    //real_temp = bd
    real_temp = _mm256_mul_pd(imag_x,imag_y);
    //imag_temp = ad
    imag_temp = _mm256_mul_pd(real_x,imag_y);
    //real_x = ac - bd
    real_x = _mm256_fmsub_pd(real_x,real_y,real_temp);
    //imag_x = bc + ad
    imag_x = _mm256_fmadd_pd(imag_x,real_y,imag_temp);
    //scale both parts by 1/CPLXDIM
    real_x = _mm256_div_pd(real_x,dim);
    imag_x = _mm256_div_pd(imag_x,dim);

    _mm256_store_pd(vec_res.real+i,real_x);
    _mm256_store_pd(vec_res.imag+i,imag_x);
  }
  fft_vector_nonrec_backward(&vec_res,r);
}
/* Clang IR-generation (lit) test: _mm256_fmsub_pd is lowered by negating
 * the addend and emitting the vfmadd intrinsic.  The CHECK lines below are
 * FileCheck directives matched against the emitted LLVM IR — they must not
 * be edited except to track compiler output changes. */
__m256d test_mm256_fmsub_pd(__m256d a, __m256d b, __m256d c) {
  // CHECK-LABEL: test_mm256_fmsub_pd
  // CHECK: [[NEG:%.+]] = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.+}}
  // CHECK: @llvm.x86.fma.vfmadd.pd.256(<4 x double> %{{.+}}, <4 x double> %{{.+}}, <4 x double> [[NEG]])
  return _mm256_fmsub_pd(a, b, c);
}
/* Clang IR-generation (lit) test: checks that _mm256_fmsub_pd lowers to the
 * dedicated vfmsub FMA intrinsic.  The CHECK line is a FileCheck directive
 * matched against the emitted LLVM IR — do not edit it casually. */
__m256d test_mm256_fmsub_pd(__m256d a, __m256d b, __m256d c) {
  // CHECK: @llvm.x86.fma.vfmsub.pd.256
  return _mm256_fmsub_pd(a, b, c);
}
__m256d check_mm256_fmsub_pd (__m256d a, __m256d b, __m256d c) { return _mm256_fmsub_pd (a, b, c); }