Beispiel #1
0
/******************************************************************
*
* SPLIT RADIX PRECOMPUTED AND VECTORIZED FFT MULTIPLICATION
*
******************************************************************/
void sr_vector_mul(ring_t *r, const ring_t *x, const ring_t *y){
  // printf("\n\n**************split-radix FAST**************\n");

  fft_vector_forward(&vctr_x,x);
  fft_vector_forward(&vctr_y,y);
  
  __m256d real_x,imag_x,real_y,imag_y,imag_temp,real_temp;
  // double a,b,c,d;
  for (int i = 0; i < CPLXDIM; i+=4)
  {
    real_x = _mm256_load_pd(vctr_x.real+i);
    imag_x = _mm256_load_pd(vctr_x.imag+i);
    real_y = _mm256_load_pd(vctr_y.real+i);
    imag_y = _mm256_load_pd(vctr_y.imag+i);

    //(a + ib) * (c + id) = (ac - bd) + i(ad+bc)
    //real_temp = bd
    real_temp = _mm256_mul_pd(imag_x,imag_y);
    //imag_temp = ad
    imag_temp = _mm256_mul_pd(real_x,imag_y);
     
    real_x = _mm256_fmsub_pd(real_x,real_y,real_temp);
    imag_x = _mm256_fmadd_pd(imag_x,real_y,imag_temp);

    real_y = _mm256_set1_pd(CPLXDIM);
    real_x = _mm256_div_pd(real_x,real_y);
    imag_x = _mm256_div_pd(imag_x,real_y);

    _mm256_store_pd(vctr_res.real+i,real_x);
    _mm256_store_pd(vctr_res.imag+i,imag_x);
  }
  fft_vector_backward(&vctr_res,r);
}
Beispiel #2
0
/******************************************************************
*
* NEGACYCLIC FFT LOOK UP TABLE
*
******************************************************************/
void negacyc_mul(ring_t *r, const ring_t *x, const ring_t *y)
{
  phi_forward(&vector_x,x);
  phi_forward(&vector_y,y);

  __m256d real_x,imag_x,real_y,imag_y,imag_temp,real_temp,dim;
  dim = _mm256_set1_pd(CPLXDIM);
  // double a,b,c,d;
  for (int i = 0; i < CPLXDIM; i+=4)
  {
    real_x = _mm256_load_pd(vector_x.real+i);
    imag_x = _mm256_load_pd(vector_x.imag+i);
    real_y = _mm256_load_pd(vector_y.real+i);
    imag_y = _mm256_load_pd(vector_y.imag+i);

    //(a + ib) * (c + id) = (ac - bd) + i(ad+bc)
    //real_temp = bd
    real_temp = _mm256_mul_pd(imag_x,imag_y);
    //imag_temp = ad
    imag_temp = _mm256_mul_pd(real_x,imag_y);
 
    real_x = _mm256_fmsub_pd(real_x,real_y,real_temp);
    imag_x = _mm256_fmadd_pd(imag_x,real_y,imag_temp);

    
    real_x = _mm256_div_pd(real_x,dim);
    imag_x = _mm256_div_pd(imag_x,dim);

    _mm256_store_pd(vector_res.real+i,real_x);
    _mm256_store_pd(vector_res.imag+i,imag_x);
  }
  phi_backward(&vector_res,r);
  // print_cplx(&vec_res,CPLXDIM);
}
Beispiel #3
0
 double zdotu_soa(
                const int    N,
                const double* da,
                const double* db,
                const int    ix,
                const double* dc,
                const double* dd,
                const int    iy,
                double*  res
                )
{
        __m256d ymm0;
        __m256d ymm1;
        __m256d ymm2;
        __m256d ymm3;
        __m256d ymm4 = _mm256_setzero_pd();
        __m256d ymm5 = _mm256_setzero_pd();
        //
        int ii;
//#pragma unroll
        for(ii = 0; ii < N/4; ii++)
        {
		_mm_prefetch((const char*) da + 0x200, 1);
		_mm_prefetch((const char*) db + 0x200, 1);
		_mm_prefetch((const char*) dc + 0x200, 1);
		_mm_prefetch((const char*) dd + 0x200, 1);
                //IACA_START;
                // 8*4*4 = 128 bytes
                ymm0 = _mm256_loadu_pd(da + 4*ii);
                ymm1 = _mm256_loadu_pd(db + 4*ii);
                ymm2 = _mm256_loadu_pd(dc + 4*ii);
                ymm3 = _mm256_loadu_pd(dd + 4*ii);
                // 2*4*4 = 32 flops
                ymm4 = _mm256_fmsub_pd(ymm0, ymm2, _mm256_fmsub_pd(ymm1, ymm3, ymm4));
                ymm5 = _mm256_fmadd_pd(ymm0, ymm3, _mm256_fmadd_pd(ymm1, ymm2, ymm5));
		// flops/bute ratio = 1/4
                //IACA_END
        }
        double* re = (double*)&ymm4;
        double* im = (double*)&ymm5;
	//
        res[0] = re[0] + re[1] + re[2] + re[3];
        res[1] = im[0] + im[1] + im[2] + im[3];
}
Beispiel #4
0
void
check_mm256_fmsub_pd (__m256d __A, __m256d __B, __m256d __C)
{
  union256d a, b, c, e;
  a.x = __A;
  b.x = __B;
  c.x = __C;
  double d[4];
  int i;
  e.x = _mm256_fmsub_pd (__A, __B, __C);
  for (i = 0; i < 4; i++)
    {
      d[i] = a.a[i] * b.a[i] - c.a[i];
    }
  if (check_union256d (e, d))
    abort ();
}
Beispiel #5
0
/******************************************************************
*
* SPLIT RADIX PRECOMPUTED AND VECTORIZED NON RECURSIVE FFT MULTIPLICATION
*
******************************************************************/
void sr_vector_nonrec_mul(ring_t *r, const ring_t *x, const ring_t *y){
  fft_vector_nonrec_forward(&vec_x,x);
  fft_vector_nonrec_forward(&vec_y,y);
  __m256d real_x,imag_x,real_y,imag_y,imag_temp,real_temp;
  // double a,b,c,d;
  for (int i = 0; i < CPLXDIM; i+=4)
  {
    real_x = _mm256_load_pd(vec_x.real+i);
    imag_x = _mm256_load_pd(vec_x.imag+i);
    real_y = _mm256_load_pd(vec_y.real+i);
    imag_y = _mm256_load_pd(vec_y.imag+i);

    //(a + ib) * (c + id) = (ac - bd) + i(ad+bc)
    //real_temp = bd
    real_temp = _mm256_mul_pd(imag_x,imag_y);
    //imag_temp = ad
    imag_temp = _mm256_mul_pd(real_x,imag_y);
    //REPLACED FOR COMMENTED SECTION
    //real_x = ac
    // real_x = _mm256_mul_pd(real_x,real_y);
    // //imag_x = bc
    // imag_x = _mm256_mul_pd(imag_x,real_y);
    // //real_x = ac - bd => real_x - real_temp
    // real_x = _mm256_sub_pd(real_x,real_temp);
    // //imag_x = ad + bc => imag_temp + imag_x
    // imag_x = _mm256_add_pd(imag_x,imag_temp);
    //THESE ARE NOT WORKING 
    real_x = _mm256_fmsub_pd(real_x,real_y,real_temp);
    imag_x = _mm256_fmadd_pd(imag_x,real_y,imag_temp);

    real_y = _mm256_set1_pd(CPLXDIM);
    real_x = _mm256_div_pd(real_x,real_y);
    imag_x = _mm256_div_pd(imag_x,real_y);

    _mm256_store_pd(vec_res.real+i,real_x);
    _mm256_store_pd(vec_res.imag+i,imag_x);

  }
  fft_vector_nonrec_backward(&vec_res,r);
}
Beispiel #6
0
__m256d test_mm256_fmsub_pd(__m256d a, __m256d b, __m256d c) {
  // CHECK-LABEL: test_mm256_fmsub_pd
  // CHECK: [[NEG:%.+]] = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.+}}
  // CHECK: @llvm.x86.fma.vfmadd.pd.256(<4 x double> %{{.+}}, <4 x double> %{{.+}}, <4 x double> [[NEG]])
  return _mm256_fmsub_pd(a, b, c);
}
Beispiel #7
0
__m256d test_mm256_fmsub_pd(__m256d a, __m256d b, __m256d c) {
  // CHECK: @llvm.x86.fma.vfmsub.pd.256
  return _mm256_fmsub_pd(a, b, c);
}
Beispiel #8
0
__m256d
check_mm256_fmsub_pd (__m256d a, __m256d b, __m256d c)
{
  return _mm256_fmsub_pd (a, b, c);
}