/*
 * In-place scaled accumulation: R[i] <- R[i] + c * S[i] for i = 0 .. N-1.
 *
 * R  destination field, read and overwritten
 * S  source field, only read
 * c  real scale factor applied to S
 * N  number of spinor32 elements to process
 *
 * Each element is handled as 24 consecutive floats, processed in six groups
 * of four through the vector load / multiply-add / store built-ins.
 * R and S are asserted to be 16-byte aligned via __alignx.
 * NOTE(review): the per-element hints below additionally rely on
 * sizeof(spinor32) being a multiple of 16 bytes -- confirm against the type
 * definition (24 floats per element are accessed here).
 * NOTE(review): the look-ahead prefetches touch element i + 1, i.e. one past
 * the last element on the final iteration -- confirm the fields are padded.
 */
void assign_add_mul_r_32(spinor32 * const R, spinor32 * const S, const float c, const int N) {
#ifdef TM_USE_OMP
#pragma omp parallel
  {
#endif
  vector4double x0, x1, x2, x3, x4, x5, y0, y1, y2, y3, y4, y5;
  vector4double z0, z1, z2, z3, z4, z5, k;

  __prefetch_by_load(S);
  __prefetch_by_load(R);

  /* broadcast the scale factor into every vector slot */
  k = vec_splats((double)c);
  __alignx(16, S);
  __alignx(16, R);

#ifdef TM_USE_OMP
#pragma omp for
#else
#pragma unroll(2)
#endif
  for(int i = 0; i < N; i++) {
    /* Element pointers live inside the loop so that the alignment hints
     * below are applied to real addresses rather than to uninitialised
     * variables, as the original pre-loop hints were. */
    float * const s = (float *)(S + i);
    float * const r = (float *)(R + i);
    __alignx(16, s);
    __alignx(16, r);

    /* start fetching the next element while this one is processed */
    __prefetch_by_load(S + i + 1);
    __prefetch_by_stream(1, R + i + 1);

    /* x* <- current contents of R[i] */
    x0 = vec_ld(0, r);
    x1 = vec_ld(0, r+4);
    x2 = vec_ld(0, r+8);
    x3 = vec_ld(0, r+12);
    x4 = vec_ld(0, r+16);
    x5 = vec_ld(0, r+20);

    /* y* <- contents of S[i] */
    y0 = vec_ld(0, s);
    y1 = vec_ld(0, s+4);
    y2 = vec_ld(0, s+8);
    y3 = vec_ld(0, s+12);
    y4 = vec_ld(0, s+16);
    y5 = vec_ld(0, s+20);

    /* z* <- k * y* + x* */
    z0 = vec_madd(k, y0, x0);
    z1 = vec_madd(k, y1, x1);
    z2 = vec_madd(k, y2, x2);
    z3 = vec_madd(k, y3, x3);
    z4 = vec_madd(k, y4, x4);
    z5 = vec_madd(k, y5, x5);

    /* write the result back over R[i] */
    vec_st(z0, 0, r);
    vec_st(z1, 0, r+4);
    vec_st(z2, 0, r+8);
    vec_st(z3, 0, r+12);
    vec_st(z4, 0, r+16);
    vec_st(z5, 0, r+20);
  }
#ifdef TM_USE_OMP
  } /* OpenMP closing brace */
#endif
  return;
}
/* Example #2 (0) */
/*
 * Element-wise difference of two fields: Q[ix] <- R[ix] - S[ix]
 * for ix = 0 .. N-1.
 *
 * Q  destination field, overwritten
 * R  minuend field, only read
 * S  subtrahend field, only read
 * N  number of spinor elements to process
 *
 * Each element is handled as 24 consecutive doubles, processed in six groups
 * of four through the vector load / subtract / store built-ins.
 * S and R are asserted to be 32-byte aligned via __alignx.
 * NOTE(review): the per-element hints below additionally rely on
 * sizeof(spinor) being a multiple of 32 bytes and on Q being 32-byte
 * aligned -- confirm against the type definition and the callers.
 * NOTE(review): the look-ahead prefetches touch element ix + 1, i.e. one past
 * the last element on the final iteration -- confirm the fields are padded.
 */
void diff(spinor * const Q,const spinor * const R,const spinor * const S, const int N) {
#ifdef OMP
#pragma omp parallel
  {
#endif
  vector4double x0, x1, x2, x3, x4, x5, y0, y1, y2, y3, y4, y5;
  vector4double z0, z1, z2, z3, z4, z5;

  __alignx(32, S);
  __alignx(32, R);

  __prefetch_by_load(S);
  __prefetch_by_load(R);
  __prefetch_by_load(Q);

#ifndef OMP
#pragma unroll(2)
#else
#pragma omp for
#endif
  for (int ix = 0; ix < N; ++ix) {
    /* Element pointers live inside the loop so that the alignment hints
     * below are applied to real addresses rather than to uninitialised
     * variables, as the original pre-loop hints were. The casts drop the
     * const qualifier of R and S; both are only read through r and s. */
    double * const s = (double *)(S + ix);
    double * const r = (double *)(R + ix);
    double * const q = (double *)(Q + ix);
    __alignx(32, s);
    __alignx(32, r);
    __alignx(32, q);

    /* start fetching the next element while this one is processed */
    __prefetch_by_load(S + ix + 1);
    __prefetch_by_load(R + ix + 1);
    __prefetch_by_stream(1, Q + ix + 1);

    /* x* <- contents of R[ix] */
    x0 = vec_ld(0, r);
    x1 = vec_ld(0, r+4);
    x2 = vec_ld(0, r+8);
    x3 = vec_ld(0, r+12);
    x4 = vec_ld(0, r+16);
    x5 = vec_ld(0, r+20);

    /* y* <- contents of S[ix] */
    y0 = vec_ld(0, s);
    y1 = vec_ld(0, s+4);
    y2 = vec_ld(0, s+8);
    y3 = vec_ld(0, s+12);
    y4 = vec_ld(0, s+16);
    y5 = vec_ld(0, s+20);

    /* z* <- x* - y* */
    z0 = vec_sub(x0, y0);
    z1 = vec_sub(x1, y1);
    z2 = vec_sub(x2, y2);
    z3 = vec_sub(x3, y3);
    z4 = vec_sub(x4, y4);
    z5 = vec_sub(x5, y5);

    /* store the result into Q[ix] */
    vec_st(z0, 0, q);
    vec_st(z1, 0, q+4);
    vec_st(z2, 0, q+8);
    vec_st(z3, 0, q+12);
    vec_st(z4, 0, q+16);
    vec_st(z5, 0, q+20);
  }

#ifdef OMP
  } /* OpenMP parallel closing brace */
#endif

  return;
}