Example #1
#include <emmintrin.h>  /* SSE2 intrinsics */

static void
scalarmultiply_f64_ns_sse2_unroll2 (double *dest, double *src1, double *val, int n)
{
  __m128d xmm1;

  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ * *val;
  }
  /* Broadcast the scalar into both lanes of an SSE register */
  xmm1 = _mm_load_pd1(val);
  /* Main loop: 4 doubles per iteration (two 2-wide SSE operations);
     loads from src1 are unaligned, stores to the now-aligned dest are aligned */
  for (; n >= 4; n -= 4) {
    __m128d xmm0;
    xmm0 = _mm_loadu_pd(src1);
    xmm0 = _mm_mul_pd(xmm0, xmm1);
    _mm_store_pd(dest, xmm0);
    xmm0 = _mm_loadu_pd(src1 + 2);
    xmm0 = _mm_mul_pd(xmm0, xmm1);
    _mm_store_pd(dest + 2, xmm0);
    dest += 4;
    src1 += 4;
  }
  /* Scalar tail loop for the remaining 0-3 elements */
  for (; n > 0; n--) {
    *dest++ = *src1++ * *val;
  }
}
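
A minimal caller for the routine above might look as follows; the buffer contents and size are illustrative only and are not part of the original example:

#include <stdio.h>

int main (void)
{
  double src[7]  = { 1, 2, 3, 4, 5, 6, 7 };
  double dest[7];
  double factor  = 2.5;

  /* No special alignment is required of the caller: the prologue loop
     inside the routine advances dest to a 16-byte boundary first. */
  scalarmultiply_f64_ns_sse2_unroll2 (dest, src, &factor, 7);

  for (int i = 0; i < 7; i++)
    printf ("dest[%d] = %g\n", i, dest[i]);
  return 0;
}
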
  template <>
  inline void sse_micro_kernel<double>(double const *buffer_A, double const *buffer_B, double *buffer_C,
                                       vcl_size_t num_micro_slivers, vcl_size_t mr, vcl_size_t nr)
  {
    assert( (mr == MR_D) && (nr == NR_D) && bool("mr and nr obtained by 'get_block_sizes()' in 'matrix_operations.hpp' and given to 'sse_micro_kernel()' do not match with MR_D/NR_D defined in 'gemm_sse_micro_kernel.hpp'") );

    __m128d xmm0 , xmm1 , xmm2 , xmm3 ;
    __m128d xmm4 , xmm5 , xmm6 , xmm7 ;
    __m128d xmm8 , xmm9 , xmm10, xmm11;
    __m128d xmm12, xmm13, xmm14, xmm15;

    vcl_size_t l;
    
    /* Each iteration accumulates one rank-1 update ("micro-sliver"):
       a 6x1 packed column of A times a 1x4 packed row of B, added to
       the 6x4 micro-tile held in buffer_C. */
    for (l=0; l<num_micro_slivers; ++l)
    {
      /* load the 1x4 row of B as two 2-wide SSE registers */
      xmm0 = _mm_load_pd(buffer_B+l*NR_D);
      xmm1 = _mm_load_pd(buffer_B+l*NR_D+2);

      /* broadcast one entry of A and multiply it with both halves of the B row */
      xmm2 = _mm_load_pd1(buffer_A+l*MR_D);
      xmm3 = _mm_mul_pd(xmm0, xmm2);
      xmm4 = _mm_mul_pd(xmm1, xmm2);
      
      xmm2 = _mm_load_pd1(buffer_A+l*MR_D+1);
      xmm5 = _mm_mul_pd(xmm0, xmm2);
      xmm6 = _mm_mul_pd(xmm1, xmm2);

      xmm2 = _mm_load_pd1(buffer_A+l*MR_D+2);
      xmm7 = _mm_mul_pd(xmm0, xmm2);
      xmm8 = _mm_mul_pd(xmm1, xmm2);

      xmm2  = _mm_load_pd1(buffer_A+l*MR_D+3);
      xmm9  = _mm_mul_pd(xmm0, xmm2);
      xmm10 = _mm_mul_pd(xmm1, xmm2);

      xmm2  = _mm_load_pd1(buffer_A+l*MR_D+4);
      xmm11 = _mm_mul_pd(xmm0, xmm2);
      xmm12 = _mm_mul_pd(xmm1, xmm2);

      xmm2  = _mm_load_pd1(buffer_A+l*MR_D+5);
      xmm13 = _mm_mul_pd(xmm0, xmm2);
      xmm14 = _mm_mul_pd(xmm1, xmm2);
    
      /* accumulate: load each half-row of the C micro-tile, add the
         corresponding products, and store it back */
      xmm15 = _mm_load_pd(buffer_C+C0_ROW_D(0));
      xmm15 = _mm_add_pd(xmm15, xmm3);
      _mm_store_pd(buffer_C+C0_ROW_D(0), xmm15);
      
      xmm15 = _mm_load_pd(buffer_C+C1_ROW_D(0));
      xmm15 = _mm_add_pd(xmm15, xmm4);
      _mm_store_pd(buffer_C+C1_ROW_D(0), xmm15);

      xmm15 = _mm_load_pd(buffer_C+C0_ROW_D(1));
      xmm15 = _mm_add_pd(xmm15, xmm5);
      _mm_store_pd(buffer_C+C0_ROW_D(1), xmm15);

      xmm15 = _mm_load_pd(buffer_C+C1_ROW_D(1));
      xmm15 = _mm_add_pd(xmm15, xmm6);
      _mm_store_pd(buffer_C+C1_ROW_D(1), xmm15);

      xmm15 = _mm_load_pd(buffer_C+C0_ROW_D(2));
      xmm15 = _mm_add_pd(xmm15, xmm7);
      _mm_store_pd(buffer_C+C0_ROW_D(2), xmm15);
      
      xmm15 = _mm_load_pd(buffer_C+C1_ROW_D(2));
      xmm15 = _mm_add_pd(xmm15, xmm8);
      _mm_store_pd(buffer_C+C1_ROW_D(2), xmm15);

      xmm15 = _mm_load_pd(buffer_C+C0_ROW_D(3));
      xmm15 = _mm_add_pd(xmm15, xmm9);
      _mm_store_pd(buffer_C+C0_ROW_D(3), xmm15);

      xmm15 = _mm_load_pd(buffer_C+C1_ROW_D(3));
      xmm15 = _mm_add_pd(xmm15, xmm10);
      _mm_store_pd(buffer_C+C1_ROW_D(3), xmm15);

      xmm15 = _mm_load_pd(buffer_C+C0_ROW_D(4));
      xmm15 = _mm_add_pd(xmm15, xmm11);
      _mm_store_pd(buffer_C+C0_ROW_D(4), xmm15);

      xmm15 = _mm_load_pd(buffer_C+C1_ROW_D(4));
      xmm15 = _mm_add_pd(xmm15, xmm12);
      _mm_store_pd(buffer_C+C1_ROW_D(4), xmm15);

      xmm15 = _mm_load_pd(buffer_C+C0_ROW_D(5));
      xmm15 = _mm_add_pd(xmm15, xmm13);
      _mm_store_pd(buffer_C+C0_ROW_D(5), xmm15);

      xmm15 = _mm_load_pd(buffer_C+C1_ROW_D(5));
      xmm15 = _mm_add_pd(xmm15, xmm14);
      _mm_store_pd(buffer_C+C1_ROW_D(5), xmm15);

    }//for
  }//sse_micro_kernel<double>
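
The kernel above depends on MR_D, NR_D, the C0_ROW_D/C1_ROW_D indexing macros, the vcl_size_t typedef, and the primary sse_micro_kernel template from the surrounding headers (ViennaCL-style, judging by vcl_size_t), none of which appear in this excerpt. The following is a minimal sketch of definitions consistent with the 6x4 row-major micro-tile accessed above; the exact names and values in the real headers may differ, so treat it as an assumption rather than the original source:

  /* Assumed context for the excerpt above -- not taken from the original file. */
  #include <cassert>
  #include <cstddef>
  #include <emmintrin.h>

  typedef std::size_t vcl_size_t;        /* ViennaCL's size type                    */

  #define MR_D 6                         /* rows of the double-precision micro-tile */
  #define NR_D 4                         /* columns of the micro-tile               */
  #define C0_ROW_D(i) ((i) * NR_D)       /* columns 0-1 of row i in buffer_C        */
  #define C1_ROW_D(i) ((i) * NR_D + 2)   /* columns 2-3 of row i in buffer_C        */

  /* Primary template, specialized for double above */
  template <typename NumericT>
  inline void sse_micro_kernel(NumericT const *buffer_A, NumericT const *buffer_B, NumericT *buffer_C,
                               vcl_size_t num_micro_slivers, vcl_size_t mr, vcl_size_t nr);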