#include <emmintrin.h>  /* SSE2 intrinsics */

static void
scalarmultiply_f64_ns_sse2_unroll2 (double *dest, double *src1, double *val,
    int n)
{
  __m128d xmm1;

  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ * *val;
  }
  /* Broadcast the scalar into both lanes of an SSE2 register */
  xmm1 = _mm_load_pd1(val);
  /* Unrolled-by-2 main loop: two packed multiplies, four doubles per iteration */
  for (; n >= 4; n -= 4) {
    __m128d xmm0;
    xmm0 = _mm_loadu_pd(src1);
    xmm0 = _mm_mul_pd(xmm0, xmm1);
    _mm_store_pd(dest, xmm0);
    xmm0 = _mm_loadu_pd(src1 + 2);
    xmm0 = _mm_mul_pd(xmm0, xmm1);
    _mm_store_pd(dest + 2, xmm0);
    dest += 4;
    src1 += 4;
  }
  /* Scalar tail for the remaining elements */
  for (; n > 0; n--) {
    *dest++ = *src1++ * *val;
  }
}
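A minimal calling sketch for the kernel above (the buffer size and scale factor are illustrative, not taken from the original): the prologue aligns dest itself and src1 is read with unaligned loads, so the caller needs no special allocation.

/* Hypothetical usage example; sizes and values are illustrative only. */
int main (void)
{
  double src[1000], dst[1000];
  double factor = 2.5;

  for (int i = 0; i < 1000; i++)
    src[i] = (double) i;

  /* dst[i] = src[i] * factor for all 1000 elements */
  scalarmultiply_f64_ns_sse2_unroll2 (dst, src, &factor, 1000);
  return 0;
}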
inline void sse_micro_kernel<double>(double const *buffer_A,
                                     double const *buffer_B,
                                     double *buffer_C,
                                     vcl_size_t num_micro_slivers,
                                     vcl_size_t mr, vcl_size_t nr)
{
  assert( (mr == MR_D) && (nr == NR_D) &&
          bool("mr and nr obtained by 'get_block_sizes()' in 'matrix_operations.hpp' and given to 'sse_micro_kernel()' do not match with MR_D/NR_D defined in 'gemm_avx_micro_kernel.hpp' ") );

  __m128d xmm0 , xmm1 , xmm2 , xmm3 ;
  __m128d xmm4 , xmm5 , xmm6 , xmm7 ;
  __m128d xmm8 , xmm9 , xmm10, xmm11;
  __m128d xmm12, xmm13, xmm14, xmm15;

  vcl_size_t l;
  for (l=0; l<num_micro_slivers; ++l)
  {
    /* load one sliver of B (NR_D = 4 doubles) into two registers */
    xmm0 = _mm_load_pd(buffer_B+l*NR_D);
    xmm1 = _mm_load_pd(buffer_B+l*NR_D+2);

    /* broadcast each of the MR_D entries of the A sliver and multiply with B */
    xmm2  = _mm_load_pd1(buffer_A+l*MR_D);
    xmm3  = _mm_mul_pd(xmm0, xmm2);
    xmm4  = _mm_mul_pd(xmm1, xmm2);

    xmm2  = _mm_load_pd1(buffer_A+l*MR_D+1);
    xmm5  = _mm_mul_pd(xmm0, xmm2);
    xmm6  = _mm_mul_pd(xmm1, xmm2);

    xmm2  = _mm_load_pd1(buffer_A+l*MR_D+2);
    xmm7  = _mm_mul_pd(xmm0, xmm2);
    xmm8  = _mm_mul_pd(xmm1, xmm2);

    xmm2  = _mm_load_pd1(buffer_A+l*MR_D+3);
    xmm9  = _mm_mul_pd(xmm0, xmm2);
    xmm10 = _mm_mul_pd(xmm1, xmm2);

    xmm2  = _mm_load_pd1(buffer_A+l*MR_D+4);
    xmm11 = _mm_mul_pd(xmm0, xmm2);
    xmm12 = _mm_mul_pd(xmm1, xmm2);

    xmm2  = _mm_load_pd1(buffer_A+l*MR_D+5);
    xmm13 = _mm_mul_pd(xmm0, xmm2);
    xmm14 = _mm_mul_pd(xmm1, xmm2);

    /* store new entries: accumulate the products into the C micro-tile */
    xmm15 = _mm_load_pd(buffer_C+C0_ROW_D(0));
    xmm15 = _mm_add_pd(xmm15, xmm3);
    _mm_store_pd(buffer_C+C0_ROW_D(0), xmm15);

    xmm15 = _mm_load_pd(buffer_C+C1_ROW_D(0));
    xmm15 = _mm_add_pd(xmm15, xmm4);
    _mm_store_pd(buffer_C+C1_ROW_D(0), xmm15);

    xmm15 = _mm_load_pd(buffer_C+C0_ROW_D(1));
    xmm15 = _mm_add_pd(xmm15, xmm5);
    _mm_store_pd(buffer_C+C0_ROW_D(1), xmm15);

    xmm15 = _mm_load_pd(buffer_C+C1_ROW_D(1));
    xmm15 = _mm_add_pd(xmm15, xmm6);
    _mm_store_pd(buffer_C+C1_ROW_D(1), xmm15);

    xmm15 = _mm_load_pd(buffer_C+C0_ROW_D(2));
    xmm15 = _mm_add_pd(xmm15, xmm7);
    _mm_store_pd(buffer_C+C0_ROW_D(2), xmm15);

    xmm15 = _mm_load_pd(buffer_C+C1_ROW_D(2));
    xmm15 = _mm_add_pd(xmm15, xmm8);
    _mm_store_pd(buffer_C+C1_ROW_D(2), xmm15);

    xmm15 = _mm_load_pd(buffer_C+C0_ROW_D(3));
    xmm15 = _mm_add_pd(xmm15, xmm9);
    _mm_store_pd(buffer_C+C0_ROW_D(3), xmm15);

    xmm15 = _mm_load_pd(buffer_C+C1_ROW_D(3));
    xmm15 = _mm_add_pd(xmm15, xmm10);
    _mm_store_pd(buffer_C+C1_ROW_D(3), xmm15);

    xmm15 = _mm_load_pd(buffer_C+C0_ROW_D(4));
    xmm15 = _mm_add_pd(xmm15, xmm11);
    _mm_store_pd(buffer_C+C0_ROW_D(4), xmm15);

    xmm15 = _mm_load_pd(buffer_C+C1_ROW_D(4));
    xmm15 = _mm_add_pd(xmm15, xmm12);
    _mm_store_pd(buffer_C+C1_ROW_D(4), xmm15);

    xmm15 = _mm_load_pd(buffer_C+C0_ROW_D(5));
    xmm15 = _mm_add_pd(xmm15, xmm13);
    _mm_store_pd(buffer_C+C0_ROW_D(5), xmm15);

    xmm15 = _mm_load_pd(buffer_C+C1_ROW_D(5));
    xmm15 = _mm_add_pd(xmm15, xmm14);
    _mm_store_pd(buffer_C+C1_ROW_D(5), xmm15);
  }//for
}//sse_micro_kernel<double>
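The tile constants and indexing macros used above are defined elsewhere (the assert message points at the kernel header). From the access pattern, the micro-tile is MR_D x NR_D = 6 x 4 doubles, and each loop iteration accumulates the outer product of a 6-element sliver of A with a 4-element sliver of B into the C tile. The following is only a plausible reconstruction of those definitions, assuming the C micro-tile is stored row-major with stride NR_D; the authoritative values live in the accompanying header.

/* Hypothetical reconstruction of the tile constants and indexing macros
 * assumed by the kernel above; values inferred from the access pattern,
 * not copied from the original header. */
#define MR_D 6
#define NR_D 4
#define C0_ROW_D(i) ((i) * NR_D)      /* columns 0..1 of row i of the C tile */
#define C1_ROW_D(i) ((i) * NR_D + 2)  /* columns 2..3 of row i of the C tile */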