/* Multiply A * B into C (all n x n, row-major) using a transposed B.
 *
 * B is transposed in place before the main loops and transposed back
 * before returning, so both operands are walked contiguously in the
 * inner loop. This improves cache hit rates and lets the GCC
 * vector-extension arithmetic below vectorize the dot product.
 * (GCC 6 at -O3 can auto-vectorize even the naive method, but this
 * version was still measurably faster.)
 *
 * NOTE(review): mat_trans((F*)B, n) writes through a const-qualified
 * pointer. B is restored before return, but callers must not pass a B
 * object that was *defined* const, and concurrent readers of B would
 * observe the transposed state — confirm callers are aware.
 */
void mat_mul_cpu_trans_vec(const F *A, const F *B, F *C, size_t n, Cache *cache) {
    size_t row, col, idx, vec_end;
    Vec acc, va, vb;
    F tail;

    UNUSED(cache);

    /* Transpose B so the inner loop reads it row-wise. */
    mat_trans((F*)B, n);

    /* Largest index reachable in whole VECTOR_NELEMS-sized steps. */
    vec_end = (n / VECTOR_NELEMS) * VECTOR_NELEMS;

    for (row = 0; row < n; ++row) {
        for (col = 0; col < n; ++col) {
            /* Vectorized partial dot product over the aligned prefix. */
            vec_zero(&acc, VECTOR_NELEMS);
            for (idx = 0; idx < vec_end; idx += VECTOR_NELEMS) {
                vec_load(&va, VECTOR_NELEMS, A, row * n + idx);
                vec_load(&vb, VECTOR_NELEMS, B, col * n + idx);
                acc += va * vb;
            }

            /* Scalar remainder when n is not a multiple of VECTOR_NELEMS. */
            tail = 0.0;
            for (; idx < n; ++idx) {
                tail += A[row*n + idx] * B[col*n + idx];
            }

            C[row*n + col] = vec_sum(acc, VECTOR_NELEMS) + tail;
        }
    }

    /* Undo the transposition so B is unchanged for the caller. */
    mat_trans((F*)B, n);
}
main () { i = vec_load (a1); j = vec_load (a2); k = vec_add_int4 (i, j); vec_store (a3, k); compare_int4 (a3, addi); k = vec_average_int4 (i, j); vec_store (a3, k); compare_int4 (a3, avgi); f = (float4) vec_load (f1); g = (float4) vec_load (f2); h = vec_add_float4 (f, g); vec_store (f3, h); compare_float4 (f3, addf); exit (0); }