예제 #1
0
/* Transpose matrix B to both:
 *
 * - increase cache hits (B is then traversed row-wise)
 * - enable the GCC SIMD vector extensions, which the transposition
 *   makes possible, increasing the likelihood of SIMD use.
 *
 * Note that GCC 6 at -O3 is smart enough to use SIMD
 * even for the naive CPU method. However this was still way faster.
 * */
/* C = A * B for n x n row-major matrices, vectorized with GCC vector
 * extensions. cache is accepted for signature compatibility but unused. */
void mat_mul_cpu_trans_vec(const F *A, const F *B, F *C, size_t n, Cache *cache) {
    F tmpf;
    size_t i, j, k, k_max, ai, bi;
    Vec tmp, a, b;
    UNUSED(cache);

    /* NOTE(review): casts away const and transposes B in place; the second
     * mat_trans at the end restores it. This temporarily mutates the
     * caller's "const" B -- UB if B actually refers to a const-qualified
     * object, and unsafe if other threads read B concurrently. */
    mat_trans((F*)B, n);
    /* Largest multiple of VECTOR_NELEMS <= n: trip bound of the vector loop. */
    k_max = (n / VECTOR_NELEMS) * VECTOR_NELEMS;
    for (i = 0; i < n; ++i) {
        for (j = 0; j < n; ++j) {
            vec_zero(&tmp, VECTOR_NELEMS);
            /* Vectorized partial dot product, VECTOR_NELEMS elements per
             * step. After the transpose, row j of B holds original column j,
             * so both operands are read contiguously. */
            for (k = 0; k < k_max; k += VECTOR_NELEMS) {
                ai = i * n + k;
                bi = j * n + k;
                vec_load(&a, VECTOR_NELEMS, A, ai);
                vec_load(&b, VECTOR_NELEMS, B, bi);
                tmp += a * b;
            }
            /* Scalar tail: the n % VECTOR_NELEMS leftover elements. */
            tmpf = 0.0;
            for (; k < n; ++k) {
                tmpf += A[i*n+k] * B[j*n+k];
            }
            /* Horizontal sum of the vector accumulator plus the scalar tail. */
            C[i*n+j] = vec_sum(tmp, VECTOR_NELEMS) + tmpf;
        }
    }
    /* Restore B to its original (untransposed) layout for the caller. */
    mat_trans((F*)B, n);
}
예제 #2
0
파일: altivec-1.c 프로젝트: aosm/gcc3
/* Exercise the vector helpers: integer add and average, then float add,
 * storing each result and comparing it against a precomputed expected
 * array. The operands (a1, a2, f1, f2), result buffers (a3, f3),
 * expected arrays (addi, avgi, addf) and the variables i, j, k, f, g, h
 * are presumably declared at file scope -- not visible in this chunk.
 *
 * Fix: give main an explicit int return type. Implicit int was removed
 * in C99 and is rejected by modern compilers. */
int
main ()
{
  /* Integer lane-wise add: a3 = a1 + a2, checked against addi. */
  i = vec_load (a1);
  j = vec_load (a2);
  k = vec_add_int4 (i, j);
  vec_store (a3, k);
  compare_int4 (a3, addi);

  /* Integer lane-wise average: a3 = avg(a1, a2), checked against avgi. */
  k = vec_average_int4 (i, j);
  vec_store (a3, k);
  compare_int4 (a3, avgi);

  /* Float lane-wise add: f3 = f1 + f2, checked against addf. */
  f = (float4) vec_load (f1);
  g = (float4) vec_load (f2);
  h = vec_add_float4 (f, g);
  vec_store (f3, h);
  compare_float4 (f3, addf);

  /* exit(0) rather than return: signals success and never returns. */
  exit (0);
}