double vector_ps_double (const double* pa,const double* pb,size_t n) {
    /* Dot product of two double vectors (SSE2/SSE3 path).
     * Processes 4 doubles per iteration with two independent 2-wide
     * accumulators to break the add dependency chain, then finishes
     * the remaining 0-3 elements in scalar code. */
    size_t blocks = n / 4;   /* number of 4-wide SIMD iterations */
    size_t tail   = n % 4;   /* leftover elements, done scalar    */
    size_t idx;
    double result;

    /* Non-temporal prefetch: the vectors are streamed once. */
    _mm_prefetch (pa,_MM_HINT_NTA);
    _mm_prefetch (pb,_MM_HINT_NTA);

    if (blocks == 0) {
        result = 0;
    } else {
        __m128d sum_lo = _mm_setzero_pd();
        __m128d sum_hi = _mm_setzero_pd();
        if (ALGEBRA_IS_ALIGNED(pa) && ALGEBRA_IS_ALIGNED(pb)) {
            for (idx = 0; idx < blocks; idx++) {
                /* Aligned loads: 2 doubles per register, 4 per step. */
                __m128d a_lo = _mm_load_pd(pa);
                __m128d b_lo = _mm_load_pd(pb);
                __m128d a_hi = _mm_load_pd(pa+2);
                __m128d b_hi = _mm_load_pd(pb+2);
                pa += 4;
                pb += 4;
                sum_lo = _mm_add_pd(sum_lo,_mm_mul_pd(a_lo,b_lo));
                sum_hi = _mm_add_pd(sum_hi,_mm_mul_pd(a_hi,b_hi));
            }
        } else {
            for (idx = 0; idx < blocks; idx++) {
                /* Unaligned loads, otherwise identical. */
                __m128d a_lo = _mm_loadu_pd(pa);
                __m128d b_lo = _mm_loadu_pd(pb);
                __m128d a_hi = _mm_loadu_pd(pa+2);
                __m128d b_hi = _mm_loadu_pd(pb+2);
                pa += 4;
                pb += 4;
                sum_lo = _mm_add_pd(sum_lo,_mm_mul_pd(a_lo,b_lo));
                sum_hi = _mm_add_pd(sum_hi,_mm_mul_pd(a_hi,b_hi));
            }
        }
        /* Reduction: merge accumulators, then fold the two lanes
         * horizontally (SSE3) and spill the low lane. */
        sum_lo = _mm_add_pd(sum_lo,sum_hi);
        sum_lo = _mm_hadd_pd(sum_lo,sum_lo);
        _mm_store_sd(&result,sum_lo);
    }
    /* Scalar tail: pa/pb already point past the SIMD-consumed part. */
    for (idx = 0; idx < tail; idx++)
        result += (*pa++) * (*pb++);
    return result;
}
int vector_ps_short (const short* pa,const short* pb,size_t n) {
    /* Dot product of two short vectors, accumulated in 32-bit int.
     * _mm_madd_epi16 multiplies 16-bit pairs and sums adjacent
     * products into 32-bit lanes, so each SIMD step consumes 16
     * shorts per input. Tail elements are handled in scalar code. */
    size_t steps = n / 16;
    size_t tail  = n % 16;
    size_t idx;
    int result = 0;
    if (steps > 0) {
        __m128i sum_a = _mm_setzero_si128();
        __m128i sum_b = _mm_setzero_si128();
        if (ALGEBRA_IS_ALIGNED(pa) && ALGEBRA_IS_ALIGNED(pb)) {
            for (idx = 0; idx < steps; idx++) {
                /* Aligned: load 16 shorts from each input. */
                __m128i a1 = _mm_load_si128((__m128i*)pa);
                __m128i b1 = _mm_load_si128((__m128i*)pb);
                __m128i a2 = _mm_load_si128((__m128i*)(pa+8));
                __m128i b2 = _mm_load_si128((__m128i*)(pb+8));
                pa += 16;
                pb += 16;
                /* Multiply-add pairs into 32-bit partial sums. */
                sum_a = _mm_add_epi32(sum_a,_mm_madd_epi16(a1,b1));
                sum_b = _mm_add_epi32(sum_b,_mm_madd_epi16(a2,b2));
            }
        } else {
            for (idx = 0; idx < steps; idx++) {
                /* Unaligned variant of the loop above. */
                __m128i a1 = _mm_loadu_si128((__m128i*)pa);
                __m128i b1 = _mm_loadu_si128((__m128i*)pb);
                __m128i a2 = _mm_loadu_si128((__m128i*)(pa+8));
                __m128i b2 = _mm_loadu_si128((__m128i*)(pb+8));
                pa += 16;
                pb += 16;
                sum_a = _mm_add_epi32(sum_a,_mm_madd_epi16(a1,b1));
                sum_b = _mm_add_epi32(sum_b,_mm_madd_epi16(a2,b2));
            }
        }
        /* Reduce the four 32-bit lanes to one scalar (SSSE3 hadd,
         * SSE4.1 extract). */
        sum_a = _mm_add_epi32(sum_a,sum_b);
        sum_a = _mm_hadd_epi32(sum_a,sum_a);
        sum_a = _mm_hadd_epi32(sum_a,sum_a);
        result = _mm_extract_epi32(sum_a,0);
    }
    /* Scalar tail over the remaining n % 16 elements. */
    for (idx = 0; idx < tail; idx++)
        result += (*pa++) * (*pb++);
    return result;
}
ALGEBRA_INLINE void vector_addm_double_aligned_32 (double* v1,double lambda,const double* v2,size_t n) {
    /* In-place AXPY: v1[i] += lambda * v2[i] (SSE2 path, 2 doubles
     * per iteration, scalar tail for odd n). */
    __m128d scale = _mm_load1_pd(&lambda);   /* lambda in both lanes */
    size_t pairs = n / 2;
    size_t rest  = n % 2;
    size_t idx;
    if (pairs > 0) {
        if (ALGEBRA_IS_ALIGNED(v1) && ALGEBRA_IS_ALIGNED(v2)) {
            for (idx = 0; idx < pairs; idx++) {
                /* Load 2 doubles from each vector, fuse scale+add,
                 * store back into v1. */
                __m128d dst = _mm_load_pd(v1);
                __m128d src = _mm_load_pd(v2);
                dst = _mm_add_pd(dst,_mm_mul_pd(src,scale));
                _mm_store_pd(v1,dst);
                v1 += 2;
                v2 += 2;
            }
        } else {
            for (idx = 0; idx < pairs; idx++) {
                /* Unaligned variant of the same update. */
                __m128d dst = _mm_loadu_pd(v1);
                __m128d src = _mm_loadu_pd(v2);
                dst = _mm_add_pd(dst,_mm_mul_pd(src,scale));
                _mm_storeu_pd(v1,dst);
                v1 += 2;
                v2 += 2;
            }
        }
    }
    /* At most one leftover element. */
    for (idx = 0; idx < rest; idx++)
        v1[idx] += lambda*v2[idx];
}
/* C += A * A^T, accumulated one rank-1 update per k.
 * Indexing A[i + k*n] suggests A is stored as p contiguous columns of
 * length n and C is a dense n*n matrix traversed linearly via pC —
 * NOTE(review): assumed from the access pattern, confirm with callers.
 * SSE path processes 8 floats per step; remainder handled in scalar. */
void matrix_CpAAt_float (float* C,const float* A,size_t n,size_t p) {
    size_t i,j,k;
    size_t q = n / 8;   /* 8-wide SIMD iterations per column of C */
    size_t r = n % 8;   /* scalar remainder per column            */
    for (k=0;k<p;k++) {
        /* pC restarts at C for each rank-1 update and sweeps the whole
         * n*n buffer across the j loop (n elements per j). */
        float* pC = C;
        for (j=0;j<n;j++) {
            /* Broadcast A[j + k*n]: the scalar multiplier for this
             * column of the update. */
            __m128 w = _mm_load1_ps (A+j+k*n);
            const float* pA = A+k*n;   /* start of column k of A */
            /* Alignment can differ per (j,k) since pC advances by n
             * per j; re-test each time. */
            if (ALGEBRA_IS_ALIGNED(pA) && ALGEBRA_IS_ALIGNED(pC)) {
                for (i=0;i<q;i++) {
                    __m128 i1 = _mm_load_ps(pA);
                    __m128 i2 = _mm_load_ps(pA+4);
                    __m128 o1 = _mm_load_ps(pC);
                    __m128 o2 = _mm_load_ps(pC+4);
                    /* pC[0..7] += w * pA[0..7] */
                    _mm_store_ps(pC+0,_mm_add_ps(o1,_mm_mul_ps(i1,w)));
                    _mm_store_ps(pC+4,_mm_add_ps(o2,_mm_mul_ps(i2,w)));
                    pA += 8;
                    pC += 8;
                }
            } else {
                for (i=0;i<q;i++) {
                    /* Unaligned variant of the block above. */
                    __m128 i1 = _mm_loadu_ps(pA);
                    __m128 i2 = _mm_loadu_ps(pA+4);
                    __m128 o1 = _mm_loadu_ps(pC);
                    __m128 o2 = _mm_loadu_ps(pC+4);
                    _mm_storeu_ps(pC+0,_mm_add_ps(o1,_mm_mul_ps(i1,w)));
                    _mm_storeu_ps(pC+4,_mm_add_ps(o2,_mm_mul_ps(i2,w)));
                    pA += 8;
                    pC += 8;
                }
            }
            /* Scalar remainder: pA/pC already advanced by 8*q. */
            for (i=0;i<r;i++) {
                (*pC++) += A[j+k*n]*(*pA++);
            }
        }
    }
}
ALGEBRA_INLINE double vector_ps_double (const double* pa,const double* pb,size_t n) {
    /* Dot product (AVX path): 4 doubles per iteration when both
     * inputs are aligned; everything else is delegated to the scalar
     * helper vector_ps_double_basic. */
    if (ALGEBRA_IS_ALIGNED(pa) && ALGEBRA_IS_ALIGNED(pb)) {
        size_t quads = n / 4;
        size_t tail  = n % 4;
        double head = 0;
        if (quads > 0) {
            __m256d acc = _mm256_setzero_pd();
            size_t step;
            for (step = 0; step < quads; step++) {
                /* Load 4 doubles from each vector and accumulate the
                 * products. */
                __m256d va = _mm256_load_pd(pa);
                __m256d vb = _mm256_load_pd(pb);
                pa += 4;
                pb += 4;
                acc = _mm256_add_pd(acc,_mm256_mul_pd(va,vb));
            }
            /* Horizontal reduction: hadd folds within each 128-bit
             * half, the permute swaps the halves, and the final add
             * leaves the full sum in every lane. */
            acc = _mm256_hadd_pd(acc,acc);
            {
                __m256d swapped = _mm256_permute2f128_pd(acc,acc,1);
                acc = _mm256_add_pd(acc,swapped);
            }
            _mm_store_sd(&head,_mm256_extractf128_pd(acc,0));
        }
        /* pa/pb now point at the tail; finish it in scalar code. */
        return head + vector_ps_double_basic(pa,pb,tail);
    }
    return vector_ps_double_basic(pa,pb,n);
}
/* Cosine similarity of two short vectors:
 *   ps / sqrt(na * nb)  with ps = <a,b>, na = <a,a>, nb = <b,b>.
 * Returns 0 when the norm product is (near) zero.
 * SIMD path consumes 16 shorts per step via _mm_madd_epi16; the
 * remaining n % 16 elements are accumulated in scalar code.
 *
 * BUG FIX: the unaligned branch previously had an EMPTY loop body —
 * it neither accumulated anything nor advanced pa/pb, so unaligned
 * inputs with n >= 16 produced a similarity computed from only the
 * first n % 16 elements. It now mirrors the aligned branch with
 * _mm_loadu_si128. */
float vector_cos_short (const short* pa,const short* pb,size_t n) {
    size_t k;
    double norm;
    size_t q = n / 16;
    size_t r = n % 16;
    int ps,na,nb;
    if (q > 0) {
        __m128i acc;
        /* Two accumulators per quantity to break dependency chains. */
        __m128i acc_ps1 = _mm_setzero_si128();
        __m128i acc_ps2 = _mm_setzero_si128();
        __m128i acc_na1 = _mm_setzero_si128();
        __m128i acc_na2 = _mm_setzero_si128();
        __m128i acc_nb1 = _mm_setzero_si128();
        __m128i acc_nb2 = _mm_setzero_si128();
        if (ALGEBRA_IS_ALIGNED(pa) && ALGEBRA_IS_ALIGNED(pb)) {
            for (k=0;k<q;k++) {
                /* Load 16 shorts from each input. */
                __m128i a1 = _mm_load_si128((__m128i*)pa);
                __m128i b1 = _mm_load_si128((__m128i*)pb);
                __m128i a2 = _mm_load_si128((__m128i*)(pa+8));
                __m128i b2 = _mm_load_si128((__m128i*)(pb+8));
                /* Multiply-add 16-bit pairs into 32-bit lanes. */
                __m128i ps1 = _mm_madd_epi16(a1,b1);
                __m128i ps2 = _mm_madd_epi16(a2,b2);
                __m128i na1 = _mm_madd_epi16(a1,a1);
                __m128i na2 = _mm_madd_epi16(a2,a2);
                __m128i nb1 = _mm_madd_epi16(b1,b1);
                __m128i nb2 = _mm_madd_epi16(b2,b2);
                pa += 16;
                pb += 16;
                acc_ps1 = _mm_add_epi32(acc_ps1,ps1);
                acc_ps2 = _mm_add_epi32(acc_ps2,ps2);
                acc_na1 = _mm_add_epi32(acc_na1,na1);
                acc_na2 = _mm_add_epi32(acc_na2,na2);
                acc_nb1 = _mm_add_epi32(acc_nb1,nb1);
                acc_nb2 = _mm_add_epi32(acc_nb2,nb2);
            }
        } else {
            for (k=0;k<q;k++) {
                /* Unaligned variant — previously missing entirely. */
                __m128i a1 = _mm_loadu_si128((__m128i*)pa);
                __m128i b1 = _mm_loadu_si128((__m128i*)pb);
                __m128i a2 = _mm_loadu_si128((__m128i*)(pa+8));
                __m128i b2 = _mm_loadu_si128((__m128i*)(pb+8));
                __m128i ps1 = _mm_madd_epi16(a1,b1);
                __m128i ps2 = _mm_madd_epi16(a2,b2);
                __m128i na1 = _mm_madd_epi16(a1,a1);
                __m128i na2 = _mm_madd_epi16(a2,a2);
                __m128i nb1 = _mm_madd_epi16(b1,b1);
                __m128i nb2 = _mm_madd_epi16(b2,b2);
                pa += 16;
                pb += 16;
                acc_ps1 = _mm_add_epi32(acc_ps1,ps1);
                acc_ps2 = _mm_add_epi32(acc_ps2,ps2);
                acc_na1 = _mm_add_epi32(acc_na1,na1);
                acc_na2 = _mm_add_epi32(acc_na2,na2);
                acc_nb1 = _mm_add_epi32(acc_nb1,nb1);
                acc_nb2 = _mm_add_epi32(acc_nb2,nb2);
            }
        }
        /* Reduce each pair of accumulators to a scalar. */
        acc = _mm_add_epi32(acc_ps1,acc_ps2);
        acc = _mm_hadd_epi32(acc,acc);
        acc = _mm_hadd_epi32(acc,acc);
        ps = _mm_extract_epi32(acc,0);
        acc = _mm_add_epi32(acc_na1,acc_na2);
        acc = _mm_hadd_epi32(acc,acc);
        acc = _mm_hadd_epi32(acc,acc);
        na = _mm_extract_epi32(acc,0);
        acc = _mm_add_epi32(acc_nb1,acc_nb2);
        acc = _mm_hadd_epi32(acc,acc);
        acc = _mm_hadd_epi32(acc,acc);
        nb = _mm_extract_epi32(acc,0);
    }
    else {
        ps = 0;
        na = 0;
        nb = 0;
    }
    /* Scalar tail: pa/pb point past the SIMD-consumed elements. */
    for (k=0;k<r;k++) {
        int a = *pa++;
        int b = *pb++;
        ps += a*b;
        na += a*a;
        nb += b*b;
    }
    norm = sqrt( ((double)na) * ((double)nb) );
    if (norm < 1E-5f)
        return 0;
    return ps / norm;
}
/* In-place AXPY: v1[i] += lambda * v2[i] (AVX path).
 * Processes 16 doubles per iteration (four 256-bit registers) with a
 * scalar tail for the remaining n % 16 elements.
 *
 * Cleanup: the original broadcast lambda into FOUR identical
 * registers (l1..l4); a single broadcast register is reused for all
 * four lanes — behavior is unchanged. */
ALGEBRA_INLINE void vector_addm_double_aligned_32 (double* v1,double lambda,const double* v2,size_t n) {
    size_t k;
    __m256d l = _mm256_broadcast_sd(&lambda);   /* lambda in all 4 lanes */
    size_t q = n / 16;
    size_t r = n % 16;
    if(q > 0) {
        if (ALGEBRA_IS_ALIGNED(v1) && ALGEBRA_IS_ALIGNED(v2)) {
            for (k=0;k<q;k++) {
                /* Load 4x4 doubles from each vector. */
                __m256d i1 = _mm256_load_pd(v1);
                __m256d j1 = _mm256_load_pd(v2);
                __m256d i2 = _mm256_load_pd(v1+4);
                __m256d j2 = _mm256_load_pd(v2+4);
                __m256d i3 = _mm256_load_pd(v1+8);
                __m256d j3 = _mm256_load_pd(v2+8);
                __m256d i4 = _mm256_load_pd(v1+12);
                __m256d j4 = _mm256_load_pd(v2+12);
                /* Scale v2 chunks by lambda. */
                j1 = _mm256_mul_pd(j1, l);
                j2 = _mm256_mul_pd(j2, l);
                j3 = _mm256_mul_pd(j3, l);
                j4 = _mm256_mul_pd(j4, l);
                /* Accumulate into v1 chunks. */
                i1 = _mm256_add_pd(i1,j1);
                i2 = _mm256_add_pd(i2,j2);
                i3 = _mm256_add_pd(i3,j3);
                i4 = _mm256_add_pd(i4,j4);
                /* Store back. */
                _mm256_store_pd(v1, i1);
                _mm256_store_pd(v1+4, i2);
                _mm256_store_pd(v1+8, i3);
                _mm256_store_pd(v1+12, i4);
                v1 += 16;
                v2 += 16;
            }
        }
        else {
            for (k=0;k<q;k++) {
                /* Unaligned variant of the loop above. */
                __m256d i1 = _mm256_loadu_pd(v1);
                __m256d j1 = _mm256_loadu_pd(v2);
                __m256d i2 = _mm256_loadu_pd(v1+4);
                __m256d j2 = _mm256_loadu_pd(v2+4);
                __m256d i3 = _mm256_loadu_pd(v1+8);
                __m256d j3 = _mm256_loadu_pd(v2+8);
                __m256d i4 = _mm256_loadu_pd(v1+12);
                __m256d j4 = _mm256_loadu_pd(v2+12);
                j1 = _mm256_mul_pd(j1, l);
                j2 = _mm256_mul_pd(j2, l);
                j3 = _mm256_mul_pd(j3, l);
                j4 = _mm256_mul_pd(j4, l);
                i1 = _mm256_add_pd(i1,j1);
                i2 = _mm256_add_pd(i2,j2);
                i3 = _mm256_add_pd(i3,j3);
                i4 = _mm256_add_pd(i4,j4);
                _mm256_storeu_pd(v1, i1);
                _mm256_storeu_pd(v1+4, i2);
                _mm256_storeu_pd(v1+8, i3);
                _mm256_storeu_pd(v1+12, i4);
                v1 += 16;
                v2 += 16;
            }
        }
    }
    /* Scalar tail over the remaining n % 16 elements. */
    for(k = 0 ; k<r ; k++)
        v1[k] += lambda*v2[k];
}