Example #1
double	vector_ps_double (const double* pa,const double* pb,size_t n)
{
    size_t k;
    /* process 4 doubles per iteration */
    size_t q = n / 4;
    size_t r = n % 4;
    double w;
    _mm_prefetch (pa,_MM_HINT_NTA);
    _mm_prefetch (pb,_MM_HINT_NTA);
    if (q > 0) {
	__m128d acc1 = _mm_setzero_pd();
	__m128d acc2 = _mm_setzero_pd();
	if (ALGEBRA_IS_ALIGNED(pa) && ALGEBRA_IS_ALIGNED(pb)) {
	    for (k=0;k<q;k++) {
		/* Load 2+2 doubles from each array */
		__m128d i1 = _mm_load_pd(pa);
		__m128d j1 = _mm_load_pd(pb);
		__m128d i2 = _mm_load_pd(pa+2);
		__m128d j2 = _mm_load_pd(pb+2);
		/* 4 doubles are consumed from each array per iteration (2 per load) */
		/* Multiply */
		__m128d s1 = _mm_mul_pd(i1,j1);
		__m128d s2 = _mm_mul_pd(i2,j2);
		pa += 4;
		pb += 4;
		/* Accumulate */
		acc1 = _mm_add_pd(acc1,s1);
		acc2 = _mm_add_pd(acc2,s2);
	    }
	}
	else {
	    for (k=0;k<q;k++) {
		/* Load 2+2 doubles from each array */
		__m128d i1 = _mm_loadu_pd(pa);
		__m128d j1 = _mm_loadu_pd(pb);
		__m128d i2 = _mm_loadu_pd(pa+2);
		__m128d j2 = _mm_loadu_pd(pb+2);
		/* Multiply */
		__m128d s1 = _mm_mul_pd(i1,j1);
		__m128d s2 = _mm_mul_pd(i2,j2);
		pa += 4;
		pb += 4;
		/* Accumulate */
		acc1 = _mm_add_pd(acc1,s1);
		acc2 = _mm_add_pd(acc2,s2);
	    }
	}
	/* Final horizontal sum */
	acc1 = _mm_add_pd(acc1,acc2);
	acc1 = _mm_hadd_pd(acc1,acc1);
	_mm_store_sd(&w,acc1);
    }
    else {
	w = 0;
    }
    for (k=0;k<r;k++)
	w += (*pa++) * (*pb++);
    return w;
}
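
Note: the ALGEBRA_IS_ALIGNED macro used throughout these examples is not shown here. A minimal sketch of what it presumably tests (16-byte alignment, as required by _mm_load_pd and _mm_load_si128); the library's actual definition may differ:

#include <stdint.h>
#define ALGEBRA_IS_ALIGNED(p) ((((uintptr_t)(p)) & 15) == 0)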
Example #2
int	vector_ps_short (const short* pa,const short* pb,size_t n)
{
    size_t k;
    size_t q = n / 16;
    size_t r = n % 16;
    int w;
    if (q > 0) {
	__m128i acc1 = _mm_setzero_si128();
	__m128i acc2 = _mm_setzero_si128();
	if (ALGEBRA_IS_ALIGNED(pa) && ALGEBRA_IS_ALIGNED(pb)) {
	    for (k=0;k<q;k++) {
		/* Load 16 shorts from each array (8+8) */
		__m128i a1 = _mm_load_si128((__m128i*)pa);
		__m128i b1 = _mm_load_si128((__m128i*)pb);
		__m128i a2 = _mm_load_si128((__m128i*)(pa+8));
		__m128i b2 = _mm_load_si128((__m128i*)(pb+8));
		/* Multiply, add adjacent pairs, widen to 32-bit */
		__m128i s1 = _mm_madd_epi16(a1,b1);
		__m128i s2 = _mm_madd_epi16(a2,b2);
		pa += 16;
		pb += 16;
		/* Accumulate */
		acc1 = _mm_add_epi32(acc1,s1);
		acc2 = _mm_add_epi32(acc2,s2);
	    }
	}
	else {
	    for (k=0;k<q;k++) {
		/* Load 16 shorts from each array (8+8) */
		__m128i a1 = _mm_loadu_si128((__m128i*)pa);
		__m128i b1 = _mm_loadu_si128((__m128i*)pb);
		__m128i a2 = _mm_loadu_si128((__m128i*)(pa+8));
		__m128i b2 = _mm_loadu_si128((__m128i*)(pb+8));
		/* Multiply, add adjacent pairs, widen to 32-bit */
		__m128i s1 = _mm_madd_epi16(a1,b1);
		__m128i s2 = _mm_madd_epi16(a2,b2);
		pa += 16;
		pb += 16;
		/* Accumulate */
		acc1 = _mm_add_epi32(acc1,s1);
		acc2 = _mm_add_epi32(acc2,s2);
	    }
	}
	/* Final horizontal sum */
	acc1 = _mm_add_epi32(acc1,acc2);
	acc1 = _mm_hadd_epi32(acc1,acc1);
	acc1 = _mm_hadd_epi32(acc1,acc1);
	w = _mm_extract_epi32(acc1,0);
    }
    else {
	w = 0;
    }
    for (k=0;k<r;k++)
	w += (*pa++) * (*pb++);
    return w;
}
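
For reference, _mm_madd_epi16 multiplies eight pairs of signed 16-bit lanes and adds adjacent products into four 32-bit lanes, so no separate widening step is needed. A scalar sketch of what one output lane holds (illustrative helper, not part of the library):

#include <stdint.h>
#include <stddef.h>
/* value of 32-bit output lane i of _mm_madd_epi16 on vectors loaded from a and b */
int32_t madd_epi16_lane (const int16_t* a, const int16_t* b, size_t i)
{
    return (int32_t)a[2*i] * b[2*i] + (int32_t)a[2*i+1] * b[2*i+1];
}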
Example #3
ALGEBRA_INLINE void		vector_addm_double_aligned_32 (double* v1,double lambda,const double* v2,size_t n)
{
	size_t k;
	
	__m128d l1 = _mm_load1_pd(&lambda);

	size_t q = n / 2;
	size_t r = n % 2;
	if(q > 0) {
		if (ALGEBRA_IS_ALIGNED(v1) && ALGEBRA_IS_ALIGNED(v2)) {
			for (k=0;k<q;k++) {
				/* Load 2 values from each array */
				__m128d i1 = _mm_load_pd(v1);
				__m128d j1 = _mm_load_pd(v2);
				/* Multiply */
				j1 = _mm_mul_pd(j1, l1);
				/* Add */
				i1 = _mm_add_pd(i1,j1);
				/* Store */
				_mm_store_pd(v1, i1);
				v1 += 2;
				v2 += 2;
			}
		}
		else {		
			for (k=0;k<q;k++) {
				/* Load 2 values from each array */
				__m128d i1 = _mm_loadu_pd(v1);
				__m128d j1 = _mm_loadu_pd(v2);
				/* Multiply */
				j1 = _mm_mul_pd(j1, l1);
				/* Add */
				i1 = _mm_add_pd(i1,j1);
				/* Store */
				_mm_storeu_pd(v1, i1);
				v1 += 2;
				v2 += 2;
			}
		}
	}
	
	for(k = 0 ; k<r ; k++)
		v1[k] += lambda*v2[k];

}
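
A plain scalar version of the same update (v1 += lambda * v2) is useful as a reference when validating the SSE path; the helper name is illustrative:

#include <stddef.h>
void vector_addm_double_scalar (double* v1, double lambda, const double* v2, size_t n)
{
    size_t k;
    for (k = 0; k < n; k++)
        v1[k] += lambda * v2[k];
}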
Example #4
void	matrix_CpAAt_float (float* C,const float* A,size_t n,size_t p)
{
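    /* C += A * A^T : for each of the p columns of A (each of length n, column k
       starting at A + k*n), add the rank-1 update A(:,k) * A(:,k)^T to C,
       walking C column by column */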
    size_t i,j,k;
    size_t q = n / 8;
    size_t r = n % 8;

    for (k=0;k<p;k++) {
        float* pC = C;
        for (j=0;j<n;j++) {
            __m128 w = _mm_load1_ps (A+j+k*n);
            const float* pA = A+k*n;
            if (ALGEBRA_IS_ALIGNED(pA) && ALGEBRA_IS_ALIGNED(pC)) {
                for (i=0;i<q;i++) {
                    __m128 i1 = _mm_load_ps(pA);
                    __m128 i2 = _mm_load_ps(pA+4);
                    __m128 o1 = _mm_load_ps(pC);
                    __m128 o2 = _mm_load_ps(pC+4);
                    _mm_store_ps(pC+0,_mm_add_ps(o1,_mm_mul_ps(i1,w)));
                    _mm_store_ps(pC+4,_mm_add_ps(o2,_mm_mul_ps(i2,w)));
                    pA += 8;
                    pC += 8;
                }
            }
            else {
                for (i=0;i<q;i++) {
                    __m128 i1 = _mm_loadu_ps(pA);
                    __m128 i2 = _mm_loadu_ps(pA+4);
                    __m128 o1 = _mm_loadu_ps(pC);
                    __m128 o2 = _mm_loadu_ps(pC+4);
                    _mm_storeu_ps(pC+0,_mm_add_ps(o1,_mm_mul_ps(i1,w)));
                    _mm_storeu_ps(pC+4,_mm_add_ps(o2,_mm_mul_ps(i2,w)));
                    pA += 8;
                    pC += 8;
                }
            }
            for (i=0;i<r;i++) {
                (*pC++) += A[j+k*n]*(*pA++);
            }
        }
    }
}
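
The indexing above implies A is n-by-p stored column by column (column k starts at A + k*n) and C is n-by-n. A straightforward scalar formulation of the same update, for reference only:

#include <stddef.h>
void matrix_CpAAt_float_scalar (float* C, const float* A, size_t n, size_t p)
{
    size_t i, j, k;
    for (k = 0; k < p; k++)
        for (j = 0; j < n; j++)
            for (i = 0; i < n; i++)
                C[i + j*n] += A[i + k*n] * A[j + k*n];
}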
Example #5
ALGEBRA_INLINE double	vector_ps_double (const double* pa,const double* pb,size_t n) {
    if(ALGEBRA_IS_ALIGNED(pa) && ALGEBRA_IS_ALIGNED(pb)) {
        size_t q = n/4;
        size_t r = n%4;
        double w = 0;

        if(q>0) {
            __m256d acc = _mm256_setzero_pd();
            __m256d i1 = _mm256_load_pd(pa);
            __m256d j1 = _mm256_load_pd(pb);
            pa += 4;
            pb += 4;
            __m256d s = _mm256_mul_pd(i1, j1);
            acc = _mm256_add_pd(acc, s);

            while(--q != 0) {
                // load
                i1 = _mm256_load_pd(pa);
                j1 = _mm256_load_pd(pb);
                pa += 4;
                pb += 4;
                // Multiply
                s = _mm256_mul_pd(i1, j1);
                // Accumulate
                acc = _mm256_add_pd(acc, s);            
            }
            // Final horizontal sum:
            // add within each 128-bit lane
            acc = _mm256_hadd_pd(acc, acc);
            // swap high and low 128-bit halves
            __m256d accp = _mm256_permute2f128_pd(acc, acc, 1);
            // add vertically
            acc = _mm256_add_pd(acc, accp);
            // extract the low double
            _mm_store_sd(&w,  _mm256_extractf128_pd(acc,0));
        }
        return w + vector_ps_double_basic(pa, pb, r);
    }
    return vector_ps_double_basic(pa, pb, n);
}
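
vector_ps_double_basic is called above but not shown; it is presumably the plain scalar dot product used for the tail elements and for the unaligned case. A minimal sketch:

#include <stddef.h>
double vector_ps_double_basic (const double* pa, const double* pb, size_t n)
{
    double w = 0;
    size_t k;
    for (k = 0; k < n; k++)
        w += pa[k] * pb[k];
    return w;
}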
Example #6
float	vector_cos_short (const short* pa,const short* pb,size_t n)
{
    size_t k;
    double norm;
    size_t q = n / 16;
    size_t r = n % 16;
    int ps,na,nb;
    if (q > 0) {
        __m128i acc;
	__m128i acc_ps1 = _mm_setzero_si128();
	__m128i acc_ps2 = _mm_setzero_si128();
	__m128i acc_na1 = _mm_setzero_si128();
	__m128i acc_na2 = _mm_setzero_si128();
	__m128i acc_nb1 = _mm_setzero_si128();
	__m128i acc_nb2 = _mm_setzero_si128();
	if (ALGEBRA_IS_ALIGNED(pa) && ALGEBRA_IS_ALIGNED(pb)) {
	    for (k=0;k<q;k++) {
		/* Load 16 shorts from each array (8+8) */
		__m128i a1 = _mm_load_si128((__m128i*)pa);
		__m128i b1 = _mm_load_si128((__m128i*)pb);
		__m128i a2 = _mm_load_si128((__m128i*)(pa+8));
		__m128i b2 = _mm_load_si128((__m128i*)(pb+8));
		/* Multiply, add adjacent pairs, widen to 32-bit */
		__m128i ps1 = _mm_madd_epi16(a1,b1);
		__m128i ps2 = _mm_madd_epi16(a2,b2);
		__m128i na1 = _mm_madd_epi16(a1,a1);
		__m128i na2 = _mm_madd_epi16(a2,a2);
		__m128i nb1 = _mm_madd_epi16(b1,b1);
		__m128i nb2 = _mm_madd_epi16(b2,b2);
		pa += 16;
		pb += 16;
		/* Accumulate */
		acc_ps1 = _mm_add_epi32(acc_ps1,ps1);
		acc_ps2 = _mm_add_epi32(acc_ps2,ps2);
		acc_na1 = _mm_add_epi32(acc_na1,na1);
		acc_na2 = _mm_add_epi32(acc_na2,na2);
		acc_nb1 = _mm_add_epi32(acc_nb1,nb1);
		acc_nb2 = _mm_add_epi32(acc_nb2,nb2);
	    }
	}
	else {
	    for (k=0;k<q;k++) {
		/* Load 16 shorts from each array (8+8), unaligned */
		__m128i a1 = _mm_loadu_si128((__m128i*)pa);
		__m128i b1 = _mm_loadu_si128((__m128i*)pb);
		__m128i a2 = _mm_loadu_si128((__m128i*)(pa+8));
		__m128i b2 = _mm_loadu_si128((__m128i*)(pb+8));
		/* Multiply, add adjacent pairs, widen to 32-bit */
		__m128i ps1 = _mm_madd_epi16(a1,b1);
		__m128i ps2 = _mm_madd_epi16(a2,b2);
		__m128i na1 = _mm_madd_epi16(a1,a1);
		__m128i na2 = _mm_madd_epi16(a2,a2);
		__m128i nb1 = _mm_madd_epi16(b1,b1);
		__m128i nb2 = _mm_madd_epi16(b2,b2);
		pa += 16;
		pb += 16;
		/* Accumulate */
		acc_ps1 = _mm_add_epi32(acc_ps1,ps1);
		acc_ps2 = _mm_add_epi32(acc_ps2,ps2);
		acc_na1 = _mm_add_epi32(acc_na1,na1);
		acc_na2 = _mm_add_epi32(acc_na2,na2);
		acc_nb1 = _mm_add_epi32(acc_nb1,nb1);
		acc_nb2 = _mm_add_epi32(acc_nb2,nb2);
	    }
	}
	/* Final horizontal sums */
	acc = _mm_add_epi32(acc_ps1,acc_ps2);
	acc = _mm_hadd_epi32(acc,acc);
	acc = _mm_hadd_epi32(acc,acc);
	ps = _mm_extract_epi32(acc,0);

	acc = _mm_add_epi32(acc_na1,acc_na2);
	acc = _mm_hadd_epi32(acc,acc);
	acc = _mm_hadd_epi32(acc,acc);
	na = _mm_extract_epi32(acc,0);

	acc = _mm_add_epi32(acc_nb1,acc_nb2);
	acc = _mm_hadd_epi32(acc,acc);
	acc = _mm_hadd_epi32(acc,acc);
	nb = _mm_extract_epi32(acc,0);
    }
    else {
	ps = 0;
	na = 0;
	nb = 0;
    }
    for (k=0;k<r;k++) {
	int a = *pa++;
	int b = *pb++;
	ps += a*b;
	na += a*a;
	nb += b*b;
    }
    norm = sqrt( ((double)na) * ((double)nb) );
    if (norm < 1E-5f)
	return 0;
    return ps / norm;
}
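
A scalar version of the same cosine computation is handy for cross-checking the SIMD path; the helper name is illustrative:

#include <stddef.h>
#include <math.h>
float vector_cos_short_scalar (const short* pa, const short* pb, size_t n)
{
    int ps = 0, na = 0, nb = 0;
    size_t k;
    double norm;
    for (k = 0; k < n; k++) {
        int a = pa[k];
        int b = pb[k];
        ps += a*b;
        na += a*a;
        nb += b*b;
    }
    norm = sqrt(((double)na) * ((double)nb));
    if (norm < 1E-5)
        return 0;
    return (float)(ps / norm);
}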
Example #7
ALGEBRA_INLINE void		vector_addm_double_aligned_32 (double* v1,double lambda,const double* v2,size_t n)
{
	size_t k;
	
	__m256d l1 = _mm256_broadcast_sd(&lambda);
	__m256d l2 = _mm256_broadcast_sd(&lambda);
	__m256d l3 = _mm256_broadcast_sd(&lambda);
	__m256d l4 = _mm256_broadcast_sd(&lambda);

	size_t q = n / 16;
	size_t r = n % 16;
	if(q > 0) {
		if (ALGEBRA_IS_ALIGNED(v1) && ALGEBRA_IS_ALIGNED(v2)) {
			for (k=0;k<q;k++) {
				/* Load 16 doubles from each array (4x4) */
				__m256d i1 = _mm256_load_pd(v1);
				__m256d j1 = _mm256_load_pd(v2);
				__m256d i2 = _mm256_load_pd(v1+4);
				__m256d j2 = _mm256_load_pd(v2+4);
				__m256d i3 = _mm256_load_pd(v1+8);
				__m256d j3 = _mm256_load_pd(v2+8);
				__m256d i4 = _mm256_load_pd(v1+12);
				__m256d j4 = _mm256_load_pd(v2+12);
				/* Multiply */
				j1 = _mm256_mul_pd(j1, l1);
				j2 = _mm256_mul_pd(j2, l2);
				j3 = _mm256_mul_pd(j3, l3);
				j4 = _mm256_mul_pd(j4, l4);
				/* Add */
				i1 = _mm256_add_pd(i1,j1);
				i2 = _mm256_add_pd(i2,j2);
				i3 = _mm256_add_pd(i3,j3);
				i4 = _mm256_add_pd(i4,j4);
				/* Store */
				_mm256_store_pd(v1, i1);
				_mm256_store_pd(v1+4, i2);
				_mm256_store_pd(v1+8, i3);
				_mm256_store_pd(v1+12, i4);
				v1 += 16;
				v2 += 16;
			}
		}
		else {		
			for (k=0;k<q;k++) {
				/* Load 16 doubles from each array (4x4) */
				__m256d i1 = _mm256_loadu_pd(v1);
				__m256d j1 = _mm256_loadu_pd(v2);
				__m256d i2 = _mm256_loadu_pd(v1+4);
				__m256d j2 = _mm256_loadu_pd(v2+4);
				__m256d i3 = _mm256_loadu_pd(v1+8);
				__m256d j3 = _mm256_loadu_pd(v2+8);
				__m256d i4 = _mm256_loadu_pd(v1+12);
				__m256d j4 = _mm256_loadu_pd(v2+12);
				/* Multiply */
				j1 = _mm256_mul_pd(j1, l1);
				j2 = _mm256_mul_pd(j2, l2);
				j3 = _mm256_mul_pd(j3, l3);
				j4 = _mm256_mul_pd(j4, l4);
				/* Add */
				i1 = _mm256_add_pd(i1,j1);
				i2 = _mm256_add_pd(i2,j2);
				i3 = _mm256_add_pd(i3,j3);
				i4 = _mm256_add_pd(i4,j4);
				/* Store */
				_mm256_storeu_pd(v1, i1);
				_mm256_storeu_pd(v1+4, i2);
				_mm256_storeu_pd(v1+8, i3);
				_mm256_storeu_pd(v1+12, i4);
				v1 += 16;
				v2 += 16;
			}
		}
	}
	
	for(k = 0 ; k<r ; k++)
		v1[k] += lambda*v2[k];
}
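
Caution: _mm256_load_pd and _mm256_store_pd require 32-byte alignment. If ALGEBRA_IS_ALIGNED tests only 16-byte alignment (as sketched after Example #1), a pointer that is 16- but not 32-byte aligned would take the aligned branch here and fault. A stricter check for the AVX paths might look like this (an assumption, not the library's definition):

#include <stdint.h>
#define ALGEBRA_IS_ALIGNED_32(p) ((((uintptr_t)(p)) & 31) == 0)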