Example #1
#include <assert.h>
#include <math.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>    /* arc4random() on BSD/macOS */
#include <sys/time.h>
#include <immintrin.h>

int main() {

	double rand0 = (double) arc4random();

	if (!__builtin_cpu_supports("avx")) {
		printf("No AVX, skipping test");
		return 0;
	}

	__m256d y0 = _mm256_set_pd(rand0,	1,	-123,	123);
	__m256d y1 = _mm256_set_pd(-1,	rand0,	-121,	121);
	__m256d y2 = _mm256_set_pd(1233.0,	-0.1,	rand0,	1);
	__m256d y3 = _mm256_set_pd(0,		23.0,	-1,	rand0);

	__m256d result_first = _mm256_sub_pd(y0, y1);
	__m256d result_second = _mm256_sub_pd(y2, y3);

	printf("Gonna give the OS opportunity to trash my registers...\n");

	struct timeval start, now;
	int delta;

	(void)gettimeofday(&start, NULL);

	do {
		(void)gettimeofday(&now, NULL);
		delta = now.tv_sec - start.tv_sec;
		sched_yield();
	} while (delta < 3);


	printf("Here's the values I got\n");

	printf("rand0: %lf\n", rand0);
	double* first = (double*)&result_first;
	printf("first %lf\t	%lf\t	%lf\t	%lf\n", first[3], first[2], first[1], first[0]);

	double* second = (double*)&result_second;
	printf("second %lf\t	%lf\t	%lf\t	%lf\n", second[3], second[2], second[1], second[0]);

	printf("Making sure that calculating them by hand gets the same result\n");

#define ALMOST_EQ(A,B) assert(fabs((A) - (B)) < 0.1)

	ALMOST_EQ(first[3], rand0 - (-1));
	ALMOST_EQ(first[2], 1 - rand0);
	ALMOST_EQ(first[1], -123 - (-121));
	ALMOST_EQ(first[0], 123 - 121);

	ALMOST_EQ(second[3], 1233.0 - (0));
	ALMOST_EQ(second[2], -0.1 - 23.0);
	ALMOST_EQ(second[1], rand0 - (-1));
	ALMOST_EQ(second[0], 1 - rand0);

	printf("Yep!\n");


	return 0;
}
Example #2
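// Scans rows [startY, endY) of a width x height image, skipping the requested
// borders, and returns the value of the strongest peak while writing its
// position into x and y; AllowNegativeComponent (defined outside this snippet)
// makes the comparison use absolute values.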
boost::optional<double> SimpleClean::FindPeakAVX(const double *image, size_t width, size_t height, size_t& x, size_t& y, size_t startY, size_t endY, size_t horizontalBorder, size_t verticalBorder)
{
	double peakMax = std::numeric_limits<double>::min();
	size_t peakIndex = 0;
	
	__m256d mPeakMax = _mm256_set1_pd(peakMax);
	
	size_t xiStart = horizontalBorder, xiEnd = width - horizontalBorder;
	size_t yiStart = std::max(startY, verticalBorder), yiEnd = std::min(endY, height - verticalBorder);
	if(xiEnd < xiStart) xiEnd = xiStart;
	if(yiEnd < yiStart) yiEnd = yiStart;
	
	for(size_t yi=yiStart; yi!=yiEnd; ++yi)
	{
		size_t index = yi*width + xiStart;
		const double* const endPtr = image + yi*width + xiEnd - 4;
		const double *i=image + index;
		for(; i<endPtr; i+=4)
		{
			__m256d val = _mm256_loadu_pd(i);
			if(AllowNegativeComponent) {
				__m256d negVal = _mm256_sub_pd(_mm256_set1_pd(0.0), val);
				val = _mm256_max_pd(val, negVal);
			}
			int mask = _mm256_movemask_pd(_mm256_cmp_pd(val, mPeakMax, _CMP_GT_OQ));
			if(mask != 0)
			{
				for(size_t di=0; di!=4; ++di)
				{
					double value = i[di];
					if(AllowNegativeComponent) value = std::fabs(value);
					if(value > peakMax)
					{
						peakIndex = index+di;
						peakMax = std::fabs(i[di]);
						mPeakMax = _mm256_set1_pd(peakMax);
					}
				}
			}
			index+=4;
		}
		for(; i!=endPtr+4; ++i)
		{
			double value = *i;
			if(AllowNegativeComponent) value = std::fabs(value);
			if(value > peakMax)
			{
				peakIndex = index;
				peakMax = std::fabs(*i);
			}
			++index;
		}
	}
	x = peakIndex % width;
	y = peakIndex / width;
	return image[x + y*width];
}
Example #3
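// With this indexing (a[k*n+i], b[k*n+j]) the update is
// c[i*n+j] -= sum_k a[k*n+i] * b[k*n+j] for j = 0..31, i.e. the kernel
// effectively assumes n == 32 and subtracts the product from c in place.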
extern "C" void product32x32_avx(double *a, double *b, double *c, int n) 
{
    for(int i=0; i<n; i++) {	
		__m256d t1 = _mm256_loadu_pd(&c[i*n +  0]);
		__m256d t2 = _mm256_loadu_pd(&c[i*n +  4]);
		__m256d t3 = _mm256_loadu_pd(&c[i*n +  8]);
		__m256d t4 = _mm256_loadu_pd(&c[i*n + 12]);
		__m256d t5 = _mm256_loadu_pd(&c[i*n + 16]);
		__m256d t6 = _mm256_loadu_pd(&c[i*n + 20]);
		__m256d t7 = _mm256_loadu_pd(&c[i*n + 24]);
		__m256d t8 = _mm256_loadu_pd(&c[i*n + 28]);
		for(int k=0; k<n; k++) {
			__m256d a1 = _mm256_set1_pd(a[k*n+i]);
			
			__m256d b1 = _mm256_loadu_pd(&b[k*n+0]);
			t1 = _mm256_sub_pd(t1,_mm256_mul_pd(a1,b1));
			
			__m256d b2 = _mm256_loadu_pd(&b[k*n+4]);
			t2 = _mm256_sub_pd(t2,_mm256_mul_pd(a1,b2));

			__m256d b3 = _mm256_loadu_pd(&b[k*n+8]);
			t3 = _mm256_sub_pd(t3,_mm256_mul_pd(a1,b3));

			__m256d b4 = _mm256_loadu_pd(&b[k*n+12]);
			t4 = _mm256_sub_pd(t4,_mm256_mul_pd(a1,b4));

			__m256d b5 = _mm256_loadu_pd(&b[k*n+16]);
			t5 = _mm256_sub_pd(t5,_mm256_mul_pd(a1,b5));

			__m256d b6 = _mm256_loadu_pd(&b[k*n+20]);
			t6 = _mm256_sub_pd(t6,_mm256_mul_pd(a1,b6));

			__m256d b7 = _mm256_loadu_pd(&b[k*n+24]);
			t7 = _mm256_sub_pd(t7,_mm256_mul_pd(a1,b7));

			__m256d b8 = _mm256_loadu_pd(&b[k*n+28]);
			t8 = _mm256_sub_pd(t8,_mm256_mul_pd(a1,b8));
		}
		_mm256_storeu_pd(&c[i*n +  0], t1);
		_mm256_storeu_pd(&c[i*n +  4], t2);
		_mm256_storeu_pd(&c[i*n +  8], t3);
		_mm256_storeu_pd(&c[i*n + 12], t4);
		_mm256_storeu_pd(&c[i*n + 16], t5);
		_mm256_storeu_pd(&c[i*n + 20], t6);
		_mm256_storeu_pd(&c[i*n + 24], t7);
		_mm256_storeu_pd(&c[i*n + 28], t8);
	}
}
Example #4
double HodgkinHuxley::dV(double *V, double I) {
	const double C = 1.0;
	const double gNa = 120.0;
	const double gK = 36.0;
	const double gL = 0.3;
	const double ENa = 50.0;
	const double EK = -77.0;
	const double EL = -54.4;
#ifdef __AVX__
/*
AVX is an instruction set from Intel that operates on 4 doubles at once.
This path seems to be slower than the optimized FPU code, though.
*/
	double Va[] __attribute__ ((aligned (32))) = {V[0], V[0], V[0], 1.0},
		   Ea[] __attribute__ ((aligned (32))) = {EL, ENa, EK, 0.0},
		   Ga[] __attribute__ ((aligned (32))) = {-gL, -gNa * pow(V[2], 3.0) * V[3], -gK * pow(V[1], 4.0), I};
	
	// load V
	__m256d Vr = _mm256_load_pd(Va);
	// load E
	__m256d Er = _mm256_load_pd(Ea);
	// load G
	__m256d Gr = _mm256_load_pd(Ga);
	// subtract
	Vr = _mm256_sub_pd(Vr, Er);
	// dot product (why does intel not have _mm256_dp_pd ?)
	Vr = _mm256_mul_pd(Vr, Gr);
	__m256d temp = _mm256_hadd_pd(Vr, Vr);
	__m128d lo128 = _mm256_extractf128_pd(temp, 0);
	__m128d hi128 = _mm256_extractf128_pd(temp, 1);
	__m128d dotproduct = _mm_add_pd(lo128, hi128);
	
	double sseVal;
	// store
	_mm_storel_pd(&sseVal, dotproduct);
	sseVal /= C;
		
	return sseVal;
#else
	return (-gL * (V[0] - EL) - gNa * pow(V[2], 3.0) * V[3] * (V[0] - ENa)
		- gK * pow(V[1], 4.0) * (V[0] - EK) + I) / C;
#endif
}
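With V[0] = V, V[1] = n, V[2] = m, V[3] = h (as the gating-variable powers suggest), both the AVX path and the scalar fallback evaluate the Hodgkin-Huxley current-balance equation:

\[ \frac{dV}{dt} \;=\; \frac{-g_L\,(V - E_L) \;-\; g_{Na}\, m^3 h\,(V - E_{Na}) \;-\; g_K\, n^4\,(V - E_K) \;+\; I}{C}. \]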
Example #5
    div(avx_simd_complex_double<T> lhs, avx_simd_complex_double<T> rhs) {
        //lhs = [x1.real, x1.img, x2.real, x2.img]
        //rhs = [y1.real, y1.img, y2.real, y2.img]

        //ymm0 = [y1.real, y1.real, y2.real, y2.real]
        __m256d ymm0 = _mm256_movedup_pd(rhs.value);

        //ymm1 = [y1.imag, y1.imag, y2.imag, y2.imag]
        __m256d ymm1 = _mm256_permute_pd(rhs.value, 0b1111);

        //ymm2 = [x1.img, x1.real, x2.img, x2.real]
        __m256d ymm2 = _mm256_permute_pd(lhs.value, 0b0101);

        //ymm4 = [x.img * y.img, x.real * y.img]
        __m256d ymm4 = _mm256_mul_pd(ymm2, ymm1);

        //ymm5 = subadd((lhs * ymm0), ymm4)

#ifdef __FMA__
        __m256d ymm5 = _mm256_fmsubadd_pd(lhs.value, ymm0, ymm4);
#else
        __m256d t1   = _mm256_mul_pd(lhs.value, ymm0);
        __m256d t2   = _mm256_sub_pd(_mm256_set1_pd(0.0), ymm4);
        __m256d ymm5 = _mm256_addsub_pd(t1, t2);
#endif

        //ymm3 = [y.imag^2, y.imag^2]
        __m256d ymm3 = _mm256_mul_pd(ymm1, ymm1);

        //ymm0 = (ymm0 * ymm0 + ymm3)

#ifdef __FMA__
        ymm0 = _mm256_fmadd_pd(ymm0, ymm0, ymm3);
#else
        __m256d t3   = _mm256_mul_pd(ymm0, ymm0);
        ymm0         = _mm256_add_pd(t3, ymm3);
#endif

        //result = ymm5 / ymm0
        return _mm256_div_pd(ymm5, ymm0);
    }
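A minimal standalone sanity check, assuming the same [real, imag, real, imag] packing as the comments above describe (the names here are illustrative, not from the original code); it reproduces the non-FMA sequence on a bare __m256d and compares against std::complex division:

#include <immintrin.h>
#include <complex>
#include <cstdio>

// Non-FMA path of the division above, on a bare __m256d holding two complex numbers.
static __m256d complex_div_pd(__m256d lhs, __m256d rhs) {
    __m256d re  = _mm256_movedup_pd(rhs);                 // [c1, c1, c2, c2]
    __m256d im  = _mm256_permute_pd(rhs, 0b1111);         // [d1, d1, d2, d2]
    __m256d swp = _mm256_permute_pd(lhs, 0b0101);         // [b1, a1, b2, a2]
    __m256d t1  = _mm256_mul_pd(lhs, re);                 // [a*c, b*c, ...]
    __m256d t2  = _mm256_sub_pd(_mm256_set1_pd(0.0), _mm256_mul_pd(swp, im)); // [-b*d, -a*d, ...]
    __m256d num = _mm256_addsub_pd(t1, t2);               // [a*c + b*d, b*c - a*d, ...]
    __m256d den = _mm256_add_pd(_mm256_mul_pd(re, re), _mm256_mul_pd(im, im)); // [c^2 + d^2, ...]
    return _mm256_div_pd(num, den);
}

int main() {
    __m256d lhs = _mm256_setr_pd(1.0, 2.0, -3.0, 0.5);    // x1 = 1+2i,   x2 = -3+0.5i
    __m256d rhs = _mm256_setr_pd(0.5, -1.0, 2.0, 4.0);    // y1 = 0.5-1i, y2 = 2+4i
    double q[4];
    _mm256_storeu_pd(q, complex_div_pd(lhs, rhs));
    std::complex<double> r1 = std::complex<double>(1.0, 2.0) / std::complex<double>(0.5, -1.0);
    std::complex<double> r2 = std::complex<double>(-3.0, 0.5) / std::complex<double>(2.0, 4.0);
    std::printf("%f%+fi vs %f%+fi\n", q[0], q[1], r1.real(), r1.imag());
    std::printf("%f%+fi vs %f%+fi\n", q[2], q[3], r2.real(), r2.imag());
}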
Example #6
void calculate_fma_double (unsigned char * out, double X0, double Y0, double scale, unsigned YSTART, unsigned SX, unsigned SY)
{
    __m256d dd = _mm256_set1_pd (scale);
    __m256d XX0 = _mm256_set1_pd (X0);

    for (unsigned j = YSTART; j < SY; j++)	{
        __m256d y0 = _mm256_set1_pd (j*scale + Y0);
        for (unsigned i = 0; i < SX; i += 4)	{

            __m128i ind = _mm_setr_epi32 (i, i + 1, i + 2, i + 3);
            __m256d x0 = _mm256_fmadd_pd (dd, _mm256_cvtepi32_pd (ind), XX0);
            __m256d x = x0;
            __m256d y = y0;
            __m256i counts = _mm256_setzero_si256 ();
            __m256i cmp_mask = _mm256_set1_epi32 (0xFFFFFFFFu);

            for (unsigned n = 0; n < 255; n++)	{
                __m256d x2 = _mm256_mul_pd (x, x);
                __m256d y2 = _mm256_mul_pd (y, y);
                __m256d abs = _mm256_add_pd (x2, y2);
                __m256i cmp = _mm256_castpd_si256 (_mm256_cmp_pd (abs, _mm256_set1_pd (4), 1));
                cmp_mask = _mm256_and_si256 (cmp_mask, cmp);
                if (_mm256_testz_si256 (cmp_mask, cmp_mask)) {
                    break;
                }
                counts = _mm256_sub_epi64 (counts, cmp_mask);
                __m256d t = _mm256_add_pd (x, x);
                y = _mm256_fmadd_pd (t, y, y0);
                x = _mm256_add_pd (_mm256_sub_pd (x2, y2), x0);
            }
            __m256i result = _mm256_shuffle_epi8 (counts, _mm256_setr_epi8 (0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8));
            *(uint32_t*) out = _mm_extract_epi16 (_mm256_extracti128_si256 (result, 0), 0) | (_mm_extract_epi16 (_mm256_extracti128_si256 (result, 1), 0) << 16);
            out += 4;
        }
    }
}
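A minimal driver sketch for the kernel above, assuming SX is a multiple of 4, one output byte per pixel for rows YSTART..SY-1, and AVX2/FMA compile flags (e.g. -mavx2 -mfma); the concrete values are illustrative only:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    const unsigned SX = 1024, SY = 768, YSTART = 0;
    const double scale = 3.0 / SX;            /* 3.0 real units across the image */
    const double X0 = -2.0, Y0 = -1.125;      /* corner of the viewed rectangle  */

    unsigned char *img = (unsigned char *) malloc((size_t) SX * (SY - YSTART));
    if (img == NULL)
        return 1;

    calculate_fma_double(img, X0, Y0, scale, YSTART, SX, SY);
    printf("iteration count of the first pixel: %u\n", (unsigned) img[0]);

    free(img);
    return 0;
}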
Example #7
// it moves vertically across blocks
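// Accumulates 8 column dot products of A with x down the 4-row panels (panel
// stride sda); the two diagonal 4x4 blocks are masked with _mm256_blend_pd so
// only upper-triangular entries contribute, then the partial sums are reduced
// with hadd/permute and stored, added to or subtracted from y depending on alg.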
void kernel_dtrmv_u_t_8_lib4(int kmax, double *A, int sda, double *x, double *y, int alg)
	{

/*	if(kmax<=0) */
/*		return;*/
	
	const int lda = 4;
	
/*	__builtin_prefetch( A + 0*lda );*/
/*	__builtin_prefetch( A + 2*lda );*/
/*	__builtin_prefetch( A + 4*lda );*/
/*	__builtin_prefetch( A + 6*lda );*/

/*	double *tA, *tx;*/

	int k;
/*	int ka = kmax-kna; // number from aligned positon*/
	
	__m256d
		zeros,
		tmp0, tmp1, 
		a_00_10_20_30, a_01_11_21_31, a_02_12_22_32, a_03_13_23_33,
		x_0_1_2_3,
		y_00, y_11, y_22, y_33, y_44, y_55, y_66, y_77;
	
/*	__m128d*/
/*		ax_temp,*/
/*		a_00_10, a_01_11, a_02_12, a_03_13,*/
/*		x_0_1,*/
/*		y_0, y_1, y_2, y_3, y_4, y_5, y_6, y_7;*/
	
	y_00 = _mm256_setzero_pd();
	y_11 = _mm256_setzero_pd();
	y_22 = _mm256_setzero_pd();
	y_33 = _mm256_setzero_pd();
	y_44 = _mm256_setzero_pd();
	y_55 = _mm256_setzero_pd();
	y_66 = _mm256_setzero_pd();
	y_77 = _mm256_setzero_pd();
		
	k=0;
	for(; k<kmax-7; k+=8)
		{

/*		__builtin_prefetch( A + sda*lda + 0*lda );*/
/*		__builtin_prefetch( A + sda*lda + 2*lda );*/

		x_0_1_2_3 = _mm256_loadu_pd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );
		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );
	
		tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
		tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
		y_00 = _mm256_add_pd( y_00, tmp0 );
		y_11 = _mm256_add_pd( y_11, tmp1 );
		tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
		tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
		y_22 = _mm256_add_pd( y_22, tmp0 );
		y_33 = _mm256_add_pd( y_33, tmp1 );
	
/*		__builtin_prefetch( A + sda*lda + 4*lda );*/
/*		__builtin_prefetch( A + sda*lda + 6*lda );*/

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] );
		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] );
	
		tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
		tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
		y_44 = _mm256_add_pd( y_44, tmp0 );
		y_55 = _mm256_add_pd( y_55, tmp1 );
		tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
		tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
		y_66 = _mm256_add_pd( y_66, tmp0 );
		y_77 = _mm256_add_pd( y_77, tmp1 );

		A += 4 + (sda-1)*lda;
		x += 4;


/*		__builtin_prefetch( A + sda*lda + 0*lda );*/
/*		__builtin_prefetch( A + sda*lda + 2*lda );*/

		x_0_1_2_3 = _mm256_loadu_pd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );
		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );
	
		tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
		tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
		y_00 = _mm256_add_pd( y_00, tmp0 );
		y_11 = _mm256_add_pd( y_11, tmp1 );
		tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
		tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
		y_22 = _mm256_add_pd( y_22, tmp0 );
		y_33 = _mm256_add_pd( y_33, tmp1 );
	
/*		__builtin_prefetch( A + sda*lda + 4*lda );*/
/*		__builtin_prefetch( A + sda*lda + 6*lda );*/

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] );
		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] );
	
		tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
		tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
		y_44 = _mm256_add_pd( y_44, tmp0 );
		y_55 = _mm256_add_pd( y_55, tmp1 );
		tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
		tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
		y_66 = _mm256_add_pd( y_66, tmp0 );
		y_77 = _mm256_add_pd( y_77, tmp1 );

		A += 4 + (sda-1)*lda;
		x += 4;

		}
/*	for(; k<ka-3; k+=4)*/
/*		{*/

/*		__builtin_prefetch( A + sda*lda + 0*lda );*/
/*		__builtin_prefetch( A + sda*lda + 2*lda );*/

/*		x_0_1_2_3 = _mm256_loadu_pd( &x[0] );*/

/*		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );*/
/*		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );*/
/*		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );*/
/*		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );*/
/*	*/
/*		aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );*/
/*		y_00 = _mm256_add_pd( y_00, aaxx_temp );*/
/*		aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );*/
/*		y_11 = _mm256_add_pd( y_11, aaxx_temp );*/
/*		aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );*/
/*		y_22 = _mm256_add_pd( y_22, aaxx_temp );*/
/*		aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );*/
/*		y_33 = _mm256_add_pd( y_33, aaxx_temp );*/
/*	*/
/*		__builtin_prefetch( A + sda*lda + 4*lda );*/
/*		__builtin_prefetch( A + sda*lda + 6*lda );*/

/*		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] );*/
/*		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] );*/
/*		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] );*/
/*		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] );*/
/*	*/
/*		aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );*/
/*		y_44 = _mm256_add_pd( y_44, aaxx_temp );*/
/*		aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );*/
/*		y_55 = _mm256_add_pd( y_55, aaxx_temp );*/
/*		aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );*/
/*		y_66 = _mm256_add_pd( y_66, aaxx_temp );*/
/*		aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );*/
/*		y_77 = _mm256_add_pd( y_77, aaxx_temp );*/

/*		A += 4 + (sda-1)*lda;*/
/*		x += 4;*/

/*		}*/
		
	zeros = _mm256_setzero_pd();

	// top triangle
	x_0_1_2_3 = _mm256_loadu_pd( &x[0] );

	a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
	a_00_10_20_30 = _mm256_blend_pd( a_00_10_20_30, zeros, 0xe );
	a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );
	a_01_11_21_31 = _mm256_blend_pd( a_01_11_21_31, zeros, 0xc );
	a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
	a_02_12_22_32 = _mm256_blend_pd( a_02_12_22_32, zeros, 0x8 );
	a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );
	
	tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
	tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
	y_00 = _mm256_add_pd( y_00, tmp0 );
	y_11 = _mm256_add_pd( y_11, tmp1 );
	tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
	tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
	y_22 = _mm256_add_pd( y_22, tmp0 );
	y_33 = _mm256_add_pd( y_33, tmp1 );

	// top square
	a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] );
	a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] );
	a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] );
	a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] );

	tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
	tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
	y_44 = _mm256_add_pd( y_44, tmp0 );
	y_55 = _mm256_add_pd( y_55, tmp1 );
	tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
	tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
	y_66 = _mm256_add_pd( y_66, tmp0 );
	y_77 = _mm256_add_pd( y_77, tmp1 );

	A += 4 + (sda-1)*lda;
	x += 4;

	// bottom triangle
	x_0_1_2_3 = _mm256_loadu_pd( &x[0] );

	a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] );
	a_00_10_20_30 = _mm256_blend_pd( a_00_10_20_30, zeros, 0xe );
	a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] );
	a_01_11_21_31 = _mm256_blend_pd( a_01_11_21_31, zeros, 0xc );
	a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] );
	a_02_12_22_32 = _mm256_blend_pd( a_02_12_22_32, zeros, 0x8 );
	a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] );

	tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
	tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
	y_44 = _mm256_add_pd( y_44, tmp0 );
	y_55 = _mm256_add_pd( y_55, tmp1 );
	tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
	tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
	y_66 = _mm256_add_pd( y_66, tmp0 );
	y_77 = _mm256_add_pd( y_77, tmp1 );

	// store
	__m256d
		y_0_1_2_3, y_4_5_6_7;

	y_00 = _mm256_hadd_pd(y_00, y_11);
	y_22 = _mm256_hadd_pd(y_22, y_33);
	y_44 = _mm256_hadd_pd(y_44, y_55);
	y_66 = _mm256_hadd_pd(y_66, y_77);

	y_11 = _mm256_permute2f128_pd(y_22, y_00, 2 );	
	y_00 = _mm256_permute2f128_pd(y_22, y_00, 19);	
	y_55 = _mm256_permute2f128_pd(y_66, y_44, 2 );	
	y_44 = _mm256_permute2f128_pd(y_66, y_44, 19);	

	y_00 = _mm256_add_pd( y_00, y_11 );
	y_44 = _mm256_add_pd( y_44, y_55 );

	if(alg==0)
		{
		_mm256_storeu_pd(&y[0], y_00);
		_mm256_storeu_pd(&y[4], y_44);
		}
	else if(alg==1)
		{
		y_0_1_2_3 = _mm256_loadu_pd( &y[0] );
		y_4_5_6_7 = _mm256_loadu_pd( &y[4] );

		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_00 );
		y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, y_44 );

		_mm256_storeu_pd(&y[0], y_0_1_2_3);
		_mm256_storeu_pd(&y[4], y_4_5_6_7);
		}
	else // alg==-1
		{
		y_0_1_2_3 = _mm256_loadu_pd( &y[0] );
		y_4_5_6_7 = _mm256_loadu_pd( &y[4] );
	
		y_0_1_2_3 = _mm256_sub_pd( y_0_1_2_3, y_00 );
		y_4_5_6_7 = _mm256_sub_pd( y_4_5_6_7, y_44 );
	
		_mm256_storeu_pd(&y[0], y_0_1_2_3);
		_mm256_storeu_pd(&y[4], y_4_5_6_7);
		}

	}
Example #8
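// AVX2/FMA3 variant of POV-Ray's DNoise(): evaluates the vector-valued gradient
// noise at EPoint and writes the three components into result (checked against
// PortableDNoise() when CHECK_FUNCTIONAL is set).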
void AVX2FMA3DNoise(Vector3d& result, const Vector3d& EPoint)
{

#if CHECK_FUNCTIONAL
    Vector3d param(EPoint);
#endif

    AVX2TABLETYPE *mp;

    // TODO FIXME - global statistics reference
    // Stats[Calls_To_DNoise]++;

    const __m256d ONE_PD = _mm256_set1_pd(1.0);
    const __m128i short_si128 = _mm_set1_epi32(0xffff);

    const __m256d xyzn = _mm256_setr_pd(EPoint[X], EPoint[Y], EPoint[Z], 0);
    const __m256d epsy = _mm256_set1_pd(1.0 - EPSILON);
    const __m256d xyzn_e = _mm256_sub_pd(xyzn, epsy);
    const __m128i tmp_xyzn = _mm256_cvttpd_epi32(_mm256_blendv_pd(xyzn, xyzn_e, xyzn));

    const __m128i noise_min_xyzn = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, NOISE_MINZ, 0);

    const __m256d xyz_ixyzn = _mm256_sub_pd(xyzn, _mm256_cvtepi32_pd(tmp_xyzn));
    const __m256d xyz_jxyzn = _mm256_sub_pd(xyz_ixyzn, ONE_PD);

    const __m128i i_xyzn = _mm_and_si128(_mm_sub_epi32(tmp_xyzn, noise_min_xyzn),
        _mm_set1_epi32(0xfff));

    const __m256d s_xyzn = _mm256_mul_pd(xyz_ixyzn,
        _mm256_mul_pd(xyz_ixyzn,
            _mm256_sub_pd(_mm256_set1_pd(3.0),
                _mm256_add_pd(xyz_ixyzn, xyz_ixyzn))));

    const __m256d t_xyzn = _mm256_sub_pd(ONE_PD, s_xyzn);

    const __m256d txtysxsy = _mm256_permute2f128_pd(t_xyzn, s_xyzn, 0x20);
    const __m256d txsxtxsx = PERMUTE4x64(txtysxsy, _MM_SHUFFLE(2, 0, 2, 0));
    const __m256d tytysysy = PERMUTE4x64(txtysxsy, _MM_SHUFFLE(3, 3, 1, 1));

    const __m256d txtysxtytxsysxsy = _mm256_mul_pd(txsxtxsx, tytysysy);

    const __m256d incrsump_s1 = _mm256_mul_pd(txtysxtytxsysxsy, PERMUTE4x64(t_xyzn, _MM_SHUFFLE(2, 2, 2, 2)));
    const __m256d incrsump_s2 = _mm256_mul_pd(txtysxtytxsysxsy, PERMUTE4x64(s_xyzn, _MM_SHUFFLE(2, 2, 2, 2)));

    int ints[4];
    _mm_storeu_si128((__m128i*)(ints), i_xyzn);

    const int ixiy_hash = Hash2d(ints[0], ints[1]);
    const int jxiy_hash = Hash2d(ints[0] + 1, ints[1]);
    const int ixjy_hash = Hash2d(ints[0], ints[1] + 1);
    const int jxjy_hash = Hash2d(ints[0] + 1, ints[1] + 1);

    const int iz = ints[2];

    const __m256d iii = _mm256_blend_pd(PERMUTE4x64(xyz_ixyzn, _MM_SHUFFLE(2, 1, 0, 0)), _mm256_set_pd(0, 0, 0, 0.5), 0x1);
    const __m256d jjj = _mm256_blend_pd(PERMUTE4x64(xyz_jxyzn, _MM_SHUFFLE(2, 1, 0, 0)), _mm256_set_pd(0, 0, 0, 0.5), 0x1);

    __m256d ss;
    __m256d blend;

    __m256d x = _mm256_setzero_pd(), y = _mm256_setzero_pd(), z = _mm256_setzero_pd();


    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixiy_hash, iz)];
    ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(0, 0, 0, 0));
    //     blend = _mm256_blend_pd(iii, jjj, 0);

    INCSUMAVX_VECTOR(mp, ss, iii);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxiy_hash, iz)];
    ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(1, 1, 1, 1));
    blend = _mm256_blend_pd(iii, jjj, 2);

    INCSUMAVX_VECTOR(mp, ss, blend);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxjy_hash, iz)];
    ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(3, 3, 3, 3));
    blend = _mm256_blend_pd(iii, jjj, 6);

    INCSUMAVX_VECTOR(mp, ss, blend);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixjy_hash, iz)];
    ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(2, 2, 2, 2));
    blend = _mm256_blend_pd(iii, jjj, 4);

    INCSUMAVX_VECTOR(mp, ss, blend);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixjy_hash, iz + 1)];
    ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(2, 2, 2, 2));
    blend = _mm256_blend_pd(iii, jjj, 12);

    INCSUMAVX_VECTOR(mp, ss, blend);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxjy_hash, iz + 1)];
    ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(3, 3, 3, 3));
    //     blend = _mm256_blend_pd(iii, jjj, 14);

    INCSUMAVX_VECTOR(mp, ss, jjj);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxiy_hash, iz + 1)];
    ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(1, 1, 1, 1));
    blend = _mm256_blend_pd(iii, jjj, 10);

    INCSUMAVX_VECTOR(mp, ss, blend);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixiy_hash, iz + 1)];
    ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(0, 0, 0, 0));
    blend = _mm256_blend_pd(iii, jjj, 8);

    INCSUMAVX_VECTOR(mp, ss, blend);


    __m256d xy = _mm256_hadd_pd(x,y);
    __m128d xy_up = _mm256_extractf128_pd(xy,1);
    xy_up = _mm_add_pd(_mm256_castpd256_pd128(xy),xy_up);
    _mm_storeu_pd(&result[X],xy_up);

    __m128d z_up = _mm256_extractf128_pd(z,1);
    z_up = _mm_add_pd(_mm256_castpd256_pd128(z),z_up);
    z_up = _mm_hadd_pd(z_up,z_up);
    result[Z] = _mm_cvtsd_f64(z_up);


#if CHECK_FUNCTIONAL
    {
        Vector3d portable_res;
        PortableDNoise(portable_res , param);
        if (fabs(portable_res[X] - result[X]) >= EPSILON)
        {
            throw POV_EXCEPTION_STRING("DNoise X error");
        }
        if (fabs(portable_res[Y] - result[Y]) >= EPSILON)
        {
            throw POV_EXCEPTION_STRING("DNoise Y error");
        }
        if (fabs(portable_res[Z] - result[Z]) >= EPSILON)
        {
            throw POV_EXCEPTION_STRING("DNoise Z error");
        }

    }

#endif



    _mm256_zeroupper();
    return;

}
Example #9
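// Transposed matrix-vector kernel: each of the 8 outputs is the dot product of x
// with one column of the 4-row panels of A (panel stride sda); the kmax%4 tail is
// handled first with 128-bit ops, the partial sums are reduced with hadd/permute,
// and alg (0/1/-1) selects store into, add to, or subtract from y.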
void kernel_dgemv_t_8_lib4(int kmax, double *A, int sda, double *x, double *y, int alg)
	{
	if(kmax<=0) 
		return;
	
	const int lda = 4;
	
	__builtin_prefetch( A + 0*lda );
	__builtin_prefetch( A + 2*lda );
	__builtin_prefetch( A + 4*lda );
	__builtin_prefetch( A + 6*lda );

	double *tA, *tx;

	int k;
	int ka = kmax; // number from aligned position
	
	__m256d
		aaxx_temp, 
		a_00_10_20_30, a_01_11_21_31, a_02_12_22_32, a_03_13_23_33,
		x_0_1_2_3,
		y_00, y_11, y_22, y_33, y_44, y_55, y_66, y_77;
	
	__m128d
		ax_temp,
		a_00_10, a_01_11, a_02_12, a_03_13,
		x_0_1,
		y_0, y_1, y_2, y_3, y_4, y_5, y_6, y_7;
	
	y_00 = _mm256_setzero_pd();
	y_11 = _mm256_setzero_pd();
	y_22 = _mm256_setzero_pd();
	y_33 = _mm256_setzero_pd();
	y_44 = _mm256_setzero_pd();
	y_55 = _mm256_setzero_pd();
	y_66 = _mm256_setzero_pd();
	y_77 = _mm256_setzero_pd();
	
	y_0 = _mm256_castpd256_pd128(y_00);
	y_1 = _mm256_castpd256_pd128(y_11);
	y_2 = _mm256_castpd256_pd128(y_22);
	y_3 = _mm256_castpd256_pd128(y_33);
	y_4 = _mm256_castpd256_pd128(y_44);
	y_5 = _mm256_castpd256_pd128(y_55);
	y_6 = _mm256_castpd256_pd128(y_66);
	y_7 = _mm256_castpd256_pd128(y_77);

	k = lda*(ka/lda);
	tA = A + (ka/lda)*sda*lda;
	tx = x + (ka/lda)*lda;

	if(ka-k>0) // it can be only ka-k = {1, 2, 3}
		{
		if((ka-k)>=2)
			{
		
			x_0_1 = _mm_load_pd( &tx[0] );

			a_00_10 = _mm_load_pd( &tA[0+lda*0] );
			a_01_11 = _mm_load_pd( &tA[0+lda*1] );
			a_02_12 = _mm_load_pd( &tA[0+lda*2] );
			a_03_13 = _mm_load_pd( &tA[0+lda*3] );

			ax_temp = _mm_mul_pd( a_00_10, x_0_1 );	
			y_0 = _mm_add_pd (y_0, ax_temp );
			ax_temp = _mm_mul_pd( a_01_11, x_0_1 );	
			y_1 = _mm_add_pd (y_1, ax_temp );
			ax_temp = _mm_mul_pd( a_02_12, x_0_1 );	
			y_2 = _mm_add_pd (y_2, ax_temp );
			ax_temp = _mm_mul_pd( a_03_13, x_0_1 );	
			y_3 = _mm_add_pd (y_3, ax_temp );
		
			a_00_10 = _mm_load_pd( &tA[0+lda*4] );
			a_01_11 = _mm_load_pd( &tA[0+lda*5] );
			a_02_12 = _mm_load_pd( &tA[0+lda*6] );
			a_03_13 = _mm_load_pd( &tA[0+lda*7] );

			ax_temp = _mm_mul_pd( a_00_10, x_0_1 );	
			y_4 = _mm_add_pd (y_4, ax_temp );
			ax_temp = _mm_mul_pd( a_01_11, x_0_1 );	
			y_5 = _mm_add_pd (y_5, ax_temp );
			ax_temp = _mm_mul_pd( a_02_12, x_0_1 );	
			y_6 = _mm_add_pd (y_6, ax_temp );
			ax_temp = _mm_mul_pd( a_03_13, x_0_1 );	
			y_7 = _mm_add_pd (y_7, ax_temp );
		
			tA += 2;
			tx += 2;
			k+=2;
		
			}

		if((ka-k)==1)
			{
		
			x_0_1 = _mm_load_sd( &tx[0] );

			a_00_10 = _mm_load_sd( &tA[0+lda*0] );
			a_01_11 = _mm_load_sd( &tA[0+lda*1] );
			a_02_12 = _mm_load_sd( &tA[0+lda*2] );
			a_03_13 = _mm_load_sd( &tA[0+lda*3] );

			ax_temp = _mm_mul_sd( a_00_10, x_0_1 );	
			y_0 = _mm_add_sd (y_0, ax_temp );
			ax_temp = _mm_mul_sd( a_01_11, x_0_1 );	
			y_1 = _mm_add_sd (y_1, ax_temp );
			ax_temp = _mm_mul_sd( a_02_12, x_0_1 );	
			y_2 = _mm_add_sd (y_2, ax_temp );
			ax_temp = _mm_mul_sd( a_03_13, x_0_1 );	
			y_3 = _mm_add_sd (y_3, ax_temp );
		
			a_00_10 = _mm_load_sd( &tA[0+lda*4] );
			a_01_11 = _mm_load_sd( &tA[0+lda*5] );
			a_02_12 = _mm_load_sd( &tA[0+lda*6] );
			a_03_13 = _mm_load_sd( &tA[0+lda*7] );

			ax_temp = _mm_mul_sd( a_00_10, x_0_1 );	
			y_4 = _mm_add_sd (y_4, ax_temp );
			ax_temp = _mm_mul_sd( a_01_11, x_0_1 );	
			y_5 = _mm_add_sd (y_5, ax_temp );
			ax_temp = _mm_mul_sd( a_02_12, x_0_1 );	
			y_6 = _mm_add_sd (y_6, ax_temp );
			ax_temp = _mm_mul_sd( a_03_13, x_0_1 );	
			y_7 = _mm_add_sd (y_7, ax_temp );
		
			tA += 1;
			tx += 1;
			k++;
		
			}

		}

	y_00 = _mm256_castpd128_pd256(y_0);
	y_11 = _mm256_castpd128_pd256(y_1);
	y_22 = _mm256_castpd128_pd256(y_2);
	y_33 = _mm256_castpd128_pd256(y_3);
	y_44 = _mm256_castpd128_pd256(y_4);
	y_55 = _mm256_castpd128_pd256(y_5);
	y_66 = _mm256_castpd128_pd256(y_6);
	y_77 = _mm256_castpd128_pd256(y_7);
		
	k=0;
	for(; k<ka-7; k+=8)
		{

		__builtin_prefetch( A + sda*lda + 0*lda );
		__builtin_prefetch( A + sda*lda + 2*lda );

		x_0_1_2_3 = _mm256_loadu_pd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );
		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );
	
		aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
		y_00 = _mm256_add_pd( y_00, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
		y_11 = _mm256_add_pd( y_11, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
		y_22 = _mm256_add_pd( y_22, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
		y_33 = _mm256_add_pd( y_33, aaxx_temp );
	
		__builtin_prefetch( A + sda*lda + 4*lda );
		__builtin_prefetch( A + sda*lda + 6*lda );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] );
		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] );
	
		aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
		y_44 = _mm256_add_pd( y_44, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
		y_55 = _mm256_add_pd( y_55, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
		y_66 = _mm256_add_pd( y_66, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
		y_77 = _mm256_add_pd( y_77, aaxx_temp );

		A += 4 + (sda-1)*lda;
		x += 4;


		__builtin_prefetch( A + sda*lda + 0*lda );
		__builtin_prefetch( A + sda*lda + 2*lda );

		x_0_1_2_3 = _mm256_loadu_pd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );
		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );
	
		aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
		y_00 = _mm256_add_pd( y_00, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
		y_11 = _mm256_add_pd( y_11, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
		y_22 = _mm256_add_pd( y_22, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
		y_33 = _mm256_add_pd( y_33, aaxx_temp );
	
		__builtin_prefetch( A + sda*lda + 4*lda );
		__builtin_prefetch( A + sda*lda + 6*lda );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] );
		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] );
	
		aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
		y_44 = _mm256_add_pd( y_44, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
		y_55 = _mm256_add_pd( y_55, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
		y_66 = _mm256_add_pd( y_66, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
		y_77 = _mm256_add_pd( y_77, aaxx_temp );

		A += 4 + (sda-1)*lda;
		x += 4;

		}
	for(; k<ka-3; k+=4)
		{

		__builtin_prefetch( A + sda*lda + 0*lda );
		__builtin_prefetch( A + sda*lda + 2*lda );

		x_0_1_2_3 = _mm256_loadu_pd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );
		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );
	
		aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
		y_00 = _mm256_add_pd( y_00, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
		y_11 = _mm256_add_pd( y_11, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
		y_22 = _mm256_add_pd( y_22, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
		y_33 = _mm256_add_pd( y_33, aaxx_temp );
	
		__builtin_prefetch( A + sda*lda + 4*lda );
		__builtin_prefetch( A + sda*lda + 6*lda );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] );
		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] );
	
		aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
		y_44 = _mm256_add_pd( y_44, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
		y_55 = _mm256_add_pd( y_55, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
		y_66 = _mm256_add_pd( y_66, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
		y_77 = _mm256_add_pd( y_77, aaxx_temp );

		A += 4 + (sda-1)*lda;
		x += 4;

		}
		
	__m256d
		y_0_1_2_3, y_4_5_6_7;

	y_00 = _mm256_hadd_pd(y_00, y_11);
	y_22 = _mm256_hadd_pd(y_22, y_33);
	y_44 = _mm256_hadd_pd(y_44, y_55);
	y_66 = _mm256_hadd_pd(y_66, y_77);

	y_11 = _mm256_permute2f128_pd(y_22, y_00, 2 );	
	y_00 = _mm256_permute2f128_pd(y_22, y_00, 19);	
	y_55 = _mm256_permute2f128_pd(y_66, y_44, 2 );	
	y_44 = _mm256_permute2f128_pd(y_66, y_44, 19);	

	y_00 = _mm256_add_pd( y_00, y_11 );
	y_44 = _mm256_add_pd( y_44, y_55 );

	if(alg==0)
		{
		_mm256_storeu_pd(&y[0], y_00);
		_mm256_storeu_pd(&y[4], y_44);
		}
	else if(alg==1)
		{
		y_0_1_2_3 = _mm256_loadu_pd( &y[0] );
		y_4_5_6_7 = _mm256_loadu_pd( &y[4] );

		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_00 );
		y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, y_44 );

		_mm256_storeu_pd(&y[0], y_0_1_2_3);
		_mm256_storeu_pd(&y[4], y_4_5_6_7);
		}
	else // alg==-1
		{
		y_0_1_2_3 = _mm256_loadu_pd( &y[0] );
		y_4_5_6_7 = _mm256_loadu_pd( &y[4] );
	
		y_0_1_2_3 = _mm256_sub_pd( y_0_1_2_3, y_00 );
		y_4_5_6_7 = _mm256_sub_pd( y_4_5_6_7, y_44 );
	
		_mm256_storeu_pd(&y[0], y_0_1_2_3);
		_mm256_storeu_pd(&y[4], y_4_5_6_7);
		}

	}
Example #10
 sub(avx_simd_complex_double<T> lhs, avx_simd_complex_double<T> rhs) {
     return _mm256_sub_pd(lhs.value, rhs.value);
 }
Example #11
// it moves horizontally inside a block
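// y := A*x for an 8-row strip split into A0 (rows 0-3) and A1 (rows 4-7); two
// accumulator pairs hide latency, the kmax<=64 branch only drops the software
// prefetches, and alg (0/1/-1) selects store into, add to, or subtract from y.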
void kernel_dgemv_n_8_lib4(int kmax, double *A0, double *A1, double *x, double *y, int alg)
	{
	if(kmax<=0) 
		return;
	
	const int lda = 4;
	
	int k;

	__m256d
		ax_temp,
		a_00_10_20_30, a_01_11_21_31,
		a_40_50_60_70, a_41_51_61_71,
		x_0, x_1,
		y_0_1_2_3, y_0_1_2_3_b, z_0_1_2_3,
		y_4_5_6_7, y_4_5_6_7_b, z_4_5_6_7;
	
	y_0_1_2_3   = _mm256_setzero_pd();	
	y_4_5_6_7   = _mm256_setzero_pd();	
	y_0_1_2_3_b = _mm256_setzero_pd();	
	y_4_5_6_7_b = _mm256_setzero_pd();	

	if(kmax<=64)
		{

		k=0;
		for(; k<kmax-3; k+=4)
			{

/*			__builtin_prefetch( A0 + 4*lda );*/
/*			__builtin_prefetch( A1 + 4*lda );*/

			x_0 = _mm256_broadcast_sd( &x[0] );
			x_1 = _mm256_broadcast_sd( &x[1] );

			a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] );
			a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] );
			a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] );
			a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*1] );

			ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
			y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
			ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 );
			y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp );
			ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
			y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );
			ax_temp = _mm256_mul_pd( a_41_51_61_71, x_1 );
			y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp );

/*			__builtin_prefetch( A0 + 5*lda );*/
/*			__builtin_prefetch( A1 + 5*lda );*/

			x_0 = _mm256_broadcast_sd( &x[2] );
			x_1 = _mm256_broadcast_sd( &x[3] );

			a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*2] );
			a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*2] );
			a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*3] );
			a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*3] );

			ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
			y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
			ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 );
			y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp );
			ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
			y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );
			ax_temp = _mm256_mul_pd( a_41_51_61_71, x_1 );
			y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp );
		
			A0 += 4*lda;
			A1 += 4*lda;
			x  += 4;

			}
		
		}
	else
		{

		k=0;
		for(; k<kmax-3; k+=4)
			{

			__builtin_prefetch( A0 + 4*lda );
			__builtin_prefetch( A1 + 4*lda );

			x_0 = _mm256_broadcast_sd( &x[0] );
			x_1 = _mm256_broadcast_sd( &x[1] );

			a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] );
			a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] );
			a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] );
			a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*1] );

			ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
			y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
			ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 );
			y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp );
			ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
			y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );
			ax_temp = _mm256_mul_pd( a_41_51_61_71, x_1 );
			y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp );

/*			__builtin_prefetch( A0 + 5*lda );*/
/*			__builtin_prefetch( A1 + 5*lda );*/
			__builtin_prefetch( A0 + 6*lda );
			__builtin_prefetch( A1 + 6*lda );

			x_0 = _mm256_broadcast_sd( &x[2] );
			x_1 = _mm256_broadcast_sd( &x[3] );

			a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*2] );
			a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*2] );
			a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*3] );
			a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*3] );

			ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
			y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
			ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 );
			y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp );
			ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
			y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );
			ax_temp = _mm256_mul_pd( a_41_51_61_71, x_1 );
			y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp );
		
			A0 += 4*lda;
			A1 += 4*lda;
			x  += 4;

			}
	
		}


	if(kmax%4>=2)
		{

		x_0 = _mm256_broadcast_sd( &x[0] );
		x_1 = _mm256_broadcast_sd( &x[1] );

		a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] );
		a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] );
		a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*1] );

		ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
		ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 );
		y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp );
		ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
		y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );
		ax_temp = _mm256_mul_pd( a_41_51_61_71, x_1 );
		y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp );
		
		A0 += 2*lda;
		A1 += 2*lda;
		x  += 2;

		}
	
	y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_0_1_2_3_b );
	y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, y_4_5_6_7_b );

	if(kmax%2==1)
		{

		x_0 = _mm256_broadcast_sd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] );
		a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] );

		ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
		ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 );
		y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp );
		
/*		A0 += 1*lda;*/
/*		A1 += 1*lda;*/
/*		x  += 1;*/

		}

	if(alg==0)
		{
		_mm256_storeu_pd(&y[0], y_0_1_2_3);
		_mm256_storeu_pd(&y[4], y_4_5_6_7);
		}
	else if(alg==1)
		{
		z_0_1_2_3 = _mm256_loadu_pd( &y[0] );
		z_4_5_6_7 = _mm256_loadu_pd( &y[4] );

		z_0_1_2_3 = _mm256_add_pd( z_0_1_2_3, y_0_1_2_3 );
		z_4_5_6_7 = _mm256_add_pd( z_4_5_6_7, y_4_5_6_7 );

		_mm256_storeu_pd(&y[0], z_0_1_2_3);
		_mm256_storeu_pd(&y[4], z_4_5_6_7);
		}
	else // alg==-1
		{
		z_0_1_2_3 = _mm256_loadu_pd( &y[0] );
		z_4_5_6_7 = _mm256_loadu_pd( &y[4] );

		z_0_1_2_3 = _mm256_sub_pd( z_0_1_2_3, y_0_1_2_3 );
		z_4_5_6_7 = _mm256_sub_pd( z_4_5_6_7, y_4_5_6_7 );

		_mm256_storeu_pd(&y[0], z_0_1_2_3);
		_mm256_storeu_pd(&y[4], z_4_5_6_7);
		}

	}
Example #12
 inline vector4d operator-(const vector4d& lhs, const vector4d& rhs)
 {
     return _mm256_sub_pd(lhs, rhs);
 }
Example #13
void pagerank(list* plist, int ncores, int npages, int nedges, double dampener)
{
	/* For whoever is reading this: here's a strange hack. Don't edit it.
	I don't know why it speeds things up - unexplainable. Sorry. -Roger (4/Jun/2014) */
	if (ncores > 1)
		ncores --;


	/*************************/
	/* Padding for AVX later */
	/*************************/
	while ((npages + g_padding) % (ncores * 4) != 0)
	{
		g_padding++;
	}
	
	
	/***************/
	/* Declaration */
	/***************/
	double constant = (1.0 - dampener) / npages;
	g_npages = npages;
	g_nedges = nedges;

	/* Lists that use page index */
	// double * curr_scores; 	/* Stores the scores for this round */
	// double * prev_scores; 	/* Stores the scores from the last round - for calculating convergence */
	// int * page_inlinks; 	/* Number of inlinks per page[i] - to determine the loop */
	
	/* Lists that use edge index */
	// int * inlinks; 			/* the index for the numerator prev_score[inlinks[edge]] */
	// int * index_edge;		/* Stores where the corresponding edge is for the page index */
	// double * outlinks;		/* the numerator for each edge */
	
	/* Lists that use ncores as index */
	//g_sumDiff is declared globally

	posix_memalign((void *)&curr_scores, 32, (npages + g_padding) * sizeof(double));
	posix_memalign((void *)&prev_scores, 32, (npages + g_padding) * sizeof(double));
	posix_memalign((void *)&page_inlinks, 32, npages * sizeof(int));
	posix_memalign((void *)&inlinks, 32, nedges * sizeof (int));
	posix_memalign((void *)&index_edge, 32, nedges * sizeof (int));
	posix_memalign((void *)&outlinks, 32, nedges * sizeof (double));
	posix_memalign((void *)&g_sumDiff, 32, ncores * sizeof (double));


	/*****************************/
	/* Setting up data structure */
	/*****************************/
	node* curr = plist->head;
	unsigned int edge = 0;

	for (int i = 0; i < npages; ++i) /* For each node */
	{
		if (curr->page->inlinks != NULL) /* If this page has a inlinks */
		{
			page_inlinks[i] = curr->page->inlinks->length; /* page_inlinks[i] +1 */
			index_edge[i] = edge;

			node* list_node =  curr->page->inlinks->head; /* Setting the first node in the inlinks */

			for (int j = 0; j < curr->page->inlinks->length; ++j)
			{
				inlinks[edge] = list_node->page->index; /* inlinks[edge] +1 */
				outlinks[edge] = 1.0 / list_node->page->noutlinks; /* outlinks[edge] +1*/
				list_node = list_node -> next;
				++edge;
			}
		}

	  curr = curr->next;
	}

	
	/************************/
	/* First iteration (P0) */
	/************************/
	for (int i = 0; i < g_padding; i++)
	{
		curr_scores[npages + i] = 0.0;
		prev_scores[npages + i] = 0.0; //padding
	}

	double p0 = 1.0 / npages;

	for (int i = 0; i < npages; i++)
	{
		prev_scores[i] = p0;
		/* We should have started by assigning to curr_scores and then swapping,
		but this way it saves me from swapping once more in the beginning */
	}
	

	/**************************************/
	/* Setting up arguments for parallel execution */
	/**************************************/
	pthread_t tids[ncores];
	workerargs wargs[ncores];

	for (int i = 0; i < ncores; i++)
	{
		wargs[i].i = i;
		wargs[i].start = i * ((npages + g_padding) / ncores);
		wargs[i].end = (i + 1) * ((npages + g_padding) / ncores);
		wargs[i].dampener = dampener;
		wargs[i].constant = constant;
	}

	wargs[ncores-1].end = npages + g_padding;

	// for (int i = 0; i < ncores; i++)
	// 	printf("<wargs[%d]: start:%d end:%d>\n", i, wargs[i].start, wargs[i].end);

	pthread_barrier_init(&score_barrier, NULL, ncores);
	pthread_barrier_init(&conv_barrier, NULL, ncores);
	
	for (int i = 0; i < ncores - 1; i++)
		pthread_create(&tids[i], NULL, worker, &wargs[i]);

	int start = wargs[ncores-1].start;
	int end = wargs[ncores-1].end;
	/*************************/
	/* Manager thread memory */
	/*************************/
	for (;;)
	{
		if ((start < g_npages) && (g_nedges > 0)) //padding test
			edge = index_edge[start]; /* for edge count */

		double conv = 0.0;

		/* WORKER: Calculating curr_score */
		for (int i = start; i < end; ++i) /* Calculation for each page */
		{
			if (i >= g_npages)
				break; 

			double sum = 0.0;

			for (int j = 0; j < page_inlinks[i]; ++j) /* Calculation for each inlink */
			{
				sum += prev_scores[inlinks[edge]] * outlinks[edge];
				//printf("<%d: %f = %f * %f>\n", i, prev_scores[inlinks[edge]] / outlinks[edge], prev_scores[inlinks[edge]], outlinks[edge]);
				++edge;
			}

			curr_scores[i] = constant + dampener * sum;
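			/* This is the damped PageRank update: PR(i) = (1-d)/N + d * sum over the
			   inlinks j of PR(j)/outdeg(j); constant = (1-d)/N was precomputed above
			   and outlinks[edge] already holds 1/noutlinks of the linking page. */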
		}

		/* WORKER: Calculating conv */
		int nblocks = (end - start) * 0.25;

		for (int i = 0; i < nblocks; i++)
		{
			__m256d* curr_block = (__m256d*) &curr_scores[start]; // Cast it instead. 
			__m256d* prev_block = (__m256d*) &prev_scores[start];

			__m256d m1 = _mm256_sub_pd(curr_block[i], prev_block[i]);
			__m256d m2 = _mm256_mul_pd(m1, m1);
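			/* Subscripting an __m256d (m2[0] .. m2[3]) relies on the GCC/Clang
			   vector-extension syntax; it is not portable to compilers without it. */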

			conv += m2[0] + m2[1] + m2[2] + m2[3];
		}

		pthread_barrier_wait(&score_barrier); /* Wait until all workers are done */
		/* WORKER COMPLETE */
		/* MANAGER BEGIN */

		double* tmp = prev_scores; /* For each iteration: move curr to prev, and replace the old prev */
		prev_scores = curr_scores;
		curr_scores = tmp;

		/* MANAGER: Summing*/
		for (int i = 0; i < ncores; i++)
		{
			conv += g_sumDiff[i];
		}

		/* MANAGER: Checking */
		if (conv < EPSILON*EPSILON)
		{
			g_hasConverged = true; /* It has converged */

			pthread_barrier_wait(&conv_barrier);

			tmp = prev_scores; /* Reverse the swap then */
			prev_scores = curr_scores;
			curr_scores = tmp;

			break;
		}

		pthread_barrier_wait(&conv_barrier); /* Tell the worker they may begin again */
	}

	/********************/
	/* Printing results */
	/********************/
	// printf("curr_scores:\n");
	displayPageRank(plist, curr_scores);
	// printf("prev_scores:\n");
	// displayPageRank(plist, prev_scores);
	

	/******************/
	/* Cleaning up    */
	/******************/

	for (int i = 0; i < ncores - 1; i++)
		pthread_join(tids[i], NULL);

	pthread_barrier_destroy(&score_barrier);
	pthread_barrier_destroy(&conv_barrier);
	
	free(curr_scores);
	free(prev_scores);
	free(index_edge);
	free(page_inlinks);
	free(outlinks);
	free(inlinks);
	free(g_sumDiff);
	
}
Example #14
// it moves vertically across blocks
void kernel_dtrmv_u_t_4_lib4(int kmax, double *A, int sda, double *x, double *y, int alg)
	{

/*	if(kmax<=0) */
/*		return;*/
	
	const int lda = 4;
	
/*	__builtin_prefetch( A + 0*lda );*/
/*	__builtin_prefetch( A + 2*lda );*/

/*	double *tA, *tx;*/

	int k;
	
	__m256d
		zeros,
		tmp0, tmp1,
		a_00_10_20_30, a_01_11_21_31, a_02_12_22_32, a_03_13_23_33,
		x_0_1_2_3,
		y_00, y_11, y_22, y_33;

	y_00 = _mm256_setzero_pd();
	y_11 = _mm256_setzero_pd();
	y_22 = _mm256_setzero_pd();
	y_33 = _mm256_setzero_pd();

	k=0;
	for(; k<kmax-7; k+=8)
		{
		
/*		__builtin_prefetch( A + sda*lda + 0*lda );*/
/*		__builtin_prefetch( A + sda*lda + 2*lda );*/

		x_0_1_2_3 = _mm256_loadu_pd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );
		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );
		
		tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
		tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
		y_00 = _mm256_add_pd( y_00, tmp0 );
		y_11 = _mm256_add_pd( y_11, tmp1 );
		tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
		tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
		y_22 = _mm256_add_pd( y_22, tmp0 );
		y_33 = _mm256_add_pd( y_33, tmp1 );
		
		A += 4 + (sda-1)*lda;
		x += 4;


/*		__builtin_prefetch( A + sda*lda + 0*lda );*/
/*		__builtin_prefetch( A + sda*lda + 2*lda );*/

		x_0_1_2_3 = _mm256_loadu_pd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );
		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );
		
		tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
		tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
		y_00 = _mm256_add_pd( y_00, tmp0 );
		y_11 = _mm256_add_pd( y_11, tmp1 );
		tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
		tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
		y_22 = _mm256_add_pd( y_22, tmp0 );
		y_33 = _mm256_add_pd( y_33, tmp1 );
		
		A += 4 + (sda-1)*lda;
		x += 4;

		}
	for(; k<kmax-3; k+=4)
		{
		
/*		__builtin_prefetch( A + sda*lda + 0*lda );*/
/*		__builtin_prefetch( A + sda*lda + 2*lda );*/

		x_0_1_2_3 = _mm256_loadu_pd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );
		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );
		
		tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
		tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
		y_00 = _mm256_add_pd( y_00, tmp0 );
		y_11 = _mm256_add_pd( y_11, tmp1 );
		tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
		tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
		y_22 = _mm256_add_pd( y_22, tmp0 );
		y_33 = _mm256_add_pd( y_33, tmp1 );
		
		A += 4 + (sda-1)*lda;
		x += 4;

		}

	zeros = _mm256_setzero_pd();

	x_0_1_2_3 = _mm256_loadu_pd( &x[0] );

	a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
	a_00_10_20_30 = _mm256_blend_pd( a_00_10_20_30, zeros, 0xe );
	a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );
	a_01_11_21_31 = _mm256_blend_pd( a_01_11_21_31, zeros, 0xc );
	a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
	a_02_12_22_32 = _mm256_blend_pd( a_02_12_22_32, zeros, 0x8 );
	a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );
	
	tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
	tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
	y_00 = _mm256_add_pd( y_00, tmp0 );
	y_11 = _mm256_add_pd( y_11, tmp1 );
	tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
	tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
	y_22 = _mm256_add_pd( y_22, tmp0 );
	y_33 = _mm256_add_pd( y_33, tmp1 );

	__m256d
		y_0_1_2_3;

	y_00 = _mm256_hadd_pd(y_00, y_11);
	y_22 = _mm256_hadd_pd(y_22, y_33);

	y_11 = _mm256_permute2f128_pd(y_22, y_00, 2 );	
	y_00 = _mm256_permute2f128_pd(y_22, y_00, 19);	

	y_00 = _mm256_add_pd( y_00, y_11 );

	if(alg==0)
		{
		_mm256_storeu_pd(&y[0], y_00);
		}
	else if(alg==1)
		{
		y_0_1_2_3 = _mm256_loadu_pd( &y[0] );

		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_00 );

		_mm256_storeu_pd(&y[0], y_0_1_2_3);
		}
	else // alg==-1
		{
		y_0_1_2_3 = _mm256_loadu_pd( &y[0] );
	
		y_0_1_2_3 = _mm256_sub_pd( y_0_1_2_3, y_00 );
	
		_mm256_storeu_pd(&y[0], y_0_1_2_3);
		}

	}
Example #15
// it moves vertically across blocks
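// Symmetric matrix-vector kernel: in one pass it streams z_n = y_n (+/-) A*x_n
// and accumulates y_t (+/-) A^T*x_t into z_t; tri==1 handles the diagonal 4x4
// block with blend masks, and the k_left tail uses maskload/maskstore.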
void kernel_dsymv_4_lib4(int kmax, double *A, int sda, double *x_n, double *y_n, double *z_n, double *x_t, double *y_t, double *z_t, int tri, int alg)
	{
	
	if(kmax<=0) 
		return;

/*printf("\nciao %d\n", kmax);	*/
	const int bs = 4;
	
	__builtin_prefetch( A + bs*0 );
	__builtin_prefetch( A + bs*2 );

	int k, ka;
	ka = kmax; // number from aligned position

	double k_left;
	
//	double *sA, *sy_n, *sx_t;

	static double d_mask[4]  = {0.5, 1.5, 2.5, 3.5};

	__m256d
		v_mask,
		zeros, temp,
		a_00, a_01, a_02, a_03,
		x_n_0, x_n_1, x_n_2, x_n_3, y_n_0,
		x_t_0, y_t_0, y_t_1, y_t_2, y_t_3;
	
	__m256i
		i_mask;

#if 0
	__m128d
		stemp,
		sa_00, sa_01, sa_02, sa_03,
		sx_n_0, sx_n_1, sx_n_2, sx_n_3, sy_n_0,
		sx_t_0, sy_t_0, sy_t_1, sy_t_2, sy_t_3;
#endif
	
	zeros = _mm256_setzero_pd();

	x_n_0 = _mm256_broadcast_sd( &x_n[0] );
	x_n_1 = _mm256_broadcast_sd( &x_n[1] );
	x_n_2 = _mm256_broadcast_sd( &x_n[2] );
	x_n_3 = _mm256_broadcast_sd( &x_n[3] );

	if(alg==-1) // TODO xor
		{
		x_n_0 = _mm256_sub_pd( zeros, x_n_0 );
		x_n_1 = _mm256_sub_pd( zeros, x_n_1 );
		x_n_2 = _mm256_sub_pd( zeros, x_n_2 );
		x_n_3 = _mm256_sub_pd( zeros, x_n_3 );
		}

	y_t_0 = _mm256_setzero_pd();
	y_t_1 = _mm256_setzero_pd();
	y_t_2 = _mm256_setzero_pd();
	y_t_3 = _mm256_setzero_pd();
	
#if 0
	sx_n_0 = _mm256_castpd256_pd128( x_n_0 );
	sx_n_1 = _mm256_castpd256_pd128( x_n_1 );
	sx_n_2 = _mm256_castpd256_pd128( x_n_2 );
	sx_n_3 = _mm256_castpd256_pd128( x_n_3 );

	sy_t_0 = _mm256_castpd256_pd128( y_t_0 );
	sy_t_1 = _mm256_castpd256_pd128( y_t_1 );
	sy_t_2 = _mm256_castpd256_pd128( y_t_2 );
	sy_t_3 = _mm256_castpd256_pd128( y_t_3 );

	k = bs*(ka/bs);
	sA = A + (ka/bs)*sda*bs;
	sy_n = y_n + (ka/bs)*bs;
	sx_t = x_t + (ka/bs)*bs;

	for(; k<ka; k++)
		{
		
		sy_n_0 = _mm_load_sd( &sy_n[0] );
		sx_t_0 = _mm_load_sd( &sx_t[0] );
		
		sa_00 = _mm_load_sd( &sA[0+bs*0] );
		sa_01 = _mm_load_sd( &sA[0+bs*1] );
		sa_02 = _mm_load_sd( &sA[0+bs*2] );
		sa_03 = _mm_load_sd( &sA[0+bs*3] );
		
		stemp  = _mm_mul_sd( sa_00, sx_n_0 );
		sy_n_0 = _mm_add_sd( sy_n_0, stemp );
		stemp  = _mm_mul_sd( sa_00, sx_t_0 );
		sy_t_0 = _mm_add_sd( sy_t_0, stemp );
		stemp  = _mm_mul_sd( sa_01, sx_n_1 );
		sy_n_0 = _mm_add_sd( sy_n_0, stemp );
		stemp  = _mm_mul_sd( sa_01, sx_t_0 );
		sy_t_1 = _mm_add_sd( sy_t_1, stemp );
		stemp  = _mm_mul_sd( sa_02, sx_n_2 );
		sy_n_0 = _mm_add_sd( sy_n_0, stemp );
		stemp  = _mm_mul_sd( sa_02, sx_t_0 );
		sy_t_2 = _mm_add_sd( sy_t_2, stemp );
		stemp  = _mm_mul_sd( sa_03, sx_n_3 );
		sy_n_0 = _mm_add_sd( sy_n_0, stemp );
		stemp  = _mm_mul_sd( sa_03, sx_t_0 );
		sy_t_3 = _mm_add_sd( sy_t_3, stemp );
		
		_mm_store_sd( &sy_n[0], sy_n_0 );

		
		sA += 1;
		sy_n += 1;
		sx_t += 1;

		}

	y_t_0 = _mm256_castpd128_pd256( sy_t_0 );
	y_t_1 = _mm256_castpd128_pd256( sy_t_1 );
	y_t_2 = _mm256_castpd128_pd256( sy_t_2 );
	y_t_3 = _mm256_castpd128_pd256( sy_t_3 );
#endif

	k=0;

	// corner
	if(tri==1)
		{
		
		__builtin_prefetch( A + sda*bs +bs*0 );
		__builtin_prefetch( A + sda*bs +bs*2 );

		y_n_0 = _mm256_loadu_pd( &y_n[0] );
		x_t_0 = _mm256_loadu_pd( &x_t[0] );
		
		a_00 = _mm256_load_pd( &A[0+bs*0] );
		a_01 = _mm256_load_pd( &A[0+bs*1] );
		a_02 = _mm256_load_pd( &A[0+bs*2] );
		a_03 = _mm256_load_pd( &A[0+bs*3] );
		
		temp  = _mm256_mul_pd( a_00, x_n_0 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_00, x_t_0 );
		temp  = _mm256_blend_pd( zeros, temp, 14 );
		y_t_0 = _mm256_add_pd( y_t_0, temp );
		temp  = _mm256_mul_pd( a_01, x_n_1 );
		temp  = _mm256_blend_pd( zeros, temp, 14 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_01, x_t_0 );
		temp  = _mm256_blend_pd( zeros, temp, 12 );
		y_t_1 = _mm256_add_pd( y_t_1, temp );
		temp  = _mm256_mul_pd( a_02, x_n_2 );
		temp  = _mm256_blend_pd( zeros, temp, 12 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_02, x_t_0 );
		temp  = _mm256_blend_pd( zeros, temp, 8 );
		y_t_2 = _mm256_add_pd( y_t_2, temp );
		temp  = _mm256_mul_pd( a_03, x_n_3 );
		temp  = _mm256_blend_pd( zeros, temp, 8 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		
		_mm256_storeu_pd( &z_n[0], y_n_0 );
		

		A += sda*bs;
		y_n += 4;
		z_n += 4;
		x_t += 4;

		k += 4;

		}

	for(; k<ka-7; k+=2*bs)
		{
		
		__builtin_prefetch( A + sda*bs +bs*0 );
		__builtin_prefetch( A + sda*bs +bs*2 );

		y_n_0 = _mm256_loadu_pd( &y_n[0] );
		x_t_0 = _mm256_loadu_pd( &x_t[0] );
		
		a_00 = _mm256_load_pd( &A[0+bs*0] );
		a_01 = _mm256_load_pd( &A[0+bs*1] );
		a_02 = _mm256_load_pd( &A[0+bs*2] );
		a_03 = _mm256_load_pd( &A[0+bs*3] );
		
		temp  = _mm256_mul_pd( a_00, x_n_0 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_00, x_t_0 );
		y_t_0 = _mm256_add_pd( y_t_0, temp );
		temp  = _mm256_mul_pd( a_01, x_n_1 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_01, x_t_0 );
		y_t_1 = _mm256_add_pd( y_t_1, temp );
		temp  = _mm256_mul_pd( a_02, x_n_2 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_02, x_t_0 );
		y_t_2 = _mm256_add_pd( y_t_2, temp );
		temp  = _mm256_mul_pd( a_03, x_n_3 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_03, x_t_0 );
		y_t_3 = _mm256_add_pd( y_t_3, temp );
		
		_mm256_storeu_pd( &z_n[0], y_n_0 );

		
		A += sda*bs;
		y_n += 4;
		z_n += 4;
		x_t += 4;

		__builtin_prefetch( A + sda*bs +bs*0 );
		__builtin_prefetch( A + sda*bs +bs*2 );

		y_n_0 = _mm256_loadu_pd( &y_n[0] );
		x_t_0 = _mm256_loadu_pd( &x_t[0] );
		
		a_00 = _mm256_load_pd( &A[0+bs*0] );
		a_01 = _mm256_load_pd( &A[0+bs*1] );
		a_02 = _mm256_load_pd( &A[0+bs*2] );
		a_03 = _mm256_load_pd( &A[0+bs*3] );
		
		temp  = _mm256_mul_pd( a_00, x_n_0 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_00, x_t_0 );
		y_t_0 = _mm256_add_pd( y_t_0, temp );
		temp  = _mm256_mul_pd( a_01, x_n_1 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_01, x_t_0 );
		y_t_1 = _mm256_add_pd( y_t_1, temp );
		temp  = _mm256_mul_pd( a_02, x_n_2 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_02, x_t_0 );
		y_t_2 = _mm256_add_pd( y_t_2, temp );
		temp  = _mm256_mul_pd( a_03, x_n_3 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_03, x_t_0 );
		y_t_3 = _mm256_add_pd( y_t_3, temp );
		
		_mm256_storeu_pd( &z_n[0], y_n_0 );

		
		A += sda*bs;
		y_n += 4;
		z_n += 4;
		x_t += 4;

		}

	for(; k<ka-3; k+=bs)
		{
		
		__builtin_prefetch( A + sda*bs +bs*0 );
		__builtin_prefetch( A + sda*bs +bs*2 );

		y_n_0 = _mm256_loadu_pd( &y_n[0] );
		x_t_0 = _mm256_loadu_pd( &x_t[0] );
		
		a_00 = _mm256_load_pd( &A[0+bs*0] );
		a_01 = _mm256_load_pd( &A[0+bs*1] );
		a_02 = _mm256_load_pd( &A[0+bs*2] );
		a_03 = _mm256_load_pd( &A[0+bs*3] );
		
		temp  = _mm256_mul_pd( a_00, x_n_0 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_00, x_t_0 );
		y_t_0 = _mm256_add_pd( y_t_0, temp );
		temp  = _mm256_mul_pd( a_01, x_n_1 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_01, x_t_0 );
		y_t_1 = _mm256_add_pd( y_t_1, temp );
		temp  = _mm256_mul_pd( a_02, x_n_2 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_02, x_t_0 );
		y_t_2 = _mm256_add_pd( y_t_2, temp );
		temp  = _mm256_mul_pd( a_03, x_n_3 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_03, x_t_0 );
		y_t_3 = _mm256_add_pd( y_t_3, temp );
		
		_mm256_storeu_pd( &z_n[0], y_n_0 );

		
		A += sda*bs;
		y_n += 4;
		z_n += 4;
		x_t += 4;

		}
	if(k<ka)
		{

		k_left = ka-k;
		v_mask  = _mm256_sub_pd( _mm256_loadu_pd( d_mask ), _mm256_broadcast_sd( &k_left ) );
		i_mask  = _mm256_castpd_si256( v_mask );

//		__builtin_prefetch( A + sda*bs +bs*0 );
//		__builtin_prefetch( A + sda*bs +bs*2 );

		y_n_0 = _mm256_loadu_pd( &y_n[0] );
		x_t_0 = _mm256_maskload_pd( &x_t[0], i_mask );
		
		a_00 = _mm256_load_pd( &A[0+bs*0] );
		a_01 = _mm256_load_pd( &A[0+bs*1] );
		a_02 = _mm256_load_pd( &A[0+bs*2] );
		a_03 = _mm256_load_pd( &A[0+bs*3] );
		
		temp  = _mm256_mul_pd( a_00, x_n_0 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_00, x_t_0 );
		y_t_0 = _mm256_add_pd( y_t_0, temp );
		temp  = _mm256_mul_pd( a_01, x_n_1 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_01, x_t_0 );
		y_t_1 = _mm256_add_pd( y_t_1, temp );
		temp  = _mm256_mul_pd( a_02, x_n_2 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_02, x_t_0 );
		y_t_2 = _mm256_add_pd( y_t_2, temp );
		temp  = _mm256_mul_pd( a_03, x_n_3 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_03, x_t_0 );
		y_t_3 = _mm256_add_pd( y_t_3, temp );
		
		_mm256_maskstore_pd( &z_n[0], i_mask, y_n_0 );

		
//		A += sda*bs;
//		y_n += 4;
//		z_n += 4;
//		x_t += 4;

		}
	
	__m256d
		y_0_1_2_3;

	y_t_0 = _mm256_hadd_pd( y_t_0, y_t_1 );
	y_t_2 = _mm256_hadd_pd( y_t_2, y_t_3 );

	y_t_1 = _mm256_permute2f128_pd( y_t_2, y_t_0, 2  );	
	y_t_0 = _mm256_permute2f128_pd( y_t_2, y_t_0, 19 );	

	y_t_0 = _mm256_add_pd( y_t_0, y_t_1 );

	if(alg==1)
		{
		y_0_1_2_3 = _mm256_loadu_pd( &y_t[0] );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_t_0 );
		_mm256_storeu_pd( &z_t[0], y_0_1_2_3 );
		}
	else // alg==-1
		{
		y_0_1_2_3 = _mm256_loadu_pd( &y_t[0] );
		y_0_1_2_3 = _mm256_sub_pd( y_0_1_2_3, y_t_0 );
		_mm256_storeu_pd( &z_t[0], y_0_1_2_3 );
		}
	
	}
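The tail of the kernel above folds the four accumulators y_t_0..y_t_3 into a single result vector with an _mm256_hadd_pd / _mm256_permute2f128_pd pair. A minimal stand-alone sketch of that reduction (the helper name hsum4_pd is ours, not part of the original source):

#include <immintrin.h>

/* Returns { sum(a), sum(b), sum(c), sum(d) }: the horizontal sum of each input,
 * computed with the same hadd + permute2f128 pattern as the kernel above. */
static inline __m256d hsum4_pd(__m256d a, __m256d b, __m256d c, __m256d d)
	{
	__m256d ab = _mm256_hadd_pd(a, b);               // a0+a1, b0+b1, a2+a3, b2+b3
	__m256d cd = _mm256_hadd_pd(c, d);               // c0+c1, d0+d1, c2+c3, d2+d3
	__m256d lo = _mm256_permute2f128_pd(cd, ab, 2);  // low 128 bits of ab | low 128 bits of cd
	__m256d hi = _mm256_permute2f128_pd(cd, ab, 19); // high 128 bits of ab | high 128 bits of cd
	return _mm256_add_pd(hi, lo);
	}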
static inline __m256d gmx_mm256_exp2_pd(__m256d x)
{
    /* Lower bound: we do not allow arguments that would lead to an IEEE fp representation exponent below the double-precision limit; |x| is capped at 1022 below. */
    const __m256d arglimit = _mm256_set1_pd(1022.0);
    const __m128i expbase  = _mm_set1_epi32(1023);

    const __m256d P2       = _mm256_set1_pd(2.30933477057345225087e-2);
    const __m256d P1       = _mm256_set1_pd(2.02020656693165307700e1);
    const __m256d P0       = _mm256_set1_pd(1.51390680115615096133e3);
    /* Q2 == 1.0 */
    const __m256d Q1       = _mm256_set1_pd(2.33184211722314911771e2);
    const __m256d Q0       = _mm256_set1_pd(4.36821166879210612817e3);
    const __m256d one      = _mm256_set1_pd(1.0);
    const __m256d two      = _mm256_set1_pd(2.0);

    __m256d       valuemask;
    __m256i       iexppart;
    __m128i       iexppart128a, iexppart128b;
    __m256d       fexppart;
    __m256d       intpart;
    __m256d       z, z2;
    __m256d       PolyP, PolyQ;

    iexppart128a  = _mm256_cvtpd_epi32(x);
    intpart       = _mm256_round_pd(x, _MM_FROUND_TO_NEAREST_INT);

    /* Add exponent bias */
    iexppart128a   = _mm_add_epi32(iexppart128a, expbase);

    /* We now want to shift the exponent 52 positions left, into the IEEE-754
     * exponent field. To achieve this we separate the four 32-bit integers into two
     * 128-bit registers, shift them as 64-bit lanes, and then merge the results
     * into a single __m256d.
     * Elements 0/1 should end up in iexppart128a, and 2/3 in iexppart128b.
     * It doesn't matter what we put in the 2nd/4th position, since that data will be
     * shifted out and replaced with zeros.
     */
    iexppart128b   = _mm_shuffle_epi32(iexppart128a, _MM_SHUFFLE(3, 3, 2, 2));
    iexppart128a   = _mm_shuffle_epi32(iexppart128a, _MM_SHUFFLE(1, 1, 0, 0));

    iexppart128b   = _mm_slli_epi64(iexppart128b, 52);
    iexppart128a   = _mm_slli_epi64(iexppart128a, 52);

    iexppart  = _mm256_castsi128_si256(iexppart128a);
    iexppart  = _mm256_insertf128_si256(iexppart, iexppart128b, 0x1);

    valuemask = _mm256_cmp_pd(arglimit, gmx_mm256_abs_pd(x), _CMP_GE_OQ);
    fexppart  = _mm256_and_pd(valuemask, _mm256_castsi256_pd(iexppart));

    z         = _mm256_sub_pd(x, intpart);

    z2        = _mm256_mul_pd(z, z);

    PolyP     = _mm256_mul_pd(P2, z2);
    PolyP     = _mm256_add_pd(PolyP, P1);
    PolyQ     = _mm256_add_pd(z2, Q1);
    PolyP     = _mm256_mul_pd(PolyP, z2);
    PolyQ     = _mm256_mul_pd(PolyQ, z2);
    PolyP     = _mm256_add_pd(PolyP, P0);
    PolyQ     = _mm256_add_pd(PolyQ, Q0);
    PolyP     = _mm256_mul_pd(PolyP, z);

    z         = _mm256_mul_pd(PolyP, gmx_mm256_inv_pd(_mm256_sub_pd(PolyQ, PolyP)));
    z         = _mm256_add_pd(one, _mm256_mul_pd(two, z));

    z         = _mm256_mul_pd(z, fexppart);

    return z;
}
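For reference, the same scheme written as a scalar sketch (an illustration, not code from the same library): split x into an integer part and a remainder z with |z| <= 0.5, approximate 2^z with the rational form 1 + 2*z*P(z^2) / (Q(z^2) - z*P(z^2)) built from the P/Q coefficients above, and apply 2^intpart through the exponent field (ldexp here, the 52-bit shift in the vector code). The arglimit masking of out-of-range inputs is omitted.

#include <math.h>

static double exp2_scalar_sketch(double x)
{
    const double P2 = 2.30933477057345225087e-2;
    const double P1 = 2.02020656693165307700e1;
    const double P0 = 1.51390680115615096133e3;
    const double Q1 = 2.33184211722314911771e2;   /* Q2 == 1.0 */
    const double Q0 = 4.36821166879210612817e3;

    double intpart = nearbyint(x);                     /* round to nearest, like the vector code */
    double z       = x - intpart;                      /* |z| <= 0.5 */
    double z2      = z * z;
    double p       = ((P2 * z2 + P1) * z2 + P0) * z;   /* z * P(z^2) */
    double q       = (z2 + Q1) * z2 + Q0;              /* Q(z^2) */
    double r       = 1.0 + 2.0 * p / (q - p);          /* ~ 2^z */
    return ldexp(r, (int)intpart);                     /* scale by 2^intpart */
}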
Example #17
0
// it moves horizontally inside a block
void kernel_dgemv_n_4_lib4(int kmax, double *A, double *x, double *y, int alg)
	{
	if(kmax<=0) 
		return;
	
	const int lda = 4;
	
	int k;

	__m256d
		ax_temp,
		a_00_10_20_30, a_01_11_21_31, a_02_12_22_32, a_03_13_23_33,
		x_0, x_1, x_2, x_3,
		y_0_1_2_3, y_0_1_2_3_b, y_0_1_2_3_c, y_0_1_2_3_d, z_0_1_2_3;
	
	y_0_1_2_3   = _mm256_setzero_pd();	
	y_0_1_2_3_b = _mm256_setzero_pd();	
	y_0_1_2_3_c = _mm256_setzero_pd();	
	y_0_1_2_3_d = _mm256_setzero_pd();	

	k=0;
	for(; k<kmax-3; k+=4)
		{

		x_0 = _mm256_broadcast_sd( &x[0] );
		x_1 = _mm256_broadcast_sd( &x[1] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );

		x_2 = _mm256_broadcast_sd( &x[2] );
		x_3 = _mm256_broadcast_sd( &x[3] );

		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );

		ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
		ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
		y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );

		ax_temp = _mm256_mul_pd( a_02_12_22_32, x_2 );
		y_0_1_2_3_c = _mm256_add_pd( y_0_1_2_3_c, ax_temp );
		ax_temp = _mm256_mul_pd( a_03_13_23_33, x_3 );
		y_0_1_2_3_d = _mm256_add_pd( y_0_1_2_3_d, ax_temp );
		
		A += 4*lda;
		x += 4;

		}
	
	y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_0_1_2_3_c );
	y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, y_0_1_2_3_d );

	if(kmax%4>=2)
		{

		x_0 = _mm256_broadcast_sd( &x[0] );
		x_1 = _mm256_broadcast_sd( &x[1] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );

		ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
		ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
		y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );

		A += 2*lda;
		x += 2;

		}

	y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_0_1_2_3_b );

	if(kmax%2==1)
		{

		x_0 = _mm256_broadcast_sd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );

		ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
		
/*		A += 1*lda;*/
/*		x += 1;*/

		}

	if(alg==0)
		{
		_mm256_storeu_pd(&y[0], y_0_1_2_3);
		}
	else if(alg==1)
		{
		z_0_1_2_3 = _mm256_loadu_pd( &y[0] );

		z_0_1_2_3 = _mm256_add_pd ( z_0_1_2_3, y_0_1_2_3 );

		_mm256_storeu_pd(&y[0], z_0_1_2_3);
		}
	else // alg==-1
		{
		z_0_1_2_3 = _mm256_loadu_pd( &y[0] );

		z_0_1_2_3 = _mm256_sub_pd ( z_0_1_2_3, y_0_1_2_3 );

		_mm256_storeu_pd(&y[0], z_0_1_2_3);
		}

	}
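A hedged usage sketch for the kernel above. The layout is an assumption read off the loads: A points to a single 4-row panel stored column-major with lda == 4 (column j starts at A[4*j]), it must be 32-byte aligned because the kernel uses _mm256_load_pd, and alg == 0 overwrites y with A*x.

#include <stdlib.h>

void demo_dgemv_n_4(void)
	{
	int n = 8;                                              // number of columns (kmax)
	double *A = aligned_alloc(32, 4 * n * sizeof(double));  // one 4-row panel
	double x[8], y[4];

	for(int j = 0; j < n; j++)
		{
		x[j] = 1.0;
		for(int i = 0; i < 4; i++)
			A[i + 4*j] = (double)(i + j);                   // column j at A[4*j]
		}

	kernel_dgemv_n_4_lib4(n, A, x, y, 0);                   // y <- A*x (alg==0)

	free(A);
	}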
Example #18
0
void * worker(void * args)
{
	workerargs* wargs = (workerargs*) args;

	int start = wargs->start;
	int end = wargs->end;
	double dampener = wargs->dampener;
	double constant = wargs->constant;

	// double * curr_scores = wargs->curr_scores;
	// double * prev_scores = wargs->prev_scores;
	// int * index_edge = wargs->index_edge;

	// int * page_inlinks = wargs->page_inlinks;
	// int * inlinks = wargs->inlinks;
	// double * outlinks = wargs->outlinks;

	for (;;) /* Recompute the curr_scores as many times as needed */
	{
		int edge;

		if ((start < g_npages) && (g_nedges > 0)) //padding test
			edge = index_edge[start]; /* for edge count */

		/* Swap */
		// double* tmp = prev_scores;
		// prev_scores = curr_scores;
		// curr_scores = tmp;

		double thread_conv = 0.0;

		for (int i = start; i < end; ++i) /* Calculation for each page */
		{
			if (i >= g_npages)
				break; 

			double sum = 0.0;

			for (int j = 0; j < page_inlinks[i]; ++j) /* Calculation for each inlink */
			{
				sum += prev_scores[inlinks[edge]] * outlinks[edge];
				//printf("<%d: %f = %f * %f>\n", i, prev_scores[inlinks[edge]] / outlinks[edge], prev_scores[inlinks[edge]], outlinks[edge]);
				++edge;
			}

			curr_scores[i] = constant + dampener * sum;

			/* Non-SIMD calculation of thread_conv */
			//thread_conv += (curr_scores[i] - prev_scores[i])*(curr_scores[i] - prev_scores[i]);
			//printf("Thread %d: %f\n", wargs->i, thread_conv);
			
			/* SIMD (AVX) calculation of thread convergence */
			//Go to line 104
		}

		/* AVX version */
		int nblocks = (end - start) * 0.25;
		// printf("<thread: %d> <nblocks: %d> <start: %d> <end: %d>\n", wargs->i, nblocks, start, end);
		for (int i = 0; i < nblocks; i++)
		{
			__m256d* curr_block = (__m256d*) &curr_scores[start]; // Cast it instead. 
			__m256d* prev_block = (__m256d*) &prev_scores[start];

			__m256d m1 = _mm256_sub_pd(curr_block[i], prev_block[i]);
			__m256d m2 = _mm256_mul_pd(m1, m1);

			thread_conv += m2[0] + m2[1] + m2[2] + m2[3];
		}

		g_sumDiff[wargs->i] = thread_conv; /* Publish thread_conv so that the main thread can read it. */
		// wargs->curr_scores = curr_scores; /* This is just calibrating the curr_scores for the main thread */

		pthread_barrier_wait(&score_barrier); /* Inform that the score is done calculating everything for this run */
		
		/* Put manager function here (if I want one thread to work as a manager as well */
		/* if (wargs->i == 0) //manager thread */

		pthread_barrier_wait(&conv_barrier); /* Wait until converge calculation is complete */

		if (g_hasConverged)
		{
			break;
		}
	}

	return NULL;
}
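The convergence loop above sums the lanes of m2 with GCC's vector subscripting (m2[0] + m2[1] + m2[2] + m2[3]), which is a compiler extension. A portable horizontal-sum sketch using only AVX/SSE2 intrinsics (the helper name is ours, not from the original source); with it the accumulation would read thread_conv += hsum_pd(m2):

#include <immintrin.h>

static inline double hsum_pd(__m256d v)
{
	__m128d lo      = _mm256_castpd256_pd128(v);    /* v0, v1 */
	__m128d hi      = _mm256_extractf128_pd(v, 1);  /* v2, v3 */
	lo              = _mm_add_pd(lo, hi);           /* v0+v2, v1+v3 */
	__m128d swapped = _mm_unpackhi_pd(lo, lo);      /* v1+v3 in lane 0 */
	return _mm_cvtsd_f64(_mm_add_sd(lo, swapped));  /* (v0+v2) + (v1+v3) */
}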
Example #19
0
// it moves horizontally inside a block
void kernel_dtrmv_u_n_8_lib4(int kmax, double *A0, int sda, double *x, double *y, int alg)
	{

	if(kmax<=0) 
		return;
	
	double *A1 = A0 + 4*sda;

	const int lda = 4;
	
	int k;

	__m128d
		tmp0,
		z_0, y_0_1, a_00_10;

	__m256d
		zeros,
		ax_temp,
		a_00_10_20_30, a_01_11_21_31,
		a_40_50_60_70, a_41_51_61_71,
		x_0, x_1,
		y_0_1_2_3, y_0_1_2_3_b, z_0_1_2_3,
		y_4_5_6_7, y_4_5_6_7_b, z_4_5_6_7;
	
/*	y_0_1_2_3   = _mm256_setzero_pd();	*/
/*	y_4_5_6_7   = _mm256_setzero_pd();	*/
/*	y_0_1_2_3_b = _mm256_setzero_pd();	*/
/*	y_4_5_6_7_b = _mm256_setzero_pd();	*/
		
	zeros = _mm256_setzero_pd();
	
/*	y_0_1_2_3   = _mm256_setzero_pd();	*/
/*	y_0_1_2_3_b = _mm256_setzero_pd();	*/
/*	y_0_1_2_3_c = _mm256_setzero_pd();	*/
/*	y_0_1_2_3_d = _mm256_setzero_pd();*/
	
	// upper triangular

	// second col (avoid zero y_0_1)
	z_0     = _mm_loaddup_pd( &x[1] );
	a_00_10 = _mm_load_pd( &A0[0+lda*1] );
	y_0_1   = _mm_mul_pd( a_00_10, z_0 );

	// first col
	z_0     = _mm_load_sd( &x[0] );
	a_00_10 = _mm_load_sd( &A0[0+lda*0] );
	tmp0    = _mm_mul_sd( a_00_10, z_0 );
	y_0_1   = _mm_add_sd( y_0_1, tmp0 );
	y_0_1_2_3_b = _mm256_castpd128_pd256( y_0_1 );
	y_0_1_2_3_b = _mm256_blend_pd( y_0_1_2_3_b, zeros, 0xc ); // zero the upper lanes left undefined by the 128->256 cast

	// fourth col (avoid zero y_0_1_2_3)
	x_1     = _mm256_broadcast_sd( &x[3] );
	a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*3] );
	y_0_1_2_3 = _mm256_mul_pd( a_01_11_21_31, x_1 );

	// third col
	x_0     = _mm256_broadcast_sd( &x[2] );
	x_0     = _mm256_blend_pd( x_0, zeros, 0x8 );
	a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*2] );
	ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
	y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );


	A0 += 4*lda;
	A1 += 4*lda;
	x  += 4;


	// upper squared
	x_0 = _mm256_broadcast_sd( &x[0] );
	x_1 = _mm256_broadcast_sd( &x[1] );

	a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] );
	a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] );

	ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
	y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
	ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
	y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );

	x_0 = _mm256_broadcast_sd( &x[2] );
	x_1 = _mm256_broadcast_sd( &x[3] );

	a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*2] );
	a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*3] );

	ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
	y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
	ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
	y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );


	// lower triangular


	// second col (avoid zero y_0_1)
	z_0     = _mm_loaddup_pd( &x[1] );
	a_00_10 = _mm_load_pd( &A1[0+lda*1] );
	y_0_1   = _mm_mul_pd( a_00_10, z_0 );

	// first col
	z_0     = _mm_load_sd( &x[0] );
	a_00_10 = _mm_load_sd( &A1[0+lda*0] );
	tmp0    = _mm_mul_sd( a_00_10, z_0 );
	y_0_1   = _mm_add_sd( y_0_1, tmp0 );
	y_4_5_6_7_b = _mm256_castpd128_pd256( y_0_1 );
	y_4_5_6_7_b = _mm256_blend_pd( y_4_5_6_7_b, zeros, 0xc ); // zero the upper lanes left undefined by the 128->256 cast

	// fourth col (avoid zero y_4_5_6_7)
	x_1     = _mm256_broadcast_sd( &x[3] );
	a_01_11_21_31 = _mm256_load_pd( &A1[0+lda*3] );
	y_4_5_6_7 = _mm256_mul_pd( a_01_11_21_31, x_1 );

	// third col
	x_0     = _mm256_broadcast_sd( &x[2] );
	x_0     = _mm256_blend_pd( x_0, zeros, 0x8 );
	a_00_10_20_30 = _mm256_load_pd( &A1[0+lda*2] );
	ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
	y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp );


	A0 += 4*lda;
	A1 += 4*lda;
	x  += 4;


	k=8;
	for(; k<kmax-3; k+=4)
		{

/*		__builtin_prefetch( A0 + 4*lda );*/
/*		__builtin_prefetch( A1 + 4*lda );*/

		x_0 = _mm256_broadcast_sd( &x[0] );
		x_1 = _mm256_broadcast_sd( &x[1] );

		a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] );
		a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] );
		a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*1] );

		ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
		ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 );
		y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp );
		ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
		y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );
		ax_temp = _mm256_mul_pd( a_41_51_61_71, x_1 );
		y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp );

/*		__builtin_prefetch( A0 + 5*lda );*/
/*		__builtin_prefetch( A1 + 5*lda );*/

		x_0 = _mm256_broadcast_sd( &x[2] );
		x_1 = _mm256_broadcast_sd( &x[3] );

		a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*2] );
		a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*2] );
		a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*3] );
		a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*3] );

		ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
		ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 );
		y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp );
		ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
		y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );
		ax_temp = _mm256_mul_pd( a_41_51_61_71, x_1 );
		y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp );
	
		A0 += 4*lda;
		A1 += 4*lda;
		x  += 4;

		}
		
	if(kmax%4>=2)
		{

		x_0 = _mm256_broadcast_sd( &x[0] );
		x_1 = _mm256_broadcast_sd( &x[1] );

		a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] );
		a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] );
		a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*1] );

		ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
		ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 );
		y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp );
		ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
		y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );
		ax_temp = _mm256_mul_pd( a_41_51_61_71, x_1 );
		y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp );
		
		A0 += 2*lda;
		A1 += 2*lda;
		x  += 2;

		}
	
	y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_0_1_2_3_b );
	y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, y_4_5_6_7_b );

	if(kmax%2==1)
		{

		x_0 = _mm256_broadcast_sd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] );
		a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] );

		ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
		ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 );
		y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp );
		
/*		A0 += 1*lda;*/
/*		A1 += 1*lda;*/
/*		x  += 1;*/

		}

	if(alg==0)
		{
		_mm256_storeu_pd(&y[0], y_0_1_2_3);
		_mm256_storeu_pd(&y[4], y_4_5_6_7);
		}
	else if(alg==1)
		{
		z_0_1_2_3 = _mm256_loadu_pd( &y[0] );
		z_4_5_6_7 = _mm256_loadu_pd( &y[4] );

		z_0_1_2_3 = _mm256_add_pd( z_0_1_2_3, y_0_1_2_3 );
		z_4_5_6_7 = _mm256_add_pd( z_4_5_6_7, y_4_5_6_7 );

		_mm256_storeu_pd(&y[0], z_0_1_2_3);
		_mm256_storeu_pd(&y[4], z_4_5_6_7);
		}
	else // alg==-1
		{
		z_0_1_2_3 = _mm256_loadu_pd( &y[0] );
		z_4_5_6_7 = _mm256_loadu_pd( &y[4] );

		z_0_1_2_3 = _mm256_sub_pd( z_0_1_2_3, y_0_1_2_3 );
		z_4_5_6_7 = _mm256_sub_pd( z_4_5_6_7, y_4_5_6_7 );

		_mm256_storeu_pd(&y[0], z_0_1_2_3);
		_mm256_storeu_pd(&y[4], z_4_5_6_7);
		}

	}
Example #20
0
void ntt_transform(poly out, const poly o)
{ 
  int s, pos = 0, offset;
  __m256d vt,vo0,vo10,vo11,vo20,vo21,vo22,vo23,vc,vp,vpinv,neg2,neg4;
  __m256d vx0,vx1,vx2,vx3,vx4,vx5,vx6,vx7;
  
  vpinv = _mm256_set_pd(PARAM_APPROX_P_INVERSE, PARAM_APPROX_P_INVERSE, PARAM_APPROX_P_INVERSE, PARAM_APPROX_P_INVERSE);
  vp    = _mm256_set_pd(8383489., 8383489., 8383489., 8383489.);

  bitrev(out);

  vo10 = _mm256_load_pd(o+pos);
  vo20 = _mm256_load_pd(o+pos+4);
  neg2 = _mm256_load_pd(_neg2);
  neg4 = _mm256_load_pd(_neg4);
                                  
  // m = 2, m = 4, m = 8 (3 levels merged)
  for(s = 0; s<POLY_DEG; s+=8)
  {
    // No multiplication with omega required, respective value is 1
    vx0 = _mm256_load_pd(out+s);
    vt = _mm256_mul_pd(vx0,neg2);
    vx0 = _mm256_hadd_pd(vx0,vt);

    vx1 = _mm256_load_pd(out+s+4);
    vt = _mm256_mul_pd(vx1,neg2);
    vx1 = _mm256_hadd_pd(vx1,vt);

    vx0 = _mm256_mul_pd(vx0, vo10);
    vc = _mm256_mul_pd(vx0, vpinv);
    vc = _mm256_round_pd(vc,0x08);
    vc = _mm256_mul_pd(vc, vp);
    vx0 = _mm256_sub_pd(vx0,vc);
    vt = _mm256_permute2f128_pd (vx0, vx0, 0x01); // now contains x2,x3,x0,x1
    vx0 = _mm256_mul_pd(vx0, neg4);
    vx0 = _mm256_add_pd(vx0, vt);

    vx1 = _mm256_mul_pd(vx1, vo10);
    vc = _mm256_mul_pd(vx1, vpinv);
    vc = _mm256_round_pd(vc,0x08);
    vc = _mm256_mul_pd(vc, vp);
    vx1 = _mm256_sub_pd(vx1,vc);
    vt = _mm256_permute2f128_pd (vx1, vx1, 0x01); // now contains x2,x3,x0,x1
    vx1 = _mm256_mul_pd(vx1, neg4);
    vx1 = _mm256_add_pd(vx1, vt);

    vt = _mm256_mul_pd(vx1, vo20);
    vc = _mm256_mul_pd(vt, vpinv);
    vc = _mm256_round_pd(vc,0x08);
    vc = _mm256_mul_pd(vc, vp);
    vt = _mm256_sub_pd(vt,vc);
    vx1 = _mm256_sub_pd(vx0, vt);
    _mm256_store_pd(out+s+4, vx1);

    vx0 = _mm256_add_pd(vx0, vt);
    _mm256_store_pd(out+s+0, vx0);
  }
  
  pos += 8;

  // m = 16, m = 32, m = 64 (3 levels merged)
  for(offset = 0; offset < 8; offset+=4)
  {
    vo0 = _mm256_load_pd(o+pos+offset);
    vo10 = _mm256_load_pd(o+pos+offset+8);
    vo11 = _mm256_load_pd(o+pos+offset+16);

    for(s = 0; s<POLY_DEG; s+=64)
    {
      vx1 = _mm256_load_pd(out+offset+s+8);
      vt = _mm256_mul_pd(vx1, vo0);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      vx0 = _mm256_load_pd(out+offset+s+0);
      vx1 = _mm256_sub_pd(vx0, vt);
      //  _mm256_store_pd(out+offset+s+8, vx1);
      vx0 = _mm256_add_pd(vx0, vt);
      //  _mm256_store_pd(out+offset+s+0, vx0);

      vx3 = _mm256_load_pd(out+offset+s+24);
      vt = _mm256_mul_pd(vx3, vo0);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      vx2 = _mm256_load_pd(out+offset+s+16);
      vx3 = _mm256_sub_pd(vx2, vt);
      //  _mm256_store_pd(out+offset+s+24, vx3);
      vx2 = _mm256_add_pd(vx2, vt);
      //  _mm256_store_pd(out+offset+s+16, vx2);

      vx5 = _mm256_load_pd(out+offset+s+40);
      vt = _mm256_mul_pd(vx5, vo0);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      vx4 = _mm256_load_pd(out+offset+s+32);
      vx5 = _mm256_sub_pd(vx4, vt);
      //  _mm256_store_pd(out+offset+s+40, vx5);
      vx4 = _mm256_add_pd(vx4, vt);
      //  _mm256_store_pd(out+offset+s+32, vx4);

      vx7 = _mm256_load_pd(out+offset+s+56);
      vt = _mm256_mul_pd(vx7, vo0);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      vx6 = _mm256_load_pd(out+offset+s+48);
      vx7 = _mm256_sub_pd(vx6, vt);
      //  _mm256_store_pd(out+offset+s+56, vx7);
      vx6 = _mm256_add_pd(vx6, vt);
      //  _mm256_store_pd(out+offset+s+48, vx6);


      //  vx2 = _mm256_load_pd(out+offset+s+16);
      vt = _mm256_mul_pd(vx2, vo10);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //  vx0 = _mm256_load_pd(out+offset+s+0);
      vx2 = _mm256_sub_pd(vx0, vt);
      //  _mm256_store_pd(out+offset+s+16, vx2);
      vx0 = _mm256_add_pd(vx0, vt);
      //  _mm256_store_pd(out+offset+s+0, vx0);

      //  vx6 = _mm256_load_pd(out+offset+s+48);
      vt = _mm256_mul_pd(vx6, vo10);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //  vx4 = _mm256_load_pd(out+offset+s+32);
      vx6 = _mm256_sub_pd(vx4, vt);
      //  _mm256_store_pd(out+offset+s+48, vx6);
      vx4 = _mm256_add_pd(vx4, vt);
      //  _mm256_store_pd(out+offset+s+32, vx4);


      //  vx3 = _mm256_load_pd(out+offset+s+24);
      vt = _mm256_mul_pd(vx3, vo11);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //  vx1 = _mm256_load_pd(out+offset+s+8);
      vx3 = _mm256_sub_pd(vx1, vt);
      //  _mm256_store_pd(out+offset+s+24, vx3);
      vx1 = _mm256_add_pd(vx1, vt);
      //  _mm256_store_pd(out+offset+s+8, vx1);

      //  vx7 = _mm256_load_pd(out+offset+s+56);
      vt = _mm256_mul_pd(vx7, vo11);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //  vx5 = _mm256_load_pd(out+offset+s+40);
      vx7 = _mm256_sub_pd(vx5, vt);
      //  _mm256_store_pd(out+offset+s+56, vx7);
      vx5 = _mm256_add_pd(vx5, vt);
      //  _mm256_store_pd(out+offset+s+40, vx5);



      //  vx4 = _mm256_load_pd(out+offset+s+32);
      vo20 = _mm256_load_pd(o+pos+offset+24);
      vt = _mm256_mul_pd(vx4, vo20);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //  vx0 = _mm256_load_pd(out+offset+s+0);
      vx4 = _mm256_sub_pd(vx0, vt);
      _mm256_store_pd(out+offset+s+32, vx4);
      vx0 = _mm256_add_pd(vx0, vt);
      _mm256_store_pd(out+offset+s+0, vx0);

      //  vx5 = _mm256_load_pd(out+offset+s+40);
      vo21 = _mm256_load_pd(o+pos+offset+32);
      vt = _mm256_mul_pd(vx5, vo21);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //  vx1 = _mm256_load_pd(out+offset+s+8);
      vx5 = _mm256_sub_pd(vx1, vt);
      _mm256_store_pd(out+offset+s+40, vx5);
      vx1 = _mm256_add_pd(vx1, vt);
      _mm256_store_pd(out+offset+s+8, vx1);

      //  vx6 = _mm256_load_pd(out+offset+s+48);
      vo22 = _mm256_load_pd(o+pos+offset+40);
      vt = _mm256_mul_pd(vx6, vo22);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //  vx2 = _mm256_load_pd(out+offset+s+16);
      vx6 = _mm256_sub_pd(vx2, vt);
      _mm256_store_pd(out+offset+s+48, vx6);
      vx2 = _mm256_add_pd(vx2, vt);
      _mm256_store_pd(out+offset+s+16, vx2);

      //  vx7 = _mm256_load_pd(out+offset+s+56);
      vo23 = _mm256_load_pd(o+pos+offset+48);
      vt = _mm256_mul_pd(vx7, vo23);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //  vx3 = _mm256_load_pd(out+offset+s+24);
      vx7 = _mm256_sub_pd(vx3, vt);
      _mm256_store_pd(out+offset+s+56, vx7);
      vx3 = _mm256_add_pd(vx3, vt);
      _mm256_store_pd(out+offset+s+24, vx3);
    }
  }


  pos += 56;

  // m = 128, m=256, m=512 (3 levels merged)
  for(offset=0;offset<64;offset+=4)
  {
    vo0 = _mm256_load_pd(o+pos+offset);
    vo10 = _mm256_load_pd(o+pos+offset+64);
    vo11 = _mm256_load_pd(o+pos+offset+128);

    for(s = 0; s<POLY_DEG; s+=512)
    {
      vx1 = _mm256_load_pd(out+offset+s+64);
      vt = _mm256_mul_pd(vx1, vo0);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      vx0 = _mm256_load_pd(out+offset+s+0);
      vx1 = _mm256_sub_pd(vx0, vt);
      //_mm256_store_pd(out+offset+s+64, vx1);
      vx0 = _mm256_add_pd(vx0, vt);
      //_mm256_store_pd(out+offset+s+0, vx0);

      vx3 = _mm256_load_pd(out+offset+s+192);
      vt = _mm256_mul_pd(vx3, vo0);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      vx2 = _mm256_load_pd(out+offset+s+128);
      vx3 = _mm256_sub_pd(vx2, vt);
      //_mm256_store_pd(out+offset+s+192, vx3);
      vx2 = _mm256_add_pd(vx2, vt);
      //_mm256_store_pd(out+offset+s+128, vx2);

      vx5 = _mm256_load_pd(out+offset+s+320);
      vt = _mm256_mul_pd(vx5, vo0);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      vx4 = _mm256_load_pd(out+offset+s+256);
      vx5 = _mm256_sub_pd(vx4, vt);
      //_mm256_store_pd(out+offset+s+320, vx5);
      vx4 = _mm256_add_pd(vx4, vt);
      //_mm256_store_pd(out+offset+s+256, vx4);

      vx7 = _mm256_load_pd(out+offset+s+448);
      vt = _mm256_mul_pd(vx7, vo0);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      vx6 = _mm256_load_pd(out+offset+s+384);
      vx7 = _mm256_sub_pd(vx6, vt);
      //_mm256_store_pd(out+offset+s+448, vx7);
      vx6 = _mm256_add_pd(vx6, vt);
      //_mm256_store_pd(out+offset+s+384, vx6);

    

      //vx2 = _mm256_load_pd(out+offset+s+128);
      vt = _mm256_mul_pd(vx2, vo10);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //vx0 = _mm256_load_pd(out+offset+s+0);
      vx2 = _mm256_sub_pd(vx0, vt);
      //_mm256_store_pd(out+offset+s+128, vx2);
      vx0 = _mm256_add_pd(vx0, vt);
      //_mm256_store_pd(out+offset+s+0, vx0);

      //vx3 = _mm256_load_pd(out+offset+s+192);
      vt = _mm256_mul_pd(vx3, vo11);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //vx1 = _mm256_load_pd(out+offset+s+64);
      vx3 = _mm256_sub_pd(vx1, vt);
      //_mm256_store_pd(out+offset+s+192, vx3);
      vx1 = _mm256_add_pd(vx1, vt);
      //_mm256_store_pd(out+offset+s+64, vx1);

      //vx6 = _mm256_load_pd(out+offset+s+384);
      vt = _mm256_mul_pd(vx6, vo10);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //vx4 = _mm256_load_pd(out+offset+s+256);
      vx6 = _mm256_sub_pd(vx4, vt);
      //_mm256_store_pd(out+offset+s+384, vx6);
      vx4 = _mm256_add_pd(vx4, vt);
      //_mm256_store_pd(out+offset+s+256, vx4);

      //vx7 = _mm256_load_pd(out+offset+s+448);
      vt = _mm256_mul_pd(vx7, vo11);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //vx5 = _mm256_load_pd(out+offset+s+320);
      vx7 = _mm256_sub_pd(vx5, vt);
      //_mm256_store_pd(out+offset+s+448, vx7);
      vx5 = _mm256_add_pd(vx5, vt);
      //_mm256_store_pd(out+offset+s+320, vx5);


    
      //vx4 = _mm256_load_pd(out+offset+s+256);
      vo20 = _mm256_load_pd(o+pos+offset+192);
      vt = _mm256_mul_pd(vx4, vo20);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //vx0 = _mm256_load_pd(out+offset+s+0);
      vx4 = _mm256_sub_pd(vx0, vt);
      _mm256_store_pd(out+offset+s+256, vx4);
      vx0 = _mm256_add_pd(vx0, vt);
      _mm256_store_pd(out+offset+s+0, vx0);

      //vx5 = _mm256_load_pd(out+offset+s+320);
      vo21 = _mm256_load_pd(o+pos+offset+256);
      vt = _mm256_mul_pd(vx5, vo21);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //vx1 = _mm256_load_pd(out+offset+s+64);
      vx5 = _mm256_sub_pd(vx1, vt);
      _mm256_store_pd(out+offset+s+320, vx5);
      vx1 = _mm256_add_pd(vx1, vt);
      _mm256_store_pd(out+offset+s+64, vx1);

      //vx6 = _mm256_load_pd(out+offset+s+384);
      vo22 = _mm256_load_pd(o+pos+offset+320);
      vt = _mm256_mul_pd(vx6, vo22);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //vx2 = _mm256_load_pd(out+offset+s+128);
      vx6 = _mm256_sub_pd(vx2, vt);
      _mm256_store_pd(out+offset+s+384, vx6);
      vx2 = _mm256_add_pd(vx2, vt);
      _mm256_store_pd(out+offset+s+128, vx2);

      //vx7 = _mm256_load_pd(out+offset+s+448);
      vo23 = _mm256_load_pd(o+pos+offset+384);
      vt = _mm256_mul_pd(vx7, vo23);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //vx3 = _mm256_load_pd(out+offset+s+192);
      vx7 = _mm256_sub_pd(vx3, vt);
      _mm256_store_pd(out+offset+s+448, vx7);

      vx3 = _mm256_add_pd(vx3, vt);
      _mm256_store_pd(out+offset+s+192, vx3);
    }
  }
}
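Every butterfly above repeats the same floating-point modular reduction: multiply by a twiddle factor, estimate the quotient with the precomputed approximate inverse of p, round it, and subtract quotient*p. Factored into a helper purely for clarity (the helper itself is not part of the original source):

#include <immintrin.h>

/* Returns a*b reduced modulo p, where vp holds p and vpinv an approximation of 1/p. */
static inline __m256d mulmod_pd(__m256d a, __m256d b, __m256d vp, __m256d vpinv)
{
    __m256d t = _mm256_mul_pd(a, b);
    __m256d c = _mm256_mul_pd(t, vpinv);
    c = _mm256_round_pd(c, 0x08);        /* round to nearest, suppress exceptions */
    c = _mm256_mul_pd(c, vp);
    return _mm256_sub_pd(t, c);          /* t - round(t/p)*p */
}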
Example #21
0
// it moves horizontally inside a block (A upper triangular)
void kernel_dtrmv_u_n_4_lib4(int kmax, double *A, double *x, double *y, int alg)
	{

	if(kmax<=0) 
		return;
	
	const int lda = 4;
	
	int k;
	
	__m128d
		tmp0,
		z_0, y_0_1, a_00_10;

	__m256d
		zeros,
		ax_temp,
		a_00_10_20_30, a_01_11_21_31, a_02_12_22_32, a_03_13_23_33,
		x_0, x_1, x_2, x_3,
		y_0_1_2_3, y_0_1_2_3_b, y_0_1_2_3_c, y_0_1_2_3_d, z_0_1_2_3;
		
	zeros = _mm256_setzero_pd();
	
/*	y_0_1_2_3   = _mm256_setzero_pd();	*/
/*	y_0_1_2_3_b = _mm256_setzero_pd();	*/
/*	y_0_1_2_3_c = _mm256_setzero_pd();	*/
	y_0_1_2_3_d = _mm256_setzero_pd();
	
	// second col (avoid zero y_0_1)
	z_0     = _mm_loaddup_pd( &x[1] );
	a_00_10 = _mm_load_pd( &A[0+lda*1] );
	y_0_1   = _mm_mul_pd( a_00_10, z_0 );

	// first col
	z_0     = _mm_load_sd( &x[0] );
	a_00_10 = _mm_load_sd( &A[0+lda*0] );
	tmp0    = _mm_mul_sd( a_00_10, z_0 );
	y_0_1   = _mm_add_sd( y_0_1, tmp0 );
	y_0_1_2_3_c = _mm256_castpd128_pd256( y_0_1 );
	y_0_1_2_3_c = _mm256_blend_pd( y_0_1_2_3_c, y_0_1_2_3_d, 0xc );

	// fourth col (avoid zero y_0_1_2_3)
	x_3     = _mm256_broadcast_sd( &x[3] );
	a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );
	y_0_1_2_3 = _mm256_mul_pd( a_03_13_23_33, x_3 );

	// third col
	x_2     = _mm256_broadcast_sd( &x[2] );
	x_2     = _mm256_blend_pd( x_2, zeros, 0x8 );
	a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
	y_0_1_2_3_b = _mm256_mul_pd( a_02_12_22_32, x_2 );

	A += 4*lda;
	x += 4;

	k=4;
	for(; k<kmax-3; k+=4)
		{

		x_0 = _mm256_broadcast_sd( &x[0] );
		x_1 = _mm256_broadcast_sd( &x[1] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );

		x_2 = _mm256_broadcast_sd( &x[2] );
		x_3 = _mm256_broadcast_sd( &x[3] );

		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );

		ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
		ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
		y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );

		ax_temp = _mm256_mul_pd( a_02_12_22_32, x_2 );
		y_0_1_2_3_c = _mm256_add_pd( y_0_1_2_3_c, ax_temp );
		ax_temp = _mm256_mul_pd( a_03_13_23_33, x_3 );
		y_0_1_2_3_d = _mm256_add_pd( y_0_1_2_3_d, ax_temp );
		
		A += 4*lda;
		x += 4;

		}
	
	y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_0_1_2_3_c );
	y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, y_0_1_2_3_d );

	if(kmax%4>=2)
		{

		x_0 = _mm256_broadcast_sd( &x[0] );
		x_1 = _mm256_broadcast_sd( &x[1] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );

		ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
		ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
		y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );

		A += 2*lda;
		x += 2;

		}

	y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_0_1_2_3_b );

	if(kmax%2==1)
		{

		x_0 = _mm256_broadcast_sd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );

		ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
		
		}

	if(alg==0)
		{
		_mm256_storeu_pd(&y[0], y_0_1_2_3);
		}
	else if(alg==1)
		{
		z_0_1_2_3 = _mm256_loadu_pd( &y[0] );

		z_0_1_2_3 = _mm256_add_pd ( z_0_1_2_3, y_0_1_2_3 );

		_mm256_storeu_pd(&y[0], z_0_1_2_3);
		}
	else // alg==-1
		{
		z_0_1_2_3 = _mm256_loadu_pd( &y[0] );

		z_0_1_2_3 = _mm256_sub_pd ( z_0_1_2_3, y_0_1_2_3 );

		_mm256_storeu_pd(&y[0], z_0_1_2_3);
		}

	}
Example #22
0
BI_FORCE_INLINE inline avx_double operator-(const avx_double& o1,
    const double& o2) {
  avx_double res;
  res.packed = _mm256_sub_pd(o1.packed, _mm256_set1_pd(o2));
  return res;
}
Example #23
0
BI_FORCE_INLINE inline avx_double& operator-=(avx_double& o1,
    const avx_double& o2) {
  o1.packed = _mm256_sub_pd(o1.packed, o2.packed);
  return o1;
}
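A minimal, self-contained usage sketch of the two operators above. The struct is a stand-in assumption: all the snippets reveal is that avx_double wraps a __m256d member named packed.

#include <immintrin.h>

struct avx_double_sketch { __m256d packed; };   // stand-in for avx_double

inline avx_double_sketch operator-(const avx_double_sketch& o1, const double& o2) {
  return { _mm256_sub_pd(o1.packed, _mm256_set1_pd(o2)) };
}

inline avx_double_sketch& operator-=(avx_double_sketch& o1, const avx_double_sketch& o2) {
  o1.packed = _mm256_sub_pd(o1.packed, o2.packed);
  return o1;
}

int main() {
  avx_double_sketch a{_mm256_set1_pd(3.0)};
  avx_double_sketch b{_mm256_set1_pd(1.25)};
  a -= b;                         // a.packed == {1.75, 1.75, 1.75, 1.75}
  avx_double_sketch c = a - 0.5;  // c.packed == {1.25, 1.25, 1.25, 1.25}
  (void)c;
  return 0;
}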
Example #24
0
// Process audio effects for 8 channels simultaneously:
void processEffects(const vec8_i32 &inpSamples, vec8_i32 &outSamples, const long n)
{
    // Extract int samples and convert to doubles:
    const vec4_d64 ds0 = _mm256_div_pd(
        _mm256_cvtepi32_pd(_mm256_extractf128_si256(inpSamples, 0)),
        _mm256_set1_pd((double)INT_MAX)
        );
    const vec4_d64 ds1 = _mm256_div_pd(
        _mm256_cvtepi32_pd(_mm256_extractf128_si256(inpSamples, 1)),
        _mm256_set1_pd((double)INT_MAX)
        );

    // Monitor input levels:
    fx.fi_monitor.levels[n + 0] = scalar_to_dBFS(ds0);
    fx.fi_monitor.levels[n + 1] = scalar_to_dBFS(ds1);

    vec4_d64 s0, s1;

    // f0_gain:
    {
        s0 = _mm256_mul_pd(ds0, fx.f0_gain.calc.gain[n + 0]);
        s1 = _mm256_mul_pd(ds1, fx.f0_gain.calc.gain[n + 1]);
    }

    // Monitor levels:
    fx.f0_output.levels[n + 0] = scalar_to_dBFS(s0);
    fx.f0_output.levels[n + 1] = scalar_to_dBFS(s1);

    // f1_compressor:
    {
        const vec4_dBFS l0 = scalar_to_dBFS_offs(s0);
        const vec4_dBFS l1 = scalar_to_dBFS_offs(s1);

        // over = s - thresh
        vec4_dB over0 = _mm256_sub_pd(l0, fx.f1_compressor.input.threshold[n + 0]);
        vec4_dB over1 = _mm256_sub_pd(l1, fx.f1_compressor.input.threshold[n + 1]);

        // over = if over < 0.0 then 0.0 else over;
        over0 = mm256_if_then_else(_mm256_cmp_pd(over0, _mm256_set1_pd(0.0), _CMP_LT_OQ), _mm256_set1_pd(0.0), over0);
        over1 = mm256_if_then_else(_mm256_cmp_pd(over1, _mm256_set1_pd(0.0), _CMP_LT_OQ), _mm256_set1_pd(0.0), over1);

        // over += DC_OFFSET
        over0 = _mm256_add_pd(over0, DC_OFFSET);
        over1 = _mm256_add_pd(over1, DC_OFFSET);

        // env = over + coef * ( env - over )
        const vec4_dB attack_env0  = _mm256_add_pd(over0, _mm256_mul_pd(fx.f1_compressor.calc.attack_coef[n + 0], _mm256_sub_pd(fx.f1_compressor.state.env[n + 0], over0)));
        const vec4_dB attack_env1  = _mm256_add_pd(over1, _mm256_mul_pd(fx.f1_compressor.calc.attack_coef[n + 1], _mm256_sub_pd(fx.f1_compressor.state.env[n + 1], over1)));
        const vec4_dB release_env0  = _mm256_add_pd(over0, _mm256_mul_pd(fx.f1_compressor.calc.release_coef[n + 0], _mm256_sub_pd(fx.f1_compressor.state.env[n + 0], over0)));
        const vec4_dB release_env1  = _mm256_add_pd(over1, _mm256_mul_pd(fx.f1_compressor.calc.release_coef[n + 1], _mm256_sub_pd(fx.f1_compressor.state.env[n + 1], over1)));

        // env = if over > env then attack_env else release_env
        fx.f1_compressor.state.env[n + 0] = mm256_if_then_else(_mm256_cmp_pd(over0, fx.f1_compressor.state.env[n + 0], _CMP_GT_OQ), attack_env0, release_env0);
        fx.f1_compressor.state.env[n + 1] = mm256_if_then_else(_mm256_cmp_pd(over1, fx.f1_compressor.state.env[n + 1], _CMP_GT_OQ), attack_env1, release_env1);

        // over = env - DC_OFFSET
        over0 = _mm256_sub_pd(fx.f1_compressor.state.env[n + 0], DC_OFFSET);
        over1 = _mm256_sub_pd(fx.f1_compressor.state.env[n + 1], DC_OFFSET);

        // grdB = ( over * ( ratio - 1.0 ) )
        vec4_dB gr0dB = _mm256_mul_pd(over0, fx.f1_compressor.calc.ratio_min_1[n + 0]);
        vec4_dB gr1dB = _mm256_mul_pd(over1, fx.f1_compressor.calc.ratio_min_1[n + 1]);

        // gr = dB_to_scalar(grdB)
        fx.f1_compressor.monitor.gain_reduction[n + 0] = dB_to_scalar(gr0dB);
        fx.f1_compressor.monitor.gain_reduction[n + 1] = dB_to_scalar(gr1dB);

        // Apply gain reduction to inputs:
        s0 = _mm256_mul_pd(s0, fx.f1_compressor.monitor.gain_reduction[n + 0]);
        s1 = _mm256_mul_pd(s1, fx.f1_compressor.monitor.gain_reduction[n + 1]);

        // Apply make-up gain:
        s0 = _mm256_mul_pd(s0, fx.f1_compressor.calc.gain[n + 0]);
        s1 = _mm256_mul_pd(s1, fx.f1_compressor.calc.gain[n + 1]);
    }

    // Monitor output levels:
    fx.fo_monitor.levels[n + 0] = scalar_to_dBFS(s0);
    fx.fo_monitor.levels[n + 1] = scalar_to_dBFS(s1);

    // TODO(jsd): Better limiter implementation!
    // Limit final samples:
    s0 = _mm256_max_pd(_mm256_min_pd(s0, _mm256_set1_pd((double)1.0)), _mm256_set1_pd((double)-1.0));
    s1 = _mm256_max_pd(_mm256_min_pd(s1, _mm256_set1_pd((double)1.0)), _mm256_set1_pd((double)-1.0));

    // Convert doubles back to 32-bit ints:
    s0 = _mm256_mul_pd(s0, _mm256_set1_pd((double)INT_MAX));
    s1 = _mm256_mul_pd(s1, _mm256_set1_pd((double)INT_MAX));
    const vec8_i32 os = _mm256_setr_m128i(_mm256_cvtpd_epi32(s0), _mm256_cvtpd_epi32(s1));

    // Write outputs:
    _mm256_stream_si256(&outSamples, os);
}
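The compressor relies on an mm256_if_then_else helper that is not shown in this example. One common way to write such a lane-wise select with AVX, given the all-ones/all-zeros masks produced by _mm256_cmp_pd (an assumption about the helper, not its actual definition):

#include <immintrin.h>

// Per-lane select: where the mask lane is all-ones (comparison true) take a, otherwise b.
static inline __m256d mm256_if_then_else_sketch(__m256d mask, __m256d a, __m256d b)
{
    return _mm256_blendv_pd(b, a, mask);
}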
Example #25
0
/**
  * Calculate all values in one step per pixel. Requires grabbing the neighboring pixels.
  */
FORCE_INLINE double single_pixel(
        double *im, int center, int top, int left, int right, int bottom,
        const __m256i mask1110,
        const __m256d rgb0W,
        const __m256d onehalf,
        const __m256d minustwelvehalf){
//    double r = im[center];
//    double g = im[center+1];
//    double b = im[center+2];

//    double r1 = im[top];
//    double g1 = im[top+1];
//    double b1 = im[top+2];
//    double r2 = im[left];
//    double g2 = im[left+1];
//    double b2 = im[left+2];
//    double r3 = im[right];
//    double g3 = im[right+1];
//    double b3 = im[right+2];
//    double r4 = im[bottom];
//    double g4 = im[bottom+1];
//    double b4 = im[bottom+2];

    __m256d c = _mm256_maskload_pd(&(im[center]),mask1110);
    __m256d c1 = _mm256_loadu_pd(&(im[top]));
    __m256d c2 = _mm256_loadu_pd(&(im[left]));
    __m256d c3 = _mm256_loadu_pd(&(im[right]));
    __m256d c4 = _mm256_loadu_pd(&(im[bottom]));

    COST_INC_LOAD(20);

//    double grey = rw * r + gw * g + bw * b;
//    double grey1 = rw * r1 + gw * g1 + bw * b1;
//    double grey2 = rw * r2 + gw * g2 + bw * b2;
//    double grey3 = rw * r3 + gw * g3 + bw * b3;
//    double grey4 = rw * r4 + gw * g4 + bw * b4;

    __m256d greyc = _mm256_mul_pd(c,rgb0W);
    __m256d grey1 = _mm256_mul_pd(c1,rgb0W);
    __m256d grey2 = _mm256_mul_pd(c2,rgb0W);
    __m256d grey3 = _mm256_mul_pd(c3,rgb0W);
    __m256d grey4 = _mm256_mul_pd(c4,rgb0W);

    //AVX: double: horizontal add for 1 vector
     __m256d c_perm = _mm256_permute2f128_pd(c, c, 0b00100001);//1,2
     __m256d c_h = _mm256_hadd_pd(c,c_perm);
     __m128d c_h_lo = _mm256_extractf128_pd (c_h, 0);// lo
     __m128d c_h_hi = _mm256_extractf128_pd (c_h, 1);// hi
     double c_hsum_lo = _mm_cvtsd_f64(c_h_lo);
     double c_hsum_hi = _mm_cvtsd_f64(c_h_hi);
     double c_hsum = c_hsum_lo + c_hsum_hi;

     //AVX: double: horizontal add for 1 vector
      __m256d greyc_perm = _mm256_permute2f128_pd(greyc, greyc, 0b00100001);//1,2
      __m256d greyc_h = _mm256_hadd_pd(greyc,greyc_perm);
      __m128d greyc_h_lo = _mm256_extractf128_pd (greyc_h, 0);// lo
      __m128d greyc_h_hi = _mm256_extractf128_pd (greyc_h, 1);// hi
      double greyc_hsum_lo = _mm_cvtsd_f64(greyc_h_lo);
      double greyc_hsum_hi = _mm_cvtsd_f64(greyc_h_hi);
      double greyc_hsum = greyc_hsum_lo + greyc_hsum_hi;

    //AVX: _m256d: horizontal add for 4 vectors at once
    __m256d grey12 = _mm256_hadd_pd(grey1,grey2);
    __m256d grey34 = _mm256_hadd_pd(grey3,grey4);
    __m256d grey_1234_blend = _mm256_blend_pd(grey12, grey34, 0b1100); //0011
    __m256d grey_1234_perm = _mm256_permute2f128_pd(grey12, grey34, 0b00100001);//1,2
    __m256d grey_1234 =  _mm256_add_pd(grey_1234_perm, grey_1234_blend);

    //AVX: double: horizontal add for 1 vector
     __m256d grey1234_perm = _mm256_permute2f128_pd(grey_1234, grey_1234, 0b00100001);//1,2
     __m256d grey1234_h = _mm256_hadd_pd(grey_1234,grey1234_perm);
     __m128d grey1234_h_lo = _mm256_extractf128_pd (grey1234_h, 0);// lo
     __m128d grey1234_h_hi = _mm256_extractf128_pd (grey1234_h, 1);// hi
     double grey1234_hsum_lo = _mm_cvtsd_f64(grey1234_h_lo);
     double grey1234_hsum_hi = _mm_cvtsd_f64(grey1234_h_hi);
     double grey1234_sum = grey1234_hsum_lo + grey1234_hsum_hi;

    COST_INC_ADD(10); //+ operations wasted on AVX
    COST_INC_MUL(15); //+ operations wasted on AVX

    double mu = c_hsum / 3.0;
    COST_INC_ADD(2);
    COST_INC_DIV(1);

//    double rmu = r-mu;
//    double gmu = g-mu;
//    double bmu = b-mu;

    __m256d c_mu = _mm256_set1_pd(mu);
    __m256d c_rgbmu = _mm256_sub_pd(c,c_mu);
    COST_INC_ADD(3); //+1 operations wasted on AVX

//    double rz = r-0.5;
//    double gz = g-0.5;
//    double bz = b-0.5;

    __m256d c_rgbz = _mm256_sub_pd(c,onehalf);
    COST_INC_ADD(3); //+1 operations wasted on AVX

//    double rzrz = rz*rz;
//    double gzgz = gz*gz;
//    double bzbz = bz*bz;

    __m256d c_rgbz_sq = _mm256_mul_pd(c_rgbz,c_rgbz);
    COST_INC_MUL(3); //+1 operations wasted on AVX

//    double re = exp(-12.5*rzrz);
//    double ge = exp(-12.5*gzgz);
//    double be = exp(-12.5*bzbz);

    __m256d c_rgbe_tmp = _mm256_mul_pd(minustwelvehalf,c_rgbz_sq);

    __m128 c_rgbe_tmp_ps = _mm256_cvtpd_ps(c_rgbe_tmp);
    __m128 c_rgbe_ps = exp_ps(c_rgbe_tmp_ps);
    __m256d c_rgbe = _mm256_cvtps_pd(c_rgbe_ps);

    COST_INC_EXP(3);
    COST_INC_MUL(3); //+1 operations wasted on AVX

//    double t1 = sqrt((rmu*rmu + gmu*gmu + bmu*bmu)/3.0);
    __m256d c_rgbmu_sq = _mm256_mul_pd(c_rgbmu,c_rgbmu);

    __m128d t1_tmp1_lo = _mm256_extractf128_pd (c_rgbmu_sq, 0);// lo
    __m128d t1_tmp1_hi = _mm256_extractf128_pd (c_rgbmu_sq, 1);// hi
    __m128d t1_tmp1_lo_sum = _mm_hadd_pd (t1_tmp1_lo, t1_tmp1_lo);
    double t1_tmp1_hi_lo = _mm_cvtsd_f64(t1_tmp1_hi);
    double t1_tmp1_lo_sum_lo = _mm_cvtsd_f64(t1_tmp1_lo_sum);

    double t1_tmp1 = t1_tmp1_lo_sum_lo + t1_tmp1_hi_lo;

    double t1_tmp2 = t1_tmp1 / 3.0;
    double t1 = sqrt(t1_tmp2);

    COST_INC_SQRT(1);
    COST_INC_ADD(3);
    COST_INC_MUL(3); //+1 operations wasted on AVX
    COST_INC_DIV(1);
    double t2 = fabs(t1);
    COST_INC_ABS(1);

//    double t3 = re*ge*be;

    __m128d t3_tmp1_lo = _mm256_extractf128_pd (c_rgbe, 0);// lo
    __m128d t3_tmp1_hi = _mm256_extractf128_pd (c_rgbe, 1);// hi

    double t3_tmp1_lo_lo = _mm_cvtsd_f64(t3_tmp1_lo);
    double t3_tmp1_hi_lo = _mm_cvtsd_f64(t3_tmp1_hi);
    __m128d t3_tmp1_lo_swapped = _mm_permute_pd(t3_tmp1_lo, 1);// swap
    double t3_tmp1_lo_hi = _mm_cvtsd_f64(t3_tmp1_lo_swapped);

    double t3 = t3_tmp1_lo_lo * t3_tmp1_lo_hi * t3_tmp1_hi_lo;

    COST_INC_MUL(2);
    double t4 = fabs(t3);
    COST_INC_ABS(1);

    double t5 = t2 * t4;
    COST_INC_MUL(1);

//    double t6 = -4.0*grey+grey1+grey2+grey3+grey4;

    double minusfour_times_grey = -4.0*greyc_hsum;
    double t6 = minusfour_times_grey+grey1234_sum;

    COST_INC_MUL(1);
    COST_INC_ADD(2); //2 operations saved due to AVX

    double t7 = fabs(t6);
    COST_INC_ABS(1);

    double t8 = t5 * t7;
    COST_INC_MUL(1);

    double t9 = t8 + 1.0E-12;
    COST_INC_ADD(1);

    return t9;
}
Example #26
0
 /*!
  * \brief Subtract the two given values and return the result.
  */
 ETL_STATIC_INLINE(avx_simd_double) sub(avx_simd_double lhs, avx_simd_double rhs) {
     return _mm256_sub_pd(lhs.value, rhs.value);
 }
Example #27
0
DBL AVX2FMA3Noise(const Vector3d& EPoint, int noise_generator)
{
    AVX2TABLETYPE *mp;
    DBL sum = 0.0;

    // TODO FIXME - global statistics reference
    // Stats[Calls_To_Noise]++;

    if (noise_generator == kNoiseGen_Perlin)
    {
        // The 1.59 and 0.985 are to correct for some biasing problems with
        // the random # generator used to create the noise tables.  Final
        // range of values is about 5.0e-4 below 0.0 and above 1.0.  Mean
        // value is 0.49 (ideally it would be 0.5).
        sum = 0.5 * (1.59 * SolidNoise(EPoint) + 0.985);

        // Clamp final value to 0-1 range
        if (sum < 0.0) sum = 0.0;
        if (sum > 1.0) sum = 1.0;

        return sum;
    }

    const __m256d ONE_PD = _mm256_set1_pd(1);
    const __m128i short_si128 = _mm_set1_epi32(0xffff);

    const __m256d xyzn = _mm256_setr_pd(EPoint[X], EPoint[Y], EPoint[Z], 0);
    const __m256d epsy = _mm256_set1_pd(1.0 - EPSILON);
    const __m256d xyzn_e = _mm256_sub_pd(xyzn, epsy);
    const __m128i tmp_xyzn = _mm256_cvttpd_epi32(_mm256_blendv_pd(xyzn, xyzn_e, xyzn));

    const __m128i noise_min_xyzn = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, NOISE_MINZ, 0);

    const __m256d xyz_ixyzn = _mm256_sub_pd(xyzn, _mm256_cvtepi32_pd(tmp_xyzn));
    const __m256d xyz_jxyzn = _mm256_sub_pd(xyz_ixyzn, ONE_PD);

    const __m128i i_xyzn = _mm_and_si128(_mm_sub_epi32(tmp_xyzn, noise_min_xyzn),
        _mm_set1_epi32(0xfff));

    const __m256d s_xyzn = _mm256_mul_pd(xyz_ixyzn,
        _mm256_mul_pd(xyz_ixyzn,
            _mm256_sub_pd(_mm256_set1_pd(3.0),
                _mm256_add_pd(xyz_ixyzn, xyz_ixyzn))));

    const __m256d t_xyzn = _mm256_sub_pd(ONE_PD, s_xyzn);

    const __m256d txtysxsy = _mm256_permute2f128_pd(t_xyzn, s_xyzn, 0x20);
    const __m256d txsxtxsx = PERMUTE4x64(txtysxsy, _MM_SHUFFLE(2, 0, 2, 0));
    const __m256d tytysysy = PERMUTE4x64(txtysxsy, _MM_SHUFFLE(3, 3, 1, 1));

    const __m256d txtysxtytxsysxsy = _mm256_mul_pd(txsxtxsx, tytysysy);

    const __m256d incrsump_s1 = _mm256_mul_pd(txtysxtytxsysxsy, PERMUTE4x64(t_xyzn, _MM_SHUFFLE(2, 2, 2, 2)));
    const __m256d incrsump_s2 = _mm256_mul_pd(txtysxtytxsysxsy, PERMUTE4x64(s_xyzn, _MM_SHUFFLE(2, 2, 2, 2)));

    int ints[4];
    _mm_storeu_si128((__m128i*)(ints), i_xyzn);

    const int ixiy_hash = Hash2d(ints[0], ints[1]);
    const int jxiy_hash = Hash2d(ints[0] + 1, ints[1]);
    const int ixjy_hash = Hash2d(ints[0], ints[1] + 1);
    const int jxjy_hash = Hash2d(ints[0] + 1, ints[1] + 1);

    const int iz = ints[2];

    const __m256d iii = _mm256_blend_pd(PERMUTE4x64(xyz_ixyzn, _MM_SHUFFLE(2, 1, 0, 0)), _mm256_set_pd(0, 0, 0, 0.5), 0x1);
    const __m256d jjj = _mm256_blend_pd(PERMUTE4x64(xyz_jxyzn, _MM_SHUFFLE(2, 1, 0, 0)), _mm256_set_pd(0, 0, 0, 0.5), 0x1);

    __m256d sumr = _mm256_setzero_pd();
    __m256d sumr1 = _mm256_setzero_pd();


    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixiy_hash, iz)];
    INCSUMAVX_NOBLEND(sumr, mp, PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(0, 0, 0, 0)), iii);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxiy_hash, iz)];
    INCSUMAVX(sumr1, mp, PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(1, 1, 1, 1)), iii, jjj, 2);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixjy_hash, iz)];
    INCSUMAVX(sumr, mp, PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(2, 2, 2, 2)), iii, jjj, 4);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxjy_hash, iz)];
    INCSUMAVX(sumr1, mp, PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(3, 3, 3, 3)), iii, jjj, 6);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixiy_hash, iz + 1)];
    INCSUMAVX(sumr, mp, PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(0, 0, 0, 0)), iii, jjj, 8);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxiy_hash, iz + 1)];
    INCSUMAVX(sumr1, mp, PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(1, 1, 1, 1)), iii, jjj, 10);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixjy_hash, iz + 1)];
    INCSUMAVX(sumr, mp, PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(2, 2, 2, 2)), iii, jjj, 12);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxjy_hash, iz + 1)];
    INCSUMAVX_NOBLEND(sumr1, mp, PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(3, 3, 3, 3)), jjj);

    {
        sumr = _mm256_add_pd(sumr, sumr1);

        __m128d sumr_up = _mm256_extractf128_pd(sumr,1);
        sumr_up = _mm_add_pd(_mm256_castpd256_pd128(sumr),sumr_up);
        sumr_up = _mm_hadd_pd(sumr_up,sumr_up);
        sum = _mm_cvtsd_f64(sumr_up);
    }

    if (noise_generator == kNoiseGen_RangeCorrected)
    {
        /* details of range here:
        Min, max: -1.05242, 0.988997
        Mean: -0.0191481, Median: -0.535493, Std Dev: 0.256828

        We want to change it to as close to [0,1] as possible.
        */
        sum += 1.05242;
        sum *= 0.48985582;
        /*sum *= 0.5;
        sum += 0.5;*/

        if (sum < 0.0)
            sum = 0.0;
        if (sum > 1.0)
            sum = 1.0;
    }
    else
    {
        sum = sum + 0.5;                     /* range at this point -0.5 - 0.5... */

        if (sum < 0.0)
            sum = 0.0;
        if (sum > 1.0)
            sum = 1.0;
    }



#if CHECK_FUNCTIONAL
    {
        DBL orig_sum = PortableNoise(EPoint, noise_generator);
        if (fabs(orig_sum - sum) >= EPSILON)
        {
            throw POV_EXCEPTION_STRING("Noise error");
        }

    }

#endif

    _mm256_zeroupper();
    return (sum);
}
Example #28
0
void CalculateBasisComponents(const MDoubleArray& weights, const BaryCoords& coords,
                              const MIntArray& triangleVertices, const MPointArray& points,
                              const MFloatVectorArray& normals, const MIntArray& sampleIds,
                              double* alignedStorage,
                              MPoint& origin, MVector& up, MVector& normal) {
  // Start with the recreated point and normal using the barycentric coordinates of the hit point.
  unsigned int hitIndex = weights.length()-1;
#ifdef __AVX__
  __m256d originV = Dot4<MPoint>(coords[0], coords[1], coords[2], 0.0,
                                points[triangleVertices[0]], points[triangleVertices[1]],
                                points[triangleVertices[2]], MPoint::origin);
  __m256d hitNormalV = Dot4<MVector>(coords[0], coords[1], coords[2], 0.0,
                                normals[triangleVertices[0]], normals[triangleVertices[1]],
                                normals[triangleVertices[2]], MVector::zero);
  __m256d hitWeightV = _mm256_set1_pd(weights[hitIndex]);
  // Create the barycentric point and normal.
  __m256d normalV = _mm256_mul_pd(hitNormalV, hitWeightV);
  // Then use the weighted adjacent data.
  for (unsigned int j = 0; j < hitIndex; j += 4) {
    __m256d tempNormal = Dot4<MVector>(weights[j], weights[j+1], weights[j+2], weights[j+3],
                                       normals[sampleIds[j]], normals[sampleIds[j+1]],
                                       normals[sampleIds[j+2]], normals[sampleIds[j+3]]);
    normalV = _mm256_add_pd(tempNormal, normalV);
  }

  _mm256_store_pd(alignedStorage, originV);
  origin.x = alignedStorage[0];
  origin.y = alignedStorage[1];
  origin.z = alignedStorage[2];
  _mm256_store_pd(alignedStorage, normalV);
  normal.x = alignedStorage[0];
  normal.y = alignedStorage[1];
  normal.z = alignedStorage[2];

  // Calculate the up vector
  const MPoint& pt1 = points[triangleVertices[0]];
  const MPoint& pt2 = points[triangleVertices[1]];
  __m256d p1 = _mm256_set_pd(pt1.w, pt1.z, pt1.y, pt1.x);
  __m256d p2 = _mm256_set_pd(pt2.w, pt2.z, pt2.y, pt2.x);
  p1 = _mm256_add_pd(p1, p2);
  __m256d half = _mm256_set_pd(0.5, 0.5, 0.5, 0.5);
  p1 = _mm256_mul_pd(p1, half);
  __m256d upV = _mm256_sub_pd(p1, originV);
  _mm256_store_pd(alignedStorage, upV);
  up.x = alignedStorage[0];
  up.y = alignedStorage[1];
  up.z = alignedStorage[2];
#else
  MVector hitNormal;
  // Create the barycentric point and normal.
  for (int i = 0; i < 3; ++i) {
    origin += points[triangleVertices[i]] * coords[i];
    hitNormal += MVector(normals[triangleVertices[i]]) * coords[i];
  }
  // Use crawl data to calculate normal
  normal = hitNormal * weights[hitIndex];
  for (unsigned int j = 0; j < hitIndex; j++) {
    normal += MVector(normals[sampleIds[j]]) * weights[j];
  }

  // Calculate the up vector
  // The triangle vertices are sorted by decreasing barycentric coordinates so the first two are
  // the two closest vertices in the triangle.
  up = ((points[triangleVertices[0]] + points[triangleVertices[1]]) * 0.5) - origin;
#endif
  normal.normalize();
  GetValidUp(weights, points, sampleIds, origin, normal, up);
}
Example #29
0
void kernel_dgemv_t_4_lib4(int kmax, double *A, int sda, double *x, double *y, int alg)
	{

	if(kmax<=0) 
		return;
	
	const int lda = 4;
	
	__builtin_prefetch( A + 0*lda );
	__builtin_prefetch( A + 2*lda );

	double *tA, *tx;

	int k;
	int ka = kmax; // number of elements from the aligned position
	
	__m256d
		aaxx_temp,
		a_00_10_20_30, a_01_11_21_31, a_02_12_22_32, a_03_13_23_33,
		x_0_1_2_3,
		y_00, y_11, y_22, y_33;
	
	__m128d
		ax_temp,
		a_00_10, a_01_11, a_02_12, a_03_13,
		x_0_1,
		y_0, y_1, y_2, y_3;
	
	y_00 = _mm256_setzero_pd();
	y_11 = _mm256_setzero_pd();
	y_22 = _mm256_setzero_pd();
	y_33 = _mm256_setzero_pd();
	
	y_0 = _mm256_castpd256_pd128(y_00);
	y_1 = _mm256_castpd256_pd128(y_11);
	y_2 = _mm256_castpd256_pd128(y_22);
	y_3 = _mm256_castpd256_pd128(y_33);

	k = lda*(ka/lda);
	tA = A + (ka/lda)*sda*lda;
	tx = x + (ka/lda)*lda;

	for(; k<ka; k++)
		{
		x_0_1 = _mm_load_sd( &tx[0] );

		a_00_10 = _mm_load_sd( &tA[0+lda*0] );
		a_01_11 = _mm_load_sd( &tA[0+lda*1] );
		a_02_12 = _mm_load_sd( &tA[0+lda*2] );
		a_03_13 = _mm_load_sd( &tA[0+lda*3] );
		
		ax_temp = _mm_mul_sd( a_00_10, x_0_1 );
		y_0 = _mm_add_sd( y_0, ax_temp );
		ax_temp = _mm_mul_sd( a_01_11, x_0_1 );
		y_1 = _mm_add_sd( y_1, ax_temp );
		ax_temp = _mm_mul_sd( a_02_12, x_0_1 );
		y_2 = _mm_add_sd( y_2, ax_temp );
		ax_temp = _mm_mul_sd( a_03_13, x_0_1 );
		y_3 = _mm_add_sd( y_3, ax_temp );
		
		tA += 1;
		tx += 1;

		}

	y_00 = _mm256_castpd128_pd256(y_0);
	y_11 = _mm256_castpd128_pd256(y_1);
	y_22 = _mm256_castpd128_pd256(y_2);
	y_33 = _mm256_castpd128_pd256(y_3);

	k=0;
	for(; k<ka-7; k+=8)
		{
		
		__builtin_prefetch( A + sda*lda + 0*lda );
		__builtin_prefetch( A + sda*lda + 2*lda );

		x_0_1_2_3 = _mm256_loadu_pd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );
		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );
		
		aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
		y_00 = _mm256_add_pd( y_00, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
		y_11 = _mm256_add_pd( y_11, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
		y_22 = _mm256_add_pd( y_22, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
		y_33 = _mm256_add_pd( y_33, aaxx_temp );
		
		A += 4 + (sda-1)*lda;
		x += 4;


		__builtin_prefetch( A + sda*lda + 0*lda );
		__builtin_prefetch( A + sda*lda + 2*lda );

		x_0_1_2_3 = _mm256_loadu_pd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );
		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );
		
		aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
		y_00 = _mm256_add_pd( y_00, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
		y_11 = _mm256_add_pd( y_11, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
		y_22 = _mm256_add_pd( y_22, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
		y_33 = _mm256_add_pd( y_33, aaxx_temp );
		
		A += 4 + (sda-1)*lda;
		x += 4;

		}
	for(; k<ka-3; k+=4)
		{
		
		__builtin_prefetch( A + sda*lda + 0*lda );
		__builtin_prefetch( A + sda*lda + 2*lda );

		x_0_1_2_3 = _mm256_loadu_pd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );
		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );
		
		aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
		y_00 = _mm256_add_pd( y_00, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
		y_11 = _mm256_add_pd( y_11, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
		y_22 = _mm256_add_pd( y_22, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
		y_33 = _mm256_add_pd( y_33, aaxx_temp );
		
		A += 4 + (sda-1)*lda;
		x += 4;

		}

	__m256d
		y_0_1_2_3;

	y_00 = _mm256_hadd_pd(y_00, y_11);
	y_22 = _mm256_hadd_pd(y_22, y_33);

	y_11 = _mm256_permute2f128_pd(y_22, y_00, 2 );	
	y_00 = _mm256_permute2f128_pd(y_22, y_00, 19);	

	y_00 = _mm256_add_pd( y_00, y_11 );

	if(alg==0)
		{
		_mm256_storeu_pd(&y[0], y_00);
		}
	else if(alg==1)
		{
		y_0_1_2_3 = _mm256_loadu_pd( &y[0] );

		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_00 );

		_mm256_storeu_pd(&y[0], y_0_1_2_3);
		}
	else // alg==-1
		{
		y_0_1_2_3 = _mm256_loadu_pd( &y[0] );
	
		y_0_1_2_3 = _mm256_sub_pd( y_0_1_2_3, y_00 );
	
		_mm256_storeu_pd(&y[0], y_0_1_2_3);
		}

	}