Example #1
        void dmul(unsigned int N, const double* a, const double* b, double* y) {
            flops_counter += N ;
#ifdef GX_SSE
            if(SSE2_supported) {
                __m128d Y1, Y2, A1, A2, B1, B2 ;
                unsigned int i = 0 ;
                while(i<N) {
                    _mm_prefetch((const char*)(&a[i] + 256), _MM_HINT_NTA) ;
                    _mm_prefetch((const char*)(&b[i] + 256), _MM_HINT_NTA) ;
                    A1 = _mm_load_pd(&a[i]) ;
                    B1 = _mm_load_pd(&b[i]) ;
                    Y1 = _mm_mul_pd(A1,B1) ;
                    i += 2 ;
                    A2 = _mm_load_pd(&a[i]) ;
                    B2 = _mm_load_pd(&b[i]) ;
                    Y2 = _mm_mul_pd(A2,B2) ;
                    i += 2 ;
                    _mm_stream_pd(&y[i - 4], Y1) ;
                    _mm_stream_pd(&y[i - 2], Y2) ;
                }
                _mm_sfence() ;
                return ;
            }
#endif
            for(unsigned int i=0; i<N; i++) {
                y[i] = a[i] * b[i] ;
            }
        }
Example #2
        void  daxpy(unsigned int N, double a, const double* x, double* y) {
            flops_counter += (2*N) ;
#ifdef GX_SSE
            if(SSE2_supported) {
                __m128d Y1, Y2, X1, X2, AA ;
                SSE_ALIGNED(double temp[2]) ;
                temp[0] = a ; temp[1] = a ;
                AA = _mm_load_pd(temp) ;
                unsigned int i = 0 ;
                while(i<N) {
                    _mm_prefetch((const char*)(&x[i] + 128), _MM_HINT_NTA) ;
                    _mm_prefetch((const char*)(&y[i] + 128), _MM_HINT_NTA) ;
                    X1 = _mm_load_pd(&x[i]) ;
                    Y1 = _mm_load_pd(&y[i]) ;
                    Y1 = _mm_add_pd(Y1, _mm_mul_pd(X1, AA)) ;
                    i += 2 ;
                    X2 = _mm_load_pd(&x[i]) ;
                    Y2 = _mm_load_pd(&y[i]) ;
                    Y2 = _mm_add_pd(Y2, _mm_mul_pd(X2, AA)) ;
                    i += 2 ;
                    _mm_stream_pd(&y[i - 4], Y1) ;
                    _mm_stream_pd(&y[i - 2], Y2) ;
                }
                _mm_sfence() ;
                return ;
            }
#endif
            for(unsigned int i=0; i<N; i++) {
                y[i] += a * x[i] ;
            }
        }
Example #3
        double  ddot(unsigned int N, const double* x, const double* y)  {
            flops_counter += (2*N) ;
#ifdef GX_SSE
            if(SSE2_supported) {
                __m128d X1, Y1, X2, Y2 ;
                __m128d acc1 = _mm_setzero_pd() ;
                __m128d acc2 = _mm_setzero_pd() ;
                SSE_ALIGNED(double temp[2]) ;
                unsigned int i = 0 ;
                while(i<N) {
                    _mm_prefetch((const char*)(&x[i] + 128), _MM_HINT_NTA) ;
                    _mm_prefetch((const char*)(&y[i] + 128), _MM_HINT_NTA) ;
                    X1 = _mm_load_pd(&x[i]) ;
                    Y1 = _mm_load_pd(&y[i]) ;
                    acc1 = _mm_add_pd(acc1, _mm_mul_pd(X1,Y1)) ;
                    i += 2 ;
                    X2 = _mm_load_pd(&x[i]) ;
                    Y2 = _mm_load_pd(&y[i]) ;
                    acc2 = _mm_add_pd(acc2, _mm_mul_pd(X2,Y2)) ;
                    i += 2 ;
                }
                acc1 = _mm_add_pd(acc1, acc2) ;
                _mm_store_pd(temp, acc1)  ;
                return temp[0] + temp[1] ;
            }
#endif
            double result = 0.0 ;
            for(unsigned int i=0; i<N; i++) {
                result += x[i]*y[i] ;
            }
            return result ;
        }
Example #4
// Faster than multiply when you have to multiply many vectors by the same matrix.
// Using this function, we can efficiently prefetch data and only have to
// transpose the matrix once.
void Mat44::BatchMult(const float4 * const in, float4 *out, u32 len) const
{
	Mat44 tr = Transpose();
	__m128 matcols[] =
	{
		_mm_load_ps(tr.mat),
		_mm_load_ps(tr.mat+4),
		_mm_load_ps(tr.mat+8),
		_mm_load_ps(tr.mat+12)
	};

	while(len--)
	{
		__m128 v = _mm_load_ps(in[len].GetVec());
		_mm_prefetch((const char*)&in[len+1], _MM_HINT_T0);

		// Broadcast vector into SSE registers
		__m128 xb = _mm_shuffle_ps(v,v,0x00);
		__m128 yb = _mm_shuffle_ps(v,v,0x55);
		__m128 zb = _mm_shuffle_ps(v,v,0xAA);
		__m128 wb = _mm_shuffle_ps(v,v,0xFF);

		// Perform multiplication by matrix columns
		xb = _mm_mul_ps(xb, matcols[0]);
		yb = _mm_mul_ps(yb, matcols[1]);
		zb = _mm_mul_ps(zb, matcols[2]);
		wb = _mm_mul_ps(wb, matcols[3]);

		// Add results
		__m128 r = _mm_add_ps(_mm_add_ps(xb, yb),_mm_add_ps(zb, wb));

		_mm_prefetch((const char*)&out[len+1], _MM_HINT_T0);
		_mm_store_ps(out[len].GetVec(), r);
	}
};
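The broadcast-and-multiply pattern above is just a column-weighted sum per vector. For reference, a minimal scalar sketch of one iteration (a hypothetical helper, not part of Mat44; it assumes the rows of Transpose() loaded into matcols[] act as the matrix columns):

// Hypothetical scalar equivalent of one BatchMult iteration:
// out = x*col0 + y*col1 + z*col2 + w*col3
static void mat44_mul_vec_scalar(const float cols[4][4], const float v[4], float out[4])
{
	for (int i = 0; i < 4; ++i)
		out[i] = v[0]*cols[0][i] + v[1]*cols[1][i] + v[2]*cols[2][i] + v[3]*cols[3][i];
}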
Example #5
void Lerp_SSE2(void* dest, const void* source1, const void* source2, float alpha, size_t size)
{
	static const size_t stride = sizeof(__m128i)*4;
	static const u32 PSD = 64;
	
	static const __m128i lomask = _mm_set1_epi32(0x00FF00FF);
	static const __m128i round = _mm_set1_epi16(128);

	assert(source1 != NULL && source2 != NULL && dest != NULL);
	assert(size % stride == 0);
	assert(alpha >= 0.0 && alpha <= 1.0);

	const __m128i* source128_1 = reinterpret_cast<const __m128i*>(source1);
	const __m128i* source128_2 = reinterpret_cast<const __m128i*>(source2);
	__m128i* dest128 = reinterpret_cast<__m128i*>(dest);

	__m128i s = _mm_setzero_si128();
	__m128i d = _mm_setzero_si128();
	const __m128i a = _mm_set1_epi16(static_cast<u8>(alpha*256.0f+0.5f));
	
	__m128i drb, dga, srb, sga;
	
	for (size_t k = 0, length = size/stride; k < length; ++k)
	{		
		_mm_prefetch(reinterpret_cast<const char*>(source128_1 + PSD), _MM_HINT_NTA);	
		_mm_prefetch(reinterpret_cast<const char*>(source128_2 + PSD), _MM_HINT_NTA);
		// TODO: assembly optimization use PSHUFD on moves before calculations, lower latency than MOVDQA (R.N) http://software.intel.com/en-us/articles/fast-simd-integer-move-for-the-intel-pentiumr-4-processor/

		for(int n = 0; n < 4; ++n, ++dest128, ++source128_1, ++source128_2)
		{
			// r = d + (s-d)*alpha/256
			s = _mm_load_si128(source128_1);	// AABBGGRR
			d = _mm_load_si128(source128_2);	// AABBGGRR

			srb = _mm_and_si128(lomask, s);		// 00BB00RR		// unpack
			sga = _mm_srli_epi16(s, 8);			// AA00GG00		// unpack
			
			drb = _mm_and_si128(lomask, d);		// 00BB00RR		// unpack
			dga = _mm_srli_epi16(d, 8);			// AA00GG00		// unpack

			srb = _mm_sub_epi16(srb, drb);		// BBBBRRRR		// sub
			srb = _mm_mullo_epi16(srb, a);		// BBBBRRRR		// mul
			srb = _mm_add_epi16(srb, round);
			
			sga = _mm_sub_epi16(sga, dga);		// AAAAGGGG		// sub
			sga = _mm_mullo_epi16(sga, a);		// AAAAGGGG		// mul
			sga = _mm_add_epi16(sga, round);

			srb = _mm_srli_epi16(srb, 8);		// 00BB00RR		// prepack and div
			sga = _mm_andnot_si128(lomask, sga);// AA00GG00		// prepack and div

			srb = _mm_or_si128(srb, sga);		// AABBGGRR		// pack

			srb = _mm_add_epi8(srb, d);			// AABBGGRR		// add		there is no overflow(R.N)

			_mm_store_si128(dest128, srb);
		}
	}
}
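The inner loop implements the commented formula r = d + (s-d)*alpha/256 in 16-bit fixed point. A minimal per-channel scalar sketch (hypothetical helper; alpha256 stands for the quantised alpha*256 + 0.5 loaded into the `a` register above):

// Hypothetical per-channel reference: +128 gives the same rounding the
// SSE2 path applies before the >> 8.
static inline unsigned char lerp_channel(unsigned char s, unsigned char d, int alpha256)
{
	int diff = (static_cast<int>(s) - static_cast<int>(d)) * alpha256;
	return static_cast<unsigned char>(d + ((diff + 128) >> 8));
}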
Example #6
double	vector_ps_double (const double* pa,const double* pb,size_t n)
{
    size_t k;
    /* multiply 4 at a time */
    size_t q = n / 4;
    size_t r = n % 4;
    double w;
    _mm_prefetch (pa,_MM_HINT_NTA);
    _mm_prefetch (pb,_MM_HINT_NTA);
    if (q > 0) {
	__m128d acc1 = _mm_setzero_pd();
	__m128d acc2 = _mm_setzero_pd();
	if (ALGEBRA_IS_ALIGNED(pa) && ALGEBRA_IS_ALIGNED(pb)) {
	    for (k=0;k<q;k++) {
		/* Load 2 doubles from each array */
		__m128d i1 = _mm_load_pd(pa);
		__m128d j1 = _mm_load_pd(pb);
		__m128d i2 = _mm_load_pd(pa+2);
		__m128d j2 = _mm_load_pd(pb+2);
		/* advance by 4 doubles in total (2 for i and 2 for j) */
		/* Multiply */
		__m128d s1 = _mm_mul_pd(i1,j1);
		__m128d s2 = _mm_mul_pd(i2,j2);
		pa += 4;
		pb += 4;
		/* Accumulate */
		acc1 = _mm_add_pd(acc1,s1);
		acc2 = _mm_add_pd(acc2,s2);
	    }
	}
	else {
	    for (k=0;k<q;k++) {
		/* Load 2 doubles from each array */
		__m128d i1 = _mm_loadu_pd(pa);
		__m128d j1 = _mm_loadu_pd(pb);
		__m128d i2 = _mm_loadu_pd(pa+2);
		__m128d j2 = _mm_loadu_pd(pb+2);
		/* Multiply */
		__m128d s1 = _mm_mul_pd(i1,j1);
		__m128d s2 = _mm_mul_pd(i2,j2);
		pa += 4;
		pb += 4;
		/* Accumulate */
		acc1 = _mm_add_pd(acc1,s1);
		acc2 = _mm_add_pd(acc2,s2);
	    }
	}
	/* Final sum */
	acc1 = _mm_add_pd(acc1,acc2);
	acc1 = _mm_hadd_pd(acc1,acc1);
	_mm_store_sd(&w,acc1);
    }
    else {
	w = 0;
    }
    for (k=0;k<r;k++)
	w += (*pa++) * (*pb++);
    return w;
}
Example #7
void
init_xrpow_core_sse(gr_info * const cod_info, FLOAT xrpow[576], int upper, FLOAT * sum)
{
    int     i;
    float   tmp_max = 0;
    float   tmp_sum = 0;
    int     upper4 = (upper / 4) * 4;
    int     rest = upper-upper4;

    const vecfloat_union fabs_mask = {{ 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF }};
    const __m128 vec_fabs_mask = _mm_loadu_ps(&fabs_mask._float[0]);
    vecfloat_union vec_xrpow_max;
    vecfloat_union vec_sum;
    vecfloat_union vec_tmp;

    _mm_prefetch((char *) cod_info->xr, _MM_HINT_T0);
    _mm_prefetch((char *) xrpow, _MM_HINT_T0);

    vec_xrpow_max._m128 = _mm_set_ps1(0);
    vec_sum._m128 = _mm_set_ps1(0);

    for (i = 0; i < upper4; i += 4) {
        vec_tmp._m128 = _mm_loadu_ps(&(cod_info->xr[i])); /* load */
        vec_tmp._m128 = _mm_and_ps(vec_tmp._m128, vec_fabs_mask); /* fabs */
        vec_sum._m128 = _mm_add_ps(vec_sum._m128, vec_tmp._m128);
        vec_tmp._m128 = _mm_sqrt_ps(_mm_mul_ps(vec_tmp._m128, _mm_sqrt_ps(vec_tmp._m128)));
        vec_xrpow_max._m128 = _mm_max_ps(vec_xrpow_max._m128, vec_tmp._m128); /* retrieve max */
        _mm_storeu_ps(&(xrpow[i]), vec_tmp._m128); /* store into xrpow[] */
    }
    vec_tmp._m128 = _mm_set_ps1(0);
    switch (rest) {
        case 3: vec_tmp._float[2] = cod_info->xr[upper4+2];
        case 2: vec_tmp._float[1] = cod_info->xr[upper4+1];
        case 1: vec_tmp._float[0] = cod_info->xr[upper4+0];
            vec_tmp._m128 = _mm_and_ps(vec_tmp._m128, vec_fabs_mask); /* fabs */
            vec_sum._m128 = _mm_add_ps(vec_sum._m128, vec_tmp._m128);
            vec_tmp._m128 = _mm_sqrt_ps(_mm_mul_ps(vec_tmp._m128, _mm_sqrt_ps(vec_tmp._m128)));
            vec_xrpow_max._m128 = _mm_max_ps(vec_xrpow_max._m128, vec_tmp._m128); /* retrieve max */
            switch (rest) {
                case 3: xrpow[upper4+2] = vec_tmp._float[2];
                case 2: xrpow[upper4+1] = vec_tmp._float[1];
                case 1: xrpow[upper4+0] = vec_tmp._float[0];
                default:
                    break;
            }
        default:
            break;
    }
    tmp_sum = vec_sum._float[0] + vec_sum._float[1] + vec_sum._float[2] + vec_sum._float[3];
    {
        float ma = vec_xrpow_max._float[0] > vec_xrpow_max._float[1]
                ? vec_xrpow_max._float[0] : vec_xrpow_max._float[1];
        float mb = vec_xrpow_max._float[2] > vec_xrpow_max._float[3]
                ? vec_xrpow_max._float[2] : vec_xrpow_max._float[3];
        tmp_max = ma > mb ? ma : mb;
    }
    cod_info->xrpow_max = tmp_max;
    *sum = tmp_sum;
}
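Per element, the vector loop accumulates |xr[i]| into the running sum and stores xrpow[i] = |xr[i]|^(3/4), computed as sqrt(x*sqrt(x)). A minimal scalar sketch of the same math (hypothetical helper using plain float instead of the library's FLOAT/vecfloat_union types):

#include <math.h>

/* Hypothetical scalar reference covering the same element range as the
   SSE path above (upper4 full groups plus the remainder). */
static void init_xrpow_core_scalar(const float *xr, float *xrpow, int upper,
                                   float *sum, float *xrpow_max)
{
    int i;
    *sum = 0.0f;
    *xrpow_max = 0.0f;
    for (i = 0; i < upper; ++i) {
        float t = fabsf(xr[i]);
        *sum += t;
        xrpow[i] = sqrtf(t * sqrtf(t)); /* t^(3/4) */
        if (xrpow[i] > *xrpow_max)
            *xrpow_max = xrpow[i];
    }
}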
Example #8
void
test_prefetch (char *p)
{
  _mm_prefetch (p, _MM_HINT_T0);
  _mm_prefetch (p+4, _MM_HINT_T1);
  _mm_prefetch (p+8, _MM_HINT_T2);
  _mm_prefetch (p+12, _MM_HINT_NTA);
}
Example #9
void PreOver_FastSSE2(void* dest, const void* source1, const void* source2, size_t size)
{
	static const size_t stride = sizeof(__m128i)*4;
	static const u32 PSD = 64;

	static const __m128i lomask = _mm_set1_epi32(0x00FF00FF);

	assert(source1 != NULL && source2 != NULL && dest != NULL);
	assert(size % stride == 0);

	const __m128i* source128_1 = reinterpret_cast<const __m128i*>(source1);
	const __m128i* source128_2 = reinterpret_cast<const __m128i*>(source2);
	__m128i*	   dest128 = reinterpret_cast<__m128i*>(dest);		

	__m128i d, s, a, rb, ag;
	
	// TODO: dynamic prefetch scheduling distance? needs to be optimized (R.N)
	for(int k = 0, length = size/stride; k < length; ++k)	
	{
		// TODO: put prefetch between calculations?(R.N)
		_mm_prefetch(reinterpret_cast<const s8*>(source128_1+PSD), _MM_HINT_NTA);
		_mm_prefetch(reinterpret_cast<const s8*>(source128_2+PSD), _MM_HINT_NTA);	

		//work on entire cacheline before next prefetch
		for(int n = 0; n < 4; ++n, ++dest128, ++source128_1, ++source128_2)
		{
			// TODO: assembly optimization use PSHUFD on moves before calculations, lower latency than MOVDQA (R.N) http://software.intel.com/en-us/articles/fast-simd-integer-move-for-the-intel-pentiumr-4-processor/

			s = _mm_load_si128(source128_1);		// AABBGGRR
			d = _mm_load_si128(source128_2);		// AABBGGRR
						
			// set alpha to lo16 from dest_
			rb = _mm_srli_epi32(d, 24);			// 000000AA
			a = _mm_slli_epi32(rb, 16);			// 00AA0000
			a = _mm_or_si128(rb, a);			// 00AA00AA

			// fix alpha a = a > 127 ? a+1 : a
			// NOTE: If removed an *overflow* will occur with large values (R.N)
			rb = _mm_srli_epi16(a, 7);
			a = _mm_add_epi16(a, rb);
			
			rb = _mm_and_si128(lomask, s);		// 00BB00RR		unpack
			rb = _mm_mullo_epi16(rb, a);		// BBBBRRRR		mul (D[A]*S)
			rb = _mm_srli_epi16(rb, 8);			// 00BB00RR		prepack and div [(D[A]*S)]/255

			ag = _mm_srli_epi16(s, 8); 			// 00AA00GG		unpack
			ag = _mm_mullo_epi16(ag, a);		// AAAAGGGG		mul (D[A]*S)
			ag = _mm_andnot_si128(lomask, ag);	// AA00GG00		prepack and div [(D[A]*S)]/255
					
			rb = _mm_or_si128(rb, ag);			// AABBGGRR		pack
					
			rb = _mm_sub_epi8(s, rb);			// sub S-[(D[A]*S)/255]
			d = _mm_add_epi8(d, rb);			// add D+[S-(D[A]*S)/255]

			_mm_store_si128(dest128, d);
		}
	}		
}
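Per the comments, the loop takes the alpha from the destination pixel (d >> 24), applies the a > 127 ? a+1 : a correction so that >> 8 approximates a division by 255, and computes D + S - S*A_d/255 per channel. A minimal scalar sketch of that blend for one channel (hypothetical helper; assumes premultiplied-alpha input so the sum stays in range, as the function name implies):

// Hypothetical per-channel reference for the premultiplied "over" blend above.
static inline unsigned char preover_channel(unsigned char s, unsigned char d, unsigned char da)
{
	unsigned int a = (da > 127) ? da + 1u : da;	// same /255 -> >>8 fix-up
	return static_cast<unsigned char>(d + s - ((s * a) >> 8));
}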
Example #10
/* Fast remote SCI copy for systems with write-combining enabled.
   This is the version using SSE instructions to copy 128 Byte blocks,
   and flushes after 64 Byte. */
void _mpid_smi_sse64_memcpy(void *dest, const void *src, size_t size)
{
	char*	a = (char*) src;
	char*	b = (char*) dest;
	size_t	j = 0;
	__m128	xmm[8];

	/* Align the destination to a 64 Byte boundary */
	for(; (j < size) && (((size_t) &b[j]) % 64 != 0); j++)
		((char*) b)[j] = ((char*) a)[j];
	
	// Loads two cache lines of data to a location closer to the processor.
	_mm_prefetch(a+j, _MM_HINT_NTA);
	_mm_prefetch(a+j+64, _MM_HINT_NTA);

	/* copy 128 byte per loop */
	for (; (j+128) < size; j+=128) 
	{
		// Loads two cache lines of data to a location closer to the processor.
		_mm_prefetch(a+j+128, _MM_HINT_NTA);
		_mm_prefetch(a+j+192, _MM_HINT_NTA);

		/* load 128 Byte into xmm register */
		xmm[0] = _mm_load_ps((float*) &a[j]);
		xmm[1] = _mm_load_ps((float*) &a[j+16]);
		xmm[2] = _mm_load_ps((float*) &a[j+32]);
		xmm[3] = _mm_load_ps((float*) &a[j+48]);
		xmm[4] = _mm_load_ps((float*) &a[j+64]);
		xmm[5] = _mm_load_ps((float*) &a[j+80]);
		xmm[6] = _mm_load_ps((float*) &a[j+96]);
		xmm[7] = _mm_load_ps((float*) &a[j+112]);

		/* store 64 byte */
		_mm_stream_ps((float*) &b[j], xmm[0]);
		_mm_stream_ps((float*) &b[j+16], xmm[1]);
		_mm_stream_ps((float*) &b[j+32], xmm[2]);
		_mm_stream_ps((float*) &b[j+48], xmm[3]);
		
		/* flush the write-combine buffer */
		_mm_sfence(); 

		/* store 64 byte */
		_mm_stream_ps((float*) &b[j+64], xmm[4]);
		_mm_stream_ps((float*) &b[j+80], xmm[5]);
		_mm_stream_ps((float*) &b[j+96], xmm[6]);
		_mm_stream_ps((float*) &b[j+112], xmm[7]);

		/* flush the write-combine buffer */
		_mm_sfence();  
	}

	/* copy tail */
	for(; j<size; j++)
		((char*) b)[j] = ((char*) a)[j];
}
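The loop above aligns only the destination to 64 bytes and then uses aligned _mm_load_ps on the source, so source and destination must share the same offset modulo 16 for this path to be valid. A minimal caller sketch under that assumption (hypothetical wrapper, not part of the original code):

#include <string.h>

/* Hypothetical wrapper: use the streaming SSE copy only when src and dest
   share the same 16-byte offset; otherwise fall back to plain memcpy. */
static void smi_copy_safe(void *dest, const void *src, size_t size)
{
	if ((((size_t) dest) & 15) == (((size_t) src) & 15))
		_mpid_smi_sse64_memcpy(dest, src, size);
	else
		memcpy(dest, src, size);
}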
Example #11
double evaluateGTRGAMMA_MIC(int *ex1, int *ex2, int *wgt,
                 double *x1_start, double *x2_start,
                 double *tipVector,
                 unsigned char *tipX1, const int n, double *diagptable, const pllBoolean fastScaling)
{
	double sum = 0.0;

    /* the left node is a tip */
    if(tipX1)
    {

        double aTipVec[256] __attribute__((aligned(PLL_BYTE_ALIGNMENT)));
        for(int k = 0; k < 16; k++)
        {
            for(int l = 0; l < 4; l++)
            {
                aTipVec[k*16 + l] = aTipVec[k*16 + 4 + l] = aTipVec[k*16 + 8 + l] = aTipVec[k*16 + 12 + l] = tipVector[k*4 + l];
            }
        }

        /* loop over the sites of this partition */
        for (int i = 0; i < n; i++)
        {
            _mm_prefetch((const char*) &x2_start[span*(i+8)], _MM_HINT_T1);
            _mm_prefetch((const char*) &x2_start[span*(i+8) + 8], _MM_HINT_T1);

            _mm_prefetch((const char*) &x2_start[span*(i+1)], _MM_HINT_T0);
            _mm_prefetch((const char*) &x2_start[span*(i+1) + 8], _MM_HINT_T0);

          /* access pre-computed tip vector values via a lookup table */
          const double *x1 = &(aTipVec[16 * tipX1[i]]);
          /* access the other(inner) node at the other end of the branch */
          const double *x2 = &(x2_start[span * i]);

          double term = 0.;

          #pragma ivdep
          #pragma vector aligned
          for(int j = 0; j < span; j++)
              term += x1[j] * x2[j] * diagptable[j];

          if(!fastScaling)
              term = log(0.25 * term) + (ex2[i] * log(PLL_MINLIKELIHOOD));
          else
              term = log(0.25 * term);

          sum += wgt[i] * term;
        }
    }
    else
    {
        for (int i = 0; i < n; i++)
Example #12
inline
T atomic_compare_exchange( volatile T * const dest , const T compare ,
    typename Kokkos::Impl::enable_if<
                  ( sizeof(T) != 4 )
               && ( sizeof(T) != 8 )
            #if defined(KOKKOS_ENABLE_ASM) && defined ( KOKKOS_ENABLE_ISA_X86_64 )
               && ( sizeof(T) != 16 )
            #endif
             , const T >::type& val )
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif

  while( !Impl::lock_address_host_space( (void*) dest ) );
  T return_val = *dest;
  if( return_val == compare ) {
    // Don't use the following line of code here:
    //
    //const T tmp = *dest = val;
    //
    // Instead, put each assignment in its own statement.  This is
    // because the overload of T::operator= for volatile *this should
    // return void, not volatile T&.  See Kokkos #177:
    //
    // https://github.com/kokkos/kokkos/issues/177
    *dest = val;
    const T tmp = *dest;
    #ifndef KOKKOS_COMPILER_CLANG
    (void) tmp;
    #endif
  }
  Impl::unlock_address_host_space( (void*) dest );
  return return_val;
}
Example #13
void TripletConnection::propagate_backward()
{
	if (stdp_active) { 
		SpikeContainer::const_iterator spikes_end = dst->get_spikes_immediate()->end();
		// loop over all spikes
		for (SpikeContainer::const_iterator spike = dst->get_spikes_immediate()->begin() ; // spike = post_spike
				spike != spikes_end ; 
				++spike ) {
			// Since we need the local id of the postsynaptic neuron that spiked 
			// multiple times, we translate it here:
			NeuronID translated_spike = dst->global2rank(*spike); 

			// loop over all presynaptic partners
			for (const NeuronID * c = bkw->get_row_begin(*spike) ; c != bkw->get_row_end(*spike) ; ++c ) {

				#ifdef CODE_ACTIVATE_PREFETCHING_INTRINSICS
				// prefetches next memory cells to reduce number of last-level cache misses
				_mm_prefetch((const char *)bkw_data[c-bkw_ind+2],  _MM_HINT_NTA);
				#endif

				// computes plasticity update
				AurynWeight * weight = bkw->get_data(c); 
				*weight += dw_post(*c,translated_spike);

				// clips too large weights
				if (*weight>get_max_weight()) *weight=get_max_weight();
			}
		}
	}
}
Example #14
char *gen(char *buf,int size,int offset){
  int i;
  for(i=0;i<size;i++){
    buf[i+offset]=(rand_r(&r_seed)%128)+1;
  }
  buf[i+offset]=0;

  #ifdef DIRTY_CACHE
    /*As we wrote to writeback cache we are dealing with dirty cache lines.*/
  #else
    for(i=0;i<=size+64;i+=64){
        _mm_prefetch(buf+i+offset ,_MM_HINT_T0);
    }
  #endif
  #ifdef NO_CACHE
    for(i=0;i<=size+64;i+=64){
      _mm_clflush(buf+i+offset);
    }
  #endif

  #ifdef NO_ICACHE
    forget_icache(rand_r(&r_seed)%2048);
    forget_icache(rand_r(&r_seed)%2048);
    forget_icache(rand_r(&r_seed)%2048);
    forget_icache(rand_r(&r_seed)%2048);
  #endif

  return buf+offset;
}
Example #15
// TODO: should be optimized for different combinations (R.N)
void Shuffle_SSE2(void* dest, const void* source, size_t size, const u8 red, const u8 green, const u8 blue, const u8 alpha)
{
	static const size_t stride = sizeof(__m128i)*4;
	static const u32 PSD = 64;

	static const __m128i himask	= _mm_set1_epi32(0xFF000000);	
	static const __m128i lomask	= _mm_set1_epi32(0x000000FF);
	
	assert(source != NULL && dest != NULL);
	assert(red > -1 && red < 4 && green > -1 && green < 4 && blue > -1 && blue < 4 && alpha > -1 && alpha < 4);
	assert(size % stride == 0);

	const __m128i* source128 = reinterpret_cast<const __m128i*>(source);
	__m128i*	   dest128 = reinterpret_cast<__m128i*>(dest);	

	__m128i s, m0, m1, r;

	const int shft0 = (red)*8;
	const int shft1 = (green)*8;
	const int shft2 = (3-blue)*8;
	const int shft3 = (3-alpha)*8;

	for(int k = 0, length = size/stride; k < length; ++k)	
	{
		// TODO: dynamic prefetch scheduling distance? needs to be optimized (R.N)
		// TODO: put prefetch between calculations?(R.N)
		_mm_prefetch(reinterpret_cast<const s8*>(source128 + PSD), _MM_HINT_NTA);

		// work on entire cacheline before next prefetch

		// TODO: assembly optimization use PSHUFD on moves before calculations, lower latency than MOVDQA (R.N) http://software.intel.com/en-us/articles/fast-simd-integer-move-for-the-intel-pentiumr-4-processor/

		for(int n = 0; n < 4; ++n, ++dest128, ++source128)
		{
			s = _mm_load_si128(source128);
			
			m0 = _mm_srli_epi32(s, shft0);
			m0 = _mm_and_si128(m0, lomask);

			m1 = _mm_srli_epi32(s, shft1);
			m1 = _mm_and_si128(m1, lomask);
			m1 = _mm_slli_epi32(m1, 8);
			
			r = _mm_or_si128(m0, m1);

			m0 = _mm_slli_epi32(s, shft2);
			m0 = _mm_and_si128(m0, himask);
			m0 = _mm_srli_epi32(m0, 8);			

			m1 = _mm_slli_epi32(s, shft3);
			m1 = _mm_and_si128(m1, himask);
			
			m0 = _mm_or_si128(m0, m1);

			r = _mm_or_si128(r, m0);

			_mm_store_si128(dest128, r);
		}
	}
}
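The four shift amounts encode which source byte ends up in which destination byte. Per 32-bit pixel the loop is equivalent to the following scalar sketch (hypothetical helper; it reuses the u8/u32 typedefs assumed by the code above):

// Hypothetical per-pixel reference for Shuffle_SSE2: pick the bytes at
// indices red/green/blue/alpha from the source and pack them into bytes
// 0..3 of the result.
static inline u32 shuffle_pixel(u32 s, u8 red, u8 green, u8 blue, u8 alpha)
{
	const u32 r = (s >> (red   * 8)) & 0xFF;
	const u32 g = (s >> (green * 8)) & 0xFF;
	const u32 b = (s >> (blue  * 8)) & 0xFF;
	const u32 a = (s >> (alpha * 8)) & 0xFF;
	return r | (g << 8) | (b << 16) | (a << 24);
}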
Example #16
inline
T atomic_exchange( volatile T * const dest ,
  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(Impl::cas128_t)
                                  , const T & >::type val )
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif

  union U {
    Impl::cas128_t i ;
    T t ;
    inline U() {};
  } assume , oldval , newval ;

  oldval.t = *dest ;
  newval.t = val;

  do {
    assume.i = oldval.i ;
    oldval.i = Impl::cas128( (volatile Impl::cas128_t*) dest , assume.i , newval.i );
  } while ( assume.i != oldval.i );

  return oldval.t ;
}
Example #17
inline
void atomic_assign( volatile T * const dest ,
  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) || sizeof(T) == sizeof(long)
                                  , const T & >::type val )
{
  typedef typename Kokkos::Impl::if_c< sizeof(T) == sizeof(int) , int , long >::type type ;

#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif

  const type v = *((type*)&val); // Extract to be sure the value doesn't change

  type assumed ;

  union U {
    T val_T ;
    type val_type ;
    inline U() {};
  } old ;

  old.val_T = *dest ;

  do {
    assumed = old.val_type ;
    old.val_type = __sync_val_compare_and_swap( (volatile type *) dest , assumed , v );
  } while ( assumed != old.val_type );
}
Example #18
__inline__ __device__
T atomic_exchange( volatile T * const dest ,
    typename Kokkos::Impl::enable_if<
                  ( sizeof(T) != 4 )
               && ( sizeof(T) != 8 )
             , const T >::type& val )
{
  T return_val;
  // This is a way to (hopefully) avoid dead lock in a warp
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif

  int done = 0;
  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1);
  unsigned int done_active = 0;
  while (active!=done_active) {
    if(!done) {
      if( Impl::lock_address_cuda_space( (void*) dest ) ) {
        return_val = *dest;
        *dest = val;
        Impl::unlock_address_cuda_space( (void*) dest );
        done = 1;
      }
    }
    done_active = KOKKOS_IMPL_CUDA_BALLOT(done);
  }
  return return_val;
}
Example #19
void * memcpy ( void * destination, const void * source, size_t num )
{
    const Uint8 *src = (const Uint8 *)source;
    Uint8 *dst = (Uint8 *)destination;
    size_t i;
    
    /* All WIN64 architectures have SSE, right? */
    if (!((uintptr_t) src & 15) && !((uintptr_t) dst & 15)) {
        __m128 values[4];
        for (i = num / 64; i--;) {
            _mm_prefetch(src, _MM_HINT_NTA);
            values[0] = *(__m128 *) (src + 0);
            values[1] = *(__m128 *) (src + 16);
            values[2] = *(__m128 *) (src + 32);
            values[3] = *(__m128 *) (src + 48);
            _mm_stream_ps((float *) (dst + 0), values[0]);
            _mm_stream_ps((float *) (dst + 16), values[1]);
            _mm_stream_ps((float *) (dst + 32), values[2]);
            _mm_stream_ps((float *) (dst + 48), values[3]);
            src += 64;
            dst += 64;
        }
        num &= 63;
    }

    while (num--) {
        *dst++ = *src++;
    }
    return destination;
}
Example #20
inline
long atomic_compare_exchange( volatile long * const dest, const long compare, const long val )
{ 
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
  return __sync_val_compare_and_swap(dest,compare,val);
}
Example #21
void _mm_prefetch_buffer(char * buffer, int num_bytes)
{
	__m128i * buf = (__m128i*) buffer;
	unsigned int i;
	for (i = 0; i < (num_bytes / sizeof(__m128i)); i+=(CACHE_LINE_BYTES / sizeof(__m128i)))
	{
		_mm_prefetch((char*)(&buf[i]), _MM_HINT_NTA);
	}
}
Example #22
    // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
    static void Prefetch(void* address) {
#if defined(PLATFORM_64)
    #if defined(PLATFORM_WINDOWS)
        _mm_prefetch((char*)address, _MM_HINT_NTA);
    #else
        static_assert(false, "Not yet implemented.");
    #endif
#endif
    }
Example #23
void zdotu_soa(
                const int    N,
                const double* da,
                const double* db,
                const int    ix,
                const double* dc,
                const double* dd,
                const int    iy,
                double*  res
                )
{
        __m256d ymm0;
        __m256d ymm1;
        __m256d ymm2;
        __m256d ymm3;
        __m256d ymm4 = _mm256_setzero_pd();
        __m256d ymm5 = _mm256_setzero_pd();
        //
        int ii;
//#pragma unroll
        for(ii = 0; ii < N/4; ii++)
        {
		_mm_prefetch((const char*) da + 0x200, 1);
		_mm_prefetch((const char*) db + 0x200, 1);
		_mm_prefetch((const char*) dc + 0x200, 1);
		_mm_prefetch((const char*) dd + 0x200, 1);
                //IACA_START;
                // 8*4*4 = 128 bytes
                ymm0 = _mm256_loadu_pd(da + 4*ii);
                ymm1 = _mm256_loadu_pd(db + 4*ii);
                ymm2 = _mm256_loadu_pd(dc + 4*ii);
                ymm3 = _mm256_loadu_pd(dd + 4*ii);
                // 2*4*4 = 32 flops
                ymm4 = _mm256_fmsub_pd(ymm0, ymm2, _mm256_fmsub_pd(ymm1, ymm3, ymm4));
                ymm5 = _mm256_fmadd_pd(ymm0, ymm3, _mm256_fmadd_pd(ymm1, ymm2, ymm5));
		// flops/byte ratio = 1/4
                //IACA_END
        }
        double* re = (double*)&ymm4;
        double* im = (double*)&ymm5;
	//
        res[0] = re[0] + re[1] + re[2] + re[3];
        res[1] = im[0] + im[1] + im[2] + im[3];
}
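The two FMA chains accumulate the real part (a*c - b*d) and the imaginary part (a*d + b*c) of an unconjugated complex dot product held in structure-of-arrays form, as the flops/byte comments suggest. A minimal scalar sketch of the same computation, including the N % 4 tail the AVX loop leaves unprocessed (hypothetical helper; the Re/Im roles of da/db/dc/dd are inferred from the FMA pattern):

/* Hypothetical scalar reference: da/db = Re/Im of x, dc/dd = Re/Im of y,
   res[0] = sum(a*c - b*d), res[1] = sum(a*d + b*c). */
static void zdotu_soa_scalar(const int N,
                             const double *da, const double *db,
                             const double *dc, const double *dd,
                             double *res)
{
        int i;
        double re = 0.0, im = 0.0;
        for (i = 0; i < N; i++) {
                re += da[i] * dc[i] - db[i] * dd[i];
                im += da[i] * dd[i] + db[i] * dc[i];
        }
        res[0] = re;
        res[1] = im;
}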
Example #24
void simd_memcpy_cache(void *dst, void *src, size_t nbytes)
{
    size_t i;
    size_t sm = nbytes - nbytes%sizeof(int);
    size_t ilen = nbytes/sizeof(int);
    size_t ilen_sm = ilen - ilen%16;

    //printf("nbytes=%zu,ilen=%zu,ilen_sm=%zu\n",
    //nbytes,ilen,ilen_sm);


    char *cdst=(char*)dst;
    char *csrc=(char*)src;

    int * idst=(int*)dst;
    int * isrc=(int*)src;

    __m128i l0,l1,l2,l3;

    _mm_prefetch((const char*)&isrc[0], _MM_HINT_T0);
    _mm_prefetch((const char*)&isrc[4], _MM_HINT_T0);
    _mm_prefetch((const char*)&isrc[8], _MM_HINT_T0);
    _mm_prefetch((const char*)&isrc[12], _MM_HINT_T0);

    for(i=0; i<ilen_sm; i+=16)
    {
        l0 =  _mm_load_si128((__m128i*)&isrc[i+0]);
        l1 =  _mm_load_si128((__m128i*)&isrc[i+4]);
        l2 =  _mm_load_si128((__m128i*)&isrc[i+8]);
        l3 =  _mm_load_si128((__m128i*)&isrc[i+12]);

        _mm_prefetch((const char*)&isrc[i+16], _MM_HINT_T0);
        _mm_prefetch((const char*)&isrc[i+20], _MM_HINT_T0);
        _mm_prefetch((const char*)&isrc[i+24], _MM_HINT_T0);
        _mm_prefetch((const char*)&isrc[i+28], _MM_HINT_T0);

        _mm_store_si128((__m128i*)&idst[i+0],  l0);
        _mm_store_si128((__m128i*)&idst[i+4],  l1);
        _mm_store_si128((__m128i*)&idst[i+8],  l2);
        _mm_store_si128((__m128i*)&idst[i+12], l3);

    }

    for(i=ilen_sm; i<ilen; i++)
    {
        idst[i] = isrc[i];
    }

    for(i=(ilen*4); i<nbytes; i++)
    {
        cdst[i] = csrc[i];
    }
}
Example #25
  void prefetch(void const* pointer)
  {
#ifdef BOOST_SIMD_ARCH_X86
    #ifdef __GNUC__
      __builtin_prefetch(pointer, 0, 0);
    #elif defined( BOOST_SIMD_HAS_SSE_SUPPORT )
      _mm_prefetch( static_cast<char const *>(pointer), Strategy);
    #endif
#endif
  }
Example #26
void prefetch_Cblock(const double* C, int col, int row, int m, int n, int k, int bm, int bn, int bk){
	double* C_prefetch = (double*)C + (col * m + row);
	for(int i = 0; i < bn; i++){
		double* C_prefetch_m = C_prefetch;
		for(int j = 0; j < (bm + CACHE_LINE - 1) / CACHE_LINE; j++){
			_mm_prefetch(C_prefetch_m, L2);
			C_prefetch_m += CACHE_LINE;
		}
		C_prefetch += m;
	}
}
Example #27
void prefetch_Bblock(const double* B, int col, int row, int m, int n, int k, int bm, int bn, int bk){
	double* B_prefetch = (double*)B + (col * k + row);
	for(int i = 0; i < bn; i++){
		double* B_prefetch_k = B_prefetch;
		for(int j = 0; j < (bk + CACHE_LINE - 1) / CACHE_LINE; j++){
			_mm_prefetch(B_prefetch_k, L2);
			B_prefetch_k += CACHE_LINE;
		}
		B_prefetch += k;
	}
}
Example #28
void prefetch_Ablock(const double* A, int col, int row, int m, int n, int k, int bm, int bn, int bk){
	double* A_prefetch = (double*)A + (col * m + row);
	for(int i = 0; i < bk; i++){
		double* A_prefetch_m = A_prefetch;
		for(int j = 0; j < (bm + CACHE_LINE - 1) / CACHE_LINE; j++){
			_mm_prefetch(A_prefetch_m, L2);
			A_prefetch_m += CACHE_LINE;
		}
		A_prefetch += m;
	}
}
Example #29
static void adddiff_sse2_t(Byte *pDst, ptrdiff_t dst_pitch, const Byte *pSrc, ptrdiff_t src_pitch, int width, int height)
{
    int mod32_width = (width / 32) * 32;
    auto pDst2 = pDst;
    auto pSrc2 = pSrc;
    auto v128 = _mm_set1_epi32(0x80808080);

    for ( int j = 0; j < height; ++j ) {
        for ( int i = 0; i < mod32_width; i+=32 ) {
            _mm_prefetch(reinterpret_cast<const char*>(pDst)+i+128, _MM_HINT_T0);
            _mm_prefetch(reinterpret_cast<const char*>(pSrc)+i+128, _MM_HINT_T0);

            auto dst = simd_load_si128<mem_mode>(pDst+i);
            auto dst2 = simd_load_si128<mem_mode>(pDst+i+16);
            auto src = simd_load_si128<mem_mode>(pSrc+i);
            auto src2 = simd_load_si128<mem_mode>(pSrc+i+16);

            auto dstsub = _mm_sub_epi8(dst, v128);
            auto dstsub2 = _mm_sub_epi8(dst2, v128);

            auto srcsub = _mm_sub_epi8(src, v128);
            auto srcsub2 = _mm_sub_epi8(src2, v128);

            auto added = _mm_adds_epi8(dstsub, srcsub);
            auto added2 = _mm_adds_epi8(dstsub2, srcsub2);

            auto result = _mm_add_epi8(added, v128);
            auto result2 = _mm_add_epi8(added2, v128);

            simd_store_si128<mem_mode>(pDst+i, result);
            simd_store_si128<mem_mode>(pDst+i+16, result2);
        }
        pDst += dst_pitch;
        pSrc += src_pitch;
    }

    if (width > mod32_width) {
        adddiff_c(pDst2 + mod32_width, dst_pitch, pSrc2 + mod32_width, src_pitch, width - mod32_width, height);
    }
}
Example #30
static zend_always_inline void fast_memcpy(void *dest, const void *src, size_t size)
{
	__m128i *dqdest = (__m128i*)dest;
	const __m128i *dqsrc  = (const __m128i*)src;
	const __m128i *end  = (const __m128i*)((const char*)src + size);

	do {
		_mm_prefetch(dqsrc + 4, _MM_HINT_NTA);
		_mm_prefetch(dqsrc + 6, _MM_HINT_NTA);

		__m128i xmm0 = _mm_load_si128(dqsrc + 0);
		__m128i xmm1 = _mm_load_si128(dqsrc + 1);
		__m128i xmm2 = _mm_load_si128(dqsrc + 2);
		__m128i xmm3 = _mm_load_si128(dqsrc + 3);
		dqsrc  += 4;
		_mm_stream_si128(dqdest + 0, xmm0);
		_mm_stream_si128(dqdest + 1, xmm1);
		_mm_stream_si128(dqdest + 2, xmm2);
		_mm_stream_si128(dqdest	+ 3, xmm3);
		dqdest += 4;
	} while (dqsrc != end);
}