void dmul(unsigned int N, const double* a, const double* b, double* y) {
    flops_counter += N ;
#ifdef GX_SSE
    if(SSE2_supported) {
        __m128d Y1, Y2, A1, A2, B1, B2 ;
        unsigned int i = 0 ;
        while(i<N) {
            _mm_prefetch((const char*)(&a[i] + 256), _MM_HINT_NTA) ;
            _mm_prefetch((const char*)(&b[i] + 256), _MM_HINT_NTA) ;
            A1 = _mm_load_pd(&a[i]) ;
            B1 = _mm_load_pd(&b[i]) ;
            Y1 = _mm_mul_pd(A1,B1) ;
            i += 2 ;
            A2 = _mm_load_pd(&a[i]) ;
            B2 = _mm_load_pd(&b[i]) ;
            Y2 = _mm_mul_pd(A2,B2) ;
            i += 2 ;
            _mm_stream_pd(&y[i - 4], Y1) ;
            _mm_stream_pd(&y[i - 2], Y2) ;
        }
        _mm_sfence() ;
        return ;
    }
#endif
    for(unsigned int i=0; i<N; i++) {
        y[i] = a[i] * b[i] ;
    }
}
void daxpy(unsigned int N, double a, const double* x, double* y) {
    flops_counter += (2*N) ;
#ifdef GX_SSE
    if(SSE2_supported) {
        __m128d Y1, Y2, X1, X2, AA ;
        SSE_ALIGNED(double temp[2]) ;
        temp[0] = a ;
        temp[1] = a ;
        AA = _mm_load_pd(temp) ;
        unsigned int i = 0 ;
        while(i<N) {
            _mm_prefetch((const char*)(&x[i] + 128), _MM_HINT_NTA) ;
            _mm_prefetch((const char*)(&y[i] + 128), _MM_HINT_NTA) ;
            X1 = _mm_load_pd(&x[i]) ;
            Y1 = _mm_load_pd(&y[i]) ;
            Y1 = _mm_add_pd(Y1, _mm_mul_pd(X1, AA)) ;
            i += 2 ;
            X2 = _mm_load_pd(&x[i]) ;
            Y2 = _mm_load_pd(&y[i]) ;
            Y2 = _mm_add_pd(Y2, _mm_mul_pd(X2, AA)) ;
            i += 2 ;
            _mm_stream_pd(&y[i - 4], Y1) ;
            _mm_stream_pd(&y[i - 2], Y2) ;
        }
        _mm_sfence() ;
        return ;
    }
#endif
    for(unsigned int i=0; i<N; i++) {
        y[i] += a * x[i] ;
    }
}
double ddot(unsigned int N, const double* x, const double* y) {
    flops_counter += (2*N) ;
#ifdef GX_SSE
    if(SSE2_supported) {
        __m128d X1, Y1, X2, Y2 ;
        __m128d acc1 = _mm_setzero_pd() ;
        __m128d acc2 = _mm_setzero_pd() ;
        SSE_ALIGNED(double temp[2]) ;
        unsigned int i = 0 ;
        while(i<N) {
            _mm_prefetch((const char*)(&x[i] + 128), _MM_HINT_NTA) ;
            _mm_prefetch((const char*)(&y[i] + 128), _MM_HINT_NTA) ;
            X1 = _mm_load_pd(&x[i]) ;
            Y1 = _mm_load_pd(&y[i]) ;
            acc1 = _mm_add_pd(acc1, _mm_mul_pd(X1,Y1)) ;
            i += 2 ;
            X2 = _mm_load_pd(&x[i]) ;
            Y2 = _mm_load_pd(&y[i]) ;
            acc2 = _mm_add_pd(acc2, _mm_mul_pd(X2,Y2)) ;
            i += 2 ;
        }
        acc1 = _mm_add_pd(acc1, acc2) ;
        _mm_store_pd(temp, acc1) ;
        return temp[0] + temp[1] ;
    }
#endif
    double result = 0.0 ;
    for(unsigned int i=0; i<N; i++) {
        result += x[i]*y[i] ;
    }
    return result ;
}
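/* Hypothetical driver (not part of the original library): a minimal sketch of how
 * dmul/daxpy/ddot above might be exercised. It assumes the SSE2 paths require
 * 16-byte-aligned buffers and an N that is a multiple of 4 (the loops advance four
 * doubles per iteration with no remainder handling), and that flops_counter and
 * SSE2_supported are defined elsewhere in that library. _mm_malloc/_mm_free are
 * used here only to obtain suitably aligned storage. */
#include <xmmintrin.h>   /* _mm_malloc / _mm_free */
#include <stdio.h>

void dmul(unsigned int N, const double* a, const double* b, double* y);
void daxpy(unsigned int N, double a, const double* x, double* y);
double ddot(unsigned int N, const double* x, const double* y);

int main(void) {
    const unsigned int N = 1024;                      /* multiple of 4 */
    double* a = (double*)_mm_malloc(N * sizeof(double), 16);
    double* b = (double*)_mm_malloc(N * sizeof(double), 16);
    double* y = (double*)_mm_malloc(N * sizeof(double), 16);
    for (unsigned int i = 0; i < N; i++) { a[i] = 1.0; b[i] = 2.0; y[i] = 0.0; }

    dmul(N, a, b, y);                      /* y[i] = a[i]*b[i]  -> 2.0 */
    daxpy(N, 3.0, a, y);                   /* y[i] += 3.0*a[i]  -> 5.0 */
    printf("dot = %f\n", ddot(N, a, y));   /* expected: 1024 * 5.0 = 5120 */

    _mm_free(a); _mm_free(b); _mm_free(y);
    return 0;
}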
// Faster than multiply when you have to multiply many vectors by the same matrix.
// Using this function, we can efficiently prefetch data, and only have to
// transpose the matrix once.
void Mat44::BatchMult(const float4 * const in, float4 *out, u32 len) const
{
    Mat44 tr = Transpose();
    __m128 matcols[] = {
        _mm_load_ps(tr.mat),
        _mm_load_ps(tr.mat+4),
        _mm_load_ps(tr.mat+8),
        _mm_load_ps(tr.mat+12)
    };

    while(len--)
    {
        __m128 v = _mm_load_ps(in[len].GetVec());
        _mm_prefetch((const char*)&in[len+1], _MM_HINT_T0);

        // Broadcast vector into SSE registers
        __m128 xb = _mm_shuffle_ps(v,v,0x00);
        __m128 yb = _mm_shuffle_ps(v,v,0x55);
        __m128 zb = _mm_shuffle_ps(v,v,0xAA);
        __m128 wb = _mm_shuffle_ps(v,v,0xFF);

        // Perform multiplication by matrix columns
        xb = _mm_mul_ps(xb, matcols[0]);
        yb = _mm_mul_ps(yb, matcols[1]);
        zb = _mm_mul_ps(zb, matcols[2]);
        wb = _mm_mul_ps(wb, matcols[3]);

        // Add results
        __m128 r = _mm_add_ps(_mm_add_ps(xb, yb), _mm_add_ps(zb, wb));
        _mm_prefetch((const char*)&out[len+1], _MM_HINT_T0);
        _mm_store_ps(out[len].GetVec(), r);
    }
}
void Lerp_SSE2(void* dest, const void* source1, const void* source2, float alpha, size_t size)
{
    static const size_t stride = sizeof(__m128i)*4;
    static const u32 PSD = 64;

    static const __m128i lomask = _mm_set1_epi32(0x00FF00FF);
    static const __m128i round = _mm_set1_epi16(128);

    assert(source1 != NULL && source2 != NULL && dest != NULL);
    assert(size % stride == 0);
    assert(alpha >= 0.0 && alpha <= 1.0);

    const __m128i* source128_1 = reinterpret_cast<const __m128i*>(source1);
    const __m128i* source128_2 = reinterpret_cast<const __m128i*>(source2);
    __m128i* dest128 = reinterpret_cast<__m128i*>(dest);

    __m128i s = _mm_setzero_si128();
    __m128i d = _mm_setzero_si128();
    const __m128i a = _mm_set1_epi16(static_cast<u8>(alpha*256.0f+0.5f));

    __m128i drb, dga, srb, sga;

    for (size_t k = 0, length = size/stride; k < length; ++k)
    {
        _mm_prefetch(reinterpret_cast<const char*>(source128_1 + PSD), _MM_HINT_NTA);
        _mm_prefetch(reinterpret_cast<const char*>(source128_2 + PSD), _MM_HINT_NTA);

        // TODO: assembly optimization use PSHUFD on moves before calculations, lower latency than MOVDQA (R.N)
        // http://software.intel.com/en-us/articles/fast-simd-integer-move-for-the-intel-pentiumr-4-processor/
        for(int n = 0; n < 4; ++n, ++dest128, ++source128_1, ++source128_2)
        {
            // r = d + (s-d)*alpha/256
            s = _mm_load_si128(source128_1);        // AABBGGRR
            d = _mm_load_si128(source128_2);        // AABBGGRR

            srb = _mm_and_si128(lomask, s);         // 00BB00RR   // unpack
            sga = _mm_srli_epi16(s, 8);             // 00AA00GG   // unpack
            drb = _mm_and_si128(lomask, d);         // 00BB00RR   // unpack
            dga = _mm_srli_epi16(d, 8);             // 00AA00GG   // unpack

            srb = _mm_sub_epi16(srb, drb);          // BBBBRRRR   // sub
            srb = _mm_mullo_epi16(srb, a);          // BBBBRRRR   // mul
            srb = _mm_add_epi16(srb, round);

            sga = _mm_sub_epi16(sga, dga);          // AAAAGGGG   // sub
            sga = _mm_mullo_epi16(sga, a);          // AAAAGGGG   // mul
            sga = _mm_add_epi16(sga, round);

            srb = _mm_srli_epi16(srb, 8);           // 00BB00RR   // prepack and div
            sga = _mm_andnot_si128(lomask, sga);    // AA00GG00   // prepack and div

            srb = _mm_or_si128(srb, sga);           // AABBGGRR   // pack

            srb = _mm_add_epi8(srb, d);             // AABBGGRR   // add, there is no overflow (R.N)

            _mm_store_si128(dest128, srb);
        }
    }
}
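/* Scalar sketch (not from the original source) of the per-channel arithmetic the
 * Lerp_SSE2 loop above performs: r = d + (s - d) * alpha/256 with rounding, done
 * in 16-bit fixed point so it wraps exactly like _mm_mullo_epi16/_mm_add_epi16.
 * The helper name lerp_scalar is hypothetical and only illustrative. */
#include <stdint.h>
#include <stddef.h>

static void lerp_scalar(uint8_t* dest, const uint8_t* s, const uint8_t* d,
                        float alpha, size_t bytes)
{
    /* 0..256 fixed-point alpha, built the same way the vector code builds its splat */
    const unsigned a = (unsigned)(alpha * 256.0f + 0.5f);
    for (size_t i = 0; i < bytes; ++i) {
        uint16_t t = (uint16_t)(((int)s[i] - (int)d[i]) * (int)a); /* wraps like mullo_epi16 */
        t = (uint16_t)(t + 128);                                   /* rounding term */
        dest[i] = (uint8_t)(d[i] + (t >> 8));                      /* r = d + (s-d)*alpha/256 */
    }
}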
double vector_ps_double (const double* pa, const double* pb, size_t n)
{
    size_t k;
    /* process 4 doubles per iteration */
    size_t q = n / 4;
    size_t r = n % 4;
    double w;
    _mm_prefetch (pa, _MM_HINT_NTA);
    _mm_prefetch (pb, _MM_HINT_NTA);
    if (q > 0) {
        __m128d acc1 = _mm_setzero_pd();
        __m128d acc2 = _mm_setzero_pd();
        if (ALGEBRA_IS_ALIGNED(pa) && ALGEBRA_IS_ALIGNED(pb)) {
            for (k=0; k<q; k++) {
                /* load 2 doubles from each array */
                __m128d i1 = _mm_load_pd(pa);
                __m128d j1 = _mm_load_pd(pb);
                __m128d i2 = _mm_load_pd(pa+2);
                __m128d j2 = _mm_load_pd(pb+2);
                /* advance by 4 doubles in total (2 for i and 2 for j) */
                /* multiply */
                __m128d s1 = _mm_mul_pd(i1, j1);
                __m128d s2 = _mm_mul_pd(i2, j2);
                pa += 4;
                pb += 4;
                /* accumulate */
                acc1 = _mm_add_pd(acc1, s1);
                acc2 = _mm_add_pd(acc2, s2);
            }
        }
        else {
            for (k=0; k<q; k++) {
                /* load 2 doubles from each array */
                __m128d i1 = _mm_loadu_pd(pa);
                __m128d j1 = _mm_loadu_pd(pb);
                __m128d i2 = _mm_loadu_pd(pa+2);
                __m128d j2 = _mm_loadu_pd(pb+2);
                /* multiply */
                __m128d s1 = _mm_mul_pd(i1, j1);
                __m128d s2 = _mm_mul_pd(i2, j2);
                pa += 4;
                pb += 4;
                /* accumulate */
                acc1 = _mm_add_pd(acc1, s1);
                acc2 = _mm_add_pd(acc2, s2);
            }
        }
        /* final sum */
        acc1 = _mm_add_pd(acc1, acc2);
        acc1 = _mm_hadd_pd(acc1, acc1);
        _mm_store_sd(&w, acc1);
    }
    else {
        w = 0;
    }
    for (k=0; k<r; k++)
        w += (*pa++) * (*pb++);
    return w;
}
void init_xrpow_core_sse(gr_info * const cod_info, FLOAT xrpow[576], int upper, FLOAT * sum)
{
    int     i;
    float   tmp_max = 0;
    float   tmp_sum = 0;
    int     upper4 = (upper / 4) * 4;
    int     rest = upper - upper4;

    const vecfloat_union fabs_mask = {{ 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF }};
    const __m128 vec_fabs_mask = _mm_loadu_ps(&fabs_mask._float[0]);
    vecfloat_union vec_xrpow_max;
    vecfloat_union vec_sum;
    vecfloat_union vec_tmp;

    _mm_prefetch((char *) cod_info->xr, _MM_HINT_T0);
    _mm_prefetch((char *) xrpow, _MM_HINT_T0);

    vec_xrpow_max._m128 = _mm_set_ps1(0);
    vec_sum._m128 = _mm_set_ps1(0);

    for (i = 0; i < upper4; i += 4) {
        vec_tmp._m128 = _mm_loadu_ps(&(cod_info->xr[i]));          /* load */
        vec_tmp._m128 = _mm_and_ps(vec_tmp._m128, vec_fabs_mask);  /* fabs */
        vec_sum._m128 = _mm_add_ps(vec_sum._m128, vec_tmp._m128);
        vec_tmp._m128 = _mm_sqrt_ps(_mm_mul_ps(vec_tmp._m128, _mm_sqrt_ps(vec_tmp._m128)));
        vec_xrpow_max._m128 = _mm_max_ps(vec_xrpow_max._m128, vec_tmp._m128); /* retrieve max */
        _mm_storeu_ps(&(xrpow[i]), vec_tmp._m128);                 /* store into xrpow[] */
    }
    vec_tmp._m128 = _mm_set_ps1(0);
    switch (rest) {
    case 3: vec_tmp._float[2] = cod_info->xr[upper4+2];
    case 2: vec_tmp._float[1] = cod_info->xr[upper4+1];
    case 1: vec_tmp._float[0] = cod_info->xr[upper4+0];
        vec_tmp._m128 = _mm_and_ps(vec_tmp._m128, vec_fabs_mask);  /* fabs */
        vec_sum._m128 = _mm_add_ps(vec_sum._m128, vec_tmp._m128);
        vec_tmp._m128 = _mm_sqrt_ps(_mm_mul_ps(vec_tmp._m128, _mm_sqrt_ps(vec_tmp._m128)));
        vec_xrpow_max._m128 = _mm_max_ps(vec_xrpow_max._m128, vec_tmp._m128); /* retrieve max */
        switch (rest) {
        case 3: xrpow[upper4+2] = vec_tmp._float[2];
        case 2: xrpow[upper4+1] = vec_tmp._float[1];
        case 1: xrpow[upper4+0] = vec_tmp._float[0];
        default: break;
        }
    default: break;
    }
    tmp_sum = vec_sum._float[0] + vec_sum._float[1] + vec_sum._float[2] + vec_sum._float[3];
    {
        float ma = vec_xrpow_max._float[0] > vec_xrpow_max._float[1]
                 ? vec_xrpow_max._float[0] : vec_xrpow_max._float[1];
        float mb = vec_xrpow_max._float[2] > vec_xrpow_max._float[3]
                 ? vec_xrpow_max._float[2] : vec_xrpow_max._float[3];
        tmp_max = ma > mb ? ma : mb;
    }
    cod_info->xrpow_max = tmp_max;
    *sum = tmp_sum;
}
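/* Scalar sketch (not from the original source) of what the SSE loop above computes
 * per element: xrpow[i] = |xr[i]|^(3/4), evaluated as sqrt(|x| * sqrt(|x|)), while
 * accumulating the sum of |xr[i]| and the maximum of xrpow[i]. The function name and
 * signature are hypothetical; it covers the same `upper` elements the vector code
 * processes (upper4 full groups of four plus the remainder). */
#include <math.h>

static void init_xrpow_core_scalar(const float *xr, float *xrpow, int upper,
                                   float *sum, float *xrpow_max)
{
    float s = 0.0f, m = 0.0f;
    for (int i = 0; i < upper; i++) {
        float ax = fabsf(xr[i]);
        s += ax;                           /* sum of absolute spectral values */
        xrpow[i] = sqrtf(ax * sqrtf(ax));  /* == powf(ax, 0.75f) */
        if (xrpow[i] > m) m = xrpow[i];    /* track the running maximum */
    }
    *sum = s;
    *xrpow_max = m;
}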
void test_prefetch (char *p)
{
    _mm_prefetch (p, _MM_HINT_T0);
    _mm_prefetch (p+4, _MM_HINT_T1);
    _mm_prefetch (p+8, _MM_HINT_T2);
    _mm_prefetch (p+12, _MM_HINT_NTA);
}
void PreOver_FastSSE2(void* dest, const void* source1, const void* source2, size_t size)
{
    static const size_t stride = sizeof(__m128i)*4;
    static const u32 PSD = 64;

    static const __m128i lomask = _mm_set1_epi32(0x00FF00FF);

    assert(source1 != NULL && source2 != NULL && dest != NULL);
    assert(size % stride == 0);

    const __m128i* source128_1 = reinterpret_cast<const __m128i*>(source1);
    const __m128i* source128_2 = reinterpret_cast<const __m128i*>(source2);
    __m128i* dest128 = reinterpret_cast<__m128i*>(dest);

    __m128i d, s, a, rb, ag;

    // TODO: dynamic prefetch scheduling distance? needs to be optimized (R.N)
    for(int k = 0, length = size/stride; k < length; ++k)
    {
        // TODO: put prefetch between calculations? (R.N)
        _mm_prefetch(reinterpret_cast<const s8*>(source128_1+PSD), _MM_HINT_NTA);
        _mm_prefetch(reinterpret_cast<const s8*>(source128_2+PSD), _MM_HINT_NTA);

        // work on entire cacheline before next prefetch
        for(int n = 0; n < 4; ++n, ++dest128, ++source128_1, ++source128_2)
        {
            // TODO: assembly optimization use PSHUFD on moves before calculations, lower latency than MOVDQA (R.N)
            // http://software.intel.com/en-us/articles/fast-simd-integer-move-for-the-intel-pentiumr-4-processor/
            s = _mm_load_si128(source128_1);        // AABBGGRR
            d = _mm_load_si128(source128_2);        // AABBGGRR

            // set alpha to lo16 from dest_
            rb = _mm_srli_epi32(d, 24);             // 000000AA
            a = _mm_slli_epi32(rb, 16);             // 00AA0000
            a = _mm_or_si128(rb, a);                // 00AA00AA

            // fix alpha: a = a > 127 ? a+1 : a
            // NOTE: if removed an *overflow* will occur with large values (R.N)
            rb = _mm_srli_epi16(a, 7);
            a = _mm_add_epi16(a, rb);

            rb = _mm_and_si128(lomask, s);          // 00BB00RR   // unpack
            rb = _mm_mullo_epi16(rb, a);            // BBBBRRRR   // mul (D[A]*S)
            rb = _mm_srli_epi16(rb, 8);             // 00BB00RR   // prepack and div [(D[A]*S)]/255

            ag = _mm_srli_epi16(s, 8);              // 00AA00GG   // unpack
            ag = _mm_mullo_epi16(ag, a);            // AAAAGGGG   // mul (D[A]*S)
            ag = _mm_andnot_si128(lomask, ag);      // AA00GG00   // prepack and div [(D[A]*S)]/255

            rb = _mm_or_si128(rb, ag);              // AABBGGRR   // pack

            rb = _mm_sub_epi8(s, rb);               // sub S-[(D[A]*S)/255]
            d = _mm_add_epi8(d, rb);                // add D+[S-(D[A]*S)/255]

            _mm_store_si128(dest128, d);
        }
    }
}
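/* Scalar sketch (not from the original source) of the premultiplied-alpha "over"
 * that PreOver_FastSSE2 computes per pixel: out = D + S - (D.a * S)/255, where D.a
 * is the alpha byte of D (source2) and the /255 is approximated exactly as in the
 * SSE path (a' = a + (a >> 7), then a multiply and >> 8). The function name is
 * hypothetical; pixels are 32-bit AABBGGRR values (alpha is byte 3 in memory). */
#include <stdint.h>
#include <stddef.h>

static void preover_scalar(uint8_t *dest, const uint8_t *src1, const uint8_t *src2,
                           size_t pixels)
{
    for (size_t p = 0; p < pixels; ++p) {
        const uint8_t *S = src1 + 4 * p;   /* "s" in the SSE code */
        const uint8_t *D = src2 + 4 * p;   /* "d" in the SSE code */
        uint8_t *O = dest + 4 * p;
        unsigned a = D[3];                 /* destination alpha */
        a += a >> 7;                       /* 255 -> 256 so that >>8 approximates /255 */
        for (int c = 0; c < 4; ++c) {
            unsigned t = (S[c] * a) >> 8;            /* (D.a * S)/255 */
            O[c] = (uint8_t)(D[c] + S[c] - t);       /* D + [S - (D.a*S)/255] */
        }
    }
}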
/* Fast remote SCI copy for systems with write-combining enabled. This is the
   version using SSE instructions to copy 128 Byte blocks, and flushes after
   64 Byte. */
void _mpid_smi_sse64_memcpy(void *dest, const void *src, size_t size)
{
    char* a = (char*) src;
    char* b = (char*) dest;
    size_t j = 0;
    __m128 xmm[8];

    /* Align the destination to a 64 Byte boundary */
    for(; (j < size) && (((size_t) &b[j]) % 64 != 0); j++)
        ((char*) b)[j] = ((char*) a)[j];

    // Loads two cache lines of data to a location closer to the processor.
    _mm_prefetch(a+j, _MM_HINT_NTA);
    _mm_prefetch(a+j+64, _MM_HINT_NTA);

    /* copy 128 byte per loop */
    for (; (j+128) < size; j+=128) {
        // Loads two cache lines of data to a location closer to the processor.
        _mm_prefetch(a+j+128, _MM_HINT_NTA);
        _mm_prefetch(a+j+192, _MM_HINT_NTA);

        /* load 128 Byte into xmm registers */
        xmm[0] = _mm_load_ps((float*) &a[j]);
        xmm[1] = _mm_load_ps((float*) &a[j+16]);
        xmm[2] = _mm_load_ps((float*) &a[j+32]);
        xmm[3] = _mm_load_ps((float*) &a[j+48]);
        xmm[4] = _mm_load_ps((float*) &a[j+64]);
        xmm[5] = _mm_load_ps((float*) &a[j+80]);
        xmm[6] = _mm_load_ps((float*) &a[j+96]);
        xmm[7] = _mm_load_ps((float*) &a[j+112]);

        /* store 64 byte */
        _mm_stream_ps((float*) &b[j],    xmm[0]);
        _mm_stream_ps((float*) &b[j+16], xmm[1]);
        _mm_stream_ps((float*) &b[j+32], xmm[2]);
        _mm_stream_ps((float*) &b[j+48], xmm[3]);
        /* flush the write-combine buffer */
        _mm_sfence();

        /* store 64 byte */
        _mm_stream_ps((float*) &b[j+64],  xmm[4]);
        _mm_stream_ps((float*) &b[j+80],  xmm[5]);
        _mm_stream_ps((float*) &b[j+96],  xmm[6]);
        _mm_stream_ps((float*) &b[j+112], xmm[7]);
        /* flush the write-combine buffer */
        _mm_sfence();
    }

    /* copy tail */
    for(; j<size; j++)
        ((char*) b)[j] = ((char*) a)[j];
}
double evaluateGTRGAMMA_MIC(int *ex1, int *ex2, int *wgt, double *x1_start, double *x2_start, double *tipVector,
                            unsigned char *tipX1, const int n, double *diagptable, const pllBoolean fastScaling)
{
    double sum = 0.0;

    /* the left node is a tip */
    if(tipX1)
    {
        double aTipVec[256] __attribute__((align(PLL_BYTE_ALIGNMENT)));
        for(int k = 0; k < 16; k++)
        {
            for(int l = 0; l < 4; l++)
            {
                aTipVec[k*16 + l] = aTipVec[k*16 + 4 + l] = aTipVec[k*16 + 8 + l] = aTipVec[k*16 + 12 + l] = tipVector[k*4 + l];
            }
        }

        /* loop over the sites of this partition */
        for (int i = 0; i < n; i++)
        {
            _mm_prefetch((const char*) &x2_start[span*(i+8)], _MM_HINT_T1);
            _mm_prefetch((const char*) &x2_start[span*(i+8) + 8], _MM_HINT_T1);
            _mm_prefetch((const char*) &x2_start[span*(i+1)], _MM_HINT_T0);
            _mm_prefetch((const char*) &x2_start[span*(i+1) + 8], _MM_HINT_T0);

            /* access pre-computed tip vector values via a lookup table */
            const double *x1 = &(aTipVec[16 * tipX1[i]]);
            /* access the other (inner) node at the other end of the branch */
            const double *x2 = &(x2_start[span * i]);

            double term = 0.;

            #pragma ivdep
            #pragma vector aligned
            for(int j = 0; j < span; j++)
                term += x1[j] * x2[j] * diagptable[j];

            if(!fastScaling)
                term = log(0.25 * term) + (ex2[i] * log(PLL_MINLIKELIHOOD));
            else
                term = log(0.25 * term);

            sum += wgt[i] * term;
        }
    }
    else
    {
        for (int i = 0; i < n; i++)
inline
T atomic_compare_exchange( volatile T * const dest , const T compare ,
    typename Kokkos::Impl::enable_if<
                  ( sizeof(T) != 4 )
               && ( sizeof(T) != 8 )
          #if defined(KOKKOS_ENABLE_ASM) && defined ( KOKKOS_ENABLE_ISA_X86_64 )
               && ( sizeof(T) != 16 )
          #endif
             , const T >::type& val )
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif

  while( !Impl::lock_address_host_space( (void*) dest ) );
  T return_val = *dest;
  if( return_val == compare ) {
    // Don't use the following line of code here:
    //
    //const T tmp = *dest = val;
    //
    // Instead, put each assignment in its own statement.  This is
    // because the overload of T::operator= for volatile *this should
    // return void, not volatile T&.  See Kokkos #177:
    //
    // https://github.com/kokkos/kokkos/issues/177
    *dest = val;
    const T tmp = *dest;
    #ifndef KOKKOS_COMPILER_CLANG
    (void) tmp;
    #endif
  }
  Impl::unlock_address_host_space( (void*) dest );
  return return_val;
}
void TripletConnection::propagate_backward()
{
    if (stdp_active) {
        SpikeContainer::const_iterator spikes_end = dst->get_spikes_immediate()->end();
        // loop over all spikes
        for (SpikeContainer::const_iterator spike = dst->get_spikes_immediate()->begin() ; // spike = post_spike
                spike != spikes_end ;
                ++spike ) {
            // Since we need the local id of the postsynaptic neuron that spiked
            // multiple times, we translate it here:
            NeuronID translated_spike = dst->global2rank(*spike);

            // loop over all presynaptic partners
            for (const NeuronID * c = bkw->get_row_begin(*spike) ; c != bkw->get_row_end(*spike) ; ++c ) {

#ifdef CODE_ACTIVATE_PREFETCHING_INTRINSICS
                // prefetches next memory cells to reduce number of last-level cache misses
                _mm_prefetch((const char *)bkw_data[c-bkw_ind+2], _MM_HINT_NTA);
#endif

                // computes plasticity update
                AurynWeight * weight = bkw->get_data(c);
                *weight += dw_post(*c, translated_spike);

                // clips too large weights
                if (*weight > get_max_weight()) *weight = get_max_weight();
            }
        }
    }
}
char *gen(char *buf, int size, int offset)
{
    int i;
    for(i=0; i<size; i++){
        buf[i+offset] = (rand_r(&r_seed)%128)+1;
    }
    buf[i+offset] = 0;
#ifdef DIRTY_CACHE
    /* As we wrote to writeback cache we are dealing with dirty cache lines. */
#else
    for(i=0; i<=size+64; i+=64){
        _mm_prefetch(buf+i+offset, _MM_HINT_T0);
    }
#endif
#ifdef NO_CACHE
    for(i=0; i<=size+64; i+=64){
        _mm_clflush(buf+i+offset);
    }
#endif
#ifdef NO_ICACHE
    forget_icache(rand_r(&r_seed)%2048);
    forget_icache(rand_r(&r_seed)%2048);
    forget_icache(rand_r(&r_seed)%2048);
    forget_icache(rand_r(&r_seed)%2048);
#endif
    return buf+offset;
}
// TODO: should be optimized for different combinations (R.N)
void Shuffle_SSE2(void* dest, const void* source, size_t size, const u8 red, const u8 green, const u8 blue, const u8 alpha)
{
    static const size_t stride = sizeof(__m128i)*4;
    static const u32 PSD = 64;

    static const __m128i himask = _mm_set1_epi32(0xFF000000);
    static const __m128i lomask = _mm_set1_epi32(0x000000FF);

    assert(source != NULL && dest != NULL);
    assert(red > -1 && red < 4 && green > -1 && green < 4 && blue > -1 && blue < 4 && alpha > -1 && alpha < 4);
    assert(size % stride == 0);

    const __m128i* source128 = reinterpret_cast<const __m128i*>(source);
    __m128i* dest128 = reinterpret_cast<__m128i*>(dest);

    __m128i s, m0, m1, r;

    const int shft0 = (red)*8;
    const int shft1 = (green)*8;
    const int shft2 = (3-blue)*8;
    const int shft3 = (3-alpha)*8;

    for(int k = 0, length = size/stride; k < length; ++k)
    {
        // TODO: dynamic prefetch scheduling distance? needs to be optimized (R.N)
        // TODO: put prefetch between calculations? (R.N)
        _mm_prefetch(reinterpret_cast<const s8*>(source128 + PSD), _MM_HINT_NTA);

        // work on entire cacheline before next prefetch
        // TODO: assembly optimization use PSHUFD on moves before calculations, lower latency than MOVDQA (R.N)
        // http://software.intel.com/en-us/articles/fast-simd-integer-move-for-the-intel-pentiumr-4-processor/
        for(int n = 0; n < 4; ++n, ++dest128, ++source128)
        {
            s = _mm_load_si128(source128);

            m0 = _mm_srli_epi32(s, shft0);
            m0 = _mm_and_si128(m0, lomask);
            m1 = _mm_srli_epi32(s, shft1);
            m1 = _mm_and_si128(m1, lomask);
            m1 = _mm_slli_epi32(m1, 8);
            r = _mm_or_si128(m0, m1);

            m0 = _mm_slli_epi32(s, shft2);
            m0 = _mm_and_si128(m0, himask);
            m0 = _mm_srli_epi32(m0, 8);
            m1 = _mm_slli_epi32(s, shft3);
            m1 = _mm_and_si128(m1, himask);
            m0 = _mm_or_si128(m0, m1);

            r = _mm_or_si128(r, m0);
            _mm_store_si128(dest128, r);
        }
    }
}
inline
T atomic_exchange( volatile T * const dest ,
    typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(Impl::cas128_t) , const T & >::type val )
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif

  union U {
    Impl::cas128_t i ;
    T t ;
    inline U() {};
  } assume , oldval , newval ;

  oldval.t = *dest ;
  newval.t = val;

  do {
    assume.i = oldval.i ;
    oldval.i = Impl::cas128( (volatile Impl::cas128_t*) dest , assume.i , newval.i );
  } while ( assume.i != oldval.i );

  return oldval.t ;
}
inline
void atomic_assign( volatile T * const dest ,
    typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) || sizeof(T) == sizeof(long)
                                    , const T & >::type val )
{
  typedef typename Kokkos::Impl::if_c< sizeof(T) == sizeof(int) , int , long >::type type ;

#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif

  const type v = *((type*)&val); // Extract to be sure the value doesn't change

  type assumed ;

  union U {
    T val_T ;
    type val_type ;
    inline U() {};
  } old ;

  old.val_T = *dest ;

  do {
    assumed = old.val_type ;
    old.val_type = __sync_val_compare_and_swap( (volatile type *) dest , assumed , v );
  } while ( assumed != old.val_type );
}
__inline__ __device__
T atomic_exchange( volatile T * const dest ,
    typename Kokkos::Impl::enable_if< ( sizeof(T) != 4 ) && ( sizeof(T) != 8 ) , const T >::type& val )
{
  T return_val;
  // This is a way to (hopefully) avoid dead lock in a warp
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif

  int done = 0;
  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1);
  unsigned int done_active = 0;
  while (active != done_active) {
    if(!done) {
      if( Impl::lock_address_cuda_space( (void*) dest ) ) {
        return_val = *dest;
        *dest = val;
        Impl::unlock_address_cuda_space( (void*) dest );
        done = 1;
      }
    }
    done_active = KOKKOS_IMPL_CUDA_BALLOT(done);
  }
  return return_val;
}
void * memcpy ( void * destination, const void * source, size_t num )
{
    const Uint8 *src = (const Uint8 *)source;
    Uint8 *dst = (Uint8 *)destination;
    size_t i;

    /* All WIN64 architectures have SSE, right? */
    if (!((uintptr_t) src & 15) && !((uintptr_t) dst & 15)) {
        __m128 values[4];
        for (i = num / 64; i--;) {
            _mm_prefetch(src, _MM_HINT_NTA);
            values[0] = *(__m128 *) (src + 0);
            values[1] = *(__m128 *) (src + 16);
            values[2] = *(__m128 *) (src + 32);
            values[3] = *(__m128 *) (src + 48);
            _mm_stream_ps((float *) (dst + 0), values[0]);
            _mm_stream_ps((float *) (dst + 16), values[1]);
            _mm_stream_ps((float *) (dst + 32), values[2]);
            _mm_stream_ps((float *) (dst + 48), values[3]);
            src += 64;
            dst += 64;
        }
        num &= 63;
    }

    while (num--) {
        *dst++ = *src++;
    }
    return destination;
}
inline
long atomic_compare_exchange( volatile long * const dest, const long compare, const long val )
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
  return __sync_val_compare_and_swap(dest, compare, val);
}
void _mm_prefetch_buffer(char * buffer, int num_bytes)
{
    __m128i * buf = (__m128i*) buffer;
    unsigned int i;
    /* touch one cache line at a time across the whole buffer */
    for (i = 0; i < (num_bytes / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i))) {
        _mm_prefetch((char*)(&buf[i]), _MM_HINT_NTA);
    }
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
static void Prefetch(void* address)
{
#if defined(PLATFORM_64)
    #if defined(PLATFORM_WINDOWS)
        _mm_prefetch((char*)address, _MM_HINT_NTA);
    #else
        static_assert(false, "Not yet implemented.");
    #endif
#endif
}
double zdotu_soa( const int N,
                  const double* da, const double* db, const int ix,
                  const double* dc, const double* dd, const int iy,
                  double* res )
{
    __m256d ymm0;
    __m256d ymm1;
    __m256d ymm2;
    __m256d ymm3;
    __m256d ymm4 = _mm256_setzero_pd();
    __m256d ymm5 = _mm256_setzero_pd();

    int ii;

//#pragma unroll
    for(ii = 0; ii < N/4; ii++)
    {
        _mm_prefetch((const char*) da + 0x200, 1);
        _mm_prefetch((const char*) db + 0x200, 1);
        _mm_prefetch((const char*) dc + 0x200, 1);
        _mm_prefetch((const char*) dd + 0x200, 1);

        //IACA_START;

        // 8*4*4 = 128 bytes
        ymm0 = _mm256_loadu_pd(da + 4*ii);
        ymm1 = _mm256_loadu_pd(db + 4*ii);
        ymm2 = _mm256_loadu_pd(dc + 4*ii);
        ymm3 = _mm256_loadu_pd(dd + 4*ii);

        // 2*4*4 = 32 flops
        ymm4 = _mm256_fmsub_pd(ymm0, ymm2, _mm256_fmsub_pd(ymm1, ymm3, ymm4));
        ymm5 = _mm256_fmadd_pd(ymm0, ymm3, _mm256_fmadd_pd(ymm1, ymm2, ymm5));

        // flops/byte ratio = 1/4
        //IACA_END
    }

    double* re = (double*)&ymm4;
    double* im = (double*)&ymm5;

    res[0] = re[0] + re[1] + re[2] + re[3];
    res[1] = im[0] + im[1] + im[2] + im[3];
}
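/* Scalar sketch (not from the original source) of the unconjugated complex dot
 * product zdotu_soa accumulates over split real/imaginary (SoA) arrays, returning
 * the result through res just like the vector version:
 *   res[0] = sum_k ( da[k]*dc[k] - db[k]*dd[k] )   (real part)
 *   res[1] = sum_k ( da[k]*dd[k] + db[k]*dc[k] )   (imaginary part)
 * The function name is hypothetical; like the AVX/FMA loop above it ignores the
 * ix/iy stride arguments and any tail beyond the last full group of four. */
static void zdotu_soa_scalar(int N, const double *da, const double *db,
                             const double *dc, const double *dd, double *res)
{
    double re = 0.0, im = 0.0;
    for (int k = 0; k < N; ++k) {
        re += da[k] * dc[k] - db[k] * dd[k];   /* (a+ib)(c+id): real part */
        im += da[k] * dd[k] + db[k] * dc[k];   /* (a+ib)(c+id): imaginary part */
    }
    res[0] = re;
    res[1] = im;
}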
void simd_memcpy_cache(void *dst, void *src, size_t nbytes)
{
    size_t i;
    size_t sm = nbytes - nbytes%sizeof(int);
    size_t ilen = nbytes/sizeof(int);
    size_t ilen_sm = ilen - ilen%16;
    //printf("nbytes=%zu,ilen=%zu,ilen_sm=%zu\n", nbytes, ilen, ilen_sm);
    char *cdst = (char*)dst;
    char *csrc = (char*)src;
    int  *idst = (int*)dst;
    int  *isrc = (int*)src;
    __m128i l0, l1, l2, l3;

    _mm_prefetch((__m128i*)&isrc[0],  _MM_HINT_T0);
    _mm_prefetch((__m128i*)&isrc[4],  _MM_HINT_T0);
    _mm_prefetch((__m128i*)&isrc[8],  _MM_HINT_T0);
    _mm_prefetch((__m128i*)&isrc[12], _MM_HINT_T0);

    for(i=0; i<ilen_sm; i+=16) {
        l0 = _mm_load_si128((__m128i*)&isrc[i+0]);
        l1 = _mm_load_si128((__m128i*)&isrc[i+4]);
        l2 = _mm_load_si128((__m128i*)&isrc[i+8]);
        l3 = _mm_load_si128((__m128i*)&isrc[i+12]);

        _mm_prefetch((__m128i*)&isrc[i+16], _MM_HINT_T0);
        _mm_prefetch((__m128i*)&isrc[i+20], _MM_HINT_T0);
        _mm_prefetch((__m128i*)&isrc[i+24], _MM_HINT_T0);
        _mm_prefetch((__m128i*)&isrc[i+28], _MM_HINT_T0);

        _mm_store_si128((__m128i*)&idst[i+0],  l0);
        _mm_store_si128((__m128i*)&idst[i+4],  l1);
        _mm_store_si128((__m128i*)&idst[i+8],  l2);
        _mm_store_si128((__m128i*)&idst[i+12], l3);
    }

    for(i=ilen_sm; i<ilen; i++) {
        idst[i] = isrc[i];
    }
    for(i=(ilen*4); i<nbytes; i++) {
        cdst[i] = csrc[i];
    }
}
void prefetch(void const* pointer)
{
#ifdef BOOST_SIMD_ARCH_X86
  #ifdef __GNUC__
    __builtin_prefetch(pointer, 0, 0);
  #elif defined( BOOST_SIMD_HAS_SSE_SUPPORT )
    _mm_prefetch( static_cast<char const *>(pointer), Strategy);
  #endif
#endif
}
void prefetch_Cblock(const double* C, int col, int row, int m, int n, int k, int bm, int bn, int bk)
{
    double* C_prefetch = (double*)C + (col * m + row);
    for(int i = 0; i < bn; i++){
        double* C_prefetch_m = C_prefetch;
        for(int j = 0; j < (bm + CACHE_LINE - 1) / CACHE_LINE; j++){
            _mm_prefetch(C_prefetch_m, L2);
            C_prefetch_m += CACHE_LINE;
        }
        C_prefetch += m;
    }
}
void prefetch_Bblock(const double* B, int col, int row, int m, int n, int k, int bm, int bn, int bk)
{
    double* B_prefetch = (double*)B + (col * k + row);
    for(int i = 0; i < bn; i++){
        double* B_prefetch_k = B_prefetch;
        for(int j = 0; j < (bk + CACHE_LINE - 1) / CACHE_LINE; j++){
            _mm_prefetch(B_prefetch_k, L2);
            B_prefetch_k += CACHE_LINE;
        }
        B_prefetch += k;
    }
}
void prefetch_Ablock(const double* A, int col, int row, int m, int n, int k, int bm, int bn, int bk)
{
    double* A_prefetch = (double*)A + (col * m + row);
    for(int i = 0; i < bk; i++){
        double* A_prefetch_m = A_prefetch;
        for(int j = 0; j < (bm + CACHE_LINE - 1) / CACHE_LINE; j++){
            _mm_prefetch(A_prefetch_m, L2);
            A_prefetch_m += CACHE_LINE;
        }
        A_prefetch += m;
    }
}
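/* Hypothetical call site (not from the original source) showing how the three
 * block-prefetch helpers above map onto block coordinates of a blocked GEMM.
 * Column-major storage with leading dimensions m (for A and C) and k (for B) is
 * assumed, matching the "col * m + row" / "col * k + row" indexing; CACHE_LINE and
 * L2 (an _MM_HINT_* value) are assumed to be defined as in the surrounding file.
 * ic/jc/pc are the row, column, and inner-dimension offsets of the block to warm. */
static void prefetch_next_blocks(const double *A, const double *B, const double *C,
                                 int ic, int jc, int pc,
                                 int m, int n, int k, int bm, int bn, int bk)
{
    prefetch_Ablock(A, pc, ic, m, n, k, bm, bn, bk);  /* A block: rows ic.., cols pc.. */
    prefetch_Bblock(B, jc, pc, m, n, k, bm, bn, bk);  /* B block: rows pc.., cols jc.. */
    prefetch_Cblock(C, jc, ic, m, n, k, bm, bn, bk);  /* C block: rows ic.., cols jc.. */
}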
static void adddiff_sse2_t(Byte *pDst, ptrdiff_t dst_pitch, const Byte *pSrc, ptrdiff_t src_pitch, int width, int height)
{
    int mod32_width = (width / 32) * 32;
    auto pDst2 = pDst;
    auto pSrc2 = pSrc;
    auto v128 = _mm_set1_epi32(0x80808080);

    for ( int j = 0; j < height; ++j ) {
        for ( int i = 0; i < mod32_width; i+=32 ) {
            _mm_prefetch(reinterpret_cast<const char*>(pDst)+i+128, _MM_HINT_T0);
            _mm_prefetch(reinterpret_cast<const char*>(pSrc)+i+128, _MM_HINT_T0);

            auto dst  = simd_load_si128<mem_mode>(pDst+i);
            auto dst2 = simd_load_si128<mem_mode>(pDst+i+16);
            auto src  = simd_load_si128<mem_mode>(pSrc+i);
            auto src2 = simd_load_si128<mem_mode>(pSrc+i+16);

            auto dstsub  = _mm_sub_epi8(dst, v128);
            auto dstsub2 = _mm_sub_epi8(dst2, v128);
            auto srcsub  = _mm_sub_epi8(src, v128);
            auto srcsub2 = _mm_sub_epi8(src2, v128);

            auto added  = _mm_adds_epi8(dstsub, srcsub);
            auto added2 = _mm_adds_epi8(dstsub2, srcsub2);

            auto result  = _mm_add_epi8(added, v128);
            auto result2 = _mm_add_epi8(added2, v128);

            simd_store_si128<mem_mode>(pDst+i, result);
            simd_store_si128<mem_mode>(pDst+i+16, result2);
        }
        pDst += dst_pitch;
        pSrc += src_pitch;
    }

    if (width > mod32_width) {
        adddiff_c(pDst2 + mod32_width, dst_pitch, pSrc2 + mod32_width, src_pitch, width - mod32_width, height);
    }
}
static zend_always_inline void fast_memcpy(void *dest, const void *src, size_t size)
{
    __m128i *dqdest = (__m128i*)dest;
    const __m128i *dqsrc = (const __m128i*)src;
    const __m128i *end = (const __m128i*)((const char*)src + size);

    do {
        _mm_prefetch(dqsrc + 4, _MM_HINT_NTA);
        _mm_prefetch(dqsrc + 6, _MM_HINT_NTA);

        __m128i xmm0 = _mm_load_si128(dqsrc + 0);
        __m128i xmm1 = _mm_load_si128(dqsrc + 1);
        __m128i xmm2 = _mm_load_si128(dqsrc + 2);
        __m128i xmm3 = _mm_load_si128(dqsrc + 3);
        dqsrc += 4;
        _mm_stream_si128(dqdest + 0, xmm0);
        _mm_stream_si128(dqdest + 1, xmm1);
        _mm_stream_si128(dqdest + 2, xmm2);
        _mm_stream_si128(dqdest + 3, xmm3);
        dqdest += 4;
    } while (dqsrc != end);
}
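/* Hypothetical wrapper (not from the original source, assumed to live in the same
 * translation unit as fast_memcpy above): fast_memcpy consumes 64 bytes per
 * iteration and terminates on pointer equality, so it should only be called with
 * 16-byte-aligned buffers and a non-zero size that is a multiple of 64; copy_blob
 * is an illustrative name that falls back to memcpy otherwise. */
#include <stdint.h>
#include <string.h>

static void copy_blob(void *dst, const void *src, size_t size)
{
    if (size >= 64 && (size % 64) == 0 &&
        ((uintptr_t)dst % 16) == 0 && ((uintptr_t)src % 16) == 0) {
        fast_memcpy(dst, src, size);   /* streaming, cache-bypassing path */
    } else {
        memcpy(dst, src, size);        /* fall back for odd sizes/alignments */
    }
}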