void dscal(unsigned int N, double a, double* y) {
    flops_counter += N;
#ifdef GX_SSE
    if(SSE2_supported) {
        __m128d Y1, Y2, AA;
        SSE_ALIGNED(double temp[2]);
        temp[0] = a;
        temp[1] = a;
        AA = _mm_load_pd(temp);
        unsigned int i = 0;
        /* 2x-unrolled loop: assumes y is 16-byte aligned and N is a multiple of 4 */
        while(i < N) {
            _mm_prefetch((const char*)(&y[i] + 128), _MM_HINT_NTA);
            Y1 = _mm_load_pd(&y[i]);
            Y1 = _mm_mul_pd(Y1, AA);
            i += 2;
            Y2 = _mm_load_pd(&y[i]);
            Y2 = _mm_mul_pd(Y2, AA);
            i += 2;
            _mm_stream_pd(&y[i - 4], Y1);
            _mm_stream_pd(&y[i - 2], Y2);
        }
        _mm_sfence();
        return;
    }
#endif
    for(unsigned int i = 0; i < N; i++) {
        y[i] *= a;
    }
}
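/*
 * Hedged sketch, not from the original source: dscal() above streams four
 * doubles per iteration, so it silently requires N to be a multiple of 4.
 * A hypothetical caller-side wrapper like dscal_any() below could funnel
 * the aligned bulk through dscal() and finish the remainder with scalar
 * code (note: the tail flops are not added to flops_counter here).
 */
static void dscal_any(unsigned int N, double a, double* y) {
    unsigned int bulk = N & ~3u;              /* largest multiple of 4 <= N */
    dscal(bulk, a, y);                        /* SIMD fast path */
    for(unsigned int i = bulk; i < N; i++) {  /* scalar tail */
        y[i] *= a;
    }
}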
void dmul(unsigned int N, const double* a, const double* b, double* y) {
    flops_counter += N;
#ifdef GX_SSE
    if(SSE2_supported) {
        __m128d Y1, Y2, A1, A2, B1, B2;
        unsigned int i = 0;
        while(i < N) {
            _mm_prefetch((const char*)(&a[i] + 256), _MM_HINT_NTA);
            _mm_prefetch((const char*)(&b[i] + 256), _MM_HINT_NTA);
            A1 = _mm_load_pd(&a[i]);
            B1 = _mm_load_pd(&b[i]);
            Y1 = _mm_mul_pd(A1, B1);
            i += 2;
            A2 = _mm_load_pd(&a[i]);
            B2 = _mm_load_pd(&b[i]);
            Y2 = _mm_mul_pd(A2, B2);
            i += 2;
            _mm_stream_pd(&y[i - 4], Y1);
            _mm_stream_pd(&y[i - 2], Y2);
        }
        _mm_sfence();
        return;
    }
#endif
    for(unsigned int i = 0; i < N; i++) {
        y[i] = a[i] * b[i];
    }
}
void dzero(unsigned int N, double* y) {
#ifdef GX_SSE
    if(SSE2_supported) {
        __m128d Z = _mm_setzero_pd();
        /* streams 4 doubles per iteration: requires 16-byte aligned y
           and N a multiple of 4 */
        for(unsigned int i = 0; i < N; i += 4) {
            _mm_stream_pd(&y[i], Z);
            _mm_stream_pd(&y[i + 2], Z);
        }
        _mm_sfence();
        return;
    }
#endif
    memset(y, 0, N * sizeof(double));
}
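/*
 * Hedged usage sketch (my assumption, not from the source): every kernel
 * above uses 16-byte aligned loads and non-temporal stores, so the buffers
 * they touch must be at least 16-byte aligned.  posix_memalign is one
 * portable way to satisfy that; the helper name is hypothetical.
 */
#include <stdlib.h>
static double* alloc_aligned_doubles(unsigned int N) {
    void* p = NULL;
    /* 16 bytes is the minimum alignment for _mm_load_pd/_mm_stream_pd */
    if(posix_memalign(&p, 16, N * sizeof(double)) != 0) return NULL;
    return (double*)p;
}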
void transpose_aligned(double *a, double *b, int N1, int N2, double factor) {
    int i, j, k, it, jt;
    double *pA, *pB;
    register __m128d x, y, z, w, fac_vector;

    /* broadcast factor into both lanes */
    fac_vector = _mm_load_sd(&factor);
    fac_vector = _mm_unpacklo_pd(fac_vector, fac_vector);

    for (it = 0; it < N1; it = it + tilesize) {
        for (jt = 0; jt < N2; jt = jt + tilesize) {
            /* transpose one tile of a into buf, scaling by factor */
            k = 0;
            for (j = jt; j < jt + tilesize; j = j + 2) {
                for (i = it; i < it + tilesize; i = i + 2) {
                    pA = a + i*N2 + j;
                    x = _mm_load_pd(pA);
                    y = _mm_load_pd(pA + N2);
                    x = _mm_mul_pd(x, fac_vector);
                    y = _mm_mul_pd(y, fac_vector);
                    z = _mm_shuffle_pd(x, y, 0);
                    w = _mm_shuffle_pd(x, y, 3);
                    k = (j - jt)*tilesize + (i - it);
                    _mm_store_pd(buf + k, z);
                    _mm_store_pd(buf + k + tilesize, w);
                }
            }
            /* stream the transposed tile out to b */
            for (j = jt; j < jt + tilesize; j++) {
                pB = b + j*N1 + it;
                k = (j - jt)*tilesize;
                x = _mm_load_pd(&buf[k]);
                y = _mm_load_pd(&buf[k] + 2);
                z = _mm_load_pd(&buf[k] + 2*2);
                w = _mm_load_pd(&buf[k] + 3*2);
                _mm_stream_pd(pB, x);
                _mm_stream_pd(pB + 2, y);
                _mm_stream_pd(pB + 2*2, z);
                _mm_stream_pd(pB + 3*2, w);
            }
        }
    }
}
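/*
 * Hedged reconstruction of context the snippet above omits (my assumption,
 * not from the source): transpose_aligned() uses a file-scope tile buffer
 * 'buf' and a tile size 'tilesize' that are not shown.  The write-back loop
 * emits exactly four 2-double vectors per row, which only covers a full row
 * when tilesize == 8, so the missing definitions presumably looked roughly
 * like this (appearing before the function in the original file):
 */
enum { tilesize = 8 };
static double buf[tilesize * tilesize] __attribute__((aligned(16)));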
void test_mm_stream_pd(double *A, __m128d B) {
  // DAG-LABEL: test_mm_stream_pd
  // DAG: store <2 x double> %{{.*}}, <2 x double>* %{{.*}}, align 16, !nontemporal
  //
  // ASM-LABEL: test_mm_stream_pd
  // ASM: movntpd
  _mm_stream_pd(A, B);
}
void dcopy(unsigned int N, const double* x, double* y) {
#ifdef GX_SSE
    if(SSE2_supported) {
        __m128d X1, X2;
        unsigned int i = 0;
        while(i < N) {
            /* prefetch the source; the destination is written with
               non-temporal stores and should stay out of the cache
               (the original prefetched &y[i], which defeats that) */
            _mm_prefetch((const char*)(&x[i] + 128), _MM_HINT_NTA);
            X1 = _mm_load_pd(&x[i]);
            i += 2;
            X2 = _mm_load_pd(&x[i]);
            i += 2;
            _mm_stream_pd(&y[i - 4], X1);
            _mm_stream_pd(&y[i - 2], X2);
        }
        _mm_sfence();
        return;
    }
#endif
    memcpy(y, x, N * sizeof(double));
}
inline __always_inline static void sse2_memzero128aligned(void *ptr, int n) {
    __m128d d = (__m128d)_mm_setzero_si128();
    assert(((stm_word_t)ptr) % 16 == 0);
    assert(n % 128 == 0);
    char *p, *endptr = ((char*)ptr) + n;
    for(p = ptr; p < endptr; p += 128) {
        _mm_stream_pd((double*)&p[0], d);
        _mm_stream_pd((double*)&p[16], d);
        _mm_stream_pd((double*)&p[32], d);
        _mm_stream_pd((double*)&p[48], d);
        _mm_stream_pd((double*)&p[64], d);
        _mm_stream_pd((double*)&p[80], d);
        _mm_stream_pd((double*)&p[96], d);
        _mm_stream_pd((double*)&p[112], d);
    }
    _mm_sfence();
}
void dotmul_intrinsic(double *A, double *B, double &C, int SIZE) {
    register int k;
    alignas(16) double sarr[2];  /* _mm_stream_pd requires 16-byte alignment;
                                    a plain double[2] does not guarantee it */
    register __m128d partial_sum = _mm_setzero_pd();
    register __m128d catch_multiplication = _mm_setzero_pd();
    for(k = 0; k < SIZE; k += 2) {
        // load 128 bits of data (2 x double)
        register __m128d a = _mm_load_pd(&A[k]);
        register __m128d b = _mm_load_pd(&B[k]);
        catch_multiplication = _mm_mul_pd(a, b);
        partial_sum = _mm_add_pd(partial_sum, catch_multiplication);
    }
    _mm_stream_pd(sarr, partial_sum);
    C = sarr[0] + sarr[1];
}
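/*
 * Hedged alternative sketch (mine, not from the source): the final
 * reduction in dotmul_intrinsic can stay in registers, avoiding the
 * round-trip through sarr[] entirely.  _mm_unpackhi_pd copies the upper
 * lane down, _mm_add_sd adds the scalar lanes, and _mm_cvtsd_f64 extracts
 * the result.
 */
static double hsum_pd(__m128d v) {
    __m128d hi = _mm_unpackhi_pd(v, v);        /* upper lane in both slots */
    return _mm_cvtsd_f64(_mm_add_sd(v, hi));   /* low lane = v[0] + v[1] */
}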
inline static void sse2_memset128aligned(void *ptr, int n, stm_word_t word) {
#ifdef __LP64__
    __m128d d = (__m128d)_mm_set_epi64((__m64)word, (__m64)word);
#else
    __m128d d = (__m128d)_mm_set_epi32(word, word, word, word);
#endif
    assert(((stm_word_t)ptr) % 16 == 0);
    assert(n % 128 == 0);
    char *p, *endptr = ((char*)ptr) + n;
    for(p = ptr; p < endptr; p += 128) {
        _mm_stream_pd((double*)&p[0], d);
        _mm_stream_pd((double*)&p[16], d);
        _mm_stream_pd((double*)&p[32], d);
        _mm_stream_pd((double*)&p[48], d);
        _mm_stream_pd((double*)&p[64], d);
        _mm_stream_pd((double*)&p[80], d);
        _mm_stream_pd((double*)&p[96], d);
        _mm_stream_pd((double*)&p[112], d);
    }
    _mm_sfence();
}
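/*
 * Hedged portability note (my assumption, not from the source): __m64 and
 * _mm_set_epi64 are unavailable on some 64-bit toolchains (e.g. 64-bit
 * MSVC).  The same replicated-word vector can be built with the SSE2
 * intrinsic _mm_set1_epi64x, which takes a plain 64-bit integer; the helper
 * name is hypothetical.
 */
#include <emmintrin.h>
#include <stdint.h>
static __m128d replicate_word(uint64_t word) {
    return _mm_castsi128_pd(_mm_set1_epi64x((long long)word));
}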
/* _mm_stream_pd version, used for transposing from a stripe buffer
   to columns. */
static void fftOPSubTrans(
    const FFTComplex *_src,
    FFTComplex *_dst,
    size_t srcRowSize,   // src, in FFTComplex, a.k.a. src numCols
    size_t dstRowSize)   // dst, in FFTComplex, a.k.a. dst numCols
{
    /* rowDex, colDex refer to _src */
    for(size_t colDex = 0; colDex < FFT_COMPLEX_PER_SUBMATRIX; colDex++) {
        const FFTVector *invp = (const FFTVector *)(_src + colDex);
        FFTVector *outvp = (FFTVector *)(_dst + colDex*dstRowSize);
        for(size_t rowDex = 0; rowDex < FFT_COMPLEX_PER_SUBMATRIX; rowDex++) {
            register FFTVector tmp = *invp;
            _mm_stream_pd((double*)outvp, tmp);
            outvp += 1;
            invp += srcRowSize;
        }
    }
}
/*
 * Intel double precision, _mm_stream_pd version, used for transposing
 * from a stripe buffer to columns.
 */
static void fftOPSubTrans(
    const FFTComplex *_src,
    FFTComplex *_dst,
    size_t srcRowSize,   // src, in FFTComplex, a.k.a. src numCols
    size_t dstRowSize)   // dst, in FFTComplex, a.k.a. dst numCols
{
    double *src = (double *)_src;
    double *dst = (double *)_dst;

    dumpSub("fftOPSubTrans start", _src, srcRowSize);

    /*
     * row and col refer to coordinates in src;
     * row size of dst is dstRowSize
     */
    unsigned curcol;
    for(curcol = 0; curcol < FFT_COMPLEX_PER_SUBMATRIX; curcol += 2) {
        __m128d vin1, vin2, vin3, vin4, vin5, vin6, vin7, vin8;
        __m128d vOut_row1_1, vOut_row1_2, vOut_row1_3, vOut_row1_4;
        __m128d vOut_row2_1, vOut_row2_2, vOut_row2_3, vOut_row2_4;

        const double *pIn = src + curcol;
        double *pOut = dst + curcol*dstRowSize;

        // load two columns from src at curcol
        vin1 = _mm_load_pd(pIn + 0*srcRowSize);
        vin2 = _mm_load_pd(pIn + 1*srcRowSize);
        vin3 = _mm_load_pd(pIn + 2*srcRowSize);
        vin4 = _mm_load_pd(pIn + 3*srcRowSize);
        vin5 = _mm_load_pd(pIn + 4*srcRowSize);
        vin6 = _mm_load_pd(pIn + 5*srcRowSize);
        vin7 = _mm_load_pd(pIn + 6*srcRowSize);
        vin8 = _mm_load_pd(pIn + 7*srcRowSize);

        // transpose for first row out
        vOut_row1_1 = _mm_unpacklo_pd(vin1, vin2);
        vOut_row1_2 = _mm_unpacklo_pd(vin3, vin4);
        vOut_row1_3 = _mm_unpacklo_pd(vin5, vin6);
        vOut_row1_4 = _mm_unpacklo_pd(vin7, vin8);
        _mm_stream_pd(pOut + (0*FFT_COMPLEX_PER_VECTOR), vOut_row1_1);
        _mm_stream_pd(pOut + (1*FFT_COMPLEX_PER_VECTOR), vOut_row1_2);
        _mm_stream_pd(pOut + (2*FFT_COMPLEX_PER_VECTOR), vOut_row1_3);
        _mm_stream_pd(pOut + (3*FFT_COMPLEX_PER_VECTOR), vOut_row1_4);

        // transpose for second row out
        pOut += dstRowSize;
        vOut_row2_1 = _mm_unpackhi_pd(vin1, vin2);
        vOut_row2_2 = _mm_unpackhi_pd(vin3, vin4);
        vOut_row2_3 = _mm_unpackhi_pd(vin5, vin6);
        vOut_row2_4 = _mm_unpackhi_pd(vin7, vin8);
        _mm_stream_pd(pOut + (0*FFT_COMPLEX_PER_VECTOR), vOut_row2_1);
        _mm_stream_pd(pOut + (1*FFT_COMPLEX_PER_VECTOR), vOut_row2_2);
        _mm_stream_pd(pOut + (2*FFT_COMPLEX_PER_VECTOR), vOut_row2_3);
        _mm_stream_pd(pOut + (3*FFT_COMPLEX_PER_VECTOR), vOut_row2_4);
    }
    dumpSub("fftOPSubTrans end", _dst, dstRowSize);
}
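/*
 * Hedged illustration (mine, not from the source): the unpacklo/unpackhi
 * pair used above is a 2x2 transpose of two __m128d rows,
 *   rows (a0 a1), (b0 b1)  ->  columns (a0 b0), (a1 b1),
 * which the 8x2 load block applies four times per pair of output rows.
 */
static void transpose2x2(__m128d r0, __m128d r1, __m128d* c0, __m128d* c1) {
    *c0 = _mm_unpacklo_pd(r0, r1);  /* low lanes:  a0 b0 */
    *c1 = _mm_unpackhi_pd(r0, r1);  /* high lanes: a1 b1 */
}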
int main(int argc, char *argv[])
{
  const char   *label[4] = {"Copy", "Scale", "Add", "Triad"};
  const double bytes[4] = {2 * sizeof(double) * N,
                           2 * sizeof(double) * N,
                           3 * sizeof(double) * N,
                           3 * sizeof(double) * N};
  double       rmstime[4] = {0}, maxtime[4] = {0},
               mintime[4] = {FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX};
  int          quantum;
  int          BytesPerWord, j, k, size;
  PetscInt     node = -1;
  double       scalar, t, times[4][NTIMES];
#if !STATIC_ALLOC
  double *PETSC_RESTRICT a, *PETSC_RESTRICT b, *PETSC_RESTRICT c;
#endif

  PetscInitialize(&argc, &argv, 0, help);
  MPI_Comm_size(PETSC_COMM_WORLD, &size);
  PetscOptionsGetInt(NULL, "-node", &node, NULL);

  /* --- SETUP --- determine precision and check timing --- */
  PetscPrintf(PETSC_COMM_WORLD, HLINE);
  BytesPerWord = sizeof(double);
  PetscPrintf(PETSC_COMM_WORLD, "This system uses %d bytes per DOUBLE PRECISION word.\n", BytesPerWord);
  PetscPrintf(PETSC_COMM_WORLD, HLINE);
  PetscPrintf(PETSC_COMM_WORLD, "Array size = %d, Offset = %d\n", N, OFFSET);
  PetscPrintf(PETSC_COMM_WORLD, "Total memory required = %.1f MB per process.\n", (3 * N * BytesPerWord) / 1048576.0);
  PetscPrintf(PETSC_COMM_WORLD, "Each test is run %d times, but only\n", NTIMES);
  PetscPrintf(PETSC_COMM_WORLD, "the *best* time for each is used.\n");

  /* Get initial value for system clock. */
#if !STATIC_ALLOC
  if (node == -1) {
    posix_memalign((void**)&a, 64, N*sizeof(double));
    posix_memalign((void**)&b, 64, N*sizeof(double));
    posix_memalign((void**)&c, 64, N*sizeof(double));
  } else if (node == -2) {
    a = malloc(N*sizeof(double));
    b = malloc(N*sizeof(double));
    c = malloc(N*sizeof(double));
#if defined(HAVE_NUMA)
  } else {
    a = numa_alloc_onnode(N*sizeof(double), node);
    b = numa_alloc_onnode(N*sizeof(double), node);
    c = numa_alloc_onnode(N*sizeof(double), node);
#endif
  }
#endif
#if FAULT_TOGETHER
  for (j=0; j<N; j++) {
    a[j] = 1.0;
    b[j] = 2.0;
    c[j] = 0.0;
  }
#else
  for (j=0; j<N; j++) a[j] = 1.0;
  for (j=0; j<N; j++) b[j] = 2.0;
  for (j=0; j<N; j++) c[j] = 0.0;
#endif
  PetscPrintf(PETSC_COMM_WORLD, HLINE);
  if ((quantum = checktick()) >= 1)
    PetscPrintf(PETSC_COMM_WORLD, "Your clock granularity/precision appears to be %d microseconds.\n", quantum);
  else
    PetscPrintf(PETSC_COMM_WORLD, "Your clock granularity appears to be less than one microsecond.\n");

  t = Second();
  for (j = 0; j < N; j++) a[j] = 2.0E0 * a[j];
  t = 1.0E6 * (Second() - t);

  PetscPrintf(PETSC_COMM_WORLD, "Each test below will take on the order of %d microseconds.\n", (int)t);
  PetscPrintf(PETSC_COMM_WORLD, "   (= %d clock ticks)\n", (int)(t/quantum));
  PetscPrintf(PETSC_COMM_WORLD, "Increase the size of the arrays if this shows that\n");
  PetscPrintf(PETSC_COMM_WORLD, "you are not getting at least 20 clock ticks per test.\n");
  PetscPrintf(PETSC_COMM_WORLD, HLINE);
  PetscPrintf(PETSC_COMM_WORLD, "WARNING -- The above is only a rough guideline.\n");
  PetscPrintf(PETSC_COMM_WORLD, "For best results, please be sure you know the\n");
  PetscPrintf(PETSC_COMM_WORLD, "precision of your system timer.\n");
  PetscPrintf(PETSC_COMM_WORLD, HLINE);

  /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
  scalar = 3.0;
  for (k=0; k<NTIMES; k++) {
    MPI_Barrier(PETSC_COMM_WORLD);

    /* ### COPY: c <- a ### */
    times[0][k] = Second();
    MPI_Barrier(PETSC_COMM_WORLD);
#if USE_MEMCPY
    memcpy(c, a, N*sizeof(double));
#elif SSE2
    for (j=0; j<N; j+=8) {
      _mm_stream_pd(c+j+0, _mm_load_pd(a+j+0));
      _mm_stream_pd(c+j+2, _mm_load_pd(a+j+2));
      _mm_stream_pd(c+j+4, _mm_load_pd(a+j+4));
      _mm_stream_pd(c+j+6, _mm_load_pd(a+j+6));
# if PREFETCH_NTA
      _mm_prefetch(a+j+64, _MM_HINT_NTA);
# endif
    }
#else
    for (j=0; j<N; j++) c[j] = a[j];
#endif
    MPI_Barrier(PETSC_COMM_WORLD);
    times[0][k] = Second() - times[0][k];

    /* ### SCALE: b <- scalar * c ### */
    times[1][k] = Second();
    MPI_Barrier(PETSC_COMM_WORLD);
#if SSE2
    {
      __m128d scalar2 = _mm_set1_pd(scalar);
      for (j=0; j<N; j+=8) {
        _mm_stream_pd(b+j+0, _mm_mul_pd(scalar2, _mm_load_pd(c+j+0)));
        _mm_stream_pd(b+j+2, _mm_mul_pd(scalar2, _mm_load_pd(c+j+2)));
        _mm_stream_pd(b+j+4, _mm_mul_pd(scalar2, _mm_load_pd(c+j+4)));
        _mm_stream_pd(b+j+6, _mm_mul_pd(scalar2, _mm_load_pd(c+j+6)));
# if PREFETCH_NTA
        _mm_prefetch(c+j+64, _MM_HINT_NTA);
# endif
      }
    }
#else
    for (j=0; j<N; j++) b[j] = scalar*c[j];
#endif
    MPI_Barrier(PETSC_COMM_WORLD);
    times[1][k] = Second() - times[1][k];

    /* ### ADD: c <- a + b ### */
    times[2][k] = Second();
    MPI_Barrier(PETSC_COMM_WORLD);
#if SSE2
    {
      for (j=0; j<N; j+=8) {
        _mm_stream_pd(c+j+0, _mm_add_pd(_mm_load_pd(a+j+0), _mm_load_pd(b+j+0)));
        _mm_stream_pd(c+j+2, _mm_add_pd(_mm_load_pd(a+j+2), _mm_load_pd(b+j+2)));
        _mm_stream_pd(c+j+4, _mm_add_pd(_mm_load_pd(a+j+4), _mm_load_pd(b+j+4)));
        _mm_stream_pd(c+j+6, _mm_add_pd(_mm_load_pd(a+j+6), _mm_load_pd(b+j+6)));
# if PREFETCH_NTA
        _mm_prefetch(a+j+64, _MM_HINT_NTA);
        _mm_prefetch(b+j+64, _MM_HINT_NTA);
# endif
      }
    }
#else
    for (j=0; j<N; j++) c[j] = a[j]+b[j];
#endif
    MPI_Barrier(PETSC_COMM_WORLD);
    times[2][k] = Second() - times[2][k];

    /* ### TRIAD: a <- b + scalar * c ### */
    times[3][k] = Second();
    MPI_Barrier(PETSC_COMM_WORLD);
#if SSE2
    {
      __m128d scalar2 = _mm_set1_pd(scalar);
      for (j=0; j<N; j+=8) {
        _mm_stream_pd(a+j+0, _mm_add_pd(_mm_load_pd(b+j+0), _mm_mul_pd(scalar2, _mm_load_pd(c+j+0))));
        _mm_stream_pd(a+j+2, _mm_add_pd(_mm_load_pd(b+j+2), _mm_mul_pd(scalar2, _mm_load_pd(c+j+2))));
        _mm_stream_pd(a+j+4, _mm_add_pd(_mm_load_pd(b+j+4), _mm_mul_pd(scalar2, _mm_load_pd(c+j+4))));
        _mm_stream_pd(a+j+6, _mm_add_pd(_mm_load_pd(b+j+6), _mm_mul_pd(scalar2, _mm_load_pd(c+j+6))));
# if PREFETCH_NTA
        _mm_prefetch(b+j+64, _MM_HINT_NTA);
        _mm_prefetch(c+j+64, _MM_HINT_NTA);
# endif
      }
    }
#else
    for (j=0; j<N; j++) a[j] = b[j]+scalar*c[j];
#endif
    MPI_Barrier(PETSC_COMM_WORLD);
    times[3][k] = Second() - times[3][k];
  }

  /* --- SUMMARY --- */
  for (k=0; k<NTIMES; k++)
    for (j=0; j<4; j++) {
      rmstime[j] = rmstime[j] + (times[j][k] * times[j][k]);
      mintime[j] = MIN(mintime[j], times[j][k]);
      maxtime[j] = MAX(maxtime[j], times[j][k]);
    }

  PetscPrintf(PETSC_COMM_WORLD, "%8s: %11s %11s %11s %11s %11s\n", "Function", "Rate (MB/s)", "Total (MB/s)", "RMS time", "Min time", "Max time");
  for (j=0; j<4; j++) {
    rmstime[j] = sqrt(rmstime[j]/(double)NTIMES);
    PetscPrintf(PETSC_COMM_WORLD, "%8s: %11.4f %11.4f %11.4f %11.4f %11.4f\n", label[j], 1.0e-06*bytes[j]/mintime[j], size*1.0e-06*bytes[j]/mintime[j], rmstime[j], mintime[j], maxtime[j]);
  }
  PetscFinalize();
  return 0;
}
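/*
 * Hedged sketch, not part of the benchmark above: the standard STREAM
 * benchmark also verifies its results by replaying the four kernels on
 * scalar values.  Something along these lines could run after the timing
 * loop; 'scalar', N and the initial values (a=1, b=2, c=0, plus the one-off
 * a *= 2 warm-up) match main() above.  Exact equality holds only while the
 * replayed values stay exactly representable; a tolerance may be safer.
 */
static int stream_verify(const double* a, const double* b, const double* c,
                         int n, double scalar, int ntimes) {
    double aj = 1.0, bj = 2.0, cj = 0.0;
    aj = 2.0 * aj;                       /* the one-off warm-up scaling */
    for(int k = 0; k < ntimes; k++) {    /* replay copy/scale/add/triad */
        cj = aj;
        bj = scalar * cj;
        cj = aj + bj;
        aj = bj + scalar * cj;
    }
    for(int j = 0; j < n; j++)
        if(a[j] != aj || b[j] != bj || c[j] != cj) return 0;
    return 1;
}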
template <class T>
void blurRemoveMinMax_(const Mat& src, Mat& dest, const int r)
{
    const Size ksize = Size(2 * r + 1, 2 * r + 1);
    if (src.data != dest.data) src.copyTo(dest);

    Mat xv;
    Mat nv;
    Mat element = Mat::ones(2 * r + 1, 2 * r + 1, CV_8U);
    dilate(src, xv, element);
    erode(src, nv, element);

    Mat mind;
    Mat maxd;
    Mat mask;
    absdiff(src, nv, mind); //can move to loop
    absdiff(src, xv, maxd); //
    min(mind, maxd, mask);  //

    T* n = nv.ptr<T>(0);
    T* x = xv.ptr<T>(0);
    T* d = dest.ptr<T>(0);
    T* nd = mind.ptr<T>(0);
    T* mk = mask.ptr<T>(0);

    int remsize = src.size().area();

#if CV_SSE4_1
    if (src.depth() == CV_8U)
    {
        const int ssesize = src.size().area() / 16;
        remsize = src.size().area() - ssesize * 16;
        for (int i = 0; i < ssesize; i++)
        {
            __m128i mmk = _mm_load_si128((__m128i*)mk);
            __m128i mnd = _mm_load_si128((__m128i*)nd);
            __m128i mmn = _mm_load_si128((__m128i*)n);
            __m128i mmx = _mm_load_si128((__m128i*)x);
            __m128i msk = _mm_cmpeq_epi8(mnd, mmk);
            _mm_stream_si128((__m128i*)d, _mm_blendv_epi8(mmx, mmn, msk));
            nd += 16; mk += 16; d += 16; n += 16; x += 16;
        }
    }
    else if (src.depth() == CV_16S || src.depth() == CV_16U)
    {
        const int ssesize = src.size().area() / 8;
        remsize = src.size().area() - ssesize * 8;
        for (int i = 0; i < ssesize; i++)
        {
            __m128i mmk = _mm_load_si128((__m128i*)mk);
            __m128i mnd = _mm_load_si128((__m128i*)nd);
            __m128i mmn = _mm_load_si128((__m128i*)n);
            __m128i mmx = _mm_load_si128((__m128i*)x);
            __m128i msk = _mm_cmpeq_epi16(mnd, mmk);
            _mm_stream_si128((__m128i*)d, _mm_blendv_epi8(mmx, mmn, msk));
            nd += 8; mk += 8; d += 8; n += 8; x += 8;
        }
    }
    else if (src.depth() == CV_32F)
    {
        const int ssesize = src.size().area() / 4;
        remsize = src.size().area() - ssesize * 4;
        for (int i = 0; i < ssesize; i++)
        {
            __m128 mmk = _mm_load_ps((float*)mk);
            __m128 mnd = _mm_load_ps((float*)nd);
            __m128 mmn = _mm_load_ps((float*)n);
            __m128 mmx = _mm_load_ps((float*)x);
            __m128 msk = _mm_cmpeq_ps(mnd, mmk);
            _mm_stream_ps((float*)d, _mm_blendv_ps(mmx, mmn, msk));
            nd += 4; mk += 4; d += 4; n += 4; x += 4;
        }
    }
    else if (src.depth() == CV_64F)
    {
        const int ssesize = src.size().area() / 2;
        remsize = src.size().area() - ssesize * 2;
        for (int i = 0; i < ssesize; i++)
        {
            __m128d mmk = _mm_load_pd((double*)mk);
            __m128d mnd = _mm_load_pd((double*)nd);
            __m128d mmn = _mm_load_pd((double*)n);
            __m128d mmx = _mm_load_pd((double*)x);
            __m128d msk = _mm_cmpeq_pd(mnd, mmk);
            _mm_stream_pd((double*)d, _mm_blendv_pd(mmx, mmn, msk));
            nd += 2; mk += 2; d += 2; n += 2; x += 2;
        }
    }
#endif
    /* scalar tail: where the distance to the local min equals the smaller
       distance, take the eroded (min) value, else the dilated (max) value */
    for (int i = 0; i < remsize; i++)
    {
        if (nd[i] == mk[i]) d[i] = n[i];
        else                d[i] = x[i];
    }
}
void test (double *p, __m128d s)
{
  /* _mm_stream_pd returns void, so the original
     "return _mm_stream_pd(p, s);" (with an implicit int return type)
     is invalid; the call is simply issued instead */
  _mm_stream_pd (p, s);
}
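/*
 * Hedged standalone harness (mine, for illustration only): streams one
 * vector to a 16-byte aligned buffer, fences, and prints it.  This
 * demonstrates the alignment and _mm_sfence conventions the snippets above
 * rely on; non-temporal stores must be published with a fence before other
 * agents read the destination.
 */
#include <emmintrin.h>
#include <stdio.h>
int main(void)
{
  static double out[2] __attribute__((aligned(16)));
  _mm_stream_pd(out, _mm_set_pd(2.0, 1.0)); /* out = {1.0, 2.0} */
  _mm_sfence();                             /* publish the NT store */
  printf("%f %f\n", out[0], out[1]);
  return 0;
}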