Beispiel #1
0
        /*
         * Scales a vector in place: y[i] *= a for every i in [0, N).
         *
         * N  number of doubles in y
         * a  scale factor
         * y  vector to scale; the SSE2 path uses aligned loads and
         *    non-temporal stores, so y must be 16-byte aligned whenever
         *    that path is taken
         *
         * Adds N to the external flops_counter.
         *
         * The vector loop only handles whole groups of four doubles; any
         * remaining 1-3 elements are finished by the scalar loop, so the
         * function never touches memory beyond y[N-1].
         */
        void dscal(unsigned int N, double a, double* y) {
            flops_counter += N ;
            unsigned int i = 0 ;
#ifdef GX_SSE
            if(SSE2_supported) {
                /* Scale factor replicated into both lanes. */
                const __m128d AA = _mm_set1_pd(a) ;
                /* Largest multiple of four that does not exceed N. */
                const unsigned int N4 = N & ~3u ;
                while(i < N4) {
                    /* Hint only: request data 128 doubles ahead of the cursor. */
                    _mm_prefetch((const char*)(&y[i] + 128), _MM_HINT_NTA) ;
                    const __m128d Y1 = _mm_mul_pd(_mm_load_pd(&y[i]), AA) ;
                    const __m128d Y2 = _mm_mul_pd(_mm_load_pd(&y[i + 2]), AA) ;
                    _mm_stream_pd(&y[i], Y1) ;
                    _mm_stream_pd(&y[i + 2], Y2) ;
                    i += 4 ;
                }
                /* Order the non-temporal stores before anything that follows. */
                _mm_sfence() ;
            }
#endif
            /* Scalar path: the whole vector without SSE2, otherwise the tail. */
            for( ; i<N; i++) {
                y[i] *= a ;
            }
        }
Beispiel #2
0
        /*
         * Element-wise product: y[i] = a[i] * b[i] for every i in [0, N).
         *
         * N     number of doubles in each vector
         * a, b  input vectors
         * y     output vector
         *
         * The SSE2 path uses aligned loads on a and b and non-temporal
         * stores on y, so all three must be 16-byte aligned whenever that
         * path is taken.
         *
         * Adds N to the external flops_counter.
         *
         * The vector loop only handles whole groups of four doubles; any
         * remaining 1-3 elements are finished by the scalar loop, so the
         * function never touches memory beyond index N-1.
         */
        void dmul(unsigned int N, const double* a, const double* b, double* y) {
            flops_counter += N ;
            unsigned int i = 0 ;
#ifdef GX_SSE
            if(SSE2_supported) {
                /* Largest multiple of four that does not exceed N. */
                const unsigned int N4 = N & ~3u ;
                while(i < N4) {
                    /* Hints only: request input data 256 doubles ahead. */
                    _mm_prefetch((const char*)(&a[i] + 256), _MM_HINT_NTA) ;
                    _mm_prefetch((const char*)(&b[i] + 256), _MM_HINT_NTA) ;
                    const __m128d Y1 = _mm_mul_pd(_mm_load_pd(&a[i]),
                                                  _mm_load_pd(&b[i])) ;
                    const __m128d Y2 = _mm_mul_pd(_mm_load_pd(&a[i + 2]),
                                                  _mm_load_pd(&b[i + 2])) ;
                    _mm_stream_pd(&y[i], Y1) ;
                    _mm_stream_pd(&y[i + 2], Y2) ;
                    i += 4 ;
                }
                /* Order the non-temporal stores before anything that follows. */
                _mm_sfence() ;
            }
#endif
            /* Scalar path: the whole vector without SSE2, otherwise the tail. */
            for( ; i<N; i++) {
                y[i] = a[i] * b[i] ;
            }
        }
Beispiel #3
0
        /*
         * Sets y[0 .. N-1] to 0.0.
         *
         * N  number of doubles to clear
         * y  destination; the SSE2 path uses non-temporal stores, so y must
         *    be 16-byte aligned whenever that path is taken
         *
         * The vector loop only clears whole groups of four doubles; the
         * remaining elements (or the whole vector when SSE2 is unavailable)
         * are cleared with memset, so nothing beyond y[N-1] is written.
         */
        void dzero(unsigned int N, double* y) {
            unsigned int i = 0 ;
#ifdef GX_SSE
            if(SSE2_supported) {
                const __m128d Z = _mm_setzero_pd() ;
                /* Largest multiple of four that does not exceed N. */
                const unsigned int N4 = N & ~3u ;
                for( ; i<N4; i+=4) {
                    _mm_stream_pd(&y[i], Z) ;
                    _mm_stream_pd(&y[i + 2], Z) ;
                }
                /* Order the non-temporal stores before anything that follows. */
                _mm_sfence() ;
            }
#endif
            /* Skipped entirely when nothing is left, so y is not handed to
               memset for an empty range. */
            if(i < N) {
                memset(y + i, 0, (N - i)*sizeof(double)) ;
            }
        }
/*
 * Writes factor * transpose(a) into b, working on tilesize x tilesize tiles.
 *
 * a       source, indexed as a[i*N2 + j] with i in [0, N1), j in [0, N2)
 * b       destination, indexed as b[j*N1 + i]
 * N1, N2  dimensions as used by the indexing above
 * factor  scale applied to every element
 *
 * Relies on two names defined outside this function: `tilesize` (tile edge
 * length) and `buf` (scratch area that needs room for tilesize*tilesize
 * doubles). The loops step by 2 inside a tile and by tilesize across tiles
 * without any bounds clamping, so N1 and N2 have to be multiples of tilesize
 * and tilesize has to be even. Aligned loads/stores and non-temporal stores
 * are used throughout, so a, b, buf and every row start must be 16-byte
 * aligned.
 *
 * No store fence is issued after the non-temporal stores; callers that hand
 * b to another thread need their own _mm_sfence().
 */
void transpose_aligned(double *a, double *b, int N1, int N2, double factor) {

    int i, j, k, it, jt;
    double *pA, *pB;
    __m128d x, y, z, w;

    /* Scale factor replicated into both lanes. */
    const __m128d fac_vector = _mm_set1_pd(factor);

    for (it = 0; it < N1; it += tilesize) {
        for (jt = 0; jt < N2; jt += tilesize) {

            /* Phase 1: transpose the tile 2x2 at a time into buf, scaling
               on the way. Row (j-jt) of buf receives column j of the tile. */
            for (j = jt; j < jt+tilesize; j += 2) {
                for (i = it; i < it+tilesize; i += 2) {
                    pA = a + i*N2 + j;
                    x = _mm_mul_pd(_mm_load_pd(pA), fac_vector);
                    y = _mm_mul_pd(_mm_load_pd(pA + N2), fac_vector);
                    z = _mm_shuffle_pd(x, y, 0);   /* low lanes:  a[i][j],   a[i+1][j]   */
                    w = _mm_shuffle_pd(x, y, 3);   /* high lanes: a[i][j+1], a[i+1][j+1] */
                    k = (j-jt)*tilesize + (i-it);
                    _mm_store_pd(buf + k, z);
                    _mm_store_pd(buf + k + tilesize, w);
                }
            }

            /* Phase 2: stream each buf row out to the matching row of b.
               The row is copied in full (tilesize doubles) rather than a
               fixed four vectors, so any even tilesize works. */
            for (j = jt; j < jt+tilesize; j++) {
                pB = b + j*N1 + it;
                k = (j-jt)*tilesize;
                for (i = 0; i < tilesize; i += 2) {
                    _mm_stream_pd(pB + i, _mm_load_pd(buf + k + i));
                }
            }
        }
    }
}
Beispiel #5
0
// Compiler test case for the _mm_stream_pd intrinsic: passes the pointer A and
// the vector B straight through to the intrinsic. The comment lines inside the
// body are expected-output patterns (a non-temporal, 16-byte-aligned vector
// store in the IR; a movntpd instruction in the assembly) and are consumed by
// the test tooling, so they must stay exactly as written.
// NOTE(review): the pattern syntax looks like LLVM FileCheck - confirm.
void test_mm_stream_pd(double *A, __m128d B) {
  // DAG-LABEL: test_mm_stream_pd
  // DAG: store <2 x double> %{{.*}}, <2 x double>* %{{.*}}, align 16, !nontemporal
  //
  // ASM-LABEL: test_mm_stream_pd
  // ASM: movntpd
  _mm_stream_pd(A, B);
}
Beispiel #6
0
        /*
         * Copies x[0 .. N-1] into y[0 .. N-1].
         *
         * N  number of doubles to copy
         * x  source vector
         * y  destination vector (must not overlap x)
         *
         * The SSE2 path uses aligned loads on x and non-temporal stores on
         * y, so both must be 16-byte aligned whenever that path is taken.
         *
         * The vector loop only copies whole groups of four doubles; the
         * remaining elements (or the whole vector when SSE2 is unavailable)
         * are copied with memcpy, so nothing beyond index N-1 is touched.
         */
        void dcopy(unsigned int N, const double* x, double* y) {
            unsigned int i = 0 ;
#ifdef GX_SSE
            if(SSE2_supported) {
                /* Largest multiple of four that does not exceed N. */
                const unsigned int N4 = N & ~3u ;
                while(i < N4) {
                    /* Prefetch the source: the destination is write-only
                       and is written with cache-bypassing stores. */
                    _mm_prefetch((const char*)(&x[i] + 128), _MM_HINT_NTA) ;
                    const __m128d X1 = _mm_load_pd(&x[i]) ;
                    const __m128d X2 = _mm_load_pd(&x[i + 2]) ;
                    _mm_stream_pd(&y[i], X1) ;
                    _mm_stream_pd(&y[i + 2], X2) ;
                    i += 4 ;
                }
                /* Order the non-temporal stores before anything that follows. */
                _mm_sfence() ;
            }
#endif
            /* Skipped entirely when nothing is left to copy. */
            if(i < N) {
                memcpy(y + i, x + i, (N - i) * sizeof(double)) ;
            }
        }
Beispiel #7
0
/*
 * Zero-fills n bytes starting at ptr using non-temporal 16-byte stores.
 *
 * ptr  start of the region; asserted to be 16-byte aligned
 * n    size in bytes; asserted to be a multiple of 128
 *
 * The region is cleared in 128-byte chunks (eight 16-byte stores each) and a
 * store fence is issued once at the end.
 */
inline __always_inline static void sse2_memzero128aligned(void *ptr, int n)
{
    const __m128d zero = _mm_setzero_pd();
    char *cursor;
    char *limit;
    int off;

    assert(((stm_word_t)ptr)%16==0);
    assert(n%128==0);

    cursor = (char*)ptr;
    limit = cursor + n;
    while (cursor < limit) {
        /* One 128-byte chunk: offsets 0, 16, ..., 112. */
        for (off = 0; off < 128; off += 16) {
            _mm_stream_pd((double*)(cursor + off), zero);
        }
        cursor += 128;
    }
    _mm_sfence();
}
Beispiel #8
0
// Computes the dot product of A and B over SIZE elements and writes it to C.
//
//   A, B   input vectors; read with aligned 16-byte loads, so both must be
//          16-byte aligned
//   C      receives the result (0 when SIZE <= 0)
//   SIZE   number of elements
//
// Elements are accumulated two at a time in a vector register. An odd
// trailing element is added separately so nothing past index SIZE-1 is read.
// The two lane sums are spilled with an unaligned ordinary store: the local
// scratch array has no 16-byte alignment guarantee, and its contents are read
// back immediately, so a non-temporal store is neither safe nor useful here.
void dotmul_intrinsic(double *A, double *B, double &C, int SIZE)
{
  __m128d partial_sum = _mm_setzero_pd();

  // Largest even count not exceeding SIZE (0 for non-positive SIZE).
  const int paired = (SIZE > 0) ? (SIZE & ~1) : 0;

  int k = 0;
  for(; k < paired; k += 2)
  {
    // load 128 bits of data (2 x double) from each input
    const __m128d a = _mm_load_pd(&A[k]);
    const __m128d b = _mm_load_pd(&B[k]);

    partial_sum = _mm_add_pd(partial_sum, _mm_mul_pd(a, b));
  }

  double sarr[2];
  _mm_storeu_pd(sarr, partial_sum);
  C = sarr[0] + sarr[1];

  // Odd SIZE: fold in the last element with scalar arithmetic.
  if(k < SIZE)
  {
    C += A[k] * B[k];
  }
}
Beispiel #9
0
/*
 * Fills n bytes starting at ptr with repeated copies of `word`, using
 * non-temporal 16-byte stores.
 *
 * ptr   start of the region; asserted to be 16-byte aligned
 * n     size in bytes; asserted to be a multiple of 128
 * word  value replicated across each 16-byte store (two copies when
 *       __LP64__ is defined, four copies otherwise)
 *
 * The region is written in 128-byte chunks (eight 16-byte stores each) and a
 * store fence is issued once at the end.
 */
inline static void sse2_memset128aligned(void *ptr, int n, stm_word_t word)
{
#ifdef __LP64__
    const __m128d pattern = (__m128d)_mm_set_epi64((__m64)word, (__m64)word);
#else
    const __m128d pattern = (__m128d)_mm_set_epi32(word, word, word, word);
#endif
    char *cursor;
    char *limit;
    int off;

    assert(((stm_word_t)ptr)%16==0);
    assert(n%128==0);

    cursor = (char*)ptr;
    limit = cursor + n;
    while (cursor < limit) {
        /* One 128-byte chunk: offsets 0, 16, ..., 112. */
        for (off = 0; off < 128; off += 16) {
            _mm_stream_pd((double*)(cursor + off), pattern);
        }
        cursor += 128;
    }
    _mm_sfence();
}
/*
 * _mm_stream_pd version of the submatrix transpose, used for transposing
 * from a stripe buffer to columns.
 *
 * For each of the FFT_COMPLEX_PER_SUBMATRIX starting positions in _src, walks
 * down the source with a stride of srcRowSize and writes the values read to
 * consecutive slots of one destination row, using non-temporal stores.
 *
 * NOTE(review): the stride and the output step are applied to FFTVector
 * pointers, while the parameter comments give the row sizes in FFTComplex
 * units - confirm the two element types have the same size here.
 */
static void fftOPSubTrans(
	  const FFTComplex	*_src,
	  FFTComplex		*_dst,
	  size_t			srcRowSize,		// src, in FFTComplex, a.k.a. src numCols
	  size_t			dstRowSize)		// dst, in FFTComplex, a.k.a. dst numCols
{
	/* col and row are coordinates in _src */
	for(size_t col=0; col<FFT_COMPLEX_PER_SUBMATRIX; col++) {

		const FFTVector *colIn = (const FFTVector *)(_src + col);
		FFTVector *rowOut = (FFTVector *)(_dst + col*dstRowSize);

		for(size_t row=0; row<FFT_COMPLEX_PER_SUBMATRIX; row++) {
			/* source advances one row per step, destination one slot */
			_mm_stream_pd((double*)(rowOut + row), colIn[row*srcRowSize]);
		}
	}
}
/*
 * Intel single precision, _mm_stream_pd version, used for transposing
 * from a stripe buffer to columns.
 *
 * Source and destination are addressed as arrays of double. Each pass over
 * the outer loop takes two adjacent source columns, loads them from eight
 * consecutive source rows, and writes them out as two destination rows with
 * non-temporal stores. dumpSub() is called on the source before and on the
 * destination after the transpose.
 */
static void fftOPSubTrans(
  const FFTComplex	*_src,
  FFTComplex		*_dst,
  size_t			srcRowSize,		// src, in FFTComplex, a.k.a. src numCols
  size_t			dstRowSize)		// dst, in FFTComplex, a.k.a. dst numCols
{
	const double *src = (const double *)_src;
	double *dst = (double *)_dst;
	unsigned curcol;

	dumpSub("fftOPSubTrans start", _src, srcRowSize);

	/*
	 * curcol is a column coordinate in src;
	 * row size of dst is dstRowSize
	 */
	for(curcol=0; curcol<FFT_COMPLEX_PER_SUBMATRIX; curcol+=2) {
		__m128d vin[8];
		const double *pIn = src + curcol;
		double *pOut = dst + curcol*dstRowSize;
		size_t n;

		/* load two columns from src at curcol, one vector per source row */
		for(n=0; n<8; n++) {
			vin[n] = _mm_load_pd(pIn + n*srcRowSize);
		}

		/* first row out: low lanes of each pair of source rows */
		for(n=0; n<4; n++) {
			_mm_stream_pd(pOut+(n*FFT_COMPLEX_PER_VECTOR),
				_mm_unpacklo_pd(vin[2*n], vin[2*n + 1]));
		}

		/* second row out: high lanes of each pair of source rows */
		pOut += dstRowSize;
		for(n=0; n<4; n++) {
			_mm_stream_pd(pOut+(n*FFT_COMPLEX_PER_VECTOR),
				_mm_unpackhi_pd(vin[2*n], vin[2*n + 1]));
		}
	}

	dumpSub("fftOPSubTrans end", _dst, dstRowSize);
}
Beispiel #12
0
int main(int argc,char *argv[])
{
  const char   *label[4] = {"Copy", "Scale","Add", "Triad"};
  const double bytes[4]  = {2 * sizeof(double) * N,
                            2 * sizeof(double) * N,
                            3 * sizeof(double) * N,
                            3 * sizeof(double) * N};
  double       rmstime[4] = {0},maxtime[4] = {0},mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
  int          quantum;
  int          BytesPerWord,j,k,size;
  PetscInt     node = -1;
  double       scalar, t, times[4][NTIMES];
#if !STATIC_ALLOC
  double       *PETSC_RESTRICT a,*PETSC_RESTRICT b,*PETSC_RESTRICT c;
#endif

  PetscInitialize(&argc,&argv,0,help);
  MPI_Comm_size(PETSC_COMM_WORLD,&size);
  PetscOptionsGetInt(NULL,"-node",&node,NULL);
  /* --- SETUP --- determine precision and check timing --- */

  PetscPrintf(PETSC_COMM_WORLD,HLINE);
  BytesPerWord = sizeof(double);
  PetscPrintf(PETSC_COMM_WORLD,"This system uses %d bytes per DOUBLE PRECISION word.\n",
              BytesPerWord);

  PetscPrintf(PETSC_COMM_WORLD,HLINE);
  PetscPrintf(PETSC_COMM_WORLD,"Array size = %d, Offset = %d\n", N, OFFSET);
  PetscPrintf(PETSC_COMM_WORLD,"Total memory required = %.1f MB per process.\n",
              (3 * N * BytesPerWord) / 1048576.0);
  PetscPrintf(PETSC_COMM_WORLD,"Each test is run %d times, but only\n", NTIMES);
  PetscPrintf(PETSC_COMM_WORLD,"the *best* time for each is used.\n");

  /* Get initial value for system clock. */

#if !STATIC_ALLOC
  if (node == -1) {
    posix_memalign((void**)&a,64,N*sizeof(double));
    posix_memalign((void**)&b,64,N*sizeof(double));
    posix_memalign((void**)&c,64,N*sizeof(double));
  } else if (node == -2) {
    a = malloc(N*sizeof(double));
    b = malloc(N*sizeof(double));
    c = malloc(N*sizeof(double));
#if defined(HAVE_NUMA)
  } else {
    a = numa_alloc_onnode(N*sizeof(double),node);
    b = numa_alloc_onnode(N*sizeof(double),node);
    c = numa_alloc_onnode(N*sizeof(double),node);
#endif
  }
#endif
#if FAULT_TOGETHER
  for (j=0; j<N; j++) {
    a[j] = 1.0;
    b[j] = 2.0;
    c[j] = 0.0;
  }
#else
  for (j=0; j<N; j++) a[j] = 1.0;
  for (j=0; j<N; j++) b[j] = 2.0;
  for (j=0; j<N; j++) c[j] = 0.0;
#endif

  PetscPrintf(PETSC_COMM_WORLD,HLINE);

  if  ((quantum = checktick()) >= 1) PetscPrintf(PETSC_COMM_WORLD,"Your clock granularity/precision appears to be %d microseconds.\n", quantum);
  else PetscPrintf(PETSC_COMM_WORLD,"Your clock granularity appears to be less than one microsecond.\n");

  t = Second();
  for (j = 0; j < N; j++) a[j] = 2.0E0 * a[j];
  t = 1.0E6 * (Second() - t);

  PetscPrintf(PETSC_COMM_WORLD,"Each test below will take on the order"
              " of %d microseconds.\n", (int) t);
  PetscPrintf(PETSC_COMM_WORLD,"   (= %d clock ticks)\n", (int) (t/quantum));
  PetscPrintf(PETSC_COMM_WORLD,"Increase the size of the arrays if this shows that\n");
  PetscPrintf(PETSC_COMM_WORLD,"you are not getting at least 20 clock ticks per test.\n");

  PetscPrintf(PETSC_COMM_WORLD,HLINE);

  PetscPrintf(PETSC_COMM_WORLD,"WARNING -- The above is only a rough guideline.\n");
  PetscPrintf(PETSC_COMM_WORLD,"For best results, please be sure you know the\n");
  PetscPrintf(PETSC_COMM_WORLD,"precision of your system timer.\n");
  PetscPrintf(PETSC_COMM_WORLD,HLINE);

  /* --- MAIN LOOP --- repeat test cases NTIMES times --- */

  scalar = 3.0;
  for (k=0; k<NTIMES; k++) {
    MPI_Barrier(PETSC_COMM_WORLD);
    /* ### COPY: c <- a ### */
    times[0][k] = Second();
    MPI_Barrier(PETSC_COMM_WORLD);
#if USE_MEMCPY
    memcpy(c,a,N*sizeof(double));
#elif SSE2
    for (j=0; j<N; j+=8) {
      _mm_stream_pd(c+j+0,_mm_load_pd(a+j+0));
      _mm_stream_pd(c+j+2,_mm_load_pd(a+j+2));
      _mm_stream_pd(c+j+4,_mm_load_pd(a+j+4));
      _mm_stream_pd(c+j+6,_mm_load_pd(a+j+6));
#  if PREFETCH_NTA
      _mm_prefetch(a+j+64,_MM_HINT_NTA);
#  endif
    }
#else
    for (j=0; j<N; j++) c[j] = a[j];
#endif
    MPI_Barrier(PETSC_COMM_WORLD);
    times[0][k] = Second() - times[0][k];

    /* ### SCALE: b <- scalar * c ### */
    times[1][k] = Second();
    MPI_Barrier(PETSC_COMM_WORLD);
#if SSE2
    {
      __m128d scalar2 = _mm_set1_pd(scalar);
      for (j=0; j<N; j+=8) {
        _mm_stream_pd(b+j+0,_mm_mul_pd(scalar2,_mm_load_pd(c+j+0)));
        _mm_stream_pd(b+j+2,_mm_mul_pd(scalar2,_mm_load_pd(c+j+2)));
        _mm_stream_pd(b+j+4,_mm_mul_pd(scalar2,_mm_load_pd(c+j+4)));
        _mm_stream_pd(b+j+6,_mm_mul_pd(scalar2,_mm_load_pd(c+j+6)));
#  if PREFETCH_NTA
        _mm_prefetch(c+j+64,_MM_HINT_NTA);
#  endif
      }
    }
#else
    for (j=0; j<N; j++) b[j] = scalar*c[j];
#endif
    MPI_Barrier(PETSC_COMM_WORLD);
    times[1][k] = Second() - times[1][k];

    /* ### ADD: c <- a + b ### */
    times[2][k] = Second();
    MPI_Barrier(PETSC_COMM_WORLD);
#if SSE2
    {
      for (j=0; j<N; j+=8) {
        _mm_stream_pd(c+j+0,_mm_add_pd(_mm_load_pd(a+j+0),_mm_load_pd(b+j+0)));
        _mm_stream_pd(c+j+2,_mm_add_pd(_mm_load_pd(a+j+2),_mm_load_pd(b+j+2)));
        _mm_stream_pd(c+j+4,_mm_add_pd(_mm_load_pd(a+j+4),_mm_load_pd(b+j+4)));
        _mm_stream_pd(c+j+6,_mm_add_pd(_mm_load_pd(a+j+6),_mm_load_pd(b+j+6)));
#  if PREFETCH_NTA
        _mm_prefetch(a+j+64,_MM_HINT_NTA);
        _mm_prefetch(b+j+64,_MM_HINT_NTA);
#  endif
      }
    }
#else
    for (j=0; j<N; j++) c[j] = a[j]+b[j];
#endif
    MPI_Barrier(PETSC_COMM_WORLD);
    times[2][k] = Second() - times[2][k];

    /* ### TRIAD: a <- b + scalar * c ### */
    times[3][k] = Second();
    MPI_Barrier(PETSC_COMM_WORLD);
#if SSE2
    {
      __m128d scalar2 = _mm_set1_pd(scalar);
      for (j=0; j<N; j+=8) {
        _mm_stream_pd(a+j+0,_mm_add_pd(_mm_load_pd(b+j+0),_mm_mul_pd(scalar2,_mm_load_pd(c+j+0))));
        _mm_stream_pd(a+j+2,_mm_add_pd(_mm_load_pd(b+j+2),_mm_mul_pd(scalar2,_mm_load_pd(c+j+2))));
        _mm_stream_pd(a+j+4,_mm_add_pd(_mm_load_pd(b+j+4),_mm_mul_pd(scalar2,_mm_load_pd(c+j+4))));
        _mm_stream_pd(a+j+6,_mm_add_pd(_mm_load_pd(b+j+6),_mm_mul_pd(scalar2,_mm_load_pd(c+j+6))));
#  if PREFETCH_NTA
        _mm_prefetch(b+j+64,_MM_HINT_NTA);
        _mm_prefetch(c+j+64,_MM_HINT_NTA);
#  endif
      }
    }
#else
    for (j=0; j<N; j++) a[j] = b[j]+scalar*c[j];
#endif
    MPI_Barrier(PETSC_COMM_WORLD);
    times[3][k] = Second() - times[3][k];
  }

  /* --- SUMMARY --- */

  for (k=0; k<NTIMES; k++)
    for (j=0; j<4; j++) {
      rmstime[j] = rmstime[j] + (times[j][k] * times[j][k]);
      mintime[j] = MIN(mintime[j], times[j][k]);
      maxtime[j] = MAX(maxtime[j], times[j][k]);
    }


  PetscPrintf(PETSC_COMM_WORLD,"%8s:  %11s  %11s  %11s  %11s  %11s\n","Function","Rate (MB/s)","Total (MB/s)","RMS time","Min time","Max time");
  for (j=0; j<4; j++) {
    rmstime[j] = sqrt(rmstime[j]/(double)NTIMES);
    PetscPrintf(PETSC_COMM_WORLD,"%8s: %11.4f  %11.4f  %11.4f  %11.4f  %11.4f\n", label[j], 1.0e-06*bytes[j]/mintime[j], size*1.0e-06*bytes[j]/mintime[j], rmstime[j], mintime[j], maxtime[j]);
  }
  PetscFinalize();
  return 0;
}
Beispiel #13
0
	// Replaces every element of src by either the local minimum or the local
	// maximum of its (2r+1) x (2r+1) neighbourhood, whichever is closer in
	// value, and writes the result to dest. On a tie the local minimum is used.
	//
	//   src   input image
	//   dest  output image; may share its buffer with src
	//   r     neighbourhood radius
	//
	// T is not declared in the visible lines.
	// NOTE(review): presumably a template parameter that matches the element
	// type implied by src.depth() - confirm against the declaration above.
	//
	// NOTE(review): all buffers are walked as one flat run of
	// src.size().area() elements (no channel count is applied, and no
	// continuity check is made), and the SIMD branches use aligned loads and
	// non-temporal stores - confirm callers pass single-channel, continuous,
	// 16-byte-aligned data.
	void blurRemoveMinMax_(const Mat& src, Mat& dest, const int r)
	{
		// NOTE(review): ksize is not referenced anywhere below.
		const Size ksize = Size(2 * r + 1, 2 * r + 1);
		// Gives dest the contents of src unless both already share a buffer.
		if (src.data != dest.data)src.copyTo(dest);

		// xv: local maximum (dilation), nv: local minimum (erosion), both
		// taken over an all-ones (2r+1) x (2r+1) structuring element.
		Mat xv;
		Mat nv;
		Mat element = Mat::ones(2 * r + 1, 2 * r + 1, CV_8U);
		dilate(src, xv, element);
		erode(src, nv, element);

		// mind = |src - local min|, maxd = |src - local max|,
		// mask = element-wise min(mind, maxd).
		Mat mind;
		Mat maxd;
		Mat mask;
		absdiff(src, nv, mind);//can move to loop
		absdiff(src, xv, maxd);//
		min(mind, maxd, mask);//

		// Raw cursors into the first row of each buffer; the SIMD loops
		// below advance them, and the scalar loop at the end continues from
		// wherever they were left.
		T* n = nv.ptr<T>(0);
		T* x = xv.ptr<T>(0);
		T* d = dest.ptr<T>(0);
		T* nd = mind.ptr<T>(0);
		T* mk = mask.ptr<T>(0);

		// Number of elements the scalar loop still has to handle; stays at
		// the full element count when no SIMD branch runs.
		int remsize = src.size().area();

#if CV_SSE4_1
		// Each branch handles 16 bytes per iteration: where mind equals mask
		// the comparison yields an all-ones lane and the blend picks the
		// local minimum, otherwise the local maximum.
		if (src.depth() == CV_8U)
		{

			// 16 elements per 128-bit vector
			const int ssesize = src.size().area() / 16;
			remsize = src.size().area() - ssesize * 16;
			for (int i = 0; i < ssesize; i++)
			{
				__m128i mmk = _mm_load_si128((__m128i*)mk);
				__m128i mnd = _mm_load_si128((__m128i*)nd);

				__m128i mmn = _mm_load_si128((__m128i*)n);
				__m128i mmx = _mm_load_si128((__m128i*)x);
				__m128i msk = _mm_cmpeq_epi8(mnd, mmk);
				_mm_stream_si128((__m128i*)d, _mm_blendv_epi8(mmx, mmn, msk));
				nd += 16;
				mk += 16;
				d += 16;
				n += 16;
				x += 16;
			}
		}
		else if (src.depth() == CV_16S || src.depth() == CV_16U)
		{

			// 8 elements per 128-bit vector
			const int ssesize = src.size().area() / 8;
			remsize = src.size().area() - ssesize * 8;
			for (int i = 0; i < ssesize; i++)
			{
				__m128i mmk = _mm_load_si128((__m128i*)mk);
				__m128i mnd = _mm_load_si128((__m128i*)nd);

				__m128i mmn = _mm_load_si128((__m128i*)n);
				__m128i mmx = _mm_load_si128((__m128i*)x);
				__m128i msk = _mm_cmpeq_epi16(mnd, mmk);
				_mm_stream_si128((__m128i*)d, _mm_blendv_epi8(mmx, mmn, msk));
				nd += 8;
				mk += 8;
				d += 8;
				n += 8;
				x += 8;
			}
		}
		else if (src.depth() == CV_32F)
		{

			// 4 elements per 128-bit vector
			const int ssesize = src.size().area() / 4;
			remsize = src.size().area() - ssesize * 4;
			for (int i = 0; i < ssesize; i++)
			{
				__m128 mmk = _mm_load_ps((float*)mk);
				__m128 mnd = _mm_load_ps((float*)nd);

				__m128 mmn = _mm_load_ps((float*)n);
				__m128 mmx = _mm_load_ps((float*)x);
				__m128 msk = _mm_cmpeq_ps(mnd, mmk);
				_mm_stream_ps((float*)d, _mm_blendv_ps(mmx, mmn, msk));
				nd += 4;
				mk += 4;
				d += 4;
				n += 4;
				x += 4;
			}
		}
		else if (src.depth() == CV_64F)
		{
			// 2 elements per 128-bit vector
			const int ssesize = src.size().area() / 2;
			remsize = src.size().area() - ssesize * 2;
			for (int i = 0; i < ssesize; i++)
			{
				__m128d mmk = _mm_load_pd((double*)mk);
				__m128d mnd = _mm_load_pd((double*)nd);

				__m128d mmn = _mm_load_pd((double*)n);
				__m128d mmx = _mm_load_pd((double*)x);
				__m128d msk = _mm_cmpeq_pd(mnd, mmk);
				_mm_stream_pd((double*)d, _mm_blendv_pd(mmx, mmn, msk));
				nd += 2;
				mk += 2;
				d += 2;
				n += 2;
				x += 2;
			}
		}
#endif
		// Scalar pass over the remaining elements, indexed relative to the
		// (possibly advanced) cursors: same selection rule as the SIMD code.
		for (int i = 0; i < remsize; i++)
		{
			{
				if (nd[i] == mk[i])
				{
					d[i] = n[i];
				}
				else
				{
					d[i] = x[i];
				}
			}
		}
	}
Beispiel #14
0
/*
 * Thin wrapper that forwards the pointer p and the vector s to
 * _mm_stream_pd and returns that call expression.
 * NOTE(review): no return type or storage class is visible on the line below;
 * the declaration specifiers presumably sit on preceding lines that are not
 * part of this excerpt - confirm before changing the signature.
 */
test (double *p, __m128d s)
{
  return _mm_stream_pd (p, s); 
}