/* Exercise _mm_max_epi32: fill two NUM-element inputs with values of
   alternating sign, take the packed maximum four lanes at a time, and
   abort () if any lane differs from the scalar maximum of the same pair.  */
static void
sse4_1_test (void)
{
  union
    {
      __m128i x[NUM / 4];
      int i[NUM];
    } dst, src1, src2;
  int n;

  /* Even indices get the positive variant, odd indices the negated one.  */
  for (n = 0; n < NUM; n++)
    {
      int s = (n & 1) ? -1 : 1;

      src1.i[n] = n * n * s;
      src2.i[n] = (n + 20) * s;
    }

  /* Vector pass: one __m128i (four ints) per step.  */
  for (n = 0; n < NUM; n += 4)
    {
      int vec = n / 4;

      dst.x[vec] = _mm_max_epi32 (src1.x[vec], src2.x[vec]);
    }

  /* Scalar reference check, lane by lane.  */
  for (n = 0; n < NUM; n++)
    {
      int expected = src1.i[n];

      if (expected <= src2.i[n])
	expected = src2.i[n];

      if (dst.i[n] != expected)
	abort ();
    }
}
Exemplo n.º 2
0
/*
 * For each of the four 32-bit lanes independently, find the largest signed
 * value across pixels[0..n-1] and the index i of the first vector in which
 * that value appears, then print one "lane:max=...,idx=..." line per lane.
 *
 * pixels  array of n vectors; read with _mm_load_si128, so every element
 *         must be 16-byte aligned.
 * n       number of vectors to scan.
 *
 * A lane that never exceeds INT_MIN (this includes n <= 0) is reported as
 * max=INT_MIN, idx=0.  The flat int index of a lane's maximum is
 * 4*idx + lane.
 */
void vFindMax(__m128i *pixels, int n)
{
  __m128i vIdx = _mm_setzero_si128();
  __m128i vMax = _mm_set1_epi32(INT_MIN);
  int indices[4];
  int values[4];
  int i;

  for(i = 0; i < n; i++)
    {
      __m128i v = _mm_load_si128(pixels+i);
      /* all-ones in the lanes where v is strictly greater than the running max */
      __m128i vCmp = _mm_cmpgt_epi32(v, vMax);
      /* max value */
      vMax = _mm_max_epi32(vMax,v);

      /* max index: take i where the max just improved, keep the old index
         elsewhere.  Integer logic ops avoid compiler-specific vector casts. */
      __m128i vCurIdx = _mm_set1_epi32(i);
      vIdx = _mm_or_si128(_mm_and_si128(vCurIdx, vCmp),
                          _mm_andnot_si128(vCmp, vIdx));
    }

  /* Plain int arrays are not guaranteed to be 16-byte aligned, so the
     aligned store could fault; use the unaligned form. */
  _mm_storeu_si128((__m128i*)indices, vIdx);
  _mm_storeu_si128((__m128i*)values, vMax);
  printf("SSE:\n");
  for(i=0;i<4;i++)
    {
      printf("%d:max=%d,idx=%d\n",i,values[i],indices[i]);
    }
}
Exemplo n.º 3
0
/*
 * Apply a 10- or 12-tap filter down a column of rows and pass the result to
 * saveFunc.
 *
 * Twelve row slots are filled with unaligned 128-bit loads taken src_stride
 * elements apart; in the 10-tap case slot 0 is zero and only eleven rows are
 * read.  Adjacent slots are interleaved (low halves), multiplied-and-added
 * against f[0..5], and the six partial products are summed into s[0].
 *
 * src, src_stride  first input row and distance between rows, in elements
 * f                six coefficient vectors, one per pair of rows
 * taps             10 or 12 (asserted)
 * dst, bd          forwarded unchanged to saveFunc
 * saveFunc         called as saveFunc(s, bd, dst) with the sum in s[0]
 */
static void filter_vert_horiz_parallel(const uint16_t *src, int src_stride,
                                       const __m128i *f, int taps,
                                       uint16_t *dst, WritePixels saveFunc,
                                       int bd) {
  __m128i s[12];
  int first, k;

  // TODO(luoyi) treat s[12] as a circular buffer in width = 2 case
  assert(taps == 10 || taps == 12);

  /* With 10 taps the first slot is a zero row and loading starts at slot 1. */
  first = (10 == taps) ? 1 : 0;
  if (first) {
    s[0] = _mm_setzero_si128();
  }
  for (k = first; k < 12; ++k) {
    const uint16_t *row = src + (k - first) * src_stride;
    s[k] = _mm_loadu_si128((__m128i const *)row);
  }

  /* Pair up rows (k, k+1) and multiply-accumulate with coefficient k/2. */
  for (k = 0; k < 12; k += 2) {
    s[k] = _mm_unpacklo_epi16(s[k], s[k + 1]);
  }
  for (k = 0; k < 12; k += 2) {
    s[k] = _mm_madd_epi16(s[k], f[k / 2]);
  }

  /* The two middle products enter the sum as min + max; with wrapping
     32-bit adds that equals s[4] + s[6]. */
  s[1] = _mm_min_epi32(s[4], s[6]);
  s[3] = _mm_max_epi32(s[4], s[6]);

  s[0] = _mm_add_epi32(s[0], s[2]);
  s[0] = _mm_add_epi32(s[0], s[10]);
  s[0] = _mm_add_epi32(s[0], s[8]);
  s[0] = _mm_add_epi32(s[0], s[1]);
  s[0] = _mm_add_epi32(s[0], s[3]);

  saveFunc(s, bd, dst);
}
    // Lane-wise signed maximum of two Int32x4 values.
    // Uses the packed instruction when SSE4.1 is reported available and a
    // per-lane scalar comparison otherwise; on equal lanes the scalar path
    // takes bValue's lane.
    SIMDValue SIMDInt32x4Operation::OpMax(const SIMDValue& aValue, const SIMDValue& bValue)
    {
        SIMDValue result;
        X86SIMDValue packedMax;
        X86SIMDValue lhs = X86SIMDValue::ToX86SIMDValue(aValue);
        X86SIMDValue rhs = X86SIMDValue::ToX86SIMDValue(bValue);

        if (!AutoSystemInfo::Data.SSE4_1Available())
        {
            // Scalar fallback, one lane at a time.
            result.i32[SIMD_X] = (bValue.i32[SIMD_X] >= aValue.i32[SIMD_X]) ? bValue.i32[SIMD_X] : aValue.i32[SIMD_X];
            result.i32[SIMD_Y] = (bValue.i32[SIMD_Y] >= aValue.i32[SIMD_Y]) ? bValue.i32[SIMD_Y] : aValue.i32[SIMD_Y];
            result.i32[SIMD_Z] = (bValue.i32[SIMD_Z] >= aValue.i32[SIMD_Z]) ? bValue.i32[SIMD_Z] : aValue.i32[SIMD_Z];
            result.i32[SIMD_W] = (bValue.i32[SIMD_W] >= aValue.i32[SIMD_W]) ? bValue.i32[SIMD_W] : aValue.i32[SIMD_W];
            return result;
        }

        // _mm_max_epi32 is only available from SSE4.1 onwards: max(a, b) per lane.
        packedMax.m128i_value = _mm_max_epi32(lhs.m128i_value, rhs.m128i_value);
        result = X86SIMDValue::ToSIMDValue(packedMax);
        return result;
    }
Exemplo n.º 5
0
/*
 * Horizontal 10- or 12-tap filter step over four consecutive rows.
 *
 * Sixteen uint16 samples are read from each of four rows (two unaligned
 * 128-bit loads per row), rearranged by transpose_pair() into six vectors,
 * multiplied-and-added against f[0..5], summed, and the four 32-bit totals
 * are stored unaligned to buf.
 *
 * src, src_stride  first row and distance between rows, in elements; for
 *                  10 taps reading starts one element before src
 * f                six coefficient vectors
 * tapsNum          10 or 12 (asserted)
 * buf              receives four 32-bit sums
 */
static void highbd_filter_horiz(const uint16_t *src, int src_stride, __m128i *f,
                                int tapsNum, uint32_t *buf) {
  __m128i u[8], v[6];
  __m128i lo, hi, sum;
  const uint16_t *base;
  int row, k;

  assert(tapsNum == 10 || tapsNum == 12);
  base = (tapsNum == 10) ? src - 1 : src;

  /* u[0..3]: first eight samples of each row; u[4..7]: the next eight. */
  for (row = 0; row < 4; ++row) {
    const uint16_t *line = base + row * src_stride;
    u[row] = _mm_loadu_si128((__m128i const *)line);
    u[row + 4] = _mm_loadu_si128((__m128i const *)(line + 8));
  }

  transpose_pair(u, v);

  for (k = 0; k < 6; ++k) {
    u[k] = _mm_madd_epi16(v[k], f[k]);
  }

  /* The two middle products enter the sum as min + max; with wrapping
     32-bit adds that equals u[2] + u[3]. */
  lo = _mm_min_epi32(u[2], u[3]);
  hi = _mm_max_epi32(u[2], u[3]);

  sum = _mm_add_epi32(u[0], u[1]);
  sum = _mm_add_epi32(sum, u[5]);
  sum = _mm_add_epi32(sum, u[4]);
  sum = _mm_add_epi32(sum, lo);
  sum = _mm_add_epi32(sum, hi);

  _mm_storeu_si128((__m128i *)buf, sum);
}
Exemplo n.º 6
0
/*
 * Runs REPS passes of a vectorised add/max update over a local int array
 * and returns the sum of its first ARRAY_SIZE elements.
 *
 * threadId  not used inside this function.
 * w         window width in ints; each inner step handles two windows of
 *           w ints, touching 16 ints (four vectors) of each.
 *
 * NOTE(review): every access uses _mm_load_si128/_mm_store_si128, which need
 * 16-byte aligned addresses; that presumably requires cc to be 16-byte
 * aligned and w to be a multiple of 4 - confirm with the caller.
 */
int oneThread(int threadId, int w)
{
	int cc[OUT_ARRAY_SIZE];
	int i;
	int k;
	int itr;
	int w2 = 2*w;

	/* Zeroes ARRAY_SIZE*4 bytes, i.e. ARRAY_SIZE ints when sizeof(int) == 4;
	   any elements of cc beyond that are left uninitialised. */
	memset(&cc[0], 0, ARRAY_SIZE*4);

	__m128i a,b0,b1,b2,b3;
	__m128i c0,c1,c2,c3;

	/* These four constants are overwritten by the loads below before use. */
	c0 = _mm_set_epi32(0,0,0,0);
	c1 = _mm_set_epi32(0,0,0,1);
	c2 = _mm_set_epi32(0,0,1,0);
	c3 = _mm_set_epi32(0,1,0,0);
	/* Per-lane increment added to the carried values on every step. */
	a = _mm_set_epi32(1,2,2,1);
	/* Prime c0..c3 with 16 ints starting at cc[ARRAY_SIZE-w]. */
	c0 = _mm_load_si128((__m128i*)&cc[ARRAY_SIZE-w]);
	c1 = _mm_load_si128((__m128i*)&cc[ARRAY_SIZE-(w-4)]);	
	c2 = _mm_load_si128((__m128i*)&cc[ARRAY_SIZE-(w-8)]);	
	c3 = _mm_load_si128((__m128i*)&cc[ARRAY_SIZE-(w-12)]);	

	for (k = 0; k < REPS; k++) 
	{
		/* Walk from the top of the array downwards, 2*w ints per step. */
		for (itr = ARRAY_SIZE; itr>w2; itr-=w2)
		{

			/* b = window starting at cc[itr-2w]. */
			b0 = _mm_load_si128((__m128i*)&cc[itr-(w2)]);
			b1 = _mm_load_si128((__m128i*)&cc[itr-(w2-4)]);	
			b2 = _mm_load_si128((__m128i*)&cc[itr-(w2-8)]);	
			b3 = _mm_load_si128((__m128i*)&cc[itr-(w2-12)]);	

			/* c = max(c + a, b), written back to the window at cc[itr-w]. */
			c0 = _mm_max_epi32(_mm_add_epi32(c0,a), b0);
			c1 = _mm_max_epi32(_mm_add_epi32(c1,a), b1);
			c2 = _mm_max_epi32(_mm_add_epi32(c2,a), b2);
			c3 = _mm_max_epi32(_mm_add_epi32(c3,a), b3);

			_mm_store_si128((__m128i*)&cc[itr-w], c0);
			_mm_store_si128((__m128i*)&cc[itr-(w-4)], c1);	
			_mm_store_si128((__m128i*)&cc[itr-(w-8)], c2);	
			_mm_store_si128((__m128i*)&cc[itr-(w-12)], c3);	


			/* Reload c from the window at cc[itr-3w].
			   NOTE(review): itr is only known to exceed 2*w here, so
			   itr-3w looks like it can be negative - confirm the
			   relation between ARRAY_SIZE and w. */
			c0 = _mm_load_si128((__m128i*)&cc[itr-(w+w2)]);
			c1 = _mm_load_si128((__m128i*)&cc[itr-(w+w2-4)]);	
			c2 = _mm_load_si128((__m128i*)&cc[itr-(w+w2-8)]);	
			c3 = _mm_load_si128((__m128i*)&cc[itr-(w+w2-12)]);	
		
			/* b = max(b + a, c), written back to the window at cc[itr-2w]. */
			b0 = _mm_max_epi32(_mm_add_epi32(b0,a), c0);
			b1 = _mm_max_epi32(_mm_add_epi32(b1,a), c1);
			b2 = _mm_max_epi32(_mm_add_epi32(b2,a), c2);
			b3 = _mm_max_epi32(_mm_add_epi32(b3,a), c3);

			_mm_store_si128((__m128i*)&cc[itr-w2], b0);
			_mm_store_si128((__m128i*)&cc[itr-(w2-4)], b1);	
			_mm_store_si128((__m128i*)&cc[itr-(w2-8)], b2);	
			_mm_store_si128((__m128i*)&cc[itr-(w2-12)], b3);	
		}	
		/* NOTE(review): b0 is read here even if the inner loop ran zero
		   times (ARRAY_SIZE <= 2*w), in which case it is uninitialised. */
		a = _mm_min_epi32(a,b0);
	}

	/* Reduce the first ARRAY_SIZE elements to a single checksum. */
	int count =0;
	for (i=0; i< ARRAY_SIZE; i++)
	{
		count += cc[i];	
	}

	return count;

}
Exemplo n.º 7
0
// Calculates the bounding rectangle of a point set.
//
// points: matrix accepted by checkVector(2) with depth CV_32S or CV_32F
//         (asserted). The data is always addressed through a `const Point*`,
//         so for CV_32F the two ints of each Point hold float bit patterns.
// Returns an empty Rect() for zero points, otherwise
//         Rect(xmin, ymin, xmax - xmin + 1, ymax - ymin + 1), where float
//         extrema are rounded with cvFloor.
static Rect pointSetBoundingRect( const Mat& points )
{
    int npoints = points.checkVector(2);
    int depth = points.depth();
    CV_Assert(npoints >= 0 && (depth == CV_32F || depth == CV_32S));

    int  xmin = 0, ymin = 0, xmax = -1, ymax = -1, i;
    bool is_float = depth == CV_32F;

    if( npoints == 0 )
        return Rect();

    const Point* pts = (const Point*)points.data;
    Point pt = pts[0];

#if CV_SSE4_2
    // Vector path, taken only when the runtime hardware check passes.
    if(cv::checkHardwareSupport(CV_CPU_SSE4_2))
    {
        if( !is_float )
        {
            // Lanes 0 and 1 carry the running x and y extrema; each point is
            // fetched with a single 64-bit load.
            __m128i minval, maxval;
            minval = maxval = _mm_loadl_epi64((const __m128i*)(&pt)); //min[0]=pt.x, min[1]=pt.y

            for( i = 1; i < npoints; i++ )
            {
                __m128i ptXY = _mm_loadl_epi64((const __m128i*)&pts[i]);
                minval = _mm_min_epi32(ptXY, minval);
                maxval = _mm_max_epi32(ptXY, maxval);
            }
            // Lane 0 is x; shifting right by 4 bytes brings y into lane 0.
            xmin = _mm_cvtsi128_si32(minval);
            ymin = _mm_cvtsi128_si32(_mm_srli_si128(minval, 4));
            xmax = _mm_cvtsi128_si32(maxval);
            ymax = _mm_cvtsi128_si32(_mm_srli_si128(maxval, 4));
        }
        else
        {
            // Same scheme with float lanes; the same 8 bytes are loaded as
            // two floats into the low half of the register.
            __m128 minvalf, maxvalf, z = _mm_setzero_ps(), ptXY = _mm_setzero_ps();
            minvalf = maxvalf = _mm_loadl_pi(z, (const __m64*)(&pt));

            for( i = 1; i < npoints; i++ )
            {
                ptXY = _mm_loadl_pi(ptXY, (const __m64*)&pts[i]);

                minvalf = _mm_min_ps(minvalf, ptXY);
                maxvalf = _mm_max_ps(maxvalf, ptXY);
            }

            float xyminf[2], xymaxf[2];
            _mm_storel_pi((__m64*)xyminf, minvalf);
            _mm_storel_pi((__m64*)xymaxf, maxvalf);
            xmin = cvFloor(xyminf[0]);
            ymin = cvFloor(xyminf[1]);
            xmax = cvFloor(xymaxf[0]);
            ymax = cvFloor(xymaxf[1]);
        }
    }
    else
#endif
    {
        // Scalar path: the only path when CV_SSE4_2 is not set at compile
        // time, and the fallback when the hardware check fails.
        if( !is_float )
        {
            xmin = xmax = pt.x;
            ymin = ymax = pt.y;

            for( i = 1; i < npoints; i++ )
            {
                pt = pts[i];

                if( xmin > pt.x )
                    xmin = pt.x;

                if( xmax < pt.x )
                    xmax = pt.x;

                if( ymin > pt.y )
                    ymin = pt.y;

                if( ymax < pt.y )
                    ymax = pt.y;
            }
        }
        else
        {
            // Float coordinates are compared as ints after CV_TOGGLE_FLT and
            // converted back through the same macro at the end.
            // NOTE(review): presumably the macro maps float bit patterns to
            // ints whose signed order matches the float order - confirm
            // against its definition.
            Cv32suf v;
            // init values
            xmin = xmax = CV_TOGGLE_FLT(pt.x);
            ymin = ymax = CV_TOGGLE_FLT(pt.y);

            for( i = 1; i < npoints; i++ )
            {
                pt = pts[i];
                pt.x = CV_TOGGLE_FLT(pt.x);
                pt.y = CV_TOGGLE_FLT(pt.y);

                if( xmin > pt.x )
                    xmin = pt.x;

                if( xmax < pt.x )
                    xmax = pt.x;

                if( ymin > pt.y )
                    ymin = pt.y;

                if( ymax < pt.y )
                    ymax = pt.y;
            }

            v.i = CV_TOGGLE_FLT(xmin); xmin = cvFloor(v.f);
            v.i = CV_TOGGLE_FLT(ymin); ymin = cvFloor(v.f);
            // because right and bottom sides of the bounding rectangle are not inclusive
            // (note +1 in width and height calculation below), cvFloor is used here instead of cvCeil
            v.i = CV_TOGGLE_FLT(xmax); xmax = cvFloor(v.f);
            v.i = CV_TOGGLE_FLT(ymax); ymax = cvFloor(v.f);
        }
    }

    return Rect(xmin, ymin, xmax - xmin + 1, ymax - ymin + 1);
}
Exemplo n.º 8
0
/* Horizontal maximum: returns the largest of the four signed 32-bit lanes
 * of a. */
static inline int32_t _mm_hmax_epi32_rpl(__m128i a) {
    /* Fold the upper two lanes onto the lower two. */
    const __m128i upper_pair = _mm_srli_si128(a, 8);
    const __m128i pair_max = _mm_max_epi32(a, upper_pair);
    /* Fold lane 1 onto lane 0; lane 0 now holds the overall maximum. */
    const __m128i lane_one = _mm_srli_si128(pair_max, 4);
    const __m128i overall = _mm_max_epi32(pair_max, lane_one);
    return _mm_extract_epi32(overall, 0);
}
Exemplo n.º 9
0
void ahd_interpolate_tile(int top, char * buffer)
{
    int row, col, tr, tc, c, val;
    const int dir[4] = { -1, 1, -width, width };
    __m128i ldiff[2], abdiff[2];
    union hvrgbpix (*rgb)[width] = (union hvrgbpix (*)[width])buffer;
    union hvrgbpix *rix;
    union rgbpix * pix;
    union hvrgbpix (*lab)[width];
    short (*lix)[8];
    char (*h**o)[width][2];
    lab  = (union hvrgbpix (*)[width])(buffer + 16*width*TS);
    h**o = (char  (*)[width][2])(buffer + 32*width*TS);

    const int left=2;

    if ((uintptr_t)(image+top*width)&0xf || (uintptr_t)buffer&0xf) {
        fprintf(stderr, "unaligned buffers defeat speed!\n"); abort();
    }

    /*  Interpolate gren horz&vert, red and blue, and convert to CIELab:  */
    //do the first two rows of green first.
    //then one green, and rgb through the tile.. this because R/B needs down-right green value
    for (row=top; row < top+2 && row < height-2; row++) {
        col = left + (FC(row,left) & 1);
        for (c = FC(row,col); col < width-2; col+=2) {
            pix = (union rgbpix*)image + row*width+col;
            val = ((pix[-1].g + pix[0].c[c] + pix[1].g) * 2 - pix[-2].c[c] - pix[2].c[c]) >> 2;
            rgb[row-top][col-left].h.g = ULIM(val,pix[-1].g,pix[1].g);
            val = ((pix[-width].g + pix[0].c[c] + pix[width].g) * 2 - pix[-2*width].c[c] - pix[2*width].c[c]) >> 2;
            rgb[row-top][col-left].v.g = ULIM(val,pix[-width].g,pix[width].g);
        }
    }

    for (; row < top+TS && row < height-2; row++) {
        int rowx = row-1;

        if (FC(rowx,left+1)==1) {
            int c1 = FC(rowx+1,left+1),
                c2 = FC(rowx,left+2);

            pix = (union rgbpix*)image + row*width+left+1;
            rix = &rgb[row-top][1];

            val = ((pix[-1].g + pix[0].c[c1] + pix[1].g) * 2 - pix[-2].c[c1] - pix[2].c[c1]) >> 2;
            rix[0].h.g = ULIM(val,pix[-1].g,pix[1].g);
            val = ((pix[-width].g + pix[0].c[c1] + pix[width].g) * 2 - pix[-2*width].c[c1] - pix[2*width].c[c1]) >> 2;
            rix[0].v.g = ULIM(val,pix[-width].g,pix[width].g);
            for (col=left+1; col < width-3; col+=2) {
                pix = (union rgbpix*)image + rowx*width+col+1;

                union hvrgbpix rixr, rix0;

                rix = &rgb[rowx-top][col-left]+1;

                signed pix_diag = pix[-width-1].c[c1] + pix[-width+1].c[c1];
                signed pix_ul = pix[-width-1].c[c1];
                rixr.vec = _mm_set1_epi16(pix[-1].g);
                signed pix_lr = pix[-2].c[c2] + pix[0].c[c2];
                rix0.h.c[c2] = rix0.v.c[c2]  = pix[0].c[c2];
                pix_diag += pix[width-1].c[c1] + pix[width+1].c[c1] + 1;
                signed pix_dl = pix[width-1].c[c1];

                //fully loaded
                __m128i rix_dr =               _mm_setr_epi32(pix[width].g,       pix[width-1].c[c1], pix[1].g, pix[-width+1].c[c1]);
                rix_dr = _mm_add_epi32(rix_dr,_mm_setr_epi32(pix[width+1].c[c1],  pix[width+3].c[c1], pix[width+1].c[c1], 0));
                rix_dr = _mm_add_epi32(rix_dr,_mm_setr_epi32(pix[width+2].g,      0,                  pix[2*width+1].g, pix[3*width+1].c[c1]));
                rix_dr = _mm_mullo_epi32(rix_dr,_mm_setr_epi32(2,1,2,1));
                //half loaded
                rix_dr = _mm_hsub_epi32(rix_dr,_mm_setzero_si128());
                rix_dr = _mm_srai_epi32(rix_dr,2);
                __m128i a = _mm_setr_epi32(pix[width].g,pix[1].g,0,0);
                __m128i b = _mm_setr_epi32(pix[width+2].g,pix[2*width+1].g,0,0);
                __m128i m = _mm_min_epi32(a,b);
                __m128i M = _mm_max_epi32(a,b);
                rix_dr = _mm_min_epi32(rix_dr,M);
                rix_dr = _mm_max_epi32(rix_dr,m);

                signed pix_udr = pix_ul + pix_dl;

                signed rix0_ul = rix[-width-1].h.g;
                signed rix1_ul = rix[-width-1].v.g;
                __m128i rix_ur = _mm_setr_epi32(rix[-width+1].h.g, rix[-width+1].v.g, 0, 0);
                signed rix0_rr = rix[-2].h.g;
                signed rix1_rr = rix[-2].v.g;

                rix0.h.g = rix[0].h.g;
                rix0.v.g = rix[0].v.g;
                signed rix0_dl = rix[width-1].h.g;
                signed rix1_dl = rix[width-1].v.g;

                // fully loaded
                __m128i rix_udr = _mm_setr_epi32(rix0_ul, rix1_ul, rix0_rr, rix1_rr);
                rix_udr = _mm_add_epi32(rix_udr, _mm_setr_epi32(rix0_dl, rix1_dl, rix0.h.g, rix0.v.g));
                __m128i v2 = _mm_set_epi32(pix_lr, pix_lr, pix_udr, pix_udr);
                v2 = _mm_sub_epi32(v2, rix_udr);
                v2 = _mm_srai_epi32(v2,1);
                v2 = _mm_add_epi32(v2,_mm_cvtepu16_epi32(rixr.vec));
                v2 = _mm_max_epi32(v2, _mm_setzero_si128());
                v2 = _mm_min_epi32(v2, _mm_set1_epi32(0xffff));
                rixr.h.c[c2] = _mm_extract_epi32(v2,2);
                rixr.v.c[c2] = _mm_extract_epi32(v2,3);
                rixr.h.c[c1] = _mm_extract_epi32(v2,0);
                rixr.v.c[c1] = _mm_extract_epi32(v2,1);

                // following only uses 64 bit
                __m128i v1 = _mm_set1_epi32(pix_diag);
                v1 = _mm_sub_epi32(v1, rix_ur);
                v1 = _mm_sub_epi32(v1, rix_dr);
                v1 = _mm_sub_epi32(v1, rix_udr);
                v1 = _mm_srai_epi32(v1,2);
                v1 = _mm_add_epi32(v1, _mm_setr_epi32(rix0.h.g, rix0.v.g, 0, 0));
                v1 = _mm_max_epi32(v1, _mm_setzero_si128());
                v1 = _mm_min_epi32(v1, _mm_set1_epi32(0xffff));
                rix0.h.c[c1] = _mm_extract_epi32(v1,0);
                rix0.v.c[c1] = _mm_extract_epi32(v1,1);


                lab[rowx-top][col-left].vec = cielabv(rixr);
                lab[rowx-top][col-left+1].vec = cielabv(rix0);

                _mm_store_si128(&rix[-1].vec,rixr.vec);
                _mm_store_si128(&rix[0].vec,rix0.vec);

                rix[width+1].h.g = _mm_extract_epi32(rix_dr,0);
                rix[width+1].v.g = _mm_extract_epi32(rix_dr,1);
            }
        } else {
Exemplo n.º 10
0
/*
 * Path search between t and s driven by node prices, with the per-node scans
 * vectorised using SSE.
 *
 * While P[s] == INF the loop:
 *   1. finds the arc range [k, m) of the current node j from ai0/ai1,
 *   2. among arcs in that range whose target a0[i] has P == INF, picks the
 *      one maximising la = pr[a0[i]] - a1[i],
 *   3. either contracts the path (sets pr[j] = maxla and steps back) or
 *      extends it to the chosen target.
 *
 * pr        node prices, updated in place
 * P         per-node path position, INF when the node is not on the path;
 *           updated in place
 * ai0, ai1  node numbers and, for each, the index of its first arc;
 *           read with aligned 128-bit loads
 * a0, a1    arc targets and arc costs
 * nodes     number of entries in ai0/ai1
 * arcs      number of entries in a0/a1
 * s, t      the search stops when the path started at t reaches s
 *
 * Returns 1 when check_s_t() rejects the inputs, 0 otherwise; the path cost
 * is printed when s is reached.
 *
 * NOTE(review): the vector loops read ai0/ai1 up to index i+4 and a0/a1 up
 * to index i+3 past the last valid entry, so the arrays presumably need
 * padding and 16-byte alignment - confirm with the caller.
 */
int sse_auction_search(int *pr, int *P, int *ai0, int *ai1, int *a0, int *a1, int nodes, int arcs, int s, int t)
{
	int i __attribute__ ((aligned (16))) = 0;	
	int j __attribute__ ((aligned (16))) = t;
	int k __attribute__ ((aligned (16))) = 0;
	int m __attribute__ ((aligned (16))) = 0;	
	int maxla __attribute__ ((aligned (32))) = 0;
	int argmaxla __attribute__ ((aligned (16))) = 0;
	int cost __attribute__ ((aligned (16))) = 0;
	int length __attribute__ ((aligned (16))) = 1;
	int path_cost __attribute__ ((aligned (16))) = 0;
	
	/* Lane values are kept signed: la may be negative and -1 is used as the
	   "not found" marker, so unsigned temporaries would compare wrongly. */
	int lane_k, lane_m, lane_la;
	int cost_tab[nodes+1];

	__m128i a0sse, a1sse, ai0sse, ai1sse, ai1sse1, I, J, K, M, then;
	__m128i ARCS, MNODES, INFINITE, NEGINF, prsse, Psse, MAXLA, ARGMAXLA, LA, mask1, mask2, mask3, COST;
			
	for(i = 0; i <= nodes; i++) {
		cost_tab[i] = 0;
	}

	if(check_s_t(s, t, P, nodes) != 0) {
		return 1;
	}

	while(P[s] == INF) {
		k = -1;	
		m = -1;

		J = _mm_set1_epi32(j);			/* current value of j */
		K = _mm_set1_epi32(-1);			/* first index into the arc arrays */
		M = _mm_set1_epi32(-1);			/* end index into the arc arrays */
		MNODES = _mm_set1_epi32(nodes-1);	/* node count minus 1 (detects the last entry) */
		ARCS = _mm_set1_epi32(arcs);		/* number of arcs */
	
		/* compute k, m (vector version) */
		for(i = 0; i < nodes; i+=4) {
			ai0sse = _mm_load_si128((__m128i*) &ai0[i]);	/* node numbers */
			ai1sse = _mm_load_si128((__m128i*) &ai1[i]);	/* first-arc indices */
			ai1sse1 = _mm_set_epi32(ai1[i+4],ai1[i+3],ai1[i+2],ai1[i+1]);	/* first-arc indices shifted by one entry */
			mask1 = _mm_cmpeq_epi32(J, ai0sse);				/* j == ai0[i] */
			K = _mm_or_si128(_mm_and_si128(mask1,ai1sse), _mm_andnot_si128(mask1,K));	/* select K */
			I = _mm_set_epi32(i+3, i+2, i+1, i);						/* current i per lane */
			mask2 = _mm_cmplt_epi32(I, MNODES);				/* i < nodes-1 */
			mask3 = _mm_and_si128(mask1,mask2);				/* both conditions hold */
			then = _mm_or_si128(_mm_and_si128(mask2,ai1sse1), _mm_andnot_si128(mask2,ARCS));	/* m = ai1[i+1] or arcs */
			M = _mm_or_si128(_mm_and_si128(mask3,then), _mm_andnot_si128(mask3,M));		/* select M */
		}
	
		/* compute k, m (scalar version) */
		for(i = 0; i < nodes; i++) {
			if(ai0[i] == j) {
				k = ai1[i];		/* k - index of the first arc leaving j */
				if(i < nodes - 1) {
					m = ai1[i+1];
				}
				else {
					m = arcs;
				}
			}
		}


		/* store k, m: any lane that found a match overrides the scalar value */
		for(i = 0; i < 4; i++) {
			lane_k = (int) get_from_m128i(K,i);
			lane_m = (int) get_from_m128i(M,i);
			if(lane_k != -1) {
				k = lane_k;
			}
			if(lane_m != -1) {
				m = lane_m;
			}
		}
		
		/* Reset the running best on every pass, even when j has no arcs, so
		   the reduction below never reads uninitialised or stale lanes. */
		COST = _mm_set1_epi32(cost);		/* cost of the chosen arc */
		MAXLA = _mm_set1_epi32(0-INF);		/* best la = pr[a0[i]] - a1[i] so far */
		ARGMAXLA = _mm_set1_epi32(-1);		/* target node giving the best la */

		/* choose the best arc */
		if(k != -1) {		
			INFINITE = _mm_set1_epi32(INF);		/* "infinite" value */
			NEGINF = _mm_set1_epi32(0-INF);		/* -INF */
			for(i = k; i < m; i+=4) {
				a1sse = _mm_set_epi32(a1[i],a1[i+1],a1[i+2],a1[i+3]);				/* arc costs */
				a0sse = _mm_set_epi32(a0[i],a0[i+1],a0[i+2],a0[i+3]);				/* arc targets */
				prsse = _mm_set_epi32(pr[a0[i]],pr[a0[i+1]],pr[a0[i+2]],pr[a0[i+3]]);		/* target prices */
				Psse = _mm_set_epi32(P[a0[i]],P[a0[i+1]],P[a0[i+2]],P[a0[i+3]]);		/* target path positions */
				mask1 = _mm_cmpgt_epi32(_mm_set1_epi32(m),_mm_set_epi32(i,i+1,i+2,i+3));	/* lane index still below m */
				prsse = _mm_or_si128(_mm_and_si128(mask1,prsse), _mm_andnot_si128(mask1,NEGINF));	/* mask out arcs of other nodes */
				LA = _mm_sub_epi32(prsse, a1sse);		/* la = pr[a0[i]] - a1[i] */
				then = _mm_max_epi32(LA,MAXLA);			/* max(la, maxla) */
				mask1 = _mm_cmpeq_epi32(Psse,INFINITE);		/* P[a0[i]] == INF */
				mask2 = _mm_and_si128(mask1,_mm_cmpgt_epi32(LA,MAXLA));		/* P[a0[i]] == INF and la > maxla */
				MAXLA = _mm_or_si128(_mm_and_si128(mask1,then), _mm_andnot_si128(mask1,MAXLA));		/* update maxla */
				ARGMAXLA = _mm_or_si128(_mm_and_si128(mask2,a0sse), _mm_andnot_si128(mask2,ARGMAXLA));	/* update argmaxla */
				COST = _mm_or_si128(_mm_and_si128(mask2,a1sse), _mm_andnot_si128(mask2,COST));		/* update cost */
			}
		}
	
		/* store maxla, argmaxla, cost: signed reduction over the four lanes */
		maxla = 0 - INF;
		for(i = 0; i < 4; i++) {
			lane_la = (int) get_from_m128i(MAXLA,i);
			if(lane_la > maxla) {
				argmaxla = get_from_m128i(ARGMAXLA,i);
				maxla = lane_la;
				cost = get_from_m128i(COST,i);
			}
		}

		/* path contraction */
		if(pr[j] > maxla || maxla == -INF) {
			
			/* update the price */
			pr[j] = maxla;

			/* a single-node path is not contracted */
			if(j != t) {

				/* update the path */
				P[j] = INF;
				length = length - 1;
				path_cost = path_cost - cost_tab[length];
				cost_tab[length] = 0;
			
				/* step back to the previous vertex on the path (j); k is the one cut off */
				k = j;
				for(i = 0; i < nodes; i++) {
					if(P[i] == length - 1) {
						j = i;
						break;
					}
				}
			}
		}
		/* path extension */
		else {
			P[argmaxla] = length;
			j = argmaxla;
			path_cost = path_cost + cost;
			cost_tab[length] = cost;
			length = length + 1;

			/* the path reached the start vertex => done */
			if(argmaxla == s)
			{
				printf("dlugosc sciezki: %d\n", path_cost);
				return 0;
			}
		}
	}
	return 0;


}
Exemplo n.º 11
0
/* Calculates the bounding rectangle of a point set or mask, or retrieves an
   already calculated one.

   array   either a point sequence (CvSeq), a CV_32SC2 / CV_32FC2 matrix of
           points, or a CV_8UC1 / CV_8SC1 matrix treated as a mask whose
           non-zero pixels are bounded. Other inputs raise CV_Error.
   update  for a sequence whose header is at least a CvContour: zero returns
           the rectangle stored in the header, non-zero recalculates it and
           stores the result back. For smaller headers and for matrices the
           rectangle is always calculated and never stored.

   Returns the rectangle (x, y, xmax - xmin + 1, ymax - ymin + 1); an empty
   point sequence or an all-zero mask yields {0, 0, 0, 0}. Float extrema are
   rounded with cvFloor. */
CV_IMPL  CvRect
cvBoundingRect( CvArr* array, int update )
{
    CvSeqReader reader;
    CvRect  rect = { 0, 0, 0, 0 };
    CvContour contour_header;
    CvSeq* ptseq = 0;
    CvSeqBlock block;

    CvMat stub, *mat = 0;
    int  xmin = 0, ymin = 0, xmax = -1, ymax = -1, i, j, k;
    int calculate = update;

    if( CV_IS_SEQ( array ))
    {
        ptseq = (CvSeq*)array;
        if( !CV_IS_SEQ_POINT_SET( ptseq ))
            CV_Error( CV_StsBadArg, "Unsupported sequence type" );

        // The header has no room for a cached rect: compute, do not store.
        if( ptseq->header_size < (int)sizeof(CvContour))
        {
            update = 0;
            calculate = 1;
        }
    }
    else
    {
        mat = cvGetMat( array, &stub );
        if( CV_MAT_TYPE(mat->type) == CV_32SC2 ||
            CV_MAT_TYPE(mat->type) == CV_32FC2 )
        {
            // Point matrix: wrap it as a sequence and use the point-set path.
            ptseq = cvPointSeqFromMat(CV_SEQ_KIND_GENERIC, mat, &contour_header, &block);
            mat = 0;
        }
        else if( CV_MAT_TYPE(mat->type) != CV_8UC1 &&
                CV_MAT_TYPE(mat->type) != CV_8SC1 )
            CV_Error( CV_StsUnsupportedFormat,
                "The image/matrix format is not supported by the function" );
        update = 0;
        calculate = 1;
    }

    if( !calculate )
        return ((CvContour*)ptseq)->rect;

    if( mat )
    {
        // Mask path: scan every row for its left-most and right-most
        // non-zero pixel, testing four bytes at a time where possible.
        CvSize size = cvGetMatSize(mat);
        xmin = size.width;
        ymin = -1;

        for( i = 0; i < size.height; i++ )
        {
            uchar* _ptr = mat->data.ptr + i*mat->step;
            uchar* ptr = (uchar*)cvAlignPtr(_ptr, 4);
            int have_nz = 0, k_min, offset = (int)(ptr - _ptr);
            j = 0;
            offset = MIN(offset, size.width);
            // Unaligned prefix of the row, byte by byte.
            for( ; j < offset; j++ )
                if( _ptr[j] )
                {
                    have_nz = 1;
                    break;
                }
            if( j < offset )
            {
                if( j < xmin )
                    xmin = j;
                if( j > xmax )
                    xmax = j;
            }
            if( offset < size.width )
            {
                // Work in coordinates relative to the aligned pointer;
                // the shift is undone at the end of this block.
                xmin -= offset;
                xmax -= offset;
                size.width -= offset;
                j = 0;
                // Look for a new minimum to the left of the current xmin.
                for( ; j <= xmin - 4; j += 4 )
                    if( *((int*)(ptr+j)) )
                        break;
                for( ; j < xmin; j++ )
                    if( ptr[j] )
                    {
                        xmin = j;
                        if( j > xmax )
                            xmax = j;
                        have_nz = 1;
                        break;
                    }
                // Look for a new maximum to the right of the current xmax.
                k_min = MAX(j-1, xmax);
                k = size.width - 1;
                for( ; k > k_min && (k&3) != 3; k-- )
                    if( ptr[k] )
                        break;
                if( k > k_min && (k&3) == 3 )
                {
                    for( ; k > k_min+3; k -= 4 )
                        if( *((int*)(ptr+k-3)) )
                            break;
                }
                for( ; k > k_min; k-- )
                    if( ptr[k] )
                    {
                        xmax = k;
                        have_nz = 1;
                        break;
                    }
                // Neither extremum moved: still need to know whether the
                // row has any non-zero pixel, for ymin/ymax.
                if( !have_nz )
                {
                    j &= ~3;
                    for( ; j <= k - 3; j += 4 )
                        if( *((int*)(ptr+j)) )
                            break;
                    for( ; j <= k; j++ )
                        if( ptr[j] )
                        {
                            have_nz = 1;
                            break;
                        }
                }
                xmin += offset;
                xmax += offset;
                size.width += offset;
            }
            if( have_nz )
            {
                if( ymin < 0 )
                    ymin = i;
                ymax = i;
            }
        }

        // No non-zero pixel anywhere: report an empty rectangle at (0, 0).
        if( xmin >= size.width )
            xmin = ymin = 0;
    }
    else if( ptseq->total )
    {
        int  is_float = CV_SEQ_ELTYPE(ptseq) == CV_32FC2;
        cvStartReadSeq( ptseq, &reader, 0 );
        CvPoint pt;
        CV_READ_SEQ_ELEM( pt, reader );
    #if CV_SSE4_2
        // Vector path, taken only when the runtime hardware check passes.
        if(cv::checkHardwareSupport(CV_CPU_SSE4_2))
        {
            if( !is_float )
            {
                __m128i minval, maxval;
                minval = maxval = _mm_loadl_epi64((const __m128i*)(&pt)); //min[0]=pt.x, min[1]=pt.y

                for( i = 1; i < ptseq->total; i++)
                {
                    __m128i ptXY = _mm_loadl_epi64((const __m128i*)(reader.ptr));
                    CV_NEXT_SEQ_ELEM(sizeof(pt), reader);
                    minval = _mm_min_epi32(ptXY, minval);
                    maxval = _mm_max_epi32(ptXY, maxval);
                }
                // Lane 0 is x; shifting right by 4 bytes brings y into lane 0.
                xmin = _mm_cvtsi128_si32(minval);
                ymin = _mm_cvtsi128_si32(_mm_srli_si128(minval, 4));
                xmax = _mm_cvtsi128_si32(maxval);
                ymax = _mm_cvtsi128_si32(_mm_srli_si128(maxval, 4));
            }
            else
            {
                __m128 minvalf, maxvalf, z = _mm_setzero_ps(), ptXY = _mm_setzero_ps();
                minvalf = maxvalf = _mm_loadl_pi(z, (const __m64*)(&pt));

                for( i = 1; i < ptseq->total; i++ )
                {
                    ptXY = _mm_loadl_pi(ptXY, (const __m64*)reader.ptr);
                    CV_NEXT_SEQ_ELEM(sizeof(pt), reader);

                    minvalf = _mm_min_ps(minvalf, ptXY);
                    maxvalf = _mm_max_ps(maxvalf, ptXY);
                }

                float xyminf[2], xymaxf[2];
                _mm_storel_pi((__m64*)xyminf, minvalf);
                _mm_storel_pi((__m64*)xymaxf, maxvalf);
                xmin = cvFloor(xyminf[0]);
                ymin = cvFloor(xyminf[1]);
                xmax = cvFloor(xymaxf[0]);
                ymax = cvFloor(xymaxf[1]);
            }
        }
        else
    #endif
        {
            if( !is_float )
            {
                xmin = xmax = pt.x;
                ymin = ymax = pt.y;

                for( i = 1; i < ptseq->total; i++ )
                {
                    CV_READ_SEQ_ELEM( pt, reader );

                    if( xmin > pt.x )
                        xmin = pt.x;

                    if( xmax < pt.x )
                        xmax = pt.x;

                    if( ymin > pt.y )
                        ymin = pt.y;

                    if( ymax < pt.y )
                        ymax = pt.y;
                }
            }
            else
            {
                // Float coordinates are compared as ints after CV_TOGGLE_FLT
                // and converted back through the same macro at the end.
                // NOTE(review): presumably the macro maps float bit patterns
                // to ints whose signed order matches the float order -
                // confirm against its definition.
                Cv32suf v;
                // init values
                xmin = xmax = CV_TOGGLE_FLT(pt.x);
                ymin = ymax = CV_TOGGLE_FLT(pt.y);

                for( i = 1; i < ptseq->total; i++ )
                {
                    CV_READ_SEQ_ELEM( pt, reader );
                    pt.x = CV_TOGGLE_FLT(pt.x);
                    pt.y = CV_TOGGLE_FLT(pt.y);

                    if( xmin > pt.x )
                        xmin = pt.x;

                    if( xmax < pt.x )
                        xmax = pt.x;

                    if( ymin > pt.y )
                        ymin = pt.y;

                    if( ymax < pt.y )
                        ymax = pt.y;
                }

                v.i = CV_TOGGLE_FLT(xmin); xmin = cvFloor(v.f);
                v.i = CV_TOGGLE_FLT(ymin); ymin = cvFloor(v.f);
                // because right and bottom sides of the bounding rectangle are not inclusive
                // (note +1 in width and height calculation below), cvFloor is used here instead of cvCeil
                v.i = CV_TOGGLE_FLT(xmax); xmax = cvFloor(v.f);
                v.i = CV_TOGGLE_FLT(ymax); ymax = cvFloor(v.f);
            }
        }
    }

    // Build the result from the extrema for every path. Doing this only in
    // the point-sequence branch left the mask result at {0, 0, 0, 0}. With
    // the initial values (xmin = ymin = 0, xmax = ymax = -1) an empty
    // sequence or an all-zero mask still yields {0, 0, 0, 0}.
    rect.x = xmin;
    rect.y = ymin;
    rect.width = xmax - xmin + 1;
    rect.height = ymax - ymin + 1;

    if( update )
        ((CvContour*)ptseq)->rect = rect;
    return rect;
}
Exemplo n.º 12
0
// Codegen test: wraps _mm_max_epi32 so the directive comments inside the body
// can verify that it lowers to the SSE4.1 pmaxsd intrinsic call in IR and to
// a pmaxsd instruction in assembly.
__m128i test_mm_max_epi32(__m128i x, __m128i y) {
  // CHECK-LABEL: test_mm_max_epi32
  // CHECK: call <4 x i32> @llvm.x86.sse41.pmaxsd
  // CHECK-ASM: pmaxsd %xmm{{.*}}, %xmm{{.*}}
  return _mm_max_epi32(x, y);
}
Exemplo n.º 13
0
/* Clamp each signed 32-bit lane of value to the range [min, max].
   The lower bound is applied first and the upper bound second, so a lane
   where min > max ends up equal to max. */
inline __m128i
_mm_clamp_epi32(__m128i value, __m128i min, __m128i max)
{
    const __m128i raised = _mm_max_epi32(value, min);
    const __m128i clamped = _mm_min_epi32(raised, max);
    return clamped;
}
Exemplo n.º 14
0
// Codegen test: wraps _mm_max_epi32 so the directive comments inside the body
// can verify that it lowers to a generic signed greater-than compare followed
// by a select on <4 x i32>, rather than to a target-specific intrinsic call.
__m128i test_mm_max_epi32(__m128i x, __m128i y) {
  // CHECK-LABEL: test_mm_max_epi32
  // CHECK:       [[CMP:%.*]] = icmp sgt <4 x i32> [[X:%.*]], [[Y:%.*]]
  // CHECK-NEXT:  select <4 x i1> [[CMP]], <4 x i32> [[X]], <4 x i32> [[Y]]
  return _mm_max_epi32(x, y);
}