Ejemplo n.º 1
0
  // Y = Y + alpha * X
  // saddly, there is no madd instr. for Fp in SSE
  static void madd(const float& alpha, const RView& X, 
		   Result& result)
#ifdef __SSE_4_1__    
  {
    const int * x = X.data();
    int * y       = result.data();
    __m128i px, px1, px2, py, py1, py2;
    __m128i alpha_p = _mm_set_epi32(alpha, alpha, alpha, alpha);

    for(int i=0; i<DIM_N - (DIM_N%12); i+=12)
      {
	px  = _mm_load_si128((const __m128i *)x);
	px1 = _mm_load_si128((const __m128i *)(x+4));
	px2 = _mm_load_si128(x+8)
	py  = _mm_load_si128(y);
	py1 = _mm_load_si128(y+4);
	py2 = _mm_load_si128(y+8);

	py  = _mm_add_epi32(py,  _mm_mullo_epi32(alpha_a, px));
	py1 = _mm_add_epi32(py1, _mm_mullo_epi32(alpha_a, px1));
	py2 = _mm_add_epi32(py2, _mm_mullo_epi32(alpha_a, px2));

	_mm_store_si128(y, py);
	_mm_store_si128(y+4, py1);
	_mm_store_si128(y+8, py2);

	x += 12;
	y += 12;
      }
    for(int i=DIM_N - (DIM_N%12); i<DIM_N; ++i)
      {
	result[i] += alpha * X[i];	
      }
  }
Ejemplo n.º 2
0
void ethash_calculate_dag_item(node *const ret,
                               const unsigned node_index,
                               const struct ethash_params *params,
                               const struct ethash_cache *cache)
{
    uint32_t num_parent_nodes = (uint32_t) (params->cache_size / sizeof(node));
    node const *cache_nodes = (node const *) cache->mem;
    node const *init = &cache_nodes[node_index % num_parent_nodes];

    memcpy(ret, init, sizeof(node));
    ret->words[0] ^= node_index;
    SHA3_512(ret->bytes, ret->bytes, sizeof(node));

#if defined(_M_X64) && ENABLE_SSE
    __m128i const fnv_prime = _mm_set1_epi32(FNV_PRIME);
    __m128i xmm0 = ret->xmm[0];
    __m128i xmm1 = ret->xmm[1];
    __m128i xmm2 = ret->xmm[2];
    __m128i xmm3 = ret->xmm[3];
#endif

    for (unsigned i = 0; i != DATASET_PARENTS; ++i) {
        uint32_t parent_index = ((node_index ^ i) * FNV_PRIME ^ ret->words[i % NODE_WORDS]) % num_parent_nodes;
        node const *parent = &cache_nodes[parent_index];

#if defined(_M_X64) && ENABLE_SSE
        {
            xmm0 = _mm_mullo_epi32(xmm0, fnv_prime);
            xmm1 = _mm_mullo_epi32(xmm1, fnv_prime);
            xmm2 = _mm_mullo_epi32(xmm2, fnv_prime);
            xmm3 = _mm_mullo_epi32(xmm3, fnv_prime);
            xmm0 = _mm_xor_si128(xmm0, parent->xmm[0]);
            xmm1 = _mm_xor_si128(xmm1, parent->xmm[1]);
            xmm2 = _mm_xor_si128(xmm2, parent->xmm[2]);
            xmm3 = _mm_xor_si128(xmm3, parent->xmm[3]);

            // have to write to ret as values are used to compute index
            ret->xmm[0] = xmm0;
            ret->xmm[1] = xmm1;
            ret->xmm[2] = xmm2;
            ret->xmm[3] = xmm3;
        }
        #else
        {
            for (unsigned w = 0; w != NODE_WORDS; ++w) {
                ret->words[w] = fnv_hash(ret->words[w], parent->words[w]);
            }
        }
#endif
    }

    SHA3_512(ret->bytes, ret->bytes, sizeof(node));
}
Ejemplo n.º 3
0
inline static short
sse3_dot_prod (const uint16_t *p1,
               const uint16_t *p2,
               size_t          size)
{
  unsigned long res[4];
  unsigned int  i;

  __m128i* mp1  = (__m128i *)p1;
  __m128i* mp2  = (__m128i *)p2;
  __m128i  mres = _mm_set_epi32 (0, 0, 0, 0);

  for (i = 0; i < size; i += 8)
    {

      __m128i mreg1 = _mm_loadu_si128 (mp1);
      __m128i mreg2 = _mm_loadu_si128 (mp2);
      __m128i xlo1  = _mm_unpacklo_epi16 (mreg1, _mm_set1_epi16 (0));
      __m128i xlo2  = _mm_unpacklo_epi16 (mreg2, _mm_set1_epi16 (0));
      __m128i mtmp  = _mm_mullo_epi32 (xlo1, xlo2);
      mres          = _mm_add_epi32 (mres, mtmp);

      __m128i xhi1 = _mm_unpackhi_epi16 (mreg1, _mm_set1_epi16 (0));
      __m128i xhi2 = _mm_unpackhi_epi16 (mreg2, _mm_set1_epi16 (0));
      mtmp         = _mm_mullo_epi32 (xhi1, xhi2);
      mres         = _mm_add_epi32 (mres, mtmp);

      /*
      __m128i xlo1  = _mm_unpacklo_epi16 (_mm_loadu_si128 (mp1), _mm_set1_epi16 (0));
      __m128i xlo2  = _mm_unpacklo_epi16 (_mm_loadu_si128 (mp2), _mm_set1_epi16 (0));
      __m128i mtmp  = _mm_mullo_epi32 (xlo1, xlo2);
      mres          = _mm_add_epi32 (mres, mtmp);

      __m128i xhi1  = _mm_unpackhi_epi16 (_mm_loadu_si128 (mp1), _mm_set1_epi16 (0));
      __m128i xhi2  = _mm_unpackhi_epi16 (_mm_loadu_si128 (mp2), _mm_set1_epi16 (0));
      mtmp          = _mm_mullo_epi32 (xhi1, xhi2);
      mres          = _mm_add_epi32 (mres, mtmp);
      */

      mp1++;
      mp2++;
    }

  __m128i* pmres = (__m128i *)res;
  _mm_storeu_si128 (pmres, mres);

  return res[0]+res[1]+res[2]+res[3];
}
Ejemplo n.º 4
0
uint32_t probe(uint32_t key)
{
  /* create a vector with all values initialized to key */
  __m128i keyVector = _mm_set1_epi32(key);

  /* find the appropriate buckets using multiplicative hashing */
  __m128i bucketIds = _mm_mullo_epi32(keyVector, hashes.vec128);
  bucketIds  = _mm_srli_epi32(bucketIds, hashShift);
  size_t b0 = _mm_extract_epi32(bucketIds, 0);
  size_t b1 = _mm_extract_epi32(bucketIds, 1);

  __m128i keys;
  __m128i values0, values1;

  /* load keys, compare with lookup key (to produce a bitmask).
   * AND the result with the corresponding values. */
  keys = _mm_load_si128((const __m128i *) buckets[b0].keys);
  keys = _mm_cmpeq_epi32(keys, keyVector);
  values0 = _mm_load_si128((const __m128i *) buckets[b0].values);
  values0 = _mm_and_si128(values0, keys);

  keys = _mm_load_si128((const __m128i *) buckets[b1].keys);
  keys = _mm_cmpeq_epi32(keys, keyVector);
  values1 = _mm_load_si128((const __m128i *) buckets[b1].values);
  values1 = _mm_and_si128(values1, keys);

  /* OR all of the (key AND value) pairs to get result */
  union QuadInt qi;
  qi.vec128 = _mm_or_si128(values0, values1);
  qi.vec64[0] = _mm_or_si64(qi.vec64[0], qi.vec64[1]);
  return qi.arr[0] | qi.arr[1];
}
    SIMDValue SIMDInt32x4Operation::OpMul(const SIMDValue& aValue, const SIMDValue& bValue)
    {
        SIMDValue result;
        X86SIMDValue x86Result;
        X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
        X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);

        if (AutoSystemInfo::Data.SSE4_1Available())
        {   // a * b, only available in SSE4
            x86Result.m128i_value = _mm_mullo_epi32(tmpaValue.m128i_value, tmpbValue.m128i_value);
            result = X86SIMDValue::ToSIMDValue(x86Result);
        }
        else if (AutoSystemInfo::Data.SSE2Available())
        {
            // mul 2,0: r0 = a0*b0; r1 = a2*b2
            __m128i tmp1 = _mm_mul_epu32(tmpaValue.m128i_value, tmpbValue.m128i_value);
            // mul 3,1: r0=a1*b1; r1=a3*b3
            __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(tmpaValue.m128i_value, 4), _mm_srli_si128(tmpbValue.m128i_value, 4));
            // shuffle x86Results to [63..0] and pack
            x86Result.m128i_value = _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 2, 0)), _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 2, 0)));
            result = X86SIMDValue::ToSIMDValue(x86Result);
        }
        else
        {
            result.i32[SIMD_X] = aValue.i32[SIMD_X] * bValue.i32[SIMD_X];
            result.i32[SIMD_Y] = aValue.i32[SIMD_Y] * bValue.i32[SIMD_Y];
            result.i32[SIMD_Z] = aValue.i32[SIMD_Z] * bValue.i32[SIMD_Z];
            result.i32[SIMD_W] = aValue.i32[SIMD_W] * bValue.i32[SIMD_W];
        }

        return result;
    }
Ejemplo n.º 6
0
int main(int, char**)
{
    volatile __m128 a = _mm_setzero_ps();
    _mm_ceil_ps(a);
    volatile __m128i result = _mm_mullo_epi32(_mm_set1_epi32(42), _mm_set1_epi32(64));
    (void)result;
    return 0;
}
Ejemplo n.º 7
0
//int WINAPI WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, PSTR pScmdline, int iCmdshow)
int main(int _argc, char** _argv)
{
	int i[6] = { 1, 2, 3, 4, 5 };
	int j[6] = { 1, 2, 4, 4, 5 };
	int *k = new int(4);
	__m128i tmp = _mm_mullo_epi32(*(__m128i*)i, *(__m128i*)&j[0]);
	tmp.m128i_i32[0];
	//_mm_mulhi_epi16(*xmm0, *xmm1);

	return 0;
}
Ejemplo n.º 8
0
inline __m128i LOAD_QUANTISED(const int32_t *idata, const QuantisationMatrix *qmatrix, const int l, const int s) {
  __m128i D  = _mm_load_si128((__m128i *)idata);
  __m128i QF = _mm_load_si128((__m128i *)&qmatrix->qfactor[l][s]);
  __m128i QO = _mm_load_si128((__m128i *)&qmatrix->qoffset[l][s]);
  __m128i X  = _mm_abs_epi32(D);
  X = _mm_mullo_epi32(X, QF);
  X = _mm_add_epi32(X, QO);
  X = _mm_srai_epi32(X, 2);
  X = _mm_sign_epi32(X, D);
  return X;
}
void
multiply(int *a, int *b, int *c, int n)
{
	int i;
	__m128i *pa, *pb, pc;
	for(i = 0; i < n; i += 4) {
		pa = (__m128i *)&a[i];
		pb = (__m128i *)&b[i];
		pc = _mm_mullo_epi32(*pa, *pb);
		memcpy(&c[i], &pc, 4*sizeof(int));
	}
}
Ejemplo n.º 10
0
void ahd_interpolate_tile(int top, char * buffer)
{
    int row, col, tr, tc, c, val;
    const int dir[4] = { -1, 1, -width, width };
    __m128i ldiff[2], abdiff[2];
    union hvrgbpix (*rgb)[width] = (union hvrgbpix (*)[width])buffer;
    union hvrgbpix *rix;
    union rgbpix * pix;
    union hvrgbpix (*lab)[width];
    short (*lix)[8];
    char (*h**o)[width][2];
    lab  = (union hvrgbpix (*)[width])(buffer + 16*width*TS);
    h**o = (char  (*)[width][2])(buffer + 32*width*TS);

    const int left=2;

    if ((uintptr_t)(image+top*width)&0xf || (uintptr_t)buffer&0xf) {
        fprintf(stderr, "unaligned buffers defeat speed!\n"); abort();
    }

    /*  Interpolate gren horz&vert, red and blue, and convert to CIELab:  */
    //do the first two rows of green first.
    //then one green, and rgb through the tile.. this because R/B needs down-right green value
    for (row=top; row < top+2 && row < height-2; row++) {
        col = left + (FC(row,left) & 1);
        for (c = FC(row,col); col < width-2; col+=2) {
            pix = (union rgbpix*)image + row*width+col;
            val = ((pix[-1].g + pix[0].c[c] + pix[1].g) * 2 - pix[-2].c[c] - pix[2].c[c]) >> 2;
            rgb[row-top][col-left].h.g = ULIM(val,pix[-1].g,pix[1].g);
            val = ((pix[-width].g + pix[0].c[c] + pix[width].g) * 2 - pix[-2*width].c[c] - pix[2*width].c[c]) >> 2;
            rgb[row-top][col-left].v.g = ULIM(val,pix[-width].g,pix[width].g);
        }
    }

    for (; row < top+TS && row < height-2; row++) {
        int rowx = row-1;

        if (FC(rowx,left+1)==1) {
            int c1 = FC(rowx+1,left+1),
                c2 = FC(rowx,left+2);

            pix = (union rgbpix*)image + row*width+left+1;
            rix = &rgb[row-top][1];

            val = ((pix[-1].g + pix[0].c[c1] + pix[1].g) * 2 - pix[-2].c[c1] - pix[2].c[c1]) >> 2;
            rix[0].h.g = ULIM(val,pix[-1].g,pix[1].g);
            val = ((pix[-width].g + pix[0].c[c1] + pix[width].g) * 2 - pix[-2*width].c[c1] - pix[2*width].c[c1]) >> 2;
            rix[0].v.g = ULIM(val,pix[-width].g,pix[width].g);
            for (col=left+1; col < width-3; col+=2) {
                pix = (union rgbpix*)image + rowx*width+col+1;

                union hvrgbpix rixr, rix0;

                rix = &rgb[rowx-top][col-left]+1;

                signed pix_diag = pix[-width-1].c[c1] + pix[-width+1].c[c1];
                signed pix_ul = pix[-width-1].c[c1];
                rixr.vec = _mm_set1_epi16(pix[-1].g);
                signed pix_lr = pix[-2].c[c2] + pix[0].c[c2];
                rix0.h.c[c2] = rix0.v.c[c2]  = pix[0].c[c2];
                pix_diag += pix[width-1].c[c1] + pix[width+1].c[c1] + 1;
                signed pix_dl = pix[width-1].c[c1];

                //fully loaded
                __m128i rix_dr =               _mm_setr_epi32(pix[width].g,       pix[width-1].c[c1], pix[1].g, pix[-width+1].c[c1]);
                rix_dr = _mm_add_epi32(rix_dr,_mm_setr_epi32(pix[width+1].c[c1],  pix[width+3].c[c1], pix[width+1].c[c1], 0));
                rix_dr = _mm_add_epi32(rix_dr,_mm_setr_epi32(pix[width+2].g,      0,                  pix[2*width+1].g, pix[3*width+1].c[c1]));
                rix_dr = _mm_mullo_epi32(rix_dr,_mm_setr_epi32(2,1,2,1));
                //half loaded
                rix_dr = _mm_hsub_epi32(rix_dr,_mm_setzero_si128());
                rix_dr = _mm_srai_epi32(rix_dr,2);
                __m128i a = _mm_setr_epi32(pix[width].g,pix[1].g,0,0);
                __m128i b = _mm_setr_epi32(pix[width+2].g,pix[2*width+1].g,0,0);
                __m128i m = _mm_min_epi32(a,b);
                __m128i M = _mm_max_epi32(a,b);
                rix_dr = _mm_min_epi32(rix_dr,M);
                rix_dr = _mm_max_epi32(rix_dr,m);

                signed pix_udr = pix_ul + pix_dl;

                signed rix0_ul = rix[-width-1].h.g;
                signed rix1_ul = rix[-width-1].v.g;
                __m128i rix_ur = _mm_setr_epi32(rix[-width+1].h.g, rix[-width+1].v.g, 0, 0);
                signed rix0_rr = rix[-2].h.g;
                signed rix1_rr = rix[-2].v.g;

                rix0.h.g = rix[0].h.g;
                rix0.v.g = rix[0].v.g;
                signed rix0_dl = rix[width-1].h.g;
                signed rix1_dl = rix[width-1].v.g;

                // fully loaded
                __m128i rix_udr = _mm_setr_epi32(rix0_ul, rix1_ul, rix0_rr, rix1_rr);
                rix_udr = _mm_add_epi32(rix_udr, _mm_setr_epi32(rix0_dl, rix1_dl, rix0.h.g, rix0.v.g));
                __m128i v2 = _mm_set_epi32(pix_lr, pix_lr, pix_udr, pix_udr);
                v2 = _mm_sub_epi32(v2, rix_udr);
                v2 = _mm_srai_epi32(v2,1);
                v2 = _mm_add_epi32(v2,_mm_cvtepu16_epi32(rixr.vec));
                v2 = _mm_max_epi32(v2, _mm_setzero_si128());
                v2 = _mm_min_epi32(v2, _mm_set1_epi32(0xffff));
                rixr.h.c[c2] = _mm_extract_epi32(v2,2);
                rixr.v.c[c2] = _mm_extract_epi32(v2,3);
                rixr.h.c[c1] = _mm_extract_epi32(v2,0);
                rixr.v.c[c1] = _mm_extract_epi32(v2,1);

                // following only uses 64 bit
                __m128i v1 = _mm_set1_epi32(pix_diag);
                v1 = _mm_sub_epi32(v1, rix_ur);
                v1 = _mm_sub_epi32(v1, rix_dr);
                v1 = _mm_sub_epi32(v1, rix_udr);
                v1 = _mm_srai_epi32(v1,2);
                v1 = _mm_add_epi32(v1, _mm_setr_epi32(rix0.h.g, rix0.v.g, 0, 0));
                v1 = _mm_max_epi32(v1, _mm_setzero_si128());
                v1 = _mm_min_epi32(v1, _mm_set1_epi32(0xffff));
                rix0.h.c[c1] = _mm_extract_epi32(v1,0);
                rix0.v.c[c1] = _mm_extract_epi32(v1,1);


                lab[rowx-top][col-left].vec = cielabv(rixr);
                lab[rowx-top][col-left+1].vec = cielabv(rix0);

                _mm_store_si128(&rix[-1].vec,rixr.vec);
                _mm_store_si128(&rix[0].vec,rix0.vec);

                rix[width+1].h.g = _mm_extract_epi32(rix_dr,0);
                rix[width+1].v.g = _mm_extract_epi32(rix_dr,1);
            }
        } else {
//-----------------------------------------------------------------------------------------
// Rasterize the occludee AABB and depth test it against the CPU rasterized depth buffer
// If any of the rasterized AABB pixels passes the depth test exit early and mark the occludee
// as visible. If all rasterized AABB pixels are occluded then the occludee is culled
//-----------------------------------------------------------------------------------------
void TransformedAABBoxSSE::RasterizeAndDepthTestAABBox(UINT *pRenderTargetPixels)
{
	// Set DAZ and FZ MXCSR bits to flush denormals to zero (i.e., make it faster)
	// Denormal are zero (DAZ) is bit 6 and Flush to zero (FZ) is bit 15. 
	// so to enable the two to have to set bits 6 and 15 which 1000 0000 0100 0000 = 0x8040
	_mm_setcsr( _mm_getcsr() | 0x8040 );

	__m128i colOffset = _mm_set_epi32(0, 1, 0, 1);
	__m128i rowOffset = _mm_set_epi32(0, 0, 1, 1);

	__m128i fxptZero = _mm_setzero_si128();
	float* pDepthBuffer = (float*)pRenderTargetPixels; 
	
	// Rasterize the AABB triangles 4 at a time
	for(UINT i = 0; i < AABB_TRIANGLES; i += SSE)
	{
		vFloat4 xformedPos[3];
		Gather(xformedPos, i);

		// use fixed-point only for X and Y.  Avoid work for Z and W.
        vFxPt4 xFormedFxPtPos[3];
		for(int m = 0; m < 3; m++)
		{
			xFormedFxPtPos[m].X = _mm_cvtps_epi32(xformedPos[m].X);
			xFormedFxPtPos[m].Y = _mm_cvtps_epi32(xformedPos[m].Y);
			xFormedFxPtPos[m].Z = _mm_cvtps_epi32(xformedPos[m].Z);
			xFormedFxPtPos[m].W = _mm_cvtps_epi32(xformedPos[m].W);
		}

		// Fab(x, y) =     Ax       +       By     +      C              = 0
		// Fab(x, y) = (ya - yb)x   +   (xb - xa)y + (xa * yb - xb * ya) = 0
		// Compute A = (ya - yb) for the 3 line segments that make up each triangle
		__m128i A0 = _mm_sub_epi32(xFormedFxPtPos[1].Y, xFormedFxPtPos[2].Y);
		__m128i A1 = _mm_sub_epi32(xFormedFxPtPos[2].Y, xFormedFxPtPos[0].Y);
		__m128i A2 = _mm_sub_epi32(xFormedFxPtPos[0].Y, xFormedFxPtPos[1].Y);

		// Compute B = (xb - xa) for the 3 line segments that make up each triangle
		__m128i B0 = _mm_sub_epi32(xFormedFxPtPos[2].X, xFormedFxPtPos[1].X);
		__m128i B1 = _mm_sub_epi32(xFormedFxPtPos[0].X, xFormedFxPtPos[2].X);
		__m128i B2 = _mm_sub_epi32(xFormedFxPtPos[1].X, xFormedFxPtPos[0].X);

		// Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle
		__m128i C0 = _mm_sub_epi32(_mm_mullo_epi32(xFormedFxPtPos[1].X, xFormedFxPtPos[2].Y), _mm_mullo_epi32(xFormedFxPtPos[2].X, xFormedFxPtPos[1].Y));
		__m128i C1 = _mm_sub_epi32(_mm_mullo_epi32(xFormedFxPtPos[2].X, xFormedFxPtPos[0].Y), _mm_mullo_epi32(xFormedFxPtPos[0].X, xFormedFxPtPos[2].Y));
		__m128i C2 = _mm_sub_epi32(_mm_mullo_epi32(xFormedFxPtPos[0].X, xFormedFxPtPos[1].Y), _mm_mullo_epi32(xFormedFxPtPos[1].X, xFormedFxPtPos[0].Y));

		// Compute triangle area
		__m128i triArea = _mm_mullo_epi32(A0, xFormedFxPtPos[0].X);
		triArea = _mm_add_epi32(triArea, _mm_mullo_epi32(B0, xFormedFxPtPos[0].Y));
		triArea = _mm_add_epi32(triArea, C0);

		__m128 oneOverTriArea = _mm_div_ps(_mm_set1_ps(1.0f), _mm_cvtepi32_ps(triArea));

		// Use bounding box traversal strategy to determine which pixels to rasterize 
		__m128i startX = _mm_and_si128(Max(Min(Min(xFormedFxPtPos[0].X, xFormedFxPtPos[1].X), xFormedFxPtPos[2].X), _mm_set1_epi32(0)), _mm_set1_epi32(0xFFFFFFFE));
		__m128i endX   = Min(_mm_add_epi32(Max(Max(xFormedFxPtPos[0].X, xFormedFxPtPos[1].X), xFormedFxPtPos[2].X), _mm_set1_epi32(1)), _mm_set1_epi32(SCREENW));

		__m128i startY = _mm_and_si128(Max(Min(Min(xFormedFxPtPos[0].Y, xFormedFxPtPos[1].Y), xFormedFxPtPos[2].Y), _mm_set1_epi32(0)), _mm_set1_epi32(0xFFFFFFFE));
		__m128i endY   = Min(_mm_add_epi32(Max(Max(xFormedFxPtPos[0].Y, xFormedFxPtPos[1].Y), xFormedFxPtPos[2].Y), _mm_set1_epi32(1)), _mm_set1_epi32(SCREENH));

		for(int vv = 0; vv < 3; vv++) 
		{
            // If W (holding 1/w in our case) is not between 0 and 1,
            // then vertex is behind near clip plane (1.0 in our case.
            // If W < 1, then verify 1/W > 1 (for W>0), and 1/W < 0 (for W < 0).
		    __m128 nearClipMask0 = _mm_cmple_ps(xformedPos[vv].W, _mm_set1_ps(0.0f));
		    __m128 nearClipMask1 = _mm_cmpge_ps(xformedPos[vv].W, _mm_set1_ps(1.0f));
            __m128 nearClipMask  = _mm_or_ps(nearClipMask0, nearClipMask1);

			if(!_mm_test_all_zeros(*(__m128i*)&nearClipMask, *(__m128i*)&nearClipMask))
			{
                // All four vertices are behind the near plane (we're processing four triangles at a time w/ SSE)
                *mVisible = true;
                return;
			}
		}

		// Now we have 4 triangles set up.  Rasterize them each individually.
        for(int lane=0; lane < SSE; lane++)
        {
			// Skip triangle if area is zero 
			if(triArea.m128i_i32[lane] <= 0)
			{
				continue;
			}

			// Extract this triangle's properties from the SIMD versions
            __m128 zz[3], oneOverW[3];
			for(int vv = 0; vv < 3; vv++)
			{
				zz[vv] = _mm_set1_ps(xformedPos[vv].Z.m128_f32[lane]);
				oneOverW[vv] = _mm_set1_ps(xformedPos[vv].W.m128_f32[lane]);
			}

			__m128 oneOverTotalArea = _mm_set1_ps(oneOverTriArea.m128_f32[lane]);
			zz[0] *= oneOverTotalArea;
			zz[1] *= oneOverTotalArea;
			zz[2] *= oneOverTotalArea;
			
			int startXx = startX.m128i_i32[lane];
			int endXx	= endX.m128i_i32[lane];
			int startYy = startY.m128i_i32[lane];
			int endYy	= endY.m128i_i32[lane];
		
			__m128i aa0 = _mm_set1_epi32(A0.m128i_i32[lane]);
			__m128i aa1 = _mm_set1_epi32(A1.m128i_i32[lane]);
			__m128i aa2 = _mm_set1_epi32(A2.m128i_i32[lane]);

			__m128i bb0 = _mm_set1_epi32(B0.m128i_i32[lane]);
			__m128i bb1 = _mm_set1_epi32(B1.m128i_i32[lane]);
			__m128i bb2 = _mm_set1_epi32(B2.m128i_i32[lane]);

			__m128i cc0 = _mm_set1_epi32(C0.m128i_i32[lane]);
			__m128i cc1 = _mm_set1_epi32(C1.m128i_i32[lane]);
			__m128i cc2 = _mm_set1_epi32(C2.m128i_i32[lane]);

			__m128i aa0Inc = _mm_slli_epi32(aa0, 1);
			__m128i aa1Inc = _mm_slli_epi32(aa1, 1);
			__m128i aa2Inc = _mm_slli_epi32(aa2, 1);

			__m128i row, col;

			int rowIdx;
			// To avoid this branching, choose one method to traverse and store the pixel depth
			if(gVisualizeDepthBuffer)
			{
				// Sequentially traverse and store pixel depths contiguously
				rowIdx = (startYy * SCREENW + startXx);
			}
			else
			{
				// Tranverse pixels in 2x2 blocks and store 2x2 pixel quad depths contiguously in memory ==> 2*X
				// This method provides better perfromance
				rowIdx = (startYy * SCREENW + 2 * startXx);
			}

			col = _mm_add_epi32(colOffset, _mm_set1_epi32(startXx));
			__m128i aa0Col = _mm_mullo_epi32(aa0, col);
			__m128i aa1Col = _mm_mullo_epi32(aa1, col);
			__m128i aa2Col = _mm_mullo_epi32(aa2, col);

			row = _mm_add_epi32(rowOffset, _mm_set1_epi32(startYy));
			__m128i bb0Row = _mm_add_epi32(_mm_mullo_epi32(bb0, row), cc0);
			__m128i bb1Row = _mm_add_epi32(_mm_mullo_epi32(bb1, row), cc1);
			__m128i bb2Row = _mm_add_epi32(_mm_mullo_epi32(bb2, row), cc2);

			__m128i bb0Inc = _mm_slli_epi32(bb0, 1);
			__m128i bb1Inc = _mm_slli_epi32(bb1, 1);
			__m128i bb2Inc = _mm_slli_epi32(bb2, 1);

			// Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY)
			for(int r = startYy; r < endYy; r += 2,
											row  = _mm_add_epi32(row, _mm_set1_epi32(2)),
											rowIdx = rowIdx + 2 * SCREENW,
											bb0Row = _mm_add_epi32(bb0Row, bb0Inc),
											bb1Row = _mm_add_epi32(bb1Row, bb1Inc),
											bb2Row = _mm_add_epi32(bb2Row, bb2Inc))
			{
				// Compute barycentric coordinates 
				int idx = rowIdx;
				__m128i alpha = _mm_add_epi32(aa0Col, bb0Row);
				__m128i beta = _mm_add_epi32(aa1Col, bb1Row);
				__m128i gama = _mm_add_epi32(aa2Col, bb2Row);

				int idxIncr;
				if(gVisualizeDepthBuffer)
				{ 
					idxIncr = 2;
				}
				else
				{
					idxIncr = 4;
				}

				for(int c = startXx; c < endXx; c += 2,
												idx = idx + idxIncr,
												alpha = _mm_add_epi32(alpha, aa0Inc),
												beta  = _mm_add_epi32(beta, aa1Inc),
												gama  = _mm_add_epi32(gama, aa2Inc))
				{
					//Test Pixel inside triangle
					__m128i mask = _mm_cmplt_epi32(fxptZero, _mm_or_si128(_mm_or_si128(alpha, beta), gama));
					
					// Early out if all of this quad's pixels are outside the triangle.
					if(_mm_test_all_zeros(mask, mask))
					{
						continue;
					}

					// Compute barycentric-interpolated depth
			        __m128 depth = _mm_mul_ps(_mm_cvtepi32_ps(alpha), zz[0]);
					depth = _mm_add_ps(depth, _mm_mul_ps(_mm_cvtepi32_ps(beta), zz[1]));
					depth = _mm_add_ps(depth, _mm_mul_ps(_mm_cvtepi32_ps(gama), zz[2]));

					__m128 previousDepthValue;
					if(gVisualizeDepthBuffer)
					{
						previousDepthValue = _mm_set_ps(pDepthBuffer[idx], pDepthBuffer[idx + 1], pDepthBuffer[idx + SCREENW], pDepthBuffer[idx + SCREENW + 1]);
					}
					else
					{
						previousDepthValue = *(__m128*)&pDepthBuffer[idx];
					}

					__m128 depthMask  = _mm_cmpge_ps( depth, previousDepthValue);
					__m128i finalMask = _mm_and_si128( mask, _mm_castps_si128(depthMask));
					if(!_mm_test_all_zeros(finalMask, finalMask))
					{
						*mVisible = true;
						return; //early exit
					}
				}//for each column											
			}// for each row
		}// for each triangle
	}// for each set of SIMD# triangles
}
Ejemplo n.º 12
0
void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[ 4 ], int len)
{
    int j;

    __m128i vecX, vecX0, vecX1, vecX2, vecX3;
    __m128i vecY0, vecY1, vecY2, vecY3;
    __m128i sum0, sum1, sum2, sum3, vecSum;
    __m128i initSum;

    celt_assert(len >= 3);

    sum0 = _mm_setzero_si128();
    sum1 = _mm_setzero_si128();
    sum2 = _mm_setzero_si128();
    sum3 = _mm_setzero_si128();

    for (j=0;j<(len-7);j+=8)
    {
        vecX = _mm_loadu_si128((__m128i *)(&x[j + 0]));
        vecY0 = _mm_loadu_si128((__m128i *)(&y[j + 0]));
        vecY1 = _mm_loadu_si128((__m128i *)(&y[j + 1]));
        vecY2 = _mm_loadu_si128((__m128i *)(&y[j + 2]));
        vecY3 = _mm_loadu_si128((__m128i *)(&y[j + 3]));

        sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(vecX, vecY0));
        sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(vecX, vecY1));
        sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(vecX, vecY2));
        sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(vecX, vecY3));
    }

    sum0 = _mm_add_epi32(sum0, _mm_unpackhi_epi64( sum0, sum0));
    sum0 = _mm_add_epi32(sum0, _mm_shufflelo_epi16( sum0, 0x0E));

    sum1 = _mm_add_epi32(sum1, _mm_unpackhi_epi64( sum1, sum1));
    sum1 = _mm_add_epi32(sum1, _mm_shufflelo_epi16( sum1, 0x0E));

    sum2 = _mm_add_epi32(sum2, _mm_unpackhi_epi64( sum2, sum2));
    sum2 = _mm_add_epi32(sum2, _mm_shufflelo_epi16( sum2, 0x0E));

    sum3 = _mm_add_epi32(sum3, _mm_unpackhi_epi64( sum3, sum3));
    sum3 = _mm_add_epi32(sum3, _mm_shufflelo_epi16( sum3, 0x0E));

    vecSum = _mm_unpacklo_epi64(_mm_unpacklo_epi32(sum0, sum1),
          _mm_unpacklo_epi32(sum2, sum3));

    for (;j<(len-3);j+=4)
    {
        vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
        vecX0 = _mm_shuffle_epi32(vecX, 0x00);
        vecX1 = _mm_shuffle_epi32(vecX, 0x55);
        vecX2 = _mm_shuffle_epi32(vecX, 0xaa);
        vecX3 = _mm_shuffle_epi32(vecX, 0xff);

        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
        vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
        vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]);
        vecY3 = OP_CVTEPI16_EPI32_M64(&y[j + 3]);

        sum0 = _mm_mullo_epi32(vecX0, vecY0);
        sum1 = _mm_mullo_epi32(vecX1, vecY1);
        sum2 = _mm_mullo_epi32(vecX2, vecY2);
        sum3 = _mm_mullo_epi32(vecX3, vecY3);

        sum0 = _mm_add_epi32(sum0, sum1);
        sum2 = _mm_add_epi32(sum2, sum3);
        vecSum = _mm_add_epi32(vecSum, sum0);
        vecSum = _mm_add_epi32(vecSum, sum2);
    }

    for (;j<len;j++)
    {
        vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
        vecX0 = _mm_shuffle_epi32(vecX, 0x00);

        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);

        sum0 = _mm_mullo_epi32(vecX0, vecY0);
        vecSum = _mm_add_epi32(vecSum, sum0);
    }

    initSum = _mm_loadu_si128((__m128i *)(&sum[0]));
    initSum = _mm_add_epi32(initSum, vecSum);
    _mm_storeu_si128((__m128i *)sum, initSum);
}
Ejemplo n.º 13
0
int main(int argc, char  **argv)
{

  struct timespec  t1, t2; 
  int c, d, k, sum = 0;
  int size, opt, i;
  char *fname; 
  
  while((opt = getopt(argc, argv, "f:s:"))!= -1) {
    switch (opt){
    case 's':
      size = atoi(optarg);
      break;
    case 'f':
      fname = optarg;
      break;
    default:
      size = MEDIUM; 
      break; 
    }
  }
 
  FILE *fp;
  fp = fopen(fname,"a");
  
  int edge;
  int *first; 
  posix_memalign((void**)&first,16,sizeof(int)*size*size);  //use posix_memalign to get 16byte alignment 
  int *multiply; 
  posix_memalign((void**)&multiply,16,sizeof(int)*size*size);  
  __m128i m1, m2,m3; 
 
  for (  c = 0 ; c < size ; c++ )
  	for ( d = 0 ; d < size ; d++ )    	
      first[c*size+d] = ((c+d) % 2) - 1;
  multiply[c*size+d] = 0;
      
	
  printf("multiplying the %d-size matrices\n  You should try to time this part.\n",size);
	
  clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t1);

	for ( c = 0 ; c < size ; c++ )
	{
		for ( k = 0 ; k < size ; k++ )
	    {
	    	m2 = _mm_set1_epi32(first[c*size+k]); //first[c][k]
			for (d = 0 ; d < size ; d+=4) 
			{
				edge = size - d; 
				if (edge < 4){	//account for non-div by 4 matrices
					for (i = d; i < size; i++)
						multiply[c*size+i] += first[c*size+k]*first[k*size+i];
				}
				else{ 
				  m1 = _mm_loadu_si128(&first[k*size+d]);  //first[k][d]
				  m1 = _mm_mullo_epi32(m1,m2); // first[k][d] * first[c][k]
				  m3 = _mm_loadu_si128(&multiply[c*size+d]);//load up old values of multiply[c][d] 
				  
				  m1 = _mm_add_epi32(m3,m1);  //[+= to mult]
			
				  _mm_storeu_si128(&multiply[c*size+d],m1);
			  	}
			  	
	      		} 
	    	}
		}
	
		
  clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t2);

  double nanos = (diff(t1,t2).tv_nsec) * pow(10,-9);
  double secs = (diff(t1,t2).tv_sec);
  double dif = secs + nanos;
  
  fprintf(fp,"%.10f\n", dif); 
  
  fclose(fp);

  printf("test first %d\n",first[size]);
  printf("test mult %d\n",multiply[size]);
  

  free(first);   //free SSE aligned array with _aligned_free
  free(multiply);
  return 0;
}
Ejemplo n.º 14
0
static bool ethash_hash(
	ethash_return_value_t* ret,
	node const* full_nodes,
	ethash_light_t const light,
	uint64_t full_size,
	ethash_h256_t const header_hash,
	uint64_t const nonce
)
{
	if (full_size % MIX_WORDS != 0) {
		return false;
	}

	// pack hash and nonce together into first 40 bytes of s_mix
	assert(sizeof(node) * 8 == 512);
	node s_mix[MIX_NODES + 1];
	memcpy(s_mix[0].bytes, &header_hash, 32);
	fix_endian64(s_mix[0].double_words[4], nonce);

	// compute sha3-512 hash and replicate across mix
	SHA3_512(s_mix->bytes, s_mix->bytes, 40);
	fix_endian_arr32(s_mix[0].words, 16);

	node* const mix = s_mix + 1;
	for (uint32_t w = 0; w != MIX_WORDS; ++w) {
		mix->words[w] = s_mix[0].words[w % NODE_WORDS];
	}

	unsigned const page_size = sizeof(uint32_t) * MIX_WORDS;
	unsigned const num_full_pages = (unsigned) (full_size / page_size);

	for (unsigned i = 0; i != ETHASH_ACCESSES; ++i) {
		uint32_t const index = fnv_hash(s_mix->words[0] ^ i, mix->words[i % MIX_WORDS]) % num_full_pages;

		for (unsigned n = 0; n != MIX_NODES; ++n) {
			node const* dag_node;
			if (full_nodes) {
				dag_node = &full_nodes[MIX_NODES * index + n];
			} else {
				node tmp_node;
				ethash_calculate_dag_item(&tmp_node, index * MIX_NODES + n, light);
				dag_node = &tmp_node;
			}

#if defined(_M_X64) && ENABLE_SSE
			{
				__m128i fnv_prime = _mm_set1_epi32(FNV_PRIME);
				__m128i xmm0 = _mm_mullo_epi32(fnv_prime, mix[n].xmm[0]);
				__m128i xmm1 = _mm_mullo_epi32(fnv_prime, mix[n].xmm[1]);
				__m128i xmm2 = _mm_mullo_epi32(fnv_prime, mix[n].xmm[2]);
				__m128i xmm3 = _mm_mullo_epi32(fnv_prime, mix[n].xmm[3]);
				mix[n].xmm[0] = _mm_xor_si128(xmm0, dag_node->xmm[0]);
				mix[n].xmm[1] = _mm_xor_si128(xmm1, dag_node->xmm[1]);
				mix[n].xmm[2] = _mm_xor_si128(xmm2, dag_node->xmm[2]);
				mix[n].xmm[3] = _mm_xor_si128(xmm3, dag_node->xmm[3]);
			}
			#else
			{
				for (unsigned w = 0; w != NODE_WORDS; ++w) {
					mix[n].words[w] = fnv_hash(mix[n].words[w], dag_node->words[w]);
				}
			}
#endif
		}

	}

	// compress mix
	for (uint32_t w = 0; w != MIX_WORDS; w += 4) {
		uint32_t reduction = mix->words[w + 0];
		reduction = reduction * FNV_PRIME ^ mix->words[w + 1];
		reduction = reduction * FNV_PRIME ^ mix->words[w + 2];
		reduction = reduction * FNV_PRIME ^ mix->words[w + 3];
		mix->words[w / 4] = reduction;
	}

	fix_endian_arr32(mix->words, MIX_WORDS / 4);
	memcpy(&ret->mix_hash, mix->bytes, 32);
	// final Keccak hash
	SHA3_256(&ret->result, s_mix->bytes, 64 + 32); // Keccak-256(s + compressed_mix)
	return true;
}
//-------------------------------------------------------------------------------
// For each tile go through all the bins and process all the triangles in it.
// Rasterize each triangle to the CPU depth buffer. 
//-------------------------------------------------------------------------------
void DepthBufferRasterizerSSEST::RasterizeBinnedTrianglesToDepthBuffer(UINT tileId, UINT idx)
{
	// Set DAZ and FZ MXCSR bits to flush denormals to zero (i.e., make it faster)
	_mm_setcsr( _mm_getcsr() | 0x8040 );

	__m128i colOffset = _mm_setr_epi32(0, 1, 0, 1);
	__m128i rowOffset = _mm_setr_epi32(0, 0, 1, 1);

	__m128i fxptZero = _mm_setzero_si128();
	float* pDepthBuffer = (float*)mpRenderTargetPixels[idx]; 

	// Based on TaskId determine which tile to process
	UINT screenWidthInTiles = SCREENW/TILE_WIDTH_IN_PIXELS;
    UINT tileX = tileId % screenWidthInTiles;
    UINT tileY = tileId / screenWidthInTiles;

    int tileStartX = tileX * TILE_WIDTH_IN_PIXELS;
	int tileEndX   = tileStartX + TILE_WIDTH_IN_PIXELS - 1;
	
	int tileStartY = tileY * TILE_HEIGHT_IN_PIXELS;
	int tileEndY   = tileStartY + TILE_HEIGHT_IN_PIXELS - 1;

	ClearDepthTile(tileStartX, tileStartY, tileEndX+1, tileEndY+1, idx);

	UINT bin = 0;
	UINT binIndex = 0;
	UINT offset1 = YOFFSET1_ST * tileY + XOFFSET1_ST * tileX;
	UINT offset2 = YOFFSET2_ST * tileY + XOFFSET2_ST * tileX;
	UINT numTrisInBin = mpNumTrisInBin[idx][offset1 + bin];

	__m128 gatherBuf[4][3];
	bool done = false;
	bool allBinsEmpty = true;
	mNumRasterizedTris[idx][tileId] = numTrisInBin;
	while(!done)
	{
		// Loop through all the bins and process 4 binned traingles at a time
		UINT ii;
		int numSimdTris = 0;
		for(ii = 0; ii < SSE; ii++)
		{
			while(numTrisInBin <= 0)
			{
				 // This bin is empty.  Move to next bin.
				if(++bin >= 1)
				{
					break;
				}
				numTrisInBin = mpNumTrisInBin[idx][offset1 + bin];
				mNumRasterizedTris[idx][tileId] += numTrisInBin;
				binIndex = 0;
			}
			if(!numTrisInBin)
			{
				break; // No more tris in the bins
			}
			USHORT modelId = mpBinModel[idx][offset2 + bin * MAX_TRIS_IN_BIN_ST + binIndex];
			USHORT meshId = mpBinMesh[idx][offset2 + bin * MAX_TRIS_IN_BIN_ST + binIndex];
			UINT triIdx = mpBin[idx][offset2 + bin * MAX_TRIS_IN_BIN_ST + binIndex];
			mpTransformedModels1[modelId].Gather(gatherBuf[ii], meshId, triIdx, idx);
			allBinsEmpty = false;
			numSimdTris++; 

			++binIndex;
			--numTrisInBin;
		}
		done = bin >= NUM_XFORMVERTS_TASKS;
		
		if(allBinsEmpty)
		{
			return;
		}

		// use fixed-point only for X and Y.  Avoid work for Z and W.
        __m128i fxPtX[3], fxPtY[3];
		__m128 Z[3];
		for(int i = 0; i < 3; i++)
		{
			__m128 v0 = gatherBuf[0][i];
			__m128 v1 = gatherBuf[1][i];
			__m128 v2 = gatherBuf[2][i];
			__m128 v3 = gatherBuf[3][i];

  			// transpose into SoA layout
			_MM_TRANSPOSE4_PS(v0, v1, v2, v3);
			fxPtX[i] = _mm_cvtps_epi32(v0);
			fxPtY[i] = _mm_cvtps_epi32(v1);
			Z[i] = v2;
		}

		// Fab(x, y) =     Ax       +       By     +      C              = 0
		// Fab(x, y) = (ya - yb)x   +   (xb - xa)y + (xa * yb - xb * ya) = 0
		// Compute A = (ya - yb) for the 3 line segments that make up each triangle
		__m128i A0 = _mm_sub_epi32(fxPtY[1], fxPtY[2]);
		__m128i A1 = _mm_sub_epi32(fxPtY[2], fxPtY[0]);
		__m128i A2 = _mm_sub_epi32(fxPtY[0], fxPtY[1]);

		// Compute B = (xb - xa) for the 3 line segments that make up each triangle
		__m128i B0 = _mm_sub_epi32(fxPtX[2], fxPtX[1]);
		__m128i B1 = _mm_sub_epi32(fxPtX[0], fxPtX[2]);
		__m128i B2 = _mm_sub_epi32(fxPtX[1], fxPtX[0]);

		// Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle
		__m128i C0 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[1], fxPtY[2]), _mm_mullo_epi32(fxPtX[2], fxPtY[1]));
		__m128i C1 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[2], fxPtY[0]), _mm_mullo_epi32(fxPtX[0], fxPtY[2]));
		__m128i C2 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[0], fxPtY[1]), _mm_mullo_epi32(fxPtX[1], fxPtY[0]));

		// Compute triangle area
		__m128i triArea = _mm_mullo_epi32(B2, A1);
		triArea = _mm_sub_epi32(triArea, _mm_mullo_epi32(B1, A2));
		__m128 oneOverTriArea = _mm_div_ps(_mm_set1_ps(1.0f), _mm_cvtepi32_ps(triArea));

		Z[1] = _mm_mul_ps(_mm_sub_ps(Z[1], Z[0]), oneOverTriArea);
		Z[2] = _mm_mul_ps(_mm_sub_ps(Z[2], Z[0]), oneOverTriArea);

		// Use bounding box traversal strategy to determine which pixels to rasterize 
		__m128i startX = _mm_and_si128(Max(Min(Min(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(tileStartX)), _mm_set1_epi32(0xFFFFFFFE));
		__m128i endX   = Min(_mm_add_epi32(Max(Max(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(1)), _mm_set1_epi32(tileEndX));

		__m128i startY = _mm_and_si128(Max(Min(Min(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(tileStartY)), _mm_set1_epi32(0xFFFFFFFE));
		__m128i endY   = Min(_mm_add_epi32(Max(Max(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(1)), _mm_set1_epi32(tileEndY));

		// Now we have 4 triangles set up.  Rasterize them each individually.
        for(int lane=0; lane < numSimdTris; lane++)
        {
			// Extract this triangle's properties from the SIMD versions
            __m128 zz[3];
			for(int vv = 0; vv < 3; vv++)
			{
				zz[vv] = _mm_set1_ps(Z[vv].m128_f32[lane]);
			}
									
			int startXx = startX.m128i_i32[lane];
			int endXx	= endX.m128i_i32[lane];
			int startYy = startY.m128i_i32[lane];
			int endYy	= endY.m128i_i32[lane];
		
			__m128i aa0 = _mm_set1_epi32(A0.m128i_i32[lane]);
			__m128i aa1 = _mm_set1_epi32(A1.m128i_i32[lane]);
			__m128i aa2 = _mm_set1_epi32(A2.m128i_i32[lane]);

			__m128i bb0 = _mm_set1_epi32(B0.m128i_i32[lane]);
			__m128i bb1 = _mm_set1_epi32(B1.m128i_i32[lane]);
			__m128i bb2 = _mm_set1_epi32(B2.m128i_i32[lane]);

			__m128i aa0Inc = _mm_slli_epi32(aa0, 1);
			__m128i aa1Inc = _mm_slli_epi32(aa1, 1);
			__m128i aa2Inc = _mm_slli_epi32(aa2, 1);

			__m128i row, col;

			// Tranverse pixels in 2x2 blocks and store 2x2 pixel quad depths contiguously in memory ==> 2*X
			// This method provides better perfromance
			int rowIdx = (startYy * SCREENW + 2 * startXx);

			col = _mm_add_epi32(colOffset, _mm_set1_epi32(startXx));
			__m128i aa0Col = _mm_mullo_epi32(aa0, col);
			__m128i aa1Col = _mm_mullo_epi32(aa1, col);
			__m128i aa2Col = _mm_mullo_epi32(aa2, col);

			row = _mm_add_epi32(rowOffset, _mm_set1_epi32(startYy));
			__m128i bb0Row = _mm_add_epi32(_mm_mullo_epi32(bb0, row), _mm_set1_epi32(C0.m128i_i32[lane]));
			__m128i bb1Row = _mm_add_epi32(_mm_mullo_epi32(bb1, row), _mm_set1_epi32(C1.m128i_i32[lane]));
			__m128i bb2Row = _mm_add_epi32(_mm_mullo_epi32(bb2, row), _mm_set1_epi32(C2.m128i_i32[lane]));

			__m128i sum0Row = _mm_add_epi32(aa0Col, bb0Row);
			__m128i sum1Row = _mm_add_epi32(aa1Col, bb1Row);
			__m128i sum2Row = _mm_add_epi32(aa2Col, bb2Row);

			__m128i bb0Inc = _mm_slli_epi32(bb0, 1);
			__m128i bb1Inc = _mm_slli_epi32(bb1, 1);
			__m128i bb2Inc = _mm_slli_epi32(bb2, 1);

			__m128 zx = _mm_mul_ps(_mm_cvtepi32_ps(aa1Inc), zz[1]);
			zx = _mm_add_ps(zx, _mm_mul_ps(_mm_cvtepi32_ps(aa2Inc), zz[2]));

			// Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY)
			for(int r = startYy; r < endYy; r += 2,
											rowIdx += 2 * SCREENW,
											sum0Row = _mm_add_epi32(sum0Row, bb0Inc),
											sum1Row = _mm_add_epi32(sum1Row, bb1Inc),
											sum2Row = _mm_add_epi32(sum2Row, bb2Inc))
			{
				// Compute barycentric coordinates 
				int index = rowIdx;
				__m128i alpha = sum0Row;
				__m128i beta = sum1Row;
				__m128i gama = sum2Row;

				//Compute barycentric-interpolated depth
				__m128 depth = zz[0];
				depth = _mm_add_ps(depth, _mm_mul_ps(_mm_cvtepi32_ps(beta), zz[1]));
				depth = _mm_add_ps(depth, _mm_mul_ps(_mm_cvtepi32_ps(gama), zz[2]));

				for(int c = startXx; c < endXx; c += 2,
												index += 4,
												alpha = _mm_add_epi32(alpha, aa0Inc),
												beta  = _mm_add_epi32(beta, aa1Inc),
												gama  = _mm_add_epi32(gama, aa2Inc),
												depth = _mm_add_ps(depth, zx))
				{
					//Test Pixel inside triangle
					__m128i mask = _mm_or_si128(_mm_or_si128(alpha, beta), gama);
					
					__m128 previousDepthValue = _mm_load_ps(&pDepthBuffer[index]);
					__m128 mergedDepth = _mm_max_ps(depth, previousDepthValue);
					__m128 finalDepth = _mm_blendv_ps(mergedDepth, previousDepthValue, _mm_castsi128_ps(mask));
					_mm_store_ps(&pDepthBuffer[index], finalDepth);
				}//for each column											
			}// for each row
		}// for each triangle
	}// for each set of SIMD# triangles
}
//-----------------------------------------------------------------------------------------
// Rasterize the occludee AABB and depth test it against the CPU rasterized depth buffer
// If any of the rasterized AABB pixels passes the depth test exit early and mark the occludee
// as visible. If all rasterized AABB pixels are occluded then the occludee is culled
//-----------------------------------------------------------------------------------------
bool TransformedAABBoxAVX::RasterizeAndDepthTestAABBox(UINT *pRenderTargetPixels, const __m128 pXformedPos[], UINT idx)
{
	// Set DAZ and FZ MXCSR bits to flush denormals to zero (i.e., make it faster)
	// Denormal are zero (DAZ) is bit 6 and Flush to zero (FZ) is bit 15. 
	// so to enable the two to have to set bits 6 and 15 which 1000 0000 0100 0000 = 0x8040
	_mm_setcsr( _mm_getcsr() | 0x8040 );

	__m256i colOffset = _mm256_setr_epi32(0, 1, 2, 3, 0, 1, 2, 3);
	__m256i rowOffset = _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1);
	float* pDepthBuffer = (float*)pRenderTargetPixels;
	
	// Rasterize the AABB triangles 4 at a time
	for(UINT i = 0; i < AABB_TRIANGLES; i += SSE)
	{
		vFloat4 xformedPos[3];
		Gather(xformedPos, i, pXformedPos, idx);

		// use fixed-point only for X and Y.  Avoid work for Z and W.
        __m128i fxPtX[3], fxPtY[3];
		for(int m = 0; m < 3; m++)
		{
			fxPtX[m] = _mm_cvtps_epi32(xformedPos[m].X);
			fxPtY[m] = _mm_cvtps_epi32(xformedPos[m].Y);
		}

		// Fab(x, y) =     Ax       +       By     +      C              = 0
		// Fab(x, y) = (ya - yb)x   +   (xb - xa)y + (xa * yb - xb * ya) = 0
		// Compute A = (ya - yb) for the 3 line segments that make up each triangle
		__m128i A0 = _mm_sub_epi32(fxPtY[1], fxPtY[2]);
		__m128i A1 = _mm_sub_epi32(fxPtY[2], fxPtY[0]);
		__m128i A2 = _mm_sub_epi32(fxPtY[0], fxPtY[1]);

		// Compute B = (xb - xa) for the 3 line segments that make up each triangle
		__m128i B0 = _mm_sub_epi32(fxPtX[2], fxPtX[1]);
		__m128i B1 = _mm_sub_epi32(fxPtX[0], fxPtX[2]);
		__m128i B2 = _mm_sub_epi32(fxPtX[1], fxPtX[0]);

		// Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle
		__m128i C0 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[1], fxPtY[2]), _mm_mullo_epi32(fxPtX[2], fxPtY[1]));
		__m128i C1 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[2], fxPtY[0]), _mm_mullo_epi32(fxPtX[0], fxPtY[2]));
		__m128i C2 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[0], fxPtY[1]), _mm_mullo_epi32(fxPtX[1], fxPtY[0]));

		// Compute triangle area
		__m128i triArea = _mm_mullo_epi32(B2, A1);
		triArea = _mm_sub_epi32(triArea, _mm_mullo_epi32(B1, A2));
		__m128 oneOverTriArea = _mm_rcp_ps(_mm_cvtepi32_ps(triArea));

		__m128 Z[3];
		Z[0] = xformedPos[0].Z;
		Z[1] = _mm_mul_ps(_mm_sub_ps(xformedPos[1].Z, Z[0]), oneOverTriArea);
		Z[2] = _mm_mul_ps(_mm_sub_ps(xformedPos[2].Z, Z[0]), oneOverTriArea);
		
		// Use bounding box traversal strategy to determine which pixels to rasterize 
		//__m128i startX = _mm_and_si128(HelperSSE::Max(HelperSSE::Min(HelperSSE::Min(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(0)), _mm_set1_epi32(~1));
		__m128i startX = _mm_and_si128(HelperSSE::Max(HelperSSE::Min(HelperSSE::Min(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(0)), _mm_set1_epi32(~3));
		__m128i endX = HelperSSE::Min(HelperSSE::Max(HelperSSE::Max(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(SCREENW - 1));

		__m128i startY = _mm_and_si128(HelperSSE::Max(HelperSSE::Min(HelperSSE::Min(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(0)), _mm_set1_epi32(~1));
		__m128i endY = HelperSSE::Min(HelperSSE::Max(HelperSSE::Max(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(SCREENH - 1));

		// Now we have 4 triangles set up.  Rasterize them each individually.
        for(int lane=0; lane < SSE; lane++)
        {
			// Skip triangle if area is zero 
			if(triArea.m128i_i32[lane] <= 0)
			{
				continue;
			}

			// Extract this triangle's properties from the SIMD versions
			__m256 zz[3];
			for (int vv = 0; vv < 3; vv++)
			{
				zz[vv] = _mm256_set1_ps(Z[vv].m128_f32[lane]);
			}

			int startXx = startX.m128i_i32[lane];
			int endXx = endX.m128i_i32[lane];
			int startYy = startY.m128i_i32[lane];
			int endYy = endY.m128i_i32[lane];

			__m256i aa0 = _mm256_set1_epi32(A0.m128i_i32[lane]);
			__m256i aa1 = _mm256_set1_epi32(A1.m128i_i32[lane]);
			__m256i aa2 = _mm256_set1_epi32(A2.m128i_i32[lane]);

			__m256i bb0 = _mm256_set1_epi32(B0.m128i_i32[lane]);
			__m256i bb1 = _mm256_set1_epi32(B1.m128i_i32[lane]);
			__m256i bb2 = _mm256_set1_epi32(B2.m128i_i32[lane]);

			__m256i aa0Inc = _mm256_slli_epi32(aa0, 2);
			__m256i aa1Inc = _mm256_slli_epi32(aa1, 2);
			__m256i aa2Inc = _mm256_slli_epi32(aa2, 2);

			__m256i bb0Inc = _mm256_slli_epi32(bb0, 1);
			__m256i bb1Inc = _mm256_slli_epi32(bb1, 1);
			__m256i bb2Inc = _mm256_slli_epi32(bb2, 1);

			__m256i row, col;

			// Traverse pixels in 2x4 blocks and store 2x4 pixel quad depths contiguously in memory ==> 2*X
			// This method provides better performance
			int	rowIdx = (startYy * SCREENW + 2 * startXx);

			col = _mm256_add_epi32(colOffset, _mm256_set1_epi32(startXx));
			__m256i aa0Col = _mm256_mullo_epi32(aa0, col);
			__m256i aa1Col = _mm256_mullo_epi32(aa1, col);
			__m256i aa2Col = _mm256_mullo_epi32(aa2, col);

			row = _mm256_add_epi32(rowOffset, _mm256_set1_epi32(startYy));
			__m256i bb0Row = _mm256_add_epi32(_mm256_mullo_epi32(bb0, row), _mm256_set1_epi32(C0.m128i_i32[lane]));
			__m256i bb1Row = _mm256_add_epi32(_mm256_mullo_epi32(bb1, row), _mm256_set1_epi32(C1.m128i_i32[lane]));
			__m256i bb2Row = _mm256_add_epi32(_mm256_mullo_epi32(bb2, row), _mm256_set1_epi32(C2.m128i_i32[lane]));

			__m256i sum0Row = _mm256_add_epi32(aa0Col, bb0Row);
			__m256i sum1Row = _mm256_add_epi32(aa1Col, bb1Row);
			__m256i sum2Row = _mm256_add_epi32(aa2Col, bb2Row);

			__m256 zx = _mm256_mul_ps(_mm256_cvtepi32_ps(aa1Inc), zz[1]);
			zx = _mm256_add_ps(zx, _mm256_mul_ps(_mm256_cvtepi32_ps(aa2Inc), zz[2]));

			// Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY)
			for (int r = startYy; r < endYy; r += 2,
				rowIdx += 2 * SCREENW,
				sum0Row = _mm256_add_epi32(sum0Row, bb0Inc),
				sum1Row = _mm256_add_epi32(sum1Row, bb1Inc),
				sum2Row = _mm256_add_epi32(sum2Row, bb2Inc))
			{
				// Compute barycentric coordinates 
				int index = rowIdx;
				__m256i alpha = sum0Row;
				__m256i beta = sum1Row;
				__m256i gama = sum2Row;

				//Compute barycentric-interpolated depth
				__m256 depth = zz[0];
				depth = _mm256_add_ps(depth, _mm256_mul_ps(_mm256_cvtepi32_ps(beta), zz[1]));
				depth = _mm256_add_ps(depth, _mm256_mul_ps(_mm256_cvtepi32_ps(gama), zz[2]));
				__m256i anyOut = _mm256_setzero_si256();

				for (int c = startXx; c < endXx; c += 4,
					index += 8,
					alpha = _mm256_add_epi32(alpha, aa0Inc),
					beta = _mm256_add_epi32(beta, aa1Inc),
					gama = _mm256_add_epi32(gama, aa2Inc),
					depth = _mm256_add_ps(depth, zx))
				{
					//Test Pixel inside triangle
					__m256i mask = _mm256_or_si256(_mm256_or_si256(alpha, beta), gama);

					__m256 previousDepthValue = _mm256_loadu_ps(&pDepthBuffer[index]);
					__m256 depthMask = _mm256_cmp_ps(depth, previousDepthValue, 0x1D);
					__m256i finalMask = _mm256_andnot_si256(mask, _mm256_castps_si256(depthMask));
					anyOut = _mm256_or_si256(anyOut, finalMask);
				}//for each column	

				if (!_mm256_testz_si256(anyOut, _mm256_set1_epi32(0x80000000)))
				{
					return true; //early exit
				}
			}// for each row
		}// for each triangle
	}// for each set of SIMD# triangles

	return false;
}
Ejemplo n.º 17
0
__m128i test_mm_mullo_epi32(__m128i x, __m128i y) {
  // CHECK-LABEL: test_mm_mullo_epi32
  // CHECK: mul <4 x i32>
  // CHECK-ASM: pmulld %xmm{{.*}}, %xmm{{.*}}
  return _mm_mullo_epi32(x, y);
}
Ejemplo n.º 18
0
template<class T> inline void dequantise_sse4_2_16_8_3(QuantisationMatrix *qmatrix,
                                                        int32_t *idata,
                                                        void *_odata,
                                                        int ostride) {
  T *odata = (T *)_odata;
  const int slice_width  = 16;
  const int slice_height = 8;

  const int Y = 0;
  const int X = 0;
  const int N = 0;

  T * const optr = &odata[Y*slice_height*ostride + X*slice_width];
  const int32_t * iptr = &idata[N*slice_height*slice_width];

  {
    __m128i D0;
    {
      D0 = _mm_load_si128((__m128i *)&iptr[ 0]); // [  0  1  2  3 ] (Q)
      __m128i QF = _mm_unpacklo_epi64(_mm_load_si128((__m128i *)&qmatrix->qfactor[0][0]),
                                      _mm_load_si128((__m128i *)&qmatrix->qfactor[1][1]));
      __m128i QO = _mm_unpacklo_epi64(_mm_load_si128((__m128i *)&qmatrix->qoffset[0][0]),
                                      _mm_load_si128((__m128i *)&qmatrix->qoffset[1][1]));

      __m128i X  = _mm_abs_epi32(D0);
      X = _mm_mullo_epi32(X, QF);
      X = _mm_add_epi32(X, QO);
      X = _mm_srai_epi32(X, 2);
      D0 = _mm_sign_epi32(X, D0);

      D0 = _mm_shuffle_epi32(D0, 0xD8);
    }

    const __m128i D1 = LOAD_QUANTISED(&iptr[8], qmatrix, 2, 1);

    const __m128i D2 = LOAD_QUANTISED(&iptr[32], qmatrix, 3, 1);
    const __m128i D3 = LOAD_QUANTISED(&iptr[36], qmatrix, 3, 1);

    const __m128i A0  = _mm_unpacklo_epi32(D0, D1);
    const __m128i A1  = _mm_unpackhi_epi32(D0, D1);

    const __m128i B0  = _mm_unpacklo_epi32(A0, D2);
    const __m128i B1  = _mm_unpackhi_epi32(A0, D2);
    const __m128i B2  = _mm_unpacklo_epi32(A1, D3);
    const __m128i B3  = _mm_unpackhi_epi32(A1, D3);

    STORE_SAMPLE_PAIR<T>((__m128i *)&optr[0*ostride +  0], B0, B1);
    STORE_SAMPLE_PAIR<T>((__m128i *)&optr[0*ostride +  8], B2, B3);
  }

  {
    __m128i D0;
    {
      D0 = _mm_load_si128((__m128i *)&iptr[ 4]);
      __m128i QF = _mm_unpacklo_epi64(_mm_load_si128((__m128i *)&qmatrix->qfactor[1][2]),
                                      _mm_load_si128((__m128i *)&qmatrix->qfactor[1][3]));
      __m128i QO = _mm_unpacklo_epi64(_mm_load_si128((__m128i *)&qmatrix->qoffset[1][2]),
                                      _mm_load_si128((__m128i *)&qmatrix->qoffset[1][3]));

      __m128i X  = _mm_abs_epi32(D0);
      X = _mm_mullo_epi32(X, QF);
      X = _mm_add_epi32(X, QO);
      X = _mm_srai_epi32(X, 2);
      D0 = _mm_sign_epi32(X, D0);

      D0 = _mm_shuffle_epi32(D0, 0xD8);
    }

    const __m128i D1 = LOAD_QUANTISED(&iptr[12], qmatrix, 2, 1);

    const __m128i D2 = LOAD_QUANTISED(&iptr[48], qmatrix, 3, 1);
    const __m128i D3 = LOAD_QUANTISED(&iptr[52], qmatrix, 3, 1);

    const __m128i A0  = _mm_unpacklo_epi32(D0, D1);
    const __m128i A1  = _mm_unpackhi_epi32(D0, D1);

    const __m128i B0  = _mm_unpacklo_epi32(A0, D2);
    const __m128i B1  = _mm_unpackhi_epi32(A0, D2);
    const __m128i B2  = _mm_unpacklo_epi32(A1, D3);
    const __m128i B3  = _mm_unpackhi_epi32(A1, D3);

    STORE_SAMPLE_PAIR<T>((__m128i *)&optr[4*ostride +  0], B0, B1);
    STORE_SAMPLE_PAIR<T>((__m128i *)&optr[4*ostride +  8], B2, B3);
  }

  {
    const __m128i D0 = LOAD_QUANTISED(&iptr[16], qmatrix, 2, 2);

    const __m128i D1 = LOAD_QUANTISED(&iptr[24], qmatrix, 2, 3);

    const __m128i D2 = LOAD_QUANTISED(&iptr[40], qmatrix, 3, 1);
    const __m128i D3 = LOAD_QUANTISED(&iptr[44], qmatrix, 3, 1);

    const __m128i A0  = _mm_unpacklo_epi32(D0, D1);
    const __m128i A1  = _mm_unpackhi_epi32(D0, D1);

    const __m128i B0  = _mm_unpacklo_epi32(A0, D2);
    const __m128i B1  = _mm_unpackhi_epi32(A0, D2);
    const __m128i B2  = _mm_unpacklo_epi32(A1, D3);
    const __m128i B3  = _mm_unpackhi_epi32(A1, D3);

    STORE_SAMPLE_PAIR<T>((__m128i *)&optr[2*ostride +  0], B0, B1);
    STORE_SAMPLE_PAIR<T>((__m128i *)&optr[2*ostride +  8], B2, B3);
  }

  {
    const __m128i D0 = LOAD_QUANTISED(&iptr[20], qmatrix, 2, 2);

    const __m128i D1 = LOAD_QUANTISED(&iptr[28], qmatrix, 2, 3);

    const __m128i D2 = LOAD_QUANTISED(&iptr[56], qmatrix, 3, 1);
    const __m128i D3 = LOAD_QUANTISED(&iptr[60], qmatrix, 3, 1);

    const __m128i A0  = _mm_unpacklo_epi32(D0, D1);
    const __m128i A1  = _mm_unpackhi_epi32(D0, D1);

    const __m128i B0  = _mm_unpacklo_epi32(A0, D2);
    const __m128i B1  = _mm_unpackhi_epi32(A0, D2);
    const __m128i B2  = _mm_unpacklo_epi32(A1, D3);
    const __m128i B3  = _mm_unpackhi_epi32(A1, D3);

    STORE_SAMPLE_PAIR<T>((__m128i *)&optr[6*ostride +  0], B0, B1);
    STORE_SAMPLE_PAIR<T>((__m128i *)&optr[6*ostride +  8], B2, B3);
  }

  for (int y = 0; y < 4; y++) {
    const __m128i D0 = LOAD_QUANTISED(&iptr[ 64 + y*8], qmatrix, 3, 2);
    const __m128i D1 = LOAD_QUANTISED(&iptr[ 68 + y*8], qmatrix, 3, 2);

    const __m128i D2 = LOAD_QUANTISED(&iptr[ 96 + y*8], qmatrix, 3, 3);
    const __m128i D3 = LOAD_QUANTISED(&iptr[100 + y*8], qmatrix, 3, 3);

    const __m128i A0  = _mm_unpacklo_epi32(D0, D2);
    const __m128i A1  = _mm_unpackhi_epi32(D0, D2);
    const __m128i A2  = _mm_unpacklo_epi32(D1, D3);
    const __m128i A3  = _mm_unpackhi_epi32(D1, D3);

    STORE_SAMPLE_PAIR<T>((__m128i *)&optr[(2*y + 1)*ostride +  0], A0, A1);
    STORE_SAMPLE_PAIR<T>((__m128i *)&optr[(2*y + 1)*ostride +  8], A2, A3);
  }
}
Ejemplo n.º 19
0
template<> void dequantise_sse4_2<4,8,2, int32_t>(QuantisationMatrix *qmatrix,
                                  int32_t *idata,
                                  void *_odata,
                                  int ostride,
                                  int, int, int) {
  int32_t *odata = (int32_t *)_odata;
  const int slice_width  = 4;
  const int slice_height = 8;
  const int Y = 0;
  const int X = 0;
  const int N = 0;

  int32_t * const optr = &odata[Y*slice_height*ostride + X*slice_width];
  const int32_t * iptr = &idata[N*slice_height*slice_width];

  __m128i D0;
  {
    D0 = _mm_load_si128((__m128i *)&iptr[ 0]); // [  0  1  2  3 ] (Q)
    __m128i QF = _mm_unpacklo_epi64(_mm_load_si128((__m128i *)&qmatrix->qfactor[0][0]),
                                    _mm_load_si128((__m128i *)&qmatrix->qfactor[1][1]));
    __m128i QO = _mm_unpacklo_epi64(_mm_load_si128((__m128i *)&qmatrix->qoffset[0][0]),
                                    _mm_load_si128((__m128i *)&qmatrix->qoffset[1][1]));

    __m128i X  = _mm_abs_epi32(D0);
    X = _mm_mullo_epi32(X, QF);
    X = _mm_add_epi32(X, QO);
    X = _mm_srai_epi32(X, 2);
    D0 = _mm_sign_epi32(X, D0);
  }
  __m128i D4;
  {
    D4 = _mm_load_si128((__m128i *)&iptr[ 4]); // [  4  5  6  7 ] (Q)
    __m128i QF = _mm_unpacklo_epi64(_mm_load_si128((__m128i *)&qmatrix->qfactor[1][2]),
                                    _mm_load_si128((__m128i *)&qmatrix->qfactor[1][3]));
    __m128i QO = _mm_unpacklo_epi64(_mm_load_si128((__m128i *)&qmatrix->qoffset[1][2]),
                                    _mm_load_si128((__m128i *)&qmatrix->qoffset[1][3]));
    __m128i X  = _mm_abs_epi32(D4);
    X = _mm_mullo_epi32(X, QF);
    X = _mm_add_epi32(X, QO);
    X = _mm_srai_epi32(X, 2);
    D4 = _mm_sign_epi32(X, D4);
  }

  const __m128i D8  = LOAD_QUANTISED(&iptr[ 8], qmatrix, 2, 1); // [  8  9 10 11 ]
  const __m128i D12 = LOAD_QUANTISED(&iptr[12], qmatrix, 2, 1); // [ 12 13 14 15 ]
  const __m128i D16 = LOAD_QUANTISED(&iptr[16], qmatrix, 2, 2); // [ 16 17 18 19 ]
  const __m128i D20 = LOAD_QUANTISED(&iptr[20], qmatrix, 2, 2); // [ 20 21 22 23 ]
  const __m128i D24 = LOAD_QUANTISED(&iptr[24], qmatrix, 2, 3); // [ 24 25 26 27 ]
  const __m128i D28 = LOAD_QUANTISED(&iptr[28], qmatrix, 2, 3); // [ 28 29 30 31 ]

  const __m128i X0  = _mm_unpacklo_epi32(D0,  D4); // [  0  4  1  5 ]
  const __m128i X1  = _mm_unpackhi_epi32(D0,  D4); // [  2  6  3  7 ]
  const __m128i Y0  = _mm_unpacklo_epi32(X0,  X1); // [  0  2  4  6 ]
  const __m128i Y1  = _mm_unpackhi_epi32(X0,  X1); // [  1  3  5  7 ]

  const __m128i Z0  = _mm_unpacklo_epi32(Y0,  D8); // [  0  8  2  9 ]
  _mm_store_si128((__m128i *)&optr[0*ostride + 0], Z0);
  const __m128i Z1  = _mm_unpackhi_epi32(Y0,  D8); // [  4 10  6 11 ]
  _mm_store_si128((__m128i *)&optr[2*ostride + 0], Z1);

  const __m128i Z2  = _mm_unpacklo_epi32(Y1, D12); // [  1 12  3 13 ]
  _mm_store_si128((__m128i *)&optr[4*ostride + 0], Z2);
  const __m128i Z3  = _mm_unpackhi_epi32(Y1, D12); // [  5 14  7 15 ]
  _mm_store_si128((__m128i *)&optr[6*ostride + 0], Z3);

  const __m128i W0  = _mm_unpacklo_epi32(D16, D24);// [ 16 24 17 25 ]
  _mm_store_si128((__m128i *)&optr[1*ostride + 0], W0);
  const __m128i W1  = _mm_unpackhi_epi32(D16, D24);// [ 18 26 19 27 ]
  _mm_store_si128((__m128i *)&optr[3*ostride + 0], W1);

  const __m128i W2  = _mm_unpacklo_epi32(D20, D28);// [ 20 28 21 29 ]
  _mm_store_si128((__m128i *)&optr[5*ostride + 0], W2);
  const __m128i W3  = _mm_unpackhi_epi32(D20, D28);// [ 22 30 23 31 ]
  _mm_store_si128((__m128i *)&optr[7*ostride + 0], W3);

}
Ejemplo n.º 20
0
__m128i test_mm_mullo_epi32(__m128i x, __m128i y) {
  // CHECK-LABEL: test_mm_mullo_epi32
  // CHECK: mul <4 x i32>
  return _mm_mullo_epi32(x, y);
}
/*****************************************************************************
 * This function utilises 3 properties of the cost function lookup tables,   *
 * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in       *
 * vp9_encoder.c.                                                            *
 * For the joint cost:                                                       *
 *   - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3]           *
 * For the component costs:                                                  *
 *   - For all i: mvsadcost[0][i] == mvsadcost[1][i]                         *
 *         (Equal costs for both components)                                 *
 *   - For all i: mvsadcost[0][i] == mvsadcost[0][-i]                        *
 *         (Cost function is even)                                           *
 * If these do not hold, then this function cannot be used without           *
 * modification, in which case you can revert to using the C implementation, *
 * which does not rely on these properties.                                  *
 *****************************************************************************/
int vp9_diamond_search_sad_avx(const MACROBLOCK *x,
                               const search_site_config *cfg,
                               MV *ref_mv, MV *best_mv, int search_param,
                               int sad_per_bit, int *num00,
                               const vp9_variance_fn_ptr_t *fn_ptr,
                               const MV *center_mv) {
  const int_mv maxmv = pack_int_mv(x->mv_row_max, x->mv_col_max);
  const __m128i v_max_mv_w = _mm_set1_epi32(maxmv.as_int);
  const int_mv minmv = pack_int_mv(x->mv_row_min, x->mv_col_min);
  const __m128i v_min_mv_w = _mm_set1_epi32(minmv.as_int);

  const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit);

  const __m128i v_joint_cost_0_d = _mm_set1_epi32(x->nmvjointsadcost[0]);
  const __m128i v_joint_cost_1_d = _mm_set1_epi32(x->nmvjointsadcost[1]);

  // search_param determines the length of the initial step and hence the number
  // of iterations.
  // 0 = initial step (MAX_FIRST_STEP) pel
  // 1 = (MAX_FIRST_STEP/2) pel,
  // 2 = (MAX_FIRST_STEP/4) pel...
  const       MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param];
  const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param];
  const int tot_steps = cfg->total_steps - search_param;

  const int_mv fcenter_mv = pack_int_mv(center_mv->row >> 3,
                                        center_mv->col >> 3);
  const __m128i vfcmv = _mm_set1_epi32(fcenter_mv.as_int);

  const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row);
  const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col);

  int_mv bmv = pack_int_mv(ref_row, ref_col);
  int_mv new_bmv = bmv;
  __m128i v_bmv_w = _mm_set1_epi32(bmv.as_int);

  const int what_stride = x->plane[0].src.stride;
  const int in_what_stride = x->e_mbd.plane[0].pre[0].stride;
  const uint8_t *const what = x->plane[0].src.buf;
  const uint8_t *const in_what = x->e_mbd.plane[0].pre[0].buf +
                                 ref_row * in_what_stride + ref_col;

  // Work out the start point for the search
  const uint8_t *best_address = in_what;
  const uint8_t *new_best_address = best_address;
#if ARCH_X86_64
  __m128i v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
#else
  __m128i v_ba_d = _mm_set1_epi32((intptr_t)best_address);
#endif

  unsigned int best_sad;

  int i;
  int j;
  int step;

  // Check the prerequisite cost function properties that are easy to check
  // in an assert. See the function-level documentation for details on all
  // prerequisites.
  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]);
  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]);

  // Check the starting position
  best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride);
  best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit);

  *num00 = 0;

  for (i = 0, step = 0; step < tot_steps; step++) {
    for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) {
      __m128i v_sad_d;
      __m128i v_cost_d;
      __m128i v_outside_d;
      __m128i v_inside_d;
      __m128i v_diff_mv_w;
#if ARCH_X86_64
      __m128i v_blocka[2];
#else
      __m128i v_blocka[1];
#endif

      // Compute the candidate motion vectors
      const __m128i v_ss_mv_w = _mm_loadu_si128((const __m128i*)&ss_mv[i]);
      const __m128i v_these_mv_w = _mm_add_epi16(v_bmv_w, v_ss_mv_w);
      // Clamp them to the search bounds
      __m128i v_these_mv_clamp_w = v_these_mv_w;
      v_these_mv_clamp_w = _mm_min_epi16(v_these_mv_clamp_w, v_max_mv_w);
      v_these_mv_clamp_w = _mm_max_epi16(v_these_mv_clamp_w, v_min_mv_w);
      // The ones that did not change are inside the search area
      v_inside_d = _mm_cmpeq_epi32(v_these_mv_clamp_w, v_these_mv_w);

      // If none of them are inside, then move on
      if (__likely__(_mm_test_all_zeros(v_inside_d, v_inside_d))) {
        continue;
      }

      // The inverse mask indicates which of the MVs are outside
      v_outside_d = _mm_xor_si128(v_inside_d, _mm_set1_epi8(0xff));
      // Shift right to keep the sign bit clear, we will use this later
      // to set the cost to the maximum value.
      v_outside_d = _mm_srli_epi32(v_outside_d, 1);

      // Compute the difference MV
      v_diff_mv_w = _mm_sub_epi16(v_these_mv_clamp_w, vfcmv);
      // We utilise the fact that the cost function is even, and use the
      // absolute difference. This allows us to use unsigned indexes later
      // and reduces cache pressure somewhat as only a half of the table
      // is ever referenced.
      v_diff_mv_w = _mm_abs_epi16(v_diff_mv_w);

      // Compute the SIMD pointer offsets.
      {
#if ARCH_X86_64  //  sizeof(intptr_t) == 8
        // Load the offsets
        __m128i v_bo10_q = _mm_loadu_si128((const __m128i*)&ss_os[i+0]);
        __m128i v_bo32_q = _mm_loadu_si128((const __m128i*)&ss_os[i+2]);
        // Set the ones falling outside to zero
        v_bo10_q = _mm_and_si128(v_bo10_q,
                                 _mm_cvtepi32_epi64(v_inside_d));
        v_bo32_q = _mm_and_si128(v_bo32_q,
                                 _mm_unpackhi_epi32(v_inside_d, v_inside_d));
        // Compute the candidate addresses
        v_blocka[0] = _mm_add_epi64(v_ba_q, v_bo10_q);
        v_blocka[1] = _mm_add_epi64(v_ba_q, v_bo32_q);
#else  // ARCH_X86 //  sizeof(intptr_t) == 4
        __m128i v_bo_d = _mm_loadu_si128((const __m128i*)&ss_os[i]);
        v_bo_d = _mm_and_si128(v_bo_d, v_inside_d);
        v_blocka[0] = _mm_add_epi32(v_ba_d, v_bo_d);
#endif
      }

      fn_ptr->sdx4df(what, what_stride,
                     (const uint8_t **)&v_blocka[0], in_what_stride,
                     (uint32_t*)&v_sad_d);

      // Look up the component cost of the residual motion vector
      {
        const int32_t row0 = _mm_extract_epi16(v_diff_mv_w, 0);
        const int32_t col0 = _mm_extract_epi16(v_diff_mv_w, 1);
        const int32_t row1 = _mm_extract_epi16(v_diff_mv_w, 2);
        const int32_t col1 = _mm_extract_epi16(v_diff_mv_w, 3);
        const int32_t row2 = _mm_extract_epi16(v_diff_mv_w, 4);
        const int32_t col2 = _mm_extract_epi16(v_diff_mv_w, 5);
        const int32_t row3 = _mm_extract_epi16(v_diff_mv_w, 6);
        const int32_t col3 = _mm_extract_epi16(v_diff_mv_w, 7);

        // Note: This is a use case for vpgather in AVX2
        const uint32_t cost0 = x->nmvsadcost[0][row0] + x->nmvsadcost[0][col0];
        const uint32_t cost1 = x->nmvsadcost[0][row1] + x->nmvsadcost[0][col1];
        const uint32_t cost2 = x->nmvsadcost[0][row2] + x->nmvsadcost[0][col2];
        const uint32_t cost3 = x->nmvsadcost[0][row3] + x->nmvsadcost[0][col3];

        __m128i v_cost_10_d, v_cost_32_d;

        v_cost_10_d = _mm_cvtsi32_si128(cost0);
        v_cost_10_d = _mm_insert_epi32(v_cost_10_d, cost1, 1);

        v_cost_32_d = _mm_cvtsi32_si128(cost2);
        v_cost_32_d = _mm_insert_epi32(v_cost_32_d, cost3, 1);

        v_cost_d = _mm_unpacklo_epi64(v_cost_10_d, v_cost_32_d);
      }

      // Now add in the joint cost
      {
        const __m128i v_sel_d = _mm_cmpeq_epi32(v_diff_mv_w,
                                                _mm_setzero_si128());
        const __m128i v_joint_cost_d = _mm_blendv_epi8(v_joint_cost_1_d,
                                                       v_joint_cost_0_d,
                                                       v_sel_d);
        v_cost_d = _mm_add_epi32(v_cost_d, v_joint_cost_d);
      }

      // Multiply by sad_per_bit
      v_cost_d = _mm_mullo_epi32(v_cost_d, v_spb_d);
      // ROUND_POWER_OF_TWO(v_cost_d, 8)
      v_cost_d = _mm_add_epi32(v_cost_d, _mm_set1_epi32(0x80));
      v_cost_d = _mm_srai_epi32(v_cost_d, 8);
      // Add the cost to the sad
      v_sad_d = _mm_add_epi32(v_sad_d, v_cost_d);

      // Make the motion vectors outside the search area have max cost
      // by or'ing in the comparison mask, this way the minimum search won't
      // pick them.
      v_sad_d = _mm_or_si128(v_sad_d, v_outside_d);

      // Find the minimum value and index horizontally in v_sad_d
      {
        // Try speculatively on 16 bits, so we can use the minpos intrinsic
        const __m128i v_sad_w = _mm_packus_epi32(v_sad_d, v_sad_d);
        const __m128i v_minp_w = _mm_minpos_epu16(v_sad_w);

        uint32_t local_best_sad = _mm_extract_epi16(v_minp_w, 0);
        uint32_t local_best_idx = _mm_extract_epi16(v_minp_w, 1);

        // If the local best value is not saturated, just use it, otherwise
        // find the horizontal minimum again the hard way on 32 bits.
        // This is executed rarely.
        if (__unlikely__(local_best_sad == 0xffff)) {
          __m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d;

          v_loval_d = v_sad_d;
          v_loidx_d = _mm_set_epi32(3, 2, 1, 0);
          v_hival_d = _mm_srli_si128(v_loval_d, 8);
          v_hiidx_d = _mm_srli_si128(v_loidx_d, 8);

          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);

          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);
          v_hival_d = _mm_srli_si128(v_loval_d, 4);
          v_hiidx_d = _mm_srli_si128(v_loidx_d, 4);

          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);

          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);

          local_best_sad = _mm_extract_epi32(v_loval_d, 0);
          local_best_idx = _mm_extract_epi32(v_loidx_d, 0);
        }

        // Update the global minimum if the local minimum is smaller
        if (__likely__(local_best_sad < best_sad)) {
          new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx];
          new_best_address = ((const uint8_t **)v_blocka)[local_best_idx];

          best_sad = local_best_sad;
        }
      }
    }

    bmv = new_bmv;
    best_address = new_best_address;

    v_bmv_w = _mm_set1_epi32(bmv.as_int);
#if ARCH_X86_64
    v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
#else
    v_ba_d = _mm_set1_epi32((intptr_t)best_address);
#endif

    if (__unlikely__(best_address == in_what)) {
      (*num00)++;
    }
  }

  *best_mv = bmv.as_mv;
  return best_sad;
}
Ejemplo n.º 22
0
/* Compute reflection coefficients from input signal */
void silk_burg_modified_sse4_1(
    opus_int32                  *res_nrg,           /* O    Residual energy                                             */
    opus_int                    *res_nrg_Q,         /* O    Residual energy Q value                                     */
    opus_int32                  A_Q16[],            /* O    Prediction coefficients (length order)                      */
    const opus_int16            x[],                /* I    Input signal, length: nb_subfr * (D + subfr_length)       */
    const opus_int32            minInvGain_Q30,     /* I    Inverse of max prediction gain                              */
    const opus_int              subfr_length,       /* I    Input signal subframe length (incl. D preceding samples)    */
    const opus_int              nb_subfr,           /* I    Number of subframes stacked in x                            */
    const opus_int              D,                  /* I    Order                                                       */
    int                         arch                /* I    Run-time architecture                                       */
)
{
    opus_int         k, n, s, lz, rshifts, rshifts_extra, reached_max_gain;
    opus_int32       C0, num, nrg, rc_Q31, invGain_Q30, Atmp_QA, Atmp1, tmp1, tmp2, x1, x2;
    const opus_int16 *x_ptr;
    opus_int32       C_first_row[ SILK_MAX_ORDER_LPC ];
    opus_int32       C_last_row[  SILK_MAX_ORDER_LPC ];
    opus_int32       Af_QA[       SILK_MAX_ORDER_LPC ];
    opus_int32       CAf[ SILK_MAX_ORDER_LPC + 1 ];
    opus_int32       CAb[ SILK_MAX_ORDER_LPC + 1 ];
    opus_int32       xcorr[ SILK_MAX_ORDER_LPC ];

    __m128i FIRST_3210, LAST_3210, ATMP_3210, TMP1_3210, TMP2_3210, T1_3210, T2_3210, PTR_3210, SUBFR_3210, X1_3210, X2_3210;
    __m128i CONST1 = _mm_set1_epi32(1);

    silk_assert(subfr_length * nb_subfr <= MAX_FRAME_SIZE);

    /* Compute autocorrelations, added over subframes */
    silk_sum_sqr_shift(&C0, &rshifts, x, nb_subfr * subfr_length);
    if(rshifts > MAX_RSHIFTS) {
        C0 = silk_LSHIFT32(C0, rshifts - MAX_RSHIFTS);
        silk_assert(C0 > 0);
        rshifts = MAX_RSHIFTS;
    } else {
        lz = silk_CLZ32(C0) - 1;
        rshifts_extra = N_BITS_HEAD_ROOM - lz;
        if(rshifts_extra > 0) {
            rshifts_extra = silk_min(rshifts_extra, MAX_RSHIFTS - rshifts);
            C0 = silk_RSHIFT32(C0, rshifts_extra);
        } else {
            rshifts_extra = silk_max(rshifts_extra, MIN_RSHIFTS - rshifts);
            C0 = silk_LSHIFT32(C0, -rshifts_extra);
        }
        rshifts += rshifts_extra;
    }
    CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL(SILK_FIX_CONST(FIND_LPC_COND_FAC, 32), C0) + 1;                                /* Q(-rshifts) */
    silk_memset(C_first_row, 0, SILK_MAX_ORDER_LPC * sizeof(opus_int32));
    if(rshifts > 0) {
        for(s = 0; s < nb_subfr; s++) {
            x_ptr = x + s * subfr_length;
            for(n = 1; n < D + 1; n++) {
                C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64(
                    silk_inner_prod16_aligned_64(x_ptr, x_ptr + n, subfr_length - n, arch), rshifts);
            }
        }
    } else {
        for(s = 0; s < nb_subfr; s++) {
            int i;
            opus_int32 d;
            x_ptr = x + s * subfr_length;
            celt_pitch_xcorr(x_ptr, x_ptr + 1, xcorr, subfr_length - D, D, arch);
            for(n = 1; n < D + 1; n++) {
               for (i = n + subfr_length - D, d = 0; i < subfr_length; i++)
                  d = MAC16_16(d, x_ptr[ i ], x_ptr[ i - n ]);
               xcorr[ n - 1 ] += d;
            }
            for(n = 1; n < D + 1; n++) {
                C_first_row[ n - 1 ] += silk_LSHIFT32(xcorr[ n - 1 ], -rshifts);
            }
        }
    }
    silk_memcpy(C_last_row, C_first_row, SILK_MAX_ORDER_LPC * sizeof(opus_int32));

    /* Initialize */
    CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL(SILK_FIX_CONST(FIND_LPC_COND_FAC, 32), C0) + 1;                                /* Q(-rshifts) */

    invGain_Q30 = (opus_int32)1 << 30;
    reached_max_gain = 0;
    for(n = 0; n < D; n++) {
        /* Update first row of correlation matrix (without first element) */
        /* Update last row of correlation matrix (without last element, stored in reversed order) */
        /* Update C * Af */
        /* Update C * flipud(Af) (stored in reversed order) */
        if(rshifts > -2) {
            for(s = 0; s < nb_subfr; s++) {
                x_ptr = x + s * subfr_length;
                x1  = -silk_LSHIFT32((opus_int32)x_ptr[ n ],                    16 - rshifts);        /* Q(16-rshifts) */
                x2  = -silk_LSHIFT32((opus_int32)x_ptr[ subfr_length - n - 1 ], 16 - rshifts);        /* Q(16-rshifts) */
                tmp1 = silk_LSHIFT32((opus_int32)x_ptr[ n ],                    QA - 16);             /* Q(QA-16) */
                tmp2 = silk_LSHIFT32((opus_int32)x_ptr[ subfr_length - n - 1 ], QA - 16);             /* Q(QA-16) */
                for(k = 0; k < n; k++) {
                    C_first_row[ k ] = silk_SMLAWB(C_first_row[ k ], x1, x_ptr[ n - k - 1 ]           ); /* Q(-rshifts) */
                    C_last_row[ k ]  = silk_SMLAWB(C_last_row[ k ],  x2, x_ptr[ subfr_length - n + k ]); /* Q(-rshifts) */
                    Atmp_QA = Af_QA[ k ];
                    tmp1 = silk_SMLAWB(tmp1, Atmp_QA, x_ptr[ n - k - 1 ]           );                 /* Q(QA-16) */
                    tmp2 = silk_SMLAWB(tmp2, Atmp_QA, x_ptr[ subfr_length - n + k ]);                 /* Q(QA-16) */
                }
                tmp1 = silk_LSHIFT32(-tmp1, 32 - QA - rshifts);                                       /* Q(16-rshifts) */
                tmp2 = silk_LSHIFT32(-tmp2, 32 - QA - rshifts);                                       /* Q(16-rshifts) */
                for(k = 0; k <= n; k++) {
                    CAf[ k ] = silk_SMLAWB(CAf[ k ], tmp1, x_ptr[ n - k ]                   );        /* Q(-rshift) */
                    CAb[ k ] = silk_SMLAWB(CAb[ k ], tmp2, x_ptr[ subfr_length - n + k - 1 ]);        /* Q(-rshift) */
                }
            }
        } else {
            for(s = 0; s < nb_subfr; s++) {
                x_ptr = x + s * subfr_length;
                x1  = -silk_LSHIFT32((opus_int32)x_ptr[ n ],                    -rshifts);            /* Q(-rshifts) */
                x2  = -silk_LSHIFT32((opus_int32)x_ptr[ subfr_length - n - 1 ], -rshifts);            /* Q(-rshifts) */
                tmp1 = silk_LSHIFT32((opus_int32)x_ptr[ n ],                    17);                  /* Q17 */
                tmp2 = silk_LSHIFT32((opus_int32)x_ptr[ subfr_length - n - 1 ], 17);                  /* Q17 */

                X1_3210 = _mm_set1_epi32(x1);
                X2_3210 = _mm_set1_epi32(x2);
                TMP1_3210 = _mm_setzero_si128();
                TMP2_3210 = _mm_setzero_si128();
                for(k = 0; k < n - 3; k += 4) {
                    PTR_3210   = OP_CVTEPI16_EPI32_M64(&x_ptr[ n - k - 1 - 3 ]);
                    SUBFR_3210 = OP_CVTEPI16_EPI32_M64(&x_ptr[ subfr_length - n + k ]);
                    FIRST_3210 = _mm_loadu_si128((__m128i *)&C_first_row[ k ]);
                    PTR_3210   = _mm_shuffle_epi32(PTR_3210,  _MM_SHUFFLE(0, 1, 2, 3));
                    LAST_3210  = _mm_loadu_si128((__m128i *)&C_last_row[ k ]);
                    ATMP_3210  = _mm_loadu_si128((__m128i *)&Af_QA[ k ]);

                    T1_3210 = _mm_mullo_epi32(PTR_3210, X1_3210);
                    T2_3210 = _mm_mullo_epi32(SUBFR_3210, X2_3210);

                    ATMP_3210 = _mm_srai_epi32(ATMP_3210, 7);
                    ATMP_3210 = _mm_add_epi32(ATMP_3210, CONST1);
                    ATMP_3210 = _mm_srai_epi32(ATMP_3210, 1);

                    FIRST_3210 = _mm_add_epi32(FIRST_3210, T1_3210);
                    LAST_3210 = _mm_add_epi32(LAST_3210, T2_3210);

                    PTR_3210   = _mm_mullo_epi32(ATMP_3210, PTR_3210);
                    SUBFR_3210   = _mm_mullo_epi32(ATMP_3210, SUBFR_3210);

                    _mm_storeu_si128((__m128i *)&C_first_row[ k ], FIRST_3210);
                    _mm_storeu_si128((__m128i *)&C_last_row[ k ], LAST_3210);

                    TMP1_3210 = _mm_add_epi32(TMP1_3210, PTR_3210);
                    TMP2_3210 = _mm_add_epi32(TMP2_3210, SUBFR_3210);
                }

                TMP1_3210 = _mm_add_epi32(TMP1_3210, _mm_unpackhi_epi64(TMP1_3210, TMP1_3210));
                TMP2_3210 = _mm_add_epi32(TMP2_3210, _mm_unpackhi_epi64(TMP2_3210, TMP2_3210));
                TMP1_3210 = _mm_add_epi32(TMP1_3210, _mm_shufflelo_epi16(TMP1_3210, 0x0E));
                TMP2_3210 = _mm_add_epi32(TMP2_3210, _mm_shufflelo_epi16(TMP2_3210, 0x0E));

                tmp1 += _mm_cvtsi128_si32(TMP1_3210);
                tmp2 += _mm_cvtsi128_si32(TMP2_3210);

                for(; k < n; k++) {
                    C_first_row[ k ] = silk_MLA(C_first_row[ k ], x1, x_ptr[ n - k - 1 ]           ); /* Q(-rshifts) */
                    C_last_row[ k ]  = silk_MLA(C_last_row[ k ],  x2, x_ptr[ subfr_length - n + k ]); /* Q(-rshifts) */
                    Atmp1 = silk_RSHIFT_ROUND(Af_QA[ k ], QA - 17);                                   /* Q17 */
                    tmp1 = silk_MLA(tmp1, x_ptr[ n - k - 1 ],            Atmp1);                      /* Q17 */
                    tmp2 = silk_MLA(tmp2, x_ptr[ subfr_length - n + k ], Atmp1);                      /* Q17 */
                }

                tmp1 = -tmp1;                /* Q17 */
                tmp2 = -tmp2;                /* Q17 */

                {
                    __m128i xmm_tmp1, xmm_tmp2;
                    __m128i xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1;
                    __m128i xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1;

                    xmm_tmp1 = _mm_set1_epi32(tmp1);
                    xmm_tmp2 = _mm_set1_epi32(tmp2);

                    for(k = 0; k <= n - 3; k += 4) {
                        xmm_x_ptr_n_k_x2x0 = OP_CVTEPI16_EPI32_M64(&x_ptr[ n - k - 3 ]);
                        xmm_x_ptr_sub_x2x0 = OP_CVTEPI16_EPI32_M64(&x_ptr[ subfr_length - n + k - 1 ]);

                        xmm_x_ptr_n_k_x2x0 = _mm_shuffle_epi32(xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE(0, 1, 2, 3));

                        xmm_x_ptr_n_k_x2x0 = _mm_slli_epi32(xmm_x_ptr_n_k_x2x0, -rshifts - 1);
                        xmm_x_ptr_sub_x2x0 = _mm_slli_epi32(xmm_x_ptr_sub_x2x0, -rshifts - 1);

                        /* equal shift right 4 bytes, xmm_x_ptr_n_k_x3x1 = _mm_srli_si128(xmm_x_ptr_n_k_x2x0, 4)*/
                        xmm_x_ptr_n_k_x3x1 = _mm_shuffle_epi32(xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE(0, 3, 2, 1));
                        xmm_x_ptr_sub_x3x1 = _mm_shuffle_epi32(xmm_x_ptr_sub_x2x0, _MM_SHUFFLE(0, 3, 2, 1));

                        xmm_x_ptr_n_k_x2x0 = _mm_mul_epi32(xmm_x_ptr_n_k_x2x0, xmm_tmp1);
                        xmm_x_ptr_n_k_x3x1 = _mm_mul_epi32(xmm_x_ptr_n_k_x3x1, xmm_tmp1);
                        xmm_x_ptr_sub_x2x0 = _mm_mul_epi32(xmm_x_ptr_sub_x2x0, xmm_tmp2);
                        xmm_x_ptr_sub_x3x1 = _mm_mul_epi32(xmm_x_ptr_sub_x3x1, xmm_tmp2);

                        xmm_x_ptr_n_k_x2x0 = _mm_srli_epi64(xmm_x_ptr_n_k_x2x0, 16);
                        xmm_x_ptr_n_k_x3x1 = _mm_slli_epi64(xmm_x_ptr_n_k_x3x1, 16);
                        xmm_x_ptr_sub_x2x0 = _mm_srli_epi64(xmm_x_ptr_sub_x2x0, 16);
                        xmm_x_ptr_sub_x3x1 = _mm_slli_epi64(xmm_x_ptr_sub_x3x1, 16);

                        xmm_x_ptr_n_k_x2x0 = _mm_blend_epi16(xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1, 0xCC);
                        xmm_x_ptr_sub_x2x0 = _mm_blend_epi16(xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1, 0xCC);

                        X1_3210  = _mm_loadu_si128((__m128i *)&CAf[ k ]);
                        PTR_3210 = _mm_loadu_si128((__m128i *)&CAb[ k ]);

                        X1_3210  = _mm_add_epi32(X1_3210, xmm_x_ptr_n_k_x2x0);
                        PTR_3210 = _mm_add_epi32(PTR_3210, xmm_x_ptr_sub_x2x0);

                        _mm_storeu_si128((__m128i *)&CAf[ k ], X1_3210);
                        _mm_storeu_si128((__m128i *)&CAb[ k ], PTR_3210);
                    }

                    for(; k <= n; k++) {
                        CAf[ k ] = silk_SMLAWW(CAf[ k ], tmp1,
                            silk_LSHIFT32((opus_int32)x_ptr[ n - k ], -rshifts - 1));                    /* Q(-rshift) */
                        CAb[ k ] = silk_SMLAWW(CAb[ k ], tmp2,
                            silk_LSHIFT32((opus_int32)x_ptr[ subfr_length - n + k - 1 ], -rshifts - 1)); /* Q(-rshift) */
                    }
                }
            }
        }

        /* Calculate nominator and denominator for the next order reflection (parcor) coefficient */
        tmp1 = C_first_row[ n ];                                                                        /* Q(-rshifts) */
        tmp2 = C_last_row[ n ];                                                                         /* Q(-rshifts) */
        num  = 0;                                                                                       /* Q(-rshifts) */
        nrg  = silk_ADD32(CAb[ 0 ], CAf[ 0 ]);                                                        /* Q(1-rshifts) */
        for(k = 0; k < n; k++) {
            Atmp_QA = Af_QA[ k ];
            lz = silk_CLZ32(silk_abs(Atmp_QA)) - 1;
            lz = silk_min(32 - QA, lz);
            Atmp1 = silk_LSHIFT32(Atmp_QA, lz);                                                       /* Q(QA + lz) */

            tmp1 = silk_ADD_LSHIFT32(tmp1, silk_SMMUL(C_last_row[  n - k - 1 ], Atmp1), 32 - QA - lz);  /* Q(-rshifts) */
            tmp2 = silk_ADD_LSHIFT32(tmp2, silk_SMMUL(C_first_row[ n - k - 1 ], Atmp1), 32 - QA - lz);  /* Q(-rshifts) */
            num  = silk_ADD_LSHIFT32(num,  silk_SMMUL(CAb[ n - k ],             Atmp1), 32 - QA - lz);  /* Q(-rshifts) */
            nrg  = silk_ADD_LSHIFT32(nrg,  silk_SMMUL(silk_ADD32(CAb[ k + 1 ], CAf[ k + 1 ]),
                                                                                Atmp1), 32 - QA - lz);    /* Q(1-rshifts) */
        }
        CAf[ n + 1 ] = tmp1;                                                                            /* Q(-rshifts) */
        CAb[ n + 1 ] = tmp2;                                                                            /* Q(-rshifts) */
        num = silk_ADD32(num, tmp2);                                                                  /* Q(-rshifts) */
        num = silk_LSHIFT32(-num, 1);                                                                 /* Q(1-rshifts) */

        /* Calculate the next order reflection (parcor) coefficient */
        if(silk_abs(num) < nrg) {
            rc_Q31 = silk_DIV32_varQ(num, nrg, 31);
        } else {
            rc_Q31 = (num > 0) ? silk_int32_MAX : silk_int32_MIN;
        }

        /* Update inverse prediction gain */
        tmp1 = ((opus_int32)1 << 30) - silk_SMMUL(rc_Q31, rc_Q31);
        tmp1 = silk_LSHIFT(silk_SMMUL(invGain_Q30, tmp1), 2);
        if(tmp1 <= minInvGain_Q30) {
            /* Max prediction gain exceeded; set reflection coefficient such that max prediction gain is exactly hit */
            tmp2 = ((opus_int32)1 << 30) - silk_DIV32_varQ(minInvGain_Q30, invGain_Q30, 30);            /* Q30 */
            rc_Q31 = silk_SQRT_APPROX(tmp2);                                                  /* Q15 */
            /* Newton-Raphson iteration */
            rc_Q31 = silk_RSHIFT32(rc_Q31 + silk_DIV32(tmp2, rc_Q31), 1);                   /* Q15 */
            rc_Q31 = silk_LSHIFT32(rc_Q31, 16);                                               /* Q31 */
            if(num < 0) {
                /* Ensure adjusted reflection coefficients has the original sign */
                rc_Q31 = -rc_Q31;
            }
            invGain_Q30 = minInvGain_Q30;
            reached_max_gain = 1;
        } else {
            invGain_Q30 = tmp1;
        }

        /* Update the AR coefficients */
        for(k = 0; k < (n + 1) >> 1; k++) {
            tmp1 = Af_QA[ k ];                                                                  /* QA */
            tmp2 = Af_QA[ n - k - 1 ];                                                          /* QA */
            Af_QA[ k ]         = silk_ADD_LSHIFT32(tmp1, silk_SMMUL(tmp2, rc_Q31), 1);      /* QA */
            Af_QA[ n - k - 1 ] = silk_ADD_LSHIFT32(tmp2, silk_SMMUL(tmp1, rc_Q31), 1);      /* QA */
        }
        Af_QA[ n ] = silk_RSHIFT32(rc_Q31, 31 - QA);                                          /* QA */

        if(reached_max_gain) {
            /* Reached max prediction gain; set remaining coefficients to zero and exit loop */
            for(k = n + 1; k < D; k++) {
                Af_QA[ k ] = 0;
            }
            break;
        }

        /* Update C * Af and C * Ab */
        for(k = 0; k <= n + 1; k++) {
            tmp1 = CAf[ k ];                                                                    /* Q(-rshifts) */
            tmp2 = CAb[ n - k + 1 ];                                                            /* Q(-rshifts) */
            CAf[ k ]         = silk_ADD_LSHIFT32(tmp1, silk_SMMUL(tmp2, rc_Q31), 1);        /* Q(-rshifts) */
            CAb[ n - k + 1 ] = silk_ADD_LSHIFT32(tmp2, silk_SMMUL(tmp1, rc_Q31), 1);        /* Q(-rshifts) */
        }
    }

    if(reached_max_gain) {
        for(k = 0; k < D; k++) {
            /* Scale coefficients */
            A_Q16[ k ] = -silk_RSHIFT_ROUND(Af_QA[ k ], QA - 16);
        }
        /* Subtract energy of preceding samples from C0 */
        if(rshifts > 0) {
            for(s = 0; s < nb_subfr; s++) {
                x_ptr = x + s * subfr_length;
                C0 -= (opus_int32)silk_RSHIFT64(silk_inner_prod16_aligned_64(x_ptr, x_ptr, D, arch), rshifts);
            }
        } else {
            for(s = 0; s < nb_subfr; s++) {
                x_ptr = x + s * subfr_length;
                C0 -= silk_LSHIFT32(silk_inner_prod_aligned(x_ptr, x_ptr, D, arch), -rshifts);
            }
        }
        /* Approximate residual energy */
        *res_nrg = silk_LSHIFT(silk_SMMUL(invGain_Q30, C0), 2);
        *res_nrg_Q = -rshifts;
    } else {
        /* Return residual energy */
        nrg  = CAf[ 0 ];                                                                            /* Q(-rshifts) */
        tmp1 = (opus_int32)1 << 16;                                                                             /* Q16 */
        for(k = 0; k < D; k++) {
            Atmp1 = silk_RSHIFT_ROUND(Af_QA[ k ], QA - 16);                                       /* Q16 */
            nrg  = silk_SMLAWW(nrg, CAf[ k + 1 ], Atmp1);                                         /* Q(-rshifts) */
            tmp1 = silk_SMLAWW(tmp1, Atmp1, Atmp1);                                               /* Q16 */
            A_Q16[ k ] = -Atmp1;
        }
        *res_nrg = silk_SMLAWW(nrg, silk_SMMUL(SILK_FIX_CONST(FIND_LPC_COND_FAC, 32), C0), -tmp1);/* Q(-rshifts) */
        *res_nrg_Q = -rshifts;
    }
}
Ejemplo n.º 23
-1
opus_val32 celt_inner_prod_sse4_1(const opus_val16 *x, const opus_val16 *y,
      int N)
{
    opus_int  i, dataSize16;
    opus_int32 sum;
    __m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
    __m128i inVec2_76543210, inVec2_FEDCBA98, acc2;
    __m128i inVec1_3210, inVec2_3210;

    sum = 0;
    dataSize16 = N & ~15;

    acc1 = _mm_setzero_si128();
    acc2 = _mm_setzero_si128();

    for (i=0;i<dataSize16;i+=16) {
        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));

        inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8]));
        inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8]));

        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
        inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98);

        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
        acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
    }

    acc1 = _mm_add_epi32(acc1, acc2);

    if (N - i >= 8)
    {
        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));

        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);

        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
        i += 8;
    }

    if (N - i >= 4)
    {
        inVec1_3210 = OP_CVTEPI16_EPI32_M64(&x[i + 0]);
        inVec2_3210 = OP_CVTEPI16_EPI32_M64(&y[i + 0]);

        inVec1_3210 = _mm_mullo_epi32(inVec1_3210, inVec2_3210);

        acc1 = _mm_add_epi32(acc1, inVec1_3210);
        i += 4;
    }

    acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64(acc1, acc1));
    acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16(acc1, 0x0E));

    sum += _mm_cvtsi128_si32(acc1);

    for (;i<N;i++)
    {
        sum = silk_SMLABB(sum, x[i], y[i]);
    }

    return sum;
}