// Y = Y + alpha * X
// sadly, there is no fused multiply-add instruction for this in SSE,
// so multiply and add separately
static void madd(const float& alpha, const RView& X, Result& result)
{
#ifdef __SSE4_1__   // _mm_mullo_epi32 needs SSE4.1 (the predefined macro is __SSE4_1__, not __SSE_4_1__)
    // the vector path is integer, so truncate alpha once and use the
    // same value in the scalar tail to keep both paths consistent
    const int a = (int)alpha;
    const int * x = X.data();
    int * y = result.data();
    __m128i px, px1, px2, py, py1, py2;
    __m128i alpha_p = _mm_set1_epi32(a);
    for (int i = 0; i < DIM_N - (DIM_N % 12); i += 12)
    {
        px  = _mm_load_si128((const __m128i *)x);
        px1 = _mm_load_si128((const __m128i *)(x + 4));
        px2 = _mm_load_si128((const __m128i *)(x + 8));
        py  = _mm_load_si128((const __m128i *)y);
        py1 = _mm_load_si128((const __m128i *)(y + 4));
        py2 = _mm_load_si128((const __m128i *)(y + 8));
        py  = _mm_add_epi32(py,  _mm_mullo_epi32(alpha_p, px));
        py1 = _mm_add_epi32(py1, _mm_mullo_epi32(alpha_p, px1));
        py2 = _mm_add_epi32(py2, _mm_mullo_epi32(alpha_p, px2));
        _mm_store_si128((__m128i *)y,       py);
        _mm_store_si128((__m128i *)(y + 4), py1);
        _mm_store_si128((__m128i *)(y + 8), py2);
        x += 12;
        y += 12;
    }
    // scalar tail for the DIM_N % 12 leftover elements
    for (int i = DIM_N - (DIM_N % 12); i < DIM_N; ++i)
    {
        result[i] += a * X[i];
    }
#else
    // plain scalar fallback when SSE4.1 is not available
    const int a = (int)alpha;
    for (int i = 0; i < DIM_N; ++i)
    {
        result[i] += a * X[i];
    }
#endif
}
void ethash_calculate_dag_item(node *const ret, const unsigned node_index, const struct ethash_params *params, const struct ethash_cache *cache) { uint32_t num_parent_nodes = (uint32_t) (params->cache_size / sizeof(node)); node const *cache_nodes = (node const *) cache->mem; node const *init = &cache_nodes[node_index % num_parent_nodes]; memcpy(ret, init, sizeof(node)); ret->words[0] ^= node_index; SHA3_512(ret->bytes, ret->bytes, sizeof(node)); #if defined(_M_X64) && ENABLE_SSE __m128i const fnv_prime = _mm_set1_epi32(FNV_PRIME); __m128i xmm0 = ret->xmm[0]; __m128i xmm1 = ret->xmm[1]; __m128i xmm2 = ret->xmm[2]; __m128i xmm3 = ret->xmm[3]; #endif for (unsigned i = 0; i != DATASET_PARENTS; ++i) { uint32_t parent_index = ((node_index ^ i) * FNV_PRIME ^ ret->words[i % NODE_WORDS]) % num_parent_nodes; node const *parent = &cache_nodes[parent_index]; #if defined(_M_X64) && ENABLE_SSE { xmm0 = _mm_mullo_epi32(xmm0, fnv_prime); xmm1 = _mm_mullo_epi32(xmm1, fnv_prime); xmm2 = _mm_mullo_epi32(xmm2, fnv_prime); xmm3 = _mm_mullo_epi32(xmm3, fnv_prime); xmm0 = _mm_xor_si128(xmm0, parent->xmm[0]); xmm1 = _mm_xor_si128(xmm1, parent->xmm[1]); xmm2 = _mm_xor_si128(xmm2, parent->xmm[2]); xmm3 = _mm_xor_si128(xmm3, parent->xmm[3]); // have to write to ret as values are used to compute index ret->xmm[0] = xmm0; ret->xmm[1] = xmm1; ret->xmm[2] = xmm2; ret->xmm[3] = xmm3; } #else { for (unsigned w = 0; w != NODE_WORDS; ++w) { ret->words[w] = fnv_hash(ret->words[w], parent->words[w]); } } #endif } SHA3_512(ret->bytes, ret->bytes, sizeof(node)); }
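/* The ethash functions in this collection (above and in the hashimoto loop
   further down) build on a tiny FNV-style mix; _mm_mullo_epi32 simply applies
   the same multiply-then-XOR per 32-bit lane. A minimal scalar sketch of one
   lane, assuming ethash's usual constant (FNV_PRIME = 0x01000193): */
#include <stdint.h>

#define FNV_PRIME 0x01000193U

/* one lane of the SIMD loops above: multiply by the FNV prime,
   then fold in the parent word with XOR */
static uint32_t fnv_hash(uint32_t x, uint32_t y)
{
    return (x * FNV_PRIME) ^ y;
}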
/* Dot product of two uint16 arrays; size is assumed to be a multiple of 8.
   Note: despite the "sse3" name, _mm_mullo_epi32 requires SSE4.1. */
inline static short
sse3_dot_prod (const uint16_t *p1, const uint16_t *p2, size_t size)
{
  uint32_t res[4];      /* 32-bit lanes: _mm_storeu_si128 writes exactly these 16 bytes */
  unsigned int i;
  __m128i *mp1 = (__m128i *) p1;
  __m128i *mp2 = (__m128i *) p2;
  __m128i mres = _mm_setzero_si128 ();
  for (i = 0; i < size; i += 8)
    {
      __m128i mreg1 = _mm_loadu_si128 (mp1);
      __m128i mreg2 = _mm_loadu_si128 (mp2);
      /* unpacking against zero zero-extends the 16-bit lanes to 32 bits */
      __m128i xlo1 = _mm_unpacklo_epi16 (mreg1, _mm_setzero_si128 ());
      __m128i xlo2 = _mm_unpacklo_epi16 (mreg2, _mm_setzero_si128 ());
      __m128i mtmp = _mm_mullo_epi32 (xlo1, xlo2);
      mres = _mm_add_epi32 (mres, mtmp);
      __m128i xhi1 = _mm_unpackhi_epi16 (mreg1, _mm_setzero_si128 ());
      __m128i xhi2 = _mm_unpackhi_epi16 (mreg2, _mm_setzero_si128 ());
      mtmp = _mm_mullo_epi32 (xhi1, xhi2);
      mres = _mm_add_epi32 (mres, mtmp);
      mp1++;
      mp2++;
    }
  _mm_storeu_si128 ((__m128i *) res, mres);
  return res[0] + res[1] + res[2] + res[3];
}
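/* Side note on the widening idiom above: since _mm_mullo_epi32 already
   requires SSE4.1, _mm_cvtepu16_epi32 can zero-extend the low half more
   directly. A small sketch of the equivalent low-half step (an alternative,
   not the source's code; the high half still needs an unpack or a shift): */
#include <smmintrin.h>

static __m128i widen_mul_lo(__m128i a, __m128i b)
{
    /* zero-extend the low four uint16 lanes, then 32-bit multiply */
    return _mm_mullo_epi32(_mm_cvtepu16_epi32(a), _mm_cvtepu16_epi32(b));
}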
uint32_t probe(uint32_t key) { /* create a vector with all values initialized to key */ __m128i keyVector = _mm_set1_epi32(key); /* find the appropriate buckets using multiplicative hashing */ __m128i bucketIds = _mm_mullo_epi32(keyVector, hashes.vec128); bucketIds = _mm_srli_epi32(bucketIds, hashShift); size_t b0 = _mm_extract_epi32(bucketIds, 0); size_t b1 = _mm_extract_epi32(bucketIds, 1); __m128i keys; __m128i values0, values1; /* load keys, compare with lookup key (to produce a bitmask). * AND the result with the corresponding values. */ keys = _mm_load_si128((const __m128i *) buckets[b0].keys); keys = _mm_cmpeq_epi32(keys, keyVector); values0 = _mm_load_si128((const __m128i *) buckets[b0].values); values0 = _mm_and_si128(values0, keys); keys = _mm_load_si128((const __m128i *) buckets[b1].keys); keys = _mm_cmpeq_epi32(keys, keyVector); values1 = _mm_load_si128((const __m128i *) buckets[b1].values); values1 = _mm_and_si128(values1, keys); /* OR all of the (key AND value) pairs to get result */ union QuadInt qi; qi.vec128 = _mm_or_si128(values0, values1); qi.vec64[0] = _mm_or_si64(qi.vec64[0], qi.vec64[1]); return qi.arr[0] | qi.arr[1]; }
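/* probe() above relies on a 16-byte union to fold the four lanes
   128 -> 64 -> 32 bits. The snippet does not show its definition; a plausible
   layout, reconstructed from the members it uses (vec128, vec64, arr), is
   sketched here. Also note that _mm_or_si64 is an MMX operation, so code
   mixing it with SSE should call _mm_empty() before later x87 float use. */
#include <emmintrin.h>
#include <mmintrin.h>
#include <stdint.h>

union QuadInt {
    __m128i  vec128;    /* the full 4 x 32-bit SSE register    */
    __m64    vec64[2];  /* same 16 bytes as two MMX halves     */
    uint32_t arr[4];    /* same 16 bytes as four 32-bit lanes  */
};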
SIMDValue SIMDInt32x4Operation::OpMul(const SIMDValue& aValue, const SIMDValue& bValue) { SIMDValue result; X86SIMDValue x86Result; X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue); X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue); if (AutoSystemInfo::Data.SSE4_1Available()) { // a * b, only available in SSE4 x86Result.m128i_value = _mm_mullo_epi32(tmpaValue.m128i_value, tmpbValue.m128i_value); result = X86SIMDValue::ToSIMDValue(x86Result); } else if (AutoSystemInfo::Data.SSE2Available()) { // mul 2,0: r0 = a0*b0; r1 = a2*b2 __m128i tmp1 = _mm_mul_epu32(tmpaValue.m128i_value, tmpbValue.m128i_value); // mul 3,1: r0=a1*b1; r1=a3*b3 __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(tmpaValue.m128i_value, 4), _mm_srli_si128(tmpbValue.m128i_value, 4)); // shuffle x86Results to [63..0] and pack x86Result.m128i_value = _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 2, 0)), _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 2, 0))); result = X86SIMDValue::ToSIMDValue(x86Result); } else { result.i32[SIMD_X] = aValue.i32[SIMD_X] * bValue.i32[SIMD_X]; result.i32[SIMD_Y] = aValue.i32[SIMD_Y] * bValue.i32[SIMD_Y]; result.i32[SIMD_Z] = aValue.i32[SIMD_Z] * bValue.i32[SIMD_Z]; result.i32[SIMD_W] = aValue.i32[SIMD_W] * bValue.i32[SIMD_W]; } return result; }
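/* OpMul's middle branch is the classic SSE2 emulation of pmulld, pulled out
   here as a self-contained sketch: _mm_mul_epu32 multiplies lanes 0 and 2
   into 64-bit products, so running it once on the even lanes and once on the
   odd lanes (shifted down by 4 bytes) and interleaving the low 32 bits of
   each product recovers all four lane-wise products. */
#include <emmintrin.h>

static __m128i mullo_epi32_sse2(__m128i a, __m128i b)
{
    __m128i even = _mm_mul_epu32(a, b);                      /* a0*b0, a2*b2 */
    __m128i odd  = _mm_mul_epu32(_mm_srli_si128(a, 4),
                                 _mm_srli_si128(b, 4));      /* a1*b1, a3*b3 */
    /* keep the low 32 bits of each 64-bit product and interleave */
    return _mm_unpacklo_epi32(_mm_shuffle_epi32(even, _MM_SHUFFLE(0, 0, 2, 0)),
                              _mm_shuffle_epi32(odd,  _MM_SHUFFLE(0, 0, 2, 0)));
}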
int main(int, char**) { volatile __m128 a = _mm_setzero_ps(); _mm_ceil_ps(a); volatile __m128i result = _mm_mullo_epi32(_mm_set1_epi32(42), _mm_set1_epi32(64)); (void)result; return 0; }
#include <smmintrin.h>
#include <cstdio>

//int WINAPI WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, PSTR pScmdline, int iCmdshow)
int main(int, char**)
{
    int i[4] = { 1, 2, 3, 4 };
    int j[4] = { 1, 2, 4, 4 };
    // plain int arrays are not guaranteed 16-byte aligned, so use unaligned loads
    __m128i tmp = _mm_mullo_epi32(_mm_loadu_si128((const __m128i*)i),
                                  _mm_loadu_si128((const __m128i*)&j[0]));
    std::printf("%d\n", tmp.m128i_i32[0]);   // m128i_i32 is MSVC-specific; prints i[0]*j[0] = 1
    //_mm_mulhi_epi16(*xmm0, *xmm1);
    return 0;
}
inline __m128i LOAD_QUANTISED(const int32_t *idata, const QuantisationMatrix *qmatrix, const int l, const int s) { __m128i D = _mm_load_si128((__m128i *)idata); __m128i QF = _mm_load_si128((__m128i *)&qmatrix->qfactor[l][s]); __m128i QO = _mm_load_si128((__m128i *)&qmatrix->qoffset[l][s]); __m128i X = _mm_abs_epi32(D); X = _mm_mullo_epi32(X, QF); X = _mm_add_epi32(X, QO); X = _mm_srai_epi32(X, 2); X = _mm_sign_epi32(X, D); return X; }
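/* LOAD_QUANTISED applies the same dead-zone dequantiser to each lane. A
   scalar reference for one coefficient, mirroring the abs / mullo / add /
   srai / sign sequence above (assumes the products fit in 32 bits, exactly
   as the SIMD version does): */
#include <stdint.h>

static int32_t dequantise_one(int32_t d, int32_t qfactor, int32_t qoffset)
{
    int32_t mag = (((d < 0 ? -d : d) * qfactor) + qoffset) >> 2;
    /* _mm_sign_epi32 semantics: negate for d < 0, zero for d == 0 */
    return d > 0 ? mag : (d < 0 ? -mag : 0);
}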
#include <smmintrin.h>

// c[i] = a[i] * b[i] for i in [0, n); works for any n and any alignment
void multiply(int *a, int *b, int *c, int n)
{
    int i;
    for (i = 0; i + 4 <= n; i += 4)
    {
        // unaligned loads/stores: dereferencing a cast __m128i* requires
        // 16-byte alignment that int arrays do not guarantee
        __m128i pa = _mm_loadu_si128((const __m128i *)&a[i]);
        __m128i pb = _mm_loadu_si128((const __m128i *)&b[i]);
        _mm_storeu_si128((__m128i *)&c[i], _mm_mullo_epi32(pa, pb));
    }
    for (; i < n; i++)   // scalar tail when n is not a multiple of 4
        c[i] = a[i] * b[i];
}
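/* A minimal usage sketch for multiply() above (the array contents are
   arbitrary test values chosen here, not from the source): */
#include <stdio.h>

int main(void)
{
    int a[6] = { 1, 2, 3, 4, 5, 6 };
    int b[6] = { 6, 5, 4, 3, 2, 1 };
    int c[6];
    multiply(a, b, c, 6);          /* exercises both the SIMD body and the scalar tail */
    for (int i = 0; i < 6; i++)
        printf("%d ", c[i]);       /* expected: 6 10 12 12 10 6 */
    printf("\n");
    return 0;
}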
void ahd_interpolate_tile(int top, char * buffer) { int row, col, tr, tc, c, val; const int dir[4] = { -1, 1, -width, width }; __m128i ldiff[2], abdiff[2]; union hvrgbpix (*rgb)[width] = (union hvrgbpix (*)[width])buffer; union hvrgbpix *rix; union rgbpix * pix; union hvrgbpix (*lab)[width]; short (*lix)[8]; char (*homo)[width][2]; lab = (union hvrgbpix (*)[width])(buffer + 16*width*TS); homo = (char (*)[width][2])(buffer + 32*width*TS); const int left=2; if ((uintptr_t)(image+top*width)&0xf || (uintptr_t)buffer&0xf) { fprintf(stderr, "unaligned buffers defeat speed!\n"); abort(); } /* Interpolate green horz&vert, red and blue, and convert to CIELab: */ //do the first two rows of green first. //then one green, and rgb through the tile; this is because R/B needs the down-right green value for (row=top; row < top+2 && row < height-2; row++) { col = left + (FC(row,left) & 1); for (c = FC(row,col); col < width-2; col+=2) { pix = (union rgbpix*)image + row*width+col; val = ((pix[-1].g + pix[0].c[c] + pix[1].g) * 2 - pix[-2].c[c] - pix[2].c[c]) >> 2; rgb[row-top][col-left].h.g = ULIM(val,pix[-1].g,pix[1].g); val = ((pix[-width].g + pix[0].c[c] + pix[width].g) * 2 - pix[-2*width].c[c] - pix[2*width].c[c]) >> 2; rgb[row-top][col-left].v.g = ULIM(val,pix[-width].g,pix[width].g); } } for (; row < top+TS && row < height-2; row++) { int rowx = row-1; if (FC(rowx,left+1)==1) { int c1 = FC(rowx+1,left+1), c2 = FC(rowx,left+2); pix = (union rgbpix*)image + row*width+left+1; rix = &rgb[row-top][1]; val = ((pix[-1].g + pix[0].c[c1] + pix[1].g) * 2 - pix[-2].c[c1] - pix[2].c[c1]) >> 2; rix[0].h.g = ULIM(val,pix[-1].g,pix[1].g); val = ((pix[-width].g + pix[0].c[c1] + pix[width].g) * 2 - pix[-2*width].c[c1] - pix[2*width].c[c1]) >> 2; rix[0].v.g = ULIM(val,pix[-width].g,pix[width].g); for (col=left+1; col < width-3; col+=2) { pix = (union rgbpix*)image + rowx*width+col+1; union hvrgbpix rixr, rix0; rix = &rgb[rowx-top][col-left]+1; signed pix_diag = pix[-width-1].c[c1] + pix[-width+1].c[c1]; signed pix_ul = pix[-width-1].c[c1]; rixr.vec = _mm_set1_epi16(pix[-1].g); signed pix_lr = pix[-2].c[c2] + pix[0].c[c2]; rix0.h.c[c2] = rix0.v.c[c2] = pix[0].c[c2]; pix_diag += pix[width-1].c[c1] + pix[width+1].c[c1] + 1; signed pix_dl = pix[width-1].c[c1]; //fully loaded __m128i rix_dr = _mm_setr_epi32(pix[width].g, pix[width-1].c[c1], pix[1].g, pix[-width+1].c[c1]); rix_dr = _mm_add_epi32(rix_dr,_mm_setr_epi32(pix[width+1].c[c1], pix[width+3].c[c1], pix[width+1].c[c1], 0)); rix_dr = _mm_add_epi32(rix_dr,_mm_setr_epi32(pix[width+2].g, 0, pix[2*width+1].g, pix[3*width+1].c[c1])); rix_dr = _mm_mullo_epi32(rix_dr,_mm_setr_epi32(2,1,2,1)); //half loaded rix_dr = _mm_hsub_epi32(rix_dr,_mm_setzero_si128()); rix_dr = _mm_srai_epi32(rix_dr,2); __m128i a = _mm_setr_epi32(pix[width].g,pix[1].g,0,0); __m128i b = _mm_setr_epi32(pix[width+2].g,pix[2*width+1].g,0,0); __m128i m = _mm_min_epi32(a,b); __m128i M = _mm_max_epi32(a,b); rix_dr = _mm_min_epi32(rix_dr,M); rix_dr = _mm_max_epi32(rix_dr,m); signed pix_udr = pix_ul + pix_dl; signed rix0_ul = rix[-width-1].h.g; signed rix1_ul = rix[-width-1].v.g; __m128i rix_ur = _mm_setr_epi32(rix[-width+1].h.g, rix[-width+1].v.g, 0, 0); signed rix0_rr = rix[-2].h.g; signed rix1_rr = rix[-2].v.g; rix0.h.g = rix[0].h.g; rix0.v.g = rix[0].v.g; signed rix0_dl = rix[width-1].h.g; signed rix1_dl = rix[width-1].v.g; // fully loaded __m128i rix_udr = _mm_setr_epi32(rix0_ul, rix1_ul, rix0_rr, rix1_rr); rix_udr = _mm_add_epi32(rix_udr, _mm_setr_epi32(rix0_dl, rix1_dl, rix0.h.g, rix0.v.g)); __m128i v2 = 
_mm_set_epi32(pix_lr, pix_lr, pix_udr, pix_udr); v2 = _mm_sub_epi32(v2, rix_udr); v2 = _mm_srai_epi32(v2,1); v2 = _mm_add_epi32(v2,_mm_cvtepu16_epi32(rixr.vec)); v2 = _mm_max_epi32(v2, _mm_setzero_si128()); v2 = _mm_min_epi32(v2, _mm_set1_epi32(0xffff)); rixr.h.c[c2] = _mm_extract_epi32(v2,2); rixr.v.c[c2] = _mm_extract_epi32(v2,3); rixr.h.c[c1] = _mm_extract_epi32(v2,0); rixr.v.c[c1] = _mm_extract_epi32(v2,1); // following only uses 64 bit __m128i v1 = _mm_set1_epi32(pix_diag); v1 = _mm_sub_epi32(v1, rix_ur); v1 = _mm_sub_epi32(v1, rix_dr); v1 = _mm_sub_epi32(v1, rix_udr); v1 = _mm_srai_epi32(v1,2); v1 = _mm_add_epi32(v1, _mm_setr_epi32(rix0.h.g, rix0.v.g, 0, 0)); v1 = _mm_max_epi32(v1, _mm_setzero_si128()); v1 = _mm_min_epi32(v1, _mm_set1_epi32(0xffff)); rix0.h.c[c1] = _mm_extract_epi32(v1,0); rix0.v.c[c1] = _mm_extract_epi32(v1,1); lab[rowx-top][col-left].vec = cielabv(rixr); lab[rowx-top][col-left+1].vec = cielabv(rix0); _mm_store_si128(&rix[-1].vec,rixr.vec); _mm_store_si128(&rix[0].vec,rix0.vec); rix[width+1].h.g = _mm_extract_epi32(rix_dr,0); rix[width+1].v.g = _mm_extract_epi32(rix_dr,1); } } else {
//----------------------------------------------------------------------------------------- // Rasterize the occludee AABB and depth test it against the CPU rasterized depth buffer // If any of the rasterized AABB pixels passes the depth test exit early and mark the occludee // as visible. If all rasterized AABB pixels are occluded then the occludee is culled //----------------------------------------------------------------------------------------- void TransformedAABBoxSSE::RasterizeAndDepthTestAABBox(UINT *pRenderTargetPixels) { // Set DAZ and FZ MXCSR bits to flush denormals to zero (i.e., make it faster) // Denormals are zero (DAZ) is bit 6 and Flush to zero (FZ) is bit 15, // so to enable both we set bits 6 and 15: 1000 0000 0100 0000 = 0x8040 _mm_setcsr( _mm_getcsr() | 0x8040 ); __m128i colOffset = _mm_set_epi32(0, 1, 0, 1); __m128i rowOffset = _mm_set_epi32(0, 0, 1, 1); __m128i fxptZero = _mm_setzero_si128(); float* pDepthBuffer = (float*)pRenderTargetPixels; // Rasterize the AABB triangles 4 at a time for(UINT i = 0; i < AABB_TRIANGLES; i += SSE) { vFloat4 xformedPos[3]; Gather(xformedPos, i); // use fixed-point only for X and Y. Avoid work for Z and W. vFxPt4 xFormedFxPtPos[3]; for(int m = 0; m < 3; m++) { xFormedFxPtPos[m].X = _mm_cvtps_epi32(xformedPos[m].X); xFormedFxPtPos[m].Y = _mm_cvtps_epi32(xformedPos[m].Y); xFormedFxPtPos[m].Z = _mm_cvtps_epi32(xformedPos[m].Z); xFormedFxPtPos[m].W = _mm_cvtps_epi32(xformedPos[m].W); } // Fab(x, y) = Ax + By + C = 0 // Fab(x, y) = (ya - yb)x + (xb - xa)y + (xa * yb - xb * ya) = 0 // Compute A = (ya - yb) for the 3 line segments that make up each triangle __m128i A0 = _mm_sub_epi32(xFormedFxPtPos[1].Y, xFormedFxPtPos[2].Y); __m128i A1 = _mm_sub_epi32(xFormedFxPtPos[2].Y, xFormedFxPtPos[0].Y); __m128i A2 = _mm_sub_epi32(xFormedFxPtPos[0].Y, xFormedFxPtPos[1].Y); // Compute B = (xb - xa) for the 3 line segments that make up each triangle __m128i B0 = _mm_sub_epi32(xFormedFxPtPos[2].X, xFormedFxPtPos[1].X); __m128i B1 = _mm_sub_epi32(xFormedFxPtPos[0].X, xFormedFxPtPos[2].X); __m128i B2 = _mm_sub_epi32(xFormedFxPtPos[1].X, xFormedFxPtPos[0].X); // Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle __m128i C0 = _mm_sub_epi32(_mm_mullo_epi32(xFormedFxPtPos[1].X, xFormedFxPtPos[2].Y), _mm_mullo_epi32(xFormedFxPtPos[2].X, xFormedFxPtPos[1].Y)); __m128i C1 = _mm_sub_epi32(_mm_mullo_epi32(xFormedFxPtPos[2].X, xFormedFxPtPos[0].Y), _mm_mullo_epi32(xFormedFxPtPos[0].X, xFormedFxPtPos[2].Y)); __m128i C2 = _mm_sub_epi32(_mm_mullo_epi32(xFormedFxPtPos[0].X, xFormedFxPtPos[1].Y), _mm_mullo_epi32(xFormedFxPtPos[1].X, xFormedFxPtPos[0].Y)); // Compute triangle area __m128i triArea = _mm_mullo_epi32(A0, xFormedFxPtPos[0].X); triArea = _mm_add_epi32(triArea, _mm_mullo_epi32(B0, xFormedFxPtPos[0].Y)); triArea = _mm_add_epi32(triArea, C0); __m128 oneOverTriArea = _mm_div_ps(_mm_set1_ps(1.0f), _mm_cvtepi32_ps(triArea)); // Use bounding box traversal strategy to determine which pixels to rasterize __m128i startX = _mm_and_si128(Max(Min(Min(xFormedFxPtPos[0].X, xFormedFxPtPos[1].X), xFormedFxPtPos[2].X), _mm_set1_epi32(0)), _mm_set1_epi32(0xFFFFFFFE)); __m128i endX = Min(_mm_add_epi32(Max(Max(xFormedFxPtPos[0].X, xFormedFxPtPos[1].X), xFormedFxPtPos[2].X), _mm_set1_epi32(1)), _mm_set1_epi32(SCREENW)); __m128i startY = _mm_and_si128(Max(Min(Min(xFormedFxPtPos[0].Y, xFormedFxPtPos[1].Y), xFormedFxPtPos[2].Y), _mm_set1_epi32(0)), _mm_set1_epi32(0xFFFFFFFE)); __m128i endY = Min(_mm_add_epi32(Max(Max(xFormedFxPtPos[0].Y, xFormedFxPtPos[1].Y), xFormedFxPtPos[2].Y), _mm_set1_epi32(1)), _mm_set1_epi32(SCREENH)); for(int vv = 0; vv < 3; vv++) { // If W (holding 1/w in our case) is not between 0 and 1, // then the vertex is behind the near clip plane (1.0 in our case). // If W < 1, then verify 1/W > 1 (for W > 0), and 1/W < 0 (for W < 0). __m128 nearClipMask0 = _mm_cmple_ps(xformedPos[vv].W, _mm_set1_ps(0.0f)); __m128 nearClipMask1 = _mm_cmpge_ps(xformedPos[vv].W, _mm_set1_ps(1.0f)); __m128 nearClipMask = _mm_or_ps(nearClipMask0, nearClipMask1); if(!_mm_test_all_zeros(*(__m128i*)&nearClipMask, *(__m128i*)&nearClipMask)) { // At least one vertex is behind the near plane (we're processing four triangles at a time w/ SSE), so conservatively mark the occludee visible *mVisible = true; return; } } // Now we have 4 triangles set up. Rasterize them each individually. for(int lane=0; lane < SSE; lane++) { // Skip triangle if area is zero if(triArea.m128i_i32[lane] <= 0) { continue; } // Extract this triangle's properties from the SIMD versions __m128 zz[3], oneOverW[3]; for(int vv = 0; vv < 3; vv++) { zz[vv] = _mm_set1_ps(xformedPos[vv].Z.m128_f32[lane]); oneOverW[vv] = _mm_set1_ps(xformedPos[vv].W.m128_f32[lane]); } __m128 oneOverTotalArea = _mm_set1_ps(oneOverTriArea.m128_f32[lane]); zz[0] *= oneOverTotalArea; zz[1] *= oneOverTotalArea; zz[2] *= oneOverTotalArea; int startXx = startX.m128i_i32[lane]; int endXx = endX.m128i_i32[lane]; int startYy = startY.m128i_i32[lane]; int endYy = endY.m128i_i32[lane]; __m128i aa0 = _mm_set1_epi32(A0.m128i_i32[lane]); __m128i aa1 = _mm_set1_epi32(A1.m128i_i32[lane]); __m128i aa2 = _mm_set1_epi32(A2.m128i_i32[lane]); __m128i bb0 = _mm_set1_epi32(B0.m128i_i32[lane]); __m128i bb1 = _mm_set1_epi32(B1.m128i_i32[lane]); __m128i bb2 = _mm_set1_epi32(B2.m128i_i32[lane]); __m128i cc0 = _mm_set1_epi32(C0.m128i_i32[lane]); __m128i cc1 = _mm_set1_epi32(C1.m128i_i32[lane]); __m128i cc2 = _mm_set1_epi32(C2.m128i_i32[lane]); __m128i aa0Inc = _mm_slli_epi32(aa0, 1); __m128i aa1Inc = _mm_slli_epi32(aa1, 1); __m128i aa2Inc = _mm_slli_epi32(aa2, 1); __m128i row, col; int rowIdx; // To avoid this branching, choose one method to traverse and store the pixel depth if(gVisualizeDepthBuffer) { // Sequentially traverse and store pixel depths contiguously rowIdx = (startYy * SCREENW + startXx); } else { // Traverse pixels in 2x2 blocks and store 2x2 pixel quad depths contiguously in memory ==> 2*X // This method provides better performance rowIdx = (startYy * SCREENW + 2 * startXx); } col = _mm_add_epi32(colOffset, _mm_set1_epi32(startXx)); __m128i aa0Col = _mm_mullo_epi32(aa0, col); __m128i aa1Col = _mm_mullo_epi32(aa1, col); __m128i aa2Col = _mm_mullo_epi32(aa2, col); row = _mm_add_epi32(rowOffset, _mm_set1_epi32(startYy)); __m128i bb0Row = _mm_add_epi32(_mm_mullo_epi32(bb0, row), cc0); __m128i bb1Row = _mm_add_epi32(_mm_mullo_epi32(bb1, row), cc1); __m128i bb2Row = _mm_add_epi32(_mm_mullo_epi32(bb2, row), cc2); __m128i bb0Inc = _mm_slli_epi32(bb0, 1); __m128i bb1Inc = _mm_slli_epi32(bb1, 1); __m128i bb2Inc = _mm_slli_epi32(bb2, 1); // Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY) for(int r = startYy; r < endYy; r += 2, row = _mm_add_epi32(row, _mm_set1_epi32(2)), rowIdx = rowIdx + 2 * SCREENW, bb0Row = _mm_add_epi32(bb0Row, bb0Inc), bb1Row = _mm_add_epi32(bb1Row, bb1Inc), bb2Row = _mm_add_epi32(bb2Row, bb2Inc)) { // Compute barycentric coordinates int idx = rowIdx; __m128i alpha = _mm_add_epi32(aa0Col, bb0Row); __m128i beta = _mm_add_epi32(aa1Col, bb1Row); __m128i gama = 
_mm_add_epi32(aa2Col, bb2Row); int idxIncr; if(gVisualizeDepthBuffer) { idxIncr = 2; } else { idxIncr = 4; } for(int c = startXx; c < endXx; c += 2, idx = idx + idxIncr, alpha = _mm_add_epi32(alpha, aa0Inc), beta = _mm_add_epi32(beta, aa1Inc), gama = _mm_add_epi32(gama, aa2Inc)) { //Test Pixel inside triangle __m128i mask = _mm_cmplt_epi32(fxptZero, _mm_or_si128(_mm_or_si128(alpha, beta), gama)); // Early out if all of this quad's pixels are outside the triangle. if(_mm_test_all_zeros(mask, mask)) { continue; } // Compute barycentric-interpolated depth __m128 depth = _mm_mul_ps(_mm_cvtepi32_ps(alpha), zz[0]); depth = _mm_add_ps(depth, _mm_mul_ps(_mm_cvtepi32_ps(beta), zz[1])); depth = _mm_add_ps(depth, _mm_mul_ps(_mm_cvtepi32_ps(gama), zz[2])); __m128 previousDepthValue; if(gVisualizeDepthBuffer) { previousDepthValue = _mm_set_ps(pDepthBuffer[idx], pDepthBuffer[idx + 1], pDepthBuffer[idx + SCREENW], pDepthBuffer[idx + SCREENW + 1]); } else { previousDepthValue = *(__m128*)&pDepthBuffer[idx]; } __m128 depthMask = _mm_cmpge_ps( depth, previousDepthValue); __m128i finalMask = _mm_and_si128( mask, _mm_castps_si128(depthMask)); if(!_mm_test_all_zeros(finalMask, finalMask)) { *mVisible = true; return; //early exit } }//for each column }// for each row }// for each triangle }// for each set of SIMD# triangles }
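/* The rasterizer above evaluates the edge function Fab(x, y) = A*x + B*y + C
   with _mm_mullo_epi32 once per row/column start and then walks it
   incrementally: stepping x by 1 adds A and stepping y by 1 adds B, which is
   why the aa*Inc/bb*Inc deltas are the coefficients shifted left by one (the
   loops move in 2x2 pixel quads). A scalar reference for one edge, matching
   the A/B/C definitions in the comments: */
#include <stdint.h>

static int32_t edge_function(int32_t xa, int32_t ya,
                             int32_t xb, int32_t yb,
                             int32_t x,  int32_t y)
{
    int32_t A = ya - yb;
    int32_t B = xb - xa;
    int32_t C = xa * yb - xb * ya;
    return A * x + B * y + C;   /* >= 0 on the interior side of edge a->b */
}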
void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[ 4 ], int len) { int j; __m128i vecX, vecX0, vecX1, vecX2, vecX3; __m128i vecY0, vecY1, vecY2, vecY3; __m128i sum0, sum1, sum2, sum3, vecSum; __m128i initSum; celt_assert(len >= 3); sum0 = _mm_setzero_si128(); sum1 = _mm_setzero_si128(); sum2 = _mm_setzero_si128(); sum3 = _mm_setzero_si128(); for (j=0;j<(len-7);j+=8) { vecX = _mm_loadu_si128((__m128i *)(&x[j + 0])); vecY0 = _mm_loadu_si128((__m128i *)(&y[j + 0])); vecY1 = _mm_loadu_si128((__m128i *)(&y[j + 1])); vecY2 = _mm_loadu_si128((__m128i *)(&y[j + 2])); vecY3 = _mm_loadu_si128((__m128i *)(&y[j + 3])); sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(vecX, vecY0)); sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(vecX, vecY1)); sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(vecX, vecY2)); sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(vecX, vecY3)); } sum0 = _mm_add_epi32(sum0, _mm_unpackhi_epi64( sum0, sum0)); sum0 = _mm_add_epi32(sum0, _mm_shufflelo_epi16( sum0, 0x0E)); sum1 = _mm_add_epi32(sum1, _mm_unpackhi_epi64( sum1, sum1)); sum1 = _mm_add_epi32(sum1, _mm_shufflelo_epi16( sum1, 0x0E)); sum2 = _mm_add_epi32(sum2, _mm_unpackhi_epi64( sum2, sum2)); sum2 = _mm_add_epi32(sum2, _mm_shufflelo_epi16( sum2, 0x0E)); sum3 = _mm_add_epi32(sum3, _mm_unpackhi_epi64( sum3, sum3)); sum3 = _mm_add_epi32(sum3, _mm_shufflelo_epi16( sum3, 0x0E)); vecSum = _mm_unpacklo_epi64(_mm_unpacklo_epi32(sum0, sum1), _mm_unpacklo_epi32(sum2, sum3)); for (;j<(len-3);j+=4) { vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]); vecX0 = _mm_shuffle_epi32(vecX, 0x00); vecX1 = _mm_shuffle_epi32(vecX, 0x55); vecX2 = _mm_shuffle_epi32(vecX, 0xaa); vecX3 = _mm_shuffle_epi32(vecX, 0xff); vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]); vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]); vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]); vecY3 = OP_CVTEPI16_EPI32_M64(&y[j + 3]); sum0 = _mm_mullo_epi32(vecX0, vecY0); sum1 = _mm_mullo_epi32(vecX1, vecY1); sum2 = _mm_mullo_epi32(vecX2, vecY2); sum3 = _mm_mullo_epi32(vecX3, vecY3); sum0 = _mm_add_epi32(sum0, sum1); sum2 = _mm_add_epi32(sum2, sum3); vecSum = _mm_add_epi32(vecSum, sum0); vecSum = _mm_add_epi32(vecSum, sum2); } for (;j<len;j++) { vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]); vecX0 = _mm_shuffle_epi32(vecX, 0x00); vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]); sum0 = _mm_mullo_epi32(vecX0, vecY0); vecSum = _mm_add_epi32(vecSum, sum0); } initSum = _mm_loadu_si128((__m128i *)(&sum[0])); initSum = _mm_add_epi32(initSum, vecSum); _mm_storeu_si128((__m128i *)sum, initSum); }
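/* xcorr_kernel_sse4_1 widens four 16-bit samples at a time through
   OP_CVTEPI16_EPI32_M64, which is not shown in the snippet; in Opus it is
   defined roughly as follows (an assumption here, not the verbatim source): */
#include <smmintrin.h>

/* load 64 bits (four 16-bit samples) and sign-extend each lane to 32 bits */
#define OP_CVTEPI16_EPI32_M64(x) \
    _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i *)(x)))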
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>
#include <smmintrin.h>

/* assumes MEDIUM and the timespec helper diff() are defined elsewhere */
int main(int argc, char **argv)
{
    struct timespec t1, t2;
    int c, d, k;
    int size = MEDIUM;      /* default size when -s is not given */
    char *fname = NULL;
    int opt, i;
    while ((opt = getopt(argc, argv, "f:s:")) != -1) {
        switch (opt) {
        case 's': size = atoi(optarg); break;
        case 'f': fname = optarg; break;
        default: size = MEDIUM; break;
        }
    }
    FILE *fp = fname ? fopen(fname, "a") : stderr;   /* fall back to stderr if -f is missing */
    int edge;
    int *first;
    posix_memalign((void**)&first, 16, sizeof(int)*size*size);   //use posix_memalign to get 16-byte alignment
    int *multiply;
    posix_memalign((void**)&multiply, 16, sizeof(int)*size*size);
    __m128i m1, m2, m3;
    for (c = 0; c < size; c++) {
        for (d = 0; d < size; d++) {   /* braces matter: both statements must run per element */
            first[c*size+d] = ((c+d) % 2) - 1;
            multiply[c*size+d] = 0;
        }
    }
    printf("multiplying the %d-size matrices\nYou should try to time this part.\n", size);
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t1);
    for (c = 0; c < size; c++) {
        for (k = 0; k < size; k++) {
            m2 = _mm_set1_epi32(first[c*size+k]);                  //first[c][k]
            for (d = 0; d < size; d += 4) {
                edge = size - d;
                if (edge < 4) {   //account for non-div-by-4 matrices with a scalar tail
                    for (i = d; i < size; i++)
                        multiply[c*size+i] += first[c*size+k] * first[k*size+i];
                } else {
                    m1 = _mm_loadu_si128((const __m128i *)&first[k*size+d]);     //first[k][d]
                    m1 = _mm_mullo_epi32(m1, m2);                                // first[k][d] * first[c][k]
                    m3 = _mm_loadu_si128((const __m128i *)&multiply[c*size+d]);  //load up old values of multiply[c][d]
                    m1 = _mm_add_epi32(m3, m1);                                  //[+= to mult]
                    _mm_storeu_si128((__m128i *)&multiply[c*size+d], m1);
                }
            }
        }
    }
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t2);
    double nanos = diff(t1,t2).tv_nsec * 1e-9;
    double secs = diff(t1,t2).tv_sec;
    double dif = secs + nanos;
    fprintf(fp, "%.10f\n", dif);
    if (fp != stderr) fclose(fp);
    printf("test first %d\n", first[size]);
    printf("test mult %d\n", multiply[size]);
    free(first);    //memory from posix_memalign is released with plain free
    free(multiply);
    return 0;
}
static bool ethash_hash( ethash_return_value_t* ret, node const* full_nodes, ethash_light_t const light, uint64_t full_size, ethash_h256_t const header_hash, uint64_t const nonce ) { if (full_size % MIX_WORDS != 0) { return false; } // pack hash and nonce together into first 40 bytes of s_mix assert(sizeof(node) * 8 == 512); node s_mix[MIX_NODES + 1]; memcpy(s_mix[0].bytes, &header_hash, 32); fix_endian64(s_mix[0].double_words[4], nonce); // compute sha3-512 hash and replicate across mix SHA3_512(s_mix->bytes, s_mix->bytes, 40); fix_endian_arr32(s_mix[0].words, 16); node* const mix = s_mix + 1; for (uint32_t w = 0; w != MIX_WORDS; ++w) { mix->words[w] = s_mix[0].words[w % NODE_WORDS]; } unsigned const page_size = sizeof(uint32_t) * MIX_WORDS; unsigned const num_full_pages = (unsigned) (full_size / page_size); for (unsigned i = 0; i != ETHASH_ACCESSES; ++i) { uint32_t const index = fnv_hash(s_mix->words[0] ^ i, mix->words[i % MIX_WORDS]) % num_full_pages; for (unsigned n = 0; n != MIX_NODES; ++n) { node const* dag_node; node tmp_node; // declared at loop scope so dag_node cannot dangle after the else block if (full_nodes) { dag_node = &full_nodes[MIX_NODES * index + n]; } else { ethash_calculate_dag_item(&tmp_node, index * MIX_NODES + n, light); dag_node = &tmp_node; } #if defined(_M_X64) && ENABLE_SSE { __m128i fnv_prime = _mm_set1_epi32(FNV_PRIME); __m128i xmm0 = _mm_mullo_epi32(fnv_prime, mix[n].xmm[0]); __m128i xmm1 = _mm_mullo_epi32(fnv_prime, mix[n].xmm[1]); __m128i xmm2 = _mm_mullo_epi32(fnv_prime, mix[n].xmm[2]); __m128i xmm3 = _mm_mullo_epi32(fnv_prime, mix[n].xmm[3]); mix[n].xmm[0] = _mm_xor_si128(xmm0, dag_node->xmm[0]); mix[n].xmm[1] = _mm_xor_si128(xmm1, dag_node->xmm[1]); mix[n].xmm[2] = _mm_xor_si128(xmm2, dag_node->xmm[2]); mix[n].xmm[3] = _mm_xor_si128(xmm3, dag_node->xmm[3]); } #else { for (unsigned w = 0; w != NODE_WORDS; ++w) { mix[n].words[w] = fnv_hash(mix[n].words[w], dag_node->words[w]); } } #endif } } // compress mix for (uint32_t w = 0; w != MIX_WORDS; w += 4) { uint32_t reduction = mix->words[w + 0]; reduction = reduction * FNV_PRIME ^ mix->words[w + 1]; reduction = reduction * FNV_PRIME ^ mix->words[w + 2]; reduction = reduction * FNV_PRIME ^ mix->words[w + 3]; mix->words[w / 4] = reduction; } fix_endian_arr32(mix->words, MIX_WORDS / 4); memcpy(&ret->mix_hash, mix->bytes, 32); // final Keccak hash SHA3_256(&ret->result, s_mix->bytes, 64 + 32); // Keccak-256(s + compressed_mix) return true; }
//------------------------------------------------------------------------------- // For each tile go through all the bins and process all the triangles in it. // Rasterize each triangle to the CPU depth buffer. //------------------------------------------------------------------------------- void DepthBufferRasterizerSSEST::RasterizeBinnedTrianglesToDepthBuffer(UINT tileId, UINT idx) { // Set DAZ and FZ MXCSR bits to flush denormals to zero (i.e., make it faster) _mm_setcsr( _mm_getcsr() | 0x8040 ); __m128i colOffset = _mm_setr_epi32(0, 1, 0, 1); __m128i rowOffset = _mm_setr_epi32(0, 0, 1, 1); __m128i fxptZero = _mm_setzero_si128(); float* pDepthBuffer = (float*)mpRenderTargetPixels[idx]; // Based on TaskId determine which tile to process UINT screenWidthInTiles = SCREENW/TILE_WIDTH_IN_PIXELS; UINT tileX = tileId % screenWidthInTiles; UINT tileY = tileId / screenWidthInTiles; int tileStartX = tileX * TILE_WIDTH_IN_PIXELS; int tileEndX = tileStartX + TILE_WIDTH_IN_PIXELS - 1; int tileStartY = tileY * TILE_HEIGHT_IN_PIXELS; int tileEndY = tileStartY + TILE_HEIGHT_IN_PIXELS - 1; ClearDepthTile(tileStartX, tileStartY, tileEndX+1, tileEndY+1, idx); UINT bin = 0; UINT binIndex = 0; UINT offset1 = YOFFSET1_ST * tileY + XOFFSET1_ST * tileX; UINT offset2 = YOFFSET2_ST * tileY + XOFFSET2_ST * tileX; UINT numTrisInBin = mpNumTrisInBin[idx][offset1 + bin]; __m128 gatherBuf[4][3]; bool done = false; bool allBinsEmpty = true; mNumRasterizedTris[idx][tileId] = numTrisInBin; while(!done) { // Loop through all the bins and process 4 binned triangles at a time UINT ii; int numSimdTris = 0; for(ii = 0; ii < SSE; ii++) { while(numTrisInBin <= 0) { // This bin is empty. Move to next bin. if(++bin >= 1) { break; } numTrisInBin = mpNumTrisInBin[idx][offset1 + bin]; mNumRasterizedTris[idx][tileId] += numTrisInBin; binIndex = 0; } if(!numTrisInBin) { break; // No more tris in the bins } USHORT modelId = mpBinModel[idx][offset2 + bin * MAX_TRIS_IN_BIN_ST + binIndex]; USHORT meshId = mpBinMesh[idx][offset2 + bin * MAX_TRIS_IN_BIN_ST + binIndex]; UINT triIdx = mpBin[idx][offset2 + bin * MAX_TRIS_IN_BIN_ST + binIndex]; mpTransformedModels1[modelId].Gather(gatherBuf[ii], meshId, triIdx, idx); allBinsEmpty = false; numSimdTris++; ++binIndex; --numTrisInBin; } done = bin >= NUM_XFORMVERTS_TASKS; if(allBinsEmpty) { return; } // use fixed-point only for X and Y. Avoid work for Z and W. 
__m128i fxPtX[3], fxPtY[3]; __m128 Z[3]; for(int i = 0; i < 3; i++) { __m128 v0 = gatherBuf[0][i]; __m128 v1 = gatherBuf[1][i]; __m128 v2 = gatherBuf[2][i]; __m128 v3 = gatherBuf[3][i]; // transpose into SoA layout _MM_TRANSPOSE4_PS(v0, v1, v2, v3); fxPtX[i] = _mm_cvtps_epi32(v0); fxPtY[i] = _mm_cvtps_epi32(v1); Z[i] = v2; } // Fab(x, y) = Ax + By + C = 0 // Fab(x, y) = (ya - yb)x + (xb - xa)y + (xa * yb - xb * ya) = 0 // Compute A = (ya - yb) for the 3 line segments that make up each triangle __m128i A0 = _mm_sub_epi32(fxPtY[1], fxPtY[2]); __m128i A1 = _mm_sub_epi32(fxPtY[2], fxPtY[0]); __m128i A2 = _mm_sub_epi32(fxPtY[0], fxPtY[1]); // Compute B = (xb - xa) for the 3 line segments that make up each triangle __m128i B0 = _mm_sub_epi32(fxPtX[2], fxPtX[1]); __m128i B1 = _mm_sub_epi32(fxPtX[0], fxPtX[2]); __m128i B2 = _mm_sub_epi32(fxPtX[1], fxPtX[0]); // Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle __m128i C0 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[1], fxPtY[2]), _mm_mullo_epi32(fxPtX[2], fxPtY[1])); __m128i C1 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[2], fxPtY[0]), _mm_mullo_epi32(fxPtX[0], fxPtY[2])); __m128i C2 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[0], fxPtY[1]), _mm_mullo_epi32(fxPtX[1], fxPtY[0])); // Compute triangle area __m128i triArea = _mm_mullo_epi32(B2, A1); triArea = _mm_sub_epi32(triArea, _mm_mullo_epi32(B1, A2)); __m128 oneOverTriArea = _mm_div_ps(_mm_set1_ps(1.0f), _mm_cvtepi32_ps(triArea)); Z[1] = _mm_mul_ps(_mm_sub_ps(Z[1], Z[0]), oneOverTriArea); Z[2] = _mm_mul_ps(_mm_sub_ps(Z[2], Z[0]), oneOverTriArea); // Use bounding box traversal strategy to determine which pixels to rasterize __m128i startX = _mm_and_si128(Max(Min(Min(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(tileStartX)), _mm_set1_epi32(0xFFFFFFFE)); __m128i endX = Min(_mm_add_epi32(Max(Max(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(1)), _mm_set1_epi32(tileEndX)); __m128i startY = _mm_and_si128(Max(Min(Min(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(tileStartY)), _mm_set1_epi32(0xFFFFFFFE)); __m128i endY = Min(_mm_add_epi32(Max(Max(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(1)), _mm_set1_epi32(tileEndY)); // Now we have 4 triangles set up. Rasterize them each individually. 
for(int lane=0; lane < numSimdTris; lane++) { // Extract this triangle's properties from the SIMD versions __m128 zz[3]; for(int vv = 0; vv < 3; vv++) { zz[vv] = _mm_set1_ps(Z[vv].m128_f32[lane]); } int startXx = startX.m128i_i32[lane]; int endXx = endX.m128i_i32[lane]; int startYy = startY.m128i_i32[lane]; int endYy = endY.m128i_i32[lane]; __m128i aa0 = _mm_set1_epi32(A0.m128i_i32[lane]); __m128i aa1 = _mm_set1_epi32(A1.m128i_i32[lane]); __m128i aa2 = _mm_set1_epi32(A2.m128i_i32[lane]); __m128i bb0 = _mm_set1_epi32(B0.m128i_i32[lane]); __m128i bb1 = _mm_set1_epi32(B1.m128i_i32[lane]); __m128i bb2 = _mm_set1_epi32(B2.m128i_i32[lane]); __m128i aa0Inc = _mm_slli_epi32(aa0, 1); __m128i aa1Inc = _mm_slli_epi32(aa1, 1); __m128i aa2Inc = _mm_slli_epi32(aa2, 1); __m128i row, col; // Traverse pixels in 2x2 blocks and store 2x2 pixel quad depths contiguously in memory ==> 2*X // This method provides better performance int rowIdx = (startYy * SCREENW + 2 * startXx); col = _mm_add_epi32(colOffset, _mm_set1_epi32(startXx)); __m128i aa0Col = _mm_mullo_epi32(aa0, col); __m128i aa1Col = _mm_mullo_epi32(aa1, col); __m128i aa2Col = _mm_mullo_epi32(aa2, col); row = _mm_add_epi32(rowOffset, _mm_set1_epi32(startYy)); __m128i bb0Row = _mm_add_epi32(_mm_mullo_epi32(bb0, row), _mm_set1_epi32(C0.m128i_i32[lane])); __m128i bb1Row = _mm_add_epi32(_mm_mullo_epi32(bb1, row), _mm_set1_epi32(C1.m128i_i32[lane])); __m128i bb2Row = _mm_add_epi32(_mm_mullo_epi32(bb2, row), _mm_set1_epi32(C2.m128i_i32[lane])); __m128i sum0Row = _mm_add_epi32(aa0Col, bb0Row); __m128i sum1Row = _mm_add_epi32(aa1Col, bb1Row); __m128i sum2Row = _mm_add_epi32(aa2Col, bb2Row); __m128i bb0Inc = _mm_slli_epi32(bb0, 1); __m128i bb1Inc = _mm_slli_epi32(bb1, 1); __m128i bb2Inc = _mm_slli_epi32(bb2, 1); __m128 zx = _mm_mul_ps(_mm_cvtepi32_ps(aa1Inc), zz[1]); zx = _mm_add_ps(zx, _mm_mul_ps(_mm_cvtepi32_ps(aa2Inc), zz[2])); // Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY) for(int r = startYy; r < endYy; r += 2, rowIdx += 2 * SCREENW, sum0Row = _mm_add_epi32(sum0Row, bb0Inc), sum1Row = _mm_add_epi32(sum1Row, bb1Inc), sum2Row = _mm_add_epi32(sum2Row, bb2Inc)) { // Compute barycentric coordinates int index = rowIdx; __m128i alpha = sum0Row; __m128i beta = sum1Row; __m128i gama = sum2Row; //Compute barycentric-interpolated depth __m128 depth = zz[0]; depth = _mm_add_ps(depth, _mm_mul_ps(_mm_cvtepi32_ps(beta), zz[1])); depth = _mm_add_ps(depth, _mm_mul_ps(_mm_cvtepi32_ps(gama), zz[2])); for(int c = startXx; c < endXx; c += 2, index += 4, alpha = _mm_add_epi32(alpha, aa0Inc), beta = _mm_add_epi32(beta, aa1Inc), gama = _mm_add_epi32(gama, aa2Inc), depth = _mm_add_ps(depth, zx)) { //Test Pixel inside triangle __m128i mask = _mm_or_si128(_mm_or_si128(alpha, beta), gama); __m128 previousDepthValue = _mm_load_ps(&pDepthBuffer[index]); __m128 mergedDepth = _mm_max_ps(depth, previousDepthValue); __m128 finalDepth = _mm_blendv_ps(mergedDepth, previousDepthValue, _mm_castsi128_ps(mask)); _mm_store_ps(&pDepthBuffer[index], finalDepth); }//for each column }// for each row }// for each triangle }// for each set of SIMD# triangles }
//----------------------------------------------------------------------------------------- // Rasterize the occludee AABB and depth test it against the CPU rasterized depth buffer // If any of the rasterized AABB pixels passes the depth test exit early and mark the occludee // as visible. If all rasterized AABB pixels are occluded then the occludee is culled //----------------------------------------------------------------------------------------- bool TransformedAABBoxAVX::RasterizeAndDepthTestAABBox(UINT *pRenderTargetPixels, const __m128 pXformedPos[], UINT idx) { // Set DAZ and FZ MXCSR bits to flush denormals to zero (i.e., make it faster) // Denormals are zero (DAZ) is bit 6 and Flush to zero (FZ) is bit 15, // so to enable both we set bits 6 and 15: 1000 0000 0100 0000 = 0x8040 _mm_setcsr( _mm_getcsr() | 0x8040 ); __m256i colOffset = _mm256_setr_epi32(0, 1, 2, 3, 0, 1, 2, 3); __m256i rowOffset = _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1); float* pDepthBuffer = (float*)pRenderTargetPixels; // Rasterize the AABB triangles 4 at a time for(UINT i = 0; i < AABB_TRIANGLES; i += SSE) { vFloat4 xformedPos[3]; Gather(xformedPos, i, pXformedPos, idx); // use fixed-point only for X and Y. Avoid work for Z and W. __m128i fxPtX[3], fxPtY[3]; for(int m = 0; m < 3; m++) { fxPtX[m] = _mm_cvtps_epi32(xformedPos[m].X); fxPtY[m] = _mm_cvtps_epi32(xformedPos[m].Y); } // Fab(x, y) = Ax + By + C = 0 // Fab(x, y) = (ya - yb)x + (xb - xa)y + (xa * yb - xb * ya) = 0 // Compute A = (ya - yb) for the 3 line segments that make up each triangle __m128i A0 = _mm_sub_epi32(fxPtY[1], fxPtY[2]); __m128i A1 = _mm_sub_epi32(fxPtY[2], fxPtY[0]); __m128i A2 = _mm_sub_epi32(fxPtY[0], fxPtY[1]); // Compute B = (xb - xa) for the 3 line segments that make up each triangle __m128i B0 = _mm_sub_epi32(fxPtX[2], fxPtX[1]); __m128i B1 = _mm_sub_epi32(fxPtX[0], fxPtX[2]); __m128i B2 = _mm_sub_epi32(fxPtX[1], fxPtX[0]); // Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle __m128i C0 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[1], fxPtY[2]), _mm_mullo_epi32(fxPtX[2], fxPtY[1])); __m128i C1 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[2], fxPtY[0]), _mm_mullo_epi32(fxPtX[0], fxPtY[2])); __m128i C2 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[0], fxPtY[1]), _mm_mullo_epi32(fxPtX[1], fxPtY[0])); // Compute triangle area __m128i triArea = _mm_mullo_epi32(B2, A1); triArea = _mm_sub_epi32(triArea, _mm_mullo_epi32(B1, A2)); __m128 oneOverTriArea = _mm_rcp_ps(_mm_cvtepi32_ps(triArea)); __m128 Z[3]; Z[0] = xformedPos[0].Z; Z[1] = _mm_mul_ps(_mm_sub_ps(xformedPos[1].Z, Z[0]), oneOverTriArea); Z[2] = _mm_mul_ps(_mm_sub_ps(xformedPos[2].Z, Z[0]), oneOverTriArea); // Use bounding box traversal strategy to determine which pixels to rasterize //__m128i startX = _mm_and_si128(HelperSSE::Max(HelperSSE::Min(HelperSSE::Min(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(0)), _mm_set1_epi32(~1)); __m128i startX = _mm_and_si128(HelperSSE::Max(HelperSSE::Min(HelperSSE::Min(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(0)), _mm_set1_epi32(~3)); __m128i endX = HelperSSE::Min(HelperSSE::Max(HelperSSE::Max(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(SCREENW - 1)); __m128i startY = _mm_and_si128(HelperSSE::Max(HelperSSE::Min(HelperSSE::Min(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(0)), _mm_set1_epi32(~1)); __m128i endY = HelperSSE::Min(HelperSSE::Max(HelperSSE::Max(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(SCREENH - 1)); // Now we have 4 triangles set up. Rasterize them each individually. 
for(int lane=0; lane < SSE; lane++) { // Skip triangle if area is zero if(triArea.m128i_i32[lane] <= 0) { continue; } // Extract this triangle's properties from the SIMD versions __m256 zz[3]; for (int vv = 0; vv < 3; vv++) { zz[vv] = _mm256_set1_ps(Z[vv].m128_f32[lane]); } int startXx = startX.m128i_i32[lane]; int endXx = endX.m128i_i32[lane]; int startYy = startY.m128i_i32[lane]; int endYy = endY.m128i_i32[lane]; __m256i aa0 = _mm256_set1_epi32(A0.m128i_i32[lane]); __m256i aa1 = _mm256_set1_epi32(A1.m128i_i32[lane]); __m256i aa2 = _mm256_set1_epi32(A2.m128i_i32[lane]); __m256i bb0 = _mm256_set1_epi32(B0.m128i_i32[lane]); __m256i bb1 = _mm256_set1_epi32(B1.m128i_i32[lane]); __m256i bb2 = _mm256_set1_epi32(B2.m128i_i32[lane]); __m256i aa0Inc = _mm256_slli_epi32(aa0, 2); __m256i aa1Inc = _mm256_slli_epi32(aa1, 2); __m256i aa2Inc = _mm256_slli_epi32(aa2, 2); __m256i bb0Inc = _mm256_slli_epi32(bb0, 1); __m256i bb1Inc = _mm256_slli_epi32(bb1, 1); __m256i bb2Inc = _mm256_slli_epi32(bb2, 1); __m256i row, col; // Traverse pixels in 2x4 blocks and store 2x4 pixel quad depths contiguously in memory ==> 2*X // This method provides better performance int rowIdx = (startYy * SCREENW + 2 * startXx); col = _mm256_add_epi32(colOffset, _mm256_set1_epi32(startXx)); __m256i aa0Col = _mm256_mullo_epi32(aa0, col); __m256i aa1Col = _mm256_mullo_epi32(aa1, col); __m256i aa2Col = _mm256_mullo_epi32(aa2, col); row = _mm256_add_epi32(rowOffset, _mm256_set1_epi32(startYy)); __m256i bb0Row = _mm256_add_epi32(_mm256_mullo_epi32(bb0, row), _mm256_set1_epi32(C0.m128i_i32[lane])); __m256i bb1Row = _mm256_add_epi32(_mm256_mullo_epi32(bb1, row), _mm256_set1_epi32(C1.m128i_i32[lane])); __m256i bb2Row = _mm256_add_epi32(_mm256_mullo_epi32(bb2, row), _mm256_set1_epi32(C2.m128i_i32[lane])); __m256i sum0Row = _mm256_add_epi32(aa0Col, bb0Row); __m256i sum1Row = _mm256_add_epi32(aa1Col, bb1Row); __m256i sum2Row = _mm256_add_epi32(aa2Col, bb2Row); __m256 zx = _mm256_mul_ps(_mm256_cvtepi32_ps(aa1Inc), zz[1]); zx = _mm256_add_ps(zx, _mm256_mul_ps(_mm256_cvtepi32_ps(aa2Inc), zz[2])); // Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY) for (int r = startYy; r < endYy; r += 2, rowIdx += 2 * SCREENW, sum0Row = _mm256_add_epi32(sum0Row, bb0Inc), sum1Row = _mm256_add_epi32(sum1Row, bb1Inc), sum2Row = _mm256_add_epi32(sum2Row, bb2Inc)) { // Compute barycentric coordinates int index = rowIdx; __m256i alpha = sum0Row; __m256i beta = sum1Row; __m256i gama = sum2Row; //Compute barycentric-interpolated depth __m256 depth = zz[0]; depth = _mm256_add_ps(depth, _mm256_mul_ps(_mm256_cvtepi32_ps(beta), zz[1])); depth = _mm256_add_ps(depth, _mm256_mul_ps(_mm256_cvtepi32_ps(gama), zz[2])); __m256i anyOut = _mm256_setzero_si256(); for (int c = startXx; c < endXx; c += 4, index += 8, alpha = _mm256_add_epi32(alpha, aa0Inc), beta = _mm256_add_epi32(beta, aa1Inc), gama = _mm256_add_epi32(gama, aa2Inc), depth = _mm256_add_ps(depth, zx)) { //Test Pixel inside triangle __m256i mask = _mm256_or_si256(_mm256_or_si256(alpha, beta), gama); __m256 previousDepthValue = _mm256_loadu_ps(&pDepthBuffer[index]); __m256 depthMask = _mm256_cmp_ps(depth, previousDepthValue, 0x1D); __m256i finalMask = _mm256_andnot_si256(mask, _mm256_castps_si256(depthMask)); anyOut = _mm256_or_si256(anyOut, finalMask); }//for each column if (!_mm256_testz_si256(anyOut, _mm256_set1_epi32(0x80000000))) { return true; //early exit } }// for each row }// for each triangle }// for each set of SIMD# triangles return false; }
__m128i test_mm_mullo_epi32(__m128i x, __m128i y) { // CHECK-LABEL: test_mm_mullo_epi32 // CHECK: mul <4 x i32> // CHECK-ASM: pmulld %xmm{{.*}}, %xmm{{.*}} return _mm_mullo_epi32(x, y); }
template<class T> inline void dequantise_sse4_2_16_8_3(QuantisationMatrix *qmatrix, int32_t *idata, void *_odata, int ostride) { T *odata = (T *)_odata; const int slice_width = 16; const int slice_height = 8; const int Y = 0; const int X = 0; const int N = 0; T * const optr = &odata[Y*slice_height*ostride + X*slice_width]; const int32_t * iptr = &idata[N*slice_height*slice_width]; { __m128i D0; { D0 = _mm_load_si128((__m128i *)&iptr[ 0]); // [ 0 1 2 3 ] (Q) __m128i QF = _mm_unpacklo_epi64(_mm_load_si128((__m128i *)&qmatrix->qfactor[0][0]), _mm_load_si128((__m128i *)&qmatrix->qfactor[1][1])); __m128i QO = _mm_unpacklo_epi64(_mm_load_si128((__m128i *)&qmatrix->qoffset[0][0]), _mm_load_si128((__m128i *)&qmatrix->qoffset[1][1])); __m128i X = _mm_abs_epi32(D0); X = _mm_mullo_epi32(X, QF); X = _mm_add_epi32(X, QO); X = _mm_srai_epi32(X, 2); D0 = _mm_sign_epi32(X, D0); D0 = _mm_shuffle_epi32(D0, 0xD8); } const __m128i D1 = LOAD_QUANTISED(&iptr[8], qmatrix, 2, 1); const __m128i D2 = LOAD_QUANTISED(&iptr[32], qmatrix, 3, 1); const __m128i D3 = LOAD_QUANTISED(&iptr[36], qmatrix, 3, 1); const __m128i A0 = _mm_unpacklo_epi32(D0, D1); const __m128i A1 = _mm_unpackhi_epi32(D0, D1); const __m128i B0 = _mm_unpacklo_epi32(A0, D2); const __m128i B1 = _mm_unpackhi_epi32(A0, D2); const __m128i B2 = _mm_unpacklo_epi32(A1, D3); const __m128i B3 = _mm_unpackhi_epi32(A1, D3); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[0*ostride + 0], B0, B1); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[0*ostride + 8], B2, B3); } { __m128i D0; { D0 = _mm_load_si128((__m128i *)&iptr[ 4]); __m128i QF = _mm_unpacklo_epi64(_mm_load_si128((__m128i *)&qmatrix->qfactor[1][2]), _mm_load_si128((__m128i *)&qmatrix->qfactor[1][3])); __m128i QO = _mm_unpacklo_epi64(_mm_load_si128((__m128i *)&qmatrix->qoffset[1][2]), _mm_load_si128((__m128i *)&qmatrix->qoffset[1][3])); __m128i X = _mm_abs_epi32(D0); X = _mm_mullo_epi32(X, QF); X = _mm_add_epi32(X, QO); X = _mm_srai_epi32(X, 2); D0 = _mm_sign_epi32(X, D0); D0 = _mm_shuffle_epi32(D0, 0xD8); } const __m128i D1 = LOAD_QUANTISED(&iptr[12], qmatrix, 2, 1); const __m128i D2 = LOAD_QUANTISED(&iptr[48], qmatrix, 3, 1); const __m128i D3 = LOAD_QUANTISED(&iptr[52], qmatrix, 3, 1); const __m128i A0 = _mm_unpacklo_epi32(D0, D1); const __m128i A1 = _mm_unpackhi_epi32(D0, D1); const __m128i B0 = _mm_unpacklo_epi32(A0, D2); const __m128i B1 = _mm_unpackhi_epi32(A0, D2); const __m128i B2 = _mm_unpacklo_epi32(A1, D3); const __m128i B3 = _mm_unpackhi_epi32(A1, D3); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[4*ostride + 0], B0, B1); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[4*ostride + 8], B2, B3); } { const __m128i D0 = LOAD_QUANTISED(&iptr[16], qmatrix, 2, 2); const __m128i D1 = LOAD_QUANTISED(&iptr[24], qmatrix, 2, 3); const __m128i D2 = LOAD_QUANTISED(&iptr[40], qmatrix, 3, 1); const __m128i D3 = LOAD_QUANTISED(&iptr[44], qmatrix, 3, 1); const __m128i A0 = _mm_unpacklo_epi32(D0, D1); const __m128i A1 = _mm_unpackhi_epi32(D0, D1); const __m128i B0 = _mm_unpacklo_epi32(A0, D2); const __m128i B1 = _mm_unpackhi_epi32(A0, D2); const __m128i B2 = _mm_unpacklo_epi32(A1, D3); const __m128i B3 = _mm_unpackhi_epi32(A1, D3); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[2*ostride + 0], B0, B1); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[2*ostride + 8], B2, B3); } { const __m128i D0 = LOAD_QUANTISED(&iptr[20], qmatrix, 2, 2); const __m128i D1 = LOAD_QUANTISED(&iptr[28], qmatrix, 2, 3); const __m128i D2 = LOAD_QUANTISED(&iptr[56], qmatrix, 3, 1); const __m128i D3 = LOAD_QUANTISED(&iptr[60], qmatrix, 3, 1); const __m128i A0 = _mm_unpacklo_epi32(D0, D1); 
const __m128i A1 = _mm_unpackhi_epi32(D0, D1); const __m128i B0 = _mm_unpacklo_epi32(A0, D2); const __m128i B1 = _mm_unpackhi_epi32(A0, D2); const __m128i B2 = _mm_unpacklo_epi32(A1, D3); const __m128i B3 = _mm_unpackhi_epi32(A1, D3); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[6*ostride + 0], B0, B1); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[6*ostride + 8], B2, B3); } for (int y = 0; y < 4; y++) { const __m128i D0 = LOAD_QUANTISED(&iptr[ 64 + y*8], qmatrix, 3, 2); const __m128i D1 = LOAD_QUANTISED(&iptr[ 68 + y*8], qmatrix, 3, 2); const __m128i D2 = LOAD_QUANTISED(&iptr[ 96 + y*8], qmatrix, 3, 3); const __m128i D3 = LOAD_QUANTISED(&iptr[100 + y*8], qmatrix, 3, 3); const __m128i A0 = _mm_unpacklo_epi32(D0, D2); const __m128i A1 = _mm_unpackhi_epi32(D0, D2); const __m128i A2 = _mm_unpacklo_epi32(D1, D3); const __m128i A3 = _mm_unpackhi_epi32(D1, D3); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[(2*y + 1)*ostride + 0], A0, A1); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[(2*y + 1)*ostride + 8], A2, A3); } }
template<> void dequantise_sse4_2<4,8,2, int32_t>(QuantisationMatrix *qmatrix, int32_t *idata, void *_odata, int ostride, int, int, int) { int32_t *odata = (int32_t *)_odata; const int slice_width = 4; const int slice_height = 8; const int Y = 0; const int X = 0; const int N = 0; int32_t * const optr = &odata[Y*slice_height*ostride + X*slice_width]; const int32_t * iptr = &idata[N*slice_height*slice_width]; __m128i D0; { D0 = _mm_load_si128((__m128i *)&iptr[ 0]); // [ 0 1 2 3 ] (Q) __m128i QF = _mm_unpacklo_epi64(_mm_load_si128((__m128i *)&qmatrix->qfactor[0][0]), _mm_load_si128((__m128i *)&qmatrix->qfactor[1][1])); __m128i QO = _mm_unpacklo_epi64(_mm_load_si128((__m128i *)&qmatrix->qoffset[0][0]), _mm_load_si128((__m128i *)&qmatrix->qoffset[1][1])); __m128i X = _mm_abs_epi32(D0); X = _mm_mullo_epi32(X, QF); X = _mm_add_epi32(X, QO); X = _mm_srai_epi32(X, 2); D0 = _mm_sign_epi32(X, D0); } __m128i D4; { D4 = _mm_load_si128((__m128i *)&iptr[ 4]); // [ 4 5 6 7 ] (Q) __m128i QF = _mm_unpacklo_epi64(_mm_load_si128((__m128i *)&qmatrix->qfactor[1][2]), _mm_load_si128((__m128i *)&qmatrix->qfactor[1][3])); __m128i QO = _mm_unpacklo_epi64(_mm_load_si128((__m128i *)&qmatrix->qoffset[1][2]), _mm_load_si128((__m128i *)&qmatrix->qoffset[1][3])); __m128i X = _mm_abs_epi32(D4); X = _mm_mullo_epi32(X, QF); X = _mm_add_epi32(X, QO); X = _mm_srai_epi32(X, 2); D4 = _mm_sign_epi32(X, D4); } const __m128i D8 = LOAD_QUANTISED(&iptr[ 8], qmatrix, 2, 1); // [ 8 9 10 11 ] const __m128i D12 = LOAD_QUANTISED(&iptr[12], qmatrix, 2, 1); // [ 12 13 14 15 ] const __m128i D16 = LOAD_QUANTISED(&iptr[16], qmatrix, 2, 2); // [ 16 17 18 19 ] const __m128i D20 = LOAD_QUANTISED(&iptr[20], qmatrix, 2, 2); // [ 20 21 22 23 ] const __m128i D24 = LOAD_QUANTISED(&iptr[24], qmatrix, 2, 3); // [ 24 25 26 27 ] const __m128i D28 = LOAD_QUANTISED(&iptr[28], qmatrix, 2, 3); // [ 28 29 30 31 ] const __m128i X0 = _mm_unpacklo_epi32(D0, D4); // [ 0 4 1 5 ] const __m128i X1 = _mm_unpackhi_epi32(D0, D4); // [ 2 6 3 7 ] const __m128i Y0 = _mm_unpacklo_epi32(X0, X1); // [ 0 2 4 6 ] const __m128i Y1 = _mm_unpackhi_epi32(X0, X1); // [ 1 3 5 7 ] const __m128i Z0 = _mm_unpacklo_epi32(Y0, D8); // [ 0 8 2 9 ] _mm_store_si128((__m128i *)&optr[0*ostride + 0], Z0); const __m128i Z1 = _mm_unpackhi_epi32(Y0, D8); // [ 4 10 6 11 ] _mm_store_si128((__m128i *)&optr[2*ostride + 0], Z1); const __m128i Z2 = _mm_unpacklo_epi32(Y1, D12); // [ 1 12 3 13 ] _mm_store_si128((__m128i *)&optr[4*ostride + 0], Z2); const __m128i Z3 = _mm_unpackhi_epi32(Y1, D12); // [ 5 14 7 15 ] _mm_store_si128((__m128i *)&optr[6*ostride + 0], Z3); const __m128i W0 = _mm_unpacklo_epi32(D16, D24);// [ 16 24 17 25 ] _mm_store_si128((__m128i *)&optr[1*ostride + 0], W0); const __m128i W1 = _mm_unpackhi_epi32(D16, D24);// [ 18 26 19 27 ] _mm_store_si128((__m128i *)&optr[3*ostride + 0], W1); const __m128i W2 = _mm_unpacklo_epi32(D20, D28);// [ 20 28 21 29 ] _mm_store_si128((__m128i *)&optr[5*ostride + 0], W2); const __m128i W3 = _mm_unpackhi_epi32(D20, D28);// [ 22 30 23 31 ] _mm_store_si128((__m128i *)&optr[7*ostride + 0], W3); }
__m128i test_mm_mullo_epi32(__m128i x, __m128i y) { // CHECK-LABEL: test_mm_mullo_epi32 // CHECK: mul <4 x i32> return _mm_mullo_epi32(x, y); }
/***************************************************************************** * This function utilises 3 properties of the cost function lookup tables, * * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in * * vp9_encoder.c. * * For the joint cost: * * - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3] * * For the component costs: * * - For all i: mvsadcost[0][i] == mvsadcost[1][i] * * (Equal costs for both components) * * - For all i: mvsadcost[0][i] == mvsadcost[0][-i] * * (Cost function is even) * * If these do not hold, then this function cannot be used without * * modification, in which case you can revert to using the C implementation, * * which does not rely on these properties. * *****************************************************************************/ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, const vp9_variance_fn_ptr_t *fn_ptr, const MV *center_mv) { const int_mv maxmv = pack_int_mv(x->mv_row_max, x->mv_col_max); const __m128i v_max_mv_w = _mm_set1_epi32(maxmv.as_int); const int_mv minmv = pack_int_mv(x->mv_row_min, x->mv_col_min); const __m128i v_min_mv_w = _mm_set1_epi32(minmv.as_int); const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit); const __m128i v_joint_cost_0_d = _mm_set1_epi32(x->nmvjointsadcost[0]); const __m128i v_joint_cost_1_d = _mm_set1_epi32(x->nmvjointsadcost[1]); // search_param determines the length of the initial step and hence the number // of iterations. // 0 = initial step (MAX_FIRST_STEP) pel // 1 = (MAX_FIRST_STEP/2) pel, // 2 = (MAX_FIRST_STEP/4) pel... const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param]; const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param]; const int tot_steps = cfg->total_steps - search_param; const int_mv fcenter_mv = pack_int_mv(center_mv->row >> 3, center_mv->col >> 3); const __m128i vfcmv = _mm_set1_epi32(fcenter_mv.as_int); const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row); const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col); int_mv bmv = pack_int_mv(ref_row, ref_col); int_mv new_bmv = bmv; __m128i v_bmv_w = _mm_set1_epi32(bmv.as_int); const int what_stride = x->plane[0].src.stride; const int in_what_stride = x->e_mbd.plane[0].pre[0].stride; const uint8_t *const what = x->plane[0].src.buf; const uint8_t *const in_what = x->e_mbd.plane[0].pre[0].buf + ref_row * in_what_stride + ref_col; // Work out the start point for the search const uint8_t *best_address = in_what; const uint8_t *new_best_address = best_address; #if ARCH_X86_64 __m128i v_ba_q = _mm_set1_epi64x((intptr_t)best_address); #else __m128i v_ba_d = _mm_set1_epi32((intptr_t)best_address); #endif unsigned int best_sad; int i; int j; int step; // Check the prerequisite cost function properties that are easy to check // in an assert. See the function-level documentation for details on all // prerequisites. 
  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]);
  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]);

  // Check the starting position
  best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride);
  best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit);

  *num00 = 0;

  for (i = 0, step = 0; step < tot_steps; step++) {
    for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) {
      __m128i v_sad_d;
      __m128i v_cost_d;
      __m128i v_outside_d;
      __m128i v_inside_d;
      __m128i v_diff_mv_w;
#if ARCH_X86_64
      __m128i v_blocka[2];
#else
      __m128i v_blocka[1];
#endif

      // Compute the candidate motion vectors
      const __m128i v_ss_mv_w = _mm_loadu_si128((const __m128i*)&ss_mv[i]);
      const __m128i v_these_mv_w = _mm_add_epi16(v_bmv_w, v_ss_mv_w);

      // Clamp them to the search bounds
      __m128i v_these_mv_clamp_w = v_these_mv_w;
      v_these_mv_clamp_w = _mm_min_epi16(v_these_mv_clamp_w, v_max_mv_w);
      v_these_mv_clamp_w = _mm_max_epi16(v_these_mv_clamp_w, v_min_mv_w);

      // The ones that did not change are inside the search area
      v_inside_d = _mm_cmpeq_epi32(v_these_mv_clamp_w, v_these_mv_w);

      // If none of them are inside, then move on
      if (__likely__(_mm_test_all_zeros(v_inside_d, v_inside_d))) {
        continue;
      }

      // The inverse mask indicates which of the MVs are outside
      v_outside_d = _mm_xor_si128(v_inside_d, _mm_set1_epi8(0xff));
      // Shift right to keep the sign bit clear, we will use this later
      // to set the cost to the maximum value.
      v_outside_d = _mm_srli_epi32(v_outside_d, 1);

      // Compute the difference MV
      v_diff_mv_w = _mm_sub_epi16(v_these_mv_clamp_w, vfcmv);
      // We utilise the fact that the cost function is even, and use the
      // absolute difference. This allows us to use unsigned indexes later
      // and reduces cache pressure somewhat as only half of the table
      // is ever referenced.
      v_diff_mv_w = _mm_abs_epi16(v_diff_mv_w);

      // Compute the SIMD pointer offsets.
      {
#if ARCH_X86_64  // sizeof(intptr_t) == 8
        // Load the offsets
        __m128i v_bo10_q = _mm_loadu_si128((const __m128i*)&ss_os[i + 0]);
        __m128i v_bo32_q = _mm_loadu_si128((const __m128i*)&ss_os[i + 2]);
        // Set the ones falling outside to zero
        v_bo10_q = _mm_and_si128(v_bo10_q, _mm_cvtepi32_epi64(v_inside_d));
        v_bo32_q = _mm_and_si128(v_bo32_q,
                                 _mm_unpackhi_epi32(v_inside_d, v_inside_d));
        // Compute the candidate addresses
        v_blocka[0] = _mm_add_epi64(v_ba_q, v_bo10_q);
        v_blocka[1] = _mm_add_epi64(v_ba_q, v_bo32_q);
#else  // ARCH_X86 // sizeof(intptr_t) == 4
        __m128i v_bo_d = _mm_loadu_si128((const __m128i*)&ss_os[i]);
        v_bo_d = _mm_and_si128(v_bo_d, v_inside_d);
        v_blocka[0] = _mm_add_epi32(v_ba_d, v_bo_d);
#endif
      }

      fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0],
                     in_what_stride, (uint32_t*)&v_sad_d);

      // Look up the component cost of the residual motion vector
      {
        const int32_t row0 = _mm_extract_epi16(v_diff_mv_w, 0);
        const int32_t col0 = _mm_extract_epi16(v_diff_mv_w, 1);
        const int32_t row1 = _mm_extract_epi16(v_diff_mv_w, 2);
        const int32_t col1 = _mm_extract_epi16(v_diff_mv_w, 3);
        const int32_t row2 = _mm_extract_epi16(v_diff_mv_w, 4);
        const int32_t col2 = _mm_extract_epi16(v_diff_mv_w, 5);
        const int32_t row3 = _mm_extract_epi16(v_diff_mv_w, 6);
        const int32_t col3 = _mm_extract_epi16(v_diff_mv_w, 7);

        // Note: This is a use case for vpgather in AVX2
        const uint32_t cost0 = x->nmvsadcost[0][row0] + x->nmvsadcost[0][col0];
        const uint32_t cost1 = x->nmvsadcost[0][row1] + x->nmvsadcost[0][col1];
        const uint32_t cost2 = x->nmvsadcost[0][row2] + x->nmvsadcost[0][col2];
        const uint32_t cost3 = x->nmvsadcost[0][row3] + x->nmvsadcost[0][col3];

        __m128i v_cost_10_d, v_cost_32_d;

        v_cost_10_d = _mm_cvtsi32_si128(cost0);
        v_cost_10_d = _mm_insert_epi32(v_cost_10_d, cost1, 1);

        v_cost_32_d = _mm_cvtsi32_si128(cost2);
        v_cost_32_d = _mm_insert_epi32(v_cost_32_d, cost3, 1);

        v_cost_d = _mm_unpacklo_epi64(v_cost_10_d, v_cost_32_d);
      }

      // Now add in the joint cost
      {
        const __m128i v_sel_d = _mm_cmpeq_epi32(v_diff_mv_w,
                                                _mm_setzero_si128());
        const __m128i v_joint_cost_d = _mm_blendv_epi8(v_joint_cost_1_d,
                                                       v_joint_cost_0_d,
                                                       v_sel_d);
        v_cost_d = _mm_add_epi32(v_cost_d, v_joint_cost_d);
      }

      // Multiply by sad_per_bit
      v_cost_d = _mm_mullo_epi32(v_cost_d, v_spb_d);
      // ROUND_POWER_OF_TWO(v_cost_d, 8)
      v_cost_d = _mm_add_epi32(v_cost_d, _mm_set1_epi32(0x80));
      v_cost_d = _mm_srai_epi32(v_cost_d, 8);
      // Add the cost to the sad
      v_sad_d = _mm_add_epi32(v_sad_d, v_cost_d);

      // Make the motion vectors outside the search area have max cost
      // by or'ing in the comparison mask, this way the minimum search won't
      // pick them.
      v_sad_d = _mm_or_si128(v_sad_d, v_outside_d);

      // Find the minimum value and index horizontally in v_sad_d
      {
        // Try speculatively on 16 bits, so we can use the minpos intrinsic
        const __m128i v_sad_w = _mm_packus_epi32(v_sad_d, v_sad_d);
        const __m128i v_minp_w = _mm_minpos_epu16(v_sad_w);

        uint32_t local_best_sad = _mm_extract_epi16(v_minp_w, 0);
        uint32_t local_best_idx = _mm_extract_epi16(v_minp_w, 1);

        // If the local best value is not saturated, just use it, otherwise
        // find the horizontal minimum again the hard way on 32 bits.
        // This is executed rarely.
        if (__unlikely__(local_best_sad == 0xffff)) {
          __m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d;

          v_loval_d = v_sad_d;
          v_loidx_d = _mm_set_epi32(3, 2, 1, 0);
          v_hival_d = _mm_srli_si128(v_loval_d, 8);
          v_hiidx_d = _mm_srli_si128(v_loidx_d, 8);

          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);

          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);
          v_hival_d = _mm_srli_si128(v_loval_d, 4);
          v_hiidx_d = _mm_srli_si128(v_loidx_d, 4);

          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);

          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);

          local_best_sad = _mm_extract_epi32(v_loval_d, 0);
          local_best_idx = _mm_extract_epi32(v_loidx_d, 0);
        }

        // Update the global minimum if the local minimum is smaller
        if (__likely__(local_best_sad < best_sad)) {
          new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx];
          new_best_address = ((const uint8_t **)v_blocka)[local_best_idx];

          best_sad = local_best_sad;
        }
      }
    }

    bmv = new_bmv;
    best_address = new_best_address;

    v_bmv_w = _mm_set1_epi32(bmv.as_int);
#if ARCH_X86_64
    v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
#else
    v_ba_d = _mm_set1_epi32((intptr_t)best_address);
#endif

    if (__unlikely__(best_address == in_what)) {
      (*num00)++;
    }
  }

  *best_mv = bmv.as_mv;
  return best_sad;
}
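/* The minimum search at the end of the loop above is worth isolating: the
 * four 32-bit SAD+cost values are packed down to 16 bits with saturation so
 * a single _mm_minpos_epu16 returns both the minimum and its lane index,
 * and the blend-based 32-bit reduction only runs when the packed minimum
 * saturates to 0xffff. A minimal standalone sketch of the same trick
 * follows; the helper name hmin_epu32_idx is ours (not from libvpx), and
 * it assumes every lane has its sign bit clear, which the search above
 * guarantees by shifting the outside mask right by one. */
#include <smmintrin.h>  /* SSE4.1 */
#include <stdint.h>

static uint32_t hmin_epu32_idx(__m128i v, uint32_t *idx) {
  /* Saturating pack: any lane >= 0xffff becomes exactly 0xffff. Both packed
   * halves are identical, so the index minpos returns is always 0..3. */
  const __m128i v_w = _mm_packus_epi32(v, v);
  const __m128i v_minp_w = _mm_minpos_epu16(v_w);
  uint32_t best = _mm_extract_epi16(v_minp_w, 0);
  uint32_t best_idx = _mm_extract_epi16(v_minp_w, 1);
  if (best == 0xffff) {
    /* The 16-bit result saturated; redo the reduction on 32 bits. */
    uint32_t lane[4];
    int i;
    _mm_storeu_si128((__m128i *)lane, v);
    best = lane[0];
    best_idx = 0;
    for (i = 1; i < 4; i++) {
      if (lane[i] < best) {
        best = lane[i];
        best_idx = i;
      }
    }
  }
  *idx = best_idx;
  return best;
}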
/* Compute reflection coefficients from input signal */
void silk_burg_modified_sse4_1(
    opus_int32       *res_nrg,          /* O  Residual energy                                          */
    opus_int         *res_nrg_Q,        /* O  Residual energy Q value                                  */
    opus_int32       A_Q16[],           /* O  Prediction coefficients (length order)                   */
    const opus_int16 x[],               /* I  Input signal, length: nb_subfr * (D + subfr_length)     */
    const opus_int32 minInvGain_Q30,    /* I  Inverse of max prediction gain                           */
    const opus_int   subfr_length,      /* I  Input signal subframe length (incl. D preceding samples) */
    const opus_int   nb_subfr,          /* I  Number of subframes stacked in x                         */
    const opus_int   D,                 /* I  Order                                                    */
    int              arch               /* I  Run-time architecture                                    */
)
{
    opus_int         k, n, s, lz, rshifts, rshifts_extra, reached_max_gain;
    opus_int32       C0, num, nrg, rc_Q31, invGain_Q30, Atmp_QA, Atmp1, tmp1, tmp2, x1, x2;
    const opus_int16 *x_ptr;
    opus_int32       C_first_row[ SILK_MAX_ORDER_LPC ];
    opus_int32       C_last_row[ SILK_MAX_ORDER_LPC ];
    opus_int32       Af_QA[ SILK_MAX_ORDER_LPC ];
    opus_int32       CAf[ SILK_MAX_ORDER_LPC + 1 ];
    opus_int32       CAb[ SILK_MAX_ORDER_LPC + 1 ];
    opus_int32       xcorr[ SILK_MAX_ORDER_LPC ];

    __m128i FIRST_3210, LAST_3210, ATMP_3210, TMP1_3210, TMP2_3210, T1_3210, T2_3210, PTR_3210, SUBFR_3210, X1_3210, X2_3210;
    __m128i CONST1 = _mm_set1_epi32(1);

    silk_assert(subfr_length * nb_subfr <= MAX_FRAME_SIZE);

    /* Compute autocorrelations, added over subframes */
    silk_sum_sqr_shift(&C0, &rshifts, x, nb_subfr * subfr_length);
    if(rshifts > MAX_RSHIFTS) {
        C0 = silk_LSHIFT32(C0, rshifts - MAX_RSHIFTS);
        silk_assert(C0 > 0);
        rshifts = MAX_RSHIFTS;
    } else {
        lz = silk_CLZ32(C0) - 1;
        rshifts_extra = N_BITS_HEAD_ROOM - lz;
        if(rshifts_extra > 0) {
            rshifts_extra = silk_min(rshifts_extra, MAX_RSHIFTS - rshifts);
            C0 = silk_RSHIFT32(C0, rshifts_extra);
        } else {
            rshifts_extra = silk_max(rshifts_extra, MIN_RSHIFTS - rshifts);
            C0 = silk_LSHIFT32(C0, -rshifts_extra);
        }
        rshifts += rshifts_extra;
    }
    CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL(SILK_FIX_CONST(FIND_LPC_COND_FAC, 32), C0) + 1;    /* Q(-rshifts) */
    silk_memset(C_first_row, 0, SILK_MAX_ORDER_LPC * sizeof(opus_int32));
    if(rshifts > 0) {
        for(s = 0; s < nb_subfr; s++) {
            x_ptr = x + s * subfr_length;
            for(n = 1; n < D + 1; n++) {
                C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64(
                    silk_inner_prod16_aligned_64(x_ptr, x_ptr + n, subfr_length - n, arch), rshifts);
            }
        }
    } else {
        for(s = 0; s < nb_subfr; s++) {
            int i;
            opus_int32 d;
            x_ptr = x + s * subfr_length;
            celt_pitch_xcorr(x_ptr, x_ptr + 1, xcorr, subfr_length - D, D, arch);
            for(n = 1; n < D + 1; n++) {
                for (i = n + subfr_length - D, d = 0; i < subfr_length; i++)
                    d = MAC16_16(d, x_ptr[ i ], x_ptr[ i - n ]);
                xcorr[ n - 1 ] += d;
            }
            for(n = 1; n < D + 1; n++) {
                C_first_row[ n - 1 ] += silk_LSHIFT32(xcorr[ n - 1 ], -rshifts);
            }
        }
    }
    silk_memcpy(C_last_row, C_first_row, SILK_MAX_ORDER_LPC * sizeof(opus_int32));

    /* Initialize */
    CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL(SILK_FIX_CONST(FIND_LPC_COND_FAC, 32), C0) + 1;    /* Q(-rshifts) */

    invGain_Q30 = (opus_int32)1 << 30;
    reached_max_gain = 0;
    for(n = 0; n < D; n++) {
        /* Update first row of correlation matrix (without first element) */
        /* Update last row of correlation matrix (without last element, stored in reversed order) */
        /* Update C * Af */
        /* Update C * flipud(Af) (stored in reversed order) */
        if(rshifts > -2) {
            for(s = 0; s < nb_subfr; s++) {
                x_ptr = x + s * subfr_length;
                x1 = -silk_LSHIFT32((opus_int32)x_ptr[ n ], 16 - rshifts);                   /* Q(16-rshifts) */
                x2 = -silk_LSHIFT32((opus_int32)x_ptr[ subfr_length - n - 1 ], 16 - rshifts);/* Q(16-rshifts) */
                tmp1 = silk_LSHIFT32((opus_int32)x_ptr[ n ], QA - 16);                       /* Q(QA-16) */
                tmp2 = silk_LSHIFT32((opus_int32)x_ptr[ subfr_length - n - 1 ], QA - 16);    /* Q(QA-16) */
                for(k = 0; k < n; k++) {
                    C_first_row[ k ] = silk_SMLAWB(C_first_row[ k ], x1, x_ptr[ n - k - 1 ]);           /* Q(-rshifts) */
                    C_last_row[ k ]  = silk_SMLAWB(C_last_row[ k ],  x2, x_ptr[ subfr_length - n + k ]);/* Q(-rshifts) */
                    Atmp_QA = Af_QA[ k ];
                    tmp1 = silk_SMLAWB(tmp1, Atmp_QA, x_ptr[ n - k - 1 ]);                   /* Q(QA-16) */
                    tmp2 = silk_SMLAWB(tmp2, Atmp_QA, x_ptr[ subfr_length - n + k ]);        /* Q(QA-16) */
                }
                tmp1 = silk_LSHIFT32(-tmp1, 32 - QA - rshifts);                              /* Q(16-rshifts) */
                tmp2 = silk_LSHIFT32(-tmp2, 32 - QA - rshifts);                              /* Q(16-rshifts) */
                for(k = 0; k <= n; k++) {
                    CAf[ k ] = silk_SMLAWB(CAf[ k ], tmp1, x_ptr[ n - k ]);                  /* Q(-rshift) */
                    CAb[ k ] = silk_SMLAWB(CAb[ k ], tmp2, x_ptr[ subfr_length - n + k - 1 ]);/* Q(-rshift) */
                }
            }
        } else {
            for(s = 0; s < nb_subfr; s++) {
                x_ptr = x + s * subfr_length;
                x1 = -silk_LSHIFT32((opus_int32)x_ptr[ n ], -rshifts);                       /* Q(-rshifts) */
                x2 = -silk_LSHIFT32((opus_int32)x_ptr[ subfr_length - n - 1 ], -rshifts);    /* Q(-rshifts) */
                tmp1 = silk_LSHIFT32((opus_int32)x_ptr[ n ], 17);                            /* Q17 */
                tmp2 = silk_LSHIFT32((opus_int32)x_ptr[ subfr_length - n - 1 ], 17);         /* Q17 */

                X1_3210 = _mm_set1_epi32(x1);
                X2_3210 = _mm_set1_epi32(x2);
                TMP1_3210 = _mm_setzero_si128();
                TMP2_3210 = _mm_setzero_si128();
                for(k = 0; k < n - 3; k += 4) {
                    PTR_3210   = OP_CVTEPI16_EPI32_M64(&x_ptr[ n - k - 1 - 3 ]);
                    SUBFR_3210 = OP_CVTEPI16_EPI32_M64(&x_ptr[ subfr_length - n + k ]);
                    FIRST_3210 = _mm_loadu_si128((__m128i *)&C_first_row[ k ]);
                    PTR_3210   = _mm_shuffle_epi32(PTR_3210, _MM_SHUFFLE(0, 1, 2, 3));
                    LAST_3210  = _mm_loadu_si128((__m128i *)&C_last_row[ k ]);
                    ATMP_3210  = _mm_loadu_si128((__m128i *)&Af_QA[ k ]);

                    T1_3210 = _mm_mullo_epi32(PTR_3210, X1_3210);
                    T2_3210 = _mm_mullo_epi32(SUBFR_3210, X2_3210);

                    ATMP_3210 = _mm_srai_epi32(ATMP_3210, 7);
                    ATMP_3210 = _mm_add_epi32(ATMP_3210, CONST1);
                    ATMP_3210 = _mm_srai_epi32(ATMP_3210, 1);

                    FIRST_3210 = _mm_add_epi32(FIRST_3210, T1_3210);
                    LAST_3210  = _mm_add_epi32(LAST_3210, T2_3210);

                    PTR_3210   = _mm_mullo_epi32(ATMP_3210, PTR_3210);
                    SUBFR_3210 = _mm_mullo_epi32(ATMP_3210, SUBFR_3210);

                    _mm_storeu_si128((__m128i *)&C_first_row[ k ], FIRST_3210);
                    _mm_storeu_si128((__m128i *)&C_last_row[ k ],  LAST_3210);

                    TMP1_3210 = _mm_add_epi32(TMP1_3210, PTR_3210);
                    TMP2_3210 = _mm_add_epi32(TMP2_3210, SUBFR_3210);
                }

                TMP1_3210 = _mm_add_epi32(TMP1_3210, _mm_unpackhi_epi64(TMP1_3210, TMP1_3210));
                TMP2_3210 = _mm_add_epi32(TMP2_3210, _mm_unpackhi_epi64(TMP2_3210, TMP2_3210));
                TMP1_3210 = _mm_add_epi32(TMP1_3210, _mm_shufflelo_epi16(TMP1_3210, 0x0E));
                TMP2_3210 = _mm_add_epi32(TMP2_3210, _mm_shufflelo_epi16(TMP2_3210, 0x0E));

                tmp1 += _mm_cvtsi128_si32(TMP1_3210);
                tmp2 += _mm_cvtsi128_si32(TMP2_3210);

                for(; k < n; k++) {
                    C_first_row[ k ] = silk_MLA(C_first_row[ k ], x1, x_ptr[ n - k - 1 ]);            /* Q(-rshifts) */
                    C_last_row[ k ]  = silk_MLA(C_last_row[ k ],  x2, x_ptr[ subfr_length - n + k ]); /* Q(-rshifts) */
                    Atmp1 = silk_RSHIFT_ROUND(Af_QA[ k ], QA - 17);                                   /* Q17 */
                    tmp1 = silk_MLA(tmp1, x_ptr[ n - k - 1 ], Atmp1);                                 /* Q17 */
                    tmp2 = silk_MLA(tmp2, x_ptr[ subfr_length - n + k ], Atmp1);                      /* Q17 */
                }

                tmp1 = -tmp1;    /* Q17 */
                tmp2 = -tmp2;    /* Q17 */

                {
                    __m128i xmm_tmp1, xmm_tmp2;
                    __m128i xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1;
                    __m128i xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1;

                    xmm_tmp1 = _mm_set1_epi32(tmp1);
                    xmm_tmp2 = _mm_set1_epi32(tmp2);

                    for(k = 0; k <= n - 3; k += 4) {
                        xmm_x_ptr_n_k_x2x0 = OP_CVTEPI16_EPI32_M64(&x_ptr[ n - k - 3 ]);
                        xmm_x_ptr_sub_x2x0 = OP_CVTEPI16_EPI32_M64(&x_ptr[ subfr_length - n + k - 1 ]);

                        xmm_x_ptr_n_k_x2x0 = _mm_shuffle_epi32(xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE(0, 1, 2, 3));

                        xmm_x_ptr_n_k_x2x0 = _mm_slli_epi32(xmm_x_ptr_n_k_x2x0, -rshifts - 1);
                        xmm_x_ptr_sub_x2x0 = _mm_slli_epi32(xmm_x_ptr_sub_x2x0, -rshifts - 1);

                        /* equal shift right 4 bytes, xmm_x_ptr_n_k_x3x1 = _mm_srli_si128(xmm_x_ptr_n_k_x2x0, 4) */
                        xmm_x_ptr_n_k_x3x1 = _mm_shuffle_epi32(xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE(0, 3, 2, 1));
                        xmm_x_ptr_sub_x3x1 = _mm_shuffle_epi32(xmm_x_ptr_sub_x2x0, _MM_SHUFFLE(0, 3, 2, 1));

                        xmm_x_ptr_n_k_x2x0 = _mm_mul_epi32(xmm_x_ptr_n_k_x2x0, xmm_tmp1);
                        xmm_x_ptr_n_k_x3x1 = _mm_mul_epi32(xmm_x_ptr_n_k_x3x1, xmm_tmp1);
                        xmm_x_ptr_sub_x2x0 = _mm_mul_epi32(xmm_x_ptr_sub_x2x0, xmm_tmp2);
                        xmm_x_ptr_sub_x3x1 = _mm_mul_epi32(xmm_x_ptr_sub_x3x1, xmm_tmp2);

                        xmm_x_ptr_n_k_x2x0 = _mm_srli_epi64(xmm_x_ptr_n_k_x2x0, 16);
                        xmm_x_ptr_n_k_x3x1 = _mm_slli_epi64(xmm_x_ptr_n_k_x3x1, 16);
                        xmm_x_ptr_sub_x2x0 = _mm_srli_epi64(xmm_x_ptr_sub_x2x0, 16);
                        xmm_x_ptr_sub_x3x1 = _mm_slli_epi64(xmm_x_ptr_sub_x3x1, 16);

                        xmm_x_ptr_n_k_x2x0 = _mm_blend_epi16(xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1, 0xCC);
                        xmm_x_ptr_sub_x2x0 = _mm_blend_epi16(xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1, 0xCC);

                        X1_3210  = _mm_loadu_si128((__m128i *)&CAf[ k ]);
                        PTR_3210 = _mm_loadu_si128((__m128i *)&CAb[ k ]);

                        X1_3210  = _mm_add_epi32(X1_3210, xmm_x_ptr_n_k_x2x0);
                        PTR_3210 = _mm_add_epi32(PTR_3210, xmm_x_ptr_sub_x2x0);

                        _mm_storeu_si128((__m128i *)&CAf[ k ], X1_3210);
                        _mm_storeu_si128((__m128i *)&CAb[ k ], PTR_3210);
                    }

                    for(; k <= n; k++) {
                        CAf[ k ] = silk_SMLAWW(CAf[ k ], tmp1,
                            silk_LSHIFT32((opus_int32)x_ptr[ n - k ], -rshifts - 1));                    /* Q(-rshift) */
                        CAb[ k ] = silk_SMLAWW(CAb[ k ], tmp2,
                            silk_LSHIFT32((opus_int32)x_ptr[ subfr_length - n + k - 1 ], -rshifts - 1)); /* Q(-rshift) */
                    }
                }
            }
        }

        /* Calculate numerator and denominator for the next order reflection (parcor) coefficient */
        tmp1 = C_first_row[ n ];                                                             /* Q(-rshifts) */
        tmp2 = C_last_row[ n ];                                                              /* Q(-rshifts) */
        num  = 0;                                                                            /* Q(-rshifts) */
        nrg  = silk_ADD32(CAb[ 0 ], CAf[ 0 ]);                                               /* Q(1-rshifts) */
        for(k = 0; k < n; k++) {
            Atmp_QA = Af_QA[ k ];
            lz = silk_CLZ32(silk_abs(Atmp_QA)) - 1;
            lz = silk_min(32 - QA, lz);
            Atmp1 = silk_LSHIFT32(Atmp_QA, lz);                                              /* Q(QA + lz) */
            tmp1 = silk_ADD_LSHIFT32(tmp1, silk_SMMUL(C_last_row[ n - k - 1 ],  Atmp1), 32 - QA - lz); /* Q(-rshifts) */
            tmp2 = silk_ADD_LSHIFT32(tmp2, silk_SMMUL(C_first_row[ n - k - 1 ], Atmp1), 32 - QA - lz); /* Q(-rshifts) */
            num  = silk_ADD_LSHIFT32(num,  silk_SMMUL(CAb[ n - k ],             Atmp1), 32 - QA - lz); /* Q(-rshifts) */
            nrg  = silk_ADD_LSHIFT32(nrg,  silk_SMMUL(silk_ADD32(CAb[ k + 1 ], CAf[ k + 1 ]),
                                                      Atmp1), 32 - QA - lz);                 /* Q(1-rshifts) */
        }
        CAf[ n + 1 ] = tmp1;                                                                 /* Q(-rshifts) */
        CAb[ n + 1 ] = tmp2;                                                                 /* Q(-rshifts) */
        num = silk_ADD32(num, tmp2);                                                         /* Q(-rshifts) */
        num = silk_LSHIFT32(-num, 1);                                                        /* Q(1-rshifts) */

        /* Calculate the next order reflection (parcor) coefficient */
        if(silk_abs(num) < nrg) {
            rc_Q31 = silk_DIV32_varQ(num, nrg, 31);
        } else {
            rc_Q31 = (num > 0) ? silk_int32_MAX : silk_int32_MIN;
        }

        /* Update inverse prediction gain */
        tmp1 = ((opus_int32)1 << 30) - silk_SMMUL(rc_Q31, rc_Q31);
        tmp1 = silk_LSHIFT(silk_SMMUL(invGain_Q30, tmp1), 2);
        if(tmp1 <= minInvGain_Q30) {
            /* Max prediction gain exceeded; set reflection coefficient such that max prediction gain is exactly hit */
            tmp2 = ((opus_int32)1 << 30) - silk_DIV32_varQ(minInvGain_Q30, invGain_Q30, 30); /* Q30 */
            rc_Q31 = silk_SQRT_APPROX(tmp2);                                                 /* Q15 */
            /* Newton-Raphson iteration */
            rc_Q31 = silk_RSHIFT32(rc_Q31 + silk_DIV32(tmp2, rc_Q31), 1);                    /* Q15 */
            rc_Q31 = silk_LSHIFT32(rc_Q31, 16);                                              /* Q31 */
            if(num < 0) {
                /* Ensure the adjusted reflection coefficient has the original sign */
                rc_Q31 = -rc_Q31;
            }
            invGain_Q30 = minInvGain_Q30;
            reached_max_gain = 1;
        } else {
            invGain_Q30 = tmp1;
        }

        /* Update the AR coefficients */
        for(k = 0; k < (n + 1) >> 1; k++) {
            tmp1 = Af_QA[ k ];                                                               /* QA */
            tmp2 = Af_QA[ n - k - 1 ];                                                       /* QA */
            Af_QA[ k ]         = silk_ADD_LSHIFT32(tmp1, silk_SMMUL(tmp2, rc_Q31), 1);       /* QA */
            Af_QA[ n - k - 1 ] = silk_ADD_LSHIFT32(tmp2, silk_SMMUL(tmp1, rc_Q31), 1);       /* QA */
        }
        Af_QA[ n ] = silk_RSHIFT32(rc_Q31, 31 - QA);                                         /* QA */

        if(reached_max_gain) {
            /* Reached max prediction gain; set remaining coefficients to zero and exit loop */
            for(k = n + 1; k < D; k++) {
                Af_QA[ k ] = 0;
            }
            break;
        }

        /* Update C * Af and C * Ab */
        for(k = 0; k <= n + 1; k++) {
            tmp1 = CAf[ k ];                                                                 /* Q(-rshifts) */
            tmp2 = CAb[ n - k + 1 ];                                                         /* Q(-rshifts) */
            CAf[ k ]         = silk_ADD_LSHIFT32(tmp1, silk_SMMUL(tmp2, rc_Q31), 1);         /* Q(-rshifts) */
            CAb[ n - k + 1 ] = silk_ADD_LSHIFT32(tmp2, silk_SMMUL(tmp1, rc_Q31), 1);         /* Q(-rshifts) */
        }
    }

    if(reached_max_gain) {
        for(k = 0; k < D; k++) {
            /* Scale coefficients */
            A_Q16[ k ] = -silk_RSHIFT_ROUND(Af_QA[ k ], QA - 16);
        }
        /* Subtract energy of preceding samples from C0 */
        if(rshifts > 0) {
            for(s = 0; s < nb_subfr; s++) {
                x_ptr = x + s * subfr_length;
                C0 -= (opus_int32)silk_RSHIFT64(silk_inner_prod16_aligned_64(x_ptr, x_ptr, D, arch), rshifts);
            }
        } else {
            for(s = 0; s < nb_subfr; s++) {
                x_ptr = x + s * subfr_length;
                C0 -= silk_LSHIFT32(silk_inner_prod_aligned(x_ptr, x_ptr, D, arch), -rshifts);
            }
        }
        /* Approximate residual energy */
        *res_nrg = silk_LSHIFT(silk_SMMUL(invGain_Q30, C0), 2);
        *res_nrg_Q = -rshifts;
    } else {
        /* Return residual energy */
        nrg  = CAf[ 0 ];                                                                     /* Q(-rshifts) */
        tmp1 = (opus_int32)1 << 16;                                                          /* Q16 */
        for(k = 0; k < D; k++) {
            Atmp1 = silk_RSHIFT_ROUND(Af_QA[ k ], QA - 16);                                  /* Q16 */
            nrg  = silk_SMLAWW(nrg, CAf[ k + 1 ], Atmp1);                                    /* Q(-rshifts) */
            tmp1 = silk_SMLAWW(tmp1, Atmp1, Atmp1);                                          /* Q16 */
            A_Q16[ k ] = -Atmp1;
        }
        *res_nrg = silk_SMLAWW(nrg, silk_SMMUL(SILK_FIX_CONST(FIND_LPC_COND_FAC, 32), C0), -tmp1); /* Q(-rshifts) */
        *res_nrg_Q = -rshifts;
    }
}
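/* The Burg routine above (and celt_inner_prod_sse4_1 below) relies on
 * OP_CVTEPI16_EPI32_M64 to load four opus_int16 samples and sign-extend
 * them into 32-bit lanes. The macro is defined elsewhere in the Opus tree
 * (celt/x86/x86cpu.h); it expands to roughly the movq-load-plus-sign-extend
 * below, reproduced here under an include guard so the listings in this
 * collection are self-contained. */
#ifndef OP_CVTEPI16_EPI32_M64
#define OP_CVTEPI16_EPI32_M64(x) \
    (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(x))))
#endif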
opus_val32 celt_inner_prod_sse4_1(const opus_val16 *x, const opus_val16 *y,
                                  int N)
{
    opus_int   i, dataSize16;
    opus_int32 sum;
    __m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
    __m128i inVec2_76543210, inVec2_FEDCBA98, acc2;
    __m128i inVec1_3210, inVec2_3210;

    sum = 0;
    dataSize16 = N & ~15;

    acc1 = _mm_setzero_si128();
    acc2 = _mm_setzero_si128();

    for (i = 0; i < dataSize16; i += 16) {
        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));

        inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8]));
        inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8]));

        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
        inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98);

        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
        acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
    }

    acc1 = _mm_add_epi32(acc1, acc2);

    if (N - i >= 8) {
        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));

        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);

        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
        i += 8;
    }

    if (N - i >= 4) {
        inVec1_3210 = OP_CVTEPI16_EPI32_M64(&x[i + 0]);
        inVec2_3210 = OP_CVTEPI16_EPI32_M64(&y[i + 0]);

        inVec1_3210 = _mm_mullo_epi32(inVec1_3210, inVec2_3210);

        acc1 = _mm_add_epi32(acc1, inVec1_3210);
        i += 4;
    }

    acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64(acc1, acc1));
    acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16(acc1, 0x0E));

    sum += _mm_cvtsi128_si32(acc1);

    for (; i < N; i++) {
        sum = silk_SMLABB(sum, x[i], y[i]);
    }

    return sum;
}
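/* For spot-checking celt_inner_prod_sse4_1, a plain scalar reference is
 * handy. This is a minimal sketch assuming the fixed-point build, where
 * opus_val16 is opus_int16 and opus_val32 is opus_int32; the function name
 * celt_inner_prod_ref is ours, not part of Opus. Both _mm_madd_epi16 and
 * silk_SMLABB accumulate 16x16->32-bit products with int32 wraparound, so
 * the two routines should match bit-exactly, including for values of N
 * that exercise the 8-wide, 4-wide, and scalar tail paths. */
static opus_val32 celt_inner_prod_ref(const opus_val16 *x,
                                      const opus_val16 *y, int N)
{
    opus_val32 sum = 0;
    int i;
    for (i = 0; i < N; i++) {
        sum += (opus_val32)x[i] * (opus_val32)y[i];  /* 16x16 -> 32-bit MAC */
    }
    return sum;
}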