int64_t av1_highbd_block_error_sse2(tran_low_t *coeff, tran_low_t *dqcoeff,
                                    intptr_t block_size, int64_t *ssz,
                                    int bps) {
  int i, j, test;
  uint32_t temp[4];
  __m128i max, min, cmp0, cmp1, cmp2, cmp3;
  int64_t error = 0, sqcoeff = 0;
  const int shift = 2 * (bps - 8);
  const int rounding = shift > 0 ? 1 << (shift - 1) : 0;

  for (i = 0; i < block_size; i += 8) {
    // Load the data into xmm registers
    __m128i mm_coeff = _mm_load_si128((__m128i *)(coeff + i));
    __m128i mm_coeff2 = _mm_load_si128((__m128i *)(coeff + i + 4));
    __m128i mm_dqcoeff = _mm_load_si128((__m128i *)(dqcoeff + i));
    __m128i mm_dqcoeff2 = _mm_load_si128((__m128i *)(dqcoeff + i + 4));
    // Check if any values require more than 15 bits
    max = _mm_set1_epi32(0x3fff);
    min = _mm_set1_epi32(0xffffc000);
    cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max),
                         _mm_cmplt_epi32(mm_coeff, min));
    cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max),
                         _mm_cmplt_epi32(mm_coeff2, min));
    cmp2 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff, max),
                         _mm_cmplt_epi32(mm_dqcoeff, min));
    cmp3 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff2, max),
                         _mm_cmplt_epi32(mm_dqcoeff2, min));
    test = _mm_movemask_epi8(
        _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3)));

    if (!test) {
      __m128i mm_diff, error_sse2, sqcoeff_sse2;
      mm_coeff = _mm_packs_epi32(mm_coeff, mm_coeff2);
      mm_dqcoeff = _mm_packs_epi32(mm_dqcoeff, mm_dqcoeff2);
      mm_diff = _mm_sub_epi16(mm_coeff, mm_dqcoeff);
      error_sse2 = _mm_madd_epi16(mm_diff, mm_diff);
      sqcoeff_sse2 = _mm_madd_epi16(mm_coeff, mm_coeff);
      _mm_storeu_si128((__m128i *)temp, error_sse2);
      error = error + temp[0] + temp[1] + temp[2] + temp[3];
      _mm_storeu_si128((__m128i *)temp, sqcoeff_sse2);
      sqcoeff += temp[0] + temp[1] + temp[2] + temp[3];
    } else {
      for (j = 0; j < 8; j++) {
        const int64_t diff = coeff[i + j] - dqcoeff[i + j];
        error += diff * diff;
        sqcoeff += (int64_t)coeff[i + j] * (int64_t)coeff[i + j];
      }
    }
  }
  assert(error >= 0 && sqcoeff >= 0);
  error = (error + rounding) >> shift;
  sqcoeff = (sqcoeff + rounding) >> shift;

  *ssz = sqcoeff;
  return error;
}
int searchSIMDTree(int32_t **tree, int *fanout, int levels, int32_t value) {
    int iLevel = 0;
    int lOffset = 0;
    int pOffset = 0;
    int32_t cmpmask = 0;
    int32_t eqmask = 0;
    __m128i key = _mm_cvtsi32_si128(value);
    key = _mm_shuffle_epi32(key, _MM_SHUFFLE(0, 0, 0, 0));

    while (iLevel < levels) {
        int f = fanout[iLevel];
        pOffset = lOffset;
        lOffset *= f - 1;
        int iter = 0;
        int position = 0;
        while (iter < f / 4) {
            __m128i delimiters = _mm_load_si128(
                (__m128i const *)&tree[iLevel][lOffset + iter * 4]);
            __m128i compare = _mm_cmpgt_epi32(key, delimiters);
            cmpmask = _mm_movemask_ps(_mm_castsi128_ps(compare));
            cmpmask ^= 0x0F;
            if (cmpmask) {
                position = _bit_scan_forward(cmpmask);
                break;
            }
            iter++;
        }
        int offset = lOffset + iter * 4 + position;
        lOffset = offset + pOffset;
        iLevel++;
    }
    return lOffset;
}
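/* Illustrative scalar reference for the probe above -- written for this
 * collection, not taken from the original source. Within a node, the SIMD
 * loop tests four delimiters per _mm_cmpgt_epi32; XOR-ing the 4-bit
 * movemask with 0x0F flips it so a set bit marks a lane where
 * key <= delimiter, and _bit_scan_forward picks the first such slot. */
#include <stdint.h>

static int scalar_node_probe(const int32_t *delimiters, int ndelims,
                             int32_t key) {
    int pos;
    for (pos = 0; pos < ndelims; pos++) {
        if (key <= delimiters[pos]) break;  /* first non-greater delimiter */
    }
    return pos;  /* == ndelims when key is greater than all of them */
}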
static inline __m128i hardlight_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                          const __m128i& sa, const __m128i& da) {
    // if (2 * sc <= sa)
    __m128i tmp1 = _mm_slli_epi32(sc, 1);
    __m128i cmp1 = _mm_cmpgt_epi32(tmp1, sa);
    __m128i rc1 = _mm_mullo_epi16(sc, dc);  // sc * dc;
    rc1 = _mm_slli_epi32(rc1, 1);           // 2 * sc * dc
    rc1 = _mm_andnot_si128(cmp1, rc1);

    // else
    tmp1 = _mm_mullo_epi16(sa, da);
    __m128i tmp2 = Multiply32_SSE2(_mm_sub_epi32(da, dc),
                                   _mm_sub_epi32(sa, sc));
    tmp2 = _mm_slli_epi32(tmp2, 1);
    __m128i rc2 = _mm_sub_epi32(tmp1, tmp2);
    rc2 = _mm_and_si128(cmp1, rc2);

    __m128i rc = _mm_or_si128(rc1, rc2);

    __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da);
    tmp1 = _mm_mullo_epi16(sc, ida);
    __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa);
    tmp2 = _mm_mullo_epi16(dc, isa);
    rc = _mm_add_epi32(rc, tmp1);
    rc = _mm_add_epi32(rc, tmp2);
    return clamp_div255round_SSE2(rc);
}
static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
  int i, j;
  __m128i L = _mm_cvtsi32_si128(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    __m128i pa;
    GetSumAbsDiff32(&T, &TL, &pa);  // pa = sum |T-TL|
    for (j = 0; j < 4; ++j) {
      const __m128i L_lo = _mm_unpacklo_epi32(L, L);
      const __m128i TL_lo = _mm_unpacklo_epi32(TL, L);
      const __m128i pb = _mm_sad_epu8(L_lo, TL_lo);  // pb = sum |L-TL|
      const __m128i mask = _mm_cmpgt_epi32(pb, pa);
      const __m128i A = _mm_and_si128(mask, L);
      const __m128i B = _mm_andnot_si128(mask, T);
      const __m128i pred = _mm_or_si128(A, B);  // pred = (L > T)? L : T
      L = _mm_add_epi8(src, pred);
      out[i + j] = _mm_cvtsi128_si32(L);
      // Shift the pre-computed value for the next iteration.
      T = _mm_srli_si128(T, 4);
      TL = _mm_srli_si128(TL, 4);
      src = _mm_srli_si128(src, 4);
      pa = _mm_srli_si128(pa, 4);
    }
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);
  }
}
void vFindMax(__m128i *pixels, int n) {
    __m128i vIdx, vMax;
    int i;
    vIdx = _mm_setzero_si128();
    vMax = _mm_set1_epi32(INT_MIN);
    for (i = 0; i < n; i++) {
        __m128i v = _mm_load_si128(pixels + i);
        __m128i vCmp = _mm_cmpgt_epi32(v, vMax);  /* lanes where v beats the running max */
        /* max value (note: _mm_max_epi32 requires SSE4.1) */
        vMax = _mm_max_epi32(vMax, v);
        __m128i vNewIdx = _mm_set1_epi32(i);
        /* max index: take the new index where vCmp is set, keep the old one elsewhere */
        __m128 t0 = _mm_and_ps(_mm_castsi128_ps(vNewIdx), _mm_castsi128_ps(vCmp));
        __m128 t1 = _mm_andnot_ps(_mm_castsi128_ps(vCmp), _mm_castsi128_ps(vIdx));
        vIdx = _mm_castps_si128(_mm_or_ps(t0, t1));
    }
    int indices[4];
    int values[4];
    _mm_store_si128((__m128i *)indices, vIdx);
    _mm_store_si128((__m128i *)values, vMax);
    printf("SSE:\n");
    for (i = 0; i < 4; i++) {
        printf("%d: max=%d, idx=%d\n", i, values[i], indices[i]);
    }
}
static inline __m128i saturated_add_SSE2(const __m128i& a, const __m128i& b) {
    __m128i sum = _mm_add_epi32(a, b);
    __m128i cmp = _mm_cmpgt_epi32(sum, _mm_set1_epi32(255));
    sum = _mm_or_si128(_mm_and_si128(cmp, _mm_set1_epi32(255)),
                       _mm_andnot_si128(cmp, sum));
    return sum;
}
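/* A minimal sketch of the SSE2 select idiom that most snippets in this
 * collection rely on: pre-SSE4.1 there is no blendv instruction, so a
 * compare mask of all-ones/all-zeros lanes is combined with its inputs as
 * (mask & a) | (~mask & b). The helper names below are hypothetical,
 * chosen for this illustration only. */
#include <emmintrin.h>

static inline __m128i select_epi32(__m128i mask, __m128i a, __m128i b) {
    /* per 32-bit lane: mask ? a : b */
    return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));
}

/* Example: clamp each signed 32-bit lane to an upper bound. */
static inline __m128i clamp_hi_epi32(__m128i v, __m128i hi) {
    __m128i gt = _mm_cmpgt_epi32(v, hi);  /* lanes where v > hi */
    return select_epi32(gt, hi, v);
}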
__m128i test_mm_cmpgt_epi32(__m128i A, __m128i B) {
  // DAG-LABEL: test_mm_cmpgt_epi32
  // DAG: icmp sgt <4 x i32>
  //
  // ASM-LABEL: test_mm_cmpgt_epi32
  // ASM: pcmpgtd
  return _mm_cmpgt_epi32(A, B);
}
SIMDValue SIMDInt32x4Operation::OpGreaterThan(const SIMDValue& aValue, const SIMDValue& bValue)
{
    X86SIMDValue x86Result;
    X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
    X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);
    // compare a > b?
    x86Result.m128i_value = _mm_cmpgt_epi32(tmpaValue.m128i_value, tmpbValue.m128i_value);

    return X86SIMDValue::ToSIMDValue(x86Result);
}
static inline __m128i clamp_signed_byte_SSE2(const __m128i& n) {
    __m128i cmp1 = _mm_cmplt_epi32(n, _mm_setzero_si128());
    __m128i cmp2 = _mm_cmpgt_epi32(n, _mm_set1_epi32(255));
    __m128i ret = _mm_and_si128(cmp2, _mm_set1_epi32(255));

    __m128i cmp = _mm_or_si128(cmp1, cmp2);
    ret = _mm_or_si128(_mm_and_si128(cmp, ret), _mm_andnot_si128(cmp, n));

    return ret;
}
__m64 _m_pcmpgtd(__m64 _MM1, __m64 _MM2)
{
    __m128i lhs = {0}, rhs = {0};

    lhs.m128i_i64[0] = _MM1.m64_i64;
    rhs.m128i_i64[0] = _MM2.m64_i64;

    lhs = _mm_cmpgt_epi32(lhs, rhs);

    _MM1.m64_i64 = lhs.m128i_i64[0];
    return _MM1;
}
/* resample code ---------------------------------------------------------------
 * resample code
 * args   : short  *code     I   code
 *          int    len       I   code length (len < 2^(31-FPBIT))
 *          double coff      I   initial code offset (chip)
 *          int    smax      I   maximum correlator space (sample)
 *          double ci        I   code sampling interval (chip)
 *          int    n         I   number of samples
 *          short  *rcode    O   resampling code
 * return : double               code remainder
 *-----------------------------------------------------------------------------*/
extern double rescode(const short *code, int len, double coff, int smax,
                      double ci, int n, short *rcode)
{
    short *p;

#if !defined(SSE2_ENABLE)
    coff-=smax*ci;
    coff-=floor(coff/len)*len; /* 0<=coff<len */
    for (p=rcode;p<rcode+n+2*smax;p++,coff+=ci) {
        if (coff>=len) coff-=len;
        *p=code[(int)coff];
    }
    return coff-smax*ci;
#else
    int i,index[4],x[4],nbit,scale;
    __m128i xmm1,xmm2,xmm3,xmm4,xmm5;

    coff-=smax*ci;
    coff-=floor(coff/len)*len; /* 0<=coff<len */
    for (i=len,nbit=31;i;i>>=1,nbit--) ;
    nbit-=1;
    scale=1<<nbit; /* scale factor */

    for (i=0;i<4;i++,coff+=ci) {
        x[i]=(int)(coff*scale+0.5);
    }
    xmm1=_mm_loadu_si128((__m128i *)x);
    xmm2=_mm_set1_epi32(len*scale-1);
    xmm3=_mm_set1_epi32(len*scale);
    xmm4=_mm_set1_epi32((int)(ci*4*scale+0.5));

    for (p=rcode;p<rcode+n+2*smax;p+=4) {
        xmm5=_mm_cmpgt_epi32(xmm1,xmm2); /* wrap any phase that ran past len */
        xmm5=_mm_and_si128(xmm5,xmm3);
        xmm1=_mm_sub_epi32(xmm1,xmm5);
        xmm5=_mm_srai_epi32(xmm1,nbit);
        _mm_storeu_si128((__m128i *)index,xmm5);
        p[0]=code[index[0]];
        p[1]=code[index[1]];
        p[2]=code[index[2]];
        p[3]=code[index[3]];
        xmm1=_mm_add_epi32(xmm1,xmm4);
    }
    coff+=ci*(n+2*smax)-4*ci;
    coff-=floor(coff/len)*len;
    return coff-smax*ci;
#endif
}
/* constant-time doubling in GF(2^128) */
static __m128i gf128_mul2(const __m128i x)
{
    const __m128i REDPOLY = _mm_set_epi64x(0, 0x87);
    const __m128i ZERO = _mm_setzero_si128();
    __m128i x2;

    /* spread the sign of the top 32-bit lane (the MSB of the 128-bit
       value) across the whole register */
    __m128i mask = _mm_cmpgt_epi32(ZERO, x);
    mask = _mm_shuffle_epi32(mask, 0xff);

    /* shift the full 128-bit value left by one bit */
    x2 = _mm_or_si128(_mm_slli_epi64(x, 1),
                      _mm_srli_epi64(_mm_slli_si128(x, 8), 63));

    /* reduce by x^128 + x^7 + x^2 + x + 1 iff the MSB was set */
    return _mm_xor_si128(x2, _mm_and_si128(REDPOLY, mask));
}
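/* Hypothetical usage sketch for gf128_mul2() above (not from the original
 * source; assumes it is compiled in the same translation unit): repeated
 * doubling is the tweak-update step of XEX/OCB-style modes. A block with
 * its top bit set triggers the reduction by x^128 + x^7 + x^2 + x + 1,
 * i.e. the 0x87 constant. */
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    /* top bit set, so doubling must fold in REDPOLY: (1 << 1) ^ 0x87 = 0x85 */
    __m128i L = _mm_set_epi64x((long long)0x8000000000000000ULL, 1);
    __m128i L2 = gf128_mul2(L);
    uint64_t w[2];
    _mm_storeu_si128((__m128i *)w, L2);
    printf("2L = %016llx%016llx\n",
           (unsigned long long)w[1], (unsigned long long)w[0]);
    return 0;
}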
static inline __m128i lighten_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                        const __m128i& sa, const __m128i& da) {
    __m128i sd = _mm_mullo_epi16(sc, da);
    __m128i ds = _mm_mullo_epi16(dc, sa);

    __m128i cmp = _mm_cmpgt_epi32(sd, ds);

    __m128i tmp = _mm_add_epi32(sc, dc);
    __m128i ret1 = _mm_sub_epi32(tmp, SkDiv255Round_SSE2(ds));
    __m128i ret2 = _mm_sub_epi32(tmp, SkDiv255Round_SSE2(sd));
    __m128i ret = _mm_or_si128(_mm_and_si128(cmp, ret1),
                               _mm_andnot_si128(cmp, ret2));
    return ret;
}
static inline __m128i clamp_div255round_SSE2(const __m128i& prod) {
    // test if > 0
    __m128i cmp1 = _mm_cmpgt_epi32(prod, _mm_setzero_si128());
    // test if < 255*255
    __m128i cmp2 = _mm_cmplt_epi32(prod, _mm_set1_epi32(255*255));

    __m128i ret = _mm_setzero_si128();

    // if value >= 255*255, value = 255
    ret = _mm_andnot_si128(cmp2, _mm_set1_epi32(255));

    __m128i div = SkDiv255Round_SSE2(prod);

    // test if > 0 && < 255*255
    __m128i cmp = _mm_and_si128(cmp1, cmp2);
    ret = _mm_or_si128(_mm_and_si128(cmp, div), _mm_andnot_si128(cmp, ret));

    return ret;
}
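/* Scalar reference for the rounding divide used above -- an assumption
 * about SkDiv255Round_SSE2's behavior based on Skia's scalar
 * SkDiv255Round: for 0 <= x <= 255*255, (x + 128 + ((x + 128) >> 8)) >> 8
 * equals x/255 rounded to nearest, with no actual division. */
static inline int div255_round(int x) {
    int t = x + 128;
    return (t + (t >> 8)) >> 8;
}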
// Portable version overlay_byte() is in SkXfermode.cpp.
static inline __m128i overlay_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                        const __m128i& sa, const __m128i& da) {
    __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da);
    __m128i tmp1 = _mm_mullo_epi16(sc, ida);
    __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa);
    __m128i tmp2 = _mm_mullo_epi16(dc, isa);
    __m128i tmp = _mm_add_epi32(tmp1, tmp2);

    __m128i cmp = _mm_cmpgt_epi32(_mm_slli_epi32(dc, 1), da);

    __m128i rc1 = _mm_slli_epi32(sc, 1);                      // 2 * sc
    rc1 = Multiply32_SSE2(rc1, dc);                           // * dc

    __m128i rc2 = _mm_mullo_epi16(sa, da);                    // sa * da
    __m128i tmp3 = _mm_slli_epi32(_mm_sub_epi32(da, dc), 1);  // 2 * (da - dc)
    tmp3 = Multiply32_SSE2(tmp3, _mm_sub_epi32(sa, sc));      // * (sa - sc)
    rc2 = _mm_sub_epi32(rc2, tmp3);

    __m128i rc = _mm_or_si128(_mm_andnot_si128(cmp, rc1),
                              _mm_and_si128(cmp, rc2));
    return clamp_div255round_SSE2(_mm_add_epi32(rc, tmp));
}
static void PredictorSub11_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
    const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    __m128i pa, pb;
    GetSumAbsDiff32_SSE2(&T, &TL, &pa);  // pa = sum |T-TL|
    GetSumAbsDiff32_SSE2(&L, &TL, &pb);  // pb = sum |L-TL|
    {
      const __m128i mask = _mm_cmpgt_epi32(pb, pa);
      const __m128i A = _mm_and_si128(mask, L);
      const __m128i B = _mm_andnot_si128(mask, T);
      const __m128i pred = _mm_or_si128(A, B);  // pred = (L > T)? L : T
      const __m128i res = _mm_sub_epi8(src, pred);
      _mm_storeu_si128((__m128i*)&out[i], res);
    }
  }
  if (i != num_pixels) {
    VP8LPredictorsSub_C[11](in + i, upper + i, num_pixels - i, out + i);
  }
}
inline FORCE_INLINE __m128i mm_cvtps_ph(__m128 x)
{
    __m128 magic = _mm_castsi128_ps(_mm_set1_epi32((uint32_t)15 << 23));
    __m128i inf = _mm_set1_epi32((uint32_t)255UL << 23);
    __m128i f16inf = _mm_set1_epi32((uint32_t)31UL << 23);
    __m128i sign_mask = _mm_set1_epi32(0x80000000UL);
    __m128i round_mask = _mm_set1_epi32(~0x0FFFU);
    __m128i ret_0x7E00 = _mm_set1_epi32(0x7E00);
    __m128i ret_0x7C00 = _mm_set1_epi32(0x7C00);

    __m128i f, sign, ge_inf, eq_inf;

    f = _mm_castps_si128(x);
    sign = _mm_and_si128(f, sign_mask);
    f = _mm_xor_si128(f, sign);

    ge_inf = _mm_cmpgt_epi32(f, inf);
    eq_inf = _mm_cmpeq_epi32(f, inf);

    f = _mm_and_si128(f, round_mask);
    f = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(f), magic));
    f = _mm_sub_epi32(f, round_mask);

    f = mm_min_epi32(f, f16inf);
    f = _mm_srli_epi32(f, 13);

    f = mm_blendv_ps(ret_0x7E00, f, ge_inf);
    f = mm_blendv_ps(ret_0x7C00, f, eq_inf);

    sign = _mm_srli_epi32(sign, 16);
    f = _mm_or_si128(f, sign);
    f = mm_packus_epi32(f, _mm_setzero_si128());
    return f;
}
static void GF_FUNC_ALIGN VS_CC
proc_16bit_sse2(convolution_t *ch, uint8_t *buff, int bstride, int width,
                int height, int stride, uint8_t *d, const uint8_t *s)
{
    const uint16_t *srcp = (uint16_t *)s;
    uint16_t *dstp = (uint16_t *)d;
    stride /= 2;
    bstride /= 2;

    uint16_t *p0 = (uint16_t *)buff + 8;
    uint16_t *p1 = p0 + bstride;
    uint16_t *p2 = p1 + bstride;
    uint16_t *p3 = p2 + bstride;
    uint16_t *p4 = p3 + bstride;
    uint16_t *orig = p0, *end = p4;

    line_copy16(p0, srcp + 2 * stride, width, 2);
    line_copy16(p1, srcp + stride, width, 2);
    line_copy16(p2, srcp, width, 2);
    srcp += stride;
    line_copy16(p3, srcp, width, 2);

    __m128i zero = _mm_setzero_si128();
    __m128 rdiv = _mm_set1_ps((float)ch->rdiv);
    __m128 bias = _mm_set1_ps((float)ch->bias);
    __m128i max = _mm_set1_epi32(0xFFFF);
    __m128 matrix[25];
    for (int i = 0; i < 25; i++) {
        matrix[i] = _mm_set1_ps((float)ch->m[i]);
    }

    for (int y = 0; y < height; y++) {
        srcp += stride * (y < height - 2 ? 1 : -1);
        line_copy16(p4, srcp, width, 2);
        uint16_t *array[] = {
            p0 - 2, p0 - 1, p0, p0 + 1, p0 + 2,
            p1 - 2, p1 - 1, p1, p1 + 1, p1 + 2,
            p2 - 2, p2 - 1, p2, p2 + 1, p2 + 2,
            p3 - 2, p3 - 1, p3, p3 + 1, p3 + 2,
            p4 - 2, p4 - 1, p4, p4 + 1, p4 + 2
        };

        for (int x = 0; x < width; x += 8) {
            __m128 sum[2] = { _mm_setzero_ps(), _mm_setzero_ps() };

            for (int i = 0; i < 25; i++) {
                __m128i xmm0 = _mm_loadu_si128((__m128i *)(array[i] + x));
                __m128 xmm1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(xmm0, zero));
                __m128 xmm2 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(xmm0, zero));
                xmm1 = _mm_mul_ps(xmm1, matrix[i]);
                xmm2 = _mm_mul_ps(xmm2, matrix[i]);
                sum[0] = _mm_add_ps(sum[0], xmm1);
                sum[1] = _mm_add_ps(sum[1], xmm2);
            }

            __m128i sumi[2];
            for (int i = 0; i < 2; i++) {
                sum[i] = _mm_mul_ps(sum[i], rdiv);
                sum[i] = _mm_add_ps(sum[i], bias);
                if (!ch->saturate) {
                    sum[i] = mm_abs_ps(sum[i]);
                }
                sumi[i] = _mm_cvtps_epi32(sum[i]);
                sumi[i] = mm_min_epi32(sumi[i], max);
                __m128i mask = _mm_cmpgt_epi32(sumi[i], zero);
                sumi[i] = _mm_and_si128(sumi[i], mask);
            }
            sumi[0] = mm_cast_epi32(sumi[0], sumi[1]);
            _mm_store_si128((__m128i *)(dstp + x), sumi[0]);
        }
        dstp += stride;
        p0 = p1;
        p1 = p2;
        p2 = p3;
        p3 = p4;
        p4 = (p4 == end) ? orig : p4 + bstride;
    }
}
static inline __m128i _mm_max_epi32_rpl(__m128i a, __m128i b) {
    __m128i mask = _mm_cmpgt_epi32(a, b);
    a = _mm_and_si128(a, mask);
    b = _mm_andnot_si128(mask, b);
    return _mm_or_si128(a, b);
}
RETi CMPGT(const __m128i x, const __m128i y) { return _mm_cmpgt_epi32(x, y); }
mlib_status
__mlib_VectorSumAbsDiff_S32_Sat(
    mlib_d64 *z,
    const mlib_s32 *x,
    const mlib_s32 *y,
    mlib_s32 n)
{
    if (n <= 0)
        return (MLIB_FAILURE);

    mlib_s32 i, nstep, ax, ay, n1, n2, n3;
    mlib_s32 *px = (mlib_s32 *)x, *py = (mlib_s32 *)y;
    __m128i zero, xbuf, ybuf, zbuf, xlo, xhi, mext;
    mlib_d64 dsum = 0.0;

    zero = _mm_setzero_si128();
    zbuf = zero;

    nstep = 16 / sizeof (mlib_s32);
    ax = (mlib_addr)x & 15;
    ay = (mlib_addr)y & 15;
    n1 = ((16 - ax) & 15) / sizeof (mlib_s32);
    n2 = (n - n1) / nstep;
    n3 = n - n1 - n2 * nstep;

    if (n2 < 1) {
        for (i = 0; i < n; i++) {
            dsum += mlib_fabs((mlib_d64)(*px++) - (*py++));
        }
        *z = dsum;
    } else {
        for (i = 0; i < n1; i++) {
            dsum += mlib_fabs((mlib_d64)(*px++) - (*py++));
        }
        if (ax == ay) {
            for (i = 0; i < n2; i++) {
                xbuf = _mm_load_si128((__m128i *)px);
                ybuf = _mm_load_si128((__m128i *)py);
                mext = _mm_cmpgt_epi32(ybuf, xbuf);
                xbuf = _mm_sub_epi32(xbuf, ybuf);
                xbuf = _mm_xor_si128(xbuf, mext);
                xbuf = _mm_sub_epi32(xbuf, mext);
                xlo = _mm_unpacklo_epi32(xbuf, zero);
                xhi = _mm_unpackhi_epi32(xbuf, zero);
                zbuf = _mm_add_epi64(zbuf, xlo);
                zbuf = _mm_add_epi64(zbuf, xhi);
                px += nstep;
                py += nstep;
            }
        } else {
            for (i = 0; i < n2; i++) {
                xbuf = _mm_load_si128((__m128i *)px);
                ybuf = _mm_loadu_si128((__m128i *)py);
                mext = _mm_cmpgt_epi32(ybuf, xbuf);
                xbuf = _mm_sub_epi32(xbuf, ybuf);
                xbuf = _mm_xor_si128(xbuf, mext);
                xbuf = _mm_sub_epi32(xbuf, mext);
                xlo = _mm_unpacklo_epi32(xbuf, zero);
                xhi = _mm_unpackhi_epi32(xbuf, zero);
                zbuf = _mm_add_epi64(zbuf, xlo);
                zbuf = _mm_add_epi64(zbuf, xhi);
                px += nstep;
                py += nstep;
            }
        }
        for (i = 0; i < n3; i++) {
            dsum += mlib_fabs((mlib_d64)(*px++) - (*py++));
        }

        long long pz[2];
        _mm_storeu_si128((__m128i *)pz, zbuf);
        dsum += pz[0];
        dsum += pz[1];
        *z = dsum;
    }
    return (MLIB_SUCCESS);
}
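/* Scalar reference for the branch-free |x - y| used in the vector loops
 * above (written for this collection; like the SIMD code it assumes
 * |x - y| fits in 32 bits). m = -(y > x) is all-ones exactly when x - y
 * is negative, and (d ^ m) - m is two's-complement negation: XOR with
 * all-ones gives ~d, and subtracting -1 adds 1. */
#include <stdint.h>

static inline int32_t abs_diff_s32(int32_t x, int32_t y) {
    int32_t d = (int32_t)((uint32_t)x - (uint32_t)y); /* wraps, as the SIMD lanes do */
    int32_t m = -(y > x);                             /* 0 or -1 (all-ones) */
    return (d ^ m) - m;
}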
void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
                                int skip_block, const int16_t *zbin_ptr,
                                const int16_t *round_ptr,
                                const int16_t *quant_ptr,
                                const int16_t *quant_shift_ptr,
                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
                                const int16_t *scan, const int16_t *iscan) {
  int i, j, non_zero_regs = (int)count / 4, eob_i = -1;
  __m128i zbins[2];
  __m128i nzbins[2];

  zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1],
                           (int)zbin_ptr[0]);
  zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]);

  nzbins[0] = _mm_setzero_si128();
  nzbins[1] = _mm_setzero_si128();
  nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
  nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);

  (void)scan;

  memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));

  if (!skip_block) {
    // Pre-scan pass
    for (i = ((int)count / 4) - 1; i >= 0; i--) {
      __m128i coeffs, cmp1, cmp2;
      int test;
      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
      cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
      cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
      cmp1 = _mm_and_si128(cmp1, cmp2);
      test = _mm_movemask_epi8(cmp1);
      if (test == 0xffff)
        non_zero_regs--;
      else
        break;
    }

    // Quantization pass:
    for (i = 0; i < non_zero_regs; i++) {
      __m128i coeffs, coeffs_sign, tmp1, tmp2;
      int test;
      int abs_coeff[4];
      int coeff_sign[4];

      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
      coeffs_sign = _mm_srai_epi32(coeffs, 31);
      coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
      tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
      tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
      tmp1 = _mm_or_si128(tmp1, tmp2);
      test = _mm_movemask_epi8(tmp1);
      _mm_storeu_si128((__m128i *)abs_coeff, coeffs);
      _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);

      for (j = 0; j < 4; j++) {
        if (test & (1 << (4 * j))) {
          int k = 4 * i + j;
          const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0];
          const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;
          const uint32_t abs_qcoeff =
              (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
          qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
          dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
          if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
        }
      }
    }
  }
  *eob_ptr = eob_i + 1;
}
static inline __m128i softlight_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                          const __m128i& sa, const __m128i& da) {
    __m128i tmp1, tmp2, tmp3;

    // int m = da ? dc * 256 / da : 0;
    __m128i cmp = _mm_cmpeq_epi32(da, _mm_setzero_si128());
    __m128i m = _mm_slli_epi32(dc, 8);
    __m128 x = _mm_cvtepi32_ps(m);
    __m128 y = _mm_cvtepi32_ps(da);
    m = _mm_cvttps_epi32(_mm_div_ps(x, y));
    m = _mm_andnot_si128(cmp, m);

    // if (2 * sc <= sa)
    tmp1 = _mm_slli_epi32(sc, 1);                       // 2 * sc
    __m128i cmp1 = _mm_cmpgt_epi32(tmp1, sa);
    tmp1 = _mm_sub_epi32(tmp1, sa);                     // 2 * sc - sa
    tmp2 = _mm_sub_epi32(_mm_set1_epi32(256), m);       // 256 - m
    tmp1 = Multiply32_SSE2(tmp1, tmp2);
    tmp1 = _mm_srai_epi32(tmp1, 8);
    tmp1 = _mm_add_epi32(sa, tmp1);
    tmp1 = Multiply32_SSE2(dc, tmp1);
    __m128i rc1 = _mm_andnot_si128(cmp1, tmp1);

    // else if (4 * dc <= da)
    tmp2 = _mm_slli_epi32(dc, 2);                       // dc * 4
    __m128i cmp2 = _mm_cmpgt_epi32(tmp2, da);
    __m128i i = _mm_slli_epi32(m, 2);                   // 4 * m
    __m128i j = _mm_add_epi32(i, _mm_set1_epi32(256));  // 4 * m + 256
    __m128i k = Multiply32_SSE2(i, j);                  // 4 * m * (4 * m + 256)
    __m128i t = _mm_sub_epi32(m, _mm_set1_epi32(256));  // m - 256
    i = Multiply32_SSE2(k, t);                          // 4 * m * (4 * m + 256) * (m - 256)
    i = _mm_srai_epi32(i, 16);                          // >> 16
    j = Multiply32_SSE2(_mm_set1_epi32(7), m);          // 7 * m
    tmp2 = _mm_add_epi32(i, j);

    i = Multiply32_SSE2(dc, sa);                        // dc * sa
    j = _mm_slli_epi32(sc, 1);                          // 2 * sc
    j = _mm_sub_epi32(j, sa);                           // 2 * sc - sa
    j = Multiply32_SSE2(da, j);                         // da * (2 * sc - sa)
    tmp2 = Multiply32_SSE2(j, tmp2);                    // * tmp
    tmp2 = _mm_srai_epi32(tmp2, 8);                     // >> 8
    tmp2 = _mm_add_epi32(i, tmp2);
    cmp = _mm_andnot_si128(cmp2, cmp1);
    __m128i rc2 = _mm_and_si128(cmp, tmp2);
    __m128i rc = _mm_or_si128(rc1, rc2);

    // else
    tmp3 = sqrt_unit_byte_SSE2(m);
    tmp3 = _mm_sub_epi32(tmp3, m);
    tmp3 = Multiply32_SSE2(j, tmp3);                    // j = da * (2 * sc - sa)
    tmp3 = _mm_srai_epi32(tmp3, 8);
    tmp3 = _mm_add_epi32(i, tmp3);                      // i = dc * sa
    cmp = _mm_and_si128(cmp1, cmp2);
    __m128i rc3 = _mm_and_si128(cmp, tmp3);
    rc = _mm_or_si128(rc, rc3);

    tmp1 = _mm_sub_epi32(_mm_set1_epi32(255), da);      // 255 - da
    tmp1 = _mm_mullo_epi16(sc, tmp1);
    tmp2 = _mm_sub_epi32(_mm_set1_epi32(255), sa);      // 255 - sa
    tmp2 = _mm_mullo_epi16(dc, tmp2);
    rc = _mm_add_epi32(rc, tmp1);
    rc = _mm_add_epi32(rc, tmp2);
    return clamp_div255round_SSE2(rc);
}
int sse_auction_search(int *pr, int *P, int *ai0, int *ai1, int *a0, int *a1,
                       int nodes, int arcs, int s, int t)
{
    int i __attribute__ ((aligned (16))) = 0;
    int j __attribute__ ((aligned (16))) = t;
    int k __attribute__ ((aligned (16))) = 0;
    int m __attribute__ ((aligned (16))) = 0;
    int maxla __attribute__ ((aligned (32))) = 0;
    int argmaxla __attribute__ ((aligned (16))) = 0;
    int cost __attribute__ ((aligned (16))) = 0;
    int length __attribute__ ((aligned (16))) = 1;
    int path_cost __attribute__ ((aligned (16))) = 0;
    uint32_t tmp1, tmp2;
    int cost_tab[nodes+1];
    __m128i a0sse, a1sse, ai0sse, ai1sse, ai1sse1, I, J, K, M, then;
    __m128i ARCS, MNODES, INFINITE, NEGINF, prsse, Psse, MAXLA, ARGMAXLA, LA,
            mask1, mask2, mask3, COST;

    for (i = 0; i <= nodes; i++) {
        cost_tab[i] = 0;
    }

    if (check_s_t(s, t, P, nodes) != 0) {
        return 1;
    }

    while (P[s] == INF) {
        k = -1;
        m = -1;

        J = _mm_set1_epi32(j);            // current value of j
        K = _mm_set1_epi32(-1);           // start index into the edge-cost table
        M = _mm_set1_epi32(-1);           // end index into the edge-cost table
        MNODES = _mm_set1_epi32(nodes-1); // node count minus 1 (end-of-table check)
        ARCS = _mm_set1_epi32(arcs);      // number of edges

        /* compute k, m */
        for (i = 0; i < nodes; i += 4) {
            ai0sse = _mm_load_si128((__m128i*) &ai0[i]); // load ai0 (node numbers)
            ai1sse = _mm_load_si128((__m128i*) &ai1[i]); // load ai1 (indices into the edge table)
            ai1sse1 = _mm_set_epi32(ai1[i+4], ai1[i+3], ai1[i+2], ai1[i+1]); // ai1 indices shifted by 1
            mask1 = _mm_cmpeq_epi32(J, ai0sse);          // test j == ai0[i]
            K = _mm_or_si128(_mm_and_si128(mask1, ai1sse),
                             _mm_andnot_si128(mask1, K)); // set K
            I = _mm_set_epi32(i+3, i+2, i+1, i);          // current values of i
            mask2 = _mm_cmplt_epi32(I, MNODES);           // test i < nodes-1
            mask3 = _mm_and_si128(mask1, mask2);          // conjunction of tests 1 and 2
            then = _mm_or_si128(_mm_and_si128(mask2, ai1sse1),
                                _mm_andnot_si128(mask2, ARCS)); // m = ai1[i+1] or arcs
            M = _mm_or_si128(_mm_and_si128(mask3, then),
                             _mm_andnot_si128(mask3, M)); // set M
        }

        for (i = 0; i < nodes; i++) {
            if (ai0[i] == j) {
                k = ai1[i]; // k - start index of the edges leaving j
                if (i < nodes - 1) {
                    m = ai1[i+1];
                } else {
                    m = arcs;
                }
            }
        }

        /* store k, m */
        for (i = 0; i < 4; i++) {
            tmp1 = get_from_m128i(K, i);
            tmp2 = get_from_m128i(M, i);
            if (tmp1 != -1) {
                k = tmp1;
            }
            if (tmp2 != -1) {
                m = tmp2;
            }
        }

        /* pick the optimal edge */
        if (k != -1) {
            INFINITE = _mm_set1_epi32(INF);  // "infinite" value
            NEGINF = _mm_set1_epi32(0-INF);  // the value -INF
            COST = _mm_set1_epi32(cost);     // cost of the chosen edge
            MAXLA = _mm_set1_epi32(0-INF);   // maximum value of la = pr[a0[i]] - a1[i]
            ARGMAXLA = _mm_set1_epi32(-1);   // index with the largest la

            for (i = k; i < m; i += 4) {
                a1sse = _mm_set_epi32(a1[i], a1[i+1], a1[i+2], a1[i+3]); // load a1
                a0sse = _mm_set_epi32(a0[i], a0[i+1], a0[i+2], a0[i+3]); // load a0
                prsse = _mm_set_epi32(pr[a0[i]], pr[a0[i+1]],
                                      pr[a0[i+2]], pr[a0[i+3]]);         // load pr
                Psse = _mm_set_epi32(P[a0[i]], P[a0[i+1]],
                                     P[a0[i+2]], P[a0[i+3]]);            // load P
                mask1 = _mm_cmpgt_epi32(_mm_set1_epi32(m),
                                        _mm_set_epi32(i, i+1, i+2, i+3)); // last pass?
                prsse = _mm_or_si128(_mm_and_si128(mask1, prsse),
                                     _mm_andnot_si128(mask1, NEGINF)); // mask out other nodes' arcs
                LA = _mm_sub_epi32(prsse, a1sse); // la = pr[a0[i]] - a1[i]
                then = _mm_max_epi32(LA, MAXLA);  // maximum of la and maxla
                mask1 = _mm_cmpeq_epi32(Psse, INFINITE); // is P[i] == INF
                mask2 = _mm_and_si128(mask1, _mm_cmpgt_epi32(LA, MAXLA)); // is P[i] == INF and LA > MAXLA
                MAXLA = _mm_or_si128(_mm_and_si128(mask1, then),
                                     _mm_andnot_si128(mask1, MAXLA));  // update maxla
                ARGMAXLA = _mm_or_si128(_mm_and_si128(mask2, a0sse),
                                        _mm_andnot_si128(mask2, ARGMAXLA)); // update argmaxla
                COST = _mm_or_si128(_mm_and_si128(mask2, a1sse),
                                    _mm_andnot_si128(mask2, COST));    // update cost
            }
        }

        /* store maxla, argmaxla, cost */
        maxla = 0 - INF;
        for (i = 0; i < 4; i++) {
            tmp1 = get_from_m128i(MAXLA, i);
            if (tmp1 > maxla) {
                argmaxla = get_from_m128i(ARGMAXLA, i);
                maxla = tmp1;
                cost = get_from_m128i(COST, i);
            }
        }

        /* shorten the path */
        if (pr[j] > maxla || maxla == -INF) {
            /* update the price */
            pr[j] = maxla;
            /* a single-element path is not shortened */
            if (j != t) {
                /* update the path */
                P[j] = INF;
                length = length - 1;
                path_cost = path_cost - cost_tab[length];
                cost_tab[length] = 0;
                /* go back to the previous vertex in the path (j); k is cut off */
                k = j;
                for (i = 0; i < nodes; i++) {
                    if (P[i] == length - 1) {
                        j = i;
                        break;
                    }
                }
            }
        }
        /* extend the path */
        else {
            P[argmaxla] = length;
            j = argmaxla;
            path_cost = path_cost + cost;
            cost_tab[length] = cost;
            length = length + 1;
            /* the path reached the start vertex => done */
            if (argmaxla == s) {
                printf("path length: %d\n", path_cost);
                return 0;
            }
        }
    }
    return 0;
}
static void GF_FUNC_ALIGN VS_CC
proc_16bit_sse2(convolution_hv_t *ch, uint8_t *buff, int bstride, int width,
                int height, int stride, uint8_t *d, const uint8_t *s)
{
    const uint16_t *srcp = (uint16_t *)s;
    uint16_t *dstp = (uint16_t *)d;
    stride /= 2;
    bstride /= 2;

    uint16_t *p0 = (uint16_t *)buff + 8;
    uint16_t *p1 = p0 + bstride;
    uint16_t *p2 = p1 + bstride;
    uint16_t *p3 = p2 + bstride;
    uint16_t *p4 = p3 + bstride;
    uint16_t *orig = p0, *end = p4;

    line_copy16(p0, srcp + 2 * stride, width, 2);
    line_copy16(p1, srcp + stride, width, 2);
    line_copy16(p2, srcp, width, 2);
    srcp += stride;
    line_copy16(p3, srcp, width, 2);

    __m128i zero = _mm_setzero_si128();
    __m128i all1 = _mm_cmpeq_epi32(zero, zero);
    __m128i one = _mm_srli_epi32(all1, 31);
    __m128 rdiv_h = _mm_set1_ps((float)ch->rdiv_h);
    __m128 rdiv_v = _mm_set1_ps((float)ch->rdiv_v);
    __m128 bias = _mm_set1_ps((float)ch->bias);

    __m128i matrix_h[5];
    __m128i matrix_v[5];
    int sign_h[5];
    int sign_v[5];
    for (int i = 0; i < 5; i++) {
        sign_h[i] = ch->m_h[i] < 0 ? 1 : 0;
        sign_v[i] = ch->m_v[i] < 0 ? 1 : 0;
        uint16_t val = sign_h[i] ? (uint16_t)(ch->m_h[i] * -1) : (uint16_t)ch->m_h[i];
        matrix_h[i] = _mm_set1_epi16((int16_t)val);
        val = sign_v[i] ? (uint16_t)(ch->m_v[i] * -1) : (uint16_t)ch->m_v[i];
        matrix_v[i] = _mm_set1_epi16((int16_t)val);
    }

    for (int y = 0; y < height; y++) {
        srcp += stride * (y < height - 2 ? 1 : -1);
        line_copy16(p4, srcp, width, 2);

        for (int x = 0; x < width; x += 8) {
            uint16_t *array[] = {
                p0 + x, p1 + x, p2 + x, p3 + x, p4 + x,
                p2 + x - 2, p2 + x - 1, dstp + x, p2 + x + 1, p2 + x + 2
            };

            for (int j = 0; j < 2; j++) {
                __m128i *matrix = j == 0 ? matrix_v : matrix_h;
                int *sign = j == 0 ? sign_v : sign_h;
                __m128 rdiv = j == 0 ? rdiv_v : rdiv_h;
                __m128i sum[2];
                sum[0] = _mm_setzero_si128();
                sum[1] = _mm_setzero_si128();

                for (int i = 0; i < 5; i++) {
                    __m128i xmm0, xmm1, xmm2;

                    xmm0 = _mm_loadu_si128((__m128i *)array[i + j * 5]);
                    xmm1 = _mm_mullo_epi16(xmm0, matrix[i]);
                    xmm0 = _mm_mulhi_epu16(xmm0, matrix[i]);
                    xmm2 = _mm_unpacklo_epi16(xmm1, xmm0);
                    xmm0 = _mm_unpackhi_epi16(xmm1, xmm0);

                    if (sign[i]) {
                        xmm2 = _mm_add_epi32(one, _mm_xor_si128(xmm2, all1));
                        xmm0 = _mm_add_epi32(one, _mm_xor_si128(xmm0, all1));
                    }
                    sum[0] = _mm_add_epi32(sum[0], xmm2);
                    sum[1] = _mm_add_epi32(sum[1], xmm0);
                }

                for (int i = 0; i < 2; i++) {
                    __m128 sumfp;
                    __m128i mask, temp;
                    sumfp = _mm_cvtepi32_ps(sum[i]);
                    sumfp = _mm_mul_ps(sumfp, rdiv);
                    if (j == 1) {
                        sumfp = _mm_add_ps(sumfp, bias);
                    }
                    sum[i] = _mm_cvttps_epi32(sumfp);

                    temp = _mm_srli_epi32(all1, 16);
                    mask = _mm_cmplt_epi32(sum[i], temp);
                    sum[i] = _mm_or_si128(_mm_and_si128(sum[i], mask),
                                          _mm_andnot_si128(mask, temp));

                    mask = _mm_cmpgt_epi32(sum[i], zero);
                    if (ch->saturate) {
                        sum[i] = _mm_and_si128(mask, sum[i]);
                    } else {
                        temp = _mm_add_epi32(one, _mm_xor_si128(sum[i], all1));
                        sum[i] = _mm_or_si128(_mm_and_si128(mask, sum[i]),
                                              _mm_andnot_si128(mask, temp));
                    }
                }
                sum[0] = mm_cast_epi32(sum[0], sum[1]);
                _mm_store_si128((__m128i *)(dstp + x), sum[0]);
            }
        }
        dstp += stride;
        p0 = p1;
        p1 = p2;
        p2 = p3;
        p3 = p4;
        p4 = (p4 == end) ? orig : p4 + bstride;
    }
}
void spu_interpreter::CG(SPUThread& CPU, spu_opcode_t op)
{
    const auto a = _mm_xor_si128(CPU.GPR[op.ra].vi, _mm_set1_epi32(0x7fffffff));
    const auto b = _mm_xor_si128(CPU.GPR[op.rb].vi, _mm_set1_epi32(0x80000000));
    CPU.GPR[op.rt].vi = _mm_srli_epi32(_mm_cmpgt_epi32(b, a), 31);
}
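/* Scalar reference for the carry trick above (an illustration, not part
 * of the emulator source): SSE2 has no unsigned 32-bit compare, so CG
 * rewrites carry(a, b) = (b > ~a) unsigned as a signed compare by
 * flipping sign bits on both sides; note ~a ^ 0x80000000 == a ^ 0x7fffffff. */
#include <stdint.h>

static inline uint32_t carry_out_u32(uint32_t a, uint32_t b) {
    /* carry-out of a + b, computed the same way as the vector code */
    return (int32_t)(b ^ 0x80000000u) > (int32_t)(a ^ 0x7fffffffu);
}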
__m128i test_mm_cmpgt_epi32(__m128i A, __m128i B) {
  // CHECK-LABEL: test_mm_cmpgt_epi32
  // CHECK: icmp sgt <4 x i32>
  return _mm_cmpgt_epi32(A, B);
}