/* Test the 128-bit form */ static void ssse3_test_psignd128 (int *i1, int *i2, int *r) { /* Assumes incoming pointers are 16-byte aligned */ __m128i t1 = *(__m128i *) i1; __m128i t2 = *(__m128i *) i2; *(__m128i *)r = _mm_sign_epi32 (t1, t2); }
inline __m128i LOAD_QUANTISED(const int32_t *idata, const QuantisationMatrix *qmatrix, const int l, const int s) { __m128i D = _mm_load_si128((__m128i *)idata); __m128i QF = _mm_load_si128((__m128i *)&qmatrix->qfactor[l][s]); __m128i QO = _mm_load_si128((__m128i *)&qmatrix->qoffset[l][s]); __m128i X = _mm_abs_epi32(D); X = _mm_mullo_epi32(X, QF); X = _mm_add_epi32(X, QO); X = _mm_srai_epi32(X, 2); X = _mm_sign_epi32(X, D); return X; }
__m128i test_mm_sign_epi32(__m128i a, __m128i b) { // CHECK-LABEL: test_mm_sign_epi32 // CHECK: call <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}) return _mm_sign_epi32(a, b); }
template<class T> inline void dequantise_sse4_2_16_8_3(QuantisationMatrix *qmatrix, int32_t *idata, void *_odata, int ostride) { T *odata = (T *)_odata; const int slice_width = 16; const int slice_height = 8; const int Y = 0; const int X = 0; const int N = 0; T * const optr = &odata[Y*slice_height*ostride + X*slice_width]; const int32_t * iptr = &idata[N*slice_height*slice_width]; { __m128i D0; { D0 = _mm_load_si128((__m128i *)&iptr[ 0]); // [ 0 1 2 3 ] (Q) __m128i QF = _mm_unpacklo_epi64(_mm_load_si128((__m128i *)&qmatrix->qfactor[0][0]), _mm_load_si128((__m128i *)&qmatrix->qfactor[1][1])); __m128i QO = _mm_unpacklo_epi64(_mm_load_si128((__m128i *)&qmatrix->qoffset[0][0]), _mm_load_si128((__m128i *)&qmatrix->qoffset[1][1])); __m128i X = _mm_abs_epi32(D0); X = _mm_mullo_epi32(X, QF); X = _mm_add_epi32(X, QO); X = _mm_srai_epi32(X, 2); D0 = _mm_sign_epi32(X, D0); D0 = _mm_shuffle_epi32(D0, 0xD8); } const __m128i D1 = LOAD_QUANTISED(&iptr[8], qmatrix, 2, 1); const __m128i D2 = LOAD_QUANTISED(&iptr[32], qmatrix, 3, 1); const __m128i D3 = LOAD_QUANTISED(&iptr[36], qmatrix, 3, 1); const __m128i A0 = _mm_unpacklo_epi32(D0, D1); const __m128i A1 = _mm_unpackhi_epi32(D0, D1); const __m128i B0 = _mm_unpacklo_epi32(A0, D2); const __m128i B1 = _mm_unpackhi_epi32(A0, D2); const __m128i B2 = _mm_unpacklo_epi32(A1, D3); const __m128i B3 = _mm_unpackhi_epi32(A1, D3); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[0*ostride + 0], B0, B1); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[0*ostride + 8], B2, B3); } { __m128i D0; { D0 = _mm_load_si128((__m128i *)&iptr[ 4]); __m128i QF = _mm_unpacklo_epi64(_mm_load_si128((__m128i *)&qmatrix->qfactor[1][2]), _mm_load_si128((__m128i *)&qmatrix->qfactor[1][3])); __m128i QO = _mm_unpacklo_epi64(_mm_load_si128((__m128i *)&qmatrix->qoffset[1][2]), _mm_load_si128((__m128i *)&qmatrix->qoffset[1][3])); __m128i X = _mm_abs_epi32(D0); X = _mm_mullo_epi32(X, QF); X = _mm_add_epi32(X, QO); X = _mm_srai_epi32(X, 2); D0 = _mm_sign_epi32(X, D0); D0 = _mm_shuffle_epi32(D0, 0xD8); } const __m128i D1 = LOAD_QUANTISED(&iptr[12], qmatrix, 2, 1); const __m128i D2 = LOAD_QUANTISED(&iptr[48], qmatrix, 3, 1); const __m128i D3 = LOAD_QUANTISED(&iptr[52], qmatrix, 3, 1); const __m128i A0 = _mm_unpacklo_epi32(D0, D1); const __m128i A1 = _mm_unpackhi_epi32(D0, D1); const __m128i B0 = _mm_unpacklo_epi32(A0, D2); const __m128i B1 = _mm_unpackhi_epi32(A0, D2); const __m128i B2 = _mm_unpacklo_epi32(A1, D3); const __m128i B3 = _mm_unpackhi_epi32(A1, D3); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[4*ostride + 0], B0, B1); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[4*ostride + 8], B2, B3); } { const __m128i D0 = LOAD_QUANTISED(&iptr[16], qmatrix, 2, 2); const __m128i D1 = LOAD_QUANTISED(&iptr[24], qmatrix, 2, 3); const __m128i D2 = LOAD_QUANTISED(&iptr[40], qmatrix, 3, 1); const __m128i D3 = LOAD_QUANTISED(&iptr[44], qmatrix, 3, 1); const __m128i A0 = _mm_unpacklo_epi32(D0, D1); const __m128i A1 = _mm_unpackhi_epi32(D0, D1); const __m128i B0 = _mm_unpacklo_epi32(A0, D2); const __m128i B1 = _mm_unpackhi_epi32(A0, D2); const __m128i B2 = _mm_unpacklo_epi32(A1, D3); const __m128i B3 = _mm_unpackhi_epi32(A1, D3); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[2*ostride + 0], B0, B1); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[2*ostride + 8], B2, B3); } { const __m128i D0 = LOAD_QUANTISED(&iptr[20], qmatrix, 2, 2); const __m128i D1 = LOAD_QUANTISED(&iptr[28], qmatrix, 2, 3); const __m128i D2 = LOAD_QUANTISED(&iptr[56], qmatrix, 3, 1); const __m128i D3 = LOAD_QUANTISED(&iptr[60], qmatrix, 3, 1); const __m128i A0 = _mm_unpacklo_epi32(D0, D1); const __m128i A1 = _mm_unpackhi_epi32(D0, D1); const __m128i B0 = _mm_unpacklo_epi32(A0, D2); const __m128i B1 = _mm_unpackhi_epi32(A0, D2); const __m128i B2 = _mm_unpacklo_epi32(A1, D3); const __m128i B3 = _mm_unpackhi_epi32(A1, D3); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[6*ostride + 0], B0, B1); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[6*ostride + 8], B2, B3); } for (int y = 0; y < 4; y++) { const __m128i D0 = LOAD_QUANTISED(&iptr[ 64 + y*8], qmatrix, 3, 2); const __m128i D1 = LOAD_QUANTISED(&iptr[ 68 + y*8], qmatrix, 3, 2); const __m128i D2 = LOAD_QUANTISED(&iptr[ 96 + y*8], qmatrix, 3, 3); const __m128i D3 = LOAD_QUANTISED(&iptr[100 + y*8], qmatrix, 3, 3); const __m128i A0 = _mm_unpacklo_epi32(D0, D2); const __m128i A1 = _mm_unpackhi_epi32(D0, D2); const __m128i A2 = _mm_unpacklo_epi32(D1, D3); const __m128i A3 = _mm_unpackhi_epi32(D1, D3); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[(2*y + 1)*ostride + 0], A0, A1); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[(2*y + 1)*ostride + 8], A2, A3); } }
template<> void dequantise_sse4_2<4,8,2, int32_t>(QuantisationMatrix *qmatrix, int32_t *idata, void *_odata, int ostride, int, int, int) { int32_t *odata = (int32_t *)_odata; const int slice_width = 4; const int slice_height = 8; const int Y = 0; const int X = 0; const int N = 0; int32_t * const optr = &odata[Y*slice_height*ostride + X*slice_width]; const int32_t * iptr = &idata[N*slice_height*slice_width]; __m128i D0; { D0 = _mm_load_si128((__m128i *)&iptr[ 0]); // [ 0 1 2 3 ] (Q) __m128i QF = _mm_unpacklo_epi64(_mm_load_si128((__m128i *)&qmatrix->qfactor[0][0]), _mm_load_si128((__m128i *)&qmatrix->qfactor[1][1])); __m128i QO = _mm_unpacklo_epi64(_mm_load_si128((__m128i *)&qmatrix->qoffset[0][0]), _mm_load_si128((__m128i *)&qmatrix->qoffset[1][1])); __m128i X = _mm_abs_epi32(D0); X = _mm_mullo_epi32(X, QF); X = _mm_add_epi32(X, QO); X = _mm_srai_epi32(X, 2); D0 = _mm_sign_epi32(X, D0); } __m128i D4; { D4 = _mm_load_si128((__m128i *)&iptr[ 4]); // [ 4 5 6 7 ] (Q) __m128i QF = _mm_unpacklo_epi64(_mm_load_si128((__m128i *)&qmatrix->qfactor[1][2]), _mm_load_si128((__m128i *)&qmatrix->qfactor[1][3])); __m128i QO = _mm_unpacklo_epi64(_mm_load_si128((__m128i *)&qmatrix->qoffset[1][2]), _mm_load_si128((__m128i *)&qmatrix->qoffset[1][3])); __m128i X = _mm_abs_epi32(D4); X = _mm_mullo_epi32(X, QF); X = _mm_add_epi32(X, QO); X = _mm_srai_epi32(X, 2); D4 = _mm_sign_epi32(X, D4); } const __m128i D8 = LOAD_QUANTISED(&iptr[ 8], qmatrix, 2, 1); // [ 8 9 10 11 ] const __m128i D12 = LOAD_QUANTISED(&iptr[12], qmatrix, 2, 1); // [ 12 13 14 15 ] const __m128i D16 = LOAD_QUANTISED(&iptr[16], qmatrix, 2, 2); // [ 16 17 18 19 ] const __m128i D20 = LOAD_QUANTISED(&iptr[20], qmatrix, 2, 2); // [ 20 21 22 23 ] const __m128i D24 = LOAD_QUANTISED(&iptr[24], qmatrix, 2, 3); // [ 24 25 26 27 ] const __m128i D28 = LOAD_QUANTISED(&iptr[28], qmatrix, 2, 3); // [ 28 29 30 31 ] const __m128i X0 = _mm_unpacklo_epi32(D0, D4); // [ 0 4 1 5 ] const __m128i X1 = _mm_unpackhi_epi32(D0, D4); // [ 2 6 3 7 ] const __m128i Y0 = _mm_unpacklo_epi32(X0, X1); // [ 0 2 4 6 ] const __m128i Y1 = _mm_unpackhi_epi32(X0, X1); // [ 1 3 5 7 ] const __m128i Z0 = _mm_unpacklo_epi32(Y0, D8); // [ 0 8 2 9 ] _mm_store_si128((__m128i *)&optr[0*ostride + 0], Z0); const __m128i Z1 = _mm_unpackhi_epi32(Y0, D8); // [ 4 10 6 11 ] _mm_store_si128((__m128i *)&optr[2*ostride + 0], Z1); const __m128i Z2 = _mm_unpacklo_epi32(Y1, D12); // [ 1 12 3 13 ] _mm_store_si128((__m128i *)&optr[4*ostride + 0], Z2); const __m128i Z3 = _mm_unpackhi_epi32(Y1, D12); // [ 5 14 7 15 ] _mm_store_si128((__m128i *)&optr[6*ostride + 0], Z3); const __m128i W0 = _mm_unpacklo_epi32(D16, D24);// [ 16 24 17 25 ] _mm_store_si128((__m128i *)&optr[1*ostride + 0], W0); const __m128i W1 = _mm_unpackhi_epi32(D16, D24);// [ 18 26 19 27 ] _mm_store_si128((__m128i *)&optr[3*ostride + 0], W1); const __m128i W2 = _mm_unpacklo_epi32(D20, D28);// [ 20 28 21 29 ] _mm_store_si128((__m128i *)&optr[5*ostride + 0], W2); const __m128i W3 = _mm_unpackhi_epi32(D20, D28);// [ 22 30 23 31 ] _mm_store_si128((__m128i *)&optr[7*ostride + 0], W3); }