// Inverse color transform (SSE2): reconstructs red and blue from the
// green-based prediction deltas, processing 4 ARGB pixels per iteration.
// The (num_pixels % 4) tail falls back to the plain-C implementation.
static void TransformColorInverse_SSE2(const VP8LMultipliers* const m,
                                       const uint32_t* const src,
                                       int num_pixels, uint32_t* dst) {
// sign-extended multiplying constants, pre-shifted by 5.
#define CST(X) (((int16_t)(m->X << 8)) >> 5)   // sign-extend
#define MK_CST_16(HI, LO) \
  _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
  const __m128i mults_rb = MK_CST_16(CST(green_to_red_), CST(green_to_blue_));
  const __m128i mults_b2 = MK_CST_16(CST(red_to_blue_), 0);
#undef MK_CST_16
#undef CST
  const __m128i mask_ag = _mm_set1_epi32(0xff00ff00);  // alpha-green masks
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]);  // argb
    const __m128i A = _mm_and_si128(in, mask_ag);                 // a 0 g 0
    // Duplicate green into both 16-bit halves of each 32-bit lane.
    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // g0g0
    const __m128i D = _mm_mulhi_epi16(C, mults_rb);    // x dr  x db1
    const __m128i E = _mm_add_epi8(in, D);             // x r'  x b'
    const __m128i F = _mm_slli_epi16(E, 8);            // r' 0  b' 0
    const __m128i G = _mm_mulhi_epi16(F, mults_b2);    // x db2 0  0
    const __m128i H = _mm_srli_epi32(G, 8);            // 0 x   db2 0
    const __m128i I = _mm_add_epi8(H, F);              // r' x  b'' 0
    const __m128i J = _mm_srli_epi16(I, 8);            // 0 r'  0 b''
    const __m128i out = _mm_or_si128(J, A);
    _mm_storeu_si128((__m128i*)&dst[i], out);
  }
  // Fall-back to C-version for left-overs.
  if (i != num_pixels) {
    VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
  }
}
// Forward color transform (SSE2): decorrelates red and blue from green,
// in-place, 4 ARGB pixels per iteration; the remaining (num_pixels % 4)
// tail is finished by the plain-C version.
static void TransformColor(const VP8LMultipliers* const m,
                           uint32_t* argb_data, int num_pixels) {
  const __m128i mults_rb = _mm_set_epi16(
      CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
      CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
      CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
      CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_));
  const __m128i mults_b2 = _mm_set_epi16(
      CST_5b(m->red_to_blue_), 0, CST_5b(m->red_to_blue_), 0,
      CST_5b(m->red_to_blue_), 0, CST_5b(m->red_to_blue_), 0);
  const __m128i mask_ag = _mm_set1_epi32(0xff00ff00);  // alpha-green masks
  const __m128i mask_rb = _mm_set1_epi32(0x00ff00ff);  // red-blue masks
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);  // argb
    const __m128i A = _mm_and_si128(in, mask_ag);                 // a 0 g 0
    // Replicate green across both 16-bit halves of each 32-bit lane.
    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // g0g0
    const __m128i D = _mm_mulhi_epi16(C, mults_rb);    // x dr  x db1
    const __m128i E = _mm_slli_epi16(in, 8);           // r 0   b 0
    const __m128i F = _mm_mulhi_epi16(E, mults_b2);    // x db2 0 0
    const __m128i G = _mm_srli_epi32(F, 16);           // 0 0   x db2
    const __m128i H = _mm_add_epi8(G, D);              // x dr  x db
    const __m128i I = _mm_and_si128(H, mask_rb);       // 0 dr  0 db
    const __m128i out = _mm_sub_epi8(in, I);
    _mm_storeu_si128((__m128i*)&argb_data[i], out);
  }
  // fallthrough and finish off with plain-C
  VP8LTransformColor_C(m, argb_data + i, num_pixels - i);
}
// SSE2 blur over 16-bit pixels: a horizontal 3-tap box pass into a small
// rolling buffer, followed by a vertical 3-tap box pass, tiled over
// 128-row strips and parallelized with OpenMP.
// Output is cropped relative to the input: (width-8) x (height-2).
Image<uint16_t> blur_fast2(const Image<uint16_t> &in) {
    Image<uint16_t> out(in.width()-8, in.height()-2);
    begin_timing;
    // multiplying by 21846 then taking the top 16 bits is equivalent to
    // dividing by three
    __m128i one_third = _mm_set1_epi16(21846);
    int vw = in.width()/8;  // width in 8-lane vectors
    if (vw > 1024) {
        printf("Image too large for constant-sized stack allocation\n");
        return out;
    }
    // NOTE(review): yTile runs up to in.height() while `out` is two rows
    // shorter and `in` is read at row yTile+y+2 — confirm the Image<>
    // accessor tolerates the overrun on the last tile.
    #pragma omp parallel for
    for (int yTile = 0; yTile < in.height(); yTile += 128) {
        __m128i tmp[1024*4]; // four scanlines
        // y starts at -2 so the rolling tmp buffer holds three
        // horizontally-blurred scanlines before output rows are produced.
        for (int y = -2; y < 128; y++) {
            // to produce this scanline of the output
            __m128i *outPtr = (__m128i *)(&(out(0, yTile + y)));
            // we use this scanline of the input
            const uint16_t *inPtr = &(in(0, yTile + y + 2));
            // and these scanlines of the intermediate result
            // We start y at negative 2 to fill the tmp buffer
            __m128i *tmpPtr0 = tmp + ((y+4) & 3) * vw;  // row being written
            __m128i *tmpPtr1 = tmp + ((y+3) & 3) * vw;  // previous row
            __m128i *tmpPtr2 = tmp + ((y+2) & 3) * vw;  // row before that
            for (int x = 0; x < vw; x++) {
                // blur horizontally to produce next scanline of tmp
                __m128i val = _mm_load_si128((__m128i *)(inPtr));
                val = _mm_add_epi16(val, _mm_loadu_si128((__m128i *)(inPtr+1)));
                val = _mm_add_epi16(val, _mm_loadu_si128((__m128i *)(inPtr+2)));
                val = _mm_mulhi_epi16(val, one_third);
                _mm_store_si128(tmpPtr0++, val);
                // blur vertically using previous scanlines of tmp to produce output
                if (y >= 0) {
                    val = _mm_add_epi16(val, _mm_load_si128(tmpPtr1++));
                    val = _mm_add_epi16(val, _mm_load_si128(tmpPtr2++));
                    val = _mm_mulhi_epi16(val, one_third);
                    _mm_store_si128(outPtr++, val);
                }
                inPtr += 8;
            }
        }
    }
    end_timing;
    return out;
}
// Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2) static WEBP_INLINE void DoFilter6(__m128i *p2, __m128i* p1, __m128i *p0, __m128i* q0, __m128i* q1, __m128i *q2, const __m128i* mask, int hev_thresh) { __m128i a, not_hev; const __m128i sign_bit = _mm_set1_epi8(0x80); // compute hev mask GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev); // convert to signed values FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); FLIP_SIGN_BIT2(*p2, *q2); GET_BASE_DELTA(*p1, *p0, *q0, *q1, a); { // do simple filter on pixels with hev const __m128i m = _mm_andnot_si128(not_hev, *mask); const __m128i f = _mm_and_si128(a, m); DO_SIMPLE_FILTER(*p0, *q0, f); } { // do strong filter on pixels with not hev const __m128i zero = _mm_setzero_si128(); const __m128i nine = _mm_set1_epi16(0x0900); const __m128i sixty_three = _mm_set1_epi16(63); const __m128i m = _mm_and_si128(not_hev, *mask); const __m128i f = _mm_and_si128(a, m); const __m128i f_lo = _mm_unpacklo_epi8(zero, f); const __m128i f_hi = _mm_unpackhi_epi8(zero, f); const __m128i f9_lo = _mm_mulhi_epi16(f_lo, nine); // Filter (lo) * 9 const __m128i f9_hi = _mm_mulhi_epi16(f_hi, nine); // Filter (hi) * 9 const __m128i f18_lo = _mm_add_epi16(f9_lo, f9_lo); // Filter (lo) * 18 const __m128i f18_hi = _mm_add_epi16(f9_hi, f9_hi); // Filter (hi) * 18 const __m128i a2_lo = _mm_add_epi16(f9_lo, sixty_three); // Filter * 9 + 63 const __m128i a2_hi = _mm_add_epi16(f9_hi, sixty_three); // Filter * 9 + 63 const __m128i a1_lo = _mm_add_epi16(f18_lo, sixty_three); // F... * 18 + 63 const __m128i a1_hi = _mm_add_epi16(f18_hi, sixty_three); // F... * 18 + 63 const __m128i a0_lo = _mm_add_epi16(f18_lo, a2_lo); // Filter * 27 + 63 const __m128i a0_hi = _mm_add_epi16(f18_hi, a2_hi); // Filter * 27 + 63 UPDATE_2PIXELS(*p2, *q2, a2_lo, a2_hi); UPDATE_2PIXELS(*p1, *q1, a1_lo, a1_hi); UPDATE_2PIXELS(*p0, *q0, a0_lo, a0_hi); } // unoffset FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); FLIP_SIGN_BIT2(*p2, *q2); }
// Builds a histogram of transformed blue values for a tile: removes the
// green_to_blue and red_to_blue predictions from each pixel's blue channel
// (SPAN pixels per iteration) and counts the results. Columns that don't
// fill a whole SPAN are handled by the C fallback.
static void CollectColorBlueTransforms(const uint32_t* argb, int stride,
                                       int tile_width, int tile_height,
                                       int green_to_blue, int red_to_blue,
                                       int histo[]) {
  const __m128i mults_r = _mm_set_epi16(
      CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0,
      CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0);
  const __m128i mults_g = _mm_set_epi16(
      0, CST_5b(green_to_blue), 0, CST_5b(green_to_blue),
      0, CST_5b(green_to_blue), 0, CST_5b(green_to_blue));
  const __m128i mask_g = _mm_set1_epi32(0x00ff00);  // green mask
  const __m128i mask_b = _mm_set1_epi32(0x0000ff);  // blue mask
  int y;
  for (y = 0; y < tile_height; ++y) {
    const uint32_t* const src = argb + y * stride;
    int i, x;
    for (x = 0; x + SPAN <= tile_width; x += SPAN) {
      uint16_t values[SPAN];
      const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]);
      const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
      const __m128i A0 = _mm_slli_epi16(in0, 8);        // r 0  | b 0
      const __m128i A1 = _mm_slli_epi16(in1, 8);
      const __m128i B0 = _mm_and_si128(in0, mask_g);    // 0 0  | g 0
      const __m128i B1 = _mm_and_si128(in1, mask_g);
      const __m128i C0 = _mm_mulhi_epi16(A0, mults_r);  // x db | 0 0
      const __m128i C1 = _mm_mulhi_epi16(A1, mults_r);
      const __m128i D0 = _mm_mulhi_epi16(B0, mults_g);  // 0 0  | x db
      const __m128i D1 = _mm_mulhi_epi16(B1, mults_g);
      const __m128i E0 = _mm_sub_epi8(in0, D0);         // x x  | x b'
      const __m128i E1 = _mm_sub_epi8(in1, D1);
      const __m128i F0 = _mm_srli_epi32(C0, 16);        // 0 0  | x db
      const __m128i F1 = _mm_srli_epi32(C1, 16);
      const __m128i G0 = _mm_sub_epi8(E0, F0);          // 0 0  | x b'
      const __m128i G1 = _mm_sub_epi8(E1, F1);
      const __m128i H0 = _mm_and_si128(G0, mask_b);     // 0 0  | 0 b
      const __m128i H1 = _mm_and_si128(G1, mask_b);
      const __m128i I = _mm_packs_epi32(H0, H1);        // 0 b' | 0 b'
      _mm_storeu_si128((__m128i*)values, I);
      for (i = 0; i < SPAN; ++i) ++histo[values[i]];
    }
  }
  {
    // Tail columns (tile_width not a multiple of SPAN): use the C version.
    const int left_over = tile_width & (SPAN - 1);
    if (left_over > 0) {
      VP8LCollectColorBlueTransforms_C(argb + tile_width - left_over, stride,
                                       left_over, tile_height,
                                       green_to_blue, red_to_blue, histo);
    }
  }
}
// Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2) static WEBP_INLINE void DoFilter6(__m128i* const p2, __m128i* const p1, __m128i* const p0, __m128i* const q0, __m128i* const q1, __m128i* const q2, const __m128i* const mask, int hev_thresh) { const __m128i zero = _mm_setzero_si128(); const __m128i sign_bit = _mm_set1_epi8(0x80); __m128i a, not_hev; // compute hev mask GetNotHEV(p1, p0, q0, q1, hev_thresh, ¬_hev); FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); FLIP_SIGN_BIT2(*p2, *q2); GetBaseDelta(p1, p0, q0, q1, &a); { // do simple filter on pixels with hev const __m128i m = _mm_andnot_si128(not_hev, *mask); __m128i f = _mm_and_si128(a, m); // insieme: dropped const DoSimpleFilter(p0, q0, &f); } { // do strong filter on pixels with not hev const __m128i k9 = _mm_set1_epi16(0x0900); const __m128i k63 = _mm_set1_epi16(63); const __m128i m = _mm_and_si128(not_hev, *mask); const __m128i f = _mm_and_si128(a, m); const __m128i f_lo = _mm_unpacklo_epi8(zero, f); const __m128i f_hi = _mm_unpackhi_epi8(zero, f); const __m128i f9_lo = _mm_mulhi_epi16(f_lo, k9); // Filter (lo) * 9 const __m128i f9_hi = _mm_mulhi_epi16(f_hi, k9); // Filter (hi) * 9 __m128i a2_lo = _mm_add_epi16(f9_lo, k63); // Filter * 9 + 63 // insieme: dropped const __m128i a2_hi = _mm_add_epi16(f9_hi, k63); // Filter * 9 + 63 // insieme: dropped const __m128i a1_lo = _mm_add_epi16(a2_lo, f9_lo); // Filter * 18 + 63 // insieme: dropped const __m128i a1_hi = _mm_add_epi16(a2_hi, f9_hi); // Filter * 18 + 63 // insieme: dropped const __m128i a0_lo = _mm_add_epi16(a1_lo, f9_lo); // Filter * 27 + 63 // insieme: dropped const __m128i a0_hi = _mm_add_epi16(a1_hi, f9_hi); // Filter * 27 + 63 // insieme: dropped const Update2Pixels(p2, q2, &a2_lo, &a2_hi); Update2Pixels(p1, q1, &a1_lo, &a1_hi); Update2Pixels(p0, q0, &a0_lo, &a0_hi); } }
// Clang codegen test: checks that _mm_mulhi_epi16 lowers to the
// llvm.x86.sse2.pmulh.w intrinsic in IR and to the pmulhw instruction in
// assembly. The DAG:/ASM: lines below are FileCheck directives; keep them
// exactly as written.
__m128i test_mm_mulhi_epi16(__m128i A, __m128i B) {
  // DAG-LABEL: test_mm_mulhi_epi16
  // DAG: call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
  //
  // ASM-LABEL: test_mm_mulhi_epi16
  // ASM: pmulhw
  return _mm_mulhi_epi16(A, B);
}
// Element-wise z[i] = ((x[i] * y[i]) >> 16) << 1 over N int16 values,
// 8 lanes at a time with SSE2. Assumes x, y and z are 16-byte aligned
// (the vectors are dereferenced directly) and ignores any tail when N is
// not a multiple of 8.
void m16_vv(int16_t *x, int16_t *y, int16_t *z, int N){
    const __m128i *vx = (const __m128i *)x;
    const __m128i *vy = (const __m128i *)y;
    __m128i *vz = (__m128i *)z;
    const int vec_count = N >> 3;  // 8 int16 lanes per vector
    for (int k = 0; k < vec_count; ++k) {
        const __m128i hi = _mm_mulhi_epi16(vx[k], vy[k]);
        vz[k] = _mm_slli_epi16(hi, 1);
    }
}
// Builds a histogram of transformed red values for a tile: removes the
// green_to_red prediction from each pixel's red channel (SPAN pixels per
// iteration) and counts the results. Columns that don't fill a whole SPAN
// are handled by the C fallback.
static void CollectColorRedTransforms(const uint32_t* argb, int stride,
                                      int tile_width, int tile_height,
                                      int green_to_red, int histo[]) {
  const __m128i mults_g = _mm_set_epi16(
      0, CST_5b(green_to_red), 0, CST_5b(green_to_red),
      0, CST_5b(green_to_red), 0, CST_5b(green_to_red));
  const __m128i mask_g = _mm_set1_epi32(0x00ff00);  // green mask
  const __m128i mask = _mm_set1_epi32(0xff);        // result (low byte) mask
  int y;
  for (y = 0; y < tile_height; ++y) {
    const uint32_t* const src = argb + y * stride;
    int i, x;
    for (x = 0; x + SPAN <= tile_width; x += SPAN) {
      uint16_t values[SPAN];
      const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]);
      const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
      const __m128i A0 = _mm_and_si128(in0, mask_g);    // 0 0 | g 0
      const __m128i A1 = _mm_and_si128(in1, mask_g);
      const __m128i B0 = _mm_srli_epi32(in0, 16);       // 0 0 | x r
      const __m128i B1 = _mm_srli_epi32(in1, 16);
      const __m128i C0 = _mm_mulhi_epi16(A0, mults_g);  // 0 0 | x dr
      const __m128i C1 = _mm_mulhi_epi16(A1, mults_g);
      const __m128i E0 = _mm_sub_epi8(B0, C0);          // x x | x r'
      const __m128i E1 = _mm_sub_epi8(B1, C1);
      const __m128i F0 = _mm_and_si128(E0, mask);       // 0 0 | 0 r'
      const __m128i F1 = _mm_and_si128(E1, mask);
      const __m128i I = _mm_packs_epi32(F0, F1);
      _mm_storeu_si128((__m128i*)values, I);
      for (i = 0; i < SPAN; ++i) ++histo[values[i]];
    }
  }
  {
    // Tail columns (tile_width not a multiple of SPAN): use the C version.
    const int left_over = tile_width & (SPAN - 1);
    if (left_over > 0) {
      VP8LCollectColorRedTransforms_C(argb + tile_width - left_over, stride,
                                      left_over, tile_height, green_to_red,
                                      histo);
    }
  }
}
/* Emulates the MMX pmulhw intrinsic: widens both 64-bit operands into the
 * low half of zeroed 128-bit registers, performs the SSE2 packed signed
 * high-multiply there, and returns the low 64 bits of the result.
 * Relies on MSVC-specific union members (m128i_i64 / m64_i64). */
__m64 _m_pmulhw(__m64 _MM1, __m64 _MM2) {
    __m128i wide_a = {0};
    __m128i wide_b = {0};
    wide_a.m128i_i64[0] = _MM1.m64_i64;
    wide_b.m128i_i64[0] = _MM2.m64_i64;
    const __m128i product_hi = _mm_mulhi_epi16(wide_a, wide_b);
    _MM1.m64_i64 = product_hi.m128i_i64[0];
    return _MM1;
}
static WEBP_INLINE __m128i ColorTransformDelta(__m128i color_pred,
                                               __m128i color) {
  // Simulates signed 8-bit multiplication with 16-bit lanes:
  // * shift each (8-bit) operand left by 8 so it occupies the high byte
  //   of its 16-bit lane,
  // * take the high 16 bits of the signed 16-bit product,
  // * shift the 32-bit lane right by 5 for the final delta.
  const __m128i pred_in_high_byte = _mm_slli_epi32(color_pred, 8);
  const __m128i color_in_high_byte = _mm_slli_epi32(color, 8);
  // Half of the packed 16-bit lanes are zero, so their products are zero
  // and do not disturb the result.
  const __m128i high_product =
      _mm_mulhi_epi16(pred_in_high_byte, color_in_high_byte);
  return _mm_srli_epi32(high_product, 5);
}
// SSE2 blur over 16-bit pixels: a horizontal 3-tap box pass into a stack
// tile buffer, then a vertical 3-tap box pass, processed in 128x32 output
// tiles with OpenMP across row strips.
// Output is cropped relative to the input: (width-8) x (height-2).
Image<uint16_t> blur_fast(Image<uint16_t> in) {
    Image<uint16_t> out(in.width()-8, in.height()-2);
    begin_timing;
    // 21846 =~ 2^16 / 3: mulhi by it divides the 16-bit sum by three.
    __m128i one_third = _mm_set1_epi16(21846);
    // NOTE(review): the tiling assumes out.width()/out.height() are
    // multiples of 128/32 — the final tiles over-read/write otherwise.
    // Confirm with callers.
    #pragma omp parallel for
    for (int yTile = 0; yTile < out.height(); yTile += 32) {
        __m128i a, b, c, sum, avg;
        __m128i tmp[(128/8) * (32 + 2)];  // horizontally blurred tile (+2 rows)
        for (int xTile = 0; xTile < out.width(); xTile += 128) {
            // Pass 1: horizontal blur of a 128x34 input tile into tmp.
            __m128i *tmpPtr = tmp;
            for (int y = 0; y < 32+2; y++) {
                const uint16_t *inPtr = &(in(xTile, yTile+y));
                for (int x = 0; x < 128; x += 8) {
                    a = _mm_load_si128((__m128i*)(inPtr));
                    b = _mm_loadu_si128((__m128i*)(inPtr+1));
                    c = _mm_loadu_si128((__m128i*)(inPtr+2));
                    sum = _mm_add_epi16(_mm_add_epi16(a, b), c);
                    avg = _mm_mulhi_epi16(sum, one_third);
                    _mm_store_si128(tmpPtr++, avg);
                    inPtr+=8;
                }
            }
            // Pass 2: vertical blur of tmp into the 128x32 output tile.
            tmpPtr = tmp;
            for (int y = 0; y < 32; y++) {
                __m128i *outPtr = (__m128i *)(&(out(xTile, yTile+y)));
                for (int x = 0; x < 128; x += 8) {
                    a = _mm_load_si128(tmpPtr+(2*128)/8);  // tmp row y+2
                    b = _mm_load_si128(tmpPtr+128/8);      // tmp row y+1
                    c = _mm_load_si128(tmpPtr++);          // tmp row y
                    sum = _mm_add_epi16(_mm_add_epi16(a, b), c);
                    avg = _mm_mulhi_epi16(sum, one_third);
                    _mm_store_si128(outPtr++, avg);
                }
            }
        }
    }
    end_timing;
    return out;
}
// Stores eight 16-bit coefficients to coeff_ptr. When tran_low_t is 32-bit
// wide, each value is sign-extended first: mulhi(x, 1) produces the
// sign-extension word of each lane (0x0000 or 0xffff) and mullo(x, 1) the
// value itself; interleaving the two yields the 32-bit results.
// The size test is a compile-time constant, so the dead branch is elided.
static INLINE void store_coefficients(__m128i coeff_vals,
                                      tran_low_t *coeff_ptr) {
  if (sizeof(tran_low_t) == 4) {
    const __m128i one = _mm_set1_epi16(1);
    const __m128i sign_words = _mm_mulhi_epi16(coeff_vals, one);
    const __m128i value_words = _mm_mullo_epi16(coeff_vals, one);
    const __m128i dwords_lo = _mm_unpacklo_epi16(value_words, sign_words);
    const __m128i dwords_hi = _mm_unpackhi_epi16(value_words, sign_words);
    _mm_store_si128((__m128i *)(coeff_ptr), dwords_lo);
    _mm_store_si128((__m128i *)(coeff_ptr + 4), dwords_hi);
  } else {
    // tran_low_t is 16-bit: the vector can be stored as-is.
    _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals);
  }
}
// Stores eight 16-bit coefficients to coeff_ptr. In high-bit-depth builds
// tran_low_t is 32-bit, so each value is sign-extended first:
// mulhi(x, 1) yields the sign-extension word of each lane, mullo(x, 1) the
// value itself, and the unpacks interleave them into 32-bit results.
static INLINE void store_coefficients(__m128i coeff_vals,
                                      tran_low_t* coeff_ptr) {
#if CONFIG_VPX_HIGHBITDEPTH
  __m128i one = _mm_set1_epi16(1);
  __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one);  // sign words
  __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one);  // value words
  __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi);
  __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi);
  _mm_store_si128((__m128i*)(coeff_ptr), coeff_vals_1);
  _mm_store_si128((__m128i*)(coeff_ptr + 4), coeff_vals_2);
#else
  /* tran_low_t is 16-bit: store the vector directly. */
  _mm_store_si128((__m128i*)(coeff_ptr), coeff_vals);
#endif
}
/**
 * SSE Implementation of \c cnsFormula (subroutine of cnsResponse).
 * \c scale, \c gaussI2 and \c regVar are 32bit floats (gaussI2 as A and B).
 * \c sobelX, \c sobelY, \c gaussI are signed short.
 * \c result is a packed vector of unsigned signed 8bit number with the x and y component
 * alternating and \c offset (unsigned char) added.
 */
ALWAYSINLINE static void cnsFormula(__m128i& result, __m128i sobelX, __m128i sobelY, __m128i& gaussI,
                                    const __m128& gaussI2A, const __m128& gaussI2B,
                                    const __m128& scale, const __m128& regVar, __m128i offset)
{
  // Widen the eight 16-bit gaussI values into two float vectors (A: low
  // four lanes, B: high four lanes).
  __m128 gaussIA = _mm_cvtepi32_ps(_mm_unpacklo_epi16(gaussI, _mm_setzero_si128()));
  __m128 gaussIB = _mm_cvtepi32_ps(_mm_unpackhi_epi16(gaussI, _mm_setzero_si128()));

  // gaussI2-gaussI^2+regVar  (regularized local intensity variance)
  __m128 factorA = _mm_add_ps(_mm_sub_ps(gaussI2A, _mm_mul_ps(gaussIA, gaussIA)), regVar);
  __m128 factorB = _mm_add_ps(_mm_sub_ps(gaussI2B, _mm_mul_ps(gaussIB, gaussIB)), regVar);

  factorA = _mm_mul_ps(_mm_rsqrt_ps(factorA), scale); // scale/sqrt(gaussI2-gaussI^2+regVar)
  factorB = _mm_mul_ps(_mm_rsqrt_ps(factorB), scale);

  // (2^-11)*sobelX*(scale/sqrt(gaussI2-gaussI^2+regVar))
  // (sobel << 5, then mulhi's implicit >> 16, gives the 2^-11 scaling)
  __m128i factor = _mm_packs_epi32(_mm_cvtps_epi32(factorA), _mm_cvtps_epi32(factorB));
  __m128i resultXepi16 = _mm_mulhi_epi16(_mm_slli_epi16(sobelX, 5), factor);
  __m128i resultYepi16 = _mm_mulhi_epi16(_mm_slli_epi16(sobelY, 5), factor);

  // Convert to 8bit and interleave X and Y
  // the second argument of packs duplicates values to higher bytes, but these are ignored later, unpacklo interleaves X and Y
  __m128i resultepi8 = _mm_unpacklo_epi8(_mm_packs_epi16(resultXepi16, resultXepi16), _mm_packs_epi16(resultYepi16, resultYepi16));
  result = _mm_add_epi8(resultepi8, offset); // add offset, switching to epu8
}
// Warps 8 pixels of one output row: derives a per-pixel displacement from
// the local gradients of the edge map, clamps it to the frame limits, and
// bilinearly samples the source at the displaced coordinates.
// SMAGL is a compile-time super-sampling shift defined outside this block.
static FORCE_INLINE void warp_mmword_u8_sse2(
        const uint8_t *srcp, const uint8_t *edgep, uint8_t *dstp,
        int src_stride, int edge_stride, int height, int x, int y,
        const __m128i &depth, const __m128i &zero,
        const __m128i &x_limit_min, const __m128i &x_limit_max,
        const __m128i &y_limit_min, const __m128i &y_limit_max,
        const __m128i &word_64, const __m128i &word_127,
        const __m128i &word_128, const __m128i &word_255,
        const __m128i &one_stride) {
    int SMAG = 1 << SMAGL;

    // calculate displacement
    // Neighbouring edge-map samples; the row offsets collapse to 0 at the
    // top and bottom borders.
    __m128i above = _mm_loadl_epi64((const __m128i *)(edgep + x - (y ? edge_stride : 0)));
    __m128i below = _mm_loadl_epi64((const __m128i *)(edgep + x + (y < height - 1 ? edge_stride : 0)));
    __m128i left = _mm_loadl_epi64((const __m128i *)(edgep + x - 1));
    __m128i right = _mm_loadl_epi64((const __m128i *)(edgep + x + 1));

    above = _mm_unpacklo_epi8(above, zero);
    below = _mm_unpacklo_epi8(below, zero);
    left = _mm_unpacklo_epi8(left, zero);
    right = _mm_unpacklo_epi8(right, zero);

    // Horizontal and vertical gradients, scaled by `depth` via mulhi.
    __m128i h = _mm_sub_epi16(left, right);
    __m128i v = _mm_sub_epi16(above, below);

    h = _mm_slli_epi16(h, 7);
    v = _mm_slli_epi16(v, 7);

    h = _mm_mulhi_epi16(h, depth);
    v = _mm_mulhi_epi16(v, depth);

    v = _mm_max_epi16(v, y_limit_min);
    v = _mm_min_epi16(v, y_limit_max);

    // Split the displacement into an integer part (h, v) and a 7-bit
    // fractional remainder used for the bilinear weights.
    __m128i remainder_h = h;
    __m128i remainder_v = v;

    if (SMAGL) {
        remainder_h = _mm_slli_epi16(remainder_h, SMAGL);
        remainder_v = _mm_slli_epi16(remainder_v, SMAGL);
    }

    remainder_h = _mm_and_si128(remainder_h, word_127);
    remainder_v = _mm_and_si128(remainder_v, word_127);

    h = _mm_srai_epi16(h, 7 - SMAGL);
    v = _mm_srai_epi16(v, 7 - SMAGL);

    // Add the base x coordinate and clamp; zero the fractional part where
    // the integer coordinate got clamped against a limit.
    __m128i xx = _mm_set1_epi32(x << SMAGL);
    xx = _mm_packs_epi32(xx, xx);

    h = _mm_adds_epi16(h, xx);

    remainder_h = _mm_and_si128(remainder_h, _mm_cmpgt_epi16(x_limit_max, h));
    remainder_h = _mm_andnot_si128(_mm_cmpgt_epi16(x_limit_min, h), remainder_h);

    h = _mm_max_epi16(h, x_limit_min);
    h = _mm_min_epi16(h, x_limit_max);

    // h and v contain the displacement now.
    __m128i disp_lo = _mm_unpacklo_epi16(v, h);
    __m128i disp_hi = _mm_unpackhi_epi16(v, h);

    // Turn each (v, h) pair into a byte offset: v * stride + h.
    disp_lo = _mm_madd_epi16(disp_lo, one_stride);
    disp_hi = _mm_madd_epi16(disp_hi, one_stride);

    // Gather two horizontally adjacent source bytes per pixel, from two
    // consecutive source rows (line0/line1), for bilinear interpolation.
    __m128i line0 = _mm_setzero_si128();
    __m128i line1 = _mm_setzero_si128();

    int offset = _mm_cvtsi128_si32(disp_lo);
    disp_lo = _mm_srli_si128(disp_lo, 4);
    line0 = _mm_insert_epi16(line0, *(int16_t *)(srcp + offset), 0);
    line1 = _mm_insert_epi16(line1, *(int16_t *)(srcp + offset + src_stride), 0);

    offset = _mm_cvtsi128_si32(disp_lo);
    disp_lo = _mm_srli_si128(disp_lo, 4);
    line0 = _mm_insert_epi16(line0, *(int16_t *)(srcp + offset + 1 * SMAG), 1);
    line1 = _mm_insert_epi16(line1, *(int16_t *)(srcp + offset + src_stride + 1 * SMAG), 1);

    offset = _mm_cvtsi128_si32(disp_lo);
    disp_lo = _mm_srli_si128(disp_lo, 4);
    line0 = _mm_insert_epi16(line0, *(int16_t *)(srcp + offset + 2 * SMAG), 2);
    line1 = _mm_insert_epi16(line1, *(int16_t *)(srcp + offset + src_stride + 2 * SMAG), 2);

    offset = _mm_cvtsi128_si32(disp_lo);
    disp_lo = _mm_srli_si128(disp_lo, 4);
    line0 = _mm_insert_epi16(line0, *(int16_t *)(srcp + offset + 3 * SMAG), 3);
    line1 = _mm_insert_epi16(line1, *(int16_t *)(srcp + offset + src_stride + 3 * SMAG), 3);

    offset = _mm_cvtsi128_si32(disp_hi);
    disp_hi = _mm_srli_si128(disp_hi, 4);
    line0 = _mm_insert_epi16(line0, *(int16_t *)(srcp + offset + 4 * SMAG), 4);
    line1 = _mm_insert_epi16(line1, *(int16_t *)(srcp + offset + src_stride + 4 * SMAG), 4);

    offset = _mm_cvtsi128_si32(disp_hi);
    disp_hi = _mm_srli_si128(disp_hi, 4);
    line0 = _mm_insert_epi16(line0, *(int16_t *)(srcp + offset + 5 * SMAG), 5);
    line1 = _mm_insert_epi16(line1, *(int16_t *)(srcp + offset + src_stride + 5 * SMAG), 5);

    offset = _mm_cvtsi128_si32(disp_hi);
    disp_hi = _mm_srli_si128(disp_hi, 4);
    line0 = _mm_insert_epi16(line0, *(int16_t *)(srcp + offset + 6 * SMAG), 6);
    line1 = _mm_insert_epi16(line1, *(int16_t *)(srcp + offset + src_stride + 6 * SMAG), 6);

    offset = _mm_cvtsi128_si32(disp_hi);
    disp_hi = _mm_srli_si128(disp_hi, 4);
    line0 = _mm_insert_epi16(line0, *(int16_t *)(srcp + offset + 7 * SMAG), 7);
    line1 = _mm_insert_epi16(line1, *(int16_t *)(srcp + offset + src_stride + 7 * SMAG), 7);

    // Horizontal interpolation between the two bytes of each 16-bit pair,
    // weighted by the 7-bit horizontal remainder (128 = full step).
    __m128i left0 = _mm_and_si128(line0, word_255);
    __m128i left1 = _mm_and_si128(line1, word_255);

    __m128i right0 = _mm_srli_epi16(line0, 8);
    __m128i right1 = _mm_srli_epi16(line1, 8);

    left0 = _mm_mullo_epi16(left0, _mm_sub_epi16(word_128, remainder_h));
    left1 = _mm_mullo_epi16(left1, _mm_sub_epi16(word_128, remainder_h));

    right0 = _mm_mullo_epi16(right0, remainder_h);
    right1 = _mm_mullo_epi16(right1, remainder_h);

    line0 = _mm_add_epi16(left0, right0);
    line1 = _mm_add_epi16(left1, right1);

    line0 = _mm_add_epi16(line0, word_64);  // +64: round before >> 7
    line1 = _mm_add_epi16(line1, word_64);

    line0 = _mm_srai_epi16(line0, 7);
    line1 = _mm_srai_epi16(line1, 7);

    // Vertical interpolation between the two rows, weighted by the
    // vertical remainder; round, pack to unsigned bytes, and store.
    line0 = _mm_mullo_epi16(line0, _mm_sub_epi16(word_128, remainder_v));
    line1 = _mm_mullo_epi16(line1, remainder_v);

    __m128i result = _mm_add_epi16(line0, line1);

    result = _mm_add_epi16(result, word_64);

    result = _mm_srai_epi16(result, 7);

    result = _mm_packus_epi16(result, result);

    _mm_storel_epi64((__m128i *)(dstp + x), result);
}
/*---------------------------------------------------------------------------*/
/* Converts planar 16-bit Y/Cb/Cr to planar 16-bit R/G/B using SSE2
 * fixed-point arithmetic (conversion factors scaled by 2^14; see the
 * in-loop comment for the derivation). Requires 16-byte-aligned planes,
 * a width that is a multiple of 8, and strides that are multiples of 128;
 * otherwise it delegates to the general (non-SIMD) implementation. */
PRIM_STATIC pstatus_t sse2_yCbCrToRGB_16s16s_P3P3(
	const INT16 *pSrc[3],  int srcStep,
	INT16 *pDst[3],  int dstStep,
	const prim_size_t *roi)	/* region of interest */
{
	__m128i zero, max, r_cr, g_cb, g_cr, b_cb, c4096;
	__m128i *y_buf, *cb_buf, *cr_buf, *r_buf, *g_buf, *b_buf;
	int srcbump, dstbump, yp, imax;

	if (((ULONG_PTR) (pSrc[0]) & 0x0f)
			|| ((ULONG_PTR) (pSrc[1]) & 0x0f)
			|| ((ULONG_PTR) (pSrc[2]) & 0x0f)
			|| ((ULONG_PTR) (pDst[0]) & 0x0f)
			|| ((ULONG_PTR) (pDst[1]) & 0x0f)
			|| ((ULONG_PTR) (pDst[2]) & 0x0f)
			|| (roi->width & 0x07)
			|| (srcStep & 127)
			|| (dstStep & 127))
	{
		/* We can't maintain 16-byte alignment. */
		return general_yCbCrToRGB_16s16s_P3P3(pSrc, srcStep, pDst, dstStep, roi);
	}

	zero = _mm_setzero_si128();
	max = _mm_set1_epi16(255);

	y_buf = (__m128i*) (pSrc[0]);
	cb_buf = (__m128i*) (pSrc[1]);
	cr_buf = (__m128i*) (pSrc[2]);
	r_buf = (__m128i*) (pDst[0]);
	g_buf = (__m128i*) (pDst[1]);
	b_buf = (__m128i*) (pDst[2]);

	r_cr = _mm_set1_epi16(22986);	/*  1.403 << 14 */
	g_cb = _mm_set1_epi16(-5636);	/* -0.344 << 14 */
	g_cr = _mm_set1_epi16(-11698);	/* -0.714 << 14 */
	b_cb = _mm_set1_epi16(28999);	/*  1.770 << 14 */
	c4096 = _mm_set1_epi16(4096);

	srcbump = srcStep / sizeof(__m128i);
	dstbump = dstStep / sizeof(__m128i);

#ifdef DO_PREFETCH
	/* Prefetch Y's, Cb's, and Cr's. */
	for (yp=0; yp<roi->height; yp++)
	{
		int i;
		for (i=0; i<roi->width * sizeof(INT16) / sizeof(__m128i);
				i += (CACHE_LINE_BYTES / sizeof(__m128i)))
		{
			_mm_prefetch((char*)(&y_buf[i]), _MM_HINT_NTA);
			_mm_prefetch((char*)(&cb_buf[i]), _MM_HINT_NTA);
			_mm_prefetch((char*)(&cr_buf[i]), _MM_HINT_NTA);
		}
		y_buf += srcbump;
		cb_buf += srcbump;
		cr_buf += srcbump;
	}
	/* Rewind the plane pointers after the prefetch pass. */
	y_buf = (__m128i*) (pSrc[0]);
	cb_buf = (__m128i*) (pSrc[1]);
	cr_buf = (__m128i*) (pSrc[2]);
#endif /* DO_PREFETCH */

	imax = roi->width * sizeof(INT16) / sizeof(__m128i);
	for (yp=0; yp<roi->height; ++yp)
	{
		int i;
		for (i=0; i<imax; i++)
		{
			/* In order to use SSE2 signed 16-bit integer multiplication
			 * we need to convert the floating point factors to signed int
			 * without losing information.
			 * The result of this multiplication is 32 bit and we have two
			 * SSE instructions that return either the hi or lo word.
			 * Thus we will multiply the factors by the highest possible 2^n,
			 * take the upper 16 bits of the signed 32-bit result
			 * (_mm_mulhi_epi16) and correct this result by multiplying
			 * it by 2^(16-n).
			 *
			 * For the given factors in the conversion matrix the best
			 * possible n is 14.
			 *
			 * Example for calculating r:
			 * r = (y>>5) + 128 + (cr*1.403)>>5            // our base formula
			 * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5  // see above
			 * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5  // simplification
			 * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
			 */

			/* y = (y_r_buf[i] + 4096) >> 2 */
			__m128i y, cb, cr, r, g, b;
			y = _mm_load_si128(y_buf + i);
			y = _mm_add_epi16(y, c4096);
			y = _mm_srai_epi16(y, 2);
			/* cb = cb_g_buf[i]; */
			cb = _mm_load_si128(cb_buf + i);
			/* cr = cr_b_buf[i]; */
			cr = _mm_load_si128(cr_buf + i);

			/* (y + HIWORD(cr*22986)) >> 3 */
			r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr));
			r = _mm_srai_epi16(r, 3);
			/* r_buf[i] = MINMAX(r, 0, 255); */
			_mm_between_epi16(r, zero, max);
			_mm_store_si128(r_buf + i, r);

			/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
			g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb));
			g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr));
			g = _mm_srai_epi16(g, 3);
			/* g_buf[i] = MINMAX(g, 0, 255); */
			_mm_between_epi16(g, zero, max);
			_mm_store_si128(g_buf + i, g);

			/* (y + HIWORD(cb*28999)) >> 3 */
			b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb));
			b = _mm_srai_epi16(b, 3);
			/* b_buf[i] = MINMAX(b, 0, 255); */
			_mm_between_epi16(b, zero, max);
			_mm_store_si128(b_buf + i, b);
		}
		/* Advance all plane pointers by one stride. */
		y_buf += srcbump;
		cb_buf += srcbump;
		cr_buf += srcbump;
		r_buf += dstbump;
		g_buf += dstbump;
		b_buf += dstbump;
	}

	return PRIMITIVES_SUCCESS;
}
pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep, BYTE *pDst, int dstStep, const prim_size_t *roi) { int lastRow, lastCol; BYTE *UData,*VData,*YData; int i,nWidth,nHeight,VaddDst,VaddY,VaddU,VaddV; __m128i r0,r1,r2,r3,r4,r5,r6,r7; __m128i *buffer; /* last_line: if the last (U,V doubled) line should be skipped, set to 10B * last_column: if it's the last column in a line, set to 10B (for handling line-endings not multiple by four) */ buffer = _aligned_malloc(4 * 16, 16); YData = (BYTE*) pSrc[0]; UData = (BYTE*) pSrc[1]; VData = (BYTE*) pSrc[2]; nWidth = roi->width; nHeight = roi->height; if ((lastCol = (nWidth & 3))) { switch (lastCol) { case 1: r7 = _mm_set_epi32(0,0,0,0xFFFFFFFF); break; case 2: r7 = _mm_set_epi32(0,0,0xFFFFFFFF,0xFFFFFFFF); break; case 3: r7 = _mm_set_epi32(0,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF); break; } _mm_store_si128(buffer+3,r7); lastCol = 1; } nWidth += 3; nWidth = nWidth >> 2; lastRow = nHeight & 1; nHeight++; nHeight = nHeight >> 1; VaddDst = (dstStep << 1) - (nWidth << 4); VaddY = (srcStep[0] << 1) - (nWidth << 2); VaddU = srcStep[1] - (((nWidth << 1) + 2) & 0xFFFC); VaddV = srcStep[2] - (((nWidth << 1) + 2) & 0xFFFC); while (nHeight-- > 0) { if (nHeight == 0) lastRow <<= 1; i = 0; do { if (!(i & 0x01)) { /* Y-, U- and V-data is stored in different arrays. * We start with processing U-data. 
* * at first we fetch four U-values from its array and shuffle them like this: * 0d0d 0c0c 0b0b 0a0a * we've done two things: converting the values to signed words and duplicating * each value, because always two pixel "share" the same U- (and V-) data */ r0 = _mm_cvtsi32_si128(*(UINT32 *)UData); r5 = _mm_set_epi32(0x80038003,0x80028002,0x80018001,0x80008000); r0 = _mm_shuffle_epi8(r0,r5); UData += 4; /* then we subtract 128 from each value, so we get D */ r3 = _mm_set_epi16(128,128,128,128,128,128,128,128); r0 = _mm_subs_epi16(r0,r3); /* we need to do two things with our D, so let's store it for later use */ r2 = r0; /* now we can multiply our D with 48 and unpack it to xmm4:xmm0 * this is what we need to get G data later on */ r4 = r0; r7 = _mm_set_epi16(48,48,48,48,48,48,48,48); r0 = _mm_mullo_epi16(r0,r7); r4 = _mm_mulhi_epi16(r4,r7); r7 = r0; r0 = _mm_unpacklo_epi16(r0,r4); r4 = _mm_unpackhi_epi16(r7,r4); /* to get B data, we need to prepare a second value, D*475 */ r1 = r2; r7 = _mm_set_epi16(475,475,475,475,475,475,475,475); r1 = _mm_mullo_epi16(r1,r7); r2 = _mm_mulhi_epi16(r2,r7); r7 = r1; r1 = _mm_unpacklo_epi16(r1,r2); r7 = _mm_unpackhi_epi16(r7,r2); /* so we got something like this: xmm7:xmm1 * this pair contains values for 16 pixel: * aabbccdd * aabbccdd, but we can only work on four pixel at once, so we need to save upper values */ _mm_store_si128(buffer+1,r7); /* Now we've prepared U-data. Preparing V-data is actually the same, just with other coefficients */ r2 = _mm_cvtsi32_si128(*(UINT32 *)VData); r2 = _mm_shuffle_epi8(r2,r5); VData += 4; r2 = _mm_subs_epi16(r2,r3); r5 = r2; /* this is also known as E*403, we need it to convert R data */ r3 = r2; r7 = _mm_set_epi16(403,403,403,403,403,403,403,403); r2 = _mm_mullo_epi16(r2,r7); r3 = _mm_mulhi_epi16(r3,r7); r7 = r2; r2 = _mm_unpacklo_epi16(r2,r3); r7 = _mm_unpackhi_epi16(r7,r3); /* and preserve upper four values for future ... 
*/ _mm_store_si128(buffer+2,r7); /* doing this step: E*120 */ r3 = r5; r7 = _mm_set_epi16(120,120,120,120,120,120,120,120); r3 = _mm_mullo_epi16(r3,r7); r5 = _mm_mulhi_epi16(r5,r7); r7 = r3; r3 = _mm_unpacklo_epi16(r3,r5); r7 = _mm_unpackhi_epi16(r7,r5); /* now we complete what we've begun above: * (48*D) + (120*E) = (48*D +120*E) */ r0 = _mm_add_epi32(r0,r3); r4 = _mm_add_epi32(r4,r7); /* and store to memory ! */ _mm_store_si128(buffer,r4); } else { /* maybe you've wondered about the conditional above ? * Well, we prepared UV data for eight pixel in each line, but can only process four * per loop. So we need to load the upper four pixel data from memory each secound loop! */ r1 = _mm_load_si128(buffer+1); r2 = _mm_load_si128(buffer+2); r0 = _mm_load_si128(buffer); } if (++i == nWidth) lastCol <<= 1; /* We didn't produce any output yet, so let's do so! * Ok, fetch four pixel from the Y-data array and shuffle them like this: * 00d0 00c0 00b0 00a0, to get signed dwords and multiply by 256 */ r4 = _mm_cvtsi32_si128(*(UINT32 *)YData); r7 = _mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080); r4 = _mm_shuffle_epi8(r4,r7); r5 = r4; r6 = r4; /* no we can perform the "real" conversion itself and produce output! */ r4 = _mm_add_epi32(r4,r2); r5 = _mm_sub_epi32(r5,r0); r6 = _mm_add_epi32(r6,r1); /* in the end, we only need bytes for RGB values. * So, what do we do? right! shifting left makes values bigger and thats always good. * before we had dwords of data, and by shifting left and treating the result * as packed words, we get not only signed words, but do also divide by 256 * imagine, data is now ordered this way: ddx0 ccx0 bbx0 aax0, and x is the least * significant byte, that we don't need anymore, because we've done some rounding */ r4 = _mm_slli_epi32(r4,8); r5 = _mm_slli_epi32(r5,8); r6 = _mm_slli_epi32(r6,8); /* one thing we still have to face is the clip() function ... * we have still signed words, and there are those min/max instructions in SSE2 ... 
* the max instruction takes always the bigger of the two operands and stores it in the first one, * and it operates with signs ! * if we feed it with our values and zeros, it takes the zeros if our values are smaller than * zero and otherwise our values */ r7 = _mm_set_epi32(0,0,0,0); r4 = _mm_max_epi16(r4,r7); r5 = _mm_max_epi16(r5,r7); r6 = _mm_max_epi16(r6,r7); /* the same thing just completely different can be used to limit our values to 255, * but now using the min instruction and 255s */ r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); r4 = _mm_min_epi16(r4,r7); r5 = _mm_min_epi16(r5,r7); r6 = _mm_min_epi16(r6,r7); /* Now we got our bytes. * the moment has come to assemble the three channels R,G and B to the xrgb dwords * on Red channel we just have to and each futural dword with 00FF0000H */ //r7=_mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); r4 = _mm_and_si128(r4,r7); /* on Green channel we have to shuffle somehow, so we get something like this: * 00d0 00c0 00b0 00a0 */ r7 = _mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280); r5 = _mm_shuffle_epi8(r5,r7); /* and on Blue channel that one: * 000d 000c 000b 000a */ r7 = _mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002); r6 = _mm_shuffle_epi8(r6,r7); /* and at last we or it together and get this one: * xrgb xrgb xrgb xrgb */ r4 = _mm_or_si128(r4,r5); r4 = _mm_or_si128(r4,r6); /* Only thing to do know is writing data to memory, but this gets a bit more * complicated if the width is not a multiple of four and it is the last column in line. */ if (lastCol & 0x02) { /* let's say, we need to only convert six pixel in width * Ok, the first 4 pixel will be converted just like every 4 pixel else, but * if it's the last loop in line, last_column is shifted left by one (curious? have a look above), * and we land here. Through initialisation a mask was prepared. 
In this case it looks like * 0000FFFFH 0000FFFFH 0000FFFFH 0000FFFFH */ r6 = _mm_load_si128(buffer+3); /* we and our output data with this mask to get only the valid pixel */ r4 = _mm_and_si128(r4,r6); /* then we fetch memory from the destination array ... */ r5 = _mm_lddqu_si128((__m128i *)pDst); /* ... and and it with the inverse mask. We get only those pixel, which should not be updated */ r6 = _mm_andnot_si128(r6,r5); /* we only have to or the two values together and write it back to the destination array, * and only the pixel that should be updated really get changed. */ r4 = _mm_or_si128(r4,r6); } _mm_storeu_si128((__m128i *)pDst,r4); if (!(lastRow & 0x02)) { /* Because UV data is the same for two lines, we can process the secound line just here, * in the same loop. Only thing we need to do is to add some offsets to the Y- and destination * pointer. These offsets are iStride[0] and the target scanline. * But if we don't need to process the secound line, like if we are in the last line of processing nine lines, * we just skip all this. 
*/ r4 = _mm_cvtsi32_si128(*(UINT32 *)(YData+srcStep[0])); r7 = _mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080); r4 = _mm_shuffle_epi8(r4,r7); r5 = r4; r6 = r4; r4 = _mm_add_epi32(r4,r2); r5 = _mm_sub_epi32(r5,r0); r6 = _mm_add_epi32(r6,r1); r4 = _mm_slli_epi32(r4,8); r5 = _mm_slli_epi32(r5,8); r6 = _mm_slli_epi32(r6,8); r7 = _mm_set_epi32(0,0,0,0); r4 = _mm_max_epi16(r4,r7); r5 = _mm_max_epi16(r5,r7); r6 = _mm_max_epi16(r6,r7); r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); r4 = _mm_min_epi16(r4,r7); r5 = _mm_min_epi16(r5,r7); r6 = _mm_min_epi16(r6,r7); r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); r4 = _mm_and_si128(r4,r7); r7 = _mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280); r5 = _mm_shuffle_epi8(r5,r7); r7 = _mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002); r6 = _mm_shuffle_epi8(r6,r7); r4 = _mm_or_si128(r4,r5); r4 = _mm_or_si128(r4,r6); if (lastCol & 0x02) { r6 = _mm_load_si128(buffer+3); r4 = _mm_and_si128(r4,r6); r5 = _mm_lddqu_si128((__m128i *)(pDst+dstStep)); r6 = _mm_andnot_si128(r6,r5); r4 = _mm_or_si128(r4,r6); /* only thing is, we should shift [rbp-42] back here, because we have processed the last column, * and this "special condition" can be released */ lastCol >>= 1; } _mm_storeu_si128((__m128i *)(pDst+dstStep),r4); } /* after all we have to increase the destination- and Y-data pointer by four pixel */ pDst += 16; YData += 4; }
/*
 * Interleaved BGR -> JFIF YCbCr 4:4:4 conversion for 16-bit samples,
 * "naligned" variant: no alignment requirement on any pointer (all vector
 * memory accesses use unaligned loads/stores).
 *
 *   y, cb, cr : output planes, n samples each
 *   bgr       : interleaved input, 3*n samples in B,G,R order
 *   n         : number of pixels
 *
 * Fixed-point scheme: each coefficient below is round(c * 32768), so that
 * _mm_mulhi_epi16 computes (sample * c) >> 16.  Inputs are pre-shifted
 * left by 3 and results shifted right by 2, recovering the JFIF matrix
 * product plus the chroma offset (x_coff is likewise pre-shifted by 2).
 * NOTE(review): the exact output scaling depends on the mlib S16 pixel
 * range convention — confirm against the aligned variant of this routine.
 */
mlib_status
mlib_VideoColorBGR2JFIFYCC444_S16_naligned(
	mlib_s16 *y,
	mlib_s16 *cb,
	mlib_s16 *cr,
	const mlib_s16 *bgr,
	mlib_s32 n)
{
	/* 0.299*32768 */
	const __m128i x_c11 = _mm_set1_epi16(9798);
	/* 0.587*32768 */
	const __m128i x_c12 = _mm_set1_epi16(19235);
	/* 0.114*32768 */
	const __m128i x_c13 = _mm_set1_epi16(3735);
	/* -0.16874*32768 */
	const __m128i x_c21 = _mm_set1_epi16(-5529);
	/* -0.33126*32768 */
	const __m128i x_c22 = _mm_set1_epi16(-10855);
	/* 0.5*32768 */
	const __m128i x_c23 = _mm_set1_epi16(16384);
	/* 0.5*32768 */
	const __m128i x_c31 = x_c23;
	/* -0.41869*32768 */
	const __m128i x_c32 = _mm_set1_epi16(-13720);
	/* -0.08131*32768 */
	const __m128i x_c33 = _mm_set1_epi16(-2664);
	/* 2048 */
	/* chroma offset, pre-shifted by 2 to match the <<3 / >>2 scaling */
	const __m128i x_coff = _mm_set1_epi16(2048 << 2);
	const __m128i x_zero = _mm_setzero_si128();
	__m128i x_bgr0, x_bgr1, x_bgr2, x_r, x_g, x_b;
	__m128i x_y, x_cb, x_cr;
	__m128i x_t0, x_t1, x_t2, x_t3, x_t4, x_t5;
	__m128i *px_y, *px_cb, *px_cr, *px_bgr;
	mlib_d64 fr, fg, fb, fy, fcb, fcr;
	mlib_s32 i;

	px_y = (__m128i *)y;
	px_cb = (__m128i *)cb;
	px_cr = (__m128i *)cr;
	px_bgr = (__m128i *)bgr;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
	/* main loop: 8 pixels (3 x 8 = 24 s16 of interleaved BGR) per pass */
	for (i = 0; i <= (n - 8); i += 8) {
		x_bgr0 = _mm_loadu_si128(px_bgr++);
		x_bgr0 = _mm_slli_epi16(x_bgr0, 3);
		x_bgr1 = _mm_loadu_si128(px_bgr++);
		x_bgr1 = _mm_slli_epi16(x_bgr1, 3);
		x_bgr2 = _mm_loadu_si128(px_bgr++);
		x_bgr2 = _mm_slli_epi16(x_bgr2, 3);
		/* de-interleave into x_r / x_g / x_b (macro defined elsewhere) */
		SeparateBGR48_S16;

		/* y = 0.299*r + 0.587*g + 0.114*b */
		x_t0 = _mm_mulhi_epi16(x_r, x_c11);
		x_t1 = _mm_mulhi_epi16(x_g, x_c12);
		x_t2 = _mm_mulhi_epi16(x_b, x_c13);
		x_y = _mm_add_epi16(x_t0, x_t1);
		x_y = _mm_add_epi16(x_y, x_t2);

		/* cb = -0.16874*r - 0.33126*g + 0.5*b + offset */
		x_t0 = _mm_mulhi_epi16(x_r, x_c21);
		x_t1 = _mm_mulhi_epi16(x_g, x_c22);
		x_t2 = _mm_mulhi_epi16(x_b, x_c23);
		x_cb = _mm_add_epi16(x_t0, x_t1);
		x_cb = _mm_add_epi16(x_cb, x_coff);
		x_cb = _mm_add_epi16(x_cb, x_t2);

		/* cr = 0.5*r - 0.41869*g - 0.08131*b + offset */
		x_t0 = _mm_mulhi_epi16(x_r, x_c31);
		x_t1 = _mm_mulhi_epi16(x_g, x_c32);
		x_t2 = _mm_mulhi_epi16(x_b, x_c33);
		x_cr = _mm_add_epi16(x_t0, x_t1);
		x_cr = _mm_add_epi16(x_cr, x_coff);
		x_cr = _mm_add_epi16(x_cr, x_t2);

		/* save */
		x_y = _mm_srli_epi16(x_y, 2);
		x_cb = _mm_srli_epi16(x_cb, 2);
		x_cr = _mm_srli_epi16(x_cr, 2);
		_mm_storeu_si128(px_y++, x_y);
		_mm_storeu_si128(px_cb++, x_cb);
		_mm_storeu_si128(px_cr++, x_cr);
	}

	/* 4-pixel tail: one full vector plus a half vector of BGR input */
	if (i <= (n - 4)) {
		x_bgr0 = _mm_loadu_si128(px_bgr++);
		x_bgr0 = _mm_slli_epi16(x_bgr0, 3);
		x_bgr1 = _mm_loadl_epi64(px_bgr);
		x_bgr1 = _mm_slli_epi16(x_bgr1, 3);
		/* advance the byte pointer by half a vector (8 bytes) */
		px_bgr = (__m128i *)((__m64 *)px_bgr + 1);
		/* de-interleave 4 pixels into x_r / x_g / x_b (macro) */
		SeparateBGR24_S16;

		/* same matrix math as the main loop, on 4 pixels */
		x_t0 = _mm_mulhi_epi16(x_r, x_c11);
		x_t1 = _mm_mulhi_epi16(x_g, x_c12);
		x_t2 = _mm_mulhi_epi16(x_b, x_c13);
		x_y = _mm_add_epi16(x_t0, x_t1);
		x_y = _mm_add_epi16(x_y, x_t2);

		x_t0 = _mm_mulhi_epi16(x_r, x_c21);
		x_t1 = _mm_mulhi_epi16(x_g, x_c22);
		x_t2 = _mm_mulhi_epi16(x_b, x_c23);
		x_cb = _mm_add_epi16(x_t0, x_t1);
		x_cb = _mm_add_epi16(x_cb, x_coff);
		x_cb = _mm_add_epi16(x_cb, x_t2);

		x_t0 = _mm_mulhi_epi16(x_r, x_c31);
		x_t1 = _mm_mulhi_epi16(x_g, x_c32);
		x_t2 = _mm_mulhi_epi16(x_b, x_c33);
		x_cr = _mm_add_epi16(x_t0, x_t1);
		x_cr = _mm_add_epi16(x_cr, x_coff);
		x_cr = _mm_add_epi16(x_cr, x_t2);

		/* save */
		x_y = _mm_srli_epi16(x_y, 2);
		x_cb = _mm_srli_epi16(x_cb, 2);
		x_cr = _mm_srli_epi16(x_cr, 2);
		_mm_storel_epi64(px_y, x_y);
		px_y = (__m128i *)((__m64 *)px_y + 1);
		_mm_storel_epi64(px_cb, x_cb);
		px_cb = (__m128i *)((__m64 *)px_cb + 1);
		_mm_storel_epi64(px_cr, x_cr);
		px_cr = (__m128i *)((__m64 *)px_cr + 1);
		i += 4;
	}

	/* scalar tail for the remaining 0..3 pixels (double-precision math,
	 * truncating cast back to s16 — matches the vector path's scaling) */
	for (; i <= (n - 1); i++) {
		fb = bgr[3 * i];
		fg = bgr[3 * i + 1];
		fr = bgr[3 * i + 2];
		fy = 0.29900f * fr + 0.58700f * fg + 0.11400f * fb;
		fcb = -0.16874f * fr - 0.33126f * fg + 0.50000f * fb + 2048;
		fcr = 0.50000f * fr - 0.41869f * fg - 0.08131f * fb + 2048;
		y[i] = (mlib_s16)fy;
		cb[i] = (mlib_s16)fcb;
		cr[i] = (mlib_s16)fcr;
	}

	return (MLIB_SUCCESS);
}
//void xfft(int gatepow[NGATEAUTO][NFAUTO],char *obuf, char *ibuf, __m128i norm[NCHAN/2][FFTLEN/8],short int idelay[4],int nthread,double time0, double period)
/*
 * Channelize raw sampler data: apply per-channel integer delays and Walsh
 * sign flips, run a 16-bit real FFT per channel, normalize, then pack each
 * complex sample into a single byte (4-bit real in the low nibble, 4-bit
 * imaginary in the high nibble) and stream the result to obuf.
 *
 *   obuf      : packed 4+4-bit output, layout obuf[FFTREST][TIME][CHAN][FFTBLOCK]
 *   ibuf      : raw interleaved 8-bit input samples
 *   norm      : per-channel, per-bin normalization factors (Q15-style,
 *               applied via _mm_mulhi_epi16)
 *   idelay    : per-channel sample delays applied when loading ibuf
 *   nthread   : number of OpenMP threads to use
 *   walshsign : per-channel Walsh +/-1 signs, indexed by time chunk
 *
 * Parallelized over input chunks k with OpenMP; each thread owns one of the
 * four fftbuf scratch planes (indexed by omp_get_thread_num(), so nthread
 * must not exceed 4 — TODO confirm this invariant is enforced by callers).
 */
void xfft(char *obuf, char *ibuf, __m128i norm[NCHAN/2][FFTLEN/8],short int idelay[4],int nthread,char walshsign[NCHAN][NWALSH2])
{
	int nbuf,k;
	__declspec(align(128)) static short int fftbuf[4][NCHAN][FFTLEN]; // cache align this
	nbuf=NT/FFTLEN/NCHAN/2; // the last factor of two because half the data is sent off to quad cores
	omp_set_num_threads(nthread);
#pragma omp parallel for default(none) shared(obuf,ibuf,norm,nbuf,fftbuf,idelay,walshsign) schedule(dynamic,64)
	/* NOTE(review): k stops at nbuf-2, so the final chunk is never
	 * processed — presumably headroom for the idelay[] forward offsets
	 * into ibuf; confirm this is intentional. */
	for (k=0;k<nbuf-1;k++){
		int i,j,r32,i32,io,imp;
		short int i16,r16,igate,*ibuf16;
		register __m128i r0,r1,r2,r3,r4,r5,r6,r7;
		__m128i *fftbuf_sse;
#ifdef _OPENMP
		imp=omp_get_thread_num();
#else
		imp=0;
#endif
		/* we want fftbuf to stay in cache */
		for (j=0;j<NCHAN;j++) {
			/* gather: de-interleave channel j with its delay, apply Walsh sign */
			for(i=0;i<FFTLEN;i++) {
				char ctmp,ctmp1;
				ctmp=ibuf[(k*FFTLEN+(i+idelay[j]))*NCHAN+j];
				// ctmp1=(ctmp & 0b10111111) | (ctmp >> 1 & 0b0100000); // clip
				fftbuf[imp][j][i]=ctmp*walshsign[j][k/NTCHUNK];
			}
			/* in-place 16-bit FFT (IPP wrapper defined elsewhere) */
			fft1dippirc16(fftbuf[imp][j],fftbuf[imp][j],1);
			/* NOTE(review): short* assigned/loaded as __m128i* without a
			 * cast — relies on fftbuf's 128-byte alignment; compilers will
			 * warn here. */
			fftbuf_sse=fftbuf[imp][j];
			/* per-bin normalization: (sample * norm) >> 16 */
			for(i=0;i<FFTLEN/8;i++) fftbuf_sse[i]=_mm_mulhi_epi16(fftbuf_sse[i],norm[j][i]);
			r7=_mm_set1_epi8(0xf0); /* high-nibble mask for packing */
			for (i=0;i<FFTLEN/2;i+=FFTBLOCK){
#if 0
				/* scalar reference version of the packing below */
				for (io=0;io<FFTBLOCK;io++){ // we process 2 numbers at once.
					r32=fftbuf[imp][j][2*(i+io)];
					// r32=r32*norm[j][i+io];
					i32=fftbuf[imp][j][2*(i+io)+1];
					// i32*=norm[j][i+io];
					obuf[io+j*FFTBLOCK+k*FFTBLOCK*NCHAN+i*(NT/(FFTLEN)/2)]=(r32 >> 16)&0x0f | (i32 >> 12)&0xf0;
				}
#else
				for (io=0;io<FFTBLOCK;io+=2*8){ // we process 32 numbers at once.
					/* bits 5-8 are extracted(?) */
					r0=_mm_load_si128(&fftbuf[imp][j][2*(i+io)]);
					r1=_mm_load_si128(&fftbuf[imp][j][2*(i+io)+8]);
					r2=_mm_load_si128(&fftbuf[imp][j][2*(i+io)+16]);
					r3=_mm_load_si128(&fftbuf[imp][j][2*(i+io)+24]);
					// squeeze four 16-bit ints into 4-bit ints
#define MMSHUF _MM_SHUFFLE(3,1,2,0)
					// little endian, swap i1 r1 i0 r0 -> i1 i0 r1 r0
					r5=_mm_shufflehi_epi16(r0,MMSHUF);
					r6=_mm_shufflelo_epi16(r5,MMSHUF);
					r0=_mm_shuffle_epi32(r6,MMSHUF); // i3 i2 r3 r2 i1 i0 r1 r0 -> i3210 r3210
					r5=_mm_shufflehi_epi16(r1,MMSHUF);
					r6=_mm_shufflelo_epi16(r5,MMSHUF);
					r1=_mm_shuffle_epi32(r6,MMSHUF);
					r5=_mm_unpacklo_epi64(r0,r1); // r0=i3210r3210, r1=i7654r7654 -> r5=r76543210
					r6=_mm_unpackhi_epi64(r0,r1); // r6=i76543210
					r0=r5;
					r1=r6;
					// now for the second set
					r5=_mm_shufflehi_epi16(r2,MMSHUF);
					r6=_mm_shufflelo_epi16(r5,MMSHUF);
					r2=_mm_shuffle_epi32(r6,MMSHUF);
					r5=_mm_shufflehi_epi16(r3,MMSHUF);
					r6=_mm_shufflelo_epi16(r5,MMSHUF);
					r3=_mm_shuffle_epi32(r6,MMSHUF);
					r5=_mm_unpacklo_epi64(r2,r3);
					r6=_mm_unpackhi_epi64(r2,r3);
					r2=r5; // r5 is the real part
					r3=r6;
					/* this part reduces the number of bits to LSB with saturate */
					r5=_mm_packs_epi16(r0,r2); // r5=rFEDCBA9876543210, saturate
					r0=_mm_srli_epi16(r5,4); // in little-endian, real into LSB
					// modified next few lines to just store MSB's.
					r0=_mm_andnot_si128(r7,r0);//zero 4 MSB
					r6=_mm_packs_epi16(r1,r3); // imaginary
					r1=_mm_and_si128(r6,r7);
					/* merge: real nibble low, imaginary nibble high */
					r2=_mm_or_si128(r0,r1);
					/* write without polluting caches */
					_mm_stream_si128(&obuf[io+j*FFTBLOCK+k*FFTBLOCK*NCHAN+i*(NT/FFTLEN/2)],r2);
					/* the outgoing structure is obuf[FFTREST][TIME][CHAN][FFTBLOCK].
					   The BLOCK is cache friendly, the FFTREST is the MPI transpose order,
					   and we need all channels locally for the correlation. */
				}
#endif
				// prefetch obuf non-persistent
			}
		}
	}
}
/* Fast 4x4 quantizer (no zero-bin): computes d->qcoeff and d->dqcoeff from
 * b->coeff using the round/quant_fast/dequant tables, and writes the end-of-
 * block index (highest nonzero position in zig-zag order, as a 1-based
 * count) to *d->eob.  All tables and coefficient buffers must be 16-byte
 * aligned. */
void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d) {
  /* Load the 16 coefficients and the per-position tables, low/high halves. */
  const __m128i raw_lo = _mm_load_si128((__m128i *)(b->coeff));
  const __m128i raw_hi = _mm_load_si128((__m128i *)(b->coeff + 8));
  const __m128i rnd_lo = _mm_load_si128((__m128i *)(b->round));
  const __m128i rnd_hi = _mm_load_si128((__m128i *)(b->round + 8));
  const __m128i qf_lo = _mm_load_si128((__m128i *)(b->quant_fast));
  const __m128i qf_hi = _mm_load_si128((__m128i *)(b->quant_fast + 8));
  const __m128i deq_lo = _mm_load_si128((__m128i *)(d->dequant));
  const __m128i deq_hi = _mm_load_si128((__m128i *)(d->dequant + 8));
  const __m128i zig_lo =
      _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag));
  const __m128i zig_hi =
      _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag + 8));
  const __m128i zero = _mm_setzero_si128();
  __m128i sign_lo, sign_hi, q_lo, q_hi, mask_lo, mask_hi, all_ones;

  /* Broadcast each coefficient's sign into a full-lane mask (z >> 15). */
  sign_lo = _mm_srai_epi16(raw_lo, 15);
  sign_hi = _mm_srai_epi16(raw_hi, 15);

  /* abs(z) via the (z ^ sign) - sign identity, then add the rounding bias. */
  q_lo = _mm_sub_epi16(_mm_xor_si128(raw_lo, sign_lo), sign_lo);
  q_hi = _mm_sub_epi16(_mm_xor_si128(raw_hi, sign_hi), sign_hi);
  q_lo = _mm_add_epi16(q_lo, rnd_lo);
  q_hi = _mm_add_epi16(q_hi, rnd_hi);

  /* Quantize: high 16 bits of the 32-bit product (x * quant) >> 16. */
  q_lo = _mm_mulhi_epi16(q_lo, qf_lo);
  q_hi = _mm_mulhi_epi16(q_hi, qf_hi);

  /* Restore the original signs with the same xor/sub identity. */
  q_lo = _mm_sub_epi16(_mm_xor_si128(q_lo, sign_lo), sign_lo);
  q_hi = _mm_sub_epi16(_mm_xor_si128(q_hi, sign_hi), sign_hi);

  /* Emit quantized coefficients. */
  _mm_store_si128((__m128i *)(d->qcoeff), q_lo);
  _mm_store_si128((__m128i *)(d->qcoeff + 8), q_hi);

  /* Emit dequantized coefficients (low 16 bits suffice here). */
  _mm_store_si128((__m128i *)(d->dqcoeff), _mm_mullo_epi16(q_lo, deq_lo));
  _mm_store_si128((__m128i *)(d->dqcoeff + 8), _mm_mullo_epi16(q_hi, deq_hi));

  /* Build a nonzero mask and select each surviving coefficient's 1-based
   * zig-zag position. */
  all_ones = _mm_cmpeq_epi16(zero, zero);
  mask_lo = _mm_xor_si128(_mm_cmpeq_epi16(q_lo, zero), all_ones);
  mask_hi = _mm_xor_si128(_mm_cmpeq_epi16(q_hi, zero), all_ones);
  mask_lo = _mm_and_si128(mask_lo, zig_lo);
  mask_hi = _mm_and_si128(mask_hi, zig_hi);

  /* Horizontal max reduction: 16 -> 8 -> 4 -> 2 -> 1 lanes. */
  mask_lo = _mm_max_epi16(mask_lo, mask_hi);
  mask_hi = _mm_shuffle_epi32(mask_lo, 0xE);  // 0b00001110
  mask_lo = _mm_max_epi16(mask_lo, mask_hi);
  mask_hi = _mm_shufflelo_epi16(mask_lo, 0xE);  // 0b00001110
  mask_lo = _mm_max_epi16(mask_lo, mask_hi);
  mask_hi = _mm_shufflelo_epi16(mask_lo, 0x1);  // 0b00000001
  mask_lo = _mm_max_epi16(mask_lo, mask_hi);

  /* The low lane now holds the maximum zig-zag position = eob. */
  *d->eob = 0xFF & _mm_cvtsi128_si32(mask_lo);
}
/* Full block quantizer with zero-bin: for each coefficient, skip it if its
 * magnitude is inside the (DC/AC) zbin, otherwise quantize via
 * round/quant/quant_shift and dequantize via dequant, writing qcoeff_ptr and
 * dqcoeff_ptr and reporting the end-of-block count through *eob_ptr.
 * Processes 16 coefficients per iteration; all table pointers carry the DC
 * value in lane 0 and the AC value in lane 1 (switched over with
 * _mm_unpackhi_epi64 after the first use).  n_coeffs must be a multiple of
 * 16; buffers must be 16-byte aligned.  scan_ptr is unused. */
void vpx_quantize_b_sse2(const tran_low_t* coeff_ptr, intptr_t n_coeffs,
                         int skip_block, const int16_t* zbin_ptr,
                         const int16_t* round_ptr, const int16_t* quant_ptr,
                         const int16_t* quant_shift_ptr, tran_low_t* qcoeff_ptr,
                         tran_low_t* dqcoeff_ptr, const int16_t* dequant_ptr,
                         uint16_t* eob_ptr, const int16_t* scan_ptr,
                         const int16_t* iscan_ptr) {
  __m128i zero;
  (void)scan_ptr;

  /* Bias the pointers to the block end and index with a negative offset so
   * the loop condition is a simple sign test. */
  coeff_ptr += n_coeffs;
  iscan_ptr += n_coeffs;
  qcoeff_ptr += n_coeffs;
  dqcoeff_ptr += n_coeffs;
  n_coeffs = -n_coeffs;
  zero = _mm_setzero_si128();

  if (!skip_block) {
    __m128i eob;
    __m128i zbin;
    __m128i round, quant, dequant, shift;
    {
      __m128i coeff0, coeff1;

      // Setup global values
      {
        __m128i pw_1;
        zbin = _mm_load_si128((const __m128i*)zbin_ptr);
        round = _mm_load_si128((const __m128i*)round_ptr);
        quant = _mm_load_si128((const __m128i*)quant_ptr);
        pw_1 = _mm_set1_epi16(1);
        /* subtract 1 so a strict cmpgt implements "magnitude >= zbin" */
        zbin = _mm_sub_epi16(zbin, pw_1);
        dequant = _mm_load_si128((const __m128i*)dequant_ptr);
        shift = _mm_load_si128((const __m128i*)quant_shift_ptr);
      }

      {
        __m128i coeff0_sign, coeff1_sign;
        __m128i qcoeff0, qcoeff1;
        __m128i qtmp0, qtmp1;
        __m128i cmp_mask0, cmp_mask1;
        // Do DC and first 15 AC
        coeff0 = load_coefficients(coeff_ptr + n_coeffs);
        coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8);

        // Poor man's sign extract
        coeff0_sign = _mm_srai_epi16(coeff0, 15);
        coeff1_sign = _mm_srai_epi16(coeff1, 15);
        /* abs(coeff) = (coeff ^ sign) - sign */
        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        /* zero-bin test; DC lane uses the DC zbin, then switch to AC */
        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
        zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);

        /* quantize: ((abs + round) * quant >> 16 + abs + round) * shift >> 16 */
        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
        round = _mm_unpackhi_epi64(round, round);
        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
        quant = _mm_unpackhi_epi64(quant, quant);
        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
        shift = _mm_unpackhi_epi64(shift, shift);
        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);

        // Reinsert signs
        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        // Mask out zbin threshold coeffs
        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

        store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs);
        store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8);

        /* dequantize; DC lane uses the DC dequant, then switch to AC */
        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
        dequant = _mm_unpackhi_epi64(dequant, dequant);
        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

        store_coefficients(coeff0, dqcoeff_ptr + n_coeffs);
        store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8);
      }

      {
        // Scan for eob
        __m128i zero_coeff0, zero_coeff1;
        __m128i nzero_coeff0, nzero_coeff1;
        __m128i iscan0, iscan1;
        __m128i eob1;
        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
        // Add one to convert from indices to counts
        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
        eob = _mm_and_si128(iscan0, nzero_coeff0);
        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
        eob = _mm_max_epi16(eob, eob1);
      }
      n_coeffs += 8 * 2;
    }

    // AC only loop
    while (n_coeffs < 0) {
      __m128i coeff0, coeff1;
      {
        __m128i coeff0_sign, coeff1_sign;
        __m128i qcoeff0, qcoeff1;
        __m128i qtmp0, qtmp1;
        __m128i cmp_mask0, cmp_mask1;

        coeff0 = load_coefficients(coeff_ptr + n_coeffs);
        coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8);

        // Poor man's sign extract
        coeff0_sign = _mm_srai_epi16(coeff0, 15);
        coeff1_sign = _mm_srai_epi16(coeff1, 15);
        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        /* all lanes now use the AC table values (switched above) */
        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);

        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);

        // Reinsert signs
        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        // Mask out zbin threshold coeffs
        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

        store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs);
        store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8);

        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

        store_coefficients(coeff0, dqcoeff_ptr + n_coeffs);
        store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8);
      }

      {
        // Scan for eob
        __m128i zero_coeff0, zero_coeff1;
        __m128i nzero_coeff0, nzero_coeff1;
        __m128i iscan0, iscan1;
        __m128i eob0, eob1;
        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
        // Add one to convert from indices to counts
        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
        eob0 = _mm_max_epi16(eob0, eob1);
        eob = _mm_max_epi16(eob, eob0);
      }
      n_coeffs += 8 * 2;
    }

    // Accumulate EOB
    /* horizontal max over the 8 lanes of eob: 8 -> 4 -> 2 -> 1 */
    {
      __m128i eob_shuffled;
      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
      eob = _mm_max_epi16(eob, eob_shuffled);
      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
      eob = _mm_max_epi16(eob, eob_shuffled);
      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
      eob = _mm_max_epi16(eob, eob_shuffled);
      *eob_ptr = _mm_extract_epi16(eob, 1);
    }
  } else {
    /* skip_block: zero all outputs and report an empty block */
    do {
      store_coefficients(zero, dqcoeff_ptr + n_coeffs);
      store_coefficients(zero, dqcoeff_ptr + n_coeffs + 8);
      store_coefficients(zero, qcoeff_ptr + n_coeffs);
      store_coefficients(zero, qcoeff_ptr + n_coeffs + 8);
      n_coeffs += 8 * 2;
    } while (n_coeffs < 0);
    *eob_ptr = 0;
  }
}
/* The encoded YCbCr coefficients are represented as 11.5 fixed-point
 * numbers.  See the general code above.
 *
 * Converts three planar 16-bit R/G/B source planes (pSrc[0..2], byte
 * stride srcStep) into three planar Y/Cb/Cr destination planes
 * (pDst[0..2], byte stride dstStep) over roi.  Falls back to the general
 * C implementation when 16-byte alignment or stride/width constraints
 * cannot be met.  Returns PRIMITIVES_SUCCESS (or the fallback's status).
 *
 * BUGFIX(review): the inner loop previously loaded the red samples from
 * y_buf (the *destination* Y plane) instead of r_buf, and the per-row
 * pointer bumps were swapped (source planes advanced by dstbump and
 * destination planes by srcbump).  That only produced correct results
 * when pSrc == pDst and srcStep == dstStep; both are fixed below, which
 * also matches the prefetch pass that correctly treats r/g/b as source.
 */
PRIM_STATIC pstatus_t sse2_RGBToYCbCr_16s16s_P3P3(
	const INT16 *pSrc[3],  int srcStep,
	INT16 *pDst[3],  int dstStep,
	const prim_size_t *roi)	/* region of interest */
{
	__m128i min, max, y_r, y_g, y_b, cb_r, cb_g, cb_b, cr_r, cr_g, cr_b;
	__m128i *r_buf, *g_buf, *b_buf, *y_buf, *cb_buf, *cr_buf;
	int srcbump, dstbump, yp, imax;

	if (((ULONG_PTR) (pSrc[0]) & 0x0f)
			|| ((ULONG_PTR) (pSrc[1]) & 0x0f)
			|| ((ULONG_PTR) (pSrc[2]) & 0x0f)
			|| ((ULONG_PTR) (pDst[0]) & 0x0f)
			|| ((ULONG_PTR) (pDst[1]) & 0x0f)
			|| ((ULONG_PTR) (pDst[2]) & 0x0f)
			|| (roi->width & 0x07)
			|| (srcStep & 127)
			|| (dstStep & 127))
	{
		/* We can't maintain 16-byte alignment. */
		return general_RGBToYCbCr_16s16s_P3P3(pSrc, srcStep,
			pDst, dstStep, roi);
	}

	min = _mm_set1_epi16(-128 << 5);
	max = _mm_set1_epi16(127 << 5);

	/* source planes */
	r_buf  = (__m128i*) (pSrc[0]);
	g_buf  = (__m128i*) (pSrc[1]);
	b_buf  = (__m128i*) (pSrc[2]);
	/* destination planes */
	y_buf  = (__m128i*) (pDst[0]);
	cb_buf = (__m128i*) (pDst[1]);
	cr_buf = (__m128i*) (pDst[2]);

	/* BT.601 coefficients scaled by 2^15 for _mm_mulhi_epi16 */
	y_r  = _mm_set1_epi16(9798);   /*  0.299000 << 15 */
	y_g  = _mm_set1_epi16(19235);  /*  0.587000 << 15 */
	y_b  = _mm_set1_epi16(3735);   /*  0.114000 << 15 */
	cb_r = _mm_set1_epi16(-5535);  /* -0.168935 << 15 */
	cb_g = _mm_set1_epi16(-10868); /* -0.331665 << 15 */
	cb_b = _mm_set1_epi16(16403);  /*  0.500590 << 15 */
	cr_r = _mm_set1_epi16(16377);  /*  0.499813 << 15 */
	cr_g = _mm_set1_epi16(-13714); /* -0.418531 << 15 */
	cr_b = _mm_set1_epi16(-2663);  /* -0.081282 << 15 */

	srcbump = srcStep / sizeof(__m128i);
	dstbump = dstStep / sizeof(__m128i);

#ifdef DO_PREFETCH
	/* Prefetch RGB's. */
	for (yp=0; yp<roi->height; yp++)
	{
		int i;
		for (i=0; i<roi->width * sizeof(INT16) / sizeof(__m128i);
			i += (CACHE_LINE_BYTES / sizeof(__m128i)))
		{
			_mm_prefetch((char*)(&r_buf[i]), _MM_HINT_NTA);
			_mm_prefetch((char*)(&g_buf[i]), _MM_HINT_NTA);
			_mm_prefetch((char*)(&b_buf[i]), _MM_HINT_NTA);
		}
		r_buf += srcbump;
		g_buf += srcbump;
		b_buf += srcbump;
	}
	/* rewind the source pointers for the conversion pass */
	r_buf = (__m128i*) (pSrc[0]);
	g_buf = (__m128i*) (pSrc[1]);
	b_buf = (__m128i*) (pSrc[2]);
#endif /* DO_PREFETCH */

	imax = roi->width * sizeof(INT16) / sizeof(__m128i);
	for (yp=0; yp<roi->height; ++yp)
	{
		int i;
		for (i=0; i<imax; i++)
		{
			/* In order to use SSE2 signed 16-bit integer multiplication we
			 * need to convert the floating point factors to signed int
			 * without losing information.  The result of this multiplication
			 * is 32 bit and using SSE2 we get either the product's hi or lo
			 * word.  Thus we will multiply the factors by the highest
			 * possible 2^n and take the upper 16 bits of the signed 32-bit
			 * result (_mm_mulhi_epi16).  Since the final result needs to
			 * be scaled by << 5 and also in order to keep the precision
			 * within the upper 16 bits we will also have to scale the RGB
			 * values used in the multiplication by << 5+(16-n).
			 */
			__m128i r, g, b, y, cb, cr;
			r = _mm_load_si128(r_buf+i);	/* was y_buf+i: read source red */
			g = _mm_load_si128(g_buf+i);
			b = _mm_load_si128(b_buf+i);
			/* r<<6; g<<6; b<<6 */
			r = _mm_slli_epi16(r, 6);
			g = _mm_slli_epi16(g, 6);
			b = _mm_slli_epi16(b, 6);
			/* y = HIWORD(r*y_r) + HIWORD(g*y_g) + HIWORD(b*y_b) + min */
			y = _mm_mulhi_epi16(r, y_r);
			y = _mm_add_epi16(y, _mm_mulhi_epi16(g, y_g));
			y = _mm_add_epi16(y, _mm_mulhi_epi16(b, y_b));
			y = _mm_add_epi16(y, min);
			/* y = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
			_mm_between_epi16(y, min, max);
			_mm_store_si128(y_buf+i, y);
			/* cb = HIWORD(r*cb_r) + HIWORD(g*cb_g) + HIWORD(b*cb_b) */
			cb = _mm_mulhi_epi16(r, cb_r);
			cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g));
			cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b));
			/* cb = MINMAX(cb, (-128 << 5), (127 << 5)); */
			_mm_between_epi16(cb, min, max);
			_mm_store_si128(cb_buf+i, cb);
			/* cr = HIWORD(r*cr_r) + HIWORD(g*cr_g) + HIWORD(b*cr_b) */
			cr = _mm_mulhi_epi16(r, cr_r);
			cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g));
			cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b));
			/* cr = MINMAX(cr, (-128 << 5), (127 << 5)); */
			_mm_between_epi16(cr, min, max);
			_mm_store_si128(cr_buf+i, cr);
		}
		/* advance source planes by the source stride, destination planes
		 * by the destination stride (previously swapped) */
		r_buf  += srcbump;
		g_buf  += srcbump;
		b_buf  += srcbump;
		y_buf  += dstbump;
		cb_buf += dstbump;
		cr_buf += dstbump;
	}
	return PRIMITIVES_SUCCESS;
}
/* Fast-path (rate-distortion "fp") quantizer: like vpx_quantize_b but with
 * no zero-bin and no quant_shift stage — each coefficient is rounded,
 * multiplied by quant (high 16 bits), re-signed, and dequantized.  After the
 * first 16 coefficients, whole 16-lane groups whose magnitudes are all
 * <= dequant/2 are skipped and written as zero (the nzflag early-out).
 * Outputs qcoeff_ptr/dqcoeff_ptr and the end-of-block count via *eob_ptr.
 * Tables carry DC in lane 0, AC in lane 1 (switched with
 * _mm_unpackhi_epi64).  zbin_ptr/quant_shift_ptr/scan_ptr are unused. */
void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                          int skip_block, const int16_t *zbin_ptr,
                          const int16_t *round_ptr, const int16_t *quant_ptr,
                          const int16_t *quant_shift_ptr,
                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
                          const int16_t *scan_ptr, const int16_t *iscan_ptr) {
  __m128i zero;
  __m128i thr;
  int16_t nzflag;
  (void)scan_ptr;
  (void)zbin_ptr;
  (void)quant_shift_ptr;

  /* Bias pointers to the block end and index with a negative offset so the
   * loop condition is a simple sign test. */
  coeff_ptr += n_coeffs;
  iscan_ptr += n_coeffs;
  qcoeff_ptr += n_coeffs;
  dqcoeff_ptr += n_coeffs;
  n_coeffs = -n_coeffs;
  zero = _mm_setzero_si128();

  if (!skip_block) {
    __m128i eob;
    __m128i round, quant, dequant;
    {
      __m128i coeff0, coeff1;

      // Setup global values
      {
        round = _mm_load_si128((const __m128i *)round_ptr);
        quant = _mm_load_si128((const __m128i *)quant_ptr);
        dequant = _mm_load_si128((const __m128i *)dequant_ptr);
      }

      {
        __m128i coeff0_sign, coeff1_sign;
        __m128i qcoeff0, qcoeff1;
        __m128i qtmp0, qtmp1;
        // Do DC and first 15 AC
        coeff0 = load_tran_low(coeff_ptr + n_coeffs);
        coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);

        // Poor man's sign extract
        coeff0_sign = _mm_srai_epi16(coeff0, 15);
        coeff1_sign = _mm_srai_epi16(coeff1, 15);
        /* abs(coeff) = (coeff ^ sign) - sign */
        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        /* quantize: (abs + round) * quant >> 16; DC lane first, then AC */
        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
        round = _mm_unpackhi_epi64(round, round);
        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
        quant = _mm_unpackhi_epi64(quant, quant);
        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);

        // Reinsert signs
        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
        store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);

        /* dequantize; DC lane first, then switch dequant to AC */
        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
        dequant = _mm_unpackhi_epi64(dequant, dequant);
        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

        store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
        store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
      }

      {
        // Scan for eob
        __m128i zero_coeff0, zero_coeff1;
        __m128i nzero_coeff0, nzero_coeff1;
        __m128i iscan0, iscan1;
        __m128i eob1;
        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
        // Add one to convert from indices to counts
        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
        eob = _mm_and_si128(iscan0, nzero_coeff0);
        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
        eob = _mm_max_epi16(eob, eob1);
      }
      n_coeffs += 8 * 2;
    }

    /* any |coeff| <= dequant/2 (AC) quantizes to zero — skip threshold */
    thr = _mm_srai_epi16(dequant, 1);

    // AC only loop
    while (n_coeffs < 0) {
      __m128i coeff0, coeff1;
      {
        __m128i coeff0_sign, coeff1_sign;
        __m128i qcoeff0, qcoeff1;
        __m128i qtmp0, qtmp1;

        coeff0 = load_tran_low(coeff_ptr + n_coeffs);
        coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);

        // Poor man's sign extract
        coeff0_sign = _mm_srai_epi16(coeff0, 15);
        coeff1_sign = _mm_srai_epi16(coeff1, 15);
        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        /* nonzero if any magnitude exceeds the skip threshold */
        nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
                 _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));

        if (nzflag) {
          qcoeff0 = _mm_adds_epi16(qcoeff0, round);
          qcoeff1 = _mm_adds_epi16(qcoeff1, round);
          qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
          qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);

          // Reinsert signs
          qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
          qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
          qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
          qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

          store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
          store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);

          coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
          coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

          store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
          store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
        } else {
          /* whole group quantizes to zero — write zeros, skip eob scan */
          store_zero_tran_low(qcoeff_ptr + n_coeffs);
          store_zero_tran_low(qcoeff_ptr + n_coeffs + 8);

          store_zero_tran_low(dqcoeff_ptr + n_coeffs);
          store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8);
        }
      }

      if (nzflag) {
        // Scan for eob
        __m128i zero_coeff0, zero_coeff1;
        __m128i nzero_coeff0, nzero_coeff1;
        __m128i iscan0, iscan1;
        __m128i eob0, eob1;
        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
        // Add one to convert from indices to counts
        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
        eob0 = _mm_max_epi16(eob0, eob1);
        eob = _mm_max_epi16(eob, eob0);
      }
      n_coeffs += 8 * 2;
    }

    // Accumulate EOB
    /* horizontal max over the 8 lanes of eob: 8 -> 4 -> 2 -> 1 */
    {
      __m128i eob_shuffled;
      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
      eob = _mm_max_epi16(eob, eob_shuffled);
      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
      eob = _mm_max_epi16(eob, eob_shuffled);
      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
      eob = _mm_max_epi16(eob, eob_shuffled);
      *eob_ptr = _mm_extract_epi16(eob, 1);
    }
  } else {
    /* skip_block: zero all outputs and report an empty block */
    do {
      store_zero_tran_low(qcoeff_ptr + n_coeffs);
      store_zero_tran_low(qcoeff_ptr + n_coeffs + 8);

      store_zero_tran_low(dqcoeff_ptr + n_coeffs);
      store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8);
      n_coeffs += 8 * 2;
    } while (n_coeffs < 0);
    *eob_ptr = 0;
  }
}
// Channel compensation for ULSCH (LTE uplink) reception.
//
// For each RX antenna, multiplies the extracted received frequency-domain
// samples by the conjugate of the extracted channel estimates
// (rxdataF_comp = rxdataF_ext * conj(ul_ch_estimates_ext), right-shifted by
// output_shift and saturated to 16 bits), and — for Qm > 2 — also computes
// scaled |h|^2-based magnitude vectors used later by the soft demodulator.
// All buffers hold interleaved 16-bit I/Q pairs; processing is done
// 4 complex samples per __m128i, 3 vectors (12 subcarriers = 1 RB) per
// inner-loop iteration.
//
// rxdataF_ext         : [antenna][sample] received samples for this symbol
// ul_ch_estimates_ext : [antenna][sample] channel estimates for this symbol
// ul_ch_mag           : output, first magnitude threshold (16QAM/64QAM)
// ul_ch_magb          : output, second magnitude threshold (64QAM)
// rxdataF_comp        : output, channel-compensated samples
// frame_parms         : frame configuration (nb_antennas_rx, N_RB_DL)
// symbol              : OFDM symbol index used to offset into each buffer
// Qm                  : modulation order (2=QPSK, 4=16QAM, 6=64QAM)
// nb_rb               : number of resource blocks to process
// output_shift        : arithmetic right shift applied to 32-bit products
//
// NOTE(review): uses file-scope SIMD temporaries (mmtmpU0..mmtmpU3,
// QAM_amp128U, QAM_amp128bU, zeroU) and the 'conjugate' constant declared
// elsewhere in this file, so the function is not reentrant — confirm callers
// never run it concurrently.
void ulsch_channel_compensation(int **rxdataF_ext,
                                int **ul_ch_estimates_ext,
                                int **ul_ch_mag,
                                int **ul_ch_magb,
                                int **rxdataF_comp,
                                LTE_DL_FRAME_PARMS *frame_parms,
                                unsigned char symbol,
                                unsigned char Qm,
                                unsigned short nb_rb,
                                unsigned char output_shift) {

  unsigned short rb;
  __m128i *ul_ch128,*ul_ch_mag128,*ul_ch_mag128b,*rxdataF128,*rxdataF_comp128;
  unsigned char aarx;//,symbol_mod;

  //  symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol;

#ifndef __SSE3__
  zeroU = _mm_xor_si128(zeroU,zeroU);
#endif

  //    printf("comp: symbol %d\n",symbol);

  // Pre-load the per-modulation amplitude constant(s) used to scale |h|^2
  // into demodulation thresholds (QPSK needs none).
  if (Qm == 4)
    QAM_amp128U = _mm_set1_epi16(QAM16_n1);
  else if (Qm == 6) {
    QAM_amp128U = _mm_set1_epi16(QAM64_n1);
    QAM_amp128bU = _mm_set1_epi16(QAM64_n2);
  }

  for (aarx=0;aarx<frame_parms->nb_antennas_rx;aarx++) {

    // Offset each per-antenna row to this symbol; rows are indexed in units
    // of one complex 16-bit sample (one int), 12 subcarriers per RB.
    ul_ch128 = (__m128i *)&ul_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12];
    ul_ch_mag128 = (__m128i *)&ul_ch_mag[aarx][symbol*frame_parms->N_RB_DL*12];
    ul_ch_mag128b = (__m128i *)&ul_ch_magb[aarx][symbol*frame_parms->N_RB_DL*12];
    rxdataF128 = (__m128i *)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12];
    rxdataF_comp128 = (__m128i *)&rxdataF_comp[aarx][symbol*frame_parms->N_RB_DL*12];

    for (rb=0;rb<nb_rb;rb++) {
      //      printf("comp: symbol %d rb %d\n",symbol,rb);
#ifdef OFDMA_ULSCH
      if (Qm>2) {
        // get channel amplitude if not QPSK:
        // |h|^2 = madd(h,h) per complex sample, shifted, packed to 16 bits,
        // then duplicated into both I and Q lanes via unpack(x,x).
        mmtmpU0 = _mm_madd_epi16(ul_ch128[0],ul_ch128[0]);
        mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift);
        mmtmpU1 = _mm_madd_epi16(ul_ch128[1],ul_ch128[1]);
        mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift);
        mmtmpU0 = _mm_packs_epi32(mmtmpU0,mmtmpU1);

        ul_ch_mag128[0] = _mm_unpacklo_epi16(mmtmpU0,mmtmpU0);
        // Keep the raw |h|^2 copy for the second (64QAM) threshold before
        // scaling the first one.
        ul_ch_mag128b[0] = ul_ch_mag128[0];
        ul_ch_mag128[0] = _mm_mulhi_epi16(ul_ch_mag128[0],QAM_amp128U);
        ul_ch_mag128[0] = _mm_slli_epi16(ul_ch_mag128[0],2); // 2 to compensate the scale channel estimate

        ul_ch_mag128[1] = _mm_unpackhi_epi16(mmtmpU0,mmtmpU0);
        ul_ch_mag128b[1] = ul_ch_mag128[1];
        ul_ch_mag128[1] = _mm_mulhi_epi16(ul_ch_mag128[1],QAM_amp128U);
        ul_ch_mag128[1] = _mm_slli_epi16(ul_ch_mag128[1],2); // 2 to compensate the scale channel estimate

        mmtmpU0 = _mm_madd_epi16(ul_ch128[2],ul_ch128[2]);
        mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift);
        mmtmpU1 = _mm_packs_epi32(mmtmpU0,mmtmpU0);

        ul_ch_mag128[2] = _mm_unpacklo_epi16(mmtmpU1,mmtmpU1);
        ul_ch_mag128b[2] = ul_ch_mag128[2];
        ul_ch_mag128[2] = _mm_mulhi_epi16(ul_ch_mag128[2],QAM_amp128U);
        ul_ch_mag128[2] = _mm_slli_epi16(ul_ch_mag128[2],2); // 2 to compensate the scale channel estimate

        // Second threshold (scaled by QAM_amp128bU) from the saved copies.
        ul_ch_mag128b[0] = _mm_mulhi_epi16(ul_ch_mag128b[0],QAM_amp128bU);
        ul_ch_mag128b[0] = _mm_slli_epi16(ul_ch_mag128b[0],2); // 2 to compensate the scale channel estimate

        ul_ch_mag128b[1] = _mm_mulhi_epi16(ul_ch_mag128b[1],QAM_amp128bU);
        ul_ch_mag128b[1] = _mm_slli_epi16(ul_ch_mag128b[1],2); // 2 to compensate the scale channel estimate

        ul_ch_mag128b[2] = _mm_mulhi_epi16(ul_ch_mag128b[2],QAM_amp128bU);
        ul_ch_mag128b[2] = _mm_slli_epi16(ul_ch_mag128b[2],2);// 2 to compensate the scale channel estimate
      }
#else
      // Non-OFDMA build: unconditional, unscaled magnitude computation,
      // with one bit less right-shift (output_shift-1).
      mmtmpU0 = _mm_madd_epi16(ul_ch128[0],ul_ch128[0]);
      mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift-1);
      mmtmpU1 = _mm_madd_epi16(ul_ch128[1],ul_ch128[1]);
      mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift-1);
      mmtmpU0 = _mm_packs_epi32(mmtmpU0,mmtmpU1);

      ul_ch_mag128[0] = _mm_unpacklo_epi16(mmtmpU0,mmtmpU0);
      ul_ch_mag128[1] = _mm_unpackhi_epi16(mmtmpU0,mmtmpU0);

      mmtmpU0 = _mm_madd_epi16(ul_ch128[2],ul_ch128[2]);
      mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift-1);
      mmtmpU1 = _mm_packs_epi32(mmtmpU0,mmtmpU0);
      ul_ch_mag128[2] = _mm_unpacklo_epi16(mmtmpU1,mmtmpU1);
      //      printf("comp: symbol %d rb %d => %d,%d,%d\n",symbol,rb,*((short*)&ul_ch_mag128[0]),*((short*)&ul_ch_mag128[1]),*((short*)&ul_ch_mag128[2]));
#endif

      // multiply by conjugated channel:
      // re = madd(h, y); im = madd(swap(h) with Q negated, y); then shift,
      // interleave re/im back into complex order and saturate to 16 bits.
      mmtmpU0 = _mm_madd_epi16(ul_ch128[0],rxdataF128[0]);
      //  print_ints("re",&mmtmpU0);
      // mmtmpU0 contains real part of 4 consecutive outputs (32-bit)
      mmtmpU1 = _mm_shufflelo_epi16(ul_ch128[0],_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)&conjugate[0]);
      //  print_ints("im",&mmtmpU1);
      mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[0]);
      // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit)
      mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift);
      //  print_ints("re(shift)",&mmtmpU0);
      mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift);
      //  print_ints("im(shift)",&mmtmpU1);
      mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1);
      mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1);
      //        print_ints("c0",&mmtmpU2);
      //  print_ints("c1",&mmtmpU3);
      rxdataF_comp128[0] = _mm_packs_epi32(mmtmpU2,mmtmpU3);
      //        print_shorts("rx:",rxdataF128[0]);
      //        print_shorts("ch:",ul_ch128[0]);
      //        print_shorts("pack:",rxdataF_comp128[0]);

      // multiply by conjugated channel (second vector of the RB)
      mmtmpU0 = _mm_madd_epi16(ul_ch128[1],rxdataF128[1]);
      // mmtmpU0 contains real part of 4 consecutive outputs (32-bit)
      mmtmpU1 = _mm_shufflelo_epi16(ul_ch128[1],_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate);
      mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[1]);
      // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit)
      mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift);
      mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift);
      mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1);
      mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1);

      rxdataF_comp128[1] = _mm_packs_epi32(mmtmpU2,mmtmpU3);
      //        print_shorts("rx:",rxdataF128[1]);
      //        print_shorts("ch:",ul_ch128[1]);
      //        print_shorts("pack:",rxdataF_comp128[1]);

      // multiply by conjugated channel (third vector of the RB)
      mmtmpU0 = _mm_madd_epi16(ul_ch128[2],rxdataF128[2]);
      // mmtmpU0 contains real part of 4 consecutive outputs (32-bit)
      mmtmpU1 = _mm_shufflelo_epi16(ul_ch128[2],_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate);
      mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[2]);
      // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit)
      mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift);
      mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift);
      mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1);
      mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1);

      rxdataF_comp128[2] = _mm_packs_epi32(mmtmpU2,mmtmpU3);
      //        print_shorts("rx:",rxdataF128[2]);
      //        print_shorts("ch:",ul_ch128[2]);
      //        print_shorts("pack:",rxdataF_comp128[2]);

      // Advance all cursors by one RB (3 x 4 complex samples).
      ul_ch128+=3;
      ul_ch_mag128+=3;
      ul_ch_mag128b+=3;
      rxdataF128+=3;
      rxdataF_comp128+=3;
    }
  }

  _mm_empty();
  _m_empty();
}
/*
 * Multiply packed signed 16-bit integers and return the high 16 bits of each
 * 32-bit product: result lane i == (int16_t)(((int32_t)s1[i] * s2[i]) >> 16).
 *
 * Fix: the original definition omitted the return type, relying on
 * "implicit int" (invalid since C99) while the function actually returns an
 * __m128i. The explicit return type keeps every caller working unchanged.
 */
__m128i
test (__m128i s1, __m128i s2)
{
  return _mm_mulhi_epi16 (s1, s2);
}
/* SSE2 "regular" quantizer for one 4x4 VP8 block.
 *
 * Quantizes the 16 transform coefficients of block 'b': writes the quantized
 * values to d->qcoeff, the dequantized reconstruction (qcoeff * dequant) to
 * d->dqcoeff, and the end-of-block marker to *d->eob.
 *
 * The vector part precomputes, for all 16 coefficients at once:
 *   x[]  = |z| - (zbin[] + zbin_extra)   (dead-zone test value)
 *   y[]  = sign-restored quantized candidate
 * The per-coefficient zbin boost (which depends on the zero-run and therefore
 * must be evaluated in zig-zag order) is applied afterwards by the unrolled
 * SELECT_EOB sequence.
 *
 * NOTE(review): eob bookkeeping happens inside the SELECT_EOB macro defined
 * elsewhere in this file; presumably SELECT_EOB(i, rc) tests x[rc] against
 * *zbin_boost_ptr and commits y[rc]/updates eob — confirm against the macro.
 */
void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d) {
  char eob = 0;
  short *zbin_boost_ptr;
  short *qcoeff_ptr = d->qcoeff;
  /* Scratch arrays consumed by SELECT_EOB; 16-byte aligned for SSE stores. */
  DECLARE_ALIGNED_ARRAY(16, short, x, 16);
  DECLARE_ALIGNED_ARRAY(16, short, y, 16);

  __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1;
  /* Each per-block table is 16 shorts = two __m128i loads (low/high half). */
  __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));
  __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8));
  __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
  __m128i z1 = _mm_load_si128((__m128i *)(b->coeff+8));
  __m128i zbin_extra = _mm_cvtsi32_si128(b->zbin_extra);
  __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin));
  __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8));
  __m128i round0 = _mm_load_si128((__m128i *)(b->round));
  __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
  __m128i quant0 = _mm_load_si128((__m128i *)(b->quant));
  __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8));
  __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
  __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));

  /* Clear all 16 output shorts; SELECT_EOB only writes surviving coeffs. */
  memset(qcoeff_ptr, 0, 32);

  /* Duplicate to all lanes. */
  zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0);
  zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra);

  /* Sign of z: z >> 15 */
  sz0 = _mm_srai_epi16(z0, 15);
  sz1 = _mm_srai_epi16(z1, 15);

  /* x = abs(z): (z ^ sz) - sz */
  x0 = _mm_xor_si128(z0, sz0);
  x1 = _mm_xor_si128(z1, sz1);
  x0 = _mm_sub_epi16(x0, sz0);
  x1 = _mm_sub_epi16(x1, sz1);

  /* zbin[] + zbin_extra */
  zbin0 = _mm_add_epi16(zbin0, zbin_extra);
  zbin1 = _mm_add_epi16(zbin1, zbin_extra);

  /* In C x is compared to zbin where zbin = zbin[] + boost + extra. Rebalance
   * the equation because boost is the only value which can change:
   * x - (zbin[] + extra) >= boost */
  x_minus_zbin0 = _mm_sub_epi16(x0, zbin0);
  x_minus_zbin1 = _mm_sub_epi16(x1, zbin1);
  _mm_store_si128((__m128i *)(x), x_minus_zbin0);
  _mm_store_si128((__m128i *)(x + 8), x_minus_zbin1);

  /* All the remaining calculations are valid whether they are done now with
   * simd or later inside the loop one at a time. */
  x0 = _mm_add_epi16(x0, round0);
  x1 = _mm_add_epi16(x1, round1);

  y0 = _mm_mulhi_epi16(x0, quant0);
  y1 = _mm_mulhi_epi16(x1, quant1);

  y0 = _mm_add_epi16(y0, x0);
  y1 = _mm_add_epi16(y1, x1);

  /* Instead of shifting each value independently we convert the scaling
   * factor with 1 << (16 - shift) so we can use multiply/return high half. */
  y0 = _mm_mulhi_epi16(y0, quant_shift0);
  y1 = _mm_mulhi_epi16(y1, quant_shift1);

  /* Return the sign: (y ^ sz) - sz */
  y0 = _mm_xor_si128(y0, sz0);
  y1 = _mm_xor_si128(y1, sz1);
  y0 = _mm_sub_epi16(y0, sz0);
  y1 = _mm_sub_epi16(y1, sz1);

  _mm_store_si128((__m128i *)(y), y0);
  _mm_store_si128((__m128i *)(y + 8), y1);

  zbin_boost_ptr = b->zrun_zbin_boost;

  /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup.
   * Second argument of each SELECT_EOB is the zig-zag scan position. */
  SELECT_EOB(1, 0);
  SELECT_EOB(2, 1);
  SELECT_EOB(3, 4);
  SELECT_EOB(4, 8);
  SELECT_EOB(5, 5);
  SELECT_EOB(6, 2);
  SELECT_EOB(7, 3);
  SELECT_EOB(8, 6);
  SELECT_EOB(9, 9);
  SELECT_EOB(10, 12);
  SELECT_EOB(11, 13);
  SELECT_EOB(12, 10);
  SELECT_EOB(13, 7);
  SELECT_EOB(14, 11);
  SELECT_EOB(15, 14);
  SELECT_EOB(16, 15);

  y0 = _mm_load_si128((__m128i *)(d->qcoeff));
  y1 = _mm_load_si128((__m128i *)(d->qcoeff + 8));

  /* dqcoeff = qcoeff * dequant */
  y0 = _mm_mullo_epi16(y0, dequant0);
  y1 = _mm_mullo_epi16(y1, dequant1);

  _mm_store_si128((__m128i *)(d->dqcoeff), y0);
  _mm_store_si128((__m128i *)(d->dqcoeff + 8), y1);

  *d->eob = eob;
}
/*
 * Emits the 32-bit DXT1/BC1 color-index word for one 4x4 block of RGBA
 * pixels and appends it to the output stream (SSE2 port of the original
 * assembly routine — the mov comments below are kept from that version).
 *
 * colorBlock : 64 bytes, 16 RGBA pixels of the block
 * minColor   : endpoint color quantized toward the minimum
 * maxColor   : endpoint color quantized toward the maximum
 * outData    : output cursor; 4 bytes are written and the pointer advances
 *
 * The two 565-rounded endpoints (color0/color1) plus the two interpolated
 * palette entries (color2 = (2*c0+c1)/3, color3 = (c0+2*c1)/3) are built
 * first; then each pixel is matched to the palette entry with the smallest
 * sum-of-absolute-differences and its 2-bit index packed into 'result'.
 */
void EmitColorIndices_Intrinsics( const byte *colorBlock, const byte *minColor, const byte *maxColor, byte *&outData )
{
    ALIGN16( byte color0[16] );
    ALIGN16( byte color1[16] );
    ALIGN16( byte color2[16] );
    ALIGN16( byte color3[16] );
    ALIGN16( byte result[16] );

    // mov esi, maxColor
    // mov edi, minColor

    __m128i t0, t1, t2, t3, t4, t5, t6, t7;

    t7 = _mm_setzero_si128();
    //t7 = _mm_xor_si128(t7, t7);
    _mm_store_si128 ( (__m128i*) &result, t7 );

    //t0 = _mm_load_si128 ( (__m128i*) maxColor );
    t0 = _mm_cvtsi32_si128( *(int*)maxColor);

    // Bitwise AND: keep only the bits that survive 565 quantization.
    __m128i tt = _mm_load_si128 ( (__m128i*) SIMD_SSE2_byte_colorMask );
    t0 = _mm_and_si128(t0, tt);

    // Widen bytes to 16-bit lanes.
    t0 = _mm_unpacklo_epi8(t0, t7);

    // Replicate the high bits of R/G/B into the dropped low bits so the
    // 565 endpoint rounds the same way the hardware decoder does.
    t4 = _mm_shufflelo_epi16( t0, R_SHUFFLE_D( 0, 3, 2, 3 ));
    t5 = _mm_shufflelo_epi16( t0, R_SHUFFLE_D( 3, 1, 3, 3 ));

    t4 = _mm_srli_epi16(t4, 5);
    t5 = _mm_srli_epi16(t5, 6);

    // Bitwise Logical OR
    t0 = _mm_or_si128(t0, t4);
    t0 = _mm_or_si128(t0, t5);   // t0 contains color0 in 565

    //t1 = _mm_load_si128 ( (__m128i*) minColor );
    t1 = _mm_cvtsi32_si128( *(int*)minColor);

    t1 = _mm_and_si128(t1, tt);

    t1 = _mm_unpacklo_epi8(t1, t7);

    t4 = _mm_shufflelo_epi16( t1, R_SHUFFLE_D( 0, 3, 2, 3 ));
    t5 = _mm_shufflelo_epi16( t1, R_SHUFFLE_D( 3, 1, 3, 3 ));

    t4 = _mm_srli_epi16(t4, 5);
    t5 = _mm_srli_epi16(t5, 6);

    t1 = _mm_or_si128(t1, t4);
    t1 = _mm_or_si128(t1, t5);   // t1 contains color1 in 565

    // Palette entry 0: color0 packed to bytes, replicated across the vector.
    t2 = t0;

    t2 = _mm_packus_epi16(t2, t7);

    t2 = _mm_shuffle_epi32( t2, R_SHUFFLE_D( 0, 1, 0, 1 ));

    _mm_store_si128 ( (__m128i*) &color0, t2 );

    // Palette entry 2: (2*color0 + color1) / 3 via fixed-point multiply.
    t6 = t0;
    t6 = _mm_add_epi16(t6, t0);
    t6 = _mm_add_epi16(t6, t1);

    // Multiply Packed Signed Integers and Store High Result
    __m128i tw3 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_div_by_3 );
    t6 = _mm_mulhi_epi16(t6, tw3);
    t6 = _mm_packus_epi16(t6, t7);
    t6 = _mm_shuffle_epi32( t6, R_SHUFFLE_D( 0, 1, 0, 1 ));
    _mm_store_si128 ( (__m128i*) &color2, t6 );

    // Palette entry 1: color1 packed to bytes, replicated.
    t3 = t1;
    t3 = _mm_packus_epi16(t3, t7);
    t3 = _mm_shuffle_epi32( t3, R_SHUFFLE_D( 0, 1, 0, 1 ));
    _mm_store_si128 ( (__m128i*) &color1, t3 );

    // Palette entry 3: (color0 + 2*color1) / 3.
    t1 = _mm_add_epi16(t1, t1);
    t0 = _mm_add_epi16(t0, t1);
    t0 = _mm_mulhi_epi16(t0, tw3);
    t0 = _mm_packus_epi16(t0, t7);
    t0 = _mm_shuffle_epi32( t0, R_SHUFFLE_D( 0, 1, 0, 1 ));
    _mm_store_si128 ( (__m128i*) &color3, t0 );

    __m128i w0 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_0);
    __m128i w1 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_1);
    __m128i w2 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_2);

    // mov eax, 32
    // mov esi, colorBlock
    // Two iterations (x = 32, then x = 0), 8 pixels each; later pixels are
    // processed first so the final shifts pack indices in pixel order.
    int x = 32;
    //const byte *c = colorBlock;
    while (x >= 0)
    {
        // Load 2x2 pixel pairs and regroup so SAD operates per pixel pair.
        t3 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+0));
        t3 = _mm_shuffle_epi32( t3, R_SHUFFLE_D( 0, 2, 1, 3 ));
        t5 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+8));
        t5 = _mm_shuffle_epi32( t5, R_SHUFFLE_D( 0, 2, 1, 3 ));

        t0 = t3;
        t6 = t5;
        // Compute Sum of Absolute Difference against palette entry 0.
        __m128i c0 = _mm_load_si128 ( (__m128i*) color0 );
        t0 = _mm_sad_epu8(t0, c0);
        t6 = _mm_sad_epu8(t6, c0);
        // Pack with Signed Saturation
        t0 = _mm_packs_epi32 (t0, t6);

        t1 = t3;
        t6 = t5;
        __m128i c1 = _mm_load_si128 ( (__m128i*) color1 );
        t1 = _mm_sad_epu8(t1, c1);
        t6 = _mm_sad_epu8(t6, c1);
        t1 = _mm_packs_epi32 (t1, t6);

        t2 = t3;
        t6 = t5;
        __m128i c2 = _mm_load_si128 ( (__m128i*) color2 );
        t2 = _mm_sad_epu8(t2, c2);
        t6 = _mm_sad_epu8(t6, c2);
        t2 = _mm_packs_epi32 (t2, t6);

        __m128i c3 = _mm_load_si128 ( (__m128i*) color3 );
        t3 = _mm_sad_epu8(t3, c3);
        t5 = _mm_sad_epu8(t5, c3);
        t3 = _mm_packs_epi32 (t3, t5);

        // Second group of four pixels for this iteration.
        t4 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+16));
        t4 = _mm_shuffle_epi32( t4, R_SHUFFLE_D( 0, 2, 1, 3 ));
        t5 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+24));
        t5 = _mm_shuffle_epi32( t5, R_SHUFFLE_D( 0, 2, 1, 3 ));

        t6 = t4;
        t7 = t5;
        t6 = _mm_sad_epu8(t6, c0);
        t7 = _mm_sad_epu8(t7, c0);
        t6 = _mm_packs_epi32 (t6, t7);
        t0 = _mm_packs_epi32 (t0, t6);  // d0

        t6 = t4;
        t7 = t5;
        t6 = _mm_sad_epu8(t6, c1);
        t7 = _mm_sad_epu8(t7, c1);
        t6 = _mm_packs_epi32 (t6, t7);
        t1 = _mm_packs_epi32 (t1, t6);  // d1

        t6 = t4;
        t7 = t5;
        t6 = _mm_sad_epu8(t6, c2);
        t7 = _mm_sad_epu8(t7, c2);
        t6 = _mm_packs_epi32 (t6, t7);
        t2 = _mm_packs_epi32 (t2, t6);  // d2

        t4 = _mm_sad_epu8(t4, c3);
        t5 = _mm_sad_epu8(t5, c3);
        t4 = _mm_packs_epi32 (t4, t5);
        t3 = _mm_packs_epi32 (t3, t4);  // d3

        // Shift previously accumulated indices up to make room for this
        // iteration's eight 2-bit indices.
        t7 = _mm_load_si128 ( (__m128i*) result );

        t7 = _mm_slli_epi32( t7, 16);

        t4 = t0;
        t5 = t1;
        // Compare Packed Signed Integers for Greater Than: build the
        // boolean masks that select the minimum-distance palette entry.
        t0 = _mm_cmpgt_epi16(t0, t3); // b0
        t1 = _mm_cmpgt_epi16(t1, t2); // b1
        t4 = _mm_cmpgt_epi16(t4, t2); // b2
        t5 = _mm_cmpgt_epi16(t5, t3); // b3
        t2 = _mm_cmpgt_epi16(t2, t3); // b4

        t4 = _mm_and_si128(t4, t1); // x0
        t5 = _mm_and_si128(t5, t0); // x1
        t2 = _mm_and_si128(t2, t0); // x2

        // Combine masks into the 2-bit index value per pixel.
        t4 = _mm_or_si128(t4, t5);
        t2 = _mm_and_si128(t2, w1);
        t4 = _mm_and_si128(t4, w2);
        t2 = _mm_or_si128(t2, t4);

        t5 = _mm_shuffle_epi32( t2, R_SHUFFLE_D( 2, 3, 0, 1 ));

        // Unpack Low Data
        t2 = _mm_unpacklo_epi16 ( t2, w0);
        t5 = _mm_unpacklo_epi16 ( t5, w0);

        //t5 = _mm_slli_si128 ( t5, 8);
        t5 = _mm_slli_epi32( t5, 8);

        t7 = _mm_or_si128(t7, t5);
        t7 = _mm_or_si128(t7, t2);

        _mm_store_si128 ( (__m128i*) &result, t7 );

        x -=32;
    }

    // Merge the four partial index lanes into one 32-bit word: each lane is
    // shifted into its 2-bit slot and OR'd together.
    t4 = _mm_shuffle_epi32( t7, R_SHUFFLE_D( 1, 2, 3, 0 ));
    t5 = _mm_shuffle_epi32( t7, R_SHUFFLE_D( 2, 3, 0, 1 ));
    t6 = _mm_shuffle_epi32( t7, R_SHUFFLE_D( 3, 0, 1, 2 ));

    t4 = _mm_slli_epi32 ( t4, 2);
    t5 = _mm_slli_epi32 ( t5, 4);
    t6 = _mm_slli_epi32 ( t6, 6);

    t7 = _mm_or_si128(t7, t4);
    t7 = _mm_or_si128(t7, t5);
    t7 = _mm_or_si128(t7, t6);

    //_mm_store_si128 ( (__m128i*) outData, t7 );
    int r = _mm_cvtsi128_si32 (t7);
    memcpy(outData, &r, 4);   // Anything better ?

    outData += 4;
}
// Does one or two inverse transforms.
//
// Computes the VP8 4x4 inverse DCT of the coefficients in 'in', adds the
// residual to the prediction pixels read from 'ref' (stride BPS bytes per
// row), clips to [0, 255] and stores the result to 'dst' (same stride).
// When 'do_two' is non-zero, a second independent transform is processed in
// parallel: its coefficients are in[16..31] and its pixels occupy bytes
// 4..7 of each ref/dst row.
static void ITransformSSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
                           int do_two) {
  // This implementation makes use of 16-bit fixed point versions of two
  // multiply constants:
  //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
  //    K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16
  //
  // To be able to use signed 16-bit integers, we use the following trick to
  // have constants within range:
  // - Associated constants are obtained by subtracting the 16-bit fixed point
  //   version of one:
  //      k = K - (1 << 16)  =>  K = k + (1 << 16)
  //      K1 = 85267  =>  k1 =  20091
  //      K2 = 35468  =>  k2 = -30068
  // - The multiplication of a variable by a constant become the sum of the
  //   variable and the multiplication of that variable by the associated
  //   constant:
  //      (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x
  const __m128i k1 = _mm_set1_epi16(20091);
  const __m128i k2 = _mm_set1_epi16(-30068);
  __m128i T0, T1, T2, T3;

  // Load and concatenate the transform coefficients (we'll do two inverse
  // transforms in parallel). In the case of only one inverse transform, the
  // second half of the vectors will just contain random value we'll never
  // use nor store.
  __m128i in0, in1, in2, in3;
  {
    in0 = _mm_loadl_epi64((__m128i*)&in[0]);
    in1 = _mm_loadl_epi64((__m128i*)&in[4]);
    in2 = _mm_loadl_epi64((__m128i*)&in[8]);
    in3 = _mm_loadl_epi64((__m128i*)&in[12]);
    // a00 a10 a20 a30   x x x x
    // a01 a11 a21 a31   x x x x
    // a02 a12 a22 a32   x x x x
    // a03 a13 a23 a33   x x x x
    if (do_two) {
      const __m128i inB0 = _mm_loadl_epi64((__m128i*)&in[16]);
      const __m128i inB1 = _mm_loadl_epi64((__m128i*)&in[20]);
      const __m128i inB2 = _mm_loadl_epi64((__m128i*)&in[24]);
      const __m128i inB3 = _mm_loadl_epi64((__m128i*)&in[28]);
      in0 = _mm_unpacklo_epi64(in0, inB0);
      in1 = _mm_unpacklo_epi64(in1, inB1);
      in2 = _mm_unpacklo_epi64(in2, inB2);
      in3 = _mm_unpacklo_epi64(in3, inB3);
      // a00 a10 a20 a30   b00 b10 b20 b30
      // a01 a11 a21 a31   b01 b11 b21 b31
      // a02 a12 a22 a32   b02 b12 b22 b32
      // a03 a13 a23 a33   b03 b13 b23 b33
    }
  }

  // Vertical pass and subsequent transpose.
  {
    // First pass, c and d calculations are longer because of the "trick"
    // multiplications.
    const __m128i a = _mm_add_epi16(in0, in2);
    const __m128i b = _mm_sub_epi16(in0, in2);
    // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
    const __m128i c1 = _mm_mulhi_epi16(in1, k2);
    const __m128i c2 = _mm_mulhi_epi16(in3, k1);
    const __m128i c3 = _mm_sub_epi16(in1, in3);
    const __m128i c4 = _mm_sub_epi16(c1, c2);
    const __m128i c = _mm_add_epi16(c3, c4);
    // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
    const __m128i d1 = _mm_mulhi_epi16(in1, k1);
    const __m128i d2 = _mm_mulhi_epi16(in3, k2);
    const __m128i d3 = _mm_add_epi16(in1, in3);
    const __m128i d4 = _mm_add_epi16(d1, d2);
    const __m128i d = _mm_add_epi16(d3, d4);

    // Second pass.
    const __m128i tmp0 = _mm_add_epi16(a, d);
    const __m128i tmp1 = _mm_add_epi16(b, c);
    const __m128i tmp2 = _mm_sub_epi16(b, c);
    const __m128i tmp3 = _mm_sub_epi16(a, d);

    // Transpose the two 4x4.
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33
    const __m128i transpose0_0 = _mm_unpacklo_epi16(tmp0, tmp1);
    const __m128i transpose0_1 = _mm_unpacklo_epi16(tmp2, tmp3);
    const __m128i transpose0_2 = _mm_unpackhi_epi16(tmp0, tmp1);
    const __m128i transpose0_3 = _mm_unpackhi_epi16(tmp2, tmp3);
    // a00 a10 a01 a11   a02 a12 a03 a13
    // a20 a30 a21 a31   a22 a32 a23 a33
    // b00 b10 b01 b11   b02 b12 b03 b13
    // b20 b30 b21 b31   b22 b32 b23 b33
    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
    // a00 a10 a20 a30 a01 a11 a21 a31
    // b00 b10 b20 b30 b01 b11 b21 b31
    // a02 a12 a22 a32 a03 a13 a23 a33
    // b02 b12 a22 b32 b03 b13 b23 b33
    T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
    T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
    T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
    T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
    // a00 a10 a20 a30   b00 b10 b20 b30
    // a01 a11 a21 a31   b01 b11 b21 b31
    // a02 a12 a22 a32   b02 b12 b22 b32
    // a03 a13 a23 a33   b03 b13 b23 b33
  }

  // Horizontal pass and subsequent transpose.
  {
    // First pass, c and d calculations are longer because of the "trick"
    // multiplications.
    // 'four' is the rounding bias folded into the DC term before the final
    // >> 3 shift below.
    const __m128i four = _mm_set1_epi16(4);
    const __m128i dc = _mm_add_epi16(T0, four);
    const __m128i a = _mm_add_epi16(dc, T2);
    const __m128i b = _mm_sub_epi16(dc, T2);
    // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
    const __m128i c1 = _mm_mulhi_epi16(T1, k2);
    const __m128i c2 = _mm_mulhi_epi16(T3, k1);
    const __m128i c3 = _mm_sub_epi16(T1, T3);
    const __m128i c4 = _mm_sub_epi16(c1, c2);
    const __m128i c = _mm_add_epi16(c3, c4);
    // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
    const __m128i d1 = _mm_mulhi_epi16(T1, k1);
    const __m128i d2 = _mm_mulhi_epi16(T3, k2);
    const __m128i d3 = _mm_add_epi16(T1, T3);
    const __m128i d4 = _mm_add_epi16(d1, d2);
    const __m128i d = _mm_add_epi16(d3, d4);

    // Second pass.
    const __m128i tmp0 = _mm_add_epi16(a, d);
    const __m128i tmp1 = _mm_add_epi16(b, c);
    const __m128i tmp2 = _mm_sub_epi16(b, c);
    const __m128i tmp3 = _mm_sub_epi16(a, d);
    const __m128i shifted0 = _mm_srai_epi16(tmp0, 3);
    const __m128i shifted1 = _mm_srai_epi16(tmp1, 3);
    const __m128i shifted2 = _mm_srai_epi16(tmp2, 3);
    const __m128i shifted3 = _mm_srai_epi16(tmp3, 3);

    // Transpose the two 4x4.
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33
    const __m128i transpose0_0 = _mm_unpacklo_epi16(shifted0, shifted1);
    const __m128i transpose0_1 = _mm_unpacklo_epi16(shifted2, shifted3);
    const __m128i transpose0_2 = _mm_unpackhi_epi16(shifted0, shifted1);
    const __m128i transpose0_3 = _mm_unpackhi_epi16(shifted2, shifted3);
    // a00 a10 a01 a11   a02 a12 a03 a13
    // a20 a30 a21 a31   a22 a32 a23 a33
    // b00 b10 b01 b11   b02 b12 b03 b13
    // b20 b30 b21 b31   b22 b32 b23 b33
    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
    // a00 a10 a20 a30 a01 a11 a21 a31
    // b00 b10 b20 b30 b01 b11 b21 b31
    // a02 a12 a22 a32 a03 a13 a23 a33
    // b02 b12 a22 b32 b03 b13 b23 b33
    T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
    T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
    T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
    T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
    // a00 a10 a20 a30   b00 b10 b20 b30
    // a01 a11 a21 a31   b01 b11 b21 b31
    // a02 a12 a22 a32   b02 b12 b22 b32
    // a03 a13 a23 a33   b03 b13 b23 b33
  }

  // Add inverse transform to 'ref' and store.
  {
    const __m128i zero = _mm_set1_epi16(0);
    // Load the reference(s).
    __m128i ref0, ref1, ref2, ref3;
    if (do_two) {
      // Load eight bytes/pixels per line.
      ref0 = _mm_loadl_epi64((__m128i*)&ref[0 * BPS]);
      ref1 = _mm_loadl_epi64((__m128i*)&ref[1 * BPS]);
      ref2 = _mm_loadl_epi64((__m128i*)&ref[2 * BPS]);
      ref3 = _mm_loadl_epi64((__m128i*)&ref[3 * BPS]);
    } else {
      // Load four bytes/pixels per line.
      ref0 = _mm_cvtsi32_si128(*(int*)&ref[0 * BPS]);
      ref1 = _mm_cvtsi32_si128(*(int*)&ref[1 * BPS]);
      ref2 = _mm_cvtsi32_si128(*(int*)&ref[2 * BPS]);
      ref3 = _mm_cvtsi32_si128(*(int*)&ref[3 * BPS]);
    }
    // Convert to 16b.
    ref0 = _mm_unpacklo_epi8(ref0, zero);
    ref1 = _mm_unpacklo_epi8(ref1, zero);
    ref2 = _mm_unpacklo_epi8(ref2, zero);
    ref3 = _mm_unpacklo_epi8(ref3, zero);
    // Add the inverse transform(s).
    ref0 = _mm_add_epi16(ref0, T0);
    ref1 = _mm_add_epi16(ref1, T1);
    ref2 = _mm_add_epi16(ref2, T2);
    ref3 = _mm_add_epi16(ref3, T3);
    // Unsigned saturate to 8b.
    ref0 = _mm_packus_epi16(ref0, ref0);
    ref1 = _mm_packus_epi16(ref1, ref1);
    ref2 = _mm_packus_epi16(ref2, ref2);
    ref3 = _mm_packus_epi16(ref3, ref3);
    // Store the results.
    if (do_two) {
      // Store eight bytes/pixels per line.
      _mm_storel_epi64((__m128i*)&dst[0 * BPS], ref0);
      _mm_storel_epi64((__m128i*)&dst[1 * BPS], ref1);
      _mm_storel_epi64((__m128i*)&dst[2 * BPS], ref2);
      _mm_storel_epi64((__m128i*)&dst[3 * BPS], ref3);
    } else {
      // Store four bytes/pixels per line.
      *((int32_t *)&dst[0 * BPS]) = _mm_cvtsi128_si32(ref0);
      *((int32_t *)&dst[1 * BPS]) = _mm_cvtsi128_si32(ref1);
      *((int32_t *)&dst[2 * BPS]) = _mm_cvtsi128_si32(ref2);
      *((int32_t *)&dst[3 * BPS]) = _mm_cvtsi128_si32(ref3);
    }
  }
}
void ulsch_channel_compensation_alamouti(int **rxdataF_ext, // For Distributed Alamouti Combining int **ul_ch_estimates_ext_0, int **ul_ch_estimates_ext_1, int **ul_ch_mag_0, int **ul_ch_magb_0, int **ul_ch_mag_1, int **ul_ch_magb_1, int **rxdataF_comp_0, int **rxdataF_comp_1, LTE_DL_FRAME_PARMS *frame_parms, unsigned char symbol, unsigned char Qm, unsigned short nb_rb, unsigned char output_shift_0, unsigned char output_shift_1) { unsigned short rb; __m128i *ul_ch128_0,*ul_ch128_1,*ul_ch_mag128_0,*ul_ch_mag128_1,*ul_ch_mag128b_0,*ul_ch_mag128b_1,*rxdataF128,*rxdataF_comp128_0,*rxdataF_comp128_1; unsigned char aarx;//,symbol_mod; // symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol; #ifndef __SSE3__ zeroU = _mm_xor_si128(zeroU,zeroU); #endif // printf("comp: symbol %d\n",symbol); if (Qm == 4) { QAM_amp128U_0 = _mm_set1_epi16(QAM16_n1); QAM_amp128U_1 = _mm_set1_epi16(QAM16_n1); } else if (Qm == 6) { QAM_amp128U_0 = _mm_set1_epi16(QAM64_n1); QAM_amp128bU_0 = _mm_set1_epi16(QAM64_n2); QAM_amp128U_1 = _mm_set1_epi16(QAM64_n1); QAM_amp128bU_1 = _mm_set1_epi16(QAM64_n2); } for (aarx=0;aarx<frame_parms->nb_antennas_rx;aarx++) { ul_ch128_0 = (__m128i *)&ul_ch_estimates_ext_0[aarx][symbol*frame_parms->N_RB_DL*12]; ul_ch_mag128_0 = (__m128i *)&ul_ch_mag_0[aarx][symbol*frame_parms->N_RB_DL*12]; ul_ch_mag128b_0 = (__m128i *)&ul_ch_magb_0[aarx][symbol*frame_parms->N_RB_DL*12]; ul_ch128_1 = (__m128i *)&ul_ch_estimates_ext_1[aarx][symbol*frame_parms->N_RB_DL*12]; ul_ch_mag128_1 = (__m128i *)&ul_ch_mag_1[aarx][symbol*frame_parms->N_RB_DL*12]; ul_ch_mag128b_1 = (__m128i *)&ul_ch_magb_1[aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF128 = (__m128i *)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF_comp128_0 = (__m128i *)&rxdataF_comp_0[aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF_comp128_1 = (__m128i *)&rxdataF_comp_1[aarx][symbol*frame_parms->N_RB_DL*12]; for (rb=0;rb<nb_rb;rb++) { // printf("comp: symbol %d rb %d\n",symbol,rb); if (Qm>2) 
{ // get channel amplitude if not QPSK mmtmpU0 = _mm_madd_epi16(ul_ch128_0[0],ul_ch128_0[0]); mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_0); mmtmpU1 = _mm_madd_epi16(ul_ch128_0[1],ul_ch128_0[1]); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_0); mmtmpU0 = _mm_packs_epi32(mmtmpU0,mmtmpU1); ul_ch_mag128_0[0] = _mm_unpacklo_epi16(mmtmpU0,mmtmpU0); ul_ch_mag128b_0[0] = ul_ch_mag128_0[0]; ul_ch_mag128_0[0] = _mm_mulhi_epi16(ul_ch_mag128_0[0],QAM_amp128U_0); ul_ch_mag128_0[0] = _mm_slli_epi16(ul_ch_mag128_0[0],2); // 2 to compensate the scale channel estimate ul_ch_mag128_0[1] = _mm_unpackhi_epi16(mmtmpU0,mmtmpU0); ul_ch_mag128b_0[1] = ul_ch_mag128_0[1]; ul_ch_mag128_0[1] = _mm_mulhi_epi16(ul_ch_mag128_0[1],QAM_amp128U_0); ul_ch_mag128_0[1] = _mm_slli_epi16(ul_ch_mag128_0[1],2); // 2 to scale compensate the scale channel estimate mmtmpU0 = _mm_madd_epi16(ul_ch128_0[2],ul_ch128_0[2]); mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_0); mmtmpU1 = _mm_packs_epi32(mmtmpU0,mmtmpU0); ul_ch_mag128_0[2] = _mm_unpacklo_epi16(mmtmpU1,mmtmpU1); ul_ch_mag128b_0[2] = ul_ch_mag128_0[2]; ul_ch_mag128_0[2] = _mm_mulhi_epi16(ul_ch_mag128_0[2],QAM_amp128U_0); ul_ch_mag128_0[2] = _mm_slli_epi16(ul_ch_mag128_0[2],2); // 2 to scale compensate the scale channel estimat ul_ch_mag128b_0[0] = _mm_mulhi_epi16(ul_ch_mag128b_0[0],QAM_amp128bU_0); ul_ch_mag128b_0[0] = _mm_slli_epi16(ul_ch_mag128b_0[0],2); // 2 to scale compensate the scale channel estima ul_ch_mag128b_0[1] = _mm_mulhi_epi16(ul_ch_mag128b_0[1],QAM_amp128bU_0); ul_ch_mag128b_0[1] = _mm_slli_epi16(ul_ch_mag128b_0[1],2); // 2 to scale compensate the scale channel estima ul_ch_mag128b_0[2] = _mm_mulhi_epi16(ul_ch_mag128b_0[2],QAM_amp128bU_0); ul_ch_mag128b_0[2] = _mm_slli_epi16(ul_ch_mag128b_0[2],2); // 2 to scale compensate the scale channel estima mmtmpU0 = _mm_madd_epi16(ul_ch128_1[0],ul_ch128_1[0]); mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_1); mmtmpU1 = _mm_madd_epi16(ul_ch128_1[1],ul_ch128_1[1]); mmtmpU1 = 
_mm_srai_epi32(mmtmpU1,output_shift_1); mmtmpU0 = _mm_packs_epi32(mmtmpU0,mmtmpU1); ul_ch_mag128_1[0] = _mm_unpacklo_epi16(mmtmpU0,mmtmpU0); ul_ch_mag128b_1[0] = ul_ch_mag128_1[0]; ul_ch_mag128_1[0] = _mm_mulhi_epi16(ul_ch_mag128_1[0],QAM_amp128U_1); ul_ch_mag128_1[0] = _mm_slli_epi16(ul_ch_mag128_1[0],2); // 2 to compensate the scale channel estimate ul_ch_mag128_1[1] = _mm_unpackhi_epi16(mmtmpU0,mmtmpU0); ul_ch_mag128b_1[1] = ul_ch_mag128_1[1]; ul_ch_mag128_1[1] = _mm_mulhi_epi16(ul_ch_mag128_1[1],QAM_amp128U_1); ul_ch_mag128_1[1] = _mm_slli_epi16(ul_ch_mag128_1[1],2); // 2 to scale compensate the scale channel estimate mmtmpU0 = _mm_madd_epi16(ul_ch128_1[2],ul_ch128_1[2]); mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_1); mmtmpU1 = _mm_packs_epi32(mmtmpU0,mmtmpU0); ul_ch_mag128_1[2] = _mm_unpacklo_epi16(mmtmpU1,mmtmpU1); ul_ch_mag128b_1[2] = ul_ch_mag128_1[2]; ul_ch_mag128_1[2] = _mm_mulhi_epi16(ul_ch_mag128_1[2],QAM_amp128U_0); ul_ch_mag128_1[2] = _mm_slli_epi16(ul_ch_mag128_1[2],2); // 2 to scale compensate the scale channel estimat ul_ch_mag128b_1[0] = _mm_mulhi_epi16(ul_ch_mag128b_1[0],QAM_amp128bU_1); ul_ch_mag128b_1[0] = _mm_slli_epi16(ul_ch_mag128b_1[0],2); // 2 to scale compensate the scale channel estima ul_ch_mag128b_1[1] = _mm_mulhi_epi16(ul_ch_mag128b_1[1],QAM_amp128bU_1); ul_ch_mag128b_1[1] = _mm_slli_epi16(ul_ch_mag128b_1[1],2); // 2 to scale compensate the scale channel estima ul_ch_mag128b_1[2] = _mm_mulhi_epi16(ul_ch_mag128b_1[2],QAM_amp128bU_1); ul_ch_mag128b_1[2] = _mm_slli_epi16(ul_ch_mag128b_1[2],2); // 2 to scale compensate the scale channel estima } /************************For Computing (y)*(h0*)********************************************/ // multiply by conjugated channel mmtmpU0 = _mm_madd_epi16(ul_ch128_0[0],rxdataF128[0]); // print_ints("re",&mmtmpU0); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(ul_ch128_0[0],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = 
_mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)&conjugate[0]); // print_ints("im",&mmtmpU1); mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[0]); // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_0); // print_ints("re(shift)",&mmtmpU0); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_0); // print_ints("im(shift)",&mmtmpU1); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); // print_ints("c0",&mmtmpU2); // print_ints("c1",&mmtmpU3); rxdataF_comp128_0[0] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[0]); // print_shorts("ch:",ul_ch128_0[0]); // print_shorts("pack:",rxdataF_comp128_0[0]); // multiply by conjugated channel mmtmpU0 = _mm_madd_epi16(ul_ch128_0[1],rxdataF128[1]); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(ul_ch128_0[1],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate); mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[1]); // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_0); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_0); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); rxdataF_comp128_0[1] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[1]); // print_shorts("ch:",ul_ch128_0[1]); // print_shorts("pack:",rxdataF_comp128_0[1]); // multiply by conjugated channel mmtmpU0 = _mm_madd_epi16(ul_ch128_0[2],rxdataF128[2]); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(ul_ch128_0[2],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate); mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[2]); // mmtmpU1 contains imag 
part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_0); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_0); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); rxdataF_comp128_0[2] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[2]); // print_shorts("ch:",ul_ch128_0[2]); // print_shorts("pack:",rxdataF_comp128_0[2]); /*************************For Computing (y*)*(h1)************************************/ // multiply by conjugated signal mmtmpU0 = _mm_madd_epi16(ul_ch128_1[0],rxdataF128[0]); // print_ints("re",&mmtmpU0); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(rxdataF128[0],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)&conjugate[0]); // print_ints("im",&mmtmpU1); mmtmpU1 = _mm_madd_epi16(mmtmpU1,ul_ch128_1[0]); // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_1); // print_ints("re(shift)",&mmtmpU0); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_1); // print_ints("im(shift)",&mmtmpU1); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); // print_ints("c0",&mmtmpU2); // print_ints("c1",&mmtmpU3); rxdataF_comp128_1[0] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[0]); // print_shorts("ch_conjugate:",ul_ch128_1[0]); // print_shorts("pack:",rxdataF_comp128_1[0]); // multiply by conjugated signal mmtmpU0 = _mm_madd_epi16(ul_ch128_1[1],rxdataF128[1]); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(rxdataF128[1],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate); mmtmpU1 = _mm_madd_epi16(mmtmpU1,ul_ch128_1[1]); // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = 
_mm_srai_epi32(mmtmpU0,output_shift_1); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_1); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); rxdataF_comp128_1[1] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[1]); // print_shorts("ch_conjugate:",ul_ch128_1[1]); // print_shorts("pack:",rxdataF_comp128_1[1]); // multiply by conjugated signal mmtmpU0 = _mm_madd_epi16(ul_ch128_1[2],rxdataF128[2]); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(rxdataF128[2],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate); mmtmpU1 = _mm_madd_epi16(mmtmpU1,ul_ch128_1[2]); // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_1); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_1); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); rxdataF_comp128_1[2] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[2]); // print_shorts("ch_conjugate:",ul_ch128_0[2]); // print_shorts("pack:",rxdataF_comp128_1[2]); ul_ch128_0+=3; ul_ch_mag128_0+=3; ul_ch_mag128b_0+=3; ul_ch128_1+=3; ul_ch_mag128_1+=3; ul_ch_mag128b_1+=3; rxdataF128+=3; rxdataF_comp128_0+=3; rxdataF_comp128_1+=3; } } _mm_empty(); _m_empty(); }