int operator() (const uchar * ptr, int len, int & x0, int & x1, int & x2, int & x3)
{
    int x = 0;

    // useSIMD is set from checkHardwareSupport(CV_CPU_SSE2) by the enclosing code.
    if( useSIMD )
    {
        __m128i qx_init = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        __m128i dx = _mm_set1_epi16(8);
        __m128i z = _mm_setzero_si128(), qx0 = z, qx1 = z, qx2 = z, qx3 = z, qx = qx_init;

        for( ; x <= len - 8; x += 8 )
        {
            // p: eight pixels zero-extended to 16 bits; qx: the current column
            // indices x..x+7; sx: x^2 per lane.
            __m128i p = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr + x)), z);
            __m128i sx = _mm_mullo_epi16(qx, qx);

            qx0 = _mm_add_epi32(qx0, _mm_sad_epu8(p, z));                          // sum(p)
            qx1 = _mm_add_epi32(qx1, _mm_madd_epi16(p, qx));                       // sum(x*p)
            qx2 = _mm_add_epi32(qx2, _mm_madd_epi16(p, sx));                       // sum(x^2*p)
            qx3 = _mm_add_epi32(qx3, _mm_madd_epi16(_mm_mullo_epi16(p, qx), sx));  // sum(x^3*p)

            qx = _mm_add_epi16(qx, dx);
        }

        // Horizontal reduction of each accumulator (buf was missing in the
        // original fragment; it must be a 16-byte aligned scratch array).
        int CV_DECL_ALIGNED(16) buf[4];

        _mm_store_si128((__m128i*)buf, qx0);
        x0 = buf[0] + buf[1] + buf[2] + buf[3];
        _mm_store_si128((__m128i*)buf, qx1);
        x1 = buf[0] + buf[1] + buf[2] + buf[3];
        _mm_store_si128((__m128i*)buf, qx2);
        x2 = buf[0] + buf[1] + buf[2] + buf[3];
        _mm_store_si128((__m128i*)buf, qx3);
        x3 = buf[0] + buf[1] + buf[2] + buf[3];
    }
    return x;
}
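/*
 * Scalar reference for the loop above (a sketch mirroring the non-SIMD tail
 * used elsewhere in this file; the function name is hypothetical): the four
 * accumulators are the raw row sums sum(p), sum(x*p), sum(x^2*p) and
 * sum(x^3*p) over the pixel values p at column x.
 */
static void momentsRowRef(const uchar* ptr, int len, int& x0, int& x1, int& x2, int& x3)
{
    for( int x = 0; x < len; x++ )
    {
        int p = ptr[x];
        int xp = x * p, xxp = xp * x;
        x0 += p;
        x1 += xp;
        x2 += xxp;
        x3 += xxp * x;
    }
}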
static INLINE __m128i load_coefficients(const tran_low_t* coeff_ptr) {
#if CONFIG_VPX_HIGHBITDEPTH
  return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1],
                        (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3],
                        (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5],
                        (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]);
#else
  return _mm_load_si128((const __m128i*)coeff_ptr);
#endif
}
// Variant of the same helper that selects at run time on sizeof(tran_low_t)
// (a compile-time constant, so the branch is folded away) instead of with
// the preprocessor.
static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) {
  if (sizeof(tran_low_t) == 4) {
    return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1],
                          (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3],
                          (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5],
                          (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]);
  } else {
    return _mm_load_si128((const __m128i *)coeff_ptr);
  }
}
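/* A possible store counterpart (a sketch; the name and its presence in the
 * library are assumptions, not confirmed by this source): when tran_low_t is
 * 32 bits wide, the eight 16-bit results must be sign-extended back into two
 * __m128i stores, which a compare-based unpack does without per-lane shifts.
 */
static INLINE void store_coefficients_sketch(__m128i coeff_vals,
                                             tran_low_t *coeff_ptr) {
  if (sizeof(tran_low_t) == 4) {
    // 0xFFFF in lanes that are negative, 0 elsewhere; interleaving with the
    // values yields the sign-extended 32-bit results.
    __m128i sign_bits = _mm_cmplt_epi16(coeff_vals, _mm_setzero_si128());
    _mm_store_si128((__m128i *)coeff_ptr,
                    _mm_unpacklo_epi16(coeff_vals, sign_bits));
    _mm_store_si128((__m128i *)(coeff_ptr + 4),
                    _mm_unpackhi_epi16(coeff_vals, sign_bits));
  } else {
    _mm_store_si128((__m128i *)coeff_ptr, coeff_vals);
  }
}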
template<> void momentsInTile<uchar, int, int>( const cv::Mat& img, double* moments )
{
    typedef uchar T;
    typedef int WT;
    typedef int MT;
    Size size = img.size();
    int y;
    MT mom[10] = {0,0,0,0,0,0,0,0,0,0};
    bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);

    for( y = 0; y < size.height; y++ )
    {
        const T* ptr = img.ptr<T>(y);
        int x0 = 0, x1 = 0, x2 = 0, x3 = 0, x = 0;

        if( useSIMD )
        {
            __m128i qx_init = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
            __m128i dx = _mm_set1_epi16(8);
            __m128i z = _mm_setzero_si128(), qx0 = z, qx1 = z, qx2 = z, qx3 = z, qx = qx_init;

            for( ; x <= size.width - 8; x += 8 )
            {
                __m128i p = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr + x)), z);
                qx0 = _mm_add_epi32(qx0, _mm_sad_epu8(p, z));
                __m128i px = _mm_mullo_epi16(p, qx);
                __m128i sx = _mm_mullo_epi16(qx, qx);
                qx1 = _mm_add_epi32(qx1, _mm_madd_epi16(p, qx));
                qx2 = _mm_add_epi32(qx2, _mm_madd_epi16(p, sx));
                qx3 = _mm_add_epi32(qx3, _mm_madd_epi16(px, sx));

                qx = _mm_add_epi16(qx, dx);
            }

            int CV_DECL_ALIGNED(16) buf[4];
            _mm_store_si128((__m128i*)buf, qx0);
            x0 = buf[0] + buf[1] + buf[2] + buf[3];
            _mm_store_si128((__m128i*)buf, qx1);
            x1 = buf[0] + buf[1] + buf[2] + buf[3];
            _mm_store_si128((__m128i*)buf, qx2);
            x2 = buf[0] + buf[1] + buf[2] + buf[3];
            _mm_store_si128((__m128i*)buf, qx3);
            x3 = buf[0] + buf[1] + buf[2] + buf[3];
        }

        for( ; x < size.width; x++ )
        {
            WT p = ptr[x];
            WT xp = x * p, xxp;

            x0 += p;
            x1 += xp;
            xxp = xp * x;
            x2 += xxp;
            x3 += xxp * x;
        }

        WT py = y * x0, sy = y*y;

        mom[9] += ((MT)py) * sy;  // m03
        mom[8] += ((MT)x1) * sy;  // m12
        mom[7] += ((MT)x2) * y;   // m21
        mom[6] += x3;             // m30
        mom[5] += x0 * sy;        // m02
        mom[4] += x1 * y;         // m11
        mom[3] += x2;             // m20
        mom[2] += py;             // m01
        mom[1] += x1;             // m10
        mom[0] += x0;             // m00
    }

    for( int x = 0; x < 10; x++ )
        moments[x] = (double)mom[x];
}
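/*
 * Usage sketch (a hypothetical test harness, not from the source): mom[]
 * holds the raw moments in the order m00, m10, m01, m20, m11, m02, m30,
 * m21, m12, m03, so for a tile placed at the image origin the result can
 * be cross-checked against cv::moments().
 */
static void checkTileMoments(const cv::Mat& tile)
{
    double m[10];
    momentsInTile<uchar, int, int>(tile, m);

    cv::Moments ref = cv::moments(tile);
    CV_Assert(std::abs(m[0] - ref.m00) < 1e-6);  // m00
    CV_Assert(std::abs(m[1] - ref.m10) < 1e-6);  // m10
    CV_Assert(std::abs(m[2] - ref.m01) < 1e-6);  // m01
}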
void vp9_temporal_filter_apply_sse4_1(const uint8_t *a, unsigned int stride,
                                      const uint8_t *b, unsigned int width,
                                      unsigned int height, int strength,
                                      int weight, uint32_t *accumulator,
                                      uint16_t *count) {
  unsigned int h;
  const int rounding = strength > 0 ? 1 << (strength - 1) : 0;

  assert(strength >= 0);
  assert(strength <= 6);

  assert(weight >= 0);
  assert(weight <= 2);

  assert(width == 8 || width == 16);

  if (width == 8) {
    __m128i sum_row_a, sum_row_b, sum_row_c;
    __m128i mul_constants = _mm_setr_epi16(
        NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
        NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
        NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);

    sum_8(a, b, &sum_row_a);
    sum_8(a + stride, b + width, &sum_row_b);
    sum_row_c = _mm_adds_epu16(sum_row_a, sum_row_b);
    sum_row_c = average_8(sum_row_c, mul_constants, strength, rounding, weight);
    accumulate_and_store_8(sum_row_c, b, count, accumulator);

    a += stride + stride;
    b += width;
    count += width;
    accumulator += width;

    mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9,
                                   NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                   NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                   NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6);

    for (h = 0; h < height - 2; ++h) {
      sum_8(a, b + width, &sum_row_c);

      sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b);
      sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_c);
      sum_row_a = average_8(sum_row_a, mul_constants, strength, rounding, weight);
      accumulate_and_store_8(sum_row_a, b, count, accumulator);

      a += stride;
      b += width;
      count += width;
      accumulator += width;

      sum_row_a = sum_row_b;
      sum_row_b = sum_row_c;
    }

    mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6,
                                   NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                   NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                   NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
    sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b);
    sum_row_a = average_8(sum_row_a, mul_constants, strength, rounding, weight);
    accumulate_and_store_8(sum_row_a, b, count, accumulator);

  } else {  // width == 16
    __m128i sum_row_a_0, sum_row_a_1;
    __m128i sum_row_b_0, sum_row_b_1;
    __m128i sum_row_c_0, sum_row_c_1;
    __m128i mul_constants_0 = _mm_setr_epi16(
                NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6),
            mul_constants_1 = _mm_setr_epi16(
                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);

    sum_16(a, b, &sum_row_a_0, &sum_row_a_1);
    sum_16(a + stride, b + width, &sum_row_b_0, &sum_row_b_1);

    sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
    sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);
    average_16(&sum_row_c_0, &sum_row_c_1, mul_constants_0, mul_constants_1,
               strength, rounding, weight);
    accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator);

    a += stride + stride;
    b += width;
    count += width;
    accumulator += width;

    mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9);
    mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6);
    for (h = 0; h < height - 2; ++h) {
      sum_16(a, b + width, &sum_row_c_0, &sum_row_c_1);

      sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
      sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_c_0);
      sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);
      sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_c_1);

      average_16(&sum_row_a_0, &sum_row_a_1, mul_constants_0, mul_constants_1,
                 strength, rounding, weight);
      accumulate_and_store_16(sum_row_a_0, sum_row_a_1, b, count, accumulator);

      a += stride;
      b += width;
      count += width;
      accumulator += width;

      sum_row_a_0 = sum_row_b_0;
      sum_row_a_1 = sum_row_b_1;
      sum_row_b_0 = sum_row_c_0;
      sum_row_b_1 = sum_row_c_1;
    }

    mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6);
    mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
    sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
    sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);

    average_16(&sum_row_c_0, &sum_row_c_1, mul_constants_0, mul_constants_1,
               strength, rounding, weight);
    accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator);
  }
}
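/*
 * Scalar sketch of the per-pixel weighting that sum_8/sum_16,
 * average_8/average_16 and accumulate_and_store_{8,16} implement together
 * (a paraphrase for illustration, not the library's C reference; the helper
 * name is hypothetical). NEIGHBOR_CONSTANT_{4,6,9} are Q16 fixed-point
 * encodings of 3/num_neighbors for corner, edge and interior pixels.
 */
#include <stdint.h>

static void apply_one_pixel(uint32_t neighbor_diff_sq_sum, int num_neighbors,
                            int strength, int rounding, int weight,
                            uint8_t pred, uint32_t *accumulator,
                            uint16_t *count) {
  // Average the neighborhood sum of squared differences, scaled by 3.
  int modifier = (int)((neighbor_diff_sq_sum * 3) / num_neighbors);
  // Round, apply the strength shift, and clamp to the 0..16 weight range.
  modifier = (modifier + rounding) >> strength;
  if (modifier > 16) modifier = 16;
  // Close matches (small modifier) get the largest blending weight.
  modifier = (16 - modifier) * weight;
  *count += (uint16_t)modifier;
  *accumulator += (uint32_t)modifier * pred;
}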
mlib_status
mlib_VideoColorJFIFYCC2RGB444_S16_naligned(
    mlib_s16 *rgb,
    const mlib_s16 *y,
    const mlib_s16 *cb,
    const mlib_s16 *cr,
    mlib_s32 n)
{
	/* 0 & 1.402*16384 */
	const __m128i x_c1 = _mm_setr_epi16(0, 22970, 0, 22970, 0, 22970, 0, 22970);

	/* -0.34414*16384 & -0.71414*16384 */
	const __m128i x_c2 = _mm_setr_epi16(-5638, -11700, -5638, -11700,
	    -5638, -11700, -5638, -11700);

	/* 1.772*16384 & 0 */
	const __m128i x_c3 = _mm_setr_epi16(29032, 0, 29032, 0, 29032, 0, 29032, 0);

	const __m128i x_coff = _mm_set1_epi16(2048);
	const __m128i x_cps1 = _mm_set1_epi32(0x8000);
	const __m128i x_cps2 = _mm_set1_epi16(0x8000);
	const __m128i x_zero = _mm_setzero_si128();
	const __m128i x_mask1 = _mm_setr_epi32(0xffffffff, 0xffff, 0, 0);
	const __m128i x_mask2 = _mm_setr_epi32(0, 0xffff0000, 0xffffffff, 0);

	/* __m128i variables */
	__m128i x_y, x_cb, x_cr, x_r, x_g, x_b, x_y1, x_y2;
	__m128i x_r1, x_r2, x_g1, x_g2, x_b1, x_b2, x_t1, x_t2;
	__m128i x_rgbl, x_rgbh, x_rgl, x_rgh, x_bbl, x_bbh;
	__m128i x_cbcr1, x_cbcr2;

	/* pointers */
	__m128i *px_y, *px_cb, *px_cr;
	mlib_s16 *prgb;

	/* other var */
	mlib_d64 fr, fg, fb, fy, fcb, fcr;
	mlib_s32 i;

	px_y = (__m128i *)y;
	px_cb = (__m128i *)cb;
	px_cr = (__m128i *)cr;
	prgb = rgb;
	i = 0;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
	for (; i <= n - 16; i += 8) {
		x_y = _mm_loadu_si128(px_y);
		x_y1 = _mm_unpacklo_epi16(x_y, x_zero);
		x_y1 = _mm_slli_epi32(x_y1, 4);
		x_y2 = _mm_unpackhi_epi16(x_y, x_zero);
		x_y2 = _mm_slli_epi32(x_y2, 4);
		px_y++;
		x_cb = _mm_loadu_si128(px_cb);
		x_cb = _mm_sub_epi16(x_cb, x_coff);
		px_cb++;
		x_cr = _mm_loadu_si128(px_cr);
		x_cr = _mm_sub_epi16(x_cr, x_coff);
		px_cr++;
		x_cbcr1 = _mm_unpacklo_epi16(x_cb, x_cr);
		x_cbcr2 = _mm_unpackhi_epi16(x_cb, x_cr);

		/* calc r/g/b */
		x_t1 = _mm_madd_epi16(x_cbcr1, x_c1);
		x_t1 = _mm_srai_epi32(x_t1, 10);
		x_r1 = _mm_add_epi32(x_t1, x_y1);
		x_t1 = _mm_madd_epi16(x_cbcr1, x_c2);
		x_t1 = _mm_srai_epi32(x_t1, 10);
		x_g1 = _mm_add_epi32(x_t1, x_y1);
		x_t1 = _mm_madd_epi16(x_cbcr1, x_c3);
		x_t1 = _mm_srai_epi32(x_t1, 10);
		x_b1 = _mm_add_epi32(x_t1, x_y1);

		x_t2 = _mm_madd_epi16(x_cbcr2, x_c1);
		x_t2 = _mm_srai_epi32(x_t2, 10);
		x_r2 = _mm_add_epi32(x_t2, x_y2);
		x_t2 = _mm_madd_epi16(x_cbcr2, x_c2);
		x_t2 = _mm_srai_epi32(x_t2, 10);
		x_g2 = _mm_add_epi32(x_t2, x_y2);
		x_t2 = _mm_madd_epi16(x_cbcr2, x_c3);
		x_t2 = _mm_srai_epi32(x_t2, 10);
		x_b2 = _mm_add_epi32(x_t2, x_y2);

		/* signed pack & shift */
		x_r1 = _mm_sub_epi32(x_r1, x_cps1);
		x_r2 = _mm_sub_epi32(x_r2, x_cps1);
		x_r = _mm_packs_epi32(x_r1, x_r2);
		x_r = _mm_add_epi16(x_r, x_cps2);
		x_r = _mm_srli_epi16(x_r, 4);
		x_g1 = _mm_sub_epi32(x_g1, x_cps1);
		x_g2 = _mm_sub_epi32(x_g2, x_cps1);
		x_g = _mm_packs_epi32(x_g1, x_g2);
		x_g = _mm_add_epi16(x_g, x_cps2);
		x_g = _mm_srli_epi16(x_g, 4);
		x_b1 = _mm_sub_epi32(x_b1, x_cps1);
		x_b2 = _mm_sub_epi32(x_b2, x_cps1);
		x_b = _mm_packs_epi32(x_b1, x_b2);
		x_b = _mm_add_epi16(x_b, x_cps2);
		x_b = _mm_srli_epi16(x_b, 4);

		/* create rgb sequences */
		x_rgl = _mm_unpacklo_epi16(x_r, x_g);
		x_rgh = _mm_unpackhi_epi16(x_r, x_g);
		x_bbl = _mm_unpacklo_epi16(x_b, x_b);
		x_bbh = _mm_unpackhi_epi16(x_b, x_b);

		/* save */
		x_rgbl = _mm_unpacklo_epi32(x_rgl, x_bbl);
		PACK_RGB1(x_rgbl);
		x_rgbh = _mm_unpackhi_epi32(x_rgl, x_bbl);
		PACK_RGB1(x_rgbh);
		x_rgbl = _mm_unpacklo_epi32(x_rgh, x_bbh);
		PACK_RGB1(x_rgbl);
		x_rgbh = _mm_unpackhi_epi32(x_rgh, x_bbh);
		PACK_RGB1(x_rgbh);
	}

	if (i <= (n - 8)) {
		x_y = _mm_loadu_si128(px_y);
		x_y1 = _mm_unpacklo_epi16(x_y, x_zero);
		x_y1 = _mm_slli_epi32(x_y1, 4);
		x_y2 = _mm_unpackhi_epi16(x_y, x_zero);
		x_y2 = _mm_slli_epi32(x_y2, 4);
		px_y++;
		x_cb = _mm_loadu_si128(px_cb);
		x_cb = _mm_sub_epi16(x_cb, x_coff);
		px_cb++;
		x_cr = _mm_loadu_si128(px_cr);
		x_cr = _mm_sub_epi16(x_cr, x_coff);
		px_cr++;
		x_cbcr1 = _mm_unpacklo_epi16(x_cb, x_cr);
		x_cbcr2 = _mm_unpackhi_epi16(x_cb, x_cr);

		/* calc r/g/b */
		x_t1 = _mm_madd_epi16(x_cbcr1, x_c1);
		x_t1 = _mm_srai_epi32(x_t1, 10);
		x_r1 = _mm_add_epi32(x_t1, x_y1);
		x_t1 = _mm_madd_epi16(x_cbcr1, x_c2);
		x_t1 = _mm_srai_epi32(x_t1, 10);
		x_g1 = _mm_add_epi32(x_t1, x_y1);
		x_t1 = _mm_madd_epi16(x_cbcr1, x_c3);
		x_t1 = _mm_srai_epi32(x_t1, 10);
		x_b1 = _mm_add_epi32(x_t1, x_y1);

		x_t2 = _mm_madd_epi16(x_cbcr2, x_c1);
		x_t2 = _mm_srai_epi32(x_t2, 10);
		x_r2 = _mm_add_epi32(x_t2, x_y2);
		x_t2 = _mm_madd_epi16(x_cbcr2, x_c2);
		x_t2 = _mm_srai_epi32(x_t2, 10);
		x_g2 = _mm_add_epi32(x_t2, x_y2);
		x_t2 = _mm_madd_epi16(x_cbcr2, x_c3);
		x_t2 = _mm_srai_epi32(x_t2, 10);
		x_b2 = _mm_add_epi32(x_t2, x_y2);

		/* signed pack & shift */
		x_r1 = _mm_sub_epi32(x_r1, x_cps1);
		x_r2 = _mm_sub_epi32(x_r2, x_cps1);
		x_r = _mm_packs_epi32(x_r1, x_r2);
		x_r = _mm_add_epi16(x_r, x_cps2);
		x_r = _mm_srli_epi16(x_r, 4);
		x_g1 = _mm_sub_epi32(x_g1, x_cps1);
		x_g2 = _mm_sub_epi32(x_g2, x_cps1);
		x_g = _mm_packs_epi32(x_g1, x_g2);
		x_g = _mm_add_epi16(x_g, x_cps2);
		x_g = _mm_srli_epi16(x_g, 4);
		x_b1 = _mm_sub_epi32(x_b1, x_cps1);
		x_b2 = _mm_sub_epi32(x_b2, x_cps1);
		x_b = _mm_packs_epi32(x_b1, x_b2);
		x_b = _mm_add_epi16(x_b, x_cps2);
		x_b = _mm_srli_epi16(x_b, 4);

		/* create rgb sequences */
		x_rgl = _mm_unpacklo_epi16(x_r, x_g);
		x_rgh = _mm_unpackhi_epi16(x_r, x_g);
		x_bbl = _mm_unpacklo_epi16(x_b, x_b);
		x_bbh = _mm_unpackhi_epi16(x_b, x_b);

		/* save */
		x_rgbl = _mm_unpacklo_epi32(x_rgl, x_bbl);
		PACK_RGB1(x_rgbl);
		x_rgbh = _mm_unpackhi_epi32(x_rgl, x_bbl);
		PACK_RGB1(x_rgbh);
		x_rgbl = _mm_unpacklo_epi32(x_rgh, x_bbh);
		PACK_RGB1(x_rgbl);
		x_rgbh = _mm_unpackhi_epi32(x_rgh, x_bbh);
		PACK_RGB2(x_rgbh);

		i += 8;
	}

	if (i <= (n - 4)) {
		x_y = _mm_loadl_epi64(px_y);
		x_y1 = _mm_unpacklo_epi16(x_y, x_zero);
		x_y1 = _mm_slli_epi32(x_y1, 4);
		px_y = (__m128i *)(((__m64 *)px_y) + 1);
		x_cb = _mm_loadl_epi64(px_cb);
		x_cb = _mm_sub_epi16(x_cb, x_coff);
		px_cb = (__m128i *)(((__m64 *)px_cb) + 1);
		x_cr = _mm_loadl_epi64(px_cr);
		x_cr = _mm_sub_epi16(x_cr, x_coff);
		px_cr = (__m128i *)(((__m64 *)px_cr) + 1);
		x_cbcr1 = _mm_unpacklo_epi16(x_cb, x_cr);

		/* calc r/g/b */
		x_t1 = _mm_madd_epi16(x_cbcr1, x_c1);
		x_t1 = _mm_srai_epi32(x_t1, 10);
		x_r1 = _mm_add_epi32(x_t1, x_y1);
		x_t1 = _mm_madd_epi16(x_cbcr1, x_c2);
		x_t1 = _mm_srai_epi32(x_t1, 10);
		x_g1 = _mm_add_epi32(x_t1, x_y1);
		x_t1 = _mm_madd_epi16(x_cbcr1, x_c3);
		x_t1 = _mm_srai_epi32(x_t1, 10);
		x_b1 = _mm_add_epi32(x_t1, x_y1);

		/* signed pack & shift */
		x_r1 = _mm_sub_epi32(x_r1, x_cps1);
		x_r = _mm_packs_epi32(x_r1, x_zero);
		x_r = _mm_add_epi16(x_r, x_cps2);
		x_r = _mm_srli_epi16(x_r, 4);
		x_g1 = _mm_sub_epi32(x_g1, x_cps1);
		x_g = _mm_packs_epi32(x_g1, x_zero);
		x_g = _mm_add_epi16(x_g, x_cps2);
		x_g = _mm_srli_epi16(x_g, 4);
		x_b1 = _mm_sub_epi32(x_b1, x_cps1);
		x_b = _mm_packs_epi32(x_b1, x_zero);
		x_b = _mm_add_epi16(x_b, x_cps2);
		x_b = _mm_srli_epi16(x_b, 4);

		/* create rgb sequences */
		x_rgl = _mm_unpacklo_epi16(x_r, x_g);
		x_bbl = _mm_unpacklo_epi16(x_b, x_b);

		/* save */
		x_rgbl = _mm_unpacklo_epi32(x_rgl, x_bbl);
		PACK_RGB1(x_rgbl);
		x_rgbh = _mm_unpackhi_epi32(x_rgl, x_bbl);
		PACK_RGB2(x_rgbh);

		i += 4;
	}

	/* pure C implementation */
	for (; i < n; i++) {
		fy = y[i] * SCALE - SAT;
		fcb = (mlib_d64)((cb[i] - 2048) << 20);
		fcr = (mlib_d64)((cr[i] - 2048) << 20);
		fr = fy + 1.40200f * fcr;
		fg = fy - 0.34414f * fcb - 0.71414f * fcr;
		fb = fy + 1.77200f * fcb;
		rgb[3 * i] = CLAMP_U12(fr);
		rgb[3 * i + 1] = CLAMP_U12(fg);
		rgb[3 * i + 2] = CLAMP_U12(fb);
	}

	return (MLIB_SUCCESS);
}
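/*
 * Sketch of the biased-pack idiom used in the "signed pack & shift" steps
 * above (illustration only; the helper name is hypothetical, <emmintrin.h>
 * assumed): SSE2 has no unsigned 32->16 saturating pack, so the code
 * subtracts 0x8000 before _mm_packs_epi32 and adds it back in 16-bit
 * arithmetic afterwards, turning signed saturation into saturation on the
 * unsigned 0..65535 range.
 */
static __m128i pack_u32_to_u16_sat(__m128i lo, __m128i hi)
{
	const __m128i bias32 = _mm_set1_epi32(0x8000);
	const __m128i bias16 = _mm_set1_epi16((short)0x8000);
	__m128i r = _mm_packs_epi32(_mm_sub_epi32(lo, bias32),
	    _mm_sub_epi32(hi, bias32));
	return (_mm_add_epi16(r, bias16));
}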
void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
  // The 2D transform is done with two passes which are actually pretty
  // similar. In the first one, we transform the columns and transpose
  // the results. In the second one, we transform the rows. To achieve that,
  // as the first pass results are transposed, we transpose the columns (that
  // is the transposed rows) and transpose the results (so that it goes back
  // in normal/row positions).
  const int stride = pitch >> 1;
  int pass;
  // Constants
  // When we use them, in one case, they are all the same. In all others
  // it's a pair of them that we need to repeat four times. This is done
  // by constructing the 32 bit constant corresponding to that pair.
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
  const __m128i kOne = _mm_set1_epi16(1);
  __m128i in0, in1, in2, in3;
  // Load inputs.
  {
    in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
    in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
    in2 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
    in3 = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
    // x = x << 4
    in0 = _mm_slli_epi16(in0, 4);
    in1 = _mm_slli_epi16(in1, 4);
    in2 = _mm_slli_epi16(in2, 4);
    in3 = _mm_slli_epi16(in3, 4);
    // if (i == 0 && input[0]) input[0] += 1;
    {
      // The mask will only contain whether the first value is zero; all
      // other comparisons will fail as something shifted by 4 (above << 4)
      // can never be equal to one. To increment in the non-zero case, we
      // add the mask and one for the first element:
      // - if zero, mask = -1, v = v - 1 + 1 = v
      // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
      __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
      in0 = _mm_add_epi16(in0, mask);
      in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
    }
  }
  // Do the two transform/transpose passes
  for (pass = 0; pass < 2; ++pass) {
    // Transform 1/2: Add/subtract
    const __m128i r0 = _mm_add_epi16(in0, in3);
    const __m128i r1 = _mm_add_epi16(in1, in2);
    const __m128i r2 = _mm_sub_epi16(in1, in2);
    const __m128i r3 = _mm_sub_epi16(in0, in3);
    // Transform 1/2: Interleave to do the multiply by constants which gets us
    // into 32 bits.
    const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
    const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
    const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
    const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
    const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
    const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
    const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
    const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
    const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
    const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
    const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
    const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
    const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
    const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
    // Combine and transpose
    const __m128i res0 = _mm_packs_epi32(w0, w2);
    const __m128i res1 = _mm_packs_epi32(w4, w6);
    // 00 01 02 03 20 21 22 23
    // 10 11 12 13 30 31 32 33
    const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
    const __m128i tr0_1 = _mm_unpackhi_epi16(res0, res1);
    // 00 10 01 11 02 12 03 13
    // 20 30 21 31 22 32 23 33
    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
    in2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
    // 00 10 20 30 01 11 21 31  in0 contains row 0 followed by row 1
    // 02 12 22 32 03 13 23 33  in2 contains row 2 followed by row 3
    if (0 == pass) {
      // Extract values in the high part for second pass as transform code
      // only uses the first four values.
      in1 = _mm_unpackhi_epi64(in0, in0);
      in3 = _mm_unpackhi_epi64(in2, in2);
    } else {
      // Post-condition output and store it (v + 1) >> 2, taking advantage
      // of the fact 1/3 are stored just after 0/2.
      __m128i out01 = _mm_add_epi16(in0, kOne);
      __m128i out23 = _mm_add_epi16(in2, kOne);
      out01 = _mm_srai_epi16(out01, 2);
      out23 = _mm_srai_epi16(out23, 2);
      _mm_storeu_si128((__m128i *)(output + 0 * 4), out01);
      _mm_storeu_si128((__m128i *)(output + 2 * 4), out23);
    }
  }
}
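// Scalar sketch of one 1-D pass of the transform above (for illustration;
// the helper name is hypothetical, while cospi_8_64/cospi_16_64/cospi_24_64,
// DCT_CONST_BITS and DCT_CONST_ROUNDING are the same constants the
// intrinsics use). Each _mm_madd_epi16 in the pass evaluates one of these
// dot products for four columns at once.
static void fdct4_1d_sketch(const int16_t in[4], int16_t out[4]) {
  // Butterfly stage, matching r0..r3.
  const int s0 = in[0] + in[3];
  const int s1 = in[1] + in[2];
  const int s2 = in[1] - in[2];
  const int s3 = in[0] - in[3];
  // Rotation stage with round-shift, matching u0/u2/u4/u6 through w0..w6.
  out[0] = (int16_t)(((s0 + s1) * cospi_16_64 + DCT_CONST_ROUNDING) >>
                     DCT_CONST_BITS);
  out[2] = (int16_t)(((s0 - s1) * cospi_16_64 + DCT_CONST_ROUNDING) >>
                     DCT_CONST_BITS);
  out[1] = (int16_t)((s2 * cospi_24_64 + s3 * cospi_8_64 +
                      DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
  out[3] = (int16_t)((-s2 * cospi_8_64 + s3 * cospi_24_64 +
                      DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
}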