mlib_status __mlib_VideoP64Decimate_U8_U8( mlib_u8 *dst, const mlib_u8 *src, mlib_s32 width, mlib_s32 height, mlib_s32 dst_stride, mlib_s32 src_stride) { mlib_s32 x, y; const mlib_u8 *sd1, *sd2; mlib_u8 *dd; mlib_u32 src_stride2; sd1 = src; sd2 = src + src_stride; src_stride2 = 2 * src_stride; dd = dst; mlib_s32 dw = width & 0xF; __m128i txmm0, txmm1, txmm2, txmm3, txmm4, txmm5, txmm6, txmm7; txmm7 = _mm_set1_epi16(0xff); for (y = 0; y < height; y++) { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (x = 0; x < width - dw; x += 16) { txmm0 = _mm_loadu_si128((__m128i *)&sd1[2*x]); txmm1 = _mm_loadu_si128((__m128i *)&sd2[2*x]); txmm2 = _mm_srli_si128(txmm0, 1); txmm3 = _mm_srli_si128(txmm1, 1); txmm4 = _mm_avg_epu8(txmm0, txmm2); txmm5 = _mm_avg_epu8(txmm1, txmm3); txmm6 = _mm_avg_epu8(txmm5, txmm4); txmm6 = _mm_and_si128(txmm6, txmm7); txmm0 = _mm_loadu_si128((__m128i *)&sd1[2 * x + 16]); txmm1 = _mm_loadu_si128((__m128i *)&sd2[2 * x + 16]); txmm2 = _mm_srli_si128(txmm0, 1); txmm3 = _mm_srli_si128(txmm1, 1); txmm4 = _mm_avg_epu8(txmm0, txmm2); txmm5 = _mm_avg_epu8(txmm1, txmm3); txmm5 = _mm_avg_epu8(txmm5, txmm4); txmm5 = _mm_and_si128(txmm5, txmm7); txmm1 = _mm_packus_epi16(txmm6, txmm5); _mm_storeu_si128((__m128i *)&dd[x], txmm1); } #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (; x < width; x++) { dd[x] = (sd1[x * 2] + sd1[x * 2 + 1] + sd2[x * 2] + sd2[x * 2 + 1] + 2) >> 2; } sd1 += src_stride2; sd2 += src_stride2; dd += dst_stride; } return (MLIB_SUCCESS); }
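/*
 * Annotation (not part of the original source): scalar reference model for
 * __mlib_VideoP64Decimate_U8_U8 above. Each output pixel is the rounded
 * mean of a 2x2 input block. Note the SIMD path approximates this with two
 * rounds of _mm_avg_epu8 (each rounding up), which can differ from the
 * exact (a + b + c + d + 2) >> 2 of the scalar tail loop by 1 LSB.
 */
static void mlib_decimate_2x2_ref(mlib_u8 *dst, const mlib_u8 *src,
    mlib_s32 width, mlib_s32 height, mlib_s32 dst_stride, mlib_s32 src_stride)
{
	mlib_s32 x, y;

	for (y = 0; y < height; y++) {
		const mlib_u8 *sd1 = src + 2 * y * src_stride;
		const mlib_u8 *sd2 = sd1 + src_stride;

		for (x = 0; x < width; x++) {
			dst[y * dst_stride + x] = (mlib_u8)((sd1[2 * x] +
			    sd1[2 * x + 1] + sd2[2 * x] +
			    sd2[2 * x + 1] + 2) >> 2);
		}
	}
}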
const int32_t s1Len_PAD = s1Len+PAD; const int32_t s2Len_PAD = s2Len+PAD; int16_t * const restrict s1 = parasail_memalign_int16_t(16, s1Len+PAD); int16_t * const restrict s2B= parasail_memalign_int16_t(16, s2Len+PAD2); int16_t * const restrict _H_pr = parasail_memalign_int16_t(16, s2Len+PAD2); int16_t * const restrict _F_pr = parasail_memalign_int16_t(16, s2Len+PAD2); int16_t * const restrict s2 = s2B+PAD; /* will allow later for negative indices */ int16_t * const restrict H_pr = _H_pr+PAD; int16_t * const restrict F_pr = _F_pr+PAD; parasail_result_t *result = parasail_result_new_trace(s1Len, s2Len, 16, sizeof(int8_t)); int32_t i = 0; int32_t j = 0; int32_t end_query = s1Len-1; int32_t end_ref = s2Len-1; int16_t score = NEG_INF; __m128i vNegInf = _mm_set1_epi16(NEG_INF); __m128i vOpen = _mm_set1_epi16(open); __m128i vGap = _mm_set1_epi16(gap); __m128i vOne = _mm_set1_epi16(1); __m128i vN = _mm_set1_epi16(N); __m128i vGapN = _mm_set1_epi16(gap*N); __m128i vNegOne = _mm_set1_epi16(-1); __m128i vI = _mm_set_epi16(0,1,2,3,4,5,6,7); __m128i vJreset = _mm_set_epi16(0,-1,-2,-3,-4,-5,-6,-7); __m128i vMax = vNegInf; __m128i vILimit = _mm_set1_epi16(s1Len); __m128i vILimit1 = _mm_sub_epi16(vILimit, vOne); __m128i vJLimit = _mm_set1_epi16(s2Len); __m128i vJLimit1 = _mm_sub_epi16(vJLimit, vOne); __m128i vIBoundary = _mm_set_epi16( -open-0*gap,
static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16], const uint16_t* const sharpen, const VP8Matrix* const mtx) { const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL); const __m128i zero = _mm_setzero_si128(); __m128i out0, out8; __m128i packed_out; // Load all inputs. // TODO(cduvivier): Make variable declarations and allocations aligned so that // we can use _mm_load_si128 instead of _mm_loadu_si128. __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]); __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]); const __m128i iq0 = _mm_loadu_si128((const __m128i*)&mtx->iq_[0]); const __m128i iq8 = _mm_loadu_si128((const __m128i*)&mtx->iq_[8]); const __m128i q0 = _mm_loadu_si128((const __m128i*)&mtx->q_[0]); const __m128i q8 = _mm_loadu_si128((const __m128i*)&mtx->q_[8]); // coeff = abs(in) __m128i coeff0 = _mm_abs_epi16(in0); __m128i coeff8 = _mm_abs_epi16(in8); // coeff = abs(in) + sharpen if (sharpen != NULL) { const __m128i sharpen0 = _mm_loadu_si128((const __m128i*)&sharpen[0]); const __m128i sharpen8 = _mm_loadu_si128((const __m128i*)&sharpen[8]); coeff0 = _mm_add_epi16(coeff0, sharpen0); coeff8 = _mm_add_epi16(coeff8, sharpen8); } // out = (coeff * iQ + B) >> QFIX { // doing calculations with 32b precision (QFIX=17) // out = (coeff * iQ) const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0); const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0); const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8); const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8); __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H); __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H); __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H); __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H); // out = (coeff * iQ + B) const __m128i bias_00 = _mm_loadu_si128((const __m128i*)&mtx->bias_[0]); const __m128i bias_04 = _mm_loadu_si128((const __m128i*)&mtx->bias_[4]); const __m128i bias_08 = _mm_loadu_si128((const __m128i*)&mtx->bias_[8]); const __m128i bias_12 = _mm_loadu_si128((const __m128i*)&mtx->bias_[12]); out_00 = _mm_add_epi32(out_00, bias_00); out_04 = _mm_add_epi32(out_04, bias_04); out_08 = _mm_add_epi32(out_08, bias_08); out_12 = _mm_add_epi32(out_12, bias_12); // out = QUANTDIV(coeff, iQ, B, QFIX) out_00 = _mm_srai_epi32(out_00, QFIX); out_04 = _mm_srai_epi32(out_04, QFIX); out_08 = _mm_srai_epi32(out_08, QFIX); out_12 = _mm_srai_epi32(out_12, QFIX); // pack result as 16b out0 = _mm_packs_epi32(out_00, out_04); out8 = _mm_packs_epi32(out_08, out_12); // if (coeff > 2047) coeff = 2047 out0 = _mm_min_epi16(out0, max_coeff_2047); out8 = _mm_min_epi16(out8, max_coeff_2047); } // put sign back out0 = _mm_sign_epi16(out0, in0); out8 = _mm_sign_epi16(out8, in8); // in = out * Q in0 = _mm_mullo_epi16(out0, q0); in8 = _mm_mullo_epi16(out8, q8); _mm_storeu_si128((__m128i*)&in[0], in0); _mm_storeu_si128((__m128i*)&in[8], in8); // zigzag the output before storing it. The re-ordering is: // 0 1 2 3 4 5 6 7 | 8 9 10 11 12 13 14 15 // -> 0 1 4[8]5 2 3 6 | 9 12 13 10 [7]11 14 15 // There are only two misplaced entries ([8] and [7]) crossing the // reg's boundaries. // We use pshufb instead of pshuflo/pshufhi.
{ const __m128i kCst_lo = PSHUFB_CST(0, 1, 4, -1, 5, 2, 3, 6); const __m128i kCst_7 = PSHUFB_CST(-1, -1, -1, -1, 7, -1, -1, -1); const __m128i tmp_lo = _mm_shuffle_epi8(out0, kCst_lo); const __m128i tmp_7 = _mm_shuffle_epi8(out0, kCst_7); // extract #7 const __m128i kCst_hi = PSHUFB_CST(1, 4, 5, 2, -1, 3, 6, 7); const __m128i kCst_8 = PSHUFB_CST(-1, -1, -1, 0, -1, -1, -1, -1); const __m128i tmp_hi = _mm_shuffle_epi8(out8, kCst_hi); const __m128i tmp_8 = _mm_shuffle_epi8(out8, kCst_8); // extract #8 const __m128i out_z0 = _mm_or_si128(tmp_lo, tmp_8); const __m128i out_z8 = _mm_or_si128(tmp_hi, tmp_7); _mm_storeu_si128((__m128i*)&out[0], out_z0); _mm_storeu_si128((__m128i*)&out[8], out_z8); packed_out = _mm_packs_epi16(out_z0, out_z8); } // detect if all 'out' values are zeroes or not return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff); }
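/*
 * Annotation (not part of the original source): per-coefficient scalar
 * sketch of the quantization above, assuming the usual QUANTDIV semantics
 * with QFIX == 17 and MAX_LEVEL == 2047. 'quantize_coeff_ref' is a
 * hypothetical helper for illustration only.
 */
static int quantize_coeff_ref(int16_t* const in_coeff, uint16_t sharpen,
                              uint16_t iq, uint32_t bias, uint16_t q) {
  const int sign = (*in_coeff < 0);
  const uint32_t coeff = (uint32_t)(sign ? -*in_coeff : *in_coeff) + sharpen;
  int level = (int)((coeff * iq + bias) >> 17);   // QUANTDIV, QFIX == 17
  if (level > 2047) level = 2047;                 // MAX_LEVEL clamp
  if (sign) level = -level;                       // _mm_sign_epi16 step
  *in_coeff = (int16_t)(level * q);               // dequantized, stored to in[]
  return level;                                   // zigzagged and stored to out[]
}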
/* The encoded YCbCr coefficients are represented as 11.5 fixed-point numbers. See rfx_encode.c */ static void rfx_encode_rgb_to_ycbcr_sse2(sint16* y_r_buffer, sint16* cb_g_buffer, sint16* cr_b_buffer) { __m128i min = _mm_set1_epi16(-128 << 5); __m128i max = _mm_set1_epi16(127 << 5); __m128i* y_r_buf = (__m128i*) y_r_buffer; __m128i* cb_g_buf = (__m128i*) cb_g_buffer; __m128i* cr_b_buf = (__m128i*) cr_b_buffer; __m128i y; __m128i cr; __m128i cb; __m128i r; __m128i g; __m128i b; int i; for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i))) { _mm_prefetch((char*)(&y_r_buf[i]), _MM_HINT_NTA); _mm_prefetch((char*)(&cb_g_buf[i]), _MM_HINT_NTA); _mm_prefetch((char*)(&cr_b_buf[i]), _MM_HINT_NTA); } for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i++) { /* r = y_r_buf[i]; */ r = _mm_load_si128(&y_r_buf[i]); /* g = cb_g_buf[i]; */ g = _mm_load_si128(&cb_g_buf[i]); /* b = cr_b_buf[i]; */ b = _mm_load_si128(&cr_b_buf[i]); /* y = ((r << 3) + (r) + (r >> 1) + (r >> 4) + (r >> 7)) + ((g << 4) + (g << 1) + (g >> 1) + (g >> 2) + (g >> 5)) + ((b << 1) + (b) + (b >> 1) + (b >> 3) + (b >> 6) + (b >> 7)); */ /* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */ y = _mm_add_epi16(_mm_slli_epi16(r, 3), r); y = _mm_add_epi16(y, _mm_srai_epi16(r, 1)); y = _mm_add_epi16(y, _mm_srai_epi16(r, 4)); y = _mm_add_epi16(y, _mm_srai_epi16(r, 7)); y = _mm_add_epi16(y, _mm_slli_epi16(g, 4)); y = _mm_add_epi16(y, _mm_slli_epi16(g, 1)); y = _mm_add_epi16(y, _mm_srai_epi16(g, 1)); y = _mm_add_epi16(y, _mm_srai_epi16(g, 2)); y = _mm_add_epi16(y, _mm_srai_epi16(g, 5)); y = _mm_add_epi16(y, _mm_slli_epi16(b, 1)); y = _mm_add_epi16(y, b); y = _mm_add_epi16(y, _mm_srai_epi16(b, 1)); y = _mm_add_epi16(y, _mm_srai_epi16(b, 3)); y = _mm_add_epi16(y, _mm_srai_epi16(b, 6)); y = _mm_add_epi16(y, _mm_srai_epi16(b, 7)); y = _mm_add_epi16(y, min); _mm_between_epi16(y, min, max); _mm_store_si128(&y_r_buf[i], y); /* cb = 0 - ((r << 2) + (r) + (r >> 2) + (r >> 3) + (r >> 5)) - ((g << 3) + (g << 1) + (g >> 1) + (g >> 4) + (g >> 5) + (g >> 6)) + ((b << 4) + (b >> 6)); */ /* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */ cb = _mm_add_epi16(_mm_slli_epi16(b, 4), _mm_srai_epi16(b, 6)); cb = _mm_sub_epi16(cb, _mm_slli_epi16(r, 2)); cb = _mm_sub_epi16(cb, r); cb = _mm_sub_epi16(cb, _mm_srai_epi16(r, 2)); cb = _mm_sub_epi16(cb, _mm_srai_epi16(r, 3)); cb = _mm_sub_epi16(cb, _mm_srai_epi16(r, 5)); cb = _mm_sub_epi16(cb, _mm_slli_epi16(g, 3)); cb = _mm_sub_epi16(cb, _mm_slli_epi16(g, 1)); cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 1)); cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 4)); cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 5)); cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 6)); _mm_between_epi16(cb, min, max); _mm_store_si128(&cb_g_buf[i], cb); /* cr = ((r << 4) - (r >> 7)) - ((g << 3) + (g << 2) + (g) + (g >> 2) + (g >> 3) + (g >> 6)) - ((b << 1) + (b >> 1) + (b >> 4) + (b >> 5) + (b >> 7)); */ /* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */ cr = _mm_sub_epi16(_mm_slli_epi16(r, 4), _mm_srai_epi16(r, 7)); cr = _mm_sub_epi16(cr, _mm_slli_epi16(g, 3)); cr = _mm_sub_epi16(cr, _mm_slli_epi16(g, 2)); cr = _mm_sub_epi16(cr, g); cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 2)); cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 3)); cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 6)); cr = _mm_sub_epi16(cr, _mm_slli_epi16(b, 1)); cr = _mm_sub_epi16(cr, _mm_srai_epi16(b, 1)); cr = _mm_sub_epi16(cr, _mm_srai_epi16(b, 4)); cr = _mm_sub_epi16(cr, _mm_srai_epi16(b, 5)); cr = _mm_sub_epi16(cr, _mm_srai_epi16(b, 7));
_mm_between_epi16(cr, min, max); _mm_store_si128(&cr_b_buf[i], cr); } }
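/*
 * Annotation (not part of the original source): why the shift/add chains
 * above work. In 11.5 fixed point the familiar 0.299/0.587/0.114 luma
 * weights scaled by 32 are reproduced by sums of powers of two:
 *
 *   r: 8 + 1 + 1/2 + 1/16 + 1/128          = 9.5703125  ~ 0.299 * 32 = 9.568
 *   g: 16 + 2 + 1/2 + 1/4 + 1/32           = 18.78125   ~ 0.587 * 32 = 18.784
 *   b: 2 + 1 + 1/2 + 1/8 + 1/64 + 1/128    = 3.6484375  ~ 0.114 * 32 = 3.648
 *
 * so y = (r<<3)+r+(r>>1)+(r>>4)+(r>>7) + ... approximates
 * 32*(0.299 r + 0.587 g + 0.114 b) without any multiplies, and the three
 * partial weights sum to exactly 32. The cb/cr chains decompose their
 * coefficients the same way.
 */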
static void rfx_dwt_2d_decode_block_vert_sse2(sint16* l, sint16* h, sint16* dst, int subband_width) { int x, n; sint16* l_ptr = l; sint16* h_ptr = h; sint16* dst_ptr = dst; __m128i l_n; __m128i h_n; __m128i tmp_n; __m128i h_n_m; __m128i dst_n; __m128i dst_n_m; __m128i dst_n_p; int total_width = subband_width + subband_width; /* Even coefficients */ for (n = 0; n < subband_width; n++) { for (x = 0; x < total_width; x+=8) { /* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */ l_n = _mm_load_si128((__m128i*) l_ptr); h_n = _mm_load_si128((__m128i*) h_ptr); tmp_n = _mm_add_epi16(h_n, _mm_set1_epi16(1)); if (n == 0) tmp_n = _mm_add_epi16(tmp_n, h_n); else { h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - total_width)); tmp_n = _mm_add_epi16(tmp_n, h_n_m); } tmp_n = _mm_srai_epi16(tmp_n, 1); dst_n = _mm_sub_epi16(l_n, tmp_n); _mm_store_si128((__m128i*) dst_ptr, dst_n); l_ptr+=8; h_ptr+=8; dst_ptr+=8; } dst_ptr+=total_width; } h_ptr = h; dst_ptr = dst + total_width; /* Odd coefficients */ for (n = 0; n < subband_width; n++) { for (x = 0; x < total_width; x+=8) { /* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */ h_n = _mm_load_si128((__m128i*) h_ptr); dst_n_m = _mm_load_si128((__m128i*) (dst_ptr - total_width)); h_n = _mm_slli_epi16(h_n, 1); tmp_n = dst_n_m; if (n == subband_width - 1) tmp_n = _mm_add_epi16(tmp_n, dst_n_m); else { dst_n_p = _mm_loadu_si128((__m128i*) (dst_ptr + total_width)); tmp_n = _mm_add_epi16(tmp_n, dst_n_p); } tmp_n = _mm_srai_epi16(tmp_n, 1); dst_n = _mm_add_epi16(tmp_n, h_n); _mm_store_si128((__m128i*) dst_ptr, dst_n); h_ptr+=8; dst_ptr+=8; } dst_ptr+=total_width; } }
void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type) { __m128i in[8]; const __m128i final_rounding = _mm_set1_epi16(1 << 4); // load input data in[0] = load_input_data8(input); in[1] = load_input_data8(input + 8 * 1); in[2] = load_input_data8(input + 8 * 2); in[3] = load_input_data8(input + 8 * 3); in[4] = load_input_data8(input + 8 * 4); in[5] = load_input_data8(input + 8 * 5); in[6] = load_input_data8(input + 8 * 6); in[7] = load_input_data8(input + 8 * 7); switch (tx_type) { case DCT_DCT: vpx_idct8_sse2(in); vpx_idct8_sse2(in); break; case ADST_DCT: vpx_idct8_sse2(in); iadst8_sse2(in); break; case DCT_ADST: iadst8_sse2(in); vpx_idct8_sse2(in); break; default: assert(tx_type == ADST_ADST); iadst8_sse2(in); iadst8_sse2(in); break; } // Final rounding and shift in[0] = _mm_adds_epi16(in[0], final_rounding); in[1] = _mm_adds_epi16(in[1], final_rounding); in[2] = _mm_adds_epi16(in[2], final_rounding); in[3] = _mm_adds_epi16(in[3], final_rounding); in[4] = _mm_adds_epi16(in[4], final_rounding); in[5] = _mm_adds_epi16(in[5], final_rounding); in[6] = _mm_adds_epi16(in[6], final_rounding); in[7] = _mm_adds_epi16(in[7], final_rounding); in[0] = _mm_srai_epi16(in[0], 5); in[1] = _mm_srai_epi16(in[1], 5); in[2] = _mm_srai_epi16(in[2], 5); in[3] = _mm_srai_epi16(in[3], 5); in[4] = _mm_srai_epi16(in[4], 5); in[5] = _mm_srai_epi16(in[5], 5); in[6] = _mm_srai_epi16(in[6], 5); in[7] = _mm_srai_epi16(in[7], 5); recon_and_store(dest + 0 * stride, in[0]); recon_and_store(dest + 1 * stride, in[1]); recon_and_store(dest + 2 * stride, in[2]); recon_and_store(dest + 3 * stride, in[3]); recon_and_store(dest + 4 * stride, in[4]); recon_and_store(dest + 5 * stride, in[5]); recon_and_store(dest + 6 * stride, in[6]); recon_and_store(dest + 7 * stride, in[7]); }
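/*
 * Annotation (not part of the original source): the "final rounding and
 * shift" stage above is the scalar operation
 *
 *   out = (in + 16) >> 5;   // bias 1 << 4, then arithmetic shift by 5
 *
 * applied to every lane; the two 8-point transform passes leave 5 extra
 * bits of precision that are removed here before recon_and_store()
 * (defined elsewhere in this file) presumably adds the residual to the
 * prediction in 'dest' with unsigned 8-bit saturation.
 */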
a01_##ind = srcPixelPtr[ind2]; \ a10_##ind = srcPixelPtr2[ind1]; \ a11_##ind = srcPixelPtr2[ind2] /* *********************************************************** */ mlib_status FUN_NAME( 1ch) ( mlib_affine_param *param) { DECLAREVAR_BL(); DTYPE *dstLineEnd; DTYPE *srcPixelPtr2; __m128i deltax, deltay; __m128i const_7fff = _mm_set1_epi16(0x7fff); __m128i const_7ffe = _mm_set1_epi16(0x7ffe); __m128i mask_8000 = _mm_set1_epi16(0x8000); for (j = yStart; j <= yFinish; j++) { __m128i a00_0s, a01_0s, a10_0s, a11_0s; __m128i pix0_0s, pix0_1s, pix1_0s, pix1_1s, res0s; __m128i pix0s, pix1s; __m128i fdxs, fdys; __m128i fdx2s, fdy2s; mlib_s32 fdx, fdy; mlib_s32 a00_0, a01_0, a10_0, a11_0; mlib_s32 a00_1, a01_1, a10_1, a11_1; mlib_s32 pix0_0, pix1_0, res0; mlib_s32 pix0_1, pix1_1, res1;
void aom_filter_block1d8_h8_intrin_ssse3( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; __m128i addFilterReg64, filtersReg, minReg; unsigned int i; // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. filtersReg = _mm_packs_epi16(filtersReg, filtersReg); // duplicate only the first 16 bits (first and second byte) // across 128 bit register firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); // duplicate only the second 16 bits (third and forth byte) // across 128 bit register secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); // duplicate only the third 16 bits (fifth and sixth byte) // across 128 bit register thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); // duplicate only the forth 16 bits (seventh and eighth byte) // across 128 bit register forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); filt1Reg = _mm_load_si128((__m128i const *)filt1_global); filt2Reg = _mm_load_si128((__m128i const *)filt2_global); filt3Reg = _mm_load_si128((__m128i const *)filt3_global); filt4Reg = _mm_load_si128((__m128i const *)filt4_global); for (i = 0; i < output_height; i++) { srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); // filter the source buffer srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg); srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); // filter the source buffer srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg); srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters); srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters); // add and saturate all the results together minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); // shift by 7 bit each 16 bits srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); // shrink to 8 bit each 16 bits srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); src_ptr += src_pixels_per_line; // save only 8 bytes _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1); output_ptr += output_pitch; } }
void aom_filter_block1d8_v8_intrin_ssse3( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { __m128i addFilterReg64, filtersReg, minReg; __m128i firstFilters, secondFilters, thirdFilters, forthFilters; __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5; __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; __m128i srcReg8; unsigned int i; // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. filtersReg = _mm_packs_epi16(filtersReg, filtersReg); // duplicate only the first 16 bits in the filter firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); // duplicate only the second 16 bits in the filter secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); // duplicate only the third 16 bits in the filter thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); // duplicate only the forth 16 bits in the filter forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); // load the first 7 rows of 8 bytes srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr); srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); for (i = 0; i < output_height; i++) { // load the last 8 bytes srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); // merge the result together srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2); srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4); // merge the result together srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6); srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8); // multiply 2 adjacent elements with the filter and add the result srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters); // add and saturate the results together minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5); srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); // shift by 7 bit each 16 bit srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); // shrink to 8 bit each 16 bits srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); src_ptr += src_pitch; // shift down a row srcReg1 = srcReg2; srcReg2 = srcReg3; srcReg3 = srcReg4; srcReg4 = srcReg5; srcReg5 = srcReg6; srcReg6 = srcReg7; srcReg7 = srcReg8; // save only 8 bytes convolve result _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1); output_ptr += out_pitch; } }
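/*
 * Annotation (not part of the original source): both filter routines above
 * compute, per output pixel, the scalar 8-tap sum
 *
 *   out = clip_u8((f[0]*s[0] + f[1]*s[1] + ... + f[7]*s[7] + 64) >> 7);
 *
 * with addFilterReg64 providing the +64 rounding term. The min/max
 * shuffle of the two middle-tap products appears intended to order the
 * saturating 16-bit additions so that intermediate overflow cannot
 * distort the final result for valid filter inputs.
 */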
/** * \brief Linear interpolation for 4 pixels. Returns 4 filtered pixels in lowest 32-bits of the register. * \param ref_main Reference pixels * \param delta_pos Fractional pixel precise position of sample displacement * \param x Sample offset in direction x in ref_main array */ static INLINE __m128i filter_4x1_avx2(const kvz_pixel *ref_main, int16_t delta_pos, int x){ int8_t delta_int = delta_pos >> 5; int8_t delta_fract = delta_pos & (32-1); __m128i sample0 = _mm_cvtsi32_si128(*(uint32_t*)&(ref_main[x + delta_int])); __m128i sample1 = _mm_cvtsi32_si128(*(uint32_t*)&(ref_main[x + delta_int + 1])); __m128i pairs = _mm_unpacklo_epi8(sample0, sample1); __m128i weight = _mm_set1_epi16( (delta_fract << 8) | (32 - delta_fract) ); sample0 = _mm_maddubs_epi16(pairs, weight); sample0 = _mm_add_epi16(sample0, _mm_set1_epi16(16)); sample0 = _mm_srli_epi16(sample0, 5); sample0 = _mm_packus_epi16(sample0, sample0); return sample0; } /** * \brief Linear interpolation for 4x4 block. Writes filtered 4x4 block to dst. * \param dst Destination buffer * \param ref_main Reference pixels * \param sample_disp Sample displacement per row * \param vertical_mode Mode direction, true if vertical */
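/*
 * Annotation (not part of the original source): scalar equivalent of
 * filter_4x1_avx2 above. Each byte pair (ref[i], ref[i+1]) is weighted by
 * (32 - delta_fract, delta_fract) packed into one 16-bit lane, so a single
 * _mm_maddubs_epi16 computes the 2-tap sum per pixel:
 *
 *   out[x] = ((32 - frac) * ref_main[x + delta_int]
 *             + frac * ref_main[x + delta_int + 1] + 16) >> 5;
 */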
EB_ERRORTYPE GatherSaoStatisticsLcu_OnlyEo_90_45_135_16bit_SSE2_INTRIN( EB_U16 *inputSamplePtr, // input parameter, source Picture Ptr EB_U32 inputStride, // input parameter, source stride EB_U16 *reconSamplePtr, // input parameter, deblocked Picture Ptr EB_U32 reconStride, // input parameter, deblocked stride EB_U32 lcuWidth, // input parameter, LCU width EB_U32 lcuHeight, // input parameter, LCU height EB_S32 eoDiff[SAO_EO_TYPES][SAO_EO_CATEGORIES + 1], // output parameter, used to store Edge Offset diff, eoDiff[SAO_EO_TYPES] [SAO_EO_CATEGORIES] EB_U16 eoCount[SAO_EO_TYPES][SAO_EO_CATEGORIES + 1]) // output parameter, used to store Edge Offset count, eoCount[SAO_EO_TYPES] [SAO_EO_CATEGORIES] { #define boShift 5 EB_ERRORTYPE return_error = EB_ErrorNone; EB_U64 count_x, count_y; EB_S32 diff; __m128i xmm0, xmm_1, xmm_N1, xmm_N3, xmm_N4, xmm_skip_mask, xmm9, xmm10, xmm11, xmm12, xmm13, xmm15; __m128i xmm_temp_input1, xmm_temp_input2, xmm_temp_recon1, xmm_temp_recon2, xmm_diff1, xmm_diff2; __m128i xmm_sign_1, xmm_sign_1a, xmm_sign_1b, xmm_sign_2a, xmm_sign_2b, xmm_sign_2, xmm_eoIndex; xmm0 = _mm_setzero_si128(); xmm12 = _mm_setzero_si128(); xmm15 = _mm_set1_epi16(0x0001); xmm_N1 = _mm_set1_epi8((signed char)0xFF); xmm_N3 = _mm_set1_epi8((signed char)0xFD); xmm_N4 = _mm_set1_epi8((signed char)0xFC); xmm_1 = _mm_sub_epi8(xmm0, xmm_N1); // Initialize SAO Arrays EB_ALIGN(16) EB_S8 rTemp[512] = { 0 }; EB_U64 reconStrideTemp; lcuHeight -= 2; inputSamplePtr += inputStride + 1; reconSamplePtr++; if (lcuWidth == 16) { xmm_skip_mask = _mm_srli_si128(xmm_N1, 2); for (count_y = 0; count_y < lcuHeight; ++count_y) { xmm_temp_recon1 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride)); xmm_temp_recon2 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride + 8)); xmm_temp_input1 = _mm_loadu_si128((__m128i *)(inputSamplePtr)); xmm_temp_input2 = _mm_loadu_si128((__m128i *)(inputSamplePtr + 8)); xmm_diff1 = _mm_sub_epi16(xmm_temp_input1, xmm_temp_recon1); xmm_diff2 = _mm_sub_epi16(xmm_temp_input2, xmm_temp_recon2); xmm_diff2 = _mm_slli_si128(xmm_diff2, 4); //skip last 2 samples xmm_diff2 = _mm_srli_si128(xmm_diff2, 4); //skip last 2 samples // EO-90 MACRO_CALC_EO_INDEX(reconSamplePtr, reconSamplePtr+2*reconStride) xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 2 samples MACRO_GATHER_EO(OFFSET_EO_DIFF_1, OFFSET_EO_COUNT_1) // EO-135 MACRO_CALC_EO_INDEX(reconSamplePtr-1, reconSamplePtr+2*reconStride+1) xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 2 samples MACRO_GATHER_EO(OFFSET_EO_DIFF_2, OFFSET_EO_COUNT_2) // EO-45 MACRO_CALC_EO_INDEX(reconSamplePtr+1, reconSamplePtr+2*reconStride-1) xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 2 samples MACRO_GATHER_EO(OFFSET_EO_DIFF_3, OFFSET_EO_COUNT_3) inputSamplePtr += inputStride; reconSamplePtr += reconStride; } lcuWidth = 2; } else if (lcuWidth == 28) { xmm_skip_mask = _mm_srli_si128(xmm_N1, 6); for (count_y = 0; count_y < lcuHeight; ++count_y) { //----------- 0-15 ----------- xmm_temp_recon1 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride)); xmm_temp_recon2 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride + 8)); xmm_temp_input1 = _mm_loadu_si128((__m128i *)(inputSamplePtr)); xmm_temp_input2 = _mm_loadu_si128((__m128i *)(inputSamplePtr + 8)); xmm_diff1 = _mm_sub_epi16(xmm_temp_input1, xmm_temp_recon1); xmm_diff2 = _mm_sub_epi16(xmm_temp_input2, xmm_temp_recon2); // EO-90
MACRO_CALC_EO_INDEX(reconSamplePtr, reconSamplePtr+2*reconStride) MACRO_GATHER_EO(OFFSET_EO_DIFF_1, OFFSET_EO_COUNT_1) // EO-135 MACRO_CALC_EO_INDEX(reconSamplePtr-1, reconSamplePtr+2*reconStride+1) MACRO_GATHER_EO(OFFSET_EO_DIFF_2, OFFSET_EO_COUNT_2) // EO-45 MACRO_CALC_EO_INDEX(reconSamplePtr+1, reconSamplePtr+2*reconStride-1) MACRO_GATHER_EO(OFFSET_EO_DIFF_3, OFFSET_EO_COUNT_3) //----------- 16-25 ----------- xmm_temp_recon1 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride + 16)); xmm_temp_recon2 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride + 24)); xmm_temp_input1 = _mm_loadu_si128((__m128i *)(inputSamplePtr + 16)); xmm_temp_input2 = _mm_loadu_si128((__m128i *)(inputSamplePtr + 24)); xmm_diff1 = _mm_sub_epi16(xmm_temp_input1, xmm_temp_recon1); xmm_diff2 = _mm_sub_epi16(xmm_temp_input2, xmm_temp_recon2); xmm_diff2 = _mm_slli_si128(xmm_diff2, 12); //skip last 6 samples xmm_diff2 = _mm_srli_si128(xmm_diff2, 12); //skip last 6 samples // EO-90 MACRO_CALC_EO_INDEX(reconSamplePtr+16, reconSamplePtr+2*reconStride+16) xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 6 samples MACRO_GATHER_EO(OFFSET_EO_DIFF_1, OFFSET_EO_COUNT_1) // EO-135 MACRO_CALC_EO_INDEX(reconSamplePtr+15, reconSamplePtr+2*reconStride+17) xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 6 samples MACRO_GATHER_EO(OFFSET_EO_DIFF_2, OFFSET_EO_COUNT_2) // EO-45 MACRO_CALC_EO_INDEX(reconSamplePtr+17, reconSamplePtr+2*reconStride+15) xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 6 samples MACRO_GATHER_EO(OFFSET_EO_DIFF_3, OFFSET_EO_COUNT_3) inputSamplePtr += inputStride; reconSamplePtr += reconStride; } lcuWidth = 6; } else if (lcuWidth == 56) { xmm_skip_mask = _mm_srli_si128(xmm_N1, 10); lcuWidth -= 8; inputStride -= lcuWidth; reconStrideTemp = reconStride - lcuWidth; for (count_y = 0; count_y < lcuHeight; ++count_y) { for (count_x = 0; count_x < lcuWidth; count_x += 16) { //----------- 0-15 ----------- xmm_temp_recon1 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride)); xmm_temp_recon2 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride + 8)); xmm_temp_input1 = _mm_loadu_si128((__m128i *)(inputSamplePtr)); xmm_temp_input2 = _mm_loadu_si128((__m128i *)(inputSamplePtr + 8)); xmm_diff1 = _mm_sub_epi16(xmm_temp_input1, xmm_temp_recon1); xmm_diff2 = _mm_sub_epi16(xmm_temp_input2, xmm_temp_recon2); // EO-90 MACRO_CALC_EO_INDEX(reconSamplePtr, reconSamplePtr + 2 * reconStride) MACRO_GATHER_EO(OFFSET_EO_DIFF_1, OFFSET_EO_COUNT_1) // EO-135 MACRO_CALC_EO_INDEX(reconSamplePtr - 1, reconSamplePtr + 2 * reconStride + 1) MACRO_GATHER_EO(OFFSET_EO_DIFF_2, OFFSET_EO_COUNT_2) // EO-45 MACRO_CALC_EO_INDEX(reconSamplePtr + 1, reconSamplePtr + 2 * reconStride - 1) MACRO_GATHER_EO(OFFSET_EO_DIFF_3, OFFSET_EO_COUNT_3) inputSamplePtr += 16; reconSamplePtr += 16; } //----------- 48-53 ----------- xmm_temp_recon1 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride)); xmm_temp_input1 = _mm_loadu_si128((__m128i *)(inputSamplePtr)); xmm_diff1 = _mm_sub_epi16(xmm_temp_input1, xmm_temp_recon1); xmm_diff1 = _mm_slli_si128(xmm_diff1, 4); //skip last 10 samples xmm_diff1 = _mm_srli_si128(xmm_diff1, 4); //skip last 10 samples // EO-90 MACRO_CALC_EO_INDEX_HALF(reconSamplePtr, reconSamplePtr+2*reconStride) xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 10 samples MACRO_GATHER_EO_HALF(OFFSET_EO_DIFF_1, OFFSET_EO_COUNT_1) // EO-135 MACRO_CALC_EO_INDEX_HALF(reconSamplePtr-1, reconSamplePtr+2*reconStride+1) xmm_eoIndex = 
_mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 10 samples MACRO_GATHER_EO_HALF(OFFSET_EO_DIFF_2, OFFSET_EO_COUNT_2) // EO-45 MACRO_CALC_EO_INDEX_HALF(reconSamplePtr+1, reconSamplePtr+2*reconStride-1) xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 10 samples MACRO_GATHER_EO_HALF(OFFSET_EO_DIFF_3, OFFSET_EO_COUNT_3) inputSamplePtr += inputStride; reconSamplePtr += reconStrideTemp; } lcuWidth = 10; } else { lcuWidth -= 16; inputStride -= lcuWidth; reconStrideTemp = reconStride - lcuWidth; xmm_skip_mask = _mm_srli_si128(xmm_N1, 2); for (count_y = 0; count_y < lcuHeight; ++count_y) { for (count_x = 0; count_x < lcuWidth; count_x += 16) { //----------- 0-15 ----------- xmm_temp_recon1 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride)); xmm_temp_recon2 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride + 8)); xmm_temp_input1 = _mm_loadu_si128((__m128i *)(inputSamplePtr)); xmm_temp_input2 = _mm_loadu_si128((__m128i *)(inputSamplePtr + 8)); xmm_diff1 = _mm_sub_epi16(xmm_temp_input1, xmm_temp_recon1); xmm_diff2 = _mm_sub_epi16(xmm_temp_input2, xmm_temp_recon2); //EO-90 MACRO_CALC_EO_INDEX(reconSamplePtr, reconSamplePtr + 2 * reconStride) MACRO_GATHER_EO(OFFSET_EO_DIFF_1, OFFSET_EO_COUNT_1) //EO-135 MACRO_CALC_EO_INDEX(reconSamplePtr - 1, reconSamplePtr + 2 * reconStride + 1) MACRO_GATHER_EO(OFFSET_EO_DIFF_2, OFFSET_EO_COUNT_2) //EO-45 MACRO_CALC_EO_INDEX(reconSamplePtr + 1, reconSamplePtr + 2 * reconStride - 1) MACRO_GATHER_EO(OFFSET_EO_DIFF_3, OFFSET_EO_COUNT_3) inputSamplePtr += 16; reconSamplePtr += 16; } //----------- 48-61 ----------- xmm_temp_recon1 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride)); xmm_temp_recon2 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride + 8)); xmm_temp_input1 = _mm_loadu_si128((__m128i *)(inputSamplePtr)); xmm_temp_input2 = _mm_loadu_si128((__m128i *)(inputSamplePtr + 8)); xmm_diff1 = _mm_sub_epi16(xmm_temp_input1, xmm_temp_recon1); xmm_diff2 = _mm_sub_epi16(xmm_temp_input2, xmm_temp_recon2); xmm_diff2 = _mm_slli_si128(xmm_diff2, 4); //skip last 2 samples xmm_diff2 = _mm_srli_si128(xmm_diff2, 4); //skip last 2 samples // EO-90 MACRO_CALC_EO_INDEX(reconSamplePtr, reconSamplePtr+2*reconStride) xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 2 samples MACRO_GATHER_EO(OFFSET_EO_DIFF_1, OFFSET_EO_COUNT_1) // EO-135 MACRO_CALC_EO_INDEX(reconSamplePtr-1, reconSamplePtr+2*reconStride+1) xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 2 samples MACRO_GATHER_EO(OFFSET_EO_DIFF_2, OFFSET_EO_COUNT_2) // EO-45 MACRO_CALC_EO_INDEX(reconSamplePtr+1, reconSamplePtr+2*reconStride-1) xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 2 samples MACRO_GATHER_EO(OFFSET_EO_DIFF_3, OFFSET_EO_COUNT_3) inputSamplePtr += inputStride; reconSamplePtr += reconStrideTemp; } lcuWidth = 2; } lcuWidth = (EB_U16)lcuWidth * (EB_U16)lcuHeight; MACRO_SAVE_EO(OFFSET_EO_DIFF_1, OFFSET_EO_COUNT_1, 1) MACRO_SAVE_EO(OFFSET_EO_DIFF_2, OFFSET_EO_COUNT_2, 2) MACRO_SAVE_EO(OFFSET_EO_DIFF_3, OFFSET_EO_COUNT_3, 3) return return_error; }
int p, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh) { DECLARE_ALIGNED(16, unsigned char, flat2_op[7][8]); DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][8]); DECLARE_ALIGNED(16, unsigned char, flat_op[3][8]); DECLARE_ALIGNED(16, unsigned char, flat_oq[3][8]); DECLARE_ALIGNED(16, unsigned char, ap[8][8]); DECLARE_ALIGNED(16, unsigned char, aq[8][8]); __m128i mask, hev, flat, flat2; const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); __m128i p7, p6, p5; __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; __m128i q5, q6, q7; int i = 0; const unsigned int extended_thresh = _thresh[0] * 0x01010101u; const unsigned int extended_limit = _limit[0] * 0x01010101u; const unsigned int extended_blimit = _blimit[0] * 0x01010101u; const __m128i thresh = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0); const __m128i limit = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0); const __m128i blimit = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);
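/*
 * Annotation (not part of the original source): multiplying a byte by
 * 0x01010101u replicates it into all four bytes of a 32-bit word, and
 * _mm_shuffle_epi32(v, 0) then broadcasts that word across the register,
 * so thresh/limit/blimit each hold _thresh[0], _limit[0] and _blimit[0]
 * in all 16 byte lanes -- equivalent to _mm_set1_epi8() on each value.
 */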
static alignment_end* sw_sse2_word (const int8_t* ref, int8_t ref_dir, // 0: forward ref; 1: reverse ref int32_t refLen, int32_t readLen, const uint8_t weight_gapO, /* will be used as - */ const uint8_t weight_gapE, /* will be used as - */ const __m128i* vProfile, uint16_t terminate, int32_t maskLen) { #define max8(m, vm) (vm) = _mm_max_epi16((vm), _mm_srli_si128((vm), 8)); \ (vm) = _mm_max_epi16((vm), _mm_srli_si128((vm), 4)); \ (vm) = _mm_max_epi16((vm), _mm_srli_si128((vm), 2)); \ (m) = _mm_extract_epi16((vm), 0) uint16_t max = 0; /* the max alignment score */ int32_t end_read = readLen - 1; int32_t end_ref = 0; /* 1-based best alignment ending point; initialized to 0, meaning not aligned. */ int32_t segLen = (readLen + 7) / 8; /* number of segments */ /* array to record the largest score of each reference position */ uint16_t* maxColumn = (uint16_t*) calloc(refLen, 2); /* array to record the alignment read ending position of the largest score of each reference position */ int32_t* end_read_column = (int32_t*) calloc(refLen, sizeof(int32_t)); /* Define 16 byte 0 vector. */ __m128i vZero = _mm_set1_epi32(0); __m128i* pvHStore = (__m128i*) calloc(segLen, sizeof(__m128i)); __m128i* pvHLoad = (__m128i*) calloc(segLen, sizeof(__m128i)); __m128i* pvE = (__m128i*) calloc(segLen, sizeof(__m128i)); __m128i* pvHmax = (__m128i*) calloc(segLen, sizeof(__m128i)); int32_t i, j, k; /* 16 byte insertion begin vector */ __m128i vGapO = _mm_set1_epi16(weight_gapO); /* 16 byte insertion extension vector */ __m128i vGapE = _mm_set1_epi16(weight_gapE); __m128i vMaxScore = vZero; /* Trace the highest score of the whole SW matrix. */ __m128i vMaxMark = vZero; /* Trace the highest score till the previous column. */ __m128i vTemp; int32_t edge, begin = 0, end = refLen, step = 1; /* outer loop to process the reference sequence */ if (ref_dir == 1) { begin = refLen - 1; end = -1; step = -1; } for (i = begin; LIKELY(i != end); i += step) { int32_t cmp; __m128i e, vF = vZero; /* Initialize F value to 0. Any errors to vH values will be corrected in the Lazy_F loop. */ __m128i vH = pvHStore[segLen - 1]; vH = _mm_slli_si128 (vH, 2); /* Shift the 128-bit value in vH left by 2 bytes. */ /* Swap the 2 H buffers. */ __m128i* pv = pvHLoad; __m128i vMaxColumn = vZero; /* vMaxColumn is used to record the max values of column i. */ const __m128i* vP = vProfile + ref[i] * segLen; /* Right part of the vProfile */ pvHLoad = pvHStore; pvHStore = pv; /* inner loop to process the query sequence */ for (j = 0; LIKELY(j < segLen); j ++) { vH = _mm_adds_epi16(vH, _mm_load_si128(vP + j)); /* Get max from vH, vE and vF. */ e = _mm_load_si128(pvE + j); vH = _mm_max_epi16(vH, e); vH = _mm_max_epi16(vH, vF); vMaxColumn = _mm_max_epi16(vMaxColumn, vH); /* Save vH values. */ _mm_store_si128(pvHStore + j, vH); /* Update vE value. */ vH = _mm_subs_epu16(vH, vGapO); /* saturation arithmetic, result >= 0 */ e = _mm_max_epi16(e, vH); e = _mm_subs_epu16(e, vGapE); _mm_store_si128(pvE + j, e); /* Update vF value. */ vF = _mm_max_epi16(vF, vH); vF = _mm_subs_epu16(vF, vGapE); /* Load the next vH. */ vH = _mm_load_si128(pvHLoad + j); } /* Lazy_F loop: has been revised to disallow adjacent insertion and then deletion, so don't update E(i, j), learn from SWPS3 */ for (k = 0; LIKELY(k < 8); ++k) { vF = _mm_slli_si128 (vF, 2); for (j = 0; LIKELY(j < segLen); ++j) { vH = _mm_load_si128(pvHStore + j); vH = _mm_max_epi16(vH, vF); _mm_store_si128(pvHStore + j, vH); vH = _mm_subs_epu16(vH, vGapO); vF = _mm_subs_epu16(vF, vGapE); if (UNLIKELY(!
_mm_movemask_epi8(_mm_cmpgt_epi16(vF, vH)))) goto end; } } end: vMaxScore = _mm_max_epi16(vMaxScore, vMaxColumn); vTemp = _mm_cmpeq_epi16(vMaxMark, vMaxScore); cmp = _mm_movemask_epi8(vTemp); if (cmp != 0xffff) { uint16_t temp; vMaxMark = vMaxScore; max8(temp, vMaxScore); vMaxScore = vMaxMark; if (LIKELY(temp > max)) { max = temp; end_ref = i; for (j = 0; LIKELY(j < segLen); ++j) pvHmax[j] = pvHStore[j]; } } /* Record the max score of current column. */ max8(maxColumn[i], vMaxColumn); if (maxColumn[i] == terminate) break; } /* Trace the alignment ending position on read. */ uint16_t *t = (uint16_t*)pvHmax; int32_t column_len = segLen * 8; for (i = 0; LIKELY(i < column_len); ++i, ++t) { int32_t temp; if (*t == max) { temp = i / 8 + i % 8 * segLen; if (temp < end_read) end_read = temp; } } free(pvHmax); free(pvE); free(pvHLoad); free(pvHStore); /* Find the most likely 2nd best alignment. */ alignment_end* bests = (alignment_end*) calloc(2, sizeof(alignment_end)); bests[0].score = max; bests[0].ref = end_ref; bests[0].read = end_read; bests[1].score = 0; bests[1].ref = 0; bests[1].read = 0; edge = (end_ref - maskLen) > 0 ? (end_ref - maskLen) : 0; for (i = 0; i < edge; i ++) { if (maxColumn[i] > bests[1].score) { bests[1].score = maxColumn[i]; bests[1].ref = i; } } edge = (end_ref + maskLen) > refLen ? refLen : (end_ref + maskLen); for (i = edge; i < refLen; i ++) { if (maxColumn[i] > bests[1].score) { bests[1].score = maxColumn[i]; bests[1].ref = i; } } free(maxColumn); free(end_read_column); return bests; }
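/*
 * Annotation (not part of the original source): pvHmax stores the best
 * column in the striped (Farrar) layout, where lane j of segment i holds
 * query position i + j*segLen. The remapping above,
 *
 *   temp = i / 8 + (i % 8) * segLen;
 *
 * converts a flat 16-bit index into pvHmax back into a query coordinate,
 * so the smallest read position that attains the maximum score is kept
 * as end_read.
 */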
int16_t * const restrict F_pr = _F_pr+PAD; #ifdef PARASAIL_TABLE parasail_result_t *result = parasail_result_new_table1(s1Len, s2Len); #else #ifdef PARASAIL_ROWCOL parasail_result_t *result = parasail_result_new_rowcol1(s1Len, s2Len); #else parasail_result_t *result = parasail_result_new(); #endif #endif int32_t i = 0; int32_t j = 0; int32_t end_query = s1Len-1; int32_t end_ref = s2Len-1; int16_t score = NEG_INF; __m128i vNegInf = _mm_set1_epi16(NEG_INF); __m128i vOpen = _mm_set1_epi16(open); __m128i vGap = _mm_set1_epi16(gap); __m128i vOne = _mm_set1_epi16(1); __m128i vN = _mm_set1_epi16(N); __m128i vGapN = _mm_set1_epi16(gap*N); __m128i vNegOne = _mm_set1_epi16(-1); __m128i vI = _mm_set_epi16(0,1,2,3,4,5,6,7); __m128i vJreset = _mm_set_epi16(0,-1,-2,-3,-4,-5,-6,-7); __m128i vMax = vNegInf; __m128i vILimit = _mm_set1_epi16(s1Len); __m128i vILimit1 = _mm_sub_epi16(vILimit, vOne); __m128i vJLimit = _mm_set1_epi16(s2Len); __m128i vJLimit1 = _mm_sub_epi16(vJLimit, vOne); __m128i vIBoundary = _mm_set_epi16( -open-0*gap,
void ulsch_channel_compensation(int **rxdataF_ext, int **ul_ch_estimates_ext, int **ul_ch_mag, int **ul_ch_magb, int **rxdataF_comp, LTE_DL_FRAME_PARMS *frame_parms, unsigned char symbol, unsigned char Qm, unsigned short nb_rb, unsigned char output_shift) { unsigned short rb; __m128i *ul_ch128,*ul_ch_mag128,*ul_ch_mag128b,*rxdataF128,*rxdataF_comp128; unsigned char aarx;//,symbol_mod; // symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol; #ifndef __SSE3__ zeroU = _mm_xor_si128(zeroU,zeroU); #endif // printf("comp: symbol %d\n",symbol); if (Qm == 4) QAM_amp128U = _mm_set1_epi16(QAM16_n1); else if (Qm == 6) { QAM_amp128U = _mm_set1_epi16(QAM64_n1); QAM_amp128bU = _mm_set1_epi16(QAM64_n2); } for (aarx=0;aarx<frame_parms->nb_antennas_rx;aarx++) { ul_ch128 = (__m128i *)&ul_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12]; ul_ch_mag128 = (__m128i *)&ul_ch_mag[aarx][symbol*frame_parms->N_RB_DL*12]; ul_ch_mag128b = (__m128i *)&ul_ch_magb[aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF128 = (__m128i *)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF_comp128 = (__m128i *)&rxdataF_comp[aarx][symbol*frame_parms->N_RB_DL*12]; for (rb=0;rb<nb_rb;rb++) { // printf("comp: symbol %d rb %d\n",symbol,rb); #ifdef OFDMA_ULSCH if (Qm>2) { // get channel amplitude if not QPSK mmtmpU0 = _mm_madd_epi16(ul_ch128[0],ul_ch128[0]); mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift); mmtmpU1 = _mm_madd_epi16(ul_ch128[1],ul_ch128[1]); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift); mmtmpU0 = _mm_packs_epi32(mmtmpU0,mmtmpU1); ul_ch_mag128[0] = _mm_unpacklo_epi16(mmtmpU0,mmtmpU0); ul_ch_mag128b[0] = ul_ch_mag128[0]; ul_ch_mag128[0] = _mm_mulhi_epi16(ul_ch_mag128[0],QAM_amp128U); ul_ch_mag128[0] = _mm_slli_epi16(ul_ch_mag128[0],2); // 2 to compensate the scale channel estimate ul_ch_mag128[1] = _mm_unpackhi_epi16(mmtmpU0,mmtmpU0); ul_ch_mag128b[1] = ul_ch_mag128[1]; ul_ch_mag128[1] = _mm_mulhi_epi16(ul_ch_mag128[1],QAM_amp128U); ul_ch_mag128[1] = _mm_slli_epi16(ul_ch_mag128[1],2); // 2 to compensate the scale channel estimate mmtmpU0 = _mm_madd_epi16(ul_ch128[2],ul_ch128[2]); mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift); mmtmpU1 = _mm_packs_epi32(mmtmpU0,mmtmpU0); ul_ch_mag128[2] = _mm_unpacklo_epi16(mmtmpU1,mmtmpU1); ul_ch_mag128b[2] = ul_ch_mag128[2]; ul_ch_mag128[2] = _mm_mulhi_epi16(ul_ch_mag128[2],QAM_amp128U); ul_ch_mag128[2] = _mm_slli_epi16(ul_ch_mag128[2],2); // 2 to compensate the scale channel estimate ul_ch_mag128b[0] = _mm_mulhi_epi16(ul_ch_mag128b[0],QAM_amp128bU); ul_ch_mag128b[0] = _mm_slli_epi16(ul_ch_mag128b[0],2); // 2 to compensate the scale channel estimate ul_ch_mag128b[1] = _mm_mulhi_epi16(ul_ch_mag128b[1],QAM_amp128bU); ul_ch_mag128b[1] = _mm_slli_epi16(ul_ch_mag128b[1],2); // 2 to compensate the scale channel estimate ul_ch_mag128b[2] = _mm_mulhi_epi16(ul_ch_mag128b[2],QAM_amp128bU); ul_ch_mag128b[2] = _mm_slli_epi16(ul_ch_mag128b[2],2);// 2 to compensate the scale channel estimate } #else mmtmpU0 = _mm_madd_epi16(ul_ch128[0],ul_ch128[0]); mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift-1); mmtmpU1 = _mm_madd_epi16(ul_ch128[1],ul_ch128[1]); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift-1); mmtmpU0 = _mm_packs_epi32(mmtmpU0,mmtmpU1); ul_ch_mag128[0] = _mm_unpacklo_epi16(mmtmpU0,mmtmpU0); ul_ch_mag128[1] = _mm_unpackhi_epi16(mmtmpU0,mmtmpU0); mmtmpU0 = _mm_madd_epi16(ul_ch128[2],ul_ch128[2]); mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift-1); mmtmpU1 = _mm_packs_epi32(mmtmpU0,mmtmpU0); ul_ch_mag128[2] = _mm_unpacklo_epi16(mmtmpU1,mmtmpU1); // printf("comp: 
symbol %d rb %d => %d,%d,%d\n",symbol,rb,*((short*)&ul_ch_mag128[0]),*((short*)&ul_ch_mag128[1]),*((short*)&ul_ch_mag128[2])); #endif // multiply by conjugated channel mmtmpU0 = _mm_madd_epi16(ul_ch128[0],rxdataF128[0]); // print_ints("re",&mmtmpU0); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(ul_ch128[0],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)&conjugate[0]); // print_ints("im",&mmtmpU1); mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[0]); // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift); // print_ints("re(shift)",&mmtmpU0); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift); // print_ints("im(shift)",&mmtmpU1); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); // print_ints("c0",&mmtmpU2); // print_ints("c1",&mmtmpU3); rxdataF_comp128[0] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[0]); // print_shorts("ch:",ul_ch128[0]); // print_shorts("pack:",rxdataF_comp128[0]); // multiply by conjugated channel mmtmpU0 = _mm_madd_epi16(ul_ch128[1],rxdataF128[1]); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(ul_ch128[1],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate); mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[1]); // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); rxdataF_comp128[1] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[1]); // print_shorts("ch:",ul_ch128[1]); // print_shorts("pack:",rxdataF_comp128[1]); // multiply by conjugated channel mmtmpU0 = _mm_madd_epi16(ul_ch128[2],rxdataF128[2]); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(ul_ch128[2],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate); mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[2]); // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); rxdataF_comp128[2] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[2]); // print_shorts("ch:",ul_ch128[2]); // print_shorts("pack:",rxdataF_comp128[2]); ul_ch128+=3; ul_ch_mag128+=3; ul_ch_mag128b+=3; rxdataF128+=3; rxdataF_comp128+=3; } } _mm_empty(); _m_empty(); }
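/*
 * Annotation (not part of the original source): with the channel estimate h
 * and received sample y stored as interleaved 16-bit (re, im) pairs, each
 * "multiply by conjugated channel" block above computes conj(h) * y:
 *
 *   re = h_re*y_re + h_im*y_im   // _mm_madd_epi16(ul_ch128[k], rxdataF128[k])
 *   im = h_re*y_im - h_im*y_re   // madd after swapping re/im of h and
 *                                // flipping one sign via _mm_sign_epi16
 *                                // with the 'conjugate' mask
 *
 * both results are then shifted right by output_shift and repacked into
 * saturated 16-bit pairs with _mm_packs_epi32.
 */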
static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *dst, const int16_t *filter, int w) { const __m128i k_256 = _mm_set1_epi16(1 << 8); const __m128i f_values = _mm_load_si128((const __m128i *)filter); // pack and duplicate the filter values const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); int i; for (i = 0; i < w; i += 16) { const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr); const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)); const __m128i C = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); const __m128i D = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); const __m128i E = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); const __m128i F = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); const __m128i G = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); const __m128i H = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); // merge the result together const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B); const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H); const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B); const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H); // multiply 2 adjacent elements with the filter and add the result const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0); const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6); const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0); const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6); // add and saturate the results together const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo); const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi); // merge the result together const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D); const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D); // multiply 2 adjacent elements with the filter and add the result const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2); const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2); // merge the result together const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F); const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F); // multiply 2 adjacent elements with the filter and add the result const __m128i x2_lo = _mm_maddubs_epi16(s5s4_lo, f5f4); const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4); // add and saturate the results together __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo)); __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi)); // add and saturate the results together temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo)); temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi)); // round and shift by 7 bits each 16 bits temp_lo = _mm_mulhrs_epi16(temp_lo, k_256); temp_hi = _mm_mulhrs_epi16(temp_hi, k_256); // shrink to 8 bits each 16 bits, the first lane contains the first // convolve result and the second lane contains the second convolve // result temp_hi = _mm_packus_epi16(temp_lo, temp_hi); src_ptr += 16; // save 16 bytes convolve result _mm_store_si128((__m128i *)&dst[i], temp_hi); } }
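/*
 * Annotation (not part of the original source): the rounding step above
 * exploits the _mm_mulhrs_epi16 identity with k_256 == 1 << 8:
 *
 *   mulhrs(x, 1 << 8) = (x * (1 << 8) + (1 << 14)) >> 15 = (x + 64) >> 7
 *
 * so a single instruction performs the usual "add 64, arithmetic shift
 * right 7" rounding of the 7-bit fixed-point filter sum.
 */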
void ulsch_channel_compensation_alamouti(int **rxdataF_ext, // For Distributed Alamouti Combining int **ul_ch_estimates_ext_0, int **ul_ch_estimates_ext_1, int **ul_ch_mag_0, int **ul_ch_magb_0, int **ul_ch_mag_1, int **ul_ch_magb_1, int **rxdataF_comp_0, int **rxdataF_comp_1, LTE_DL_FRAME_PARMS *frame_parms, unsigned char symbol, unsigned char Qm, unsigned short nb_rb, unsigned char output_shift_0, unsigned char output_shift_1) { unsigned short rb; __m128i *ul_ch128_0,*ul_ch128_1,*ul_ch_mag128_0,*ul_ch_mag128_1,*ul_ch_mag128b_0,*ul_ch_mag128b_1,*rxdataF128,*rxdataF_comp128_0,*rxdataF_comp128_1; unsigned char aarx;//,symbol_mod; // symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol; #ifndef __SSE3__ zeroU = _mm_xor_si128(zeroU,zeroU); #endif // printf("comp: symbol %d\n",symbol); if (Qm == 4) { QAM_amp128U_0 = _mm_set1_epi16(QAM16_n1); QAM_amp128U_1 = _mm_set1_epi16(QAM16_n1); } else if (Qm == 6) { QAM_amp128U_0 = _mm_set1_epi16(QAM64_n1); QAM_amp128bU_0 = _mm_set1_epi16(QAM64_n2); QAM_amp128U_1 = _mm_set1_epi16(QAM64_n1); QAM_amp128bU_1 = _mm_set1_epi16(QAM64_n2); } for (aarx=0;aarx<frame_parms->nb_antennas_rx;aarx++) { ul_ch128_0 = (__m128i *)&ul_ch_estimates_ext_0[aarx][symbol*frame_parms->N_RB_DL*12]; ul_ch_mag128_0 = (__m128i *)&ul_ch_mag_0[aarx][symbol*frame_parms->N_RB_DL*12]; ul_ch_mag128b_0 = (__m128i *)&ul_ch_magb_0[aarx][symbol*frame_parms->N_RB_DL*12]; ul_ch128_1 = (__m128i *)&ul_ch_estimates_ext_1[aarx][symbol*frame_parms->N_RB_DL*12]; ul_ch_mag128_1 = (__m128i *)&ul_ch_mag_1[aarx][symbol*frame_parms->N_RB_DL*12]; ul_ch_mag128b_1 = (__m128i *)&ul_ch_magb_1[aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF128 = (__m128i *)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF_comp128_0 = (__m128i *)&rxdataF_comp_0[aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF_comp128_1 = (__m128i *)&rxdataF_comp_1[aarx][symbol*frame_parms->N_RB_DL*12]; for (rb=0;rb<nb_rb;rb++) { // printf("comp: symbol %d rb %d\n",symbol,rb); if (Qm>2) { // get channel amplitude if not QPSK mmtmpU0 = _mm_madd_epi16(ul_ch128_0[0],ul_ch128_0[0]); mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_0); mmtmpU1 = _mm_madd_epi16(ul_ch128_0[1],ul_ch128_0[1]); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_0); mmtmpU0 = _mm_packs_epi32(mmtmpU0,mmtmpU1); ul_ch_mag128_0[0] = _mm_unpacklo_epi16(mmtmpU0,mmtmpU0); ul_ch_mag128b_0[0] = ul_ch_mag128_0[0]; ul_ch_mag128_0[0] = _mm_mulhi_epi16(ul_ch_mag128_0[0],QAM_amp128U_0); ul_ch_mag128_0[0] = _mm_slli_epi16(ul_ch_mag128_0[0],2); // 2 to compensate the scaled channel estimate ul_ch_mag128_0[1] = _mm_unpackhi_epi16(mmtmpU0,mmtmpU0); ul_ch_mag128b_0[1] = ul_ch_mag128_0[1]; ul_ch_mag128_0[1] = _mm_mulhi_epi16(ul_ch_mag128_0[1],QAM_amp128U_0); ul_ch_mag128_0[1] = _mm_slli_epi16(ul_ch_mag128_0[1],2); // 2 to compensate the scaled channel estimate mmtmpU0 = _mm_madd_epi16(ul_ch128_0[2],ul_ch128_0[2]); mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_0); mmtmpU1 = _mm_packs_epi32(mmtmpU0,mmtmpU0); ul_ch_mag128_0[2] = _mm_unpacklo_epi16(mmtmpU1,mmtmpU1); ul_ch_mag128b_0[2] = ul_ch_mag128_0[2]; ul_ch_mag128_0[2] = _mm_mulhi_epi16(ul_ch_mag128_0[2],QAM_amp128U_0); ul_ch_mag128_0[2] = _mm_slli_epi16(ul_ch_mag128_0[2],2); // 2 to compensate the scaled channel estimate ul_ch_mag128b_0[0] = _mm_mulhi_epi16(ul_ch_mag128b_0[0],QAM_amp128bU_0); ul_ch_mag128b_0[0] = _mm_slli_epi16(ul_ch_mag128b_0[0],2); // 2 to compensate the scaled channel estimate ul_ch_mag128b_0[1] = _mm_mulhi_epi16(ul_ch_mag128b_0[1],QAM_amp128bU_0); ul_ch_mag128b_0[1] =
_mm_slli_epi16(ul_ch_mag128b_0[1],2); // 2 to compensate the scaled channel estimate ul_ch_mag128b_0[2] = _mm_mulhi_epi16(ul_ch_mag128b_0[2],QAM_amp128bU_0); ul_ch_mag128b_0[2] = _mm_slli_epi16(ul_ch_mag128b_0[2],2); // 2 to compensate the scaled channel estimate mmtmpU0 = _mm_madd_epi16(ul_ch128_1[0],ul_ch128_1[0]); mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_1); mmtmpU1 = _mm_madd_epi16(ul_ch128_1[1],ul_ch128_1[1]); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_1); mmtmpU0 = _mm_packs_epi32(mmtmpU0,mmtmpU1); ul_ch_mag128_1[0] = _mm_unpacklo_epi16(mmtmpU0,mmtmpU0); ul_ch_mag128b_1[0] = ul_ch_mag128_1[0]; ul_ch_mag128_1[0] = _mm_mulhi_epi16(ul_ch_mag128_1[0],QAM_amp128U_1); ul_ch_mag128_1[0] = _mm_slli_epi16(ul_ch_mag128_1[0],2); // 2 to compensate the scaled channel estimate ul_ch_mag128_1[1] = _mm_unpackhi_epi16(mmtmpU0,mmtmpU0); ul_ch_mag128b_1[1] = ul_ch_mag128_1[1]; ul_ch_mag128_1[1] = _mm_mulhi_epi16(ul_ch_mag128_1[1],QAM_amp128U_1); ul_ch_mag128_1[1] = _mm_slli_epi16(ul_ch_mag128_1[1],2); // 2 to compensate the scaled channel estimate mmtmpU0 = _mm_madd_epi16(ul_ch128_1[2],ul_ch128_1[2]); mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_1); mmtmpU1 = _mm_packs_epi32(mmtmpU0,mmtmpU0); ul_ch_mag128_1[2] = _mm_unpacklo_epi16(mmtmpU1,mmtmpU1); ul_ch_mag128b_1[2] = ul_ch_mag128_1[2]; ul_ch_mag128_1[2] = _mm_mulhi_epi16(ul_ch_mag128_1[2],QAM_amp128U_1); ul_ch_mag128_1[2] = _mm_slli_epi16(ul_ch_mag128_1[2],2); // 2 to compensate the scaled channel estimate ul_ch_mag128b_1[0] = _mm_mulhi_epi16(ul_ch_mag128b_1[0],QAM_amp128bU_1); ul_ch_mag128b_1[0] = _mm_slli_epi16(ul_ch_mag128b_1[0],2); // 2 to compensate the scaled channel estimate ul_ch_mag128b_1[1] = _mm_mulhi_epi16(ul_ch_mag128b_1[1],QAM_amp128bU_1); ul_ch_mag128b_1[1] = _mm_slli_epi16(ul_ch_mag128b_1[1],2); // 2 to compensate the scaled channel estimate ul_ch_mag128b_1[2] = _mm_mulhi_epi16(ul_ch_mag128b_1[2],QAM_amp128bU_1); ul_ch_mag128b_1[2] = _mm_slli_epi16(ul_ch_mag128b_1[2],2); // 2 to compensate the scaled channel estimate } /************************For Computing (y)*(h0*)********************************************/ // multiply by conjugated channel mmtmpU0 = _mm_madd_epi16(ul_ch128_0[0],rxdataF128[0]); // print_ints("re",&mmtmpU0); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(ul_ch128_0[0],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)&conjugate[0]); // print_ints("im",&mmtmpU1); mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[0]); // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_0); // print_ints("re(shift)",&mmtmpU0); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_0); // print_ints("im(shift)",&mmtmpU1); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); // print_ints("c0",&mmtmpU2); // print_ints("c1",&mmtmpU3); rxdataF_comp128_0[0] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[0]); // print_shorts("ch:",ul_ch128_0[0]); // print_shorts("pack:",rxdataF_comp128_0[0]); // multiply by conjugated channel mmtmpU0 = _mm_madd_epi16(ul_ch128_0[1],rxdataF128[1]); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(ul_ch128_0[1],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate); mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[1]); // mmtmpU1
contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_0); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_0); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); rxdataF_comp128_0[1] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[1]); // print_shorts("ch:",ul_ch128_0[1]); // print_shorts("pack:",rxdataF_comp128_0[1]); // multiply by conjugated channel mmtmpU0 = _mm_madd_epi16(ul_ch128_0[2],rxdataF128[2]); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(ul_ch128_0[2],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate); mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[2]); // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_0); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_0); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); rxdataF_comp128_0[2] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[2]); // print_shorts("ch:",ul_ch128_0[2]); // print_shorts("pack:",rxdataF_comp128_0[2]); /*************************For Computing (y*)*(h1)************************************/ // multiply by conjugated signal mmtmpU0 = _mm_madd_epi16(ul_ch128_1[0],rxdataF128[0]); // print_ints("re",&mmtmpU0); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(rxdataF128[0],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)&conjugate[0]); // print_ints("im",&mmtmpU1); mmtmpU1 = _mm_madd_epi16(mmtmpU1,ul_ch128_1[0]); // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_1); // print_ints("re(shift)",&mmtmpU0); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_1); // print_ints("im(shift)",&mmtmpU1); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); // print_ints("c0",&mmtmpU2); // print_ints("c1",&mmtmpU3); rxdataF_comp128_1[0] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[0]); // print_shorts("ch_conjugate:",ul_ch128_1[0]); // print_shorts("pack:",rxdataF_comp128_1[0]); // multiply by conjugated signal mmtmpU0 = _mm_madd_epi16(ul_ch128_1[1],rxdataF128[1]); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(rxdataF128[1],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate); mmtmpU1 = _mm_madd_epi16(mmtmpU1,ul_ch128_1[1]); // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_1); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_1); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); rxdataF_comp128_1[1] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[1]); // print_shorts("ch_conjugate:",ul_ch128_1[1]); // print_shorts("pack:",rxdataF_comp128_1[1]); // multiply by conjugated signal mmtmpU0 = _mm_madd_epi16(ul_ch128_1[2],rxdataF128[2]); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(rxdataF128[2],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate); mmtmpU1 = 
_mm_madd_epi16(mmtmpU1,ul_ch128_1[2]); // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_1); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_1); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); rxdataF_comp128_1[2] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[2]); // print_shorts("ch_conjugate:",ul_ch128_0[2]); // print_shorts("pack:",rxdataF_comp128_1[2]); ul_ch128_0+=3; ul_ch_mag128_0+=3; ul_ch_mag128b_0+=3; ul_ch128_1+=3; ul_ch_mag128_1+=3; ul_ch_mag128b_1+=3; rxdataF128+=3; rxdataF_comp128_0+=3; rxdataF_comp128_1+=3; } } _mm_empty(); _m_empty(); }
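/* A minimal, self-contained sketch of the conjugate-multiply idiom used
 * above: each 32-bit half of a lane holds one complex int16 sample (re,im),
 * _mm_madd_epi16 produces the real parts directly, and a swapped,
 * sign-flipped copy produces the imaginary parts. Function and buffer names
 * are illustrative, not from the original code; SSSE3 is required for
 * _mm_sign_epi16, just as above. */
#include <tmmintrin.h> /* SSSE3 */
#include <stdint.h>

/* out[k] = conj(h[k]) * y[k] for 4 complex int16 samples, scaled by >> shift */
static void cmul_conj4(const int16_t *h, const int16_t *y, int16_t *out, int shift)
{
    /* (-1,+1,...) pattern: negates the swapped imaginary terms */
    const __m128i vconj = _mm_setr_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
    __m128i vh = _mm_loadu_si128((const __m128i *)h);
    __m128i vy = _mm_loadu_si128((const __m128i *)y);
    __m128i re = _mm_madd_epi16(vh, vy);                   /* h.re*y.re + h.im*y.im */
    __m128i sw = _mm_shufflelo_epi16(vh, _MM_SHUFFLE(2, 3, 0, 1));
    sw = _mm_shufflehi_epi16(sw, _MM_SHUFFLE(2, 3, 0, 1)); /* (re,im) -> (im,re) */
    sw = _mm_sign_epi16(sw, vconj);                        /* -> (-im,re) */
    __m128i im = _mm_madd_epi16(sw, vy);                   /* h.re*y.im - h.im*y.re */
    re = _mm_srai_epi32(re, shift);
    im = _mm_srai_epi32(im, shift);
    /* interleave back to (re,im) int16 pairs with saturation */
    _mm_storeu_si128((__m128i *)out,
                     _mm_packs_epi32(_mm_unpacklo_epi32(re, im),
                                     _mm_unpackhi_epi32(re, im)));
}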
static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
  // This implementation makes use of 16-bit fixed point versions of two
  // multiply constants:
  //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
  //    K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16
  //
  // To be able to use signed 16-bit integers, we use the following trick to
  // have constants within range:
  // - Associated constants are obtained by subtracting the 16-bit fixed point
  //   version of one:
  //      k = K - (1 << 16)  =>  K = k + (1 << 16)
  //      K1 = 85627  =>  k1 = 20091
  //      K2 = 35468  =>  k2 = -30068
  // - The multiplication of a variable by a constant becomes the sum of the
  //   variable and the multiplication of that variable by the associated
  //   constant:
  //      (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k) >> 16) + x
  const __m128i k1 = _mm_set1_epi16(20091);
  const __m128i k2 = _mm_set1_epi16(-30068);
  __m128i T0, T1, T2, T3;

  // Load and concatenate the transform coefficients (we'll do two transforms
  // in parallel). In the case of only one transform, the second half of the
  // vectors will just contain random values we'll never use nor store.
  __m128i in0, in1, in2, in3;
  {
    in0 = _mm_loadl_epi64((__m128i*)&in[0]);
    in1 = _mm_loadl_epi64((__m128i*)&in[4]);
    in2 = _mm_loadl_epi64((__m128i*)&in[8]);
    in3 = _mm_loadl_epi64((__m128i*)&in[12]);
    // a00 a10 a20 a30   x x x x
    // a01 a11 a21 a31   x x x x
    // a02 a12 a22 a32   x x x x
    // a03 a13 a23 a33   x x x x
    if (do_two) {
      const __m128i inB0 = _mm_loadl_epi64((__m128i*)&in[16]);
      const __m128i inB1 = _mm_loadl_epi64((__m128i*)&in[20]);
      const __m128i inB2 = _mm_loadl_epi64((__m128i*)&in[24]);
      const __m128i inB3 = _mm_loadl_epi64((__m128i*)&in[28]);
      in0 = _mm_unpacklo_epi64(in0, inB0);
      in1 = _mm_unpacklo_epi64(in1, inB1);
      in2 = _mm_unpacklo_epi64(in2, inB2);
      in3 = _mm_unpacklo_epi64(in3, inB3);
      // a00 a10 a20 a30   b00 b10 b20 b30
      // a01 a11 a21 a31   b01 b11 b21 b31
      // a02 a12 a22 a32   b02 b12 b22 b32
      // a03 a13 a23 a33   b03 b13 b23 b33
    }
  }

  // Vertical pass and subsequent transpose.
  {
    // First pass, c and d calculations are longer because of the "trick"
    // multiplications.
    const __m128i a = _mm_add_epi16(in0, in2);
    const __m128i b = _mm_sub_epi16(in0, in2);
    // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
    const __m128i c1 = _mm_mulhi_epi16(in1, k2);
    const __m128i c2 = _mm_mulhi_epi16(in3, k1);
    const __m128i c3 = _mm_sub_epi16(in1, in3);
    const __m128i c4 = _mm_sub_epi16(c1, c2);
    const __m128i c = _mm_add_epi16(c3, c4);
    // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
    const __m128i d1 = _mm_mulhi_epi16(in1, k1);
    const __m128i d2 = _mm_mulhi_epi16(in3, k2);
    const __m128i d3 = _mm_add_epi16(in1, in3);
    const __m128i d4 = _mm_add_epi16(d1, d2);
    const __m128i d = _mm_add_epi16(d3, d4);

    // Second pass.
    const __m128i tmp0 = _mm_add_epi16(a, d);
    const __m128i tmp1 = _mm_add_epi16(b, c);
    const __m128i tmp2 = _mm_sub_epi16(b, c);
    const __m128i tmp3 = _mm_sub_epi16(a, d);

    // Transpose the two 4x4.
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33
    const __m128i transpose0_0 = _mm_unpacklo_epi16(tmp0, tmp1);
    const __m128i transpose0_1 = _mm_unpacklo_epi16(tmp2, tmp3);
    const __m128i transpose0_2 = _mm_unpackhi_epi16(tmp0, tmp1);
    const __m128i transpose0_3 = _mm_unpackhi_epi16(tmp2, tmp3);
    // a00 a10 a01 a11   a02 a12 a03 a13
    // a20 a30 a21 a31   a22 a32 a23 a33
    // b00 b10 b01 b11   b02 b12 b03 b13
    // b20 b30 b21 b31   b22 b32 b23 b33
    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
    // a00 a10 a20 a30   a01 a11 a21 a31
    // b00 b10 b20 b30   b01 b11 b21 b31
    // a02 a12 a22 a32   a03 a13 a23 a33
    // b02 b12 b22 b32   b03 b13 b23 b33
    T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
    T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
    T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
    T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
    // a00 a10 a20 a30   b00 b10 b20 b30
    // a01 a11 a21 a31   b01 b11 b21 b31
    // a02 a12 a22 a32   b02 b12 b22 b32
    // a03 a13 a23 a33   b03 b13 b23 b33
  }

  // Horizontal pass and subsequent transpose.
  {
    // First pass, c and d calculations are longer because of the "trick"
    // multiplications.
    const __m128i four = _mm_set1_epi16(4);
    const __m128i dc = _mm_add_epi16(T0, four);
    const __m128i a = _mm_add_epi16(dc, T2);
    const __m128i b = _mm_sub_epi16(dc, T2);
    // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
    const __m128i c1 = _mm_mulhi_epi16(T1, k2);
    const __m128i c2 = _mm_mulhi_epi16(T3, k1);
    const __m128i c3 = _mm_sub_epi16(T1, T3);
    const __m128i c4 = _mm_sub_epi16(c1, c2);
    const __m128i c = _mm_add_epi16(c3, c4);
    // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
    const __m128i d1 = _mm_mulhi_epi16(T1, k1);
    const __m128i d2 = _mm_mulhi_epi16(T3, k2);
    const __m128i d3 = _mm_add_epi16(T1, T3);
    const __m128i d4 = _mm_add_epi16(d1, d2);
    const __m128i d = _mm_add_epi16(d3, d4);

    // Second pass.
    const __m128i tmp0 = _mm_add_epi16(a, d);
    const __m128i tmp1 = _mm_add_epi16(b, c);
    const __m128i tmp2 = _mm_sub_epi16(b, c);
    const __m128i tmp3 = _mm_sub_epi16(a, d);
    const __m128i shifted0 = _mm_srai_epi16(tmp0, 3);
    const __m128i shifted1 = _mm_srai_epi16(tmp1, 3);
    const __m128i shifted2 = _mm_srai_epi16(tmp2, 3);
    const __m128i shifted3 = _mm_srai_epi16(tmp3, 3);

    // Transpose the two 4x4.
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33
    const __m128i transpose0_0 = _mm_unpacklo_epi16(shifted0, shifted1);
    const __m128i transpose0_1 = _mm_unpacklo_epi16(shifted2, shifted3);
    const __m128i transpose0_2 = _mm_unpackhi_epi16(shifted0, shifted1);
    const __m128i transpose0_3 = _mm_unpackhi_epi16(shifted2, shifted3);
    // a00 a10 a01 a11   a02 a12 a03 a13
    // a20 a30 a21 a31   a22 a32 a23 a33
    // b00 b10 b01 b11   b02 b12 b03 b13
    // b20 b30 b21 b31   b22 b32 b23 b33
    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
    // a00 a10 a20 a30   a01 a11 a21 a31
    // b00 b10 b20 b30   b01 b11 b21 b31
    // a02 a12 a22 a32   a03 a13 a23 a33
    // b02 b12 b22 b32   b03 b13 b23 b33
    T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
    T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
    T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
    T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
    // a00 a10 a20 a30   b00 b10 b20 b30
    // a01 a11 a21 a31   b01 b11 b21 b31
    // a02 a12 a22 a32   b02 b12 b22 b32
    // a03 a13 a23 a33   b03 b13 b23 b33
  }

  // Add inverse transform to 'dst' and store.
  {
    const __m128i zero = _mm_setzero_si128();
    // Load the reference(s).
    __m128i dst0, dst1, dst2, dst3;
    if (do_two) {
      // Load eight bytes/pixels per line.
      dst0 = _mm_loadl_epi64((__m128i*)&dst[0 * BPS]);
      dst1 = _mm_loadl_epi64((__m128i*)&dst[1 * BPS]);
      dst2 = _mm_loadl_epi64((__m128i*)&dst[2 * BPS]);
      dst3 = _mm_loadl_epi64((__m128i*)&dst[3 * BPS]);
    } else {
      // Load four bytes/pixels per line.
      dst0 = _mm_cvtsi32_si128(*(int*)&dst[0 * BPS]);
      dst1 = _mm_cvtsi32_si128(*(int*)&dst[1 * BPS]);
      dst2 = _mm_cvtsi32_si128(*(int*)&dst[2 * BPS]);
      dst3 = _mm_cvtsi32_si128(*(int*)&dst[3 * BPS]);
    }
    // Convert to 16b.
    dst0 = _mm_unpacklo_epi8(dst0, zero);
    dst1 = _mm_unpacklo_epi8(dst1, zero);
    dst2 = _mm_unpacklo_epi8(dst2, zero);
    dst3 = _mm_unpacklo_epi8(dst3, zero);
    // Add the inverse transform(s).
    dst0 = _mm_add_epi16(dst0, T0);
    dst1 = _mm_add_epi16(dst1, T1);
    dst2 = _mm_add_epi16(dst2, T2);
    dst3 = _mm_add_epi16(dst3, T3);
    // Unsigned saturate to 8b.
    dst0 = _mm_packus_epi16(dst0, dst0);
    dst1 = _mm_packus_epi16(dst1, dst1);
    dst2 = _mm_packus_epi16(dst2, dst2);
    dst3 = _mm_packus_epi16(dst3, dst3);
    // Store the results.
    if (do_two) {
      // Store eight bytes/pixels per line.
      _mm_storel_epi64((__m128i*)&dst[0 * BPS], dst0);
      _mm_storel_epi64((__m128i*)&dst[1 * BPS], dst1);
      _mm_storel_epi64((__m128i*)&dst[2 * BPS], dst2);
      _mm_storel_epi64((__m128i*)&dst[3 * BPS], dst3);
    } else {
      // Store four bytes/pixels per line.
      *((int32_t *)&dst[0 * BPS]) = _mm_cvtsi128_si32(dst0);
      *((int32_t *)&dst[1 * BPS]) = _mm_cvtsi128_si32(dst1);
      *((int32_t *)&dst[2 * BPS]) = _mm_cvtsi128_si32(dst2);
      *((int32_t *)&dst[3 * BPS]) = _mm_cvtsi128_si32(dst3);
    }
  }
}
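/* Sketch of the "trick" constant multiply documented above: a fixed-point
 * constant K >= 2^15 does not fit in a signed 16-bit lane, so with
 * k = K - (1 << 16) one gets (x * K) >> 16 == ((x * k) >> 16) + x exactly,
 * and _mm_mulhi_epi16 supplies the (x * k) >> 16 term. The sum is assumed
 * to stay within int16 range, as the transform's input bounds guarantee. */
#include <emmintrin.h>

static __m128i mul_85627_shr16(__m128i x)  /* (x * K1) >> 16, K1 = 85627 */
{
    const __m128i k1 = _mm_set1_epi16(20091);        /* 85627 - 65536 */
    return _mm_add_epi16(x, _mm_mulhi_epi16(x, k1));
}

static __m128i mul_35468_shr16(__m128i x)  /* (x * K2) >> 16, K2 = 35468 */
{
    const __m128i k2 = _mm_set1_epi16(-30068);       /* 35468 - 65536 */
    return _mm_add_epi16(x, _mm_mulhi_epi16(x, k2));
}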
// Bundles multiple (1, 2, 4 or 8) pixels into a single pixel. static void BundleColorMap_SSE2(const uint8_t* const row, int width, int xbits, uint32_t* dst) { int x; assert(xbits >= 0); assert(xbits <= 3); switch (xbits) { case 0: { const __m128i ff = _mm_set1_epi16(0xff00); const __m128i zero = _mm_setzero_si128(); // Store 0xff000000 | (row[x] << 8). for (x = 0; x + 16 <= width; x += 16, dst += 16) { const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]); const __m128i in_lo = _mm_unpacklo_epi8(zero, in); const __m128i dst0 = _mm_unpacklo_epi16(in_lo, ff); const __m128i dst1 = _mm_unpackhi_epi16(in_lo, ff); const __m128i in_hi = _mm_unpackhi_epi8(zero, in); const __m128i dst2 = _mm_unpacklo_epi16(in_hi, ff); const __m128i dst3 = _mm_unpackhi_epi16(in_hi, ff); _mm_storeu_si128((__m128i*)&dst[0], dst0); _mm_storeu_si128((__m128i*)&dst[4], dst1); _mm_storeu_si128((__m128i*)&dst[8], dst2); _mm_storeu_si128((__m128i*)&dst[12], dst3); } break; } case 1: { const __m128i ff = _mm_set1_epi16(0xff00); const __m128i mul = _mm_set1_epi16(0x110); for (x = 0; x + 16 <= width; x += 16, dst += 8) { // 0a0b | (where a/b are 4 bits). const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]); const __m128i tmp = _mm_mullo_epi16(in, mul); // aba0 const __m128i pack = _mm_and_si128(tmp, ff); // ab00 const __m128i dst0 = _mm_unpacklo_epi16(pack, ff); const __m128i dst1 = _mm_unpackhi_epi16(pack, ff); _mm_storeu_si128((__m128i*)&dst[0], dst0); _mm_storeu_si128((__m128i*)&dst[4], dst1); } break; } case 2: { const __m128i mask_or = _mm_set1_epi32(0xff000000); const __m128i mul_cst = _mm_set1_epi16(0x0104); const __m128i mask_mul = _mm_set1_epi16(0x0f00); for (x = 0; x + 16 <= width; x += 16, dst += 4) { // 000a000b000c000d | (where a/b/c/d are 2 bits). const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]); const __m128i mul = _mm_mullo_epi16(in, mul_cst); // 00ab00b000cd00d0 const __m128i tmp = _mm_and_si128(mul, mask_mul); // 00ab000000cd0000 const __m128i shift = _mm_srli_epi32(tmp, 12); // 00000000ab000000 const __m128i pack = _mm_or_si128(shift, tmp); // 00000000abcd0000 // Convert to 0xff00**00. const __m128i res = _mm_or_si128(pack, mask_or); _mm_storeu_si128((__m128i*)dst, res); } break; } default: { assert(xbits == 3); for (x = 0; x + 16 <= width; x += 16, dst += 2) { // 0000000a00000000b... | (where a/b are 1 bit). const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]); const __m128i shift = _mm_slli_epi64(in, 7); const uint32_t move = _mm_movemask_epi8(shift); dst[0] = 0xff000000 | ((move & 0xff) << 8); dst[1] = 0xff000000 | (move & 0xff00); } break; } } if (x != width) { VP8LBundleColorMap_C(row + x, width - x, xbits, dst); } }
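/* Sketch of the xbits == 3 path above: when every source pixel is a 0/1
 * value, shifting bit 0 up to bit 7 of each byte and collecting the sign
 * bits with _mm_movemask_epi8 packs 16 pixels into 16 bits in two
 * instructions. Names are illustrative. */
#include <emmintrin.h>
#include <stdint.h>

/* Pack 16 bytes holding 0 or 1 into a 16-bit mask (pixel i -> bit i). */
static uint32_t pack16_bits(const uint8_t *row)
{
    const __m128i in = _mm_loadu_si128((const __m128i *)row);
    /* only bit 0 is set per byte, so the 64-bit shift cannot leak across bytes */
    const __m128i shifted = _mm_slli_epi64(in, 7);
    return (uint32_t)_mm_movemask_epi8(shifted);
}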
/*---------------------------------------------------------------------------*/ PRIM_STATIC pstatus_t sse2_yCbCrToRGB_16s16s_P3P3( const INT16 *pSrc[3], int srcStep, INT16 *pDst[3], int dstStep, const prim_size_t *roi) /* region of interest */ { __m128i zero, max, r_cr, g_cb, g_cr, b_cb, c4096; __m128i *y_buf, *cb_buf, *cr_buf, *r_buf, *g_buf, *b_buf; int srcbump, dstbump, yp, imax; if (((ULONG_PTR) (pSrc[0]) & 0x0f) || ((ULONG_PTR) (pSrc[1]) & 0x0f) || ((ULONG_PTR) (pSrc[2]) & 0x0f) || ((ULONG_PTR) (pDst[0]) & 0x0f) || ((ULONG_PTR) (pDst[1]) & 0x0f) || ((ULONG_PTR) (pDst[2]) & 0x0f) || (roi->width & 0x07) || (srcStep & 127) || (dstStep & 127)) { /* We can't maintain 16-byte alignment. */ return general_yCbCrToRGB_16s16s_P3P3(pSrc, srcStep, pDst, dstStep, roi); } zero = _mm_setzero_si128(); max = _mm_set1_epi16(255); y_buf = (__m128i*) (pSrc[0]); cb_buf = (__m128i*) (pSrc[1]); cr_buf = (__m128i*) (pSrc[2]); r_buf = (__m128i*) (pDst[0]); g_buf = (__m128i*) (pDst[1]); b_buf = (__m128i*) (pDst[2]); r_cr = _mm_set1_epi16(22986); /* 1.403 << 14 */ g_cb = _mm_set1_epi16(-5636); /* -0.344 << 14 */ g_cr = _mm_set1_epi16(-11698); /* -0.714 << 14 */ b_cb = _mm_set1_epi16(28999); /* 1.770 << 14 */ c4096 = _mm_set1_epi16(4096); srcbump = srcStep / sizeof(__m128i); dstbump = dstStep / sizeof(__m128i); #ifdef DO_PREFETCH /* Prefetch Y's, Cb's, and Cr's. */ for (yp=0; yp<roi->height; yp++) { int i; for (i=0; i<roi->width * sizeof(INT16) / sizeof(__m128i); i += (CACHE_LINE_BYTES / sizeof(__m128i))) { _mm_prefetch((char*)(&y_buf[i]), _MM_HINT_NTA); _mm_prefetch((char*)(&cb_buf[i]), _MM_HINT_NTA); _mm_prefetch((char*)(&cr_buf[i]), _MM_HINT_NTA); } y_buf += srcbump; cb_buf += srcbump; cr_buf += srcbump; } y_buf = (__m128i*) (pSrc[0]); cb_buf = (__m128i*) (pSrc[1]); cr_buf = (__m128i*) (pSrc[2]); #endif /* DO_PREFETCH */ imax = roi->width * sizeof(INT16) / sizeof(__m128i); for (yp=0; yp<roi->height; ++yp) { int i; for (i=0; i<imax; i++) { /* In order to use SSE2 signed 16-bit integer multiplication * we need to convert the floating point factors to signed int * without losing information. * The result of this multiplication is 32 bit and we have two * SSE instructions that return either the hi or lo word. * Thus we will multiply the factors by the highest possible 2^n, * take the upper 16 bits of the signed 32-bit result * (_mm_mulhi_epi16) and correct this result by multiplying * it by 2^(16-n). * * For the given factors in the conversion matrix the best * possible n is 14. 
* * Example for calculating r: * r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3 */ /* y = (y_r_buf[i] + 4096) >> 2 */ __m128i y, cb, cr, r, g, b; y = _mm_load_si128(y_buf + i); y = _mm_add_epi16(y, c4096); y = _mm_srai_epi16(y, 2); /* cb = cb_g_buf[i]; */ cb = _mm_load_si128(cb_buf + i); /* cr = cr_b_buf[i]; */ cr = _mm_load_si128(cr_buf + i); /* (y + HIWORD(cr*22986)) >> 3 */ r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr)); r = _mm_srai_epi16(r, 3); /* r_buf[i] = MINMAX(r, 0, 255); */ _mm_between_epi16(r, zero, max); _mm_store_si128(r_buf + i, r); /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */ g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb)); g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr)); g = _mm_srai_epi16(g, 3); /* g_buf[i] = MINMAX(g, 0, 255); */ _mm_between_epi16(g, zero, max); _mm_store_si128(g_buf + i, g); /* (y + HIWORD(cb*28999)) >> 3 */ b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb)); b = _mm_srai_epi16(b, 3); /* b_buf[i] = MINMAX(b, 0, 255); */ _mm_between_epi16(b, zero, max); _mm_store_si128(b_buf + i, b); } y_buf += srcbump; cb_buf += srcbump; cr_buf += srcbump; r_buf += dstbump; g_buf += dstbump; b_buf += dstbump; } return PRIMITIVES_SUCCESS; }
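/* Scalar restatement of the fixed-point derivation in the comment above:
 * 1.403 is stored as 22986 = round(1.403 * 2^14), _mm_mulhi_epi16 keeps the
 * upper 16 bits of the 32-bit product, and the leftover factor 2^(16-14) is
 * folded into the final shifts. Names are illustrative. */
#include <stdint.h>

static int16_t red_from_y_cr(int16_t y, int16_t cr)
{
    int32_t y2 = ((int32_t)y + 4096) >> 2;     /* ~((y >> 5) + 128) << 3 */
    int32_t hi = ((int32_t)cr * 22986) >> 16;  /* HIWORD(cr * 1.403 * 2^14) */
    int32_t r = (y2 + hi) >> 3;                /* cf. r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3 */
    if (r < 0) r = 0;                          /* the SSE path clamps via _mm_between_epi16 */
    if (r > 255) r = 255;
    return (int16_t)r;
}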
void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { __m128i zero; (void)scan_ptr; coeff_ptr += n_coeffs; iscan_ptr += n_coeffs; qcoeff_ptr += n_coeffs; dqcoeff_ptr += n_coeffs; n_coeffs = -n_coeffs; zero = _mm_setzero_si128(); if (!skip_block) { __m128i eob; __m128i zbin; __m128i round, quant, dequant, shift; { __m128i coeff0, coeff1; // Setup global values { __m128i pw_1; zbin = _mm_load_si128((const __m128i *)zbin_ptr); round = _mm_load_si128((const __m128i *)round_ptr); quant = _mm_load_si128((const __m128i *)quant_ptr); pw_1 = _mm_set1_epi16(1); zbin = _mm_sub_epi16(zbin, pw_1); dequant = _mm_load_si128((const __m128i *)dequant_ptr); shift = _mm_load_si128((const __m128i *)quant_shift_ptr); } { __m128i coeff0_sign, coeff1_sign; __m128i qcoeff0, qcoeff1; __m128i qtmp0, qtmp1; __m128i cmp_mask0, cmp_mask1; // Do DC and first 15 AC coeff0 = load_tran_low(coeff_ptr + n_coeffs); coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); // Poor man's sign extract coeff0_sign = _mm_srai_epi16(coeff0, 15); coeff1_sign = _mm_srai_epi16(coeff1, 15); qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); qcoeff0 = _mm_adds_epi16(qcoeff0, round); round = _mm_unpackhi_epi64(round, round); qcoeff1 = _mm_adds_epi16(qcoeff1, round); qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); quant = _mm_unpackhi_epi64(quant, quant); qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); qtmp0 = _mm_add_epi16(qtmp0, qcoeff0); qtmp1 = _mm_add_epi16(qtmp1, qcoeff1); qcoeff0 = _mm_mulhi_epi16(qtmp0, shift); shift = _mm_unpackhi_epi64(shift, shift); qcoeff1 = _mm_mulhi_epi16(qtmp1, shift); // Reinsert signs qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign); qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign); qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); // Mask out zbin threshold coeffs qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); dequant = _mm_unpackhi_epi64(dequant, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); } { // Scan for eob __m128i zero_coeff0, zero_coeff1; __m128i nzero_coeff0, nzero_coeff1; __m128i iscan0, iscan1; __m128i eob1; zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); // Add one to convert from indices to counts iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); eob = _mm_and_si128(iscan0, nzero_coeff0); eob1 = _mm_and_si128(iscan1, nzero_coeff1); eob = _mm_max_epi16(eob, eob1); } n_coeffs += 8 * 2; } // AC only loop while 
(n_coeffs < 0) { __m128i coeff0, coeff1; { __m128i coeff0_sign, coeff1_sign; __m128i qcoeff0, qcoeff1; __m128i qtmp0, qtmp1; __m128i cmp_mask0, cmp_mask1; coeff0 = load_tran_low(coeff_ptr + n_coeffs); coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); // Poor man's sign extract coeff0_sign = _mm_srai_epi16(coeff0, 15); coeff1_sign = _mm_srai_epi16(coeff1, 15); qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); qcoeff0 = _mm_adds_epi16(qcoeff0, round); qcoeff1 = _mm_adds_epi16(qcoeff1, round); qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); qtmp0 = _mm_add_epi16(qtmp0, qcoeff0); qtmp1 = _mm_add_epi16(qtmp1, qcoeff1); qcoeff0 = _mm_mulhi_epi16(qtmp0, shift); qcoeff1 = _mm_mulhi_epi16(qtmp1, shift); // Reinsert signs qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign); qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign); qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); // Mask out zbin threshold coeffs qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); } { // Scan for eob __m128i zero_coeff0, zero_coeff1; __m128i nzero_coeff0, nzero_coeff1; __m128i iscan0, iscan1; __m128i eob0, eob1; zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); // Add one to convert from indices to counts iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); eob0 = _mm_and_si128(iscan0, nzero_coeff0); eob1 = _mm_and_si128(iscan1, nzero_coeff1); eob0 = _mm_max_epi16(eob0, eob1); eob = _mm_max_epi16(eob, eob0); } n_coeffs += 8 * 2; } // Accumulate EOB { __m128i eob_shuffled; eob_shuffled = _mm_shuffle_epi32(eob, 0xe); eob = _mm_max_epi16(eob, eob_shuffled); eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); eob = _mm_max_epi16(eob, eob_shuffled); eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); eob = _mm_max_epi16(eob, eob_shuffled); *eob_ptr = _mm_extract_epi16(eob, 1); } } else { do { store_tran_low(zero, dqcoeff_ptr + n_coeffs); store_tran_low(zero, dqcoeff_ptr + n_coeffs + 8); store_tran_low(zero, qcoeff_ptr + n_coeffs); store_tran_low(zero, qcoeff_ptr + n_coeffs + 8); n_coeffs += 8 * 2; } while (n_coeffs < 0); *eob_ptr = 0; } }
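/* Sketch of the "poor man's sign extract" used twice above: srai by 15
 * broadcasts the sign bit (0x0000 or 0xFFFF per lane), and the xor/sub pair
 * then computes |x|; applying the same pair again re-inserts the sign. Note
 * that, like _mm_abs_epi16, it leaves -32768 unchanged. */
#include <emmintrin.h>

static __m128i abs16(__m128i x)
{
    const __m128i sign = _mm_srai_epi16(x, 15);          /* 0 or -1 per lane */
    return _mm_sub_epi16(_mm_xor_si128(x, sign), sign);  /* (x ^ s) - s */
}

static __m128i apply_sign16(__m128i mag, __m128i from)
{
    const __m128i sign = _mm_srai_epi16(from, 15);
    return _mm_sub_epi16(_mm_xor_si128(mag, sign), sign);
}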
/* The encoded YCbCr coefficients are represented as 11.5 fixed-point
 * numbers. See the general code above. */
PRIM_STATIC pstatus_t sse2_RGBToYCbCr_16s16s_P3P3(
    const INT16 *pSrc[3],
    int srcStep,
    INT16 *pDst[3],
    int dstStep,
    const prim_size_t *roi)  /* region of interest */
{
    __m128i min, max, y_r, y_g, y_b, cb_r, cb_g, cb_b, cr_r, cr_g, cr_b;
    __m128i *r_buf, *g_buf, *b_buf, *y_buf, *cb_buf, *cr_buf;
    int srcbump, dstbump, yp, imax;

    if (((ULONG_PTR) (pSrc[0]) & 0x0f)
        || ((ULONG_PTR) (pSrc[1]) & 0x0f)
        || ((ULONG_PTR) (pSrc[2]) & 0x0f)
        || ((ULONG_PTR) (pDst[0]) & 0x0f)
        || ((ULONG_PTR) (pDst[1]) & 0x0f)
        || ((ULONG_PTR) (pDst[2]) & 0x0f)
        || (roi->width & 0x07)
        || (srcStep & 127)
        || (dstStep & 127))
    {
        /* We can't maintain 16-byte alignment. */
        return general_RGBToYCbCr_16s16s_P3P3(pSrc, srcStep, pDst, dstStep, roi);
    }

    min = _mm_set1_epi16(-128 << 5);
    max = _mm_set1_epi16(127 << 5);
    r_buf  = (__m128i*) (pSrc[0]);
    g_buf  = (__m128i*) (pSrc[1]);
    b_buf  = (__m128i*) (pSrc[2]);
    y_buf  = (__m128i*) (pDst[0]);
    cb_buf = (__m128i*) (pDst[1]);
    cr_buf = (__m128i*) (pDst[2]);
    y_r  = _mm_set1_epi16(9798);    /*  0.299000 << 15 */
    y_g  = _mm_set1_epi16(19235);   /*  0.587000 << 15 */
    y_b  = _mm_set1_epi16(3735);    /*  0.114000 << 15 */
    cb_r = _mm_set1_epi16(-5535);   /* -0.168935 << 15 */
    cb_g = _mm_set1_epi16(-10868);  /* -0.331665 << 15 */
    cb_b = _mm_set1_epi16(16403);   /*  0.500590 << 15 */
    cr_r = _mm_set1_epi16(16377);   /*  0.499813 << 15 */
    cr_g = _mm_set1_epi16(-13714);  /* -0.418531 << 15 */
    cr_b = _mm_set1_epi16(-2663);   /* -0.081282 << 15 */
    srcbump = srcStep / sizeof(__m128i);
    dstbump = dstStep / sizeof(__m128i);

#ifdef DO_PREFETCH
    /* Prefetch RGB's. */
    for (yp=0; yp<roi->height; yp++)
    {
        int i;
        for (i=0; i<roi->width * sizeof(INT16) / sizeof(__m128i);
            i += (CACHE_LINE_BYTES / sizeof(__m128i)))
        {
            _mm_prefetch((char*)(&r_buf[i]), _MM_HINT_NTA);
            _mm_prefetch((char*)(&g_buf[i]), _MM_HINT_NTA);
            _mm_prefetch((char*)(&b_buf[i]), _MM_HINT_NTA);
        }
        r_buf += srcbump;
        g_buf += srcbump;
        b_buf += srcbump;
    }
    r_buf = (__m128i*) (pSrc[0]);
    g_buf = (__m128i*) (pSrc[1]);
    b_buf = (__m128i*) (pSrc[2]);
#endif /* DO_PREFETCH */

    imax = roi->width * sizeof(INT16) / sizeof(__m128i);
    for (yp=0; yp<roi->height; ++yp)
    {
        int i;
        for (i=0; i<imax; i++)
        {
            /* In order to use SSE2 signed 16-bit integer multiplication we
             * need to convert the floating point factors to signed int
             * without losing information. The result of this multiplication
             * is 32 bit and using SSE2 we get either the product's hi or lo
             * word. Thus we will multiply the factors by the highest
             * possible 2^n and take the upper 16 bits of the signed 32-bit
             * result (_mm_mulhi_epi16). Since the final result needs to
             * be scaled by << 5 and also in order to keep the precision
             * within the upper 16 bits we will also have to scale the RGB
             * values used in the multiplication by << 5+(16-n).
             */
            __m128i r, g, b, y, cb, cr;
            r = _mm_load_si128(r_buf+i);
            g = _mm_load_si128(g_buf+i);
            b = _mm_load_si128(b_buf+i);
            /* r<<6; g<<6; b<<6 */
            r = _mm_slli_epi16(r, 6);
            g = _mm_slli_epi16(g, 6);
            b = _mm_slli_epi16(b, 6);
            /* y = HIWORD(r*y_r) + HIWORD(g*y_g) + HIWORD(b*y_b) + min */
            y = _mm_mulhi_epi16(r, y_r);
            y = _mm_add_epi16(y, _mm_mulhi_epi16(g, y_g));
            y = _mm_add_epi16(y, _mm_mulhi_epi16(b, y_b));
            y = _mm_add_epi16(y, min);
            /* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
            _mm_between_epi16(y, min, max);
            _mm_store_si128(y_buf+i, y);
            /* cb = HIWORD(r*cb_r) + HIWORD(g*cb_g) + HIWORD(b*cb_b) */
            cb = _mm_mulhi_epi16(r, cb_r);
            cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g));
            cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b));
            /* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */
            _mm_between_epi16(cb, min, max);
            _mm_store_si128(cb_buf+i, cb);
            /* cr = HIWORD(r*cr_r) + HIWORD(g*cr_g) + HIWORD(b*cr_b) */
            cr = _mm_mulhi_epi16(r, cr_r);
            cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g));
            cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b));
            /* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */
            _mm_between_epi16(cr, min, max);
            _mm_store_si128(cr_buf+i, cr);
        }
        r_buf  += srcbump;
        g_buf  += srcbump;
        b_buf  += srcbump;
        y_buf  += dstbump;
        cb_buf += dstbump;
        cr_buf += dstbump;
    }
    return PRIMITIVES_SUCCESS;
}
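/* _mm_between_epi16() is a helper macro defined elsewhere in these
 * primitives; the code above only needs "clamp every lane to [lo, hi]",
 * which a max/min pair provides. A plausible stand-in (assumption, not the
 * library's exact definition): */
#include <emmintrin.h>

#define MM_BETWEEN_EPI16(v, lo, hi) \
    do { (v) = _mm_min_epi16((hi), _mm_max_epi16((v), (lo))); } while (0)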
static void rfx_dwt_2d_decode_block_horiz_sse2(sint16* l, sint16* h, sint16* dst, int subband_width)
{
    int y, n;
    sint16* l_ptr = l;
    sint16* h_ptr = h;
    sint16* dst_ptr = dst;
    int first;
    int last;
    __m128i l_n;
    __m128i h_n;
    __m128i h_n_m;
    __m128i tmp_n;
    __m128i dst_n;
    __m128i dst_n_p;
    __m128i dst1;
    __m128i dst2;

    for (y = 0; y < subband_width; y++)
    {
        /* Even coefficients */
        for (n = 0; n < subband_width; n+=8)
        {
            /* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */
            l_n = _mm_load_si128((__m128i*) l_ptr);
            h_n = _mm_load_si128((__m128i*) h_ptr);
            h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - 1));
            if (n == 0)
            {
                first = _mm_extract_epi16(h_n_m, 1);
                h_n_m = _mm_insert_epi16(h_n_m, first, 0);
            }
            tmp_n = _mm_add_epi16(h_n, h_n_m);
            tmp_n = _mm_add_epi16(tmp_n, _mm_set1_epi16(1));
            tmp_n = _mm_srai_epi16(tmp_n, 1);
            dst_n = _mm_sub_epi16(l_n, tmp_n);
            _mm_store_si128((__m128i*) l_ptr, dst_n);
            l_ptr+=8;
            h_ptr+=8;
        }
        l_ptr -= subband_width;
        h_ptr -= subband_width;

        /* Odd coefficients */
        for (n = 0; n < subband_width; n+=8)
        {
            /* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */
            h_n = _mm_load_si128((__m128i*) h_ptr);
            h_n = _mm_slli_epi16(h_n, 1);
            dst_n = _mm_load_si128((__m128i*) (l_ptr));
            dst_n_p = _mm_loadu_si128((__m128i*) (l_ptr + 1));
            if (n == subband_width - 8)
            {
                last = _mm_extract_epi16(dst_n_p, 6);
                dst_n_p = _mm_insert_epi16(dst_n_p, last, 7);
            }
            tmp_n = _mm_add_epi16(dst_n_p, dst_n);
            tmp_n = _mm_srai_epi16(tmp_n, 1);
            tmp_n = _mm_add_epi16(tmp_n, h_n);
            dst1 = _mm_unpacklo_epi16(dst_n, tmp_n);
            dst2 = _mm_unpackhi_epi16(dst_n, tmp_n);
            _mm_store_si128((__m128i*) dst_ptr, dst1);
            _mm_store_si128((__m128i*) (dst_ptr + 8), dst2);
            l_ptr+=8;
            h_ptr+=8;
            dst_ptr+=16;
        }
    }
}
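/* Scalar reference for the inverse lifting step vectorized above, using the
 * same edge convention that the _mm_insert_epi16 fixups emulate: h[-1] is
 * mirrored to h[0] on the left, and the dst[2n+2] term at the right edge
 * falls back to dst[2n]. Offered as a plain sketch for one row. */
static void idwt_horiz_ref(const short *l, const short *h, short *dst, int subband_width)
{
    int n;
    /* even: dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1) */
    for (n = 0; n < subband_width; n++) {
        short h_m = (n == 0) ? h[0] : h[n - 1];
        dst[2 * n] = (short)(l[n] - ((h_m + h[n] + 1) >> 1));
    }
    /* odd: dst[2n+1] = (h[n] << 1) + ((dst[2n] + dst[2n+2]) >> 1) */
    for (n = 0; n < subband_width; n++) {
        short next = (n == subband_width - 1) ? dst[2 * n] : dst[2 * n + 2];
        dst[2 * n + 1] = (short)((h[n] << 1) + ((dst[2 * n] + next) >> 1));
    }
}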
void ahd_interpolate_tile(int top, char * buffer)
{
  int row, col, tr, tc, c, val;
  const int dir[4] = { -1, 1, -width, width };
  __m128i ldiff[2], abdiff[2];
  union hvrgbpix (*rgb)[width] = (union hvrgbpix (*)[width])buffer;
  union hvrgbpix *rix;
  union rgbpix * pix;
  union hvrgbpix (*lab)[width];
  short (*lix)[8];
  char (*homo)[width][2];
  lab  = (union hvrgbpix (*)[width])(buffer + 16*width*TS);
  homo = (char (*)[width][2])(buffer + 32*width*TS);

  const int left=2;
  if ((uintptr_t)(image+top*width)&0xf || (uintptr_t)buffer&0xf) {
    fprintf(stderr, "unaligned buffers defeat speed!\n");
    abort();
  }

  /* Interpolate green horz&vert, red and blue, and convert to CIELab: */
  //do the first two rows of green first.
  //then one green, and rgb through the tile.. this because R/B needs down-right green value
  for (row=top; row < top+2 && row < height-2; row++) {
    col = left + (FC(row,left) & 1);
    for (c = FC(row,col); col < width-2; col+=2) {
      pix = (union rgbpix*)image + row*width+col;
      val = ((pix[-1].g + pix[0].c[c] + pix[1].g) * 2 - pix[-2].c[c] - pix[2].c[c]) >> 2;
      rgb[row-top][col-left].h.g = ULIM(val,pix[-1].g,pix[1].g);
      val = ((pix[-width].g + pix[0].c[c] + pix[width].g) * 2 - pix[-2*width].c[c] - pix[2*width].c[c]) >> 2;
      rgb[row-top][col-left].v.g = ULIM(val,pix[-width].g,pix[width].g);
    }
  }

  for (; row < top+TS && row < height-2; row++) {
    int rowx = row-1;
    if (FC(rowx,left+1)==1) {
      int c1 = FC(rowx+1,left+1),
          c2 = FC(rowx,left+2);
      pix = (union rgbpix*)image + row*width+left+1;
      rix = &rgb[row-top][1];
      val = ((pix[-1].g + pix[0].c[c1] + pix[1].g) * 2 - pix[-2].c[c1] - pix[2].c[c1]) >> 2;
      rix[0].h.g = ULIM(val,pix[-1].g,pix[1].g);
      val = ((pix[-width].g + pix[0].c[c1] + pix[width].g) * 2 - pix[-2*width].c[c1] - pix[2*width].c[c1]) >> 2;
      rix[0].v.g = ULIM(val,pix[-width].g,pix[width].g);

      for (col=left+1; col < width-3; col+=2) {
        pix = (union rgbpix*)image + rowx*width+col+1;
        union hvrgbpix rixr, rix0;
        rix = &rgb[rowx-top][col-left]+1;
        signed pix_diag = pix[-width-1].c[c1] + pix[-width+1].c[c1];
        signed pix_ul = pix[-width-1].c[c1];
        rixr.vec = _mm_set1_epi16(pix[-1].g);
        signed pix_lr = pix[-2].c[c2] + pix[0].c[c2];
        rix0.h.c[c2] = rix0.v.c[c2] = pix[0].c[c2];
        pix_diag += pix[width-1].c[c1] + pix[width+1].c[c1] + 1;
        signed pix_dl = pix[width-1].c[c1];

        //fully loaded
        __m128i rix_dr = _mm_setr_epi32(pix[width].g, pix[width-1].c[c1], pix[1].g, pix[-width+1].c[c1]);
        rix_dr = _mm_add_epi32(rix_dr, _mm_setr_epi32(pix[width+1].c[c1], pix[width+3].c[c1], pix[width+1].c[c1], 0));
        rix_dr = _mm_add_epi32(rix_dr, _mm_setr_epi32(pix[width+2].g, 0, pix[2*width+1].g, pix[3*width+1].c[c1]));
        rix_dr = _mm_mullo_epi32(rix_dr, _mm_setr_epi32(2,1,2,1));
        //half loaded
        rix_dr = _mm_hsub_epi32(rix_dr, _mm_setzero_si128());
        rix_dr = _mm_srai_epi32(rix_dr, 2);
        __m128i a = _mm_setr_epi32(pix[width].g,pix[1].g,0,0);
        __m128i b = _mm_setr_epi32(pix[width+2].g,pix[2*width+1].g,0,0);
        __m128i m = _mm_min_epi32(a,b);
        __m128i M = _mm_max_epi32(a,b);
        rix_dr = _mm_min_epi32(rix_dr, M);
        rix_dr = _mm_max_epi32(rix_dr, m);

        signed pix_udr = pix_ul + pix_dl;

        signed rix0_ul = rix[-width-1].h.g;
        signed rix1_ul = rix[-width-1].v.g;
        __m128i rix_ur = _mm_setr_epi32(rix[-width+1].h.g, rix[-width+1].v.g, 0, 0);
        signed rix0_rr = rix[-2].h.g;
        signed rix1_rr = rix[-2].v.g;

        rix0.h.g = rix[0].h.g;
        rix0.v.g = rix[0].v.g;
        signed rix0_dl = rix[width-1].h.g;
        signed rix1_dl = rix[width-1].v.g;

        // fully loaded
        __m128i rix_udr = _mm_setr_epi32(rix0_ul, rix1_ul, rix0_rr, rix1_rr);
        rix_udr = _mm_add_epi32(rix_udr, _mm_setr_epi32(rix0_dl, rix1_dl, rix0.h.g, rix0.v.g));
        __m128i v2 =
_mm_set_epi32(pix_lr, pix_lr, pix_udr, pix_udr); v2 = _mm_sub_epi32(v2, rix_udr); v2 = _mm_srai_epi32(v2,1); v2 = _mm_add_epi32(v2,_mm_cvtepu16_epi32(rixr.vec)); v2 = _mm_max_epi32(v2, _mm_setzero_si128()); v2 = _mm_min_epi32(v2, _mm_set1_epi32(0xffff)); rixr.h.c[c2] = _mm_extract_epi32(v2,2); rixr.v.c[c2] = _mm_extract_epi32(v2,3); rixr.h.c[c1] = _mm_extract_epi32(v2,0); rixr.v.c[c1] = _mm_extract_epi32(v2,1); // following only uses 64 bit __m128i v1 = _mm_set1_epi32(pix_diag); v1 = _mm_sub_epi32(v1, rix_ur); v1 = _mm_sub_epi32(v1, rix_dr); v1 = _mm_sub_epi32(v1, rix_udr); v1 = _mm_srai_epi32(v1,2); v1 = _mm_add_epi32(v1, _mm_setr_epi32(rix0.h.g, rix0.v.g, 0, 0)); v1 = _mm_max_epi32(v1, _mm_setzero_si128()); v1 = _mm_min_epi32(v1, _mm_set1_epi32(0xffff)); rix0.h.c[c1] = _mm_extract_epi32(v1,0); rix0.v.c[c1] = _mm_extract_epi32(v1,1); lab[rowx-top][col-left].vec = cielabv(rixr); lab[rowx-top][col-left+1].vec = cielabv(rix0); _mm_store_si128(&rix[-1].vec,rixr.vec); _mm_store_si128(&rix[0].vec,rix0.vec); rix[width+1].h.g = _mm_extract_epi32(rix_dr,0); rix[width+1].v.g = _mm_extract_epi32(rix_dr,1); } } else {
static void rfx_decode_ycbcr_to_rgb_sse2(sint16* y_r_buffer, sint16* cb_g_buffer, sint16* cr_b_buffer) { __m128i zero = _mm_setzero_si128(); __m128i max = _mm_set1_epi16(255); __m128i* y_r_buf = (__m128i*) y_r_buffer; __m128i* cb_g_buf = (__m128i*) cb_g_buffer; __m128i* cr_b_buf = (__m128i*) cr_b_buffer; __m128i y; __m128i cr; __m128i cb; __m128i r; __m128i g; __m128i b; int i; for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i))) { _mm_prefetch((char*)(&y_r_buf[i]), _MM_HINT_NTA); _mm_prefetch((char*)(&cb_g_buf[i]), _MM_HINT_NTA); _mm_prefetch((char*)(&cr_b_buf[i]), _MM_HINT_NTA); } for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i++) { /* y = (y_r_buf[i] >> 5) + 128; */ y = _mm_load_si128(&y_r_buf[i]); y = _mm_add_epi16(_mm_srai_epi16(y, 5), _mm_set1_epi16(128)); /* cr = cr_b_buf[i]; */ cr = _mm_load_si128(&cr_b_buf[i]); /* r = y + ((cr >> 5) + (cr >> 7) + (cr >> 8) + (cr >> 11) + (cr >> 12) + (cr >> 13)); */ /* y_r_buf[i] = MINMAX(r, 0, 255); */ r = _mm_add_epi16(y, _mm_srai_epi16(cr, 5)); r = _mm_add_epi16(r, _mm_srai_epi16(cr, 7)); r = _mm_add_epi16(r, _mm_srai_epi16(cr, 8)); r = _mm_add_epi16(r, _mm_srai_epi16(cr, 11)); r = _mm_add_epi16(r, _mm_srai_epi16(cr, 12)); r = _mm_add_epi16(r, _mm_srai_epi16(cr, 13)); _mm_between_epi16(r, zero, max); _mm_store_si128(&y_r_buf[i], r); /* cb = cb_g_buf[i]; */ cb = _mm_load_si128(&cb_g_buf[i]); /* g = y - ((cb >> 7) + (cb >> 9) + (cb >> 10)) - ((cr >> 6) + (cr >> 8) + (cr >> 9) + (cr >> 11) + (cr >> 12) + (cr >> 13)); */ /* cb_g_buf[i] = MINMAX(g, 0, 255); */ g = _mm_sub_epi16(y, _mm_srai_epi16(cb, 7)); g = _mm_sub_epi16(g, _mm_srai_epi16(cb, 9)); g = _mm_sub_epi16(g, _mm_srai_epi16(cb, 10)); g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 6)); g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 8)); g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 9)); g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 11)); g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 12)); g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 13)); _mm_between_epi16(g, zero, max); _mm_store_si128(&cb_g_buf[i], g); /* b = y + ((cb >> 5) + (cb >> 6) + (cb >> 7) + (cb >> 11) + (cb >> 13)); */ /* cr_b_buf[i] = MINMAX(b, 0, 255); */ b = _mm_add_epi16(y, _mm_srai_epi16(cb, 5)); b = _mm_add_epi16(b, _mm_srai_epi16(cb, 6)); b = _mm_add_epi16(b, _mm_srai_epi16(cb, 7)); b = _mm_add_epi16(b, _mm_srai_epi16(cb, 11)); b = _mm_add_epi16(b, _mm_srai_epi16(cb, 13)); _mm_between_epi16(b, zero, max); _mm_store_si128(&cr_b_buf[i], b); } }
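/* The shift/add chains above are a multiplier-free approximation of the
 * YCbCr matrix: for the red channel, 2^-5 + 2^-7 + 2^-8 + 2^-11 + 2^-12 +
 * 2^-13 ~= 0.043823, and 0.043823 * 32 ~= 1.4023, i.e. cr * 1.403 applied
 * to values carrying the << 5 fixed-point scale used here. A scalar
 * equivalent of the red path, with illustrative names: */
#include <stdint.h>

static int16_t red_shift_approx(int16_t y_scaled, int16_t cr)
{
    int r = (y_scaled >> 5) + 128;               /* remove the << 5 scale */
    r += (cr >> 5) + (cr >> 7) + (cr >> 8)
       + (cr >> 11) + (cr >> 12) + (cr >> 13);   /* ~= (cr * 1.403) >> 5 */
    if (r < 0) r = 0;
    if (r > 255) r = 255;
    return (int16_t)r;
}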
void vp10_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type) { __m128i in[2]; const __m128i zero = _mm_setzero_si128(); const __m128i eight = _mm_set1_epi16(8); in[0] = load_input_data(input); in[1] = load_input_data(input + 8); switch (tx_type) { case 0: // DCT_DCT idct4_sse2(in); idct4_sse2(in); break; case 1: // ADST_DCT idct4_sse2(in); iadst4_sse2(in); break; case 2: // DCT_ADST iadst4_sse2(in); idct4_sse2(in); break; case 3: // ADST_ADST iadst4_sse2(in); iadst4_sse2(in); break; default: assert(0); break; } // Final round and shift in[0] = _mm_add_epi16(in[0], eight); in[1] = _mm_add_epi16(in[1], eight); in[0] = _mm_srai_epi16(in[0], 4); in[1] = _mm_srai_epi16(in[1], 4); // Reconstruction and Store { __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); d0 = _mm_unpacklo_epi32(d0, _mm_cvtsi32_si128(*(const int *)(dest + stride))); d2 = _mm_unpacklo_epi32( d2, _mm_cvtsi32_si128(*(const int *)(dest + stride * 3))); d0 = _mm_unpacklo_epi8(d0, zero); d2 = _mm_unpacklo_epi8(d2, zero); d0 = _mm_add_epi16(d0, in[0]); d2 = _mm_add_epi16(d2, in[1]); d0 = _mm_packus_epi16(d0, d2); // store result[0] *(int *)dest = _mm_cvtsi128_si32(d0); // store result[1] d0 = _mm_srli_si128(d0, 4); *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); // store result[2] d0 = _mm_srli_si128(d0, 4); *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); // store result[3] d0 = _mm_srli_si128(d0, 4); *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); } }
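/* Sketch of the reconstruct-and-store pattern above: widen four predicted
 * pixels to 16 bits against zero, add the residual row, and let
 * _mm_packus_epi16 perform the clamp to [0, 255]. Names are illustrative. */
#include <emmintrin.h>
#include <stdint.h>

static void add_residual4(uint8_t *dst, __m128i res /* 4 x int16 in the low half */)
{
    const __m128i zero = _mm_setzero_si128();
    __m128i d = _mm_cvtsi32_si128(*(const int *)dst);  /* 4 packed pixels */
    d = _mm_unpacklo_epi8(d, zero);                    /* u8 -> i16 */
    d = _mm_add_epi16(d, res);
    d = _mm_packus_epi16(d, d);                        /* unsigned saturate */
    *(int *)dst = _mm_cvtsi128_si32(d);
}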
void process_sse2(dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const void *const ivoid, void *const ovoid, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out) { const dt_iop_rawprepare_data_t *const d = (dt_iop_rawprepare_data_t *)piece->data; // fprintf(stderr, "roi in %d %d %d %d\n", roi_in->x, roi_in->y, roi_in->width, roi_in->height); // fprintf(stderr, "roi out %d %d %d %d\n", roi_out->x, roi_out->y, roi_out->width, roi_out->height); const float scale = roi_in->scale / piece->iscale; const int csx = (int)roundf((float)d->x * scale), csy = (int)roundf((float)d->y * scale); if(!dt_dev_pixelpipe_uses_downsampled_input(piece->pipe) && piece->pipe->filters) { // raw mosaic #ifdef _OPENMP #pragma omp parallel for default(none) schedule(static) #endif for(int j = 0; j < roi_out->height; j++) { const uint16_t *in = ((uint16_t *)ivoid) + ((size_t)roi_in->width * (j + csy) + csx); float *out = ((float *)ovoid) + (size_t)roi_out->width * j; int i = 0; // FIXME: figure alignment! !!! replace with for !!! while((!dt_is_aligned(in, 16) || !dt_is_aligned(out, 16)) && (i < roi_out->width)) { const int id = BL(roi_out, d, j, i); *out = (((float)(*in)) - d->sub[id]) / d->div[id]; i++; in++; out++; } const __m128 sub = _mm_set_ps(d->sub[BL(roi_out, d, j, i + 3)], d->sub[BL(roi_out, d, j, i + 2)], d->sub[BL(roi_out, d, j, i + 1)], d->sub[BL(roi_out, d, j, i)]); const __m128 div = _mm_set_ps(d->div[BL(roi_out, d, j, i + 3)], d->div[BL(roi_out, d, j, i + 2)], d->div[BL(roi_out, d, j, i + 1)], d->div[BL(roi_out, d, j, i)]); // process aligned pixels with SSE for(; i < roi_out->width - (8 - 1); i += 8, in += 8) { const __m128i input = _mm_load_si128((__m128i *)in); __m128i ilo = _mm_unpacklo_epi16(input, _mm_set1_epi16(0)); __m128i ihi = _mm_unpackhi_epi16(input, _mm_set1_epi16(0)); __m128 flo = _mm_cvtepi32_ps(ilo); __m128 fhi = _mm_cvtepi32_ps(ihi); flo = _mm_div_ps(_mm_sub_ps(flo, sub), div); fhi = _mm_div_ps(_mm_sub_ps(fhi, sub), div); _mm_stream_ps(out, flo); out += 4; _mm_stream_ps(out, fhi); out += 4; } // process the rest for(; i < roi_out->width; i++, in++, out++) { const int id = BL(roi_out, d, j, i); *out = MAX(0.0f, ((float)(*in)) - d->sub[id]) / d->div[id]; } } piece->pipe->filters = dt_rawspeed_crop_dcraw_filters(self->dev->image_storage.filters, csx, csy); adjust_xtrans_filters(piece->pipe, csx, csy); } else { // pre-downsampled buffer that needs black/white scaling const __m128 sub = _mm_load_ps(d->sub), div = _mm_load_ps(d->div); #ifdef _OPENMP #pragma omp parallel for default(none) schedule(static) #endif for(int j = 0; j < roi_out->height; j++) { const float *in = ((float *)ivoid) + (size_t)4 * (roi_in->width * (j + csy) + csx); float *out = ((float *)ovoid) + (size_t)4 * roi_out->width * j; // process aligned pixels with SSE for(int i = 0; i < roi_out->width; i++, in += 4, out += 4) { const __m128 input = _mm_load_ps(in); const __m128 scaled = _mm_div_ps(_mm_sub_ps(input, sub), div); _mm_stream_ps(out, scaled); } } } _mm_sfence(); }
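/* Sketch of the uint16 -> float widening used in the mosaic path above:
 * unpacking against zero yields exact 32-bit values (no sign-extension
 * issue for u16), _mm_cvtepi32_ps converts, and sub/div apply the black
 * level and white point. Names are illustrative; unaligned load/store are
 * used here to keep the sketch self-contained. */
#include <emmintrin.h>
#include <stdint.h>

static void u16_to_float8(const uint16_t *in, float *out, __m128 sub, __m128 div)
{
    const __m128i zero = _mm_setzero_si128();
    const __m128i v = _mm_loadu_si128((const __m128i *)in);
    const __m128 lo = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v, zero));
    const __m128 hi = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v, zero));
    _mm_storeu_ps(out, _mm_div_ps(_mm_sub_ps(lo, sub), div));
    _mm_storeu_ps(out + 4, _mm_div_ps(_mm_sub_ps(hi, sub), div));
}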
void vp10_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type) { __m128i in[8]; const __m128i zero = _mm_setzero_si128(); const __m128i final_rounding = _mm_set1_epi16(1 << 4); // load input data in[0] = load_input_data(input); in[1] = load_input_data(input + 8 * 1); in[2] = load_input_data(input + 8 * 2); in[3] = load_input_data(input + 8 * 3); in[4] = load_input_data(input + 8 * 4); in[5] = load_input_data(input + 8 * 5); in[6] = load_input_data(input + 8 * 6); in[7] = load_input_data(input + 8 * 7); switch (tx_type) { case 0: // DCT_DCT idct8_sse2(in); idct8_sse2(in); break; case 1: // ADST_DCT idct8_sse2(in); iadst8_sse2(in); break; case 2: // DCT_ADST iadst8_sse2(in); idct8_sse2(in); break; case 3: // ADST_ADST iadst8_sse2(in); iadst8_sse2(in); break; default: assert(0); break; } // Final rounding and shift in[0] = _mm_adds_epi16(in[0], final_rounding); in[1] = _mm_adds_epi16(in[1], final_rounding); in[2] = _mm_adds_epi16(in[2], final_rounding); in[3] = _mm_adds_epi16(in[3], final_rounding); in[4] = _mm_adds_epi16(in[4], final_rounding); in[5] = _mm_adds_epi16(in[5], final_rounding); in[6] = _mm_adds_epi16(in[6], final_rounding); in[7] = _mm_adds_epi16(in[7], final_rounding); in[0] = _mm_srai_epi16(in[0], 5); in[1] = _mm_srai_epi16(in[1], 5); in[2] = _mm_srai_epi16(in[2], 5); in[3] = _mm_srai_epi16(in[3], 5); in[4] = _mm_srai_epi16(in[4], 5); in[5] = _mm_srai_epi16(in[5], 5); in[6] = _mm_srai_epi16(in[6], 5); in[7] = _mm_srai_epi16(in[7], 5); RECON_AND_STORE(dest + 0 * stride, in[0]); RECON_AND_STORE(dest + 1 * stride, in[1]); RECON_AND_STORE(dest + 2 * stride, in[2]); RECON_AND_STORE(dest + 3 * stride, in[3]); RECON_AND_STORE(dest + 4 * stride, in[4]); RECON_AND_STORE(dest + 5 * stride, in[5]); RECON_AND_STORE(dest + 6 * stride, in[6]); RECON_AND_STORE(dest + 7 * stride, in[7]); }
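/* RECON_AND_STORE is a macro defined elsewhere in the library; the code
 * above only relies on it adding one row of residuals to eight destination
 * pixels with unsigned saturation. A plausible equivalent, offered as an
 * assumption rather than the library's exact definition: */
#include <emmintrin.h>

#define RECON_AND_STORE_SKETCH(dest, in_x)                       \
  do {                                                           \
    __m128i d = _mm_loadl_epi64((const __m128i *)(dest));        \
    d = _mm_unpacklo_epi8(d, _mm_setzero_si128()); /* u8->i16 */ \
    d = _mm_add_epi16(d, (in_x));                                \
    d = _mm_packus_epi16(d, d);                    /* clamp */   \
    _mm_storel_epi64((__m128i *)(dest), d);                      \
  } while (0)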
static void thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
{
    int i, j;
    Size roi = _src.size();
    roi.width *= _src.channels();
    const short* src = _src.ptr<short>();
    short* dst = _dst.ptr<short>();
    size_t src_step = _src.step/sizeof(src[0]);
    size_t dst_step = _dst.step/sizeof(dst[0]);

#if CV_SSE2
    volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
#endif

    if( _src.isContinuous() && _dst.isContinuous() )
    {
        roi.width *= roi.height;
        roi.height = 1;
        src_step = dst_step = roi.width;
    }

#ifdef HAVE_TEGRA_OPTIMIZATION
    if (tegra::thresh_16s(_src, _dst, roi.width, roi.height, thresh, maxval, type))
        return;
#endif

#if defined(HAVE_IPP)
    CV_IPP_CHECK()
    {
        IppiSize sz = { roi.width, roi.height };
        CV_SUPPRESS_DEPRECATED_START
        switch( type )
        {
        case THRESH_TRUNC:
#ifndef HAVE_IPP_ICV_ONLY
            if (_src.data == _dst.data && ippiThreshold_GT_16s_C1IR(dst, (int)dst_step*sizeof(dst[0]), sz, thresh) >= 0)
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
#endif
            if (ippiThreshold_GT_16s_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh) >= 0)
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
            setIppErrorStatus();
            break;
        case THRESH_TOZERO:
#ifndef HAVE_IPP_ICV_ONLY
            if (_src.data == _dst.data && ippiThreshold_LTVal_16s_C1IR(dst, (int)dst_step*sizeof(dst[0]), sz, thresh + 1, 0) >= 0)
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
#endif
            if (ippiThreshold_LTVal_16s_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh+1, 0) >= 0)
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
            setIppErrorStatus();
            break;
        case THRESH_TOZERO_INV:
#ifndef HAVE_IPP_ICV_ONLY
            if (_src.data == _dst.data && ippiThreshold_GTVal_16s_C1IR(dst, (int)dst_step*sizeof(dst[0]), sz, thresh, 0) >= 0)
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
#endif
            if (ippiThreshold_GTVal_16s_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh, 0) >= 0)
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
            setIppErrorStatus();
            break;
        }
        CV_SUPPRESS_DEPRECATED_END
    }
#endif

    switch( type )
    {
    case THRESH_BINARY:
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
#if CV_SSE2
            if( useSIMD )
            {
                __m128i thresh8 = _mm_set1_epi16(thresh), maxval8 = _mm_set1_epi16(maxval);
                for( ; j <= roi.width - 16; j += 16 )
                {
                    __m128i v0, v1;
                    v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
                    v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) );
                    v0 = _mm_cmpgt_epi16( v0, thresh8 );
                    v1 = _mm_cmpgt_epi16( v1, thresh8 );
                    v0 = _mm_and_si128( v0, maxval8 );
                    v1 = _mm_and_si128( v1, maxval8 );
                    _mm_storeu_si128((__m128i*)(dst + j), v0 );
                    _mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
                }
            }
#elif CV_NEON
            int16x8_t v_thresh = vdupq_n_s16(thresh), v_maxval = vdupq_n_s16(maxval);
            for( ; j <= roi.width - 8; j += 8 )
            {
                uint16x8_t v_mask = vcgtq_s16(vld1q_s16(src + j), v_thresh);
                vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_maxval));
            }
#endif
            for( ; j < roi.width; j++ )
                dst[j] = src[j] > thresh ? maxval : 0;
        }
        break;

    case THRESH_BINARY_INV:
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
#if CV_SSE2
            if( useSIMD )
            {
                __m128i thresh8 = _mm_set1_epi16(thresh), maxval8 = _mm_set1_epi16(maxval);
                for( ; j <= roi.width - 16; j += 16 )
                {
                    __m128i v0, v1;
                    v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
                    v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) );
                    v0 = _mm_cmpgt_epi16( v0, thresh8 );
                    v1 = _mm_cmpgt_epi16( v1, thresh8 );
                    v0 = _mm_andnot_si128( v0, maxval8 );
                    v1 = _mm_andnot_si128( v1, maxval8 );
                    _mm_storeu_si128((__m128i*)(dst + j), v0 );
                    _mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
                }
            }
#elif CV_NEON
            int16x8_t v_thresh = vdupq_n_s16(thresh), v_maxval = vdupq_n_s16(maxval);
            for( ; j <= roi.width - 8; j += 8 )
            {
                uint16x8_t v_mask = vcleq_s16(vld1q_s16(src + j), v_thresh);
                vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_maxval));
            }
#endif
            for( ; j < roi.width; j++ )
                dst[j] = src[j] <= thresh ? maxval : 0;
        }
        break;

    case THRESH_TRUNC:
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
#if CV_SSE2
            if( useSIMD )
            {
                __m128i thresh8 = _mm_set1_epi16(thresh);
                for( ; j <= roi.width - 16; j += 16 )
                {
                    __m128i v0, v1;
                    v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
                    v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) );
                    v0 = _mm_min_epi16( v0, thresh8 );
                    v1 = _mm_min_epi16( v1, thresh8 );
                    _mm_storeu_si128((__m128i*)(dst + j), v0 );
                    _mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
                }
            }
#elif CV_NEON
            int16x8_t v_thresh = vdupq_n_s16(thresh);
            for( ; j <= roi.width - 8; j += 8 )
                vst1q_s16(dst + j, vminq_s16(vld1q_s16(src + j), v_thresh));
#endif
            for( ; j < roi.width; j++ )
                dst[j] = std::min(src[j], thresh);
        }
        break;

    case THRESH_TOZERO:
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
#if CV_SSE2
            if( useSIMD )
            {
                __m128i thresh8 = _mm_set1_epi16(thresh);
                for( ; j <= roi.width - 16; j += 16 )
                {
                    __m128i v0, v1;
                    v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
                    v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) );
                    v0 = _mm_and_si128(v0, _mm_cmpgt_epi16(v0, thresh8));
                    v1 = _mm_and_si128(v1, _mm_cmpgt_epi16(v1, thresh8));
                    _mm_storeu_si128((__m128i*)(dst + j), v0 );
                    _mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
                }
            }
#elif CV_NEON
            int16x8_t v_thresh = vdupq_n_s16(thresh);
            for( ; j <= roi.width - 8; j += 8 )
            {
                int16x8_t v_src = vld1q_s16(src + j);
                uint16x8_t v_mask = vcgtq_s16(v_src, v_thresh);
                vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_src));
            }
#endif
            for( ; j < roi.width; j++ )
            {
                short v = src[j];
                dst[j] = v > thresh ? v : 0;
            }
        }
        break;

    case THRESH_TOZERO_INV:
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
#if CV_SSE2
            if( useSIMD )
            {
                __m128i thresh8 = _mm_set1_epi16(thresh);
                for( ; j <= roi.width - 16; j += 16 )
                {
                    __m128i v0, v1;
                    v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
                    v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) );
                    v0 = _mm_andnot_si128(_mm_cmpgt_epi16(v0, thresh8), v0);
                    v1 = _mm_andnot_si128(_mm_cmpgt_epi16(v1, thresh8), v1);
                    _mm_storeu_si128((__m128i*)(dst + j), v0 );
                    _mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
                }
            }
#elif CV_NEON
            int16x8_t v_thresh = vdupq_n_s16(thresh);
            for( ; j <= roi.width - 8; j += 8 )
            {
                int16x8_t v_src = vld1q_s16(src + j);
                uint16x8_t v_mask = vcleq_s16(v_src, v_thresh);
                vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_src));
            }
#endif
            for( ; j < roi.width; j++ )
            {
                short v = src[j];
                dst[j] = v <= thresh ? v : 0;
            }
        }
        break;

    default:
        return CV_Error( CV_StsBadArg, "" );
    }
}
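/* Each SSE branch above makes its threshold type branchless with a single
 * compare plus one mask op: _mm_cmpgt_epi16 yields all-ones lanes, which
 * and/andnot then turn into a select. Minimal sketches of two of the
 * variants: */
#include <emmintrin.h>

static __m128i thresh_binary8(__m128i v, __m128i thresh8, __m128i maxval8)
{
    return _mm_and_si128(_mm_cmpgt_epi16(v, thresh8), maxval8); /* v > t ? maxval : 0 */
}

static __m128i thresh_tozero8(__m128i v, __m128i thresh8)
{
    return _mm_and_si128(v, _mm_cmpgt_epi16(v, thresh8));       /* v > t ? v : 0 */
}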
void mlib_s_ImageCopyMask_u8( mlib_u8 *src, mlib_s32 slb, const mlib_u8 *mask, mlib_s32 mlb, mlib_u8 *dst, mlib_s32 dlb, mlib_s32 xsize, mlib_s32 ysize, mlib_s32 nchan, const mlib_s32 *thresh) { mlib_s32 i, j, nsize; mlib_s32 thresh0, thresh1, thresh2, thresh3; __m128i threshs, threshu, threshv, xormask; xormask = _mm_set1_epi8(0x80); nsize = xsize * nchan; switch (nchan) { case 1: thresh0 = thresh[0]; threshs = _mm_set1_epi8(SAT_U8(thresh0)); threshs = _mm_xor_si128(threshs, xormask); if ((((mlib_addr)dst | dlb | (mlib_addr)src | slb | (mlib_addr)mask | mlb) & 0xf) == 0) { for (j = 0; j < ysize; j ++) { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (nsize - 16); i += 16) { MLIB_S_COPYIMAGEMASK( _mm_cmplt_epi8, _mm_store_si128, _mm_load_si128, _mm_load_si128, _mm_load_si128); } for (; i < nsize; i ++) { if (mask[i] <= thresh0) { dst[i] = src[i]; } } src = (mlib_u8 *)((mlib_u8 *)src + slb); mask = (mlib_u8 *)((mlib_u8 *)mask + mlb); dst = (mlib_u8 *)((mlib_u8 *)dst + dlb); } } else { for (j = 0; j < ysize; j ++) { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (nsize - 16); i += 16) { MLIB_S_COPYIMAGEMASK( _mm_cmplt_epi8, _mm_storeu_si128, _mm_loadu_si128, _mm_loadu_si128, _mm_loadu_si128); } for (; i < nsize; i ++) { if (mask[i] <= thresh0) { dst[i] = src[i]; } } src = (mlib_u8 *)((mlib_u8 *)src + slb); mask = (mlib_u8 *)((mlib_u8 *)mask + mlb); dst = (mlib_u8 *)((mlib_u8 *)dst + dlb); } } break; case 2: thresh0 = thresh[0]; thresh1 = thresh[1]; threshs = _mm_set1_epi16((SAT_U8(thresh1) << 8) | (SAT_U8(thresh0) & 0xff)); threshs = _mm_xor_si128(threshs, xormask); if ((((mlib_addr)dst | dlb | (mlib_addr)src | slb | (mlib_addr)mask | mlb) & 0xf) == 0) { for (j = 0; j < ysize; j ++) { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (nsize - 16); i += 16) { MLIB_S_COPYIMAGEMASK( _mm_cmplt_epi8, _mm_store_si128, _mm_load_si128, _mm_load_si128, _mm_load_si128); } for (; i < nsize; i += 2) { if (mask[i] <= thresh0) { dst[i] = src[i]; } if (mask[i + 1] <= thresh1) { dst[i + 1] = src[i + 1]; } } src = (mlib_u8 *)((mlib_u8 *)src + slb); mask = (mlib_u8 *)((mlib_u8 *)mask + mlb); dst = (mlib_u8 *)((mlib_u8 *)dst + dlb); } } else { for (j = 0; j < ysize; j ++) { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (nsize - 16); i += 16) { MLIB_S_COPYIMAGEMASK( _mm_cmplt_epi8, _mm_storeu_si128, _mm_loadu_si128, _mm_loadu_si128, _mm_loadu_si128); } for (; i < nsize; i += 2) { if (mask[i] <= thresh0) { dst[i] = src[i]; } if (mask[i + 1] <= thresh1) { dst[i + 1] = src[i + 1]; } } src = (mlib_u8 *)((mlib_u8 *)src + slb); mask = (mlib_u8 *)((mlib_u8 *)mask + mlb); dst = (mlib_u8 *)((mlib_u8 *)dst + dlb); } } break; case 3: thresh0 = thresh[0]; thresh1 = thresh[1]; thresh2 = thresh[2]; threshs = _mm_set_epi8(SAT_U8(thresh0), SAT_U8(thresh2), SAT_U8(thresh1), SAT_U8(thresh0), SAT_U8(thresh2), SAT_U8(thresh1), SAT_U8(thresh0), SAT_U8(thresh2), SAT_U8(thresh1), SAT_U8(thresh0), SAT_U8(thresh2), SAT_U8(thresh1), SAT_U8(thresh0), SAT_U8(thresh2), SAT_U8(thresh1), SAT_U8(thresh0)); threshu = _mm_set_epi8(SAT_U8(thresh1), SAT_U8(thresh0), SAT_U8(thresh2), SAT_U8(thresh1), SAT_U8(thresh0), SAT_U8(thresh2), SAT_U8(thresh1), SAT_U8(thresh0), SAT_U8(thresh2), SAT_U8(thresh1), SAT_U8(thresh0), SAT_U8(thresh2), SAT_U8(thresh1), SAT_U8(thresh0), SAT_U8(thresh2), SAT_U8(thresh1)); threshv = _mm_set_epi8(SAT_U8(thresh2), SAT_U8(thresh1), SAT_U8(thresh0), SAT_U8(thresh2), SAT_U8(thresh1), SAT_U8(thresh0), 
SAT_U8(thresh2), SAT_U8(thresh1), SAT_U8(thresh0), SAT_U8(thresh2), SAT_U8(thresh1), SAT_U8(thresh0), SAT_U8(thresh2), SAT_U8(thresh1), SAT_U8(thresh0), SAT_U8(thresh2)); threshs = _mm_xor_si128(threshs, xormask); threshu = _mm_xor_si128(threshu, xormask); threshv = _mm_xor_si128(threshv, xormask); if ((((mlib_addr)dst | dlb | (mlib_addr)src | slb | (mlib_addr)mask | mlb) & 0xf) == 0) { for (j = 0; j < ysize; j ++) { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (nsize - 48); i += 48) { MLIB_S_COPYIMAGEMASK3( 16, 32, _mm_cmplt_epi8, _mm_store_si128, _mm_load_si128, _mm_load_si128, _mm_load_si128); } for (; i < nsize; i += 3) { if (mask[i] <= thresh0) { dst[i] = src[i]; } if (mask[i + 1] <= thresh1) { dst[i + 1] = src[i + 1]; } if (mask[i + 2] <= thresh2) { dst[i + 2] = src[i + 2]; } } src = (mlib_u8 *)((mlib_u8 *)src + slb); mask = (mlib_u8 *)((mlib_u8 *)mask + mlb); dst = (mlib_u8 *)((mlib_u8 *)dst + dlb); } } else { for (j = 0; j < ysize; j ++) { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (nsize - 48); i += 48) { MLIB_S_COPYIMAGEMASK3( 16, 32, _mm_cmplt_epi8, _mm_storeu_si128, _mm_loadu_si128, _mm_loadu_si128, _mm_loadu_si128); } for (; i < nsize; i += 3) { if (mask[i] <= thresh0) { dst[i] = src[i]; } if (mask[i + 1] <= thresh1) { dst[i + 1] = src[i + 1]; } if (mask[i + 2] <= thresh2) { dst[i + 2] = src[i + 2]; } } src = (mlib_u8 *)((mlib_u8 *)src + slb); mask = (mlib_u8 *)((mlib_u8 *)mask + mlb); dst = (mlib_u8 *)((mlib_u8 *)dst + dlb); } } break; case 4: thresh0 = thresh[0]; thresh1 = thresh[1]; thresh2 = thresh[2]; thresh3 = thresh[3]; threshs = _mm_set1_epi32(((SAT_U8(thresh3) & 0xff) << 24) | ((SAT_U8(thresh2) & 0xff) << 16) | ((SAT_U8(thresh1) & 0xff) << 8) | (SAT_U8(thresh0) & 0xff)); threshs = _mm_xor_si128(threshs, xormask); if ((((mlib_addr)dst | dlb | (mlib_addr)src | slb | (mlib_addr)mask | mlb) & 0xf) == 0) { for (j = 0; j < ysize; j ++) { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (nsize - 16); i += 16) { MLIB_S_COPYIMAGEMASK( _mm_cmplt_epi8, _mm_store_si128, _mm_load_si128, _mm_load_si128, _mm_load_si128); } for (; i < nsize; i += 4) { if (mask[i] <= thresh0) { dst[i] = src[i]; } if (mask[i + 1] <= thresh1) { dst[i + 1] = src[i + 1]; } if (mask[i + 2] <= thresh2) { dst[i + 2] = src[i + 2]; } if (mask[i + 3] <= thresh3) { dst[i + 3] = src[i + 3]; } } src = (mlib_u8 *)((mlib_u8 *)src + slb); mask = (mlib_u8 *)((mlib_u8 *)mask + mlb); dst = (mlib_u8 *)((mlib_u8 *)dst + dlb); } } else { for (j = 0; j < ysize; j ++) { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (nsize - 16); i += 16) { MLIB_S_COPYIMAGEMASK( _mm_cmplt_epi8, _mm_storeu_si128, _mm_loadu_si128, _mm_loadu_si128, _mm_loadu_si128); } for (; i < nsize; i += 4) { if (mask[i] <= thresh0) { dst[i] = src[i]; } if (mask[i + 1] <= thresh1) { dst[i + 1] = src[i + 1]; } if (mask[i + 2] <= thresh2) { dst[i + 2] = src[i + 2]; } if (mask[i + 3] <= thresh3) { dst[i + 3] = src[i + 3]; } } src = (mlib_u8 *)((mlib_u8 *)src + slb); mask = (mlib_u8 *)((mlib_u8 *)mask + mlb); dst = (mlib_u8 *)((mlib_u8 *)dst + dlb); } } break; } }
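/* SSE2 has no unsigned byte compare, which is why the code above XORs the
 * threshold vectors with 0x80 (and, presumably, the mask bytes inside the
 * MLIB_S_COPYIMAGEMASK macros) before using the signed _mm_cmplt_epi8.
 * A minimal sketch of the bias trick plus the mask-select it feeds: */
#include <emmintrin.h>

static __m128i cmplt_u8(__m128i a, __m128i b)   /* all-ones where (u8)a < (u8)b */
{
    const __m128i bias = _mm_set1_epi8((char)0x80);
    return _mm_cmplt_epi8(_mm_xor_si128(a, bias), _mm_xor_si128(b, bias));
}

static __m128i select_u8(__m128i m, __m128i a, __m128i b)  /* m ? a : b per byte */
{
    return _mm_or_si128(_mm_and_si128(m, a), _mm_andnot_si128(m, b));
}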