__m128i test_mm_adds_epi16(__m128i A, __m128i B) {
  // DAG-LABEL: test_mm_adds_epi16
  // DAG: call <8 x i16> @llvm.x86.sse2.padds.w
  //
  // ASM-LABEL: test_mm_adds_epi16
  // ASM: paddsw
  return _mm_adds_epi16(A, B);
}
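For reference, every sample below relies on the same per-lane semantics: _mm_adds_epi16 adds eight pairs of signed 16-bit integers and clamps each sum to [-32768, 32767] instead of letting it wrap. A minimal scalar sketch of one lane (my own illustration, not taken from any of the projects quoted here):

#include <stdint.h>

// Saturating signed 16-bit add: one lane of _mm_adds_epi16.
static int16_t adds_epi16_scalar(int16_t a, int16_t b) {
    int32_t sum = (int32_t)a + (int32_t)b;  // widen so the sum cannot wrap
    if (sum > INT16_MAX) return INT16_MAX;  // clamp high
    if (sum < INT16_MIN) return INT16_MIN;  // clamp low
    return (int16_t)sum;
}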
SIMDValue SIMDInt16x8Operation::OpAddSaturate(const SIMDValue& aValue, const SIMDValue& bValue)
{
    X86SIMDValue x86Result;
    X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
    X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);

    x86Result.m128i_value = _mm_adds_epi16(tmpaValue.m128i_value, tmpbValue.m128i_value); // a + b saturates

    return X86SIMDValue::ToSIMDValue(x86Result);
}
void multadd_real_vector_complex_scalar(int16_t *x, int16_t *alpha, int16_t *y, uint32_t N)
{
  uint32_t i;
  // do 8 multiplications at a time
  simd_q15_t alpha_r_128, alpha_i_128, yr, yi, *x_128 = (simd_q15_t *)x, *y_128 = (simd_q15_t *)y;
  int j;

  // printf("alpha = %d,%d\n",alpha[0],alpha[1]);
  alpha_r_128 = set1_int16(alpha[0]);
  alpha_i_128 = set1_int16(alpha[1]);

  j = 0;

  for (i = 0; i < N >> 3; i++) {
    yr = mulhi_s1_int16(alpha_r_128, x_128[i]);
    yi = mulhi_s1_int16(alpha_i_128, x_128[i]);
#if defined(__x86_64__) || defined(__i386__)
    y_128[j] = _mm_adds_epi16(y_128[j], _mm_unpacklo_epi16(yr, yi));
    j++;
    y_128[j] = _mm_adds_epi16(y_128[j], _mm_unpackhi_epi16(yr, yi));
    j++;
#elif defined(__arm__)
    int16x8x2_t yint;
    yint = vzipq_s16(yr, yi);
    y_128[j] = adds_int16(y_128[j], yint.val[0]);
    j++;
    y_128[j] = adds_int16(y_128[j], yint.val[1]);
    j++;
#endif
  }

  _mm_empty();
  _m_empty();
}
__m64 _m_paddsw(__m64 _MM1, __m64 _MM2)
{
    __m128i lhs = {0}, rhs = {0};
    lhs.m128i_i64[0] = _MM1.m64_i64;
    rhs.m128i_i64[0] = _MM2.m64_i64;

    lhs = _mm_adds_epi16(lhs, rhs);

    _MM1.m64_i64 = lhs.m128i_i64[0];
    return _MM1;
}
static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                 uint8_t *dst, const int16_t *filter) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
  const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
  const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
  const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
  const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift by 7 bit each 16 bit
  temp = _mm_mulhrs_epi16(temp, k_256);
  // shrink to 8 bit each 16 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 8 bytes convolve result
  _mm_storel_epi64((__m128i *)dst, temp);
}
static INLINE void SIGNED_CLAMP_ADD(pi16 VD, pi16 VS, pi16 VT)
{
    v16 dst, src, vco;
    v16 max, min;

    src = _mm_load_si128((v16 *)VS);
    dst = _mm_load_si128((v16 *)VT);
    vco = _mm_load_si128((v16 *)cf_co);

    /*
     * Due to premature clamping in between adds, sometimes we need to add the
     * LESSER of two integers, either VS or VT, to the carry-in flag matching
     * the current vector register slice, BEFORE finally adding the greater
     * integer.
     */
    max = _mm_max_epi16(dst, src);
    min = _mm_min_epi16(dst, src);

    min = _mm_adds_epi16(min, vco);
    max = _mm_adds_epi16(max, min);
    _mm_store_si128((v16 *)VD, max);
    return;
}
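The comment above is worth a concrete check. Writing clamp() for the saturating add, the routine computes clamp(clamp(min + vco) + max) rather than clamp(clamp(min + max) + vco), and only the first order matches the exact result clamp(VS + VT + vco) in every case. A standalone sketch (my own, not from the emulator) showing the one-off error the other order would produce:

#include <stdint.h>
#include <stdio.h>

static int16_t clamp16(int32_t v) {
    return (int16_t)(v > INT16_MAX ? INT16_MAX : (v < INT16_MIN ? INT16_MIN : v));
}

int main(void) {
    const int32_t lo = -32768, hi = -32768, carry = 1;   // exact sum: clamp(-65535) = -32768
    printf("%d\n", clamp16(clamp16(lo + carry) + hi));   // -32768: carry absorbed by the lesser value first, correct
    printf("%d\n", clamp16(clamp16(lo + hi) + carry));   // -32767: clamped too early, off by one
    return 0;
}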
static void FastConvertYUVToRGB32Row_SSE2(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* rgb_buf,
                                          int width) {
  __m128i xmm0, xmmY1, xmmY2;
  __m128 xmmY;

  while (width >= 2) {
    xmm0 = _mm_adds_epi16(
        _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf++)),
        _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf++)));

    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(
        reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf++));
    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);

    xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(
        reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf++));
    xmmY2 = _mm_adds_epi16(xmmY2, xmm0);

    xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2), 0x44);
    xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);

    rgb_buf += 8;
    width -= 2;
  }

  if (width) {
    xmm0 = _mm_adds_epi16(
        _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf)),
        _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf)));
    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(
        reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf));
    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
    xmmY1 = _mm_srai_epi16(xmmY1, 6);
    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
    *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
  }
}
void ulsch_detection_mrc(LTE_DL_FRAME_PARMS *frame_parms,
                         int **rxdataF_comp,
                         int **ul_ch_mag,
                         int **ul_ch_magb,
                         unsigned char symbol,
                         unsigned short nb_rb)
{
  __m128i *rxdataF_comp128_0, *ul_ch_mag128_0, *ul_ch_mag128_0b;
  __m128i *rxdataF_comp128_1, *ul_ch_mag128_1, *ul_ch_mag128_1b;
  int i;

  if (frame_parms->nb_antennas_rx > 1) {
    rxdataF_comp128_0 = (__m128i *)&rxdataF_comp[0][symbol*frame_parms->N_RB_DL*12];
    rxdataF_comp128_1 = (__m128i *)&rxdataF_comp[1][symbol*frame_parms->N_RB_DL*12];
    ul_ch_mag128_0    = (__m128i *)&ul_ch_mag[0][symbol*frame_parms->N_RB_DL*12];
    ul_ch_mag128_1    = (__m128i *)&ul_ch_mag[1][symbol*frame_parms->N_RB_DL*12];
    ul_ch_mag128_0b   = (__m128i *)&ul_ch_magb[0][symbol*frame_parms->N_RB_DL*12];
    ul_ch_mag128_1b   = (__m128i *)&ul_ch_magb[1][symbol*frame_parms->N_RB_DL*12];

    // MRC on each re of rb, both on MF output and magnitude (for 16QAM/64QAM llr computation)
    for (i = 0; i < nb_rb*3; i++) {
      rxdataF_comp128_0[i] = _mm_adds_epi16(_mm_srai_epi16(rxdataF_comp128_0[i],1),
                                            _mm_srai_epi16(rxdataF_comp128_1[i],1));
      ul_ch_mag128_0[i]    = _mm_adds_epi16(_mm_srai_epi16(ul_ch_mag128_0[i],1),
                                            _mm_srai_epi16(ul_ch_mag128_1[i],1));
      ul_ch_mag128_0b[i]   = _mm_adds_epi16(_mm_srai_epi16(ul_ch_mag128_0b[i],1),
                                            _mm_srai_epi16(ul_ch_mag128_1b[i],1));
    }

    // remove any bias (DC component after IDFT)
    ((u32*)rxdataF_comp128_0)[0] = 0;
  }

  _mm_empty();
  _m_empty();
}
static void aom_filter_block1d4_h4_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m128i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1;
  unsigned int i;

  src_ptr -= 3;
  addFilterReg32 = _mm_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u));
  filt1Reg = _mm_load_si128((__m128i const *)(filtd4));

  for (i = output_height; i > 0; i -= 1) {
    // load the 2 strides of source
    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);

    // filter the source buffer
    srcRegFilt32b1_1 = _mm_shuffle_epi8(srcReg32b1, filt1Reg);

    // multiply 4 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm_maddubs_epi16(srcRegFilt32b1_1, firstFilters);

    srcRegFilt32b1_1 = _mm_hadds_epi16(srcRegFilt32b1_1, _mm_setzero_si128());

    // shift by 6 bit each 16 bit
    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve result
    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());

    src_ptr += src_pixels_per_line;

    *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt32b1_1);
    output_ptr += output_pitch;
  }
}
static void TransformAC3(const int16_t* in, uint8_t* dst) {
  static const int kC1 = 20091 + (1 << 16);
  static const int kC2 = 35468;
  const __m128i A = _mm_set1_epi16(in[0] + 4);
  const __m128i c4 = _mm_set1_epi16(MUL(in[4], kC2));
  const __m128i d4 = _mm_set1_epi16(MUL(in[4], kC1));
  const int c1 = MUL(in[1], kC2);
  const int d1 = MUL(in[1], kC1);
  const __m128i CD = _mm_set_epi16(0, 0, 0, 0, -d1, -c1, c1, d1);
  const __m128i B = _mm_adds_epi16(A, CD);
  const __m128i m0 = _mm_adds_epi16(B, d4);
  const __m128i m1 = _mm_adds_epi16(B, c4);
  const __m128i m2 = _mm_subs_epi16(B, c4);
  const __m128i m3 = _mm_subs_epi16(B, d4);
  const __m128i zero = _mm_setzero_si128();
  // Load the source pixels.
  __m128i dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS));
  __m128i dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS));
  __m128i dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS));
  __m128i dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS));
  // Convert to 16b.
  dst0 = _mm_unpacklo_epi8(dst0, zero);
  dst1 = _mm_unpacklo_epi8(dst1, zero);
  dst2 = _mm_unpacklo_epi8(dst2, zero);
  dst3 = _mm_unpacklo_epi8(dst3, zero);
  // Add the inverse transform.
  dst0 = _mm_adds_epi16(dst0, _mm_srai_epi16(m0, 3));
  dst1 = _mm_adds_epi16(dst1, _mm_srai_epi16(m1, 3));
  dst2 = _mm_adds_epi16(dst2, _mm_srai_epi16(m2, 3));
  dst3 = _mm_adds_epi16(dst3, _mm_srai_epi16(m3, 3));
  // Unsigned saturate to 8b.
  dst0 = _mm_packus_epi16(dst0, dst0);
  dst1 = _mm_packus_epi16(dst1, dst1);
  dst2 = _mm_packus_epi16(dst2, dst2);
  dst3 = _mm_packus_epi16(dst3, dst3);
  // Store the results.
  *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0);
  *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1);
  *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2);
  *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3);
}
static void LinearScaleYUVToRGB32Row_SSE2(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* rgb_buf,
                                          int width,
                                          int source_dx) {
  __m128i xmm0, xmmY1, xmmY2;
  __m128 xmmY;
  uint8 u0, u1, v0, v1, y0, y1;
  uint32 uv_frac, y_frac, u, v, y;
  int x = 0;

  if (source_dx >= 0x20000) {
    x = 32768;
  }

  while (width >= 2) {
    u0 = u_buf[x >> 17];
    u1 = u_buf[(x >> 17) + 1];
    v0 = v_buf[x >> 17];
    v1 = v_buf[(x >> 17) + 1];
    y0 = y_buf[x >> 16];
    y1 = y_buf[(x >> 16) + 1];
    uv_frac = (x & 0x1fffe);
    y_frac = (x & 0xffff);
    u = (uv_frac * u1 + (uv_frac ^ 0x1fffe) * u0) >> 17;
    v = (uv_frac * v1 + (uv_frac ^ 0x1fffe) * v0) >> 17;
    y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16;
    x += source_dx;

    xmm0 = _mm_adds_epi16(
        _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
        _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(
        reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);

    y0 = y_buf[x >> 16];
    y1 = y_buf[(x >> 16) + 1];
    y_frac = (x & 0xffff);
    y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16;
    x += source_dx;

    xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(
        reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
    xmmY2 = _mm_adds_epi16(xmmY2, xmm0);

    xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2), 0x44);
    xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);

    rgb_buf += 8;
    width -= 2;
  }

  if (width) {
    u = u_buf[x >> 17];
    v = v_buf[x >> 17];
    y = y_buf[x >> 16];

    xmm0 = _mm_adds_epi16(
        _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
        _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(
        reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
    xmmY1 = _mm_srai_epi16(xmmY1, 6);
    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
    *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
  }
}
void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                          int skip_block, const int16_t *zbin_ptr,
                          const int16_t *round_ptr, const int16_t *quant_ptr,
                          const int16_t *quant_shift_ptr,
                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
                          const int16_t *scan_ptr, const int16_t *iscan_ptr) {
  __m128i zero;
  __m128i thr;
  int16_t nzflag;
  (void)scan_ptr;
  (void)zbin_ptr;
  (void)quant_shift_ptr;

  coeff_ptr += n_coeffs;
  iscan_ptr += n_coeffs;
  qcoeff_ptr += n_coeffs;
  dqcoeff_ptr += n_coeffs;
  n_coeffs = -n_coeffs;
  zero = _mm_setzero_si128();

  if (!skip_block) {
    __m128i eob;
    __m128i round, quant, dequant;
    {
      __m128i coeff0, coeff1;

      // Setup global values
      {
        round = _mm_load_si128((const __m128i *)round_ptr);
        quant = _mm_load_si128((const __m128i *)quant_ptr);
        dequant = _mm_load_si128((const __m128i *)dequant_ptr);
      }

      {
        __m128i coeff0_sign, coeff1_sign;
        __m128i qcoeff0, qcoeff1;
        __m128i qtmp0, qtmp1;
        // Do DC and first 15 AC
        coeff0 = load_tran_low(coeff_ptr + n_coeffs);
        coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);

        // Poor man's sign extract
        coeff0_sign = _mm_srai_epi16(coeff0, 15);
        coeff1_sign = _mm_srai_epi16(coeff1, 15);
        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
        round = _mm_unpackhi_epi64(round, round);
        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
        quant = _mm_unpackhi_epi64(quant, quant);
        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);

        // Reinsert signs
        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
        store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);

        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
        dequant = _mm_unpackhi_epi64(dequant, dequant);
        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

        store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
        store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
      }

      {
        // Scan for eob
        __m128i zero_coeff0, zero_coeff1;
        __m128i nzero_coeff0, nzero_coeff1;
        __m128i iscan0, iscan1;
        __m128i eob1;
        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
        // Add one to convert from indices to counts
        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
        eob = _mm_and_si128(iscan0, nzero_coeff0);
        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
        eob = _mm_max_epi16(eob, eob1);
      }
      n_coeffs += 8 * 2;
    }

    thr = _mm_srai_epi16(dequant, 1);

    // AC only loop
    while (n_coeffs < 0) {
      __m128i coeff0, coeff1;
      {
        __m128i coeff0_sign, coeff1_sign;
        __m128i qcoeff0, qcoeff1;
        __m128i qtmp0, qtmp1;

        coeff0 = load_tran_low(coeff_ptr + n_coeffs);
        coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);

        // Poor man's sign extract
        coeff0_sign = _mm_srai_epi16(coeff0, 15);
        coeff1_sign = _mm_srai_epi16(coeff1, 15);
        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
                 _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));

        if (nzflag) {
          qcoeff0 = _mm_adds_epi16(qcoeff0, round);
          qcoeff1 = _mm_adds_epi16(qcoeff1, round);
          qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
          qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);

          // Reinsert signs
          qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
          qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
          qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
          qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

          store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
          store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);

          coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
          coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

          store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
          store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
        } else {
          store_zero_tran_low(qcoeff_ptr + n_coeffs);
          store_zero_tran_low(qcoeff_ptr + n_coeffs + 8);

          store_zero_tran_low(dqcoeff_ptr + n_coeffs);
          store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8);
        }
      }

      if (nzflag) {
        // Scan for eob
        __m128i zero_coeff0, zero_coeff1;
        __m128i nzero_coeff0, nzero_coeff1;
        __m128i iscan0, iscan1;
        __m128i eob0, eob1;
        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
        // Add one to convert from indices to counts
        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
        eob0 = _mm_max_epi16(eob0, eob1);
        eob = _mm_max_epi16(eob, eob0);
      }
      n_coeffs += 8 * 2;
    }

    // Accumulate EOB
    {
      __m128i eob_shuffled;
      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
      eob = _mm_max_epi16(eob, eob_shuffled);
      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
      eob = _mm_max_epi16(eob, eob_shuffled);
      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
      eob = _mm_max_epi16(eob, eob_shuffled);
      *eob_ptr = _mm_extract_epi16(eob, 1);
    }
  } else {
    do {
      store_zero_tran_low(qcoeff_ptr + n_coeffs);
      store_zero_tran_low(qcoeff_ptr + n_coeffs + 8);

      store_zero_tran_low(dqcoeff_ptr + n_coeffs);
      store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8);

      n_coeffs += 8 * 2;
    } while (n_coeffs < 0);
    *eob_ptr = 0;
  }
}
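Both quantizers lean on the "poor man's sign extract" idiom: an arithmetic shift by 15 yields 0 for non-negative lanes and -1 (all ones) for negative ones, so xor-then-subtract computes |x|, and the same pair applied after quantization restores the sign. A scalar sketch of the identity (my own, hedged: behaviour at -32768 wraps, exactly as the 16-bit SIMD lanes do):

#include <stdint.h>

// abs(x) without a branch: sign is 0 or -1; x ^ 0 - 0 == x,
// and (x ^ -1) - (-1) == ~x + 1 == -x for negative x.
static int16_t abs16(int16_t x) {
    const int16_t sign = (int16_t)(x >> 15);  // 0 for x >= 0, -1 for x < 0
    return (int16_t)((x ^ sign) - sign);      // note: abs16(-32768) stays -32768
}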
void vpx_quantize_b_sse2(const tran_low_t* coeff_ptr, intptr_t n_coeffs,
                         int skip_block, const int16_t* zbin_ptr,
                         const int16_t* round_ptr, const int16_t* quant_ptr,
                         const int16_t* quant_shift_ptr,
                         tran_low_t* qcoeff_ptr, tran_low_t* dqcoeff_ptr,
                         const int16_t* dequant_ptr, uint16_t* eob_ptr,
                         const int16_t* scan_ptr, const int16_t* iscan_ptr) {
  __m128i zero;
  (void)scan_ptr;

  coeff_ptr += n_coeffs;
  iscan_ptr += n_coeffs;
  qcoeff_ptr += n_coeffs;
  dqcoeff_ptr += n_coeffs;
  n_coeffs = -n_coeffs;
  zero = _mm_setzero_si128();

  if (!skip_block) {
    __m128i eob;
    __m128i zbin;
    __m128i round, quant, dequant, shift;
    {
      __m128i coeff0, coeff1;

      // Setup global values
      {
        __m128i pw_1;
        zbin = _mm_load_si128((const __m128i*)zbin_ptr);
        round = _mm_load_si128((const __m128i*)round_ptr);
        quant = _mm_load_si128((const __m128i*)quant_ptr);
        pw_1 = _mm_set1_epi16(1);
        zbin = _mm_sub_epi16(zbin, pw_1);
        dequant = _mm_load_si128((const __m128i*)dequant_ptr);
        shift = _mm_load_si128((const __m128i*)quant_shift_ptr);
      }

      {
        __m128i coeff0_sign, coeff1_sign;
        __m128i qcoeff0, qcoeff1;
        __m128i qtmp0, qtmp1;
        __m128i cmp_mask0, cmp_mask1;
        // Do DC and first 15 AC
        coeff0 = load_coefficients(coeff_ptr + n_coeffs);
        coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8);

        // Poor man's sign extract
        coeff0_sign = _mm_srai_epi16(coeff0, 15);
        coeff1_sign = _mm_srai_epi16(coeff1, 15);
        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
        zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);

        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
        round = _mm_unpackhi_epi64(round, round);
        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
        quant = _mm_unpackhi_epi64(quant, quant);
        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
        shift = _mm_unpackhi_epi64(shift, shift);
        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);

        // Reinsert signs
        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        // Mask out zbin threshold coeffs
        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

        store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs);
        store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8);

        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
        dequant = _mm_unpackhi_epi64(dequant, dequant);
        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

        store_coefficients(coeff0, dqcoeff_ptr + n_coeffs);
        store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8);
      }

      {
        // Scan for eob
        __m128i zero_coeff0, zero_coeff1;
        __m128i nzero_coeff0, nzero_coeff1;
        __m128i iscan0, iscan1;
        __m128i eob1;
        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
        // Add one to convert from indices to counts
        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
        eob = _mm_and_si128(iscan0, nzero_coeff0);
        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
        eob = _mm_max_epi16(eob, eob1);
      }
      n_coeffs += 8 * 2;
    }

    // AC only loop
    while (n_coeffs < 0) {
      __m128i coeff0, coeff1;
      {
        __m128i coeff0_sign, coeff1_sign;
        __m128i qcoeff0, qcoeff1;
        __m128i qtmp0, qtmp1;
        __m128i cmp_mask0, cmp_mask1;

        coeff0 = load_coefficients(coeff_ptr + n_coeffs);
        coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8);

        // Poor man's sign extract
        coeff0_sign = _mm_srai_epi16(coeff0, 15);
        coeff1_sign = _mm_srai_epi16(coeff1, 15);
        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);

        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);

        // Reinsert signs
        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        // Mask out zbin threshold coeffs
        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

        store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs);
        store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8);

        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

        store_coefficients(coeff0, dqcoeff_ptr + n_coeffs);
        store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8);
      }

      {
        // Scan for eob
        __m128i zero_coeff0, zero_coeff1;
        __m128i nzero_coeff0, nzero_coeff1;
        __m128i iscan0, iscan1;
        __m128i eob0, eob1;
        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
        // Add one to convert from indices to counts
        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
        eob0 = _mm_max_epi16(eob0, eob1);
        eob = _mm_max_epi16(eob, eob0);
      }
      n_coeffs += 8 * 2;
    }

    // Accumulate EOB
    {
      __m128i eob_shuffled;
      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
      eob = _mm_max_epi16(eob, eob_shuffled);
      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
      eob = _mm_max_epi16(eob, eob_shuffled);
      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
      eob = _mm_max_epi16(eob, eob_shuffled);
      *eob_ptr = _mm_extract_epi16(eob, 1);
    }
  } else {
    do {
      store_coefficients(zero, dqcoeff_ptr + n_coeffs);
      store_coefficients(zero, dqcoeff_ptr + n_coeffs + 8);
      store_coefficients(zero, qcoeff_ptr + n_coeffs);
      store_coefficients(zero, qcoeff_ptr + n_coeffs + 8);
      n_coeffs += 8 * 2;
    } while (n_coeffs < 0);
    *eob_ptr = 0;
  }
}
static INLINE void write_buffer_8x16(uint8_t *const dest, __m128i *const in,
                                     const int stride) {
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);

  // Final rounding and shift
  in[0] = _mm_adds_epi16(in[0], final_rounding);
  in[1] = _mm_adds_epi16(in[1], final_rounding);
  in[2] = _mm_adds_epi16(in[2], final_rounding);
  in[3] = _mm_adds_epi16(in[3], final_rounding);
  in[4] = _mm_adds_epi16(in[4], final_rounding);
  in[5] = _mm_adds_epi16(in[5], final_rounding);
  in[6] = _mm_adds_epi16(in[6], final_rounding);
  in[7] = _mm_adds_epi16(in[7], final_rounding);
  in[8] = _mm_adds_epi16(in[8], final_rounding);
  in[9] = _mm_adds_epi16(in[9], final_rounding);
  in[10] = _mm_adds_epi16(in[10], final_rounding);
  in[11] = _mm_adds_epi16(in[11], final_rounding);
  in[12] = _mm_adds_epi16(in[12], final_rounding);
  in[13] = _mm_adds_epi16(in[13], final_rounding);
  in[14] = _mm_adds_epi16(in[14], final_rounding);
  in[15] = _mm_adds_epi16(in[15], final_rounding);

  in[0] = _mm_srai_epi16(in[0], 6);
  in[1] = _mm_srai_epi16(in[1], 6);
  in[2] = _mm_srai_epi16(in[2], 6);
  in[3] = _mm_srai_epi16(in[3], 6);
  in[4] = _mm_srai_epi16(in[4], 6);
  in[5] = _mm_srai_epi16(in[5], 6);
  in[6] = _mm_srai_epi16(in[6], 6);
  in[7] = _mm_srai_epi16(in[7], 6);
  in[8] = _mm_srai_epi16(in[8], 6);
  in[9] = _mm_srai_epi16(in[9], 6);
  in[10] = _mm_srai_epi16(in[10], 6);
  in[11] = _mm_srai_epi16(in[11], 6);
  in[12] = _mm_srai_epi16(in[12], 6);
  in[13] = _mm_srai_epi16(in[13], 6);
  in[14] = _mm_srai_epi16(in[14], 6);
  in[15] = _mm_srai_epi16(in[15], 6);

  recon_and_store(dest + 0 * stride, in[0]);
  recon_and_store(dest + 1 * stride, in[1]);
  recon_and_store(dest + 2 * stride, in[2]);
  recon_and_store(dest + 3 * stride, in[3]);
  recon_and_store(dest + 4 * stride, in[4]);
  recon_and_store(dest + 5 * stride, in[5]);
  recon_and_store(dest + 6 * stride, in[6]);
  recon_and_store(dest + 7 * stride, in[7]);
  recon_and_store(dest + 8 * stride, in[8]);
  recon_and_store(dest + 9 * stride, in[9]);
  recon_and_store(dest + 10 * stride, in[10]);
  recon_and_store(dest + 11 * stride, in[11]);
  recon_and_store(dest + 12 * stride, in[12]);
  recon_and_store(dest + 13 * stride, in[13]);
  recon_and_store(dest + 14 * stride, in[14]);
  recon_and_store(dest + 15 * stride, in[15]);
}
void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  __m128i in[8];
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);

  // load input data
  in[0] = load_input_data8(input);
  in[1] = load_input_data8(input + 8 * 1);
  in[2] = load_input_data8(input + 8 * 2);
  in[3] = load_input_data8(input + 8 * 3);
  in[4] = load_input_data8(input + 8 * 4);
  in[5] = load_input_data8(input + 8 * 5);
  in[6] = load_input_data8(input + 8 * 6);
  in[7] = load_input_data8(input + 8 * 7);

  switch (tx_type) {
    case DCT_DCT:
      vpx_idct8_sse2(in);
      vpx_idct8_sse2(in);
      break;
    case ADST_DCT:
      vpx_idct8_sse2(in);
      iadst8_sse2(in);
      break;
    case DCT_ADST:
      iadst8_sse2(in);
      vpx_idct8_sse2(in);
      break;
    default:
      assert(tx_type == ADST_ADST);
      iadst8_sse2(in);
      iadst8_sse2(in);
      break;
  }

  // Final rounding and shift
  in[0] = _mm_adds_epi16(in[0], final_rounding);
  in[1] = _mm_adds_epi16(in[1], final_rounding);
  in[2] = _mm_adds_epi16(in[2], final_rounding);
  in[3] = _mm_adds_epi16(in[3], final_rounding);
  in[4] = _mm_adds_epi16(in[4], final_rounding);
  in[5] = _mm_adds_epi16(in[5], final_rounding);
  in[6] = _mm_adds_epi16(in[6], final_rounding);
  in[7] = _mm_adds_epi16(in[7], final_rounding);

  in[0] = _mm_srai_epi16(in[0], 5);
  in[1] = _mm_srai_epi16(in[1], 5);
  in[2] = _mm_srai_epi16(in[2], 5);
  in[3] = _mm_srai_epi16(in[3], 5);
  in[4] = _mm_srai_epi16(in[4], 5);
  in[5] = _mm_srai_epi16(in[5], 5);
  in[6] = _mm_srai_epi16(in[6], 5);
  in[7] = _mm_srai_epi16(in[7], 5);

  recon_and_store(dest + 0 * stride, in[0]);
  recon_and_store(dest + 1 * stride, in[1]);
  recon_and_store(dest + 2 * stride, in[2]);
  recon_and_store(dest + 3 * stride, in[3]);
  recon_and_store(dest + 4 * stride, in[4]);
  recon_and_store(dest + 5 * stride, in[5]);
  recon_and_store(dest + 6 * stride, in[6]);
  recon_and_store(dest + 7 * stride, in[7]);
}
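The final_rounding constant in the last two samples is half of the divisor implied by the arithmetic shift that follows, which turns the shift into a round-to-nearest downscale; the saturating add also keeps a coefficient near INT16_MAX from wrapping before the shift. A scalar restatement of the 8x8 case, 1 << 4 followed by >> 5 (my own sketch of the pattern, not library code):

#include <stdint.h>

// Round-to-nearest division by 32 for the post-transform downshift:
// add half the divisor (16), then shift right by 5.
static int16_t round_shift5(int16_t x) {
    return (int16_t)((x + (1 << 4)) >> 5);
}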
static FORCE_INLINE void warp_mmword_u8_sse2(
    const uint8_t *srcp, const uint8_t *edgep, uint8_t *dstp,
    int src_stride, int edge_stride, int height, int x, int y,
    const __m128i &depth, const __m128i &zero,
    const __m128i &x_limit_min, const __m128i &x_limit_max,
    const __m128i &y_limit_min, const __m128i &y_limit_max,
    const __m128i &word_64, const __m128i &word_127,
    const __m128i &word_128, const __m128i &word_255,
    const __m128i &one_stride) {
    int SMAG = 1 << SMAGL;

    // calculate displacement
    __m128i above = _mm_loadl_epi64((const __m128i *)(edgep + x - (y ? edge_stride : 0)));
    __m128i below = _mm_loadl_epi64((const __m128i *)(edgep + x + (y < height - 1 ? edge_stride : 0)));
    __m128i left = _mm_loadl_epi64((const __m128i *)(edgep + x - 1));
    __m128i right = _mm_loadl_epi64((const __m128i *)(edgep + x + 1));

    above = _mm_unpacklo_epi8(above, zero);
    below = _mm_unpacklo_epi8(below, zero);
    left = _mm_unpacklo_epi8(left, zero);
    right = _mm_unpacklo_epi8(right, zero);

    __m128i h = _mm_sub_epi16(left, right);
    __m128i v = _mm_sub_epi16(above, below);

    h = _mm_slli_epi16(h, 7);
    v = _mm_slli_epi16(v, 7);

    h = _mm_mulhi_epi16(h, depth);
    v = _mm_mulhi_epi16(v, depth);

    v = _mm_max_epi16(v, y_limit_min);
    v = _mm_min_epi16(v, y_limit_max);

    __m128i remainder_h = h;
    __m128i remainder_v = v;

    if (SMAGL) {
        remainder_h = _mm_slli_epi16(remainder_h, SMAGL);
        remainder_v = _mm_slli_epi16(remainder_v, SMAGL);
    }

    remainder_h = _mm_and_si128(remainder_h, word_127);
    remainder_v = _mm_and_si128(remainder_v, word_127);

    h = _mm_srai_epi16(h, 7 - SMAGL);
    v = _mm_srai_epi16(v, 7 - SMAGL);

    __m128i xx = _mm_set1_epi32(x << SMAGL);
    xx = _mm_packs_epi32(xx, xx);

    h = _mm_adds_epi16(h, xx);

    remainder_h = _mm_and_si128(remainder_h, _mm_cmpgt_epi16(x_limit_max, h));
    remainder_h = _mm_andnot_si128(_mm_cmpgt_epi16(x_limit_min, h), remainder_h);

    h = _mm_max_epi16(h, x_limit_min);
    h = _mm_min_epi16(h, x_limit_max);

    // h and v contain the displacement now.

    __m128i disp_lo = _mm_unpacklo_epi16(v, h);
    __m128i disp_hi = _mm_unpackhi_epi16(v, h);

    disp_lo = _mm_madd_epi16(disp_lo, one_stride);
    disp_hi = _mm_madd_epi16(disp_hi, one_stride);

    __m128i line0 = _mm_setzero_si128();
    __m128i line1 = _mm_setzero_si128();

    int offset = _mm_cvtsi128_si32(disp_lo);
    disp_lo = _mm_srli_si128(disp_lo, 4);
    line0 = _mm_insert_epi16(line0, *(int16_t *)(srcp + offset), 0);
    line1 = _mm_insert_epi16(line1, *(int16_t *)(srcp + offset + src_stride), 0);

    offset = _mm_cvtsi128_si32(disp_lo);
    disp_lo = _mm_srli_si128(disp_lo, 4);
    line0 = _mm_insert_epi16(line0, *(int16_t *)(srcp + offset + 1 * SMAG), 1);
    line1 = _mm_insert_epi16(line1, *(int16_t *)(srcp + offset + src_stride + 1 * SMAG), 1);

    offset = _mm_cvtsi128_si32(disp_lo);
    disp_lo = _mm_srli_si128(disp_lo, 4);
    line0 = _mm_insert_epi16(line0, *(int16_t *)(srcp + offset + 2 * SMAG), 2);
    line1 = _mm_insert_epi16(line1, *(int16_t *)(srcp + offset + src_stride + 2 * SMAG), 2);

    offset = _mm_cvtsi128_si32(disp_lo);
    disp_lo = _mm_srli_si128(disp_lo, 4);
    line0 = _mm_insert_epi16(line0, *(int16_t *)(srcp + offset + 3 * SMAG), 3);
    line1 = _mm_insert_epi16(line1, *(int16_t *)(srcp + offset + src_stride + 3 * SMAG), 3);

    offset = _mm_cvtsi128_si32(disp_hi);
    disp_hi = _mm_srli_si128(disp_hi, 4);
    line0 = _mm_insert_epi16(line0, *(int16_t *)(srcp + offset + 4 * SMAG), 4);
    line1 = _mm_insert_epi16(line1, *(int16_t *)(srcp + offset + src_stride + 4 * SMAG), 4);

    offset = _mm_cvtsi128_si32(disp_hi);
    disp_hi = _mm_srli_si128(disp_hi, 4);
    line0 = _mm_insert_epi16(line0, *(int16_t *)(srcp + offset + 5 * SMAG), 5);
    line1 = _mm_insert_epi16(line1, *(int16_t *)(srcp + offset + src_stride + 5 * SMAG), 5);

    offset = _mm_cvtsi128_si32(disp_hi);
    disp_hi = _mm_srli_si128(disp_hi, 4);
    line0 = _mm_insert_epi16(line0, *(int16_t *)(srcp + offset + 6 * SMAG), 6);
    line1 = _mm_insert_epi16(line1, *(int16_t *)(srcp + offset + src_stride + 6 * SMAG), 6);

    offset = _mm_cvtsi128_si32(disp_hi);
    disp_hi = _mm_srli_si128(disp_hi, 4);
    line0 = _mm_insert_epi16(line0, *(int16_t *)(srcp + offset + 7 * SMAG), 7);
    line1 = _mm_insert_epi16(line1, *(int16_t *)(srcp + offset + src_stride + 7 * SMAG), 7);

    __m128i left0 = _mm_and_si128(line0, word_255);
    __m128i left1 = _mm_and_si128(line1, word_255);
    __m128i right0 = _mm_srli_epi16(line0, 8);
    __m128i right1 = _mm_srli_epi16(line1, 8);

    left0 = _mm_mullo_epi16(left0, _mm_sub_epi16(word_128, remainder_h));
    left1 = _mm_mullo_epi16(left1, _mm_sub_epi16(word_128, remainder_h));

    right0 = _mm_mullo_epi16(right0, remainder_h);
    right1 = _mm_mullo_epi16(right1, remainder_h);

    line0 = _mm_add_epi16(left0, right0);
    line1 = _mm_add_epi16(left1, right1);

    line0 = _mm_add_epi16(line0, word_64);
    line1 = _mm_add_epi16(line1, word_64);

    line0 = _mm_srai_epi16(line0, 7);
    line1 = _mm_srai_epi16(line1, 7);

    line0 = _mm_mullo_epi16(line0, _mm_sub_epi16(word_128, remainder_v));
    line1 = _mm_mullo_epi16(line1, remainder_v);

    __m128i result = _mm_add_epi16(line0, line1);

    result = _mm_add_epi16(result, word_64);
    result = _mm_srai_epi16(result, 7);

    result = _mm_packus_epi16(result, result);

    _mm_storel_epi64((__m128i *)(dstp + x), result);
}
void ulsch_alamouti(LTE_DL_FRAME_PARMS *frame_parms,  // For Distributed Alamouti Receiver Combining
                    int **rxdataF_comp,
                    int **rxdataF_comp_0,
                    int **rxdataF_comp_1,
                    int **ul_ch_mag,
                    int **ul_ch_magb,
                    int **ul_ch_mag_0,
                    int **ul_ch_magb_0,
                    int **ul_ch_mag_1,
                    int **ul_ch_magb_1,
                    unsigned char symbol,
                    unsigned short nb_rb)
{
  short *rxF, *rxF0, *rxF1;
  __m128i *ch_mag, *ch_magb, *ch_mag0, *ch_mag1, *ch_mag0b, *ch_mag1b;
  unsigned char rb, re, aarx;
  int jj = (symbol*frame_parms->N_RB_DL*12);

  for (aarx = 0; aarx < frame_parms->nb_antennas_rx; aarx++) {
    rxF  = (short*)&rxdataF_comp[aarx][jj];
    rxF0 = (short*)&rxdataF_comp_0[aarx][jj];   // Contains (y)*(h0*)
    rxF1 = (short*)&rxdataF_comp_1[aarx][jj];   // Contains (y*)*(h1)
    ch_mag   = (__m128i *)&ul_ch_mag[aarx][jj];
    ch_mag0  = (__m128i *)&ul_ch_mag_0[aarx][jj];
    ch_mag1  = (__m128i *)&ul_ch_mag_1[aarx][jj];
    ch_magb  = (__m128i *)&ul_ch_magb[aarx][jj];
    ch_mag0b = (__m128i *)&ul_ch_magb_0[aarx][jj];
    ch_mag1b = (__m128i *)&ul_ch_magb_1[aarx][jj];

    for (rb = 0; rb < nb_rb; rb++) {
      for (re = 0; re < 12; re += 2) {
        // Alamouti RX combining
        rxF[0] = rxF0[0] + rxF1[2];   // re((y0)*(h0*)) + re((y1*)*(h1)) = re(x0)
        rxF[1] = rxF0[1] + rxF1[3];   // im((y0)*(h0*)) + im((y1*)*(h1)) = im(x0)
        rxF[2] = rxF0[2] - rxF1[0];   // re((y1)*(h0*)) - re((y0*)*(h1)) = re(x1)
        rxF[3] = rxF0[3] - rxF1[1];   // im((y1)*(h0*)) - im((y0*)*(h1)) = im(x1)
        rxF  += 4;
        rxF0 += 4;
        rxF1 += 4;
      }

      // compute levels for 16QAM or 64QAM llr unit
      ch_mag[0]  = _mm_adds_epi16(ch_mag0[0], ch_mag1[0]);
      ch_mag[1]  = _mm_adds_epi16(ch_mag0[1], ch_mag1[1]);
      ch_mag[2]  = _mm_adds_epi16(ch_mag0[2], ch_mag1[2]);
      ch_magb[0] = _mm_adds_epi16(ch_mag0b[0], ch_mag1b[0]);
      ch_magb[1] = _mm_adds_epi16(ch_mag0b[1], ch_mag1b[1]);
      ch_magb[2] = _mm_adds_epi16(ch_mag0b[2], ch_mag1b[2]);
      ch_mag   += 3;
      ch_mag0  += 3;
      ch_mag1  += 3;
      ch_magb  += 3;
      ch_mag0b += 3;
      ch_mag1b += 3;
    }
  }

  _mm_empty();
  _m_empty();
}
static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr,
                                         ptrdiff_t src_pitch,
                                         uint8_t *output_ptr,
                                         ptrdiff_t out_pitch,
                                         uint32_t output_height,
                                         const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg64;
  __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
  __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
  __m256i srcReg32b11, srcReg32b12, filtersReg32;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // converting the 16 bit (short) to 8 bit (byte) and have the
  // same data in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 256 bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across 256 bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));

  // multiply the size of the source and destination stride by two
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  // load 16 bytes 7 times in stride of src_pitch
  srcReg32b1 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr)));
  srcReg32b2 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)));
  srcReg32b3 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)));
  srcReg32b4 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)));
  srcReg32b5 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
  srcReg32b6 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
  srcReg32b7 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));

  // have each consecutive loads on the same 256 register
  srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
                                       _mm256_castsi256_si128(srcReg32b2), 1);
  srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
                                       _mm256_castsi256_si128(srcReg32b3), 1);
  srcReg32b3 = _mm256_inserti128_si256(srcReg32b3,
                                       _mm256_castsi256_si128(srcReg32b4), 1);
  srcReg32b4 = _mm256_inserti128_si256(srcReg32b4,
                                       _mm256_castsi256_si128(srcReg32b5), 1);
  srcReg32b5 = _mm256_inserti128_si256(srcReg32b5,
                                       _mm256_castsi256_si128(srcReg32b6), 1);
  srcReg32b6 = _mm256_inserti128_si256(srcReg32b6,
                                       _mm256_castsi256_si128(srcReg32b7), 1);

  // merge every two consecutive registers except the last one
  srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
  srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);

  // save
  srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);

  // save
  srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);

  // save
  srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);

  // save
  srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);

  for (i = output_height; i > 1; i -= 2) {
    // load the last 2 loads of 16 bytes and have every two
    // consecutive loads in the same 256 bit register
    srcReg32b8 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
    srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
                                         _mm256_castsi256_si128(srcReg32b8), 1);
    srcReg32b9 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
    srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
                                         _mm256_castsi256_si128(srcReg32b9), 1);

    // merge every two consecutive registers
    // save
    srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
    srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);

    // add and saturate the results together
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);

    // add and saturate the results together
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                                    _mm256_min_epi16(srcReg32b8, srcReg32b12));
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                                    _mm256_max_epi16(srcReg32b8, srcReg32b12));

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);

    // add and saturate the results together
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
                                   _mm256_min_epi16(srcReg32b8, srcReg32b12));
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
                                   _mm256_max_epi16(srcReg32b8, srcReg32b12));

    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);

    // shift by 7 bit each 16 bit
    srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7);
    srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7);

    // shrink to 8 bit each 16 bits, the first lane contains the first
    // convolve result and the second lane contains the second convolve
    // result
    srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);

    src_ptr += src_stride;

    // save 16 bytes
    _mm_store_si128((__m128i*)output_ptr,
                    _mm256_castsi256_si128(srcReg32b1));

    // save the next 16 bytes
    _mm_store_si128((__m128i*)(output_ptr + out_pitch),
                    _mm256_extractf128_si256(srcReg32b1, 1));

    output_ptr += dst_stride;

    // save part of the registers for next strides
    srcReg32b10 = srcReg32b11;
    srcReg32b1 = srcReg32b3;
    srcReg32b11 = srcReg32b2;
    srcReg32b3 = srcReg32b5;
    srcReg32b2 = srcReg32b4;
    srcReg32b5 = srcReg32b7;
    srcReg32b7 = srcReg32b9;
  }

  if (i > 0) {
    __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
    __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
    // load the last 16 bytes
    srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));

    // merge the last 2 results together
    srcRegFilt4 = _mm_unpacklo_epi8(
        _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
    srcRegFilt7 = _mm_unpackhi_epi8(
        _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
                                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4,
                                    _mm256_castsi256_si128(forthFilters));
    srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
                                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7,
                                    _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
                                    _mm256_castsi256_si128(secondFilters));
    srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
                                    _mm256_castsi256_si128(secondFilters));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
                                    _mm256_castsi256_si128(thirdFilters));
    srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
                                    _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                                 _mm_min_epi16(srcRegFilt4, srcRegFilt6));
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
                                 _mm_min_epi16(srcRegFilt5, srcRegFilt7));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                                 _mm_max_epi16(srcRegFilt4, srcRegFilt6));
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
                                 _mm_max_epi16(srcRegFilt5, srcRegFilt7));

    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                                 _mm256_castsi256_si128(addFilterReg64));
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
                                 _mm256_castsi256_si128(addFilterReg64));

    // shift by 7 bit each 16 bit
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
    srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7);

    // shrink to 8 bit each 16 bits, the first lane contains the first
    // convolve result and the second lane contains the second convolve
    // result
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);

    // save 16 bytes
    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
  }
}
static void vpx_filter_block1d4_v4_sse2(const uint8_t *src_ptr,
                                        ptrdiff_t src_stride, uint8_t *dst_ptr,
                                        ptrdiff_t dst_stride, uint32_t height,
                                        const int16_t *kernel) {
  // Register for source s[-1:3, :]
  __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
  // Interleaved rows of the source. lo is first half, hi second
  __m128i src_reg_m10_lo, src_reg_01_lo;
  __m128i src_reg_12_lo, src_reg_23_lo;

  // Half of half of the interleaved rows
  __m128i src_reg_m10_lo_1;
  __m128i src_reg_01_lo_1;
  __m128i src_reg_12_lo_1;
  __m128i src_reg_23_lo_1;

  __m128i kernel_reg;                    // Kernel
  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used

  // Result after multiply and add
  __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo;
  __m128i res_reg_m1012, res_reg_0123;
  __m128i res_reg_m1012_lo, res_reg_0123_lo;

  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
  const __m128i reg_zero = _mm_setzero_si128();

  // We will compute the result two rows at a time
  const ptrdiff_t src_stride_unrolled = src_stride << 1;
  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
  int h;

  // Load Kernel
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);

  // We will load two rows of pixels as 8-bit words, rearrange them as 16-bit
  // words, then shuffle the data into the form
  // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
  // ... s[0,7] s[-1,7] s[0,6] s[-1,6]
  // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
  // ... s[0,13] s[-1,13] s[0,12] s[-1,12]
  // so that we can call multiply and add with the kernel to get 32-bit words
  // of the form
  // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
  // Finally, we can add multiple rows together to get the desired output.

  // First shuffle the data
  src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr);
  src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride));
  src_reg_m10_lo = _mm_unpacklo_epi8(src_reg_m1, src_reg_0);
  src_reg_m10_lo_1 = _mm_unpacklo_epi8(src_reg_m10_lo, _mm_setzero_si128());

  // More shuffling
  src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2));
  src_reg_01_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1);
  src_reg_01_lo_1 = _mm_unpacklo_epi8(src_reg_01_lo, _mm_setzero_si128());

  for (h = height; h > 1; h -= 2) {
    src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3));
    src_reg_12_lo = _mm_unpacklo_epi8(src_reg_1, src_reg_2);

    src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4));
    src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3);

    // Partial output
    res_reg_m10_lo =
        mm_madd_packs_epi16_sse2(&src_reg_m10_lo_1, &reg_zero, &kernel_reg_23);
    res_reg_01_lo =
        mm_madd_packs_epi16_sse2(&src_reg_01_lo_1, &reg_zero, &kernel_reg_23);

    src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128());
    res_reg_12_lo =
        mm_madd_packs_epi16_sse2(&src_reg_12_lo_1, &reg_zero, &kernel_reg_45);

    src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128());
    res_reg_23_lo =
        mm_madd_packs_epi16_sse2(&src_reg_23_lo_1, &reg_zero, &kernel_reg_45);

    // Add to get results
    res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo);
    res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo);

    // Round the words
    res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, &reg_32, 6);
    res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, &reg_32, 6);

    // Convert to 8-bit words
    res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, reg_zero);
    res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, reg_zero);

    // Save only the low 4 bytes of each register
    *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(res_reg_m1012);
    *((uint32_t *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(res_reg_0123);

    // Update the source by two rows
    src_ptr += src_stride_unrolled;
    dst_ptr += dst_stride_unrolled;

    src_reg_m10_lo_1 = src_reg_12_lo_1;
    src_reg_01_lo_1 = src_reg_23_lo_1;
    src_reg_1 = src_reg_3;
  }
}
static alignment_end* sw_sse2_word (const int8_t* ref,
                                    int8_t ref_dir,    // 0: forward ref; 1: reverse ref
                                    int32_t refLen,
                                    int32_t readLen,
                                    const uint8_t weight_gapO, /* will be used as - */
                                    const uint8_t weight_gapE, /* will be used as - */
                                    const __m128i* vProfile,
                                    uint16_t terminate,
                                    int32_t maskLen) {

#define max8(m, vm) (vm) = _mm_max_epi16((vm), _mm_srli_si128((vm), 8)); \
                    (vm) = _mm_max_epi16((vm), _mm_srli_si128((vm), 4)); \
                    (vm) = _mm_max_epi16((vm), _mm_srli_si128((vm), 2)); \
                    (m) = _mm_extract_epi16((vm), 0)

    uint16_t max = 0;                        /* the max alignment score */
    int32_t end_read = readLen - 1;
    int32_t end_ref = 0;  /* 1_based best alignment ending point; initialized as isn't aligned - 0. */
    int32_t segLen = (readLen + 7) / 8;      /* number of segments */

    /* array to record the largest score of each reference position */
    uint16_t* maxColumn = (uint16_t*) calloc(refLen, 2);

    /* array to record the alignment read ending position of the largest score of each reference position */
    int32_t* end_read_column = (int32_t*) calloc(refLen, sizeof(int32_t));

    /* Define 16 byte 0 vector. */
    __m128i vZero = _mm_set1_epi32(0);

    __m128i* pvHStore = (__m128i*) calloc(segLen, sizeof(__m128i));
    __m128i* pvHLoad = (__m128i*) calloc(segLen, sizeof(__m128i));
    __m128i* pvE = (__m128i*) calloc(segLen, sizeof(__m128i));
    __m128i* pvHmax = (__m128i*) calloc(segLen, sizeof(__m128i));

    int32_t i, j, k;

    /* 16 byte insertion begin vector */
    __m128i vGapO = _mm_set1_epi16(weight_gapO);

    /* 16 byte insertion extension vector */
    __m128i vGapE = _mm_set1_epi16(weight_gapE);

    __m128i vMaxScore = vZero;  /* Trace the highest score of the whole SW matrix. */
    __m128i vMaxMark = vZero;   /* Trace the highest score till the previous column. */
    __m128i vTemp;
    int32_t edge, begin = 0, end = refLen, step = 1;

    /* outer loop to process the reference sequence */
    if (ref_dir == 1) {
        begin = refLen - 1;
        end = -1;
        step = -1;
    }
    for (i = begin; LIKELY(i != end); i += step) {
        int32_t cmp;
        __m128i e, vF = vZero;  /* Initialize F value to 0. Any errors to vH values will be corrected in the Lazy_F loop. */
        __m128i vH = pvHStore[segLen - 1];
        vH = _mm_slli_si128 (vH, 2);  /* Shift the 128-bit value in vH left by 2 bytes. */

        /* Swap the 2 H buffers. */
        __m128i* pv = pvHLoad;

        __m128i vMaxColumn = vZero;  /* vMaxColumn is used to record the max values of column i. */

        const __m128i* vP = vProfile + ref[i] * segLen;  /* Right part of the vProfile */
        pvHLoad = pvHStore;
        pvHStore = pv;

        /* inner loop to process the query sequence */
        for (j = 0; LIKELY(j < segLen); j ++) {
            vH = _mm_adds_epi16(vH, _mm_load_si128(vP + j));

            /* Get max from vH, vE and vF. */
            e = _mm_load_si128(pvE + j);
            vH = _mm_max_epi16(vH, e);
            vH = _mm_max_epi16(vH, vF);
            vMaxColumn = _mm_max_epi16(vMaxColumn, vH);

            /* Save vH values. */
            _mm_store_si128(pvHStore + j, vH);

            /* Update vE value. */
            vH = _mm_subs_epu16(vH, vGapO);  /* saturation arithmetic, result >= 0 */
            e = _mm_max_epi16(e, vH);
            e = _mm_subs_epu16(e, vGapE);
            _mm_store_si128(pvE + j, e);

            /* Update vF value. */
            vF = _mm_max_epi16(vF, vH);
            vF = _mm_subs_epu16(vF, vGapE);

            /* Load the next vH. */
            vH = _mm_load_si128(pvHLoad + j);
        }

        /* Lazy_F loop: has been revised to disallow adjacent insertion and
           then deletion, so don't update E(i, j), learn from SWPS3 */
        for (k = 0; LIKELY(k < 8); ++k) {
            vF = _mm_slli_si128 (vF, 2);
            for (j = 0; LIKELY(j < segLen); ++j) {
                vH = _mm_load_si128(pvHStore + j);
                vH = _mm_max_epi16(vH, vF);
                _mm_store_si128(pvHStore + j, vH);
                vH = _mm_subs_epu16(vH, vGapO);
                vF = _mm_subs_epu16(vF, vGapE);
                if (UNLIKELY(! _mm_movemask_epi8(_mm_cmpgt_epi16(vF, vH)))) goto end;
            }
        }

end:
        vMaxScore = _mm_max_epi16(vMaxScore, vMaxColumn);
        vTemp = _mm_cmpeq_epi16(vMaxMark, vMaxScore);
        cmp = _mm_movemask_epi8(vTemp);
        if (cmp != 0xffff) {
            uint16_t temp;
            vMaxMark = vMaxScore;
            max8(temp, vMaxScore);
            vMaxScore = vMaxMark;

            if (LIKELY(temp > max)) {
                max = temp;
                end_ref = i;
                for (j = 0; LIKELY(j < segLen); ++j) pvHmax[j] = pvHStore[j];
            }
        }

        /* Record the max score of current column. */
        max8(maxColumn[i], vMaxColumn);
        if (maxColumn[i] == terminate) break;
    }

    /* Trace the alignment ending position on read. */
    uint16_t *t = (uint16_t*)pvHmax;
    int32_t column_len = segLen * 8;
    for (i = 0; LIKELY(i < column_len); ++i, ++t) {
        int32_t temp;
        if (*t == max) {
            temp = i / 8 + i % 8 * segLen;
            if (temp < end_read) end_read = temp;
        }
    }

    free(pvHmax);
    free(pvE);
    free(pvHLoad);
    free(pvHStore);

    /* Find the most possible 2nd best alignment. */
    alignment_end* bests = (alignment_end*) calloc(2, sizeof(alignment_end));
    bests[0].score = max;
    bests[0].ref = end_ref;
    bests[0].read = end_read;
    bests[1].score = 0;
    bests[1].ref = 0;
    bests[1].read = 0;

    edge = (end_ref - maskLen) > 0 ? (end_ref - maskLen) : 0;
    for (i = 0; i < edge; i ++) {
        if (maxColumn[i] > bests[1].score) {
            bests[1].score = maxColumn[i];
            bests[1].ref = i;
        }
    }
    edge = (end_ref + maskLen) > refLen ? refLen : (end_ref + maskLen);
    for (i = edge; i < refLen; i ++) {
        if (maxColumn[i] > bests[1].score) {
            bests[1].score = maxColumn[i];
            bests[1].ref = i;
        }
    }

    free(maxColumn);
    free(end_read_column);
    return bests;
}
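The max8 macro used above is a horizontal max reduction: each _mm_srli_si128 halves the number of lanes still in play, so three max/shift rounds leave the largest of the eight 16-bit values in lane 0. The same idea as a standalone function (my naming, same intrinsics):

#include <emmintrin.h>
#include <stdint.h>

// Horizontal max of eight signed 16-bit lanes.
static int16_t hmax_epi16(__m128i v) {
    v = _mm_max_epi16(v, _mm_srli_si128(v, 8));  // fold lanes 4-7 into 0-3
    v = _mm_max_epi16(v, _mm_srli_si128(v, 4));  // fold again: 2 candidates
    v = _mm_max_epi16(v, _mm_srli_si128(v, 2));  // final fold: 1 candidate
    return (int16_t)_mm_extract_epi16(v, 0);
}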
void aom_filter_block1d4_h8_intrin_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i firstFilters, secondFilters, shuffle1, shuffle2;
  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
  __m128i addFilterReg64, filtersReg, srcReg, minReg;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits in the filter into the first lane
  firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
  // duplicate only the third 16 bits in the filter into the first lane
  secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
  // duplicate only the second 16 bits in the filter into the second lane
  // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
  firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
  // duplicate only the fourth 16 bits in the filter into the second lane
  // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
  secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);

  // loading the local filters
  shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8);
  shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);

  for (i = 0; i < output_height; i++) {
    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1 = _mm_shuffle_epi8(srcReg, shuffle1);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg, shuffle2);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);

    // extract the higher half of the lane
    srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
    srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);

    minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);

    // add and saturate all the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
    srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift by 7 bit each 16 bits
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // shrink to 8 bit each 16 bits
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
    src_ptr += src_pixels_per_line;

    // save only 4 bytes
    *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1);

    output_ptr += output_pitch;
  }
}
void aom_filter_block1d8_v8_intrin_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i addFilterReg64, filtersReg, minReg;
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
  __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
  __m128i srcReg8;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits in the filter
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
  // duplicate only the second 16 bits in the filter
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits in the filter
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits in the filter
  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));

  // load the first 7 rows of 8 bytes
  srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr);
  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
  srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
  srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));

  for (i = 0; i < output_height; i++) {
    // load the last 8 bytes
    srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));

    // merge the result together
    srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
    srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);

    // merge the result together
    srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
    srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);

    // add and saturate the results together
    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift by 7 bit each 16 bit
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // shrink to 8 bit each 16 bits
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);

    src_ptr += src_pitch;

    // shift down a row
    srcReg1 = srcReg2;
    srcReg2 = srcReg3;
    srcReg3 = srcReg4;
    srcReg4 = srcReg5;
    srcReg5 = srcReg6;
    srcReg6 = srcReg7;
    srcReg7 = srcReg8;

    // save only 8 bytes convolve result
    _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);

    output_ptr += out_pitch;
  }
}
void aom_filter_block1d8_h8_intrin_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
  __m128i addFilterReg64, filtersReg, minReg;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 128 bit register
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across 128 bit register
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 128 bit register
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across 128 bit register
  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));

  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);

  for (i = 0; i < output_height; i++) {
    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg);
    srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);

    // add and saturate all the results together
    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);

    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift by 7 bits each 16-bit value
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // shrink to 8 bit each 16 bits
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);

    src_ptr += src_pixels_per_line;

    // save only 8 bytes
    _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);

    output_ptr += output_pitch;
  }
}
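/*
 * The filtN_global shuffle masks loaded above are defined elsewhere in the
 * file. A sketch of their assumed contents (the usual libvpx/libaom layout;
 * 16-byte aligned): each mask makes pshufb emit the overlapping byte pairs
 * (x[n], x[n+1]) that pmaddubsw multiplies against one duplicated tap pair.
 */
static const uint8_t filt1_global[16] __attribute__((aligned(16))) = {
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
};
static const uint8_t filt2_global[16] __attribute__((aligned(16))) = {
  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
};
static const uint8_t filt3_global[16] __attribute__((aligned(16))) = {
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
};
static const uint8_t filt4_global[16] __attribute__((aligned(16))) = {
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
};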
int smith_waterman_sse2_word(const unsigned char *query_sequence,
                             unsigned short *query_profile_word,
                             const int query_length,
                             const unsigned char *db_sequence,
                             const int db_length, unsigned short gap_open,
                             unsigned short gap_extend,
                             struct f_struct *f_str)
{
  int i, j, k;
  short score;
  int cmp;
  int iter = (query_length + 7) / 8;

  __m128i *p;
  __m128i *workspace = (__m128i *) f_str->workspace;

  __m128i E, F, H;

  __m128i v_maxscore;
  __m128i v_gapopen;
  __m128i v_gapextend;

  __m128i v_min;
  __m128i v_minimums;
  __m128i v_temp;

  __m128i *pHLoad, *pHStore;
  __m128i *pE;

  __m128i *pScore;

  /* Load gap opening penalty to all elements of a constant */
  v_gapopen = _mm_setzero_si128();  /* Apple Devel */
  v_gapopen = _mm_insert_epi16 (v_gapopen, gap_open, 0);
  v_gapopen = _mm_shufflelo_epi16 (v_gapopen, 0);
  v_gapopen = _mm_shuffle_epi32 (v_gapopen, 0);

  /* Load gap extension penalty to all elements of a constant */
  v_gapextend = _mm_setzero_si128();  /* Apple Devel */
  v_gapextend = _mm_insert_epi16 (v_gapextend, gap_extend, 0);
  v_gapextend = _mm_shufflelo_epi16 (v_gapextend, 0);
  v_gapextend = _mm_shuffle_epi32 (v_gapextend, 0);

  /* load v_maxscore with zeros. since we are using signed */
  /* math, we will bias the maxscore to -32768 so we have the */
  /* full range of the short. */
  v_maxscore = _mm_setzero_si128();  /* Apple Devel */
  v_maxscore = _mm_cmpeq_epi16 (v_maxscore, v_maxscore);
  v_maxscore = _mm_slli_epi16 (v_maxscore, 15);

  v_minimums = _mm_shuffle_epi32 (v_maxscore, 0);

  v_min = _mm_shuffle_epi32 (v_maxscore, 0);
  v_min = _mm_srli_si128 (v_min, 14);

  /* Zero out the storage vector */
  k = 2 * iter;

  p = workspace;
  for (i = 0; i < k; i++) {
    _mm_store_si128 (p++, v_maxscore);
  }

  pE = workspace;
  pHStore = pE + iter;
  pHLoad = pHStore + iter;

  for (i = 0; i < db_length; ++i) {
    /* fetch first data asap. */
    pScore = (__m128i *) query_profile_word + db_sequence[i] * iter;

    /* bias all elements in F to -32768 */
    F = _mm_setzero_si128();  /* Apple Devel */
    F = _mm_cmpeq_epi16 (F, F);
    F = _mm_slli_epi16 (F, 15);

    /* load the next h value */
    H = _mm_load_si128 (pHStore + iter - 1);
    H = _mm_slli_si128 (H, 2);
    H = _mm_or_si128 (H, v_min);

    p = pHLoad;
    pHLoad = pHStore;
    pHStore = p;

    for (j = 0; j < iter; j++) {
      /* load E values */
      E = _mm_load_si128 (pE + j);

      /* add score to H */
      H = _mm_adds_epi16 (H, *pScore++);

      /* Update highest score encountered thus far */
      v_maxscore = _mm_max_epi16 (v_maxscore, H);

      /* get max from H, E and F */
      H = _mm_max_epi16 (H, E);
      H = _mm_max_epi16 (H, F);

      /* save H values */
      _mm_store_si128 (pHStore + j, H);

      /* subtract the gap open penalty from H */
      H = _mm_subs_epi16 (H, v_gapopen);

      /* update E value */
      E = _mm_subs_epi16 (E, v_gapextend);
      E = _mm_max_epi16 (E, H);

      /* update F value */
      F = _mm_subs_epi16 (F, v_gapextend);
      F = _mm_max_epi16 (F, H);

      /* save E values */
      _mm_store_si128 (pE + j, E);

      /* load the next h value */
      H = _mm_load_si128 (pHLoad + j);
    }

    /* reset pointers to the start of the saved data */
    j = 0;
    H = _mm_load_si128 (pHStore + j);

    /* the computed F value is for the given column. since */
    /* we are at the end, we need to shift the F value over */
    /* to the next column. */
    F = _mm_slli_si128 (F, 2);
    F = _mm_or_si128 (F, v_min);
    v_temp = _mm_subs_epi16 (H, v_gapopen);
    v_temp = _mm_cmpgt_epi16 (F, v_temp);
    cmp = _mm_movemask_epi8 (v_temp);

    while (cmp != 0x0000) {
      E = _mm_load_si128 (pE + j);

      H = _mm_max_epi16 (H, F);

      /* save H values */
      _mm_store_si128 (pHStore + j, H);

      /* update E in case the new H value would change it */
      H = _mm_subs_epi16 (H, v_gapopen);
      E = _mm_max_epi16 (E, H);
      _mm_store_si128 (pE + j, E);

      /* update F value */
      F = _mm_subs_epi16 (F, v_gapextend);

      j++;
      if (j >= iter) {
        j = 0;
        F = _mm_slli_si128 (F, 2);
        F = _mm_or_si128 (F, v_min);
      }

      H = _mm_load_si128 (pHStore + j);

      v_temp = _mm_subs_epi16 (H, v_gapopen);
      v_temp = _mm_cmpgt_epi16 (F, v_temp);
      cmp = _mm_movemask_epi8 (v_temp);
    }
  }

  /* find largest score in the v_maxscore vector */
  v_temp = _mm_srli_si128 (v_maxscore, 8);
  v_maxscore = _mm_max_epi16 (v_maxscore, v_temp);
  v_temp = _mm_srli_si128 (v_maxscore, 4);
  v_maxscore = _mm_max_epi16 (v_maxscore, v_temp);
  v_temp = _mm_srli_si128 (v_maxscore, 2);
  v_maxscore = _mm_max_epi16 (v_maxscore, v_temp);

  /* extract the largest score */
  score = _mm_extract_epi16 (v_maxscore, 0);

  /* return largest score biased by 32768 */

  /* fix for Mac OSX clang 4.1 */
  /*
#ifdef __clang__
  if (score < 0) score += 32768;
  return score;
#else
  */
  return score + 32768;
  /* #endif */
}
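/*
 * Editor's sketch of the bias trick used above: every cell starts at -32768
 * (all-ones shifted left by 15), which gives signed saturating adds the full
 * dynamic range of an unsigned short; _mm_adds_epi16 also clamps underflow at
 * -32768, which stands in for the max(0, ...) of Smith-Waterman. The final
 * "score + 32768" removes the bias:
 */
#include <assert.h>
#include <stdint.h>

int main(void) {
  const int32_t BIAS = -32768;
  int16_t h = (int16_t)BIAS;        /* cell starts at true score 0 */
  h = (int16_t)(h + 25);            /* match score */
  h = (int16_t)(h - 11);            /* gap penalty */
  assert((int32_t)h - BIAS == 14);  /* unbiased score */
  return 0;
}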
pstatus_t sse2_alphaComp_argb(
    const BYTE* pSrc1, UINT32 src1Step,
    const BYTE* pSrc2, UINT32 src2Step,
    BYTE* pDst, UINT32 dstStep,
    UINT32 width, UINT32 height)
{
  const UINT32* sptr1 = (const UINT32*) pSrc1;
  const UINT32* sptr2 = (const UINT32*) pSrc2;
  UINT32* dptr;
  int linebytes, src1Jump, src2Jump, dstJump;
  UINT32 y;
  __m128i xmm0, xmm1;

  if ((width <= 0) || (height <= 0)) return PRIMITIVES_SUCCESS;

  if (width < 4) /* pointless if too small */
  {
    return generic->alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step,
        pDst, dstStep, width, height);
  }

  dptr = (UINT32*) pDst;
  linebytes = width * sizeof(UINT32);
  src1Jump = (src1Step - linebytes) / sizeof(UINT32);
  src2Jump = (src2Step - linebytes) / sizeof(UINT32);
  dstJump = (dstStep - linebytes) / sizeof(UINT32);
  xmm0 = _mm_set1_epi32(0);
  xmm1 = _mm_set1_epi16(1);

  for (y = 0; y < height; ++y)
  {
    int pixels = width;
    int count;
    /* Get to the 16-byte boundary now. */
    int leadIn = 0;

    switch ((ULONG_PTR) dptr & 0x0f)
    {
      case 0:
        leadIn = 0;
        break;
      case 4:
        leadIn = 3;
        break;
      case 8:
        leadIn = 2;
        break;
      case 12:
        leadIn = 1;
        break;
      default:
        /* We'll never hit a 16-byte boundary, so do the whole
         * thing the slow way. */
        leadIn = width;
        break;
    }

    if (leadIn)
    {
      pstatus_t status;
      status = generic->alphaComp_argb((const BYTE*) sptr1, src1Step,
          (const BYTE*) sptr2, src2Step, (BYTE*) dptr, dstStep, leadIn, 1);
      if (status != PRIMITIVES_SUCCESS) return status;
      sptr1 += leadIn;
      sptr2 += leadIn;
      dptr += leadIn;
      pixels -= leadIn;
    }

    /* Use SSE registers to do 4 pixels at a time. */
    count = pixels >> 2;
    pixels -= count << 2;

    while (count--)
    {
      __m128i xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
      /* BdGdRdAdBcGcRcAcBbGbRbAbBaGaRaAa */
      xmm2 = LOAD_SI128(sptr1);
      sptr1 += 4;
      /* BhGhRhAhBgGgRgAgBfGfRfAfBeGeReAe */
      xmm3 = LOAD_SI128(sptr2);
      sptr2 += 4;
      /* 00Bb00Gb00Rb00Ab00Ba00Ga00Ra00Aa */
      xmm4 = _mm_unpackhi_epi8(xmm2, xmm0);
      /* 00Bf00Gf00Rf00Af00Be00Ge00Re00Ae */
      xmm5 = _mm_unpackhi_epi8(xmm3, xmm0);
      /* subtract */
      xmm6 = _mm_subs_epi16(xmm4, xmm5);
      /* 00Bb00Gb00Rb00Ab00Aa00Aa00Aa00Aa */
      xmm4 = _mm_shufflelo_epi16(xmm4, 0xff);
      /* 00Ab00Ab00Ab00Ab00Aa00Aa00Aa00Aa */
      xmm4 = _mm_shufflehi_epi16(xmm4, 0xff);
      /* Add one to alphas */
      xmm4 = _mm_adds_epi16(xmm4, xmm1);
      /* Multiply and take low word */
      xmm4 = _mm_mullo_epi16(xmm4, xmm6);
      /* Shift 8 right */
      xmm4 = _mm_srai_epi16(xmm4, 8);
      /* Add xmm5 */
      xmm4 = _mm_adds_epi16(xmm4, xmm5);
      /* 00Bj00Gj00Rj00Aj00Bi00Gi00Ri00Ai */
      /* 00Bd00Gd00Rd00Ad00Bc00Gc00Rc00Ac */
      xmm5 = _mm_unpacklo_epi8(xmm2, xmm0);
      /* 00Bh00Gh00Rh00Ah00Bg00Gg00Rg00Ag */
      xmm6 = _mm_unpacklo_epi8(xmm3, xmm0);
      /* subtract */
      xmm7 = _mm_subs_epi16(xmm5, xmm6);
      /* 00Bd00Gd00Rd00Ad00Ac00Ac00Ac00Ac */
      xmm5 = _mm_shufflelo_epi16(xmm5, 0xff);
      /* 00Ad00Ad00Ad00Ad00Ac00Ac00Ac00Ac */
      xmm5 = _mm_shufflehi_epi16(xmm5, 0xff);
      /* Add one to alphas */
      xmm5 = _mm_adds_epi16(xmm5, xmm1);
      /* Multiply and take low word */
      xmm5 = _mm_mullo_epi16(xmm5, xmm7);
      /* Shift 8 right */
      xmm5 = _mm_srai_epi16(xmm5, 8);
      /* Add xmm6 */
      xmm5 = _mm_adds_epi16(xmm5, xmm6);
      /* 00Bl00Gl00Rl00Al00Bk00Gk00Rk00Ak */
      /* Must mask off remainders or pack gets confused */
      xmm3 = _mm_set1_epi16(0x00ffU);
      xmm4 = _mm_and_si128(xmm4, xmm3);
      xmm5 = _mm_and_si128(xmm5, xmm3);
      /* BlGlRlAlBkGkRkAkBjGjRjAjBiGiRiAi */
      xmm5 = _mm_packus_epi16(xmm5, xmm4);
      _mm_store_si128((__m128i*) dptr, xmm5);
      dptr += 4;
    }

    /* Finish off the remainder. */
    if (pixels)
    {
      pstatus_t status;
      status = generic->alphaComp_argb((const BYTE*) sptr1, src1Step,
          (const BYTE*) sptr2, src2Step, (BYTE*) dptr, dstStep, pixels, 1);
      if (status != PRIMITIVES_SUCCESS) return status;
      sptr1 += pixels;
      sptr2 += pixels;
      dptr += pixels;
    }

    /* Jump to next row. */
    sptr1 += src1Jump;
    sptr2 += src2Jump;
    dptr += dstJump;
  }

  return PRIMITIVES_SUCCESS;
}
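/*
 * Editor's scalar sketch of the per-channel math in the SSE loop above (an
 * illustration mirroring the vector ops, not FreeRDP's generic fallback):
 * dst = src2 + (((alpha1 + 1) * (src1 - src2)) >> 8), with the product
 * truncated to its low 16 bits exactly as _mm_mullo_epi16 does.
 */
#include <stdint.h>

static uint8_t alpha_comp_channel(uint8_t s1, uint8_t s2, uint8_t a1) {
  const int16_t diff = (int16_t)((int16_t)s1 - (int16_t)s2); /* _mm_subs_epi16 */
  const int16_t prod = (int16_t)((a1 + 1) * diff);  /* _mm_mullo_epi16: low word */
  const int16_t t = (int16_t)(prod >> 8);           /* _mm_srai_epi16 */
  const int16_t d = (int16_t)(t + s2);              /* _mm_adds_epi16 */
  return (uint8_t)(d & 0xff);                       /* mask, then pack */
}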
void vp10_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
                             int stride, int tx_type) {
  __m128i in[8];
  const __m128i zero = _mm_setzero_si128();
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);

  // load input data
  in[0] = load_input_data(input);
  in[1] = load_input_data(input + 8 * 1);
  in[2] = load_input_data(input + 8 * 2);
  in[3] = load_input_data(input + 8 * 3);
  in[4] = load_input_data(input + 8 * 4);
  in[5] = load_input_data(input + 8 * 5);
  in[6] = load_input_data(input + 8 * 6);
  in[7] = load_input_data(input + 8 * 7);

  switch (tx_type) {
    case 0:  // DCT_DCT
      idct8_sse2(in);
      idct8_sse2(in);
      break;
    case 1:  // ADST_DCT
      idct8_sse2(in);
      iadst8_sse2(in);
      break;
    case 2:  // DCT_ADST
      iadst8_sse2(in);
      idct8_sse2(in);
      break;
    case 3:  // ADST_ADST
      iadst8_sse2(in);
      iadst8_sse2(in);
      break;
    default:
      assert(0);
      break;
  }

  // Final rounding and shift
  in[0] = _mm_adds_epi16(in[0], final_rounding);
  in[1] = _mm_adds_epi16(in[1], final_rounding);
  in[2] = _mm_adds_epi16(in[2], final_rounding);
  in[3] = _mm_adds_epi16(in[3], final_rounding);
  in[4] = _mm_adds_epi16(in[4], final_rounding);
  in[5] = _mm_adds_epi16(in[5], final_rounding);
  in[6] = _mm_adds_epi16(in[6], final_rounding);
  in[7] = _mm_adds_epi16(in[7], final_rounding);

  in[0] = _mm_srai_epi16(in[0], 5);
  in[1] = _mm_srai_epi16(in[1], 5);
  in[2] = _mm_srai_epi16(in[2], 5);
  in[3] = _mm_srai_epi16(in[3], 5);
  in[4] = _mm_srai_epi16(in[4], 5);
  in[5] = _mm_srai_epi16(in[5], 5);
  in[6] = _mm_srai_epi16(in[6], 5);
  in[7] = _mm_srai_epi16(in[7], 5);

  RECON_AND_STORE(dest + 0 * stride, in[0]);
  RECON_AND_STORE(dest + 1 * stride, in[1]);
  RECON_AND_STORE(dest + 2 * stride, in[2]);
  RECON_AND_STORE(dest + 3 * stride, in[3]);
  RECON_AND_STORE(dest + 4 * stride, in[4]);
  RECON_AND_STORE(dest + 5 * stride, in[5]);
  RECON_AND_STORE(dest + 6 * stride, in[6]);
  RECON_AND_STORE(dest + 7 * stride, in[7]);
}
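/*
 * The 1 << 4 bias plus the arithmetic shift by 5 implement round(x / 32) for
 * the inverse-transform output. RECON_AND_STORE is a macro defined elsewhere;
 * a sketch of its assumed libvpx-style shape (add the residual row to the
 * 8-bit prediction and clamp), using the `zero` register from the function:
 */
#define RECON_AND_STORE_SKETCH(dest, in_x)               \
  {                                                      \
    __m128i d0 = _mm_loadl_epi64((__m128i *)(dest));     \
    d0 = _mm_unpacklo_epi8(d0, zero);    /* u8 -> s16 */ \
    d0 = _mm_add_epi16(in_x, d0);        /* + residual */\
    d0 = _mm_packus_epi16(d0, d0);       /* clamp 0..255 */\
    _mm_storel_epi64((__m128i *)(dest), d0);             \
  }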
static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr,
                                         ptrdiff_t src_pixels_per_line,
                                         uint8_t *output_ptr,
                                         ptrdiff_t output_pitch,
                                         uint32_t output_height,
                                         const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
  __m256i srcReg32b1, srcReg32b2, filtersReg32;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 256 bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across 256 bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));

  filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2);
  filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2);
  filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2);
  filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2);

  // multiply the size of the source and destination strides by two
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
  for (i = output_height; i > 1; i -= 2) {
    // load the 2 strides of source
    srcReg32b1 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr - 3)));
    srcReg32b1 = _mm256_inserti128_si256(
        srcReg32b1,
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)),
        1);

    // filter the source buffer
    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(
        srcRegFilt32b1_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));

    // reading 2 strides of the next 16 bytes
    // (part of it was being read by earlier read)
    srcReg32b2 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + 5)));
    srcReg32b2 = _mm256_inserti128_si256(
        srcReg32b2,
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)),
        1);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(
        srcRegFilt32b1_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));

    // filter the source buffer
    srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);

    // add and saturate the results together
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // add and saturate the results together
    srcRegFilt32b2_1 = _mm256_adds_epi16(
        srcRegFilt32b2_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
    srcRegFilt32b2_1 = _mm256_adds_epi16(
        srcRegFilt32b2_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));

    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64);
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64);

    // shift by 7 bits each 16-bit value
    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7);
    srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7);

    // shrink to 8 bit each 16 bits; the first lane contains the first
    // convolve result and the second lane contains the second convolve
    // result
    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);

    src_ptr += src_stride;

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr,
                    _mm256_castsi256_si128(srcRegFilt32b1_1));

    // save the next 16 bytes
    _mm_store_si128((__m128i *)(output_ptr + output_pitch),
                    _mm256_extractf128_si256(srcRegFilt32b1_1, 1));
    output_ptr += dst_stride;
  }

  // if the number of strides is odd,
  // process only 16 bytes
  if (i > 0) {
    __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
    __m128i srcRegFilt2, srcRegFilt3;

    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1_1 =
        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 =
        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2));

    // reading the next 16 bytes
    // (part of it was being read by earlier read)
    srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));

    // add and saturate the results together
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    // filter the source buffer
    srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt2_1 =
        _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 =
        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt2_1 =
        _mm_adds_epi16(srcRegFilt2_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2));
    srcRegFilt2_1 =
        _mm_adds_epi16(srcRegFilt2_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg64));
    srcRegFilt2_1 =
        _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg64));

    // shift by 7 bits each 16-bit value
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);

    // shrink to 8 bit each 16 bits; the first lane contains the first
    // convolve result and the second lane contains the second convolve
    // result
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1);
  }
}
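/*
 * MM256_BROADCASTSI128_SI256 above is a compatibility macro defined elsewhere
 * (older compilers lack _mm256_broadcastsi128_si256). Its assumed effect is to
 * copy one 128-bit value into both lanes of a 256-bit register; a portable
 * sketch:
 */
#define MM256_BROADCASTSI128_SI256_SKETCH(x) \
  _mm256_insertf128_si256(_mm256_castsi128_si256(x), (x), 1)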
void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr,
                                          unsigned int src_pixels_per_line,
                                          unsigned char *output_ptr,
                                          unsigned int output_pitch,
                                          unsigned int output_height,
                                          int16_t *filter) {
  __m128i addFilterReg64, filtersReg, srcReg1, srcReg2;
  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((__m128i *)filter);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 128 bit register
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across 128 bit register
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 128 bit register
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across 128 bit register
  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));

  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);

  for (i = 0; i < output_height; i++) {
    srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, filt1Reg);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg1, filt2Reg);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);

    // add and saturate the results together
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2));

    // reading the next 16 bytes
    // (part of it was being read by earlier read)
    srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr + 5));

    // add and saturate the results together
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    // filter the source buffer
    srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, filt1Reg);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);

    // add and saturate the results together
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg2, filt2Reg);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);

    // add and saturate the results together
    srcRegFilt2_1 =
        _mm_adds_epi16(srcRegFilt2_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2));
    srcRegFilt2_1 =
        _mm_adds_epi16(srcRegFilt2_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64);
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64);

    // shift by 7 bits each 16-bit value
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);

    // shrink to 8 bit each 16 bits; the first lane contains the first
    // convolve result and the second lane contains the second convolve
    // result
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);

    src_ptr += src_pixels_per_line;

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1);

    output_ptr += output_pitch;
  }
}
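/*
 * Editor's usage sketch (hypothetical, not from the library): driving the
 * 16-wide horizontal 8-tap kernel above over a block. The taps are Q7 values
 * that sum to 128 and must fit int8 after the pack inside the kernel; the
 * example kernel here is illustrative only. src must be readable from
 * src - 3 through src + 20 on each row, and dst/dst_stride must be 16-byte
 * aligned because the kernel uses _mm_store_si128.
 */
#include <stdint.h>

static void filter_block_16xh(const uint8_t *src, unsigned src_stride,
                              uint8_t *dst, unsigned dst_stride,
                              unsigned height) {
  static int16_t taps[8] = { -1, 3, -10, 72, 72, -10, 3, -1 }; /* sums to 128 */
  vp9_filter_block1d16_h8_intrin_ssse3((unsigned char *)src, src_stride, dst,
                                       dst_stride, height, taps);
}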
int viterbi_stream_word_partitioned(DATA_STREAM* dstream, float* opt_res, int thrid)
{
  // if (NTHREADS > 1) pthread_barrier_wait(&dstream->barrier); return 0;
  int L = dstream->L;
  P7_PROFILE* gm = dstream->gm;
  ESL_DSQ** ddsq = dstream->seqs;
  int M = gm->M, i, k, v, t, j;
  const int PARTITION = dstream->partition;
  __m128i** oprmsc = (__m128i**) dstream->rsc_msc;
  __m128i* xmxEv = dstream->xmxE;
  __m128i xmxB, xmxE, xmxC, moveC, Vinf = _mm_set1_epi16(-WORDMAX);
  __m128i dmx[PARTITION];
  __m128i mmx[PARTITION];
  __m128i imx[PARTITION];
  __m128i xmm[24];
  __m128i *mscore[8];
  __m128i overflowlimit, overflows;
  overflowlimit = overflows = Vinf;

  if (thrid == NTHREADS-1)
  {
    overflowlimit = _mm_set1_epi16(WORDMAX-1);
    overflows = _mm_xor_si128(overflows, overflows);  // zero out
  }

  t = ((dstream->Npartitions+thrid)%NTHREADS)*PARTITION;
  tprintf("START viterbiThr %d in %d L %d | Seq %d\n", thrid, t, L, 0);  // ccount[thrid]++);

  xmxC = Vinf;
  moveC = _mm_set1_epi16(discretize(dstream->scale, gm->xsc[p7P_C][p7P_MOVE]));
  xmxB = _mm_set1_epi16(dstream->wordoffset +
                        discretize(dstream->scale, gm->xsc[p7P_N][p7P_MOVE]));

  for ( ; t < M; t += NTHREADS*PARTITION)
  {
    volatile uchar* synchflags1 = dstream->synchflags[t/PARTITION];
    volatile uchar* synchflags2 = dstream->synchflags[t/PARTITION+1];
    int t8 = t/8;

    for (k = 0; k < PARTITION; k++)
      dmx[k] = mmx[k] = imx[k] = Vinf;

    for (i = 1; i <= L; i++)
    {
      // tprintf("Iter Thr %d t %d: I %d\n", thrid, t, i);
      __m128i sc, dcv, temp, mpv, ipv, dpv;
      __m128i *ttsc = dstream->tsc_all + t*8;
      v = i-1;
      ttsc += 3;

      if (t == 0)
        xmxE = mpv = dpv = ipv = sc = dcv = Vinf;
      else
      {
        if (NTHREADS > 1)
          while (!synchflags1[v]) sched_yield();
        xmxE = xmxEv[v];
        dcv = dstream->pdcv[v];
        sc = dstream->psc[v];
      }

      for (j = 0; j < 8; j++)
        mscore[j] = oprmsc[ddsq[j][i]] + t8;

      for (k = 0; k < PARTITION && t+k < M; )
      {
#if 0
#define EMLOAD(i) xmm[i+24] = _mm_load_si128(mscore[i]);
        EMLOAD(0)  EMLOAD(1)  EMLOAD(2)  EMLOAD(3)
        EMLOAD(4)  EMLOAD(5)  EMLOAD(6)  EMLOAD(7)

#define MIX16(i,r,range) \
        xmm[r  ] = _mm_unpacklo_epi##range(xmm[24+i], xmm[24+i+1]); \
        xmm[r+1] = _mm_unpackhi_epi##range(xmm[24+i], xmm[24+i+1]);
        MIX16(0,0,16)  MIX16(2,2,16)  MIX16(4,4,16)  MIX16(6,6,16)
#else
#define MMLOAD(a,b) \
        xmm[a] = _mm_unpacklo_epi16(*mscore[a], *mscore[b]); \
        xmm[b] = _mm_unpackhi_epi16(*mscore[a], *mscore[b]);
        MMLOAD(0,1)  MMLOAD(2,3)  MMLOAD(4,5)  MMLOAD(6,7)
#endif

#define MIX(i,r,range) \
        xmm[r  ] = _mm_unpacklo_epi##range(xmm[i], xmm[i+2]); \
        xmm[r+1] = _mm_unpackhi_epi##range(xmm[i], xmm[i+2]);
        MIX(0, 8,32)  MIX(1,12,32)  MIX(4,10,32)  MIX(5,14,32)
        MIX( 8,16,64) MIX( 9,18,64) MIX(12,20,64) MIX(13,22,64)

#define TRIPLETCOMPUTE(k,j) \
        { /* Calculate new M(k), delay store */ \
          sc = _mm_max_epi16(sc, _mm_adds_epi16(xmxB, *ttsc)); ttsc++; \
          sc = _mm_adds_epi16(sc, xmm[j]); \
          /* Update E */ \
          xmxE = _mm_max_epi16(xmxE, sc); \
          \
          /* Pre-emptive load of M, D, I */ \
          dpv = dmx[k]; \
          ipv = imx[k]; \
          mpv = mmx[k]; \
          \
          /* Calculate current I(k) */ \
          temp = _mm_adds_epi16(mpv, *ttsc); ttsc++; \
          imx[k] = _mm_max_epi16(temp, _mm_adds_epi16(ipv, *ttsc)); ttsc++; \
          \
          /* Delayed stores of M and D */ \
          mmx[k] = sc; \
          dmx[k] = dcv; \
          \
          /* Calculate next D, D(k+1) */ \
          sc = _mm_adds_epi16(sc, *ttsc); ttsc++; \
          dcv = _mm_max_epi16(sc, _mm_adds_epi16(dcv, *ttsc)); ttsc++; \
          \
          /* Pre-emptive partial calculation of M(k+1) */ \
          sc = _mm_adds_epi16(mpv, *ttsc); ttsc++; \
          sc = _mm_max_epi16(sc, _mm_adds_epi16(ipv, *ttsc)); ttsc++; \
          sc = _mm_max_epi16(sc, _mm_adds_epi16(dpv, *ttsc)); ttsc++; \
          k++; \
        }
        TRIPLETCOMPUTE(k,16+0) TRIPLETCOMPUTE(k,16+1)
        TRIPLETCOMPUTE(k,16+2) TRIPLETCOMPUTE(k,16+3)
        TRIPLETCOMPUTE(k,16+4) TRIPLETCOMPUTE(k,16+5)
        TRIPLETCOMPUTE(k,16+6) TRIPLETCOMPUTE(k,16+7)

        mscore[0]++; mscore[1]++; mscore[2]++; mscore[3]++;
        mscore[4]++; mscore[5]++; mscore[6]++; mscore[7]++;
      }

      if (t+k < M)
      {
        v = i-1;
        xmxEv[v] = xmxE;
        dstream->pdcv[v] = dcv;
        dstream->psc[v] = sc;
        if (NTHREADS > 1) synchflags2[v] = 1;
      }
      else  // executed only by main thread (NTHRS-1)
      {
        __m128i overfs = _mm_cmpgt_epi16(xmxE, overflowlimit);
        overflows = _mm_or_si128(overflows, overfs);  // select the overflowed channels
        xmxC = _mm_max_epi16(xmxC, xmxE);
      }
    }
  }

  xmxC = _mm_adds_epi16(xmxC, moveC);

  if (opt_res != NULL)
  {
    float offset = (float) dstream->wordoffset;
    int16_t res[8] __attribute__ ((aligned (16)));
    int16_t ovs[8] __attribute__ ((aligned (16)));

    memmove(res, &xmxC, sizeof(xmxC));
    memmove(ovs, &overflows, sizeof(overflows));

    for (i = 0; i < 8; i++)
      if (ovs[i])
        opt_res[i] = eslINFINITY;  // signal overflow
      else
        opt_res[i] = ((float) res[i] - offset) / dstream->scale - 2.0;  // 2.0 nat approximation, UNILOCAL mode
  }

  tprintf("END viterbi Thr %d - t %d\n", thrid, t);
  return eslOK;
}
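/*
 * discretize() and WORDMAX come from elsewhere in this program; the assumed
 * intent is to map float log-odds scores into the biased int16 range that the
 * _mm_adds_epi16 lanes use, saturating so out-of-range values are caught by
 * the overflowlimit comparison above. A hypothetical sketch:
 */
#include <math.h>
#include <stdint.h>

#define WORDMAX_SKETCH 32767

static int16_t discretize_sketch(float scale, float score) {
  float v = roundf(scale * score);      /* scale and round to nearest */
  if (v > WORDMAX_SKETCH) v = WORDMAX_SKETCH;
  if (v < -WORDMAX_SKETCH) v = -WORDMAX_SKETCH;
  return (int16_t)v;
}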
static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                  uint8_t *dst, const int16_t *filter, int w) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  int i;

  for (i = 0; i < w; i += 16) {
    const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr);
    const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
    const __m128i C =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
    const __m128i D =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
    const __m128i E =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
    const __m128i F =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
    const __m128i G =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
    const __m128i H =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
    // merge the result together
    const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B);
    const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H);
    const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B);
    const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H);
    // multiply 2 adjacent elements with the filter and add the result
    const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0);
    const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6);
    const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0);
    const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6);
    // add and saturate the results together
    const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo);
    const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi);
    // merge the result together
    const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D);
    const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D);
    // multiply 2 adjacent elements with the filter and add the result
    const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2);
    const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2);
    // merge the result together
    const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F);
    const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F);
    // multiply 2 adjacent elements with the filter and add the result
    const __m128i x2_lo = _mm_maddubs_epi16(s5s4_lo, f5f4);
    const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4);
    // add and saturate the results together
    __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo));
    __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi));
    // add and saturate the results together
    temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo));
    temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi));
    // round and shift by 7 bits each 16-bit value
    temp_lo = _mm_mulhrs_epi16(temp_lo, k_256);
    temp_hi = _mm_mulhrs_epi16(temp_hi, k_256);
    // shrink to 8 bit each 16 bits; the first lane contains the first
    // convolve result and the second lane contains the second convolve
    // result
    temp_hi = _mm_packus_epi16(temp_lo, temp_hi);
    src_ptr += 16;
    // save 16 bytes convolve result
    _mm_store_si128((__m128i *)&dst[i], temp_hi);
  }
}
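/*
 * Editor's note on the rounding step above: with k_256 = 1 << 8,
 * _mm_mulhrs_epi16 computes (x * 256 + (1 << 14)) >> 15 per lane, which for
 * in-range sums equals (x + 64) >> 7 -- the same rounding the SSSE3 kernels
 * above get from adding the 0,64,... bias and shifting right by 7 (the
 * adds-based path additionally saturates at +32767 first). Minimal check:
 */
#include <assert.h>
#include <tmmintrin.h>  /* SSSE3 */

int main(void) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i x = _mm_set1_epi16(1000);
  const __m128i a = _mm_mulhrs_epi16(x, k_256);
  const __m128i b = _mm_srai_epi16(_mm_adds_epi16(x, _mm_set1_epi16(64)), 7);
  assert(_mm_extract_epi16(a, 0) == _mm_extract_epi16(b, 0));  /* both 8 */
  return 0;
}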