void png_read_filter_row_paeth3_sse(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { png_size_t i; png_bytep rp = row; png_const_bytep prp = prev_row; __m128i npix = _mm_cvtsi32_si128(*(uint32_t*)rp); __m128i ppix = _mm_setzero_si128(); // Same as 'a' in C version. __m128i prppix = _mm_setzero_si128(); // Same as 'c' in C version. const __m128i zero = _mm_setzero_si128(); for (i = 0; i < row_info->rowbytes; i += 3, rp += 3, prp += 3) { __m128i prpix = _mm_cvtsi32_si128(*(uint32_t*)prp); // Same as 'b' in C ver. __m128i pix, pa, pb, pc, temp; prpix = _mm_unpacklo_epi8(prpix, zero); temp = _mm_sub_epi16(prpix, prppix); // p = b - c pc = _mm_sub_epi16(ppix, prppix); // pc = a - c #ifndef __SSSE3__ pa = _mm_max_epi16(temp, _mm_sub_epi16(prppix, prpix)); pb = _mm_max_epi16(pc, _mm_sub_epi16(prppix, ppix)); temp = _mm_add_epi16(temp, pc); pc = _mm_max_epi16(temp, _mm_sub_epi16(zero, temp)); #else pa = _mm_abs_epi16(temp); // pa = abs(p) pb = _mm_abs_epi16(pc); // pb = abs(pc) temp = _mm_add_epi16(temp, pc); pc = _mm_abs_epi16(temp); // pc = abs(p + pc) #endif temp = _mm_cmplt_epi16(pb, pa); // if (pb < pa) pa = pb, a = b pa = _mm_andnot_si128(temp, pa); pa = _mm_or_si128(pa, _mm_and_si128(temp, pb)); ppix = _mm_andnot_si128(temp, ppix); ppix = _mm_or_si128(ppix, _mm_and_si128(temp, prpix)); pix = npix; npix = _mm_cvtsi32_si128(*(uint32_t*)(rp + 3)); temp = _mm_cmplt_epi16(pc, pa); // if (pc < pa) a = c ppix = _mm_andnot_si128(temp, ppix); ppix = _mm_or_si128(ppix, _mm_and_si128(temp, prppix)); pix = _mm_unpacklo_epi8(pix, zero); prppix = prpix; ppix = _mm_add_epi16(ppix, pix); ppix = _mm_slli_epi16(ppix, 8); ppix = _mm_srli_epi16(ppix, 8); pix = _mm_packus_epi16(ppix, zero); *(uint32_t*)rp = _mm_cvtsi128_si32(pix); } }
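For reference, the scalar Paeth predictor that the vector code above evaluates per colour channel (the standard PNG definition; pa, pb and pc reduce to |b-c|, |a-c| and |(b-c)+(a-c)|, which is exactly what the SIMD builds from p = b - c and a - c). Each filtered byte is then reconstructed as rp[x] = (rp[x] + predictor) & 0xff, the mod-256 add performed by the slli/srli/packus sequence at the end of the loop.

#include <stdlib.h>

static unsigned char paeth_predictor(int a /* left */, int b /* above */,
                                     int c /* upper-left */) {
  const int p  = a + b - c;
  const int pa = abs(p - a);   /* == |b - c| */
  const int pb = abs(p - b);   /* == |a - c| */
  const int pc = abs(p - c);   /* == |(b - c) + (a - c)| */
  if (pa <= pb && pa <= pc) return (unsigned char)a;
  if (pb <= pc) return (unsigned char)b;
  return (unsigned char)c;
}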
__m128i test_mm_abs_epi16(__m128i a) { // CHECK-LABEL: test_mm_abs_epi16 // CHECK: [[SUB:%.+]] = sub <8 x i16> zeroinitializer, [[A:%.+]] // CHECK: [[CMP:%.+]] = icmp sgt <8 x i16> [[A]], zeroinitializer // CHECK: %{{.*}} = select <8 x i1> [[CMP]], <8 x i16> [[A]], <8 x i16> [[SUB]] return _mm_abs_epi16(a); }
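A standalone sketch of the lane-wise behaviour that test exercises (assumes an SSSE3-capable toolchain, e.g. compile with -mssse3). PABSW has no saturating form, so INT16_MIN maps back to INT16_MIN:

#include <stdint.h>
#include <stdio.h>
#include <tmmintrin.h>  /* SSSE3 */

int main(void) {
  int16_t in[8]  = { -5, 5, 0, INT16_MIN, INT16_MAX, -1, 100, -100 };
  int16_t out[8];
  __m128i v = _mm_loadu_si128((const __m128i *)in);
  _mm_storeu_si128((__m128i *)out, _mm_abs_epi16(v));
  for (int i = 0; i < 8; ++i)
    printf("%6d -> %6d\n", in[i], out[i]);  /* INT16_MIN -> INT16_MIN */
  return 0;
}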
static INLINE void cfl_predict_lbd_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride, int alpha_q3, int width, int height) { const __m128i alpha_sign = _mm_set1_epi16(alpha_q3); const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9); const __m128i dc_q0 = _mm_set1_epi16(*dst); __m128i *row = (__m128i *)pred_buf_q3; const __m128i *row_end = row + height * CFL_BUF_LINE_I128; do { __m128i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0); if (width < 16) { res = _mm_packus_epi16(res, res); if (width == 4) _mm_storeh_epi32((__m128i *)dst, res); else _mm_storel_epi64((__m128i *)dst, res); } else { __m128i next = predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0); res = _mm_packus_epi16(res, next); _mm_storeu_si128((__m128i *)dst, res); if (width == 32) { res = predict_unclipped(row + 2, alpha_q12, alpha_sign, dc_q0); next = predict_unclipped(row + 3, alpha_q12, alpha_sign, dc_q0); res = _mm_packus_epi16(res, next); _mm_storeu_si128((__m128i *)(dst + 16), res); } } dst += dst_stride; } while ((row += CFL_BUF_LINE_I128) < row_end); }
static INLINE __m128i predict_unclipped(const __m128i *input, __m128i alpha_q12, __m128i alpha_sign, __m128i dc_q0) { __m128i ac_q3 = _mm_loadu_si128(input); __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3); __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12); scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign); return _mm_add_epi16(scaled_luma_q0, dc_q0); }
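A scalar sketch of what predict_unclipped computes per lane, based on reading the intrinsics above rather than on any reference code in the library: _mm_mulhrs_epi16 of |ac_q3| and (|alpha_q3| << 9) evaluates round(|alpha_q3 * ac_q3| / 64), and the two _mm_sign_epi16 calls restore the sign of the product before the DC value is added.

#include <stdint.h>
#include <stdlib.h>

static int16_t predict_unclipped_scalar(int16_t ac_q3, int alpha_q3, int16_t dc_q0) {
  const int prod_q6 = alpha_q3 * ac_q3;       /* Q3 * Q3 -> Q6 fixed point */
  int scaled_q0 = (abs(prod_q6) + 32) >> 6;   /* round(|prod| / 64) */
  if (prod_q6 < 0) scaled_q0 = -scaled_q0;    /* restore the sign */
  return (int16_t)(scaled_q0 + dc_q0);        /* add the DC prediction */
}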
static INLINE unsigned int highbd_masked_sad4xh_ssse3( const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, int height) { const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8); const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8); const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8); int y; __m128i res = _mm_setzero_si128(); const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); const __m128i round_const = _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); const __m128i one = _mm_set1_epi16(1); for (y = 0; y < height; y += 2) { const __m128i src = _mm_unpacklo_epi64( _mm_loadl_epi64((const __m128i *)src_ptr), _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride])); const __m128i a = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)a_ptr), _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride])); const __m128i b = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)b_ptr), _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride])); // Zero-extend mask to 16 bits const __m128i m = _mm_unpacklo_epi8( _mm_unpacklo_epi32( _mm_cvtsi32_si128(*(const uint32_t *)m_ptr), _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])), _mm_setzero_si128()); const __m128i m_inv = _mm_sub_epi16(mask_max, m); const __m128i data_l = _mm_unpacklo_epi16(a, b); const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv); __m128i pred_l = _mm_madd_epi16(data_l, mask_l); pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const), AOM_BLEND_A64_ROUND_BITS); const __m128i data_r = _mm_unpackhi_epi16(a, b); const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv); __m128i pred_r = _mm_madd_epi16(data_r, mask_r); pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const), AOM_BLEND_A64_ROUND_BITS); const __m128i pred = _mm_packs_epi32(pred_l, pred_r); const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src)); res = _mm_add_epi32(res, _mm_madd_epi16(diff, one)); src_ptr += src_stride * 2; a_ptr += a_stride * 2; b_ptr += b_stride * 2; m_ptr += m_stride * 2; } res = _mm_hadd_epi32(res, res); res = _mm_hadd_epi32(res, res); int sad = _mm_cvtsi128_si32(res); return (sad + 31) >> 6; }
static INLINE unsigned int highbd_masked_sad_ssse3( const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, int width, int height) { const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8); const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8); const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8); int x, y; __m128i res = _mm_setzero_si128(); const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); const __m128i round_const = _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); const __m128i one = _mm_set1_epi16(1); for (y = 0; y < height; y++) { for (x = 0; x < width; x += 8) { const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]); const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]); const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]); // Zero-extend mask to 16 bits const __m128i m = _mm_unpacklo_epi8( _mm_loadl_epi64((const __m128i *)&m_ptr[x]), _mm_setzero_si128()); const __m128i m_inv = _mm_sub_epi16(mask_max, m); const __m128i data_l = _mm_unpacklo_epi16(a, b); const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv); __m128i pred_l = _mm_madd_epi16(data_l, mask_l); pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const), AOM_BLEND_A64_ROUND_BITS); const __m128i data_r = _mm_unpackhi_epi16(a, b); const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv); __m128i pred_r = _mm_madd_epi16(data_r, mask_r); pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const), AOM_BLEND_A64_ROUND_BITS); // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15, // so it is safe to do signed saturation here. const __m128i pred = _mm_packs_epi32(pred_l, pred_r); // There is no 16-bit SAD instruction, so we have to synthesize // an 8-element SAD. We do this by storing 4 32-bit partial SADs, // and accumulating them at the end const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src)); res = _mm_add_epi32(res, _mm_madd_epi16(diff, one)); } src_ptr += src_stride; a_ptr += a_stride; b_ptr += b_stride; m_ptr += m_stride; } // At this point, we have four 32-bit partial SADs stored in 'res'. res = _mm_hadd_epi32(res, res); res = _mm_hadd_epi32(res, res); int sad = _mm_cvtsi128_si32(res); return (sad + 31) >> 6; }
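The "synthesize an 8-element SAD" comment above boils down to the idiom below: since there is no 16-bit psadbw, each vector of absolute differences is folded pairwise into four 32-bit partial sums with _mm_madd_epi16 against a vector of ones, then collapsed with two _mm_hadd_epi32 passes. A minimal standalone demo (assumes SSSE3, and that the per-lane differences stay well below 2^15):

#include <stdint.h>
#include <stdio.h>
#include <tmmintrin.h>  /* SSSE3 */

static uint32_t sum_abs_diff_epi16(__m128i a, __m128i b) {
  const __m128i one = _mm_set1_epi16(1);
  __m128i diff = _mm_abs_epi16(_mm_sub_epi16(a, b)); /* |a - b| per 16-bit lane */
  __m128i res = _mm_madd_epi16(diff, one);           /* 4 x 32-bit partial sums */
  res = _mm_hadd_epi32(res, res);                    /* pairwise sums */
  res = _mm_hadd_epi32(res, res);                    /* full horizontal sum */
  return (uint32_t)_mm_cvtsi128_si32(res);
}

int main(void) {
  int16_t a[8] = { 10, -3, 7, 1000, -1000, 0, 5, -5 };
  int16_t b[8] = { 4, 3, -7, 990, -995, 1, 5, 5 };
  __m128i va = _mm_loadu_si128((const __m128i *)a);
  __m128i vb = _mm_loadu_si128((const __m128i *)b);
  printf("SAD = %u\n", sum_abs_diff_epi16(va, vb)); /* 6+6+14+10+5+1+0+10 = 52 */
  return 0;
}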
void ulsch_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms, int **rxdataF_comp, short *ulsch_llr, int **ul_ch_mag, int **ul_ch_magb, unsigned char symbol, unsigned short nb_rb) { __m128i *rxF=(__m128i*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; __m128i *ch_mag,*ch_magb; int j=0,i; // unsigned char symbol_mod; if (symbol == 0) llrU = ulsch_llr; // symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol; ch_mag =(__m128i*)&ul_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)]; ch_magb =(__m128i*)&ul_ch_magb[0][(symbol*frame_parms->N_RB_DL*12)]; for (i=0;i<(nb_rb*3);i++) { mmtmpU1 = _mm_abs_epi16(rxF[i]); mmtmpU1 = _mm_subs_epi16(mmtmpU1,ch_mag[i]); mmtmpU2 = _mm_abs_epi16(mmtmpU1); mmtmpU2 = _mm_subs_epi16(mmtmpU2,ch_magb[i]); for (j=0;j<8;j++) { llrU[0] = ((short *)&rxF[i])[j]; llrU[1] = ((short *)&mmtmpU1)[j]; llrU[2] = ((short *)&mmtmpU2)[j]; llrU+=3; } } _mm_empty(); _m_empty(); }
static unsigned satd_8bit_4x4_avx2(const kvz_pixel *org, const kvz_pixel *cur) { __m128i original = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)org)); __m128i current = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)cur)); __m128i diff_lo = _mm_sub_epi16(current, original); original = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(org + 8))); current = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(cur + 8))); __m128i diff_hi = _mm_sub_epi16(current, original); //Hor __m128i row0 = _mm_hadd_epi16(diff_lo, diff_hi); __m128i row1 = _mm_hsub_epi16(diff_lo, diff_hi); __m128i row2 = _mm_hadd_epi16(row0, row1); __m128i row3 = _mm_hsub_epi16(row0, row1); //Ver row0 = _mm_hadd_epi16(row2, row3); row1 = _mm_hsub_epi16(row2, row3); row2 = _mm_hadd_epi16(row0, row1); row3 = _mm_hsub_epi16(row0, row1); //Abs and sum row2 = _mm_abs_epi16(row2); row3 = _mm_abs_epi16(row3); row3 = _mm_add_epi16(row2, row3); row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, KVZ_PERMUTE(2, 3, 0, 1) )); row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, KVZ_PERMUTE(1, 0, 1, 0) )); row3 = _mm_add_epi16(row3, _mm_shufflelo_epi16(row3, KVZ_PERMUTE(1, 0, 1, 0) )); unsigned sum = _mm_extract_epi16(row3, 0); unsigned satd = (sum + 1) >> 1; return satd; }
// Returns |x| for 16-bit lanes. static __m128i abs_i16(__m128i x) { #if defined(__SSSE3__) return _mm_abs_epi16(x); #else // Read this all as, return x<0 ? -x : x. // To negate two's complement, you flip all the bits then add 1. __m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128()); x = _mm_xor_si128(x, is_negative); // Flip negative lanes. x = _mm_add_epi16(x, _mm_srli_epi16(is_negative, 15)); // +1 to negative lanes, else +0. return x; #endif }
static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, int start_block, int end_block, VP8Histogram* const histo) { const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH); int j; int distribution[MAX_COEFF_THRESH + 1] = { 0 }; for (j = start_block; j < end_block; ++j) { int16_t out[16]; int k; VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out); // Convert coefficients to bin (within out[]). { // Load. const __m128i out0 = _mm_loadu_si128((__m128i*)&out[0]); const __m128i out1 = _mm_loadu_si128((__m128i*)&out[8]); // v = abs(out) >> 3 const __m128i abs0 = _mm_abs_epi16(out0); const __m128i abs1 = _mm_abs_epi16(out1); const __m128i v0 = _mm_srai_epi16(abs0, 3); const __m128i v1 = _mm_srai_epi16(abs1, 3); // bin = min(v, MAX_COEFF_THRESH) const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh); const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh); // Store. _mm_storeu_si128((__m128i*)&out[0], bin0); _mm_storeu_si128((__m128i*)&out[8], bin1); } // Convert coefficients to bin. for (k = 0; k < 16; ++k) { ++distribution[out[k]]; } } VP8SetHistogramData(distribution, histo); }
/* Returns |x| for 16-bit lanes. */ static __m128i abs_i16(__m128i x) { #if PNG_INTEL_SSE_IMPLEMENTATION >= 2 return _mm_abs_epi16(x); #else /* Read this all as, return x<0 ? -x : x. * To negate two's complement, you flip all the bits then add 1. */ __m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128()); /* Flip negative lanes. */ x = _mm_xor_si128(x, is_negative); /* +1 to negative lanes, else +0. */ x = _mm_sub_epi16(x, is_negative); return x; #endif }
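Both fallbacks above (the xor/add variant and this xor/sub variant) rest on the same two's-complement identity, |x| == (x ^ m) - m with m = (x < 0 ? -1 : 0): the xor flips the bits of negative lanes, and subtracting -1 supplies the final +1. A scalar check of the identity (it wraps INT16_MIN back to INT16_MIN, the same result PABSW gives):

#include <stdint.h>
#include <stdio.h>

static int16_t abs_i16_scalar(int16_t x) {
  const int16_t m = (int16_t)(x < 0 ? -1 : 0);  /* all ones when negative */
  return (int16_t)((x ^ m) - m);
}

int main(void) {
  const int16_t tests[] = { 0, 1, -1, 123, -123, INT16_MAX, INT16_MIN };
  for (unsigned i = 0; i < sizeof(tests) / sizeof(tests[0]); ++i)
    printf("%6d -> %6d\n", tests[i], abs_i16_scalar(tests[i]));
  return 0;
}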
void demod_16qam_lte_s_sse(const cf_t *symbols, short *llr, int nsymbols) { float *symbolsPtr = (float*) symbols; __m128i *resultPtr = (__m128i*) llr; __m128 symbol1, symbol2; __m128i symbol_i1, symbol_i2, symbol_i, symbol_abs; __m128i offset = _mm_set1_epi16(2*SCALE_SHORT_CONV_QAM16/sqrt(10)); __m128i result11, result12, result22, result21; __m128 scale_v = _mm_set1_ps(-SCALE_SHORT_CONV_QAM16); __m128i shuffle_negated_1 = _mm_set_epi8(0xff,0xff,0xff,0xff,7,6,5,4,0xff,0xff,0xff,0xff,3,2,1,0); __m128i shuffle_abs_1 = _mm_set_epi8(7,6,5,4,0xff,0xff,0xff,0xff,3,2,1,0,0xff,0xff,0xff,0xff); __m128i shuffle_negated_2 = _mm_set_epi8(0xff,0xff,0xff,0xff,15,14,13,12,0xff,0xff,0xff,0xff,11,10,9,8); __m128i shuffle_abs_2 = _mm_set_epi8(15,14,13,12,0xff,0xff,0xff,0xff,11,10,9,8,0xff,0xff,0xff,0xff); for (int i=0;i<nsymbols/4;i++) { symbol1 = _mm_load_ps(symbolsPtr); symbolsPtr+=4; symbol2 = _mm_load_ps(symbolsPtr); symbolsPtr+=4; symbol_i1 = _mm_cvtps_epi32(_mm_mul_ps(symbol1, scale_v)); symbol_i2 = _mm_cvtps_epi32(_mm_mul_ps(symbol2, scale_v)); symbol_i = _mm_packs_epi32(symbol_i1, symbol_i2); symbol_abs = _mm_abs_epi16(symbol_i); symbol_abs = _mm_sub_epi16(symbol_abs, offset); result11 = _mm_shuffle_epi8(symbol_i, shuffle_negated_1); result12 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_1); result21 = _mm_shuffle_epi8(symbol_i, shuffle_negated_2); result22 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_2); _mm_store_si128(resultPtr, _mm_or_si128(result11, result12)); resultPtr++; _mm_store_si128(resultPtr, _mm_or_si128(result21, result22)); resultPtr++; } // Demodulate last symbols for (int i=4*(nsymbols/4);i<nsymbols;i++) { short yre = (short) (SCALE_SHORT_CONV_QAM16*crealf(symbols[i])); short yim = (short) (SCALE_SHORT_CONV_QAM16*cimagf(symbols[i])); llr[4*i+0] = -yre; llr[4*i+1] = -yim; llr[4*i+2] = abs(yre)-2*SCALE_SHORT_CONV_QAM16/sqrt(10); llr[4*i+3] = abs(yim)-2*SCALE_SHORT_CONV_QAM16/sqrt(10); } }
void ulsch_16qam_llr(LTE_DL_FRAME_PARMS *frame_parms, int **rxdataF_comp, short *ulsch_llr, int **ul_ch_mag, unsigned char symbol, unsigned short nb_rb) { __m128i *rxF=(__m128i*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)]; __m128i *ch_mag; int i; // unsigned char symbol_mod; // printf("ulsch_rx.c: ulsch_16qam_llr: symbol %d\n",symbol); if (symbol == 0) llr128U = (__m128i*)&ulsch_llr[0]; // symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol; ch_mag =(__m128i*)&ul_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)]; for (i=0;i<(nb_rb*3);i++) { mmtmpU0 = _mm_abs_epi16(rxF[i]); // print_shorts("tmp0",&tmp0); mmtmpU0 = _mm_subs_epi16(mmtmpU0,ch_mag[i]); llr128U[0] = _mm_unpacklo_epi16(rxF[i],mmtmpU0); llr128U[1] = _mm_unpackhi_epi16(rxF[i],mmtmpU0); llr128U+=2; // print_bytes("rxF[i]",&rxF[i]); // print_bytes("rxF[i+1]",&rxF[i+1]); } _mm_empty(); _m_empty(); }
static INLINE void cfl_predict_hbd_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, int bd, int width, int height) { const __m128i alpha_sign = _mm_set1_epi16(alpha_q3); const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9); const __m128i dc_q0 = _mm_set1_epi16(*dst); const __m128i max = highbd_max_epi16(bd); const __m128i zeros = _mm_setzero_si128(); __m128i *row = (__m128i *)pred_buf_q3; const __m128i *row_end = row + height * CFL_BUF_LINE_I128; do { __m128i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0); res = highbd_clamp_epi16(res, zeros, max); if (width == 4) { _mm_storel_epi64((__m128i *)dst, res); } else { _mm_storeu_si128((__m128i *)dst, res); } if (width >= 16) { const __m128i res_1 = predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0); _mm_storeu_si128(((__m128i *)dst) + 1, highbd_clamp_epi16(res_1, zeros, max)); } if (width == 32) { const __m128i res_2 = predict_unclipped(row + 2, alpha_q12, alpha_sign, dc_q0); _mm_storeu_si128((__m128i *)(dst + 16), highbd_clamp_epi16(res_2, zeros, max)); const __m128i res_3 = predict_unclipped(row + 3, alpha_q12, alpha_sign, dc_q0); _mm_storeu_si128((__m128i *)(dst + 24), highbd_clamp_epi16(res_3, zeros, max)); } dst += dst_stride; } while ((row += CFL_BUF_LINE_I128) < row_end); }
INLINE static void haddwd_accumulate_avx2(__m128i *accumulate, __m128i *ver_row) { __m128i abs_value = _mm_abs_epi16(*ver_row); *accumulate = _mm_add_epi32(*accumulate, _mm_madd_epi16(abs_value, _mm_set1_epi16(1))); }
void demod_64qam_lte_s_sse(const cf_t *symbols, short *llr, int nsymbols) { float *symbolsPtr = (float*) symbols; __m128i *resultPtr = (__m128i*) llr; __m128 symbol1, symbol2; __m128i symbol_i1, symbol_i2, symbol_i, symbol_abs, symbol_abs2; __m128i offset1 = _mm_set1_epi16(4*SCALE_SHORT_CONV_QAM64/sqrt(42)); __m128i offset2 = _mm_set1_epi16(2*SCALE_SHORT_CONV_QAM64/sqrt(42)); __m128 scale_v = _mm_set1_ps(-SCALE_SHORT_CONV_QAM64); __m128i result11, result12, result13, result22, result21,result23, result31, result32, result33; __m128i shuffle_negated_1 = _mm_set_epi8(7,6,5,4,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,3,2,1,0); __m128i shuffle_negated_2 = _mm_set_epi8(0xff,0xff,0xff,0xff,11,10,9,8,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff); __m128i shuffle_negated_3 = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,15,14,13,12,0xff,0xff,0xff,0xff); __m128i shuffle_abs_1 = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,3,2,1,0,0xff,0xff,0xff,0xff); __m128i shuffle_abs_2 = _mm_set_epi8(11,10,9,8,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,7,6,5,4); __m128i shuffle_abs_3 = _mm_set_epi8(0xff,0xff,0xff,0xff,15,14,13,12,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff); __m128i shuffle_abs2_1 = _mm_set_epi8(0xff,0xff,0xff,0xff,3,2,1,0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff); __m128i shuffle_abs2_2 = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,7,6,5,4,0xff,0xff,0xff,0xff); __m128i shuffle_abs2_3 = _mm_set_epi8(15,14,13,12,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,11,10,9,8); for (int i=0;i<nsymbols/4;i++) { symbol1 = _mm_load_ps(symbolsPtr); symbolsPtr+=4; symbol2 = _mm_load_ps(symbolsPtr); symbolsPtr+=4; symbol_i1 = _mm_cvtps_epi32(_mm_mul_ps(symbol1, scale_v)); symbol_i2 = _mm_cvtps_epi32(_mm_mul_ps(symbol2, scale_v)); symbol_i = _mm_packs_epi32(symbol_i1, symbol_i2); symbol_abs = _mm_abs_epi16(symbol_i); symbol_abs = _mm_sub_epi16(symbol_abs, offset1); symbol_abs2 = _mm_sub_epi16(_mm_abs_epi16(symbol_abs), offset2); result11 = _mm_shuffle_epi8(symbol_i, shuffle_negated_1); result12 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_1); result13 = _mm_shuffle_epi8(symbol_abs2, shuffle_abs2_1); result21 = _mm_shuffle_epi8(symbol_i, shuffle_negated_2); result22 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_2); result23 = _mm_shuffle_epi8(symbol_abs2, shuffle_abs2_2); result31 = _mm_shuffle_epi8(symbol_i, shuffle_negated_3); result32 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_3); result33 = _mm_shuffle_epi8(symbol_abs2, shuffle_abs2_3); _mm_store_si128(resultPtr, _mm_or_si128(_mm_or_si128(result11, result12),result13)); resultPtr++; _mm_store_si128(resultPtr, _mm_or_si128(_mm_or_si128(result21, result22),result23)); resultPtr++; _mm_store_si128(resultPtr, _mm_or_si128(_mm_or_si128(result31, result32),result33)); resultPtr++; } for (int i=4*(nsymbols/4);i<nsymbols;i++) { float yre = (short) (SCALE_SHORT_CONV_QAM64*crealf(symbols[i])); float yim = (short) (SCALE_SHORT_CONV_QAM64*cimagf(symbols[i])); llr[6*i+0] = -yre; llr[6*i+1] = -yim; llr[6*i+2] = abs(yre)-4*SCALE_SHORT_CONV_QAM64/sqrt(42); llr[6*i+3] = abs(yim)-4*SCALE_SHORT_CONV_QAM64/sqrt(42); llr[6*i+4] = abs(llr[6*i+2])-2*SCALE_SHORT_CONV_QAM64/sqrt(42); llr[6*i+5] = abs(llr[6*i+3])-2*SCALE_SHORT_CONV_QAM64/sqrt(42); } }
static void ssse3_fetch_horizontal (bits_image_t *image, line_t *line, int y, pixman_fixed_t x, pixman_fixed_t ux, int n) { uint32_t *bits = image->bits + y * image->rowstride; __m128i vx = _mm_set_epi16 ( - (x + 1), x, - (x + 1), x, - (x + ux + 1), x + ux, - (x + ux + 1), x + ux); __m128i vux = _mm_set_epi16 ( - 2 * ux, 2 * ux, - 2 * ux, 2 * ux, - 2 * ux, 2 * ux, - 2 * ux, 2 * ux); __m128i vaddc = _mm_set_epi16 (1, 0, 1, 0, 1, 0, 1, 0); __m128i *b = (__m128i *)line->buffer; __m128i vrl0, vrl1; while ((n -= 2) >= 0) { __m128i vw, vr, s; vrl1 = _mm_loadl_epi64 ( (__m128i *)(bits + pixman_fixed_to_int (x + ux))); /* vrl1: R1, L1 */ final_pixel: vrl0 = _mm_loadl_epi64 ( (__m128i *)(bits + pixman_fixed_to_int (x))); /* vrl0: R0, L0 */ /* The weights are based on vx which is a vector of * * - (x + 1), x, - (x + 1), x, * - (x + ux + 1), x + ux, - (x + ux + 1), x + ux * * so the 16 bit weights end up like this: * * iw0, w0, iw0, w0, iw1, w1, iw1, w1 * * and after shifting and packing, we get these bytes: * * iw0, w0, iw0, w0, iw1, w1, iw1, w1, * iw0, w0, iw0, w0, iw1, w1, iw1, w1, * * which means the first and the second input pixel * have to be interleaved like this: * * la0, ra0, lr0, rr0, la1, ra1, lr1, rr1, * lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1 * * before maddubsw can be used. */ vw = _mm_add_epi16 ( vaddc, _mm_srli_epi16 (vx, 16 - BILINEAR_INTERPOLATION_BITS)); /* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1 */ vw = _mm_packus_epi16 (vw, vw); /* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1, * iw0, w0, iw0, w0, iw1, w1, iw1, w1 */ vx = _mm_add_epi16 (vx, vux); x += 2 * ux; vr = _mm_unpacklo_epi16 (vrl1, vrl0); /* vr: rar0, rar1, rgb0, rgb1, lar0, lar1, lgb0, lgb1 */ s = _mm_shuffle_epi32 (vr, _MM_SHUFFLE (1, 0, 3, 2)); /* s: lar0, lar1, lgb0, lgb1, rar0, rar1, rgb0, rgb1 */ vr = _mm_unpackhi_epi8 (vr, s); /* vr: la0, ra0, lr0, rr0, la1, ra1, lr1, rr1, * lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1 */ vr = _mm_maddubs_epi16 (vr, vw); /* When the weight is 0, the inverse weight is * 128 which can't be represented in a signed byte. * As a result maddubsw computes the following: * * r = l * -128 + r * 0 * * rather than the desired * * r = l * 128 + r * 0 * * We fix this by taking the absolute value of the * result. */ vr = _mm_abs_epi16 (vr); /* vr: A0, R0, A1, R1, G0, B0, G1, B1 */ _mm_store_si128 (b++, vr); } if (n == -1) { vrl1 = _mm_setzero_si128(); goto final_pixel; } line->y = y; }
static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16], const uint16_t* const sharpen, const VP8Matrix* const mtx) { const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL); const __m128i zero = _mm_setzero_si128(); __m128i out0, out8; __m128i packed_out; // Load all inputs. // TODO(cduvivier): Make variable declarations and allocations aligned so that // we can use _mm_load_si128 instead of _mm_loadu_si128. __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]); __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]); const __m128i iq0 = _mm_loadu_si128((const __m128i*)&mtx->iq_[0]); const __m128i iq8 = _mm_loadu_si128((const __m128i*)&mtx->iq_[8]); const __m128i q0 = _mm_loadu_si128((const __m128i*)&mtx->q_[0]); const __m128i q8 = _mm_loadu_si128((const __m128i*)&mtx->q_[8]); // coeff = abs(in) __m128i coeff0 = _mm_abs_epi16(in0); __m128i coeff8 = _mm_abs_epi16(in8); // coeff = abs(in) + sharpen if (sharpen != NULL) { const __m128i sharpen0 = _mm_loadu_si128((const __m128i*)&sharpen[0]); const __m128i sharpen8 = _mm_loadu_si128((const __m128i*)&sharpen[8]); coeff0 = _mm_add_epi16(coeff0, sharpen0); coeff8 = _mm_add_epi16(coeff8, sharpen8); } // out = (coeff * iQ + B) >> QFIX { // doing calculations with 32b precision (QFIX=17) // out = (coeff * iQ) const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0); const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0); const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8); const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8); __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H); __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H); __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H); __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H); // out = (coeff * iQ + B) const __m128i bias_00 = _mm_loadu_si128((const __m128i*)&mtx->bias_[0]); const __m128i bias_04 = _mm_loadu_si128((const __m128i*)&mtx->bias_[4]); const __m128i bias_08 = _mm_loadu_si128((const __m128i*)&mtx->bias_[8]); const __m128i bias_12 = _mm_loadu_si128((const __m128i*)&mtx->bias_[12]); out_00 = _mm_add_epi32(out_00, bias_00); out_04 = _mm_add_epi32(out_04, bias_04); out_08 = _mm_add_epi32(out_08, bias_08); out_12 = _mm_add_epi32(out_12, bias_12); // out = QUANTDIV(coeff, iQ, B, QFIX) out_00 = _mm_srai_epi32(out_00, QFIX); out_04 = _mm_srai_epi32(out_04, QFIX); out_08 = _mm_srai_epi32(out_08, QFIX); out_12 = _mm_srai_epi32(out_12, QFIX); // pack result as 16b out0 = _mm_packs_epi32(out_00, out_04); out8 = _mm_packs_epi32(out_08, out_12); // if (coeff > 2047) coeff = 2047 out0 = _mm_min_epi16(out0, max_coeff_2047); out8 = _mm_min_epi16(out8, max_coeff_2047); } // put sign back out0 = _mm_sign_epi16(out0, in0); out8 = _mm_sign_epi16(out8, in8); // in = out * Q in0 = _mm_mullo_epi16(out0, q0); in8 = _mm_mullo_epi16(out8, q8); _mm_storeu_si128((__m128i*)&in[0], in0); _mm_storeu_si128((__m128i*)&in[8], in8); // zigzag the output before storing it. The re-ordering is: // 0 1 2 3 4 5 6 7 | 8 9 10 11 12 13 14 15 // -> 0 1 4[8]5 2 3 6 | 9 12 13 10 [7]11 14 15 // There's only two misplaced entries ([8] and [7]) that are crossing the // reg's boundaries. // We use pshufb instead of pshuflo/pshufhi. 
{ const __m128i kCst_lo = PSHUFB_CST(0, 1, 4, -1, 5, 2, 3, 6); const __m128i kCst_7 = PSHUFB_CST(-1, -1, -1, -1, 7, -1, -1, -1); const __m128i tmp_lo = _mm_shuffle_epi8(out0, kCst_lo); const __m128i tmp_7 = _mm_shuffle_epi8(out0, kCst_7); // extract #7 const __m128i kCst_hi = PSHUFB_CST(1, 4, 5, 2, -1, 3, 6, 7); const __m128i kCst_8 = PSHUFB_CST(-1, -1, -1, 0, -1, -1, -1, -1); const __m128i tmp_hi = _mm_shuffle_epi8(out8, kCst_hi); const __m128i tmp_8 = _mm_shuffle_epi8(out8, kCst_8); // extract #8 const __m128i out_z0 = _mm_or_si128(tmp_lo, tmp_8); const __m128i out_z8 = _mm_or_si128(tmp_hi, tmp_7); _mm_storeu_si128((__m128i*)&out[0], out_z0); _mm_storeu_si128((__m128i*)&out[8], out_z8); packed_out = _mm_packs_epi16(out_z0, out_z8); } // detect if all 'out' values are zeroes or not return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff); }
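A scalar sketch of the per-coefficient arithmetic in DoQuantizeBlock above (a reading of the intrinsics, not the library's reference quantizer; QFIX = 17 and the 2047 clamp are taken from the comments in the code, and sharpen should be passed as 0 when the sharpen table is NULL):

#include <stdint.h>
#include <stdlib.h>

static int quantize_coeff_scalar(int in, uint16_t iq, uint32_t bias, uint16_t q,
                                 int sharpen, int16_t* dequant) {
  const int sign = (in < 0);
  const uint32_t coeff = (uint32_t)(abs(in) + sharpen);   /* coeff = |in| + sharpen */
  int level = (int)((coeff * iq + bias) >> 17);           /* (coeff * iQ + B) >> QFIX */
  if (level > 2047) level = 2047;                         /* clamp to MAX_LEVEL */
  if (sign) level = -level;                               /* put sign back */
  *dequant = (int16_t)(level * q);                        /* in = out * Q */
  return level;
}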
// Hadamard transform // Returns the difference between the weighted sum of the absolute value of // transformed coefficients. static int TTransform(const uint8_t* inA, const uint8_t* inB, const uint16_t* const w) { __m128i tmp_0, tmp_1, tmp_2, tmp_3; // Load, combine and transpose inputs. { const __m128i inA_0 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 0]); const __m128i inA_1 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 1]); const __m128i inA_2 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 2]); const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]); const __m128i inB_0 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 0]); const __m128i inB_1 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 1]); const __m128i inB_2 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 2]); const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]); // Combine inA and inB (we'll do two transforms in parallel). const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0); const __m128i inAB_1 = _mm_unpacklo_epi8(inA_1, inB_1); const __m128i inAB_2 = _mm_unpacklo_epi8(inA_2, inB_2); const __m128i inAB_3 = _mm_unpacklo_epi8(inA_3, inB_3); // a00 b00 a01 b01 a02 b03 a03 b03 0 0 0 0 0 0 0 0 // a10 b10 a11 b11 a12 b12 a13 b13 0 0 0 0 0 0 0 0 // a20 b20 a21 b21 a22 b22 a23 b23 0 0 0 0 0 0 0 0 // a30 b30 a31 b31 a32 b32 a33 b33 0 0 0 0 0 0 0 0 // Transpose the two 4x4, discarding the filling zeroes. const __m128i transpose0_0 = _mm_unpacklo_epi8(inAB_0, inAB_2); const __m128i transpose0_1 = _mm_unpacklo_epi8(inAB_1, inAB_3); // a00 a20 b00 b20 a01 a21 b01 b21 a02 a22 b02 b22 a03 a23 b03 b23 // a10 a30 b10 b30 a11 a31 b11 b31 a12 a32 b12 b32 a13 a33 b13 b33 const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1); const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1); // a00 a10 a20 a30 b00 b10 b20 b30 a01 a11 a21 a31 b01 b11 b21 b31 // a02 a12 a22 a32 b02 b12 b22 b32 a03 a13 a23 a33 b03 b13 b23 b33 // Convert to 16b. tmp_0 = _mm_cvtepu8_epi16(transpose1_0); tmp_1 = _mm_cvtepu8_epi16(_mm_srli_si128(transpose1_0, 8)); tmp_2 = _mm_cvtepu8_epi16(transpose1_1); tmp_3 = _mm_cvtepu8_epi16(_mm_srli_si128(transpose1_1, 8)); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 // a02 a12 a22 a32 b02 b12 b22 b32 // a03 a13 a23 a33 b03 b13 b23 b33 } // Horizontal pass and subsequent transpose. { // Calculate a and b (two 4x4 at once). const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2); const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3); const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3); const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2); const __m128i b0 = _mm_add_epi16(a0, a1); const __m128i b1 = _mm_add_epi16(a3, a2); const __m128i b2 = _mm_sub_epi16(a3, a2); const __m128i b3 = _mm_sub_epi16(a0, a1); // a00 a01 a02 a03 b00 b01 b02 b03 // a10 a11 a12 a13 b10 b11 b12 b13 // a20 a21 a22 a23 b20 b21 b22 b23 // a30 a31 a32 a33 b30 b31 b32 b33 // Transpose the two 4x4. 
const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1); const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3); const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1); const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3); // a00 a10 a01 a11 a02 a12 a03 a13 // a20 a30 a21 a31 a22 a32 a23 a33 // b00 b10 b01 b11 b02 b12 b03 b13 // b20 b30 b21 b31 b22 b32 b23 b33 const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); // a00 a10 a20 a30 a01 a11 a21 a31 // b00 b10 b20 b30 b01 b11 b21 b31 // a02 a12 a22 a32 a03 a13 a23 a33 // b02 b12 a22 b32 b03 b13 b23 b33 tmp_0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); tmp_1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); tmp_2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); tmp_3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 // a02 a12 a22 a32 b02 b12 b22 b32 // a03 a13 a23 a33 b03 b13 b23 b33 } // Vertical pass and difference of weighted sums. { // Load all inputs. const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]); const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]); // Calculate a and b (two 4x4 at once). const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2); const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3); const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3); const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2); const __m128i b0 = _mm_add_epi16(a0, a1); const __m128i b1 = _mm_add_epi16(a3, a2); const __m128i b2 = _mm_sub_epi16(a3, a2); const __m128i b3 = _mm_sub_epi16(a0, a1); // Separate the transforms of inA and inB. __m128i A_b0 = _mm_unpacklo_epi64(b0, b1); __m128i A_b2 = _mm_unpacklo_epi64(b2, b3); __m128i B_b0 = _mm_unpackhi_epi64(b0, b1); __m128i B_b2 = _mm_unpackhi_epi64(b2, b3); A_b0 = _mm_abs_epi16(A_b0); A_b2 = _mm_abs_epi16(A_b2); B_b0 = _mm_abs_epi16(B_b0); B_b2 = _mm_abs_epi16(B_b2); // weighted sums A_b0 = _mm_madd_epi16(A_b0, w_0); A_b2 = _mm_madd_epi16(A_b2, w_8); B_b0 = _mm_madd_epi16(B_b0, w_0); B_b2 = _mm_madd_epi16(B_b2, w_8); A_b0 = _mm_add_epi32(A_b0, A_b2); B_b0 = _mm_add_epi32(B_b0, B_b2); // difference of weighted sums A_b2 = _mm_sub_epi32(A_b0, B_b0); // cascading summation of the differences B_b0 = _mm_hadd_epi32(A_b2, A_b2); B_b2 = _mm_hadd_epi32(B_b0, B_b0); return _mm_cvtsi128_si32(B_b2); } }
// Hadamard transform // Returns the weighted sum of the absolute value of transformed coefficients. // w[] contains a row-major 4 by 4 symmetric matrix. static int TTransform_SSE41(const uint8_t* inA, const uint8_t* inB, const uint16_t* const w) { int32_t sum[4]; __m128i tmp_0, tmp_1, tmp_2, tmp_3; // Load and combine inputs. { const __m128i inA_0 = _mm_loadu_si128((const __m128i*)&inA[BPS * 0]); const __m128i inA_1 = _mm_loadu_si128((const __m128i*)&inA[BPS * 1]); const __m128i inA_2 = _mm_loadu_si128((const __m128i*)&inA[BPS * 2]); // In SSE4.1, with gcc 4.8 at least (maybe other versions), // _mm_loadu_si128 is faster than _mm_loadl_epi64. But for the last lump // of inA and inB, _mm_loadl_epi64 is still used not to have an out of // bound read. const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]); const __m128i inB_0 = _mm_loadu_si128((const __m128i*)&inB[BPS * 0]); const __m128i inB_1 = _mm_loadu_si128((const __m128i*)&inB[BPS * 1]); const __m128i inB_2 = _mm_loadu_si128((const __m128i*)&inB[BPS * 2]); const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]); // Combine inA and inB (we'll do two transforms in parallel). const __m128i inAB_0 = _mm_unpacklo_epi32(inA_0, inB_0); const __m128i inAB_1 = _mm_unpacklo_epi32(inA_1, inB_1); const __m128i inAB_2 = _mm_unpacklo_epi32(inA_2, inB_2); const __m128i inAB_3 = _mm_unpacklo_epi32(inA_3, inB_3); tmp_0 = _mm_cvtepu8_epi16(inAB_0); tmp_1 = _mm_cvtepu8_epi16(inAB_1); tmp_2 = _mm_cvtepu8_epi16(inAB_2); tmp_3 = _mm_cvtepu8_epi16(inAB_3); // a00 a01 a02 a03 b00 b01 b02 b03 // a10 a11 a12 a13 b10 b11 b12 b13 // a20 a21 a22 a23 b20 b21 b22 b23 // a30 a31 a32 a33 b30 b31 b32 b33 } // Vertical pass first to avoid a transpose (vertical and horizontal passes // are commutative because w/kWeightY is symmetric) and subsequent transpose. { // Calculate a and b (two 4x4 at once). const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2); const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3); const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3); const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2); const __m128i b0 = _mm_add_epi16(a0, a1); const __m128i b1 = _mm_add_epi16(a3, a2); const __m128i b2 = _mm_sub_epi16(a3, a2); const __m128i b3 = _mm_sub_epi16(a0, a1); // a00 a01 a02 a03 b00 b01 b02 b03 // a10 a11 a12 a13 b10 b11 b12 b13 // a20 a21 a22 a23 b20 b21 b22 b23 // a30 a31 a32 a33 b30 b31 b32 b33 // Transpose the two 4x4. VP8Transpose_2_4x4_16b(&b0, &b1, &b2, &b3, &tmp_0, &tmp_1, &tmp_2, &tmp_3); } // Horizontal pass and difference of weighted sums. { // Load all inputs. const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]); const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]); // Calculate a and b (two 4x4 at once). const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2); const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3); const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3); const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2); const __m128i b0 = _mm_add_epi16(a0, a1); const __m128i b1 = _mm_add_epi16(a3, a2); const __m128i b2 = _mm_sub_epi16(a3, a2); const __m128i b3 = _mm_sub_epi16(a0, a1); // Separate the transforms of inA and inB. 
__m128i A_b0 = _mm_unpacklo_epi64(b0, b1); __m128i A_b2 = _mm_unpacklo_epi64(b2, b3); __m128i B_b0 = _mm_unpackhi_epi64(b0, b1); __m128i B_b2 = _mm_unpackhi_epi64(b2, b3); A_b0 = _mm_abs_epi16(A_b0); A_b2 = _mm_abs_epi16(A_b2); B_b0 = _mm_abs_epi16(B_b0); B_b2 = _mm_abs_epi16(B_b2); // weighted sums A_b0 = _mm_madd_epi16(A_b0, w_0); A_b2 = _mm_madd_epi16(A_b2, w_8); B_b0 = _mm_madd_epi16(B_b0, w_0); B_b2 = _mm_madd_epi16(B_b2, w_8); A_b0 = _mm_add_epi32(A_b0, A_b2); B_b0 = _mm_add_epi32(B_b0, B_b2); // difference of weighted sums A_b2 = _mm_sub_epi32(A_b0, B_b0); _mm_storeu_si128((__m128i*)&sum[0], A_b2); } return sum[0] + sum[1] + sum[2] + sum[3]; }
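A scalar sketch of the single-block transform that both SIMD versions above evaluate for inA and inB in parallel (derived from the intrinsic sequence, with the row stride passed explicitly instead of assuming BPS): a 4x4 Hadamard transform followed by a weighted sum of absolute coefficients. The SIMD functions return tt(inA, w) - tt(inB, w).

#include <stdint.h>
#include <stdlib.h>

static int tt_scalar(const uint8_t* in, int stride, const uint16_t* w) {
  int tmp[16];
  int sum = 0;
  /* Horizontal pass: 4-point Hadamard butterfly on each row. */
  for (int i = 0; i < 4; ++i, in += stride) {
    const int a0 = in[0] + in[2], a1 = in[1] + in[3];
    const int a2 = in[1] - in[3], a3 = in[0] - in[2];
    tmp[4 * i + 0] = a0 + a1;
    tmp[4 * i + 1] = a3 + a2;
    tmp[4 * i + 2] = a3 - a2;
    tmp[4 * i + 3] = a0 - a1;
  }
  /* Vertical pass on each column, then weighted sum of |coefficients|. */
  for (int i = 0; i < 4; ++i) {
    const int a0 = tmp[i + 0] + tmp[i + 8], a1 = tmp[i + 4] + tmp[i + 12];
    const int a2 = tmp[i + 4] - tmp[i + 12], a3 = tmp[i + 0] - tmp[i + 8];
    sum += w[i + 0] * abs(a0 + a1);
    sum += w[i + 4] * abs(a3 + a2);
    sum += w[i + 8] * abs(a3 - a2);
    sum += w[i + 12] * abs(a0 - a1);
  }
  return sum;
}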
/***************************************************************************** * This function utilises 3 properties of the cost function lookup tables, * * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in * * vp9_encoder.c. * * For the joint cost: * * - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3] * * For the component costs: * * - For all i: mvsadcost[0][i] == mvsadcost[1][i] * * (Equal costs for both components) * * - For all i: mvsadcost[0][i] == mvsadcost[0][-i] * * (Cost function is even) * * If these do not hold, then this function cannot be used without * * modification, in which case you can revert to using the C implementation, * * which does not rely on these properties. * *****************************************************************************/ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, const vp9_variance_fn_ptr_t *fn_ptr, const MV *center_mv) { const int_mv maxmv = pack_int_mv(x->mv_row_max, x->mv_col_max); const __m128i v_max_mv_w = _mm_set1_epi32(maxmv.as_int); const int_mv minmv = pack_int_mv(x->mv_row_min, x->mv_col_min); const __m128i v_min_mv_w = _mm_set1_epi32(minmv.as_int); const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit); const __m128i v_joint_cost_0_d = _mm_set1_epi32(x->nmvjointsadcost[0]); const __m128i v_joint_cost_1_d = _mm_set1_epi32(x->nmvjointsadcost[1]); // search_param determines the length of the initial step and hence the number // of iterations. // 0 = initial step (MAX_FIRST_STEP) pel // 1 = (MAX_FIRST_STEP/2) pel, // 2 = (MAX_FIRST_STEP/4) pel... const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param]; const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param]; const int tot_steps = cfg->total_steps - search_param; const int_mv fcenter_mv = pack_int_mv(center_mv->row >> 3, center_mv->col >> 3); const __m128i vfcmv = _mm_set1_epi32(fcenter_mv.as_int); const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row); const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col); int_mv bmv = pack_int_mv(ref_row, ref_col); int_mv new_bmv = bmv; __m128i v_bmv_w = _mm_set1_epi32(bmv.as_int); const int what_stride = x->plane[0].src.stride; const int in_what_stride = x->e_mbd.plane[0].pre[0].stride; const uint8_t *const what = x->plane[0].src.buf; const uint8_t *const in_what = x->e_mbd.plane[0].pre[0].buf + ref_row * in_what_stride + ref_col; // Work out the start point for the search const uint8_t *best_address = in_what; const uint8_t *new_best_address = best_address; #if ARCH_X86_64 __m128i v_ba_q = _mm_set1_epi64x((intptr_t)best_address); #else __m128i v_ba_d = _mm_set1_epi32((intptr_t)best_address); #endif unsigned int best_sad; int i; int j; int step; // Check the prerequisite cost function properties that are easy to check // in an assert. See the function-level documentation for details on all // prerequisites. 
assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]); assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]); // Check the starting position best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride); best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit); *num00 = 0; for (i = 0, step = 0; step < tot_steps; step++) { for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) { __m128i v_sad_d; __m128i v_cost_d; __m128i v_outside_d; __m128i v_inside_d; __m128i v_diff_mv_w; #if ARCH_X86_64 __m128i v_blocka[2]; #else __m128i v_blocka[1]; #endif // Compute the candidate motion vectors const __m128i v_ss_mv_w = _mm_loadu_si128((const __m128i*)&ss_mv[i]); const __m128i v_these_mv_w = _mm_add_epi16(v_bmv_w, v_ss_mv_w); // Clamp them to the search bounds __m128i v_these_mv_clamp_w = v_these_mv_w; v_these_mv_clamp_w = _mm_min_epi16(v_these_mv_clamp_w, v_max_mv_w); v_these_mv_clamp_w = _mm_max_epi16(v_these_mv_clamp_w, v_min_mv_w); // The ones that did not change are inside the search area v_inside_d = _mm_cmpeq_epi32(v_these_mv_clamp_w, v_these_mv_w); // If none of them are inside, then move on if (__likely__(_mm_test_all_zeros(v_inside_d, v_inside_d))) { continue; } // The inverse mask indicates which of the MVs are outside v_outside_d = _mm_xor_si128(v_inside_d, _mm_set1_epi8(0xff)); // Shift right to keep the sign bit clear, we will use this later // to set the cost to the maximum value. v_outside_d = _mm_srli_epi32(v_outside_d, 1); // Compute the difference MV v_diff_mv_w = _mm_sub_epi16(v_these_mv_clamp_w, vfcmv); // We utilise the fact that the cost function is even, and use the // absolute difference. This allows us to use unsigned indexes later // and reduces cache pressure somewhat as only a half of the table // is ever referenced. v_diff_mv_w = _mm_abs_epi16(v_diff_mv_w); // Compute the SIMD pointer offsets. 
{ #if ARCH_X86_64 // sizeof(intptr_t) == 8 // Load the offsets __m128i v_bo10_q = _mm_loadu_si128((const __m128i*)&ss_os[i+0]); __m128i v_bo32_q = _mm_loadu_si128((const __m128i*)&ss_os[i+2]); // Set the ones falling outside to zero v_bo10_q = _mm_and_si128(v_bo10_q, _mm_cvtepi32_epi64(v_inside_d)); v_bo32_q = _mm_and_si128(v_bo32_q, _mm_unpackhi_epi32(v_inside_d, v_inside_d)); // Compute the candidate addresses v_blocka[0] = _mm_add_epi64(v_ba_q, v_bo10_q); v_blocka[1] = _mm_add_epi64(v_ba_q, v_bo32_q); #else // ARCH_X86 // sizeof(intptr_t) == 4 __m128i v_bo_d = _mm_loadu_si128((const __m128i*)&ss_os[i]); v_bo_d = _mm_and_si128(v_bo_d, v_inside_d); v_blocka[0] = _mm_add_epi32(v_ba_d, v_bo_d); #endif } fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], in_what_stride, (uint32_t*)&v_sad_d); // Look up the component cost of the residual motion vector { const int32_t row0 = _mm_extract_epi16(v_diff_mv_w, 0); const int32_t col0 = _mm_extract_epi16(v_diff_mv_w, 1); const int32_t row1 = _mm_extract_epi16(v_diff_mv_w, 2); const int32_t col1 = _mm_extract_epi16(v_diff_mv_w, 3); const int32_t row2 = _mm_extract_epi16(v_diff_mv_w, 4); const int32_t col2 = _mm_extract_epi16(v_diff_mv_w, 5); const int32_t row3 = _mm_extract_epi16(v_diff_mv_w, 6); const int32_t col3 = _mm_extract_epi16(v_diff_mv_w, 7); // Note: This is a use case for vpgather in AVX2 const uint32_t cost0 = x->nmvsadcost[0][row0] + x->nmvsadcost[0][col0]; const uint32_t cost1 = x->nmvsadcost[0][row1] + x->nmvsadcost[0][col1]; const uint32_t cost2 = x->nmvsadcost[0][row2] + x->nmvsadcost[0][col2]; const uint32_t cost3 = x->nmvsadcost[0][row3] + x->nmvsadcost[0][col3]; __m128i v_cost_10_d, v_cost_32_d; v_cost_10_d = _mm_cvtsi32_si128(cost0); v_cost_10_d = _mm_insert_epi32(v_cost_10_d, cost1, 1); v_cost_32_d = _mm_cvtsi32_si128(cost2); v_cost_32_d = _mm_insert_epi32(v_cost_32_d, cost3, 1); v_cost_d = _mm_unpacklo_epi64(v_cost_10_d, v_cost_32_d); } // Now add in the joint cost { const __m128i v_sel_d = _mm_cmpeq_epi32(v_diff_mv_w, _mm_setzero_si128()); const __m128i v_joint_cost_d = _mm_blendv_epi8(v_joint_cost_1_d, v_joint_cost_0_d, v_sel_d); v_cost_d = _mm_add_epi32(v_cost_d, v_joint_cost_d); } // Multiply by sad_per_bit v_cost_d = _mm_mullo_epi32(v_cost_d, v_spb_d); // ROUND_POWER_OF_TWO(v_cost_d, 8) v_cost_d = _mm_add_epi32(v_cost_d, _mm_set1_epi32(0x80)); v_cost_d = _mm_srai_epi32(v_cost_d, 8); // Add the cost to the sad v_sad_d = _mm_add_epi32(v_sad_d, v_cost_d); // Make the motion vectors outside the search area have max cost // by or'ing in the comparison mask, this way the minimum search won't // pick them. v_sad_d = _mm_or_si128(v_sad_d, v_outside_d); // Find the minimum value and index horizontally in v_sad_d { // Try speculatively on 16 bits, so we can use the minpos intrinsic const __m128i v_sad_w = _mm_packus_epi32(v_sad_d, v_sad_d); const __m128i v_minp_w = _mm_minpos_epu16(v_sad_w); uint32_t local_best_sad = _mm_extract_epi16(v_minp_w, 0); uint32_t local_best_idx = _mm_extract_epi16(v_minp_w, 1); // If the local best value is not saturated, just use it, otherwise // find the horizontal minimum again the hard way on 32 bits. // This is executed rarely. 
if (__unlikely__(local_best_sad == 0xffff)) { __m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d; v_loval_d = v_sad_d; v_loidx_d = _mm_set_epi32(3, 2, 1, 0); v_hival_d = _mm_srli_si128(v_loval_d, 8); v_hiidx_d = _mm_srli_si128(v_loidx_d, 8); v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d); v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d); v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d); v_hival_d = _mm_srli_si128(v_loval_d, 4); v_hiidx_d = _mm_srli_si128(v_loidx_d, 4); v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d); v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d); v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d); local_best_sad = _mm_extract_epi32(v_loval_d, 0); local_best_idx = _mm_extract_epi32(v_loidx_d, 0); } // Update the global minimum if the local minimum is smaller if (__likely__(local_best_sad < best_sad)) { new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx]; new_best_address = ((const uint8_t **)v_blocka)[local_best_idx]; best_sad = local_best_sad; } } } bmv = new_bmv; best_address = new_best_address; v_bmv_w = _mm_set1_epi32(bmv.as_int); #if ARCH_X86_64 v_ba_q = _mm_set1_epi64x((intptr_t)best_address); #else v_ba_d = _mm_set1_epi32((intptr_t)best_address); #endif if (__unlikely__(best_address == in_what)) { (*num00)++; } } *best_mv = bmv.as_mv; return best_sad; }
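The cost lookup inside the search loop distils to the scalar form below (a reading of the intrinsics; the table pointers stand in for x->nmvjointsadcost and x->nmvsadcost[0], and the final shift is the ROUND_POWER_OF_TWO(.., 8) noted in the comments). Because the component table is even, only |row| and |col| are ever used as indexes, which is why the vector path can take _mm_abs_epi16 of the difference MV.

#include <stdint.h>
#include <stdlib.h>

/* row/col: clamped candidate MV minus the full-pel centre MV (fcenter_mv). */
static uint32_t mvsad_err_cost_scalar(int row, int col,
                                      const int* joint_sadcost, /* nmvjointsadcost */
                                      const int* comp_sadcost,  /* nmvsadcost[0]   */
                                      int sad_per_bit) {
  /* Joint cost: index 0 only when both components are zero. */
  const int joint = (row == 0 && col == 0) ? joint_sadcost[0] : joint_sadcost[1];
  /* The component table is even, so only the absolute values are looked up. */
  const uint32_t cost =
      (uint32_t)(comp_sadcost[abs(row)] + comp_sadcost[abs(col)] + joint);
  return (cost * (uint32_t)sad_per_bit + 0x80) >> 8;  /* ROUND_POWER_OF_TWO(.., 8) */
}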
__m128i test_mm_abs_epi16(__m128i a) { // CHECK-LABEL: test_mm_abs_epi16 // CHECK: call <8 x i16> @llvm.x86.ssse3.pabs.w.128 return _mm_abs_epi16(a); }