static INLINE unsigned int hbd_obmc_sad_w8n(const uint8_t *pre8,
                                            const int pre_stride,
                                            const int32_t *wsrc,
                                            const int32_t *mask,
                                            const int width, const int height) {
  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
  const int pre_step = pre_stride - width;
  int n = 0;
  __m128i v_sad_d = _mm_setzero_si128();

  assert(width >= 8);
  assert(IS_POWER_OF_TWO(width));

  do {
    const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
    const __m128i v_m1_d = xx_load_128(mask + n + 4);
    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
    const __m128i v_p0_w = xx_loadl_64(pre + n);
    const __m128i v_m0_d = xx_load_128(mask + n);
    const __m128i v_w0_d = xx_load_128(wsrc + n);

    const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
    const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);

    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
    const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
    const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);

    // Rounded absolute difference
    const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);
    const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);

    v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);
    v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);

    n += 8;

    if (n % width == 0) pre += pre_step;
  } while (n < width * height);

  return xx_hsum_epi32_si32(v_sad_d);
}
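The pmaddwd substitution above works because every 32-bit lane of both operands holds a non-negative value below 2^15, so the upper 16-bit half of each lane is zero and the horizontal pair-add contributes nothing. A minimal standalone sketch (illustrative only, not part of the library) of that equivalence:

// Sketch: pmaddwd equals pmulld when each 32-bit lane holds one value < 2^15.
#include <assert.h>
#include <smmintrin.h>

static void demo_madd_vs_mullo(void) {
  const __m128i p = _mm_setr_epi32(3, 1000, 0x7fff, 7);  // all values < 2^15
  const __m128i m = _mm_setr_epi32(64, 64, 2, 4096);     // all values < 2^15
  const __m128i via_madd = _mm_madd_epi16(p, m);   // per lane: lo*lo + hi*hi, hi halves are zero
  const __m128i via_mull = _mm_mullo_epi32(p, m);  // per lane: full 32-bit product
  assert(_mm_movemask_epi8(_mm_cmpeq_epi32(via_madd, via_mull)) == 0xffff);
}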
// Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.'
static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred,
                                   uint16_t *count, uint32_t *accumulator) {
  const __m128i pred_u8 = _mm_loadl_epi64((const __m128i *)pred);
  const __m128i zero = _mm_setzero_si128();
  __m128i count_u16 = _mm_loadu_si128((const __m128i *)count);
  __m128i pred_u16 = _mm_cvtepu8_epi16(pred_u8);
  __m128i pred_0_u32, pred_1_u32;
  __m128i accum_0_u32, accum_1_u32;

  count_u16 = _mm_adds_epu16(count_u16, sum_u16);
  _mm_storeu_si128((__m128i *)count, count_u16);

  pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16);

  pred_0_u32 = _mm_cvtepu16_epi32(pred_u16);
  pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero);

  accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
  accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));

  accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
  accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);

  _mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
  _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
}
inline __m128i load_aligned_int32(const uint16_t* src)
{
    __m128i tmp = _mm_loadl_epi64((const __m128i*)src);
#if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION
    __m128i res = _mm_cvtepu16_epi32(tmp);
#else
    __m128i res = _mm_unpacklo_epi16(tmp, _mm_set1_epi16(0));
#endif
    return res;
}
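On targets without SSE4.1 the fallback interleaves the low 16-bit lanes with zeros, which yields the same zero-extension as the intrinsic. A small standalone check of that equivalence (illustrative only, not part of xsimd; assumes an SSE4.1 build so both paths can be compared side by side):

// Sketch: SSE4.1 widening vs. SSE2 unpack-with-zero produce identical results.
#include <assert.h>
#include <smmintrin.h>
#include <stdint.h>

static void demo_zero_extend(void) {
  const uint16_t src[4] = { 1, 0x8000u, 0xffffu, 42 };
  const __m128i tmp = _mm_loadl_epi64((const __m128i *)src);
  const __m128i a = _mm_cvtepu16_epi32(tmp);                       // SSE4.1 path
  const __m128i b = _mm_unpacklo_epi16(tmp, _mm_setzero_si128());  // SSE2 fallback
  assert(_mm_movemask_epi8(_mm_cmpeq_epi32(a, b)) == 0xffff);
  assert((uint32_t)_mm_extract_epi32(a, 2) == 0xffffu);  // zero-extended, not sign-extended
}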
void ahd_interpolate_tile(int top, char * buffer)
{
  int row, col, tr, tc, c, val;
  const int dir[4] = { -1, 1, -width, width };
  __m128i ldiff[2], abdiff[2];
  union hvrgbpix (*rgb)[width] = (union hvrgbpix (*)[width])buffer;
  union hvrgbpix *rix;
  union rgbpix * pix;
  union hvrgbpix (*lab)[width];
  short (*lix)[8];
  char (*homo)[width][2];
  lab  = (union hvrgbpix (*)[width])(buffer + 16*width*TS);
  homo = (char (*)[width][2])(buffer + 32*width*TS);

  const int left = 2;

  if ((uintptr_t)(image+top*width)&0xf || (uintptr_t)buffer&0xf) {
    fprintf(stderr, "unaligned buffers defeat speed!\n");
    abort();
  }

  /* Interpolate green horizontally and vertically, then red and blue,
     and convert to CIELab: */

  // Do the first two rows of green first.
  // Then one green, and RGB through the tile; this is because R/B needs the
  // down-right green value.
  for (row = top; row < top+2 && row < height-2; row++) {
    col = left + (FC(row,left) & 1);
    for (c = FC(row,col); col < width-2; col += 2) {
      pix = (union rgbpix*)image + row*width+col;
      val = ((pix[-1].g + pix[0].c[c] + pix[1].g) * 2
             - pix[-2].c[c] - pix[2].c[c]) >> 2;
      rgb[row-top][col-left].h.g = ULIM(val,pix[-1].g,pix[1].g);
      val = ((pix[-width].g + pix[0].c[c] + pix[width].g) * 2
             - pix[-2*width].c[c] - pix[2*width].c[c]) >> 2;
      rgb[row-top][col-left].v.g = ULIM(val,pix[-width].g,pix[width].g);
    }
  }

  for (; row < top+TS && row < height-2; row++) {
    int rowx = row-1;

    if (FC(rowx,left+1) == 1) {
      int c1 = FC(rowx+1,left+1),
          c2 = FC(rowx,left+2);

      pix = (union rgbpix*)image + row*width+left+1;
      rix = &rgb[row-top][1];

      val = ((pix[-1].g + pix[0].c[c1] + pix[1].g) * 2
             - pix[-2].c[c1] - pix[2].c[c1]) >> 2;
      rix[0].h.g = ULIM(val,pix[-1].g,pix[1].g);
      val = ((pix[-width].g + pix[0].c[c1] + pix[width].g) * 2
             - pix[-2*width].c[c1] - pix[2*width].c[c1]) >> 2;
      rix[0].v.g = ULIM(val,pix[-width].g,pix[width].g);

      for (col = left+1; col < width-3; col += 2) {
        pix = (union rgbpix*)image + rowx*width+col+1;

        union hvrgbpix rixr, rix0;

        rix = &rgb[rowx-top][col-left]+1;

        signed pix_diag = pix[-width-1].c[c1] + pix[-width+1].c[c1];
        signed pix_ul = pix[-width-1].c[c1];
        rixr.vec = _mm_set1_epi16(pix[-1].g);
        signed pix_lr = pix[-2].c[c2] + pix[0].c[c2];
        rix0.h.c[c2] = rix0.v.c[c2] = pix[0].c[c2];
        pix_diag += pix[width-1].c[c1] + pix[width+1].c[c1] + 1;
        signed pix_dl = pix[width-1].c[c1];

        // Fully loaded
        __m128i rix_dr = _mm_setr_epi32(pix[width].g, pix[width-1].c[c1],
                                        pix[1].g, pix[-width+1].c[c1]);
        rix_dr = _mm_add_epi32(rix_dr, _mm_setr_epi32(pix[width+1].c[c1],
                                                      pix[width+3].c[c1],
                                                      pix[width+1].c[c1], 0));
        rix_dr = _mm_add_epi32(rix_dr, _mm_setr_epi32(pix[width+2].g, 0,
                                                      pix[2*width+1].g,
                                                      pix[3*width+1].c[c1]));
        rix_dr = _mm_mullo_epi32(rix_dr, _mm_setr_epi32(2,1,2,1));
        // Half loaded
        rix_dr = _mm_hsub_epi32(rix_dr, _mm_setzero_si128());
        rix_dr = _mm_srai_epi32(rix_dr, 2);

        __m128i a = _mm_setr_epi32(pix[width].g, pix[1].g, 0, 0);
        __m128i b = _mm_setr_epi32(pix[width+2].g, pix[2*width+1].g, 0, 0);
        __m128i m = _mm_min_epi32(a, b);
        __m128i M = _mm_max_epi32(a, b);
        rix_dr = _mm_min_epi32(rix_dr, M);
        rix_dr = _mm_max_epi32(rix_dr, m);

        signed pix_udr = pix_ul + pix_dl;

        signed rix0_ul = rix[-width-1].h.g;
        signed rix1_ul = rix[-width-1].v.g;
        __m128i rix_ur = _mm_setr_epi32(rix[-width+1].h.g, rix[-width+1].v.g, 0, 0);
        signed rix0_rr = rix[-2].h.g;
        signed rix1_rr = rix[-2].v.g;

        rix0.h.g = rix[0].h.g;
        rix0.v.g = rix[0].v.g;
        signed rix0_dl = rix[width-1].h.g;
        signed rix1_dl = rix[width-1].v.g;

        // Fully loaded
        __m128i rix_udr = _mm_setr_epi32(rix0_ul, rix1_ul, rix0_rr, rix1_rr);
        rix_udr = _mm_add_epi32(rix_udr, _mm_setr_epi32(rix0_dl, rix1_dl,
                                                        rix0.h.g, rix0.v.g));

        __m128i v2 = _mm_set_epi32(pix_lr, pix_lr, pix_udr, pix_udr);
        v2 = _mm_sub_epi32(v2, rix_udr);
        v2 = _mm_srai_epi32(v2, 1);
        v2 = _mm_add_epi32(v2, _mm_cvtepu16_epi32(rixr.vec));
        v2 = _mm_max_epi32(v2, _mm_setzero_si128());
        v2 = _mm_min_epi32(v2, _mm_set1_epi32(0xffff));
        rixr.h.c[c2] = _mm_extract_epi32(v2, 2);
        rixr.v.c[c2] = _mm_extract_epi32(v2, 3);
        rixr.h.c[c1] = _mm_extract_epi32(v2, 0);
        rixr.v.c[c1] = _mm_extract_epi32(v2, 1);

        // The following only uses 64 bits
        __m128i v1 = _mm_set1_epi32(pix_diag);
        v1 = _mm_sub_epi32(v1, rix_ur);
        v1 = _mm_sub_epi32(v1, rix_dr);
        v1 = _mm_sub_epi32(v1, rix_udr);
        v1 = _mm_srai_epi32(v1, 2);
        v1 = _mm_add_epi32(v1, _mm_setr_epi32(rix0.h.g, rix0.v.g, 0, 0));
        v1 = _mm_max_epi32(v1, _mm_setzero_si128());
        v1 = _mm_min_epi32(v1, _mm_set1_epi32(0xffff));
        rix0.h.c[c1] = _mm_extract_epi32(v1, 0);
        rix0.v.c[c1] = _mm_extract_epi32(v1, 1);

        lab[rowx-top][col-left].vec   = cielabv(rixr);
        lab[rowx-top][col-left+1].vec = cielabv(rix0);

        _mm_store_si128(&rix[-1].vec, rixr.vec);
        _mm_store_si128(&rix[0].vec, rix0.vec);

        rix[width+1].h.g = _mm_extract_epi32(rix_dr, 0);
        rix[width+1].v.g = _mm_extract_epi32(rix_dr, 1);
      }
    } else {
__m128i test_mm_cvtepu16_epi32(__m128i a) {
  // CHECK-LABEL: test_mm_cvtepu16_epi32
  // CHECK: call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> {{.*}})
  // CHECK-ASM: pmovzxwd %xmm{{.*}}, %xmm{{.*}}
  return _mm_cvtepu16_epi32(a);
}
__m128i test_mm_cvtepu16_epi32(__m128i a) {
  // CHECK-LABEL: test_mm_cvtepu16_epi32
  // CHECK: shufflevector <8 x i16> {{.*}}, <8 x i16> {{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  // CHECK: zext <4 x i16> {{.*}} to <4 x i32>
  return _mm_cvtepu16_epi32(a);
}
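Both test variants exercise the same semantics: the low four unsigned 16-bit elements are zero-extended to 32 bits, which is what the shufflevector-plus-zext IR in the newer check lines expresses. A brief standalone sketch (illustrative, not from the clang test suite) comparing the intrinsic against a scalar reference:

// Sketch: _mm_cvtepu16_epi32 widens lanes 0..3 exactly like a scalar zero-extension.
#include <assert.h>
#include <smmintrin.h>
#include <stdint.h>

static void check_cvtepu16_epi32_semantics(void) {
  const uint16_t in[8] = { 0, 1, 0x7fffu, 0x8000u, 0xffffu, 2, 3, 4 };
  const __m128i v = _mm_loadu_si128((const __m128i *)in);
  const __m128i w = _mm_cvtepu16_epi32(v);  // only the low four lanes are used

  uint32_t out[4];
  _mm_storeu_si128((__m128i *)out, w);
  for (int k = 0; k < 4; ++k) {
    assert(out[k] == (uint32_t)in[k]);  // zero-extended, upper input lanes ignored
  }
}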
void av1_highbd_jnt_convolve_2d_sse4_1(
    const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0,
    int w, int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
  DECLARE_ALIGNED(16, int16_t,
                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
  CONV_BUF_TYPE *dst = conv_params->dst;
  int dst_stride = conv_params->dst_stride;
  int im_h = h + filter_params_y->taps - 1;
  int im_stride = MAX_SB_SIZE;
  int i, j;
  const int do_average = conv_params->do_average;
  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;

  const int w0 = conv_params->fwd_offset;
  const int w1 = conv_params->bck_offset;
  const __m128i wt0 = _mm_set1_epi32(w0);
  const __m128i wt1 = _mm_set1_epi32(w1);

  const int offset_0 =
      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
  const __m128i offset_const = _mm_set1_epi32(offset);
  const int rounding_shift =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1);
  const __m128i clip_pixel_to_bd =
      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));

  // Check that, even with 12-bit input, the intermediate values will fit
  // into an unsigned 16-bit intermediate array.
  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);

  /* Horizontal filter */
  {
    const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
        filter_params_x, subpel_x_q4 & SUBPEL_MASK);
    const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);

    // coeffs 0 1 0 1 2 3 2 3
    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
    // coeffs 4 5 4 5 6 7 6 7
    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);

    // coeffs 0 1 0 1 0 1 0 1
    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
    // coeffs 2 3 2 3 2 3 2 3
    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
    // coeffs 4 5 4 5 4 5 4 5
    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
    // coeffs 6 7 6 7 6 7 6 7
    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);

    const __m128i round_const = _mm_set1_epi32(
        ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
    const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);

    for (i = 0; i < im_h; ++i) {
      for (j = 0; j < w; j += 8) {
        const __m128i data =
            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
        const __m128i data2 =
            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]);

        // Filter even-index pixels
        const __m128i res_0 = _mm_madd_epi16(data, coeff_01);
        const __m128i res_2 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23);
        const __m128i res_4 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45);
        const __m128i res_6 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67);

        __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
                                         _mm_add_epi32(res_2, res_6));
        res_even =
            _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);

        // Filter odd-index pixels
        const __m128i res_1 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01);
        const __m128i res_3 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23);
        const __m128i res_5 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45);
        const __m128i res_7 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67);

        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
                                        _mm_add_epi32(res_3, res_7));
        res_odd =
            _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);

        // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
        __m128i res = _mm_packs_epi32(res_even, res_odd);
        _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res);
      }
    }
  }

  /* Vertical filter */
  {
    const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
        filter_params_y, subpel_y_q4 & SUBPEL_MASK);
    const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);

    // coeffs 0 1 0 1 2 3 2 3
    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
    // coeffs 4 5 4 5 6 7 6 7
    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);

    // coeffs 0 1 0 1 0 1 0 1
    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
    // coeffs 2 3 2 3 2 3 2 3
    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
    // coeffs 4 5 4 5 4 5 4 5
    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
    // coeffs 6 7 6 7 6 7 6 7
    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);

    const __m128i round_const = _mm_set1_epi32(
        ((1 << conv_params->round_1) >> 1) -
        (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
    const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);

    for (i = 0; i < h; ++i) {
      for (j = 0; j < w; j += 8) {
        // Filter even-index pixels
        const int16_t *data = &im_block[i * im_stride + j];
        const __m128i src_0 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
                               *(__m128i *)(data + 1 * im_stride));
        const __m128i src_2 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
                               *(__m128i *)(data + 3 * im_stride));
        const __m128i src_4 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
                               *(__m128i *)(data + 5 * im_stride));
        const __m128i src_6 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
                               *(__m128i *)(data + 7 * im_stride));

        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);

        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
                                               _mm_add_epi32(res_4, res_6));

        // Filter odd-index pixels
        const __m128i src_1 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
                               *(__m128i *)(data + 1 * im_stride));
        const __m128i src_3 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
                               *(__m128i *)(data + 3 * im_stride));
        const __m128i src_5 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
                               *(__m128i *)(data + 5 * im_stride));
        const __m128i src_7 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
                               *(__m128i *)(data + 7 * im_stride));

        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);

        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
                                              _mm_add_epi32(res_5, res_7));

        // Rearrange pixels back into the order 0 ... 7
        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);

        const __m128i res_lo_round =
            _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
        const __m128i res_unsigned_lo =
            _mm_add_epi32(res_lo_round, offset_const);

        if (w < 8) {
          if (do_average) {
            const __m128i data_0 =
                _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
            const __m128i data_ref_0 = _mm_cvtepu16_epi32(data_0);

            const __m128i comp_avg_res = highbd_comp_avg_sse4_1(
                &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);

            const __m128i round_result = highbd_convolve_rounding_sse2(
                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);

            const __m128i res_16b =
                _mm_packus_epi32(round_result, round_result);
            const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);

            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
          } else {
            const __m128i res_16b =
                _mm_packus_epi32(res_unsigned_lo, res_unsigned_lo);
            _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_16b);
          }
        } else {
          const __m128i res_hi_round =
              _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
          const __m128i res_unsigned_hi =
              _mm_add_epi32(res_hi_round, offset_const);

          if (do_average) {
            const __m128i data_lo =
                _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
            const __m128i data_hi =
                _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j + 4]));

            const __m128i data_ref_0_lo = _mm_cvtepu16_epi32(data_lo);
            const __m128i data_ref_0_hi = _mm_cvtepu16_epi32(data_hi);

            const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
                &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
            const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
                &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);

            const __m128i round_result_lo =
                highbd_convolve_rounding_sse2(&comp_avg_res_lo, &offset_const,
                                              &rounding_const, rounding_shift);
            const __m128i round_result_hi =
                highbd_convolve_rounding_sse2(&comp_avg_res_hi, &offset_const,
                                              &rounding_const, rounding_shift);

            const __m128i res_16b =
                _mm_packus_epi32(round_result_lo, round_result_hi);
            const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);

            _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
          } else {
            const __m128i res_16b =
                _mm_packus_epi32(res_unsigned_lo, res_unsigned_hi);
            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16b);
          }
        }
      }
    }
  }
}