template<int shift, int active_bits>
void Haar_invtransform_H_final_1_sse4_2_int16_t(void *_idata,
                                                const int istride,
                                                char *odata,
                                                const int ostride,
                                                const int iwidth,
                                                const int iheight,
                                                const int ooffset_x,
                                                const int ooffset_y,
                                                const int owidth,
                                                const int oheight) {
  int16_t *idata = (int16_t *)_idata;
  const int skip = 1;
  const __m128i ONE    = _mm_set1_epi16(1);
  const __m128i OFFSET = _mm_set1_epi16(1 << (active_bits - 1));
  const __m128i SHUF   = _mm_set_epi8(15,14, 11,10, 7,6, 3,2,
                                      13,12,  9, 8, 5,4, 1,0);
  const __m128i CLIP   = _mm_set1_epi16((1 << active_bits) - 1);
  const __m128i ZERO   = _mm_set1_epi16(0);
  (void)iwidth;
  (void)iheight;

  for (int y = ooffset_y; y < ooffset_y + oheight; y += skip) {
    for (int x = ooffset_x; x < ooffset_x + owidth; x += 16) {
      __m128i D0 = _mm_load_si128((__m128i *)&idata[y*istride + x + 0]);
      __m128i D8 = _mm_load_si128((__m128i *)&idata[y*istride + x + 8]);

      // gather even and odd coefficients into the two 64-bit halves
      D0 = _mm_shuffle_epi8(D0, SHUF);
      D8 = _mm_shuffle_epi8(D8, SHUF);
      __m128i E0 = _mm_unpacklo_epi64(D0, D8);  // even samples
      __m128i O1 = _mm_unpackhi_epi64(D0, D8);  // odd samples

      // inverse Haar: X0 = E0 - ((O1 + 1) >> 1), X1 = O1 + X0
      __m128i X0 = _mm_sub_epi16(E0, _mm_srai_epi16(_mm_add_epi16(O1, ONE), 1));
      __m128i X1 = _mm_add_epi16(O1, X0);

      // re-interleave the outputs
      __m128i Z0 = _mm_unpacklo_epi16(X0, X1);
      __m128i Z8 = _mm_unpackhi_epi16(X0, X1);

      if (shift != 0) {
        // rounding shift (shift is 0 or 1 for the Haar filters)
        Z0 = _mm_add_epi16(Z0, ONE);
        Z8 = _mm_add_epi16(Z8, ONE);
        Z0 = _mm_srai_epi16(Z0, shift);
        Z8 = _mm_srai_epi16(Z8, shift);
      }

      // re-center to the unsigned range, clamp to [0, (1 << active_bits) - 1]
      Z0 = _mm_add_epi16(Z0, OFFSET);
      Z8 = _mm_add_epi16(Z8, OFFSET);
      Z0 = _mm_min_epi16(Z0, CLIP);
      Z8 = _mm_min_epi16(Z8, CLIP);
      Z0 = _mm_max_epi16(Z0, ZERO);
      Z8 = _mm_max_epi16(Z8, ZERO);

      _mm_store_si128((__m128i *)&odata[2*((y - ooffset_y)*ostride + x + 0 - ooffset_x)], Z0);
      _mm_store_si128((__m128i *)&odata[2*((y - ooffset_y)*ostride + x + 8 - ooffset_x)], Z8);
    }
  }
}
void png_read_filter_row_paeth4_sse2(png_row_infop row_info, png_bytep row,
                                     png_const_bytep prev)
{
   /* Paeth tries to predict pixel d using the pixel to the left of it, a,
    * and two pixels from the previous row, b and c:
    *   prev: c b
    *   row:  a d
    * The Paeth function predicts d to be whichever of a, b, or c is nearest to
    * p=a+b-c.
    *
    * The first pixel has no left context, and so uses an Up filter, p = b.
    * This works naturally with our main loop's p = a+b-c if we force a and c
    * to zero.
    * Here we zero b and d, which become c and a respectively at the start of
    * the loop.
    */
   png_debug(1, "in png_read_filter_row_paeth4_sse2");

   const __m128i zero = _mm_setzero_si128();
   __m128i c, b = zero,
           a, d = zero;

   int rb = row_info->rowbytes;
   while (rb > 0) {
      /* It's easiest to do this math (particularly, deal with pc) with 16-bit
       * intermediates.
       */
      c = b; b = _mm_unpacklo_epi8(load4(prev), zero);
      a = d; d = _mm_unpacklo_epi8(load4(row ), zero);

      /* (p-a) == (a+b-c - a) == (b-c) */
      __m128i pa = _mm_sub_epi16(b, c);

      /* (p-b) == (a+b-c - b) == (a-c) */
      __m128i pb = _mm_sub_epi16(a, c);

      /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
      __m128i pc = _mm_add_epi16(pa, pb);

      pa = abs_i16(pa);  /* |p-a| */
      pb = abs_i16(pb);  /* |p-b| */
      pc = abs_i16(pc);  /* |p-c| */

      __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));

      /* Paeth breaks ties favoring a over b over c. */
      __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
                        if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
                                                                    c));

      /* Note `_epi8`: we need addition to wrap modulo 256. */
      d = _mm_add_epi8(d, nearest);
      store4(row, _mm_packus_epi16(d, d));

      prev += 4;
      row  += 4;
      rb   -= 4;
   }
}
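For reference, the scalar form of the predictor the vectorized loop reproduces. This sketch is written for this page (it is not libpng code) and mirrors the tie-breaking order a, then b, then c implemented by the branchless `if_then_else` chain above:

#include <stdlib.h>

/* Scalar Paeth predictor; illustration only. The SIMD code computes the same
 * selection branchlessly: smallest = min(pc, min(pa, pb)), then picks a if
 * pa == smallest, else b if pb == smallest, else c. */
static unsigned char paeth_predict(int a, int b, int c) {
  const int p = a + b - c;   /* initial estimate */
  const int pa = abs(p - a);
  const int pb = abs(p - b);
  const int pc = abs(p - c);
  if (pa <= pb && pa <= pc) return (unsigned char)a;  /* ties favor a */
  if (pb <= pc) return (unsigned char)b;              /* then b */
  return (unsigned char)c;                            /* then c */
}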
static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                  uint8_t *dst, const int16_t *filter) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  // TRANSPOSE...
  // 00 01 02 03 04 05 06 07
  // 10 11 12 13 14 15 16 17
  // 20 21 22 23 24 25 26 27
  // 30 31 32 33 34 35 36 37
  //
  // TO
  //
  // 00 10 20 30
  // 01 11 21 31
  // 02 12 22 32
  // 03 13 23 33
  // 04 14 24 34
  // 05 15 25 35
  // 06 16 26 36
  // 07 17 27 37
  //
  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
  const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
  const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  // 02 03 12 13 22 23 32 33
  const __m128i s3s2 = _mm_srli_si128(s1s0, 8);
  // 06 07 16 17 26 27 36 37
  const __m128i s7s6 = _mm_srli_si128(s5s4, 8);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift by 7 bit each 16 bit
  temp = _mm_mulhrs_epi16(temp, k_256);
  // shrink to 8 bit each 16 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 4 bytes
  *(int *)dst = _mm_cvtsi128_si32(temp);
}
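Note the accumulation order here (and in the w8 and vertical variants below): the four partial sums are combined as (x0 + x3) + min(x1, x2) + max(x1, x2) with saturating adds, not in index order. A scalar sketch of the idea, written for this page (sat_add16 and sum_partials are illustrative names); the intent appears to be to keep the running total small before the largest partial is added, so an intermediate saturation does not corrupt a result that would otherwise fit in 16 bits:

#include <stdint.h>

static int16_t sat_add16(int16_t a, int16_t b) {
  const int32_t s = (int32_t)a + b;  /* widen, then clamp like _mm_adds_epi16 */
  return (int16_t)(s > INT16_MAX ? INT16_MAX : (s < INT16_MIN ? INT16_MIN : s));
}

static int16_t sum_partials(int16_t x0, int16_t x1, int16_t x2, int16_t x3) {
  const int16_t lo = x1 < x2 ? x1 : x2;  /* min_x2x1 */
  const int16_t hi = x1 < x2 ? x2 : x1;  /* max_x2x1 */
  int16_t t = sat_add16(x0, x3);
  t = sat_add16(t, lo);                  /* smaller partial first */
  return sat_add16(t, hi);
}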
static void clamphigh_s16_sse(int16_t *dest, const int16_t *src1, int n,
                              const int16_t *src2_1)
{
  __m128i xmm1;
  int16_t max = *src2_1;

  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    int16_t x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
  }

  xmm1 = _mm_set1_epi16(max);
  for (; n >= 8; n -= 8) {
    __m128i xmm0;
    xmm0 = _mm_loadu_si128((__m128i *)src1);
    xmm0 = _mm_min_epi16(xmm0, xmm1);
    _mm_store_si128((__m128i *)dest, xmm0);
    dest += 8;
    src1 += 8;
  }
  for (; n > 0; n--) {
    int16_t x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
  }
}
static void vpx_highbd_filter_block1d4_h4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  // We will load multiple shifted versions of the row and shuffle them into
  // 16-bit words of the form
  // ... s[2] s[1] s[0] s[-1]
  // ... s[4] s[3] s[2] s[1]
  // Then we call multiply and add to get partial results
  // s[2]k[3]+s[1]k[2] s[0]k[3]+s[-1]k[2]
  // s[4]k[5]+s[3]k[4] s[2]k[5]+s[1]k[4]
  // The two results are then added together to get the even output

  __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3;
  __m128i res_reg;
  __m128i even, odd;

  __m128i kernel_reg;                    // Kernel
  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used
  const __m128i reg_round =
      _mm_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
  const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1);
  const __m128i reg_zero = _mm_setzero_si128();
  int h;

  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
  src_ptr -= 1;

  // Load Kernel
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);

  for (h = height; h > 0; --h) {
    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
    src_reg_shift_1 = _mm_srli_si128(src_reg, 2);
    src_reg_shift_2 = _mm_srli_si128(src_reg, 4);
    src_reg_shift_3 = _mm_srli_si128(src_reg, 6);

    // Output 2 0
    even = mm_madd_add_epi16_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
                                  &kernel_reg_45);

    // Output 3 1
    odd = mm_madd_add_epi16_sse2(&src_reg_shift_1, &src_reg_shift_3,
                                 &kernel_reg_23, &kernel_reg_45);

    // Combine to get the first half of the dst
    res_reg = _mm_unpacklo_epi32(even, odd);
    res_reg = mm_round_epi32_sse2(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
    res_reg = _mm_packs_epi32(res_reg, reg_zero);

    // Saturate the result and save
    res_reg = _mm_min_epi16(res_reg, reg_max);
    res_reg = _mm_max_epi16(res_reg, reg_zero);
    _mm_storel_epi64((__m128i *)dst_ptr, res_reg);

    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}
__m128i test_mm_min_epi16(__m128i A, __m128i B) {
  // DAG-LABEL: test_mm_min_epi16
  // DAG: call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
  //
  // ASM-LABEL: test_mm_min_epi16
  // ASM: pminsw
  return _mm_min_epi16(A, B);
}
template <int bpp>
void sk_paeth_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) {
    // Paeth tries to predict pixel d using the pixel to the left of it, a,
    // and two pixels from the previous row, b and c:
    //   prev: c b
    //   row:  a d
    // The Paeth function predicts d to be whichever of a, b, or c is nearest to p=a+b-c.

    // The first pixel has no left context, and so uses an Up filter, p = b.
    // This works naturally with our main loop's p = a+b-c if we force a and c to zero.
    // Here we zero b and d, which become c and a respectively at the start of the loop.
    const __m128i zero = _mm_setzero_si128();
    __m128i c, b = zero,
            a, d = zero;

    int rb = row_info->rowbytes;
    while (rb > 0) {
        // It's easiest to do this math (particularly, deal with pc) with 16-bit intermediates.
        c = b; b = _mm_unpacklo_epi8(load<bpp>(prev), zero);
        a = d; d = _mm_unpacklo_epi8(load<bpp>(row ), zero);

        __m128i pa = _mm_sub_epi16(b,c),   // (p-a) == (a+b-c - a) == (b-c)
                pb = _mm_sub_epi16(a,c),   // (p-b) == (a+b-c - b) == (a-c)
                pc = _mm_add_epi16(pa,pb); // (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c)

        pa = abs_i16(pa);  // |p-a|
        pb = abs_i16(pb);  // |p-b|
        pc = abs_i16(pc);  // |p-c|

        __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));

        // Paeth breaks ties favoring a over b over c.
        __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
                          if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
                                                                      c));

        d = _mm_add_epi8(d, nearest);  // Note `_epi8`: we need addition to wrap modulo 256.
        store<bpp>(row, _mm_packus_epi16(d,d));

        prev += bpp;
        row  += bpp;
        rb   -= bpp;
    }
}
static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch,
                                  uint8_t *dst, const int16_t *x_filter) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)x_filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  const __m128i A = _mm_loadl_epi64((const __m128i *)src_x);
  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch));
  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2));
  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3));
  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4));
  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5));
  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6));
  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7));
  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
  // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57
  const __m128i tr0_2 = _mm_unpacklo_epi16(E, F);
  // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77
  const __m128i tr0_3 = _mm_unpacklo_epi16(G, H);
  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
  const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73
  const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3);
  // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77
  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
  // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
  const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2);
  const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2);
  const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3);
  const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift by 7 bit each 16 bit
  temp = _mm_mulhrs_epi16(temp, k_256);
  // shrink to 8 bit each 16 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 8 bytes convolve result
  _mm_storel_epi64((__m128i *)dst, temp);
}
static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
                             int start_block, int end_block,
                             VP8Histogram* const histo) {
  const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
  int j;
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  for (j = start_block; j < end_block; ++j) {
    int16_t out[16];
    int k;

    VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);

    // Convert coefficients to bin (within out[]).
    {
      // Load.
      const __m128i out0 = _mm_loadu_si128((__m128i*)&out[0]);
      const __m128i out1 = _mm_loadu_si128((__m128i*)&out[8]);
      // sign(out) = out >> 15  (0x0000 if positive, 0xffff if negative)
      const __m128i sign0 = _mm_srai_epi16(out0, 15);
      const __m128i sign1 = _mm_srai_epi16(out1, 15);
      // abs(out) = (out ^ sign) - sign
      const __m128i xor0 = _mm_xor_si128(out0, sign0);
      const __m128i xor1 = _mm_xor_si128(out1, sign1);
      const __m128i abs0 = _mm_sub_epi16(xor0, sign0);
      const __m128i abs1 = _mm_sub_epi16(xor1, sign1);
      // v = abs(out) >> 3
      const __m128i v0 = _mm_srai_epi16(abs0, 3);
      const __m128i v1 = _mm_srai_epi16(abs1, 3);
      // bin = min(v, MAX_COEFF_THRESH)
      const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);
      const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);
      // Store.
      _mm_storeu_si128((__m128i*)&out[0], bin0);
      _mm_storeu_si128((__m128i*)&out[8], bin1);
    }

    // Convert coefficients to bin.
    for (k = 0; k < 16; ++k) {
      ++distribution[out[k]];
    }
  }
  VP8SetHistogramData(distribution, histo);
}
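Both histogram routines here (and QuantizeBlockSSE2 further down) rely on the classic branchless absolute value for 16-bit lanes. A scalar model of the identity, added for illustration only:

#include <stdint.h>

/* sign = x >> 15 (arithmetic shift) is 0 for x >= 0 and -1 (all ones) for
 * x < 0, so (x ^ sign) - sign equals x when x >= 0 and ~x + 1 == -x when
 * x < 0. Like abs(), it wraps for x == INT16_MIN. */
static int16_t abs16_branchless(int16_t x) {
  const int16_t sign = x >> 15;
  return (int16_t)((x ^ sign) - sign);
}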
SIMDValue SIMDInt16x8Operation::OpMin(const SIMDValue& aValue, const SIMDValue& bValue)
{
    X86SIMDValue x86Result;
    X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
    X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);

    x86Result.m128i_value = _mm_min_epi16(tmpaValue.m128i_value, tmpbValue.m128i_value); // min a b

    return X86SIMDValue::ToSIMDValue(x86Result);
}
static int CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred,
                                int start_block, int end_block) {
  int histo[MAX_COEFF_THRESH + 1] = { 0 };
  int16_t out[16];
  int j, k;
  const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
  for (j = start_block; j < end_block; ++j) {
    VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);

    // Convert coefficients to bin (within out[]).
    {
      // Load.
      const __m128i out0 = _mm_loadu_si128((__m128i*)&out[0]);
      const __m128i out1 = _mm_loadu_si128((__m128i*)&out[8]);
      // sign(out) = out >> 15  (0x0000 if positive, 0xffff if negative)
      const __m128i sign0 = _mm_srai_epi16(out0, 15);
      const __m128i sign1 = _mm_srai_epi16(out1, 15);
      // abs(out) = (out ^ sign) - sign
      const __m128i xor0 = _mm_xor_si128(out0, sign0);
      const __m128i xor1 = _mm_xor_si128(out1, sign1);
      const __m128i abs0 = _mm_sub_epi16(xor0, sign0);
      const __m128i abs1 = _mm_sub_epi16(xor1, sign1);
      // v = abs(out) >> 2
      const __m128i v0 = _mm_srai_epi16(abs0, 2);
      const __m128i v1 = _mm_srai_epi16(abs1, 2);
      // bin = min(v, MAX_COEFF_THRESH)
      const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);
      const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);
      // Store.
      _mm_storeu_si128((__m128i*)&out[0], bin0);
      _mm_storeu_si128((__m128i*)&out[8], bin1);
    }

    // Use bin to update histogram.
    for (k = 0; k < 16; ++k) {
      histo[out[k]]++;
    }
  }
  return VP8GetAlpha(histo);
}
static FORCE_INLINE __m128i mm_min_epu(const __m128i &a, const __m128i &b) {
    if (sizeof(PixelType) == 1)
        return _mm_min_epu8(a, b);
    else {
        __m128i word_32768 = _mm_set1_epi16(32768);

        __m128i a_minus = _mm_sub_epi16(a, word_32768);
        __m128i b_minus = _mm_sub_epi16(b, word_32768);

        return _mm_add_epi16(_mm_min_epi16(a_minus, b_minus), word_32768);
    }
}
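SSE2 has no unsigned 16-bit min, so this helper (and SIMDUint16x8Operation::OpMin below) first remaps unsigned words into signed order. Subtracting 0x8000, adding 0x8000, and XOR-ing with 0x8000 are all the same mapping modulo 2^16, and it carries [0, 65535] onto [-32768, 32767] while preserving order. A scalar sketch of the correspondence, for illustration:

#include <stdint.h>

/* min_epu16 emulated with a signed min: bias both operands by 0x8000, take
 * the signed min, and undo the bias afterwards. */
static uint16_t min_u16_via_s16(uint16_t a, uint16_t b) {
  const int16_t as = (int16_t)(a ^ 0x8000);  /* 0..65535 -> -32768..32767 */
  const int16_t bs = (int16_t)(b ^ 0x8000);
  const int16_t m = as < bs ? as : bs;       /* signed min matches unsigned order */
  return (uint16_t)(m ^ 0x8000);             /* undo the bias */
}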
void f0r_update(f0r_instance_t instance, double time,
                const uint32_t *inframe, uint32_t *outframe)
{
    assert(instance);
    colgate_instance_t *inst = (colgate_instance_t *)instance;
    unsigned len = inst->width * inst->height;
    unsigned char *dst = (unsigned char *)outframe;
    const unsigned char *src = (unsigned char *)inframe;
    unsigned i;

#ifdef __SSE2__
    __m128i zero = _mm_setzero_si128();
    __m128i max = _mm_set1_epi16(REVERSE_LUT_SIZE - 1);
    for (i = 0; i < len; ++i) {
        __m128i l1 = inst->premult_r[*src++];
        __m128i l2 = inst->premult_g[*src++];
        __m128i l3 = inst->premult_b[*src++];
        __m128i result = _mm_add_epi32(l3, _mm_add_epi32(l1, l2));

        // Shift into the right range, and then clamp to [min, max].
        // We convert to 16-bit values since we have min/max instructions
        // there (without needing SSE4), and because it allows us
        // to extract the values with one less SSE shift/move.
        result = _mm_srai_epi32(result, INPUT_PIXEL_BITS + MATRIX_ELEMENT_FRAC_BITS - REVERSE_LUT_BITS);
        result = _mm_packs_epi32(result, result);
        result = _mm_max_epi16(result, zero);
        result = _mm_min_epi16(result, max);

        unsigned new_rg = _mm_cvtsi128_si32(result);
        result = _mm_srli_si128(result, 4);
        unsigned new_b = _mm_cvtsi128_si32(result);

        *dst++ = linear_rgb_to_srgb_lut[new_rg & 0xffff];
        *dst++ = linear_rgb_to_srgb_lut[new_rg >> 16];
        *dst++ = linear_rgb_to_srgb_lut[new_b];
        *dst++ = *src++;  // Copy alpha.
    }
#else
    for (i = 0; i < len; ++i) {
        unsigned old_r = *src++;
        unsigned old_g = *src++;
        unsigned old_b = *src++;
        int new_r = inst->premult_r[old_r][0] + inst->premult_g[old_g][0] + inst->premult_b[old_b][0];
        int new_g = inst->premult_r[old_r][1] + inst->premult_g[old_g][1] + inst->premult_b[old_b][1];
        int new_b = inst->premult_r[old_r][2] + inst->premult_g[old_g][2] + inst->premult_b[old_b][2];
        *dst++ = convert_linear_rgb_to_srgb_fp(new_r);
        *dst++ = convert_linear_rgb_to_srgb_fp(new_g);
        *dst++ = convert_linear_rgb_to_srgb_fp(new_b);
        *dst++ = *src++;  // Copy alpha.
    }
#endif
}
__m64 _m_pminsw(__m64 _MM1, __m64 _MM2)
{
    __m128i lhs = {0}, rhs = {0};
    lhs.m128i_i64[0] = _MM1.m64_i64;
    rhs.m128i_i64[0] = _MM2.m64_i64;
    lhs = _mm_min_epi16(lhs, rhs);
    _MM1.m64_i64 = lhs.m128i_i64[0];
    return _MM1;
}
template<>
int cornerScore<8>(const uchar* ptr, const int pixel[], int threshold)
{
    const int K = 4, N = K*3 + 1;
    int k, v = ptr[0];
    short d[N];
    for( k = 0; k < N; k++ )
        d[k] = (short)(v - ptr[pixel[k]]);

#if CV_SSE2
    __m128i v0 = _mm_loadu_si128((__m128i*)(d+1));
    __m128i v1 = _mm_loadu_si128((__m128i*)(d+2));
    __m128i a = _mm_min_epi16(v0, v1);
    __m128i b = _mm_max_epi16(v0, v1);
    v0 = _mm_loadu_si128((__m128i*)(d+3));
    a = _mm_min_epi16(a, v0);
    b = _mm_max_epi16(b, v0);
    v0 = _mm_loadu_si128((__m128i*)(d+4));
    a = _mm_min_epi16(a, v0);
    b = _mm_max_epi16(b, v0);
    v0 = _mm_loadu_si128((__m128i*)(d));
    __m128i q0 = _mm_min_epi16(a, v0);
    __m128i q1 = _mm_max_epi16(b, v0);
    v0 = _mm_loadu_si128((__m128i*)(d+5));
    q0 = _mm_max_epi16(q0, _mm_min_epi16(a, v0));
    q1 = _mm_min_epi16(q1, _mm_max_epi16(b, v0));
    q0 = _mm_max_epi16(q0, _mm_sub_epi16(_mm_setzero_si128(), q1));
    q0 = _mm_max_epi16(q0, _mm_unpackhi_epi64(q0, q0));
    q0 = _mm_max_epi16(q0, _mm_srli_si128(q0, 4));
    q0 = _mm_max_epi16(q0, _mm_srli_si128(q0, 2));
    threshold = (short)_mm_cvtsi128_si32(q0) - 1;
#else
    int a0 = threshold;
    for( k = 0; k < 8; k += 2 )
    {
        int a = std::min((int)d[k+1], (int)d[k+2]);
        if( a <= a0 )
            continue;
        a = std::min(a, (int)d[k+3]);
        a = std::min(a, (int)d[k+4]);
        a0 = std::max(a0, std::min(a, (int)d[k]));
        a0 = std::max(a0, std::min(a, (int)d[k+5]));
    }

    int b0 = -a0;
    for( k = 0; k < 8; k += 2 )
    {
        int b = std::max((int)d[k+1], (int)d[k+2]);
        b = std::max(b, (int)d[k+3]);
        if( b >= b0 )
            continue;
        b = std::max(b, (int)d[k+4]);

        b0 = std::min(b0, std::max(b, (int)d[k]));
        b0 = std::min(b0, std::max(b, (int)d[k+5]));
    }

    threshold = -b0 - 1;
#endif
    return threshold;
}
template <bool CLIP_FLAG, bool SIGN_FLAG>
__m128i ProxyRwSse2 <SplFmt_INT16>::S16 <CLIP_FLAG, SIGN_FLAG>::prepare_write_clip (const __m128i &src, const __m128i &mi, const __m128i &ma, const __m128i &sign_bit)
{
    __m128i val = src;
    if (CLIP_FLAG)
    {
        val = _mm_min_epi16 (val, ma);
        val = _mm_max_epi16 (val, mi);
    }
    if (SIGN_FLAG)
    {
        val = _mm_xor_si128 (val, sign_bit);
    }

    return (val);
}
SIMDValue SIMDUint16x8Operation::OpMin(const SIMDValue& aValue, const SIMDValue& bValue)
{
    X86SIMDValue x86Result;
    X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
    X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);

    // _mm_min_epu16 is SSE4.1
    //x86Result.m128i_value = _mm_min_epu16(tmpaValue.m128i_value, tmpbValue.m128i_value);

    // XOR the sign bits so the comparison comes out correct for unsigned
    tmpaValue.m128i_value = _mm_xor_si128(tmpaValue.m128i_value, X86_WORD_SIGNBITS.m128i_value);
    tmpbValue.m128i_value = _mm_xor_si128(tmpbValue.m128i_value, X86_WORD_SIGNBITS.m128i_value);

    x86Result.m128i_value = _mm_min_epi16(tmpaValue.m128i_value, tmpbValue.m128i_value);
    x86Result.m128i_value = _mm_xor_si128(x86Result.m128i_value, X86_WORD_SIGNBITS.m128i_value);

    return X86SIMDValue::ToSIMDValue(x86Result);
}
static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                 uint8_t *dst, const int16_t *filter) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
  const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
  const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
  const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
  const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift by 7 bit each 16 bit
  temp = _mm_mulhrs_epi16(temp, k_256);
  // shrink to 8 bit each 16 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 8 bytes convolve result
  _mm_storel_epi64((__m128i *)dst, temp);
}
static INLINE void SIGNED_CLAMP_ADD(pi16 VD, pi16 VS, pi16 VT)
{
    v16 dst, src, vco;
    v16 max, min;

    src = _mm_load_si128((v16 *)VS);
    dst = _mm_load_si128((v16 *)VT);
    vco = _mm_load_si128((v16 *)cf_co);

    /*
     * Due to premature clamping in between adds, sometimes we need to add the
     * LESSER of two integers, either VS or VT, to the carry-in flag matching
     * the current vector register slice, BEFORE finally adding the greater
     * integer.
     */
    max = _mm_max_epi16(dst, src);
    min = _mm_min_epi16(dst, src);

    min = _mm_adds_epi16(min, vco);
    max = _mm_adds_epi16(max, min);
    _mm_store_si128((v16 *)VD, max);
    return;
}
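A standalone demonstration of why the carry is folded into the smaller operand first: with saturating adds, giving the carry to the larger operand can saturate early and silently drop the carry's contribution. This is an illustration written for this page; adds16 is a scalar stand-in for _mm_adds_epi16:

#include <stdint.h>
#include <stdio.h>

static int16_t adds16(int16_t a, int16_t b) {
  const int32_t s = (int32_t)a + b;
  return (int16_t)(s > 32767 ? 32767 : (s < -32768 ? -32768 : s));
}

int main(void) {
  /* operands 32767 and -1 with carry 1; the exact sum is 32767 */
  const int16_t min_first = adds16(adds16(-1, 1), 32767);  /* 0 + 32767 = 32767 */
  const int16_t max_first = adds16(adds16(32767, 1), -1);  /* saturates, then 32766 */
  printf("min first: %d, max first: %d\n", min_first, max_first);
  return 0;
}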
static void GradientPredictInverse(const uint8_t* const in,
                                   const uint8_t* const top,
                                   uint8_t* const row, int length) {
  if (length > 0) {
    int i;
    const int max_pos = length & ~7;
    const __m128i zero = _mm_setzero_si128();
    __m128i A = _mm_set_epi32(0, 0, 0, row[-1]);   // left sample
    for (i = 0; i < max_pos; i += 8) {
      const __m128i tmp0 = _mm_loadl_epi64((const __m128i*)&top[i]);
      const __m128i tmp1 = _mm_loadl_epi64((const __m128i*)&top[i - 1]);
      const __m128i B = _mm_unpacklo_epi8(tmp0, zero);
      const __m128i C = _mm_unpacklo_epi8(tmp1, zero);
      const __m128i tmp2 = _mm_loadl_epi64((const __m128i*)&in[i]);
      const __m128i D = _mm_unpacklo_epi8(tmp2, zero);   // base input
      const __m128i E = _mm_sub_epi16(B, C);  // unclipped gradient basis B - C
      __m128i out = zero;                     // accumulator for output
      __m128i mask_hi = _mm_set_epi32(0, 0, 0, 0xff);
      int k = 8;
      while (1) {
        const __m128i tmp3 = _mm_add_epi16(A, E);        // delta = A + B - C
        const __m128i tmp4 = _mm_min_epi16(tmp3, mask_hi);
        const __m128i tmp5 = _mm_max_epi16(tmp4, zero);  // clipped delta
        const __m128i tmp6 = _mm_add_epi16(tmp5, D);     // add to in[] values
        A = _mm_and_si128(tmp6, mask_hi);                // 1-complement clip
        out = _mm_or_si128(out, A);                      // accumulate output
        if (--k == 0) break;
        A = _mm_slli_si128(A, 2);                        // rotate left sample
        mask_hi = _mm_slli_si128(mask_hi, 2);            // rotate mask
      }
      A = _mm_srli_si128(A, 14);   // prepare left sample for next iteration
      _mm_storel_epi64((__m128i*)&row[i], _mm_packus_epi16(out, zero));
    }
    for (; i < length; ++i) {
      row[i] = in[i] + GradientPredictorC(row[i - 1], top[i], top[i - 1]);
    }
  }
}
void av1_highbd_wiener_convolve_add_src_ssse3(
    const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,
    ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
    const int16_t *filter_y, int y_step_q4, int w, int h,
    const ConvolveParams *conv_params, int bd) {
  assert(x_step_q4 == 16 && y_step_q4 == 16);
  assert(!(w & 7));
  assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
  (void)x_step_q4;
  (void)y_step_q4;

  const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);

  DECLARE_ALIGNED(16, uint16_t,
                  temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
  int intermediate_height = h + SUBPEL_TAPS - 1;
  int i, j;
  const int center_tap = ((SUBPEL_TAPS - 1) / 2);
  const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap;

  const __m128i zero = _mm_setzero_si128();
  // Add an offset to account for the "add_src" part of the convolve function.
  const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3);

  /* Horizontal filter */
  {
    const __m128i coeffs_x =
        _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset);

    // coeffs 0 1 0 1 2 3 2 3
    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
    // coeffs 4 5 4 5 6 7 6 7
    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);

    // coeffs 0 1 0 1 0 1 0 1
    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
    // coeffs 2 3 2 3 2 3 2 3
    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
    // coeffs 4 5 4 5 4 5 4 5
    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
    // coeffs 6 7 6 7 6 7 6 7
    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);

    const __m128i round_const = _mm_set1_epi32(
        (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1)));

    for (i = 0; i < intermediate_height; ++i) {
      for (j = 0; j < w; j += 8) {
        const __m128i data =
            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
        const __m128i data2 =
            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]);

        // Filter even-index pixels
        const __m128i res_0 = _mm_madd_epi16(data, coeff_01);
        const __m128i res_2 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23);
        const __m128i res_4 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45);
        const __m128i res_6 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67);

        __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
                                         _mm_add_epi32(res_2, res_6));
        res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const),
                                  conv_params->round_0);

        // Filter odd-index pixels
        const __m128i res_1 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01);
        const __m128i res_3 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23);
        const __m128i res_5 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45);
        const __m128i res_7 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67);

        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
                                        _mm_add_epi32(res_3, res_7));
        res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const),
                                 conv_params->round_0);

        // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
        const __m128i maxval =
            _mm_set1_epi16((WIENER_CLAMP_LIMIT(conv_params->round_0, bd)) - 1);
        __m128i res = _mm_packs_epi32(res_even, res_odd);
        res = _mm_min_epi16(_mm_max_epi16(res, zero), maxval);
        _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res);
      }
    }
  }

  /* Vertical filter */
  {
    const __m128i coeffs_y =
        _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset);

    // coeffs 0 1 0 1 2 3 2 3
    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
    // coeffs 4 5 4 5 6 7 6 7
    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);

    // coeffs 0 1 0 1 0 1 0 1
    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
    // coeffs 2 3 2 3 2 3 2 3
    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
    // coeffs 4 5 4 5 4 5 4 5
    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
    // coeffs 6 7 6 7 6 7 6 7
    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);

    const __m128i round_const =
        _mm_set1_epi32((1 << (conv_params->round_1 - 1)) -
                       (1 << (bd + conv_params->round_1 - 1)));

    for (i = 0; i < h; ++i) {
      for (j = 0; j < w; j += 8) {
        // Filter even-index pixels
        const uint16_t *data = &temp[i * MAX_SB_SIZE + j];
        const __m128i src_0 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
                               *(__m128i *)(data + 1 * MAX_SB_SIZE));
        const __m128i src_2 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
                               *(__m128i *)(data + 3 * MAX_SB_SIZE));
        const __m128i src_4 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
                               *(__m128i *)(data + 5 * MAX_SB_SIZE));
        const __m128i src_6 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
                               *(__m128i *)(data + 7 * MAX_SB_SIZE));

        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);

        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
                                               _mm_add_epi32(res_4, res_6));

        // Filter odd-index pixels
        const __m128i src_1 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
                               *(__m128i *)(data + 1 * MAX_SB_SIZE));
        const __m128i src_3 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
                               *(__m128i *)(data + 3 * MAX_SB_SIZE));
        const __m128i src_5 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
                               *(__m128i *)(data + 5 * MAX_SB_SIZE));
        const __m128i src_7 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
                               *(__m128i *)(data + 7 * MAX_SB_SIZE));

        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);

        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
                                              _mm_add_epi32(res_5, res_7));

        // Rearrange pixels back into the order 0 ... 7
        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);

        const __m128i res_lo_round = _mm_srai_epi32(
            _mm_add_epi32(res_lo, round_const), conv_params->round_1);
        const __m128i res_hi_round = _mm_srai_epi32(
            _mm_add_epi32(res_hi, round_const), conv_params->round_1);

        const __m128i maxval = _mm_set1_epi16((1 << bd) - 1);
        __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
        res_16bit = _mm_min_epi16(_mm_max_epi16(res_16bit, zero), maxval);

        __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
        _mm_storeu_si128(p, res_16bit);
      }
    }
  }
}
// Simple quantization
static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
                             int n, const VP8Matrix* const mtx) {
  const __m128i max_coeff_2047 = _mm_set1_epi16(2047);
  const __m128i zero = _mm_set1_epi16(0);
  __m128i sign0, sign8;
  __m128i coeff0, coeff8;
  __m128i out0, out8;
  __m128i packed_out;

  // Load all inputs.
  // TODO(cduvivier): Make variable declarations and allocations aligned so
  //                  that we can use _mm_load_si128 instead of _mm_loadu_si128.
  __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
  __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
  const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[0]);
  const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[8]);
  const __m128i iq0 = _mm_loadu_si128((__m128i*)&mtx->iq_[0]);
  const __m128i iq8 = _mm_loadu_si128((__m128i*)&mtx->iq_[8]);
  const __m128i bias0 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]);
  const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]);
  const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]);
  const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]);
  const __m128i zthresh0 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[0]);
  const __m128i zthresh8 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[8]);

  // sign(in) = in >> 15  (0x0000 if positive, 0xffff if negative)
  sign0 = _mm_srai_epi16(in0, 15);
  sign8 = _mm_srai_epi16(in8, 15);

  // coeff = abs(in) = (in ^ sign) - sign
  coeff0 = _mm_xor_si128(in0, sign0);
  coeff8 = _mm_xor_si128(in8, sign8);
  coeff0 = _mm_sub_epi16(coeff0, sign0);
  coeff8 = _mm_sub_epi16(coeff8, sign8);

  // coeff = abs(in) + sharpen
  coeff0 = _mm_add_epi16(coeff0, sharpen0);
  coeff8 = _mm_add_epi16(coeff8, sharpen8);

  // if (coeff > 2047) coeff = 2047
  coeff0 = _mm_min_epi16(coeff0, max_coeff_2047);
  coeff8 = _mm_min_epi16(coeff8, max_coeff_2047);

  // out = (coeff * iQ + B) >> QFIX;
  {
    // doing calculations with 32b precision (QFIX=17)
    // out = (coeff * iQ)
    __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
    __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
    __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
    __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
    __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
    __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);

    // expand bias from 16b to 32b
    __m128i bias_00 = _mm_unpacklo_epi16(bias0, zero);
    __m128i bias_04 = _mm_unpackhi_epi16(bias0, zero);
    __m128i bias_08 = _mm_unpacklo_epi16(bias8, zero);
    __m128i bias_12 = _mm_unpackhi_epi16(bias8, zero);

    // out = (coeff * iQ + B)
    out_00 = _mm_add_epi32(out_00, bias_00);
    out_04 = _mm_add_epi32(out_04, bias_04);
    out_08 = _mm_add_epi32(out_08, bias_08);
    out_12 = _mm_add_epi32(out_12, bias_12);

    // out = (coeff * iQ + B) >> QFIX;
    out_00 = _mm_srai_epi32(out_00, QFIX);
    out_04 = _mm_srai_epi32(out_04, QFIX);
    out_08 = _mm_srai_epi32(out_08, QFIX);
    out_12 = _mm_srai_epi32(out_12, QFIX);

    // pack result as 16b
    out0 = _mm_packs_epi32(out_00, out_04);
    out8 = _mm_packs_epi32(out_08, out_12);
  }

  // get sign back (if (sign[j]) out_n = -out_n)
  out0 = _mm_xor_si128(out0, sign0);
  out8 = _mm_xor_si128(out8, sign8);
  out0 = _mm_sub_epi16(out0, sign0);
  out8 = _mm_sub_epi16(out8, sign8);

  // in = out * Q
  in0 = _mm_mullo_epi16(out0, q0);
  in8 = _mm_mullo_epi16(out8, q8);

  // if (coeff <= mtx->zthresh_) {in=0; out=0;}
  {
    __m128i cmp0 = _mm_cmpgt_epi16(coeff0, zthresh0);
    __m128i cmp8 = _mm_cmpgt_epi16(coeff8, zthresh8);
    in0 = _mm_and_si128(in0, cmp0);
    in8 = _mm_and_si128(in8, cmp8);
    _mm_storeu_si128((__m128i*)&in[0], in0);
    _mm_storeu_si128((__m128i*)&in[8], in8);
    out0 = _mm_and_si128(out0, cmp0);
    out8 = _mm_and_si128(out8, cmp8);
  }

  // zigzag the output before storing it.
  //
  // The zigzag pattern can almost be reproduced with a small sequence of
  // shuffles. After it, we only need to swap the 7th (ending up in third
  // position instead of twelfth) and 8th values.
  {
    __m128i outZ0, outZ8;
    outZ0 = _mm_shufflehi_epi16(out0,  _MM_SHUFFLE(2, 1, 3, 0));
    outZ0 = _mm_shuffle_epi32  (outZ0, _MM_SHUFFLE(3, 1, 2, 0));
    outZ0 = _mm_shufflehi_epi16(outZ0, _MM_SHUFFLE(3, 1, 0, 2));
    outZ8 = _mm_shufflelo_epi16(out8,  _MM_SHUFFLE(3, 0, 2, 1));
    outZ8 = _mm_shuffle_epi32  (outZ8, _MM_SHUFFLE(3, 1, 2, 0));
    outZ8 = _mm_shufflelo_epi16(outZ8, _MM_SHUFFLE(1, 3, 2, 0));
    _mm_storeu_si128((__m128i*)&out[0], outZ0);
    _mm_storeu_si128((__m128i*)&out[8], outZ8);
    packed_out = _mm_packs_epi16(outZ0, outZ8);
  }
  {
    const int16_t outZ_12 = out[12];
    const int16_t outZ_3 = out[3];
    out[3] = outZ_12;
    out[12] = outZ_3;
  }

  // detect if all 'out' values are zeroes or not
  {
    int32_t tmp[4];
    _mm_storeu_si128((__m128i*)tmp, packed_out);
    if (n) {
      tmp[0] &= ~0xff;
    }
    return (tmp[3] || tmp[2] || tmp[1] || tmp[0]);
  }
}
SIMD_INLINE __m128i TextureBoostedSaturatedGradient16(__m128i difference,
    __m128i saturation, const __m128i & boost)
{
    return _mm_mullo_epi16(
        _mm_max_epi16(K_ZERO,
            _mm_add_epi16(saturation, _mm_min_epi16(difference, saturation))),
        boost);
}
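Algebraically, for saturation >= 0 this evaluates boost * (saturation + clamp(difference, -saturation, +saturation)), mapping the gradient into [0, 2*saturation] before scaling. A scalar model, added for illustration only:

#include <stdint.h>

static int16_t boosted_saturated_gradient(int16_t difference,
                                          int16_t saturation, int16_t boost) {
  int16_t clipped = difference < saturation ? difference : saturation;  /* min */
  int16_t v = (int16_t)(saturation + clipped);  /* wraps like _mm_add_epi16 */
  if (v < 0) v = 0;                             /* max with K_ZERO */
  return (int16_t)(v * boost);                  /* low 16 bits, like mullo */
}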
static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr,
                                         ptrdiff_t src_pixels_per_line,
                                         uint8_t *output_ptr,
                                         ptrdiff_t output_pitch,
                                         uint32_t output_height,
                                         const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
  __m256i srcReg32b1, srcReg32b2, filtersReg32;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 256 bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across 256 bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));

  filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2);
  filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2);
  filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2);
  filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2);

  // multiply the size of the source and destination stride by two
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
  for (i = output_height; i > 1; i -= 2) {
    // load the 2 strides of source
    srcReg32b1 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr - 3)));
    srcReg32b1 = _mm256_inserti128_si256(
        srcReg32b1,
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)),
        1);

    // filter the source buffer
    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(
        srcRegFilt32b1_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));

    // reading 2 strides of the next 16 bytes
    // (part of it was being read by earlier read)
    srcReg32b2 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + 5)));
    srcReg32b2 = _mm256_inserti128_si256(
        srcReg32b2,
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)),
        1);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(
        srcRegFilt32b1_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));

    // filter the source buffer
    srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);

    // add and saturate the results together
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // add and saturate the results together
    srcRegFilt32b2_1 = _mm256_adds_epi16(
        srcRegFilt32b2_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
    srcRegFilt32b2_1 = _mm256_adds_epi16(
        srcRegFilt32b2_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));

    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64);
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64);

    // shift by 7 bit each 16 bit
    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7);
    srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve
    // result
    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);

    src_ptr += src_stride;

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr,
                    _mm256_castsi256_si128(srcRegFilt32b1_1));

    // save the next 16 bytes
    _mm_store_si128((__m128i *)(output_ptr + output_pitch),
                    _mm256_extractf128_si256(srcRegFilt32b1_1, 1));
    output_ptr += dst_stride;
  }

  // if the number of strides is odd,
  // process only 16 bytes
  if (i > 0) {
    __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
    __m128i srcRegFilt2, srcRegFilt3;

    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1_1 =
        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 =
        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2));

    // reading the next 16 bytes
    // (part of it was being read by earlier read)
    srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));

    // add and saturate the results together
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    // filter the source buffer
    srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt2_1 =
        _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 =
        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt2_1 =
        _mm_adds_epi16(srcRegFilt2_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2));
    srcRegFilt2_1 =
        _mm_adds_epi16(srcRegFilt2_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg64));
    srcRegFilt2_1 =
        _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg64));

    // shift by 7 bit each 16 bit
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve
    // result
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1);
  }
}
/*****************************************************************************
 * This function utilises 3 properties of the cost function lookup tables,   *
 * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in       *
 * vp9_encoder.c.                                                            *
 * For the joint cost:                                                       *
 *   - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3]           *
 * For the component costs:                                                  *
 *   - For all i: mvsadcost[0][i] == mvsadcost[1][i]                         *
 *         (Equal costs for both components)                                 *
 *   - For all i: mvsadcost[0][i] == mvsadcost[0][-i]                        *
 *         (Cost function is even)                                           *
 * If these do not hold, then this function cannot be used without           *
 * modification, in which case you can revert to using the C implementation, *
 * which does not rely on these properties.                                  *
 *****************************************************************************/
int vp9_diamond_search_sad_avx(const MACROBLOCK *x,
                               const search_site_config *cfg,
                               MV *ref_mv, MV *best_mv, int search_param,
                               int sad_per_bit, int *num00,
                               const vp9_variance_fn_ptr_t *fn_ptr,
                               const MV *center_mv) {
  const int_mv maxmv = pack_int_mv(x->mv_row_max, x->mv_col_max);
  const __m128i v_max_mv_w = _mm_set1_epi32(maxmv.as_int);
  const int_mv minmv = pack_int_mv(x->mv_row_min, x->mv_col_min);
  const __m128i v_min_mv_w = _mm_set1_epi32(minmv.as_int);

  const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit);

  const __m128i v_joint_cost_0_d = _mm_set1_epi32(x->nmvjointsadcost[0]);
  const __m128i v_joint_cost_1_d = _mm_set1_epi32(x->nmvjointsadcost[1]);

  // search_param determines the length of the initial step and hence the
  // number of iterations.
  // 0 = initial step (MAX_FIRST_STEP) pel
  // 1 = (MAX_FIRST_STEP/2) pel,
  // 2 = (MAX_FIRST_STEP/4) pel...
  const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param];
  const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param];
  const int tot_steps = cfg->total_steps - search_param;

  const int_mv fcenter_mv = pack_int_mv(center_mv->row >> 3,
                                        center_mv->col >> 3);
  const __m128i vfcmv = _mm_set1_epi32(fcenter_mv.as_int);

  const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row);
  const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col);

  int_mv bmv = pack_int_mv(ref_row, ref_col);
  int_mv new_bmv = bmv;
  __m128i v_bmv_w = _mm_set1_epi32(bmv.as_int);

  const int what_stride = x->plane[0].src.stride;
  const int in_what_stride = x->e_mbd.plane[0].pre[0].stride;
  const uint8_t *const what = x->plane[0].src.buf;
  const uint8_t *const in_what = x->e_mbd.plane[0].pre[0].buf +
                                 ref_row * in_what_stride + ref_col;

  // Work out the start point for the search
  const uint8_t *best_address = in_what;
  const uint8_t *new_best_address = best_address;
#if ARCH_X86_64
  __m128i v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
#else
  __m128i v_ba_d = _mm_set1_epi32((intptr_t)best_address);
#endif

  unsigned int best_sad;

  int i;
  int j;
  int step;

  // Check the prerequisite cost function properties that are easy to check
  // in an assert. See the function-level documentation for details on all
  // prerequisites.
  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]);
  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]);

  // Check the starting position
  best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride);
  best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit);

  *num00 = 0;

  for (i = 0, step = 0; step < tot_steps; step++) {
    for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) {
      __m128i v_sad_d;
      __m128i v_cost_d;
      __m128i v_outside_d;
      __m128i v_inside_d;
      __m128i v_diff_mv_w;
#if ARCH_X86_64
      __m128i v_blocka[2];
#else
      __m128i v_blocka[1];
#endif

      // Compute the candidate motion vectors
      const __m128i v_ss_mv_w = _mm_loadu_si128((const __m128i*)&ss_mv[i]);
      const __m128i v_these_mv_w = _mm_add_epi16(v_bmv_w, v_ss_mv_w);
      // Clamp them to the search bounds
      __m128i v_these_mv_clamp_w = v_these_mv_w;
      v_these_mv_clamp_w = _mm_min_epi16(v_these_mv_clamp_w, v_max_mv_w);
      v_these_mv_clamp_w = _mm_max_epi16(v_these_mv_clamp_w, v_min_mv_w);

      // The ones that did not change are inside the search area
      v_inside_d = _mm_cmpeq_epi32(v_these_mv_clamp_w, v_these_mv_w);

      // If none of them are inside, then move on
      if (__likely__(_mm_test_all_zeros(v_inside_d, v_inside_d))) {
        continue;
      }

      // The inverse mask indicates which of the MVs are outside
      v_outside_d = _mm_xor_si128(v_inside_d, _mm_set1_epi8(0xff));
      // Shift right to keep the sign bit clear, we will use this later
      // to set the cost to the maximum value.
      v_outside_d = _mm_srli_epi32(v_outside_d, 1);

      // Compute the difference MV
      v_diff_mv_w = _mm_sub_epi16(v_these_mv_clamp_w, vfcmv);
      // We utilise the fact that the cost function is even, and use the
      // absolute difference. This allows us to use unsigned indexes later
      // and reduces cache pressure somewhat as only a half of the table
      // is ever referenced.
      v_diff_mv_w = _mm_abs_epi16(v_diff_mv_w);

      // Compute the SIMD pointer offsets.
      {
#if ARCH_X86_64  //  sizeof(intptr_t) == 8
        // Load the offsets
        __m128i v_bo10_q = _mm_loadu_si128((const __m128i*)&ss_os[i + 0]);
        __m128i v_bo32_q = _mm_loadu_si128((const __m128i*)&ss_os[i + 2]);
        // Set the ones falling outside to zero
        v_bo10_q = _mm_and_si128(v_bo10_q, _mm_cvtepi32_epi64(v_inside_d));
        v_bo32_q = _mm_and_si128(v_bo32_q,
                                 _mm_unpackhi_epi32(v_inside_d, v_inside_d));
        // Compute the candidate addresses
        v_blocka[0] = _mm_add_epi64(v_ba_q, v_bo10_q);
        v_blocka[1] = _mm_add_epi64(v_ba_q, v_bo32_q);
#else  // ARCH_X86  //  sizeof(intptr_t) == 4
        __m128i v_bo_d = _mm_loadu_si128((const __m128i*)&ss_os[i]);
        v_bo_d = _mm_and_si128(v_bo_d, v_inside_d);
        v_blocka[0] = _mm_add_epi32(v_ba_d, v_bo_d);
#endif
      }

      fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0],
                     in_what_stride, (uint32_t*)&v_sad_d);

      // Look up the component cost of the residual motion vector
      {
        const int32_t row0 = _mm_extract_epi16(v_diff_mv_w, 0);
        const int32_t col0 = _mm_extract_epi16(v_diff_mv_w, 1);
        const int32_t row1 = _mm_extract_epi16(v_diff_mv_w, 2);
        const int32_t col1 = _mm_extract_epi16(v_diff_mv_w, 3);
        const int32_t row2 = _mm_extract_epi16(v_diff_mv_w, 4);
        const int32_t col2 = _mm_extract_epi16(v_diff_mv_w, 5);
        const int32_t row3 = _mm_extract_epi16(v_diff_mv_w, 6);
        const int32_t col3 = _mm_extract_epi16(v_diff_mv_w, 7);

        // Note: This is a use case for vpgather in AVX2
        const uint32_t cost0 = x->nmvsadcost[0][row0] + x->nmvsadcost[0][col0];
        const uint32_t cost1 = x->nmvsadcost[0][row1] + x->nmvsadcost[0][col1];
        const uint32_t cost2 = x->nmvsadcost[0][row2] + x->nmvsadcost[0][col2];
        const uint32_t cost3 = x->nmvsadcost[0][row3] + x->nmvsadcost[0][col3];

        __m128i v_cost_10_d, v_cost_32_d;

        v_cost_10_d = _mm_cvtsi32_si128(cost0);
        v_cost_10_d = _mm_insert_epi32(v_cost_10_d, cost1, 1);

        v_cost_32_d = _mm_cvtsi32_si128(cost2);
        v_cost_32_d = _mm_insert_epi32(v_cost_32_d, cost3, 1);

        v_cost_d = _mm_unpacklo_epi64(v_cost_10_d, v_cost_32_d);
      }

      // Now add in the joint cost
      {
        const __m128i v_sel_d = _mm_cmpeq_epi32(v_diff_mv_w,
                                                _mm_setzero_si128());
        const __m128i v_joint_cost_d = _mm_blendv_epi8(v_joint_cost_1_d,
                                                       v_joint_cost_0_d,
                                                       v_sel_d);
        v_cost_d = _mm_add_epi32(v_cost_d, v_joint_cost_d);
      }

      // Multiply by sad_per_bit
      v_cost_d = _mm_mullo_epi32(v_cost_d, v_spb_d);
      // ROUND_POWER_OF_TWO(v_cost_d, 8)
      v_cost_d = _mm_add_epi32(v_cost_d, _mm_set1_epi32(0x80));
      v_cost_d = _mm_srai_epi32(v_cost_d, 8);
      // Add the cost to the sad
      v_sad_d = _mm_add_epi32(v_sad_d, v_cost_d);

      // Make the motion vectors outside the search area have max cost
      // by or'ing in the comparison mask, this way the minimum search won't
      // pick them.
      v_sad_d = _mm_or_si128(v_sad_d, v_outside_d);

      // Find the minimum value and index horizontally in v_sad_d
      {
        // Try speculatively on 16 bits, so we can use the minpos intrinsic
        const __m128i v_sad_w = _mm_packus_epi32(v_sad_d, v_sad_d);
        const __m128i v_minp_w = _mm_minpos_epu16(v_sad_w);

        uint32_t local_best_sad = _mm_extract_epi16(v_minp_w, 0);
        uint32_t local_best_idx = _mm_extract_epi16(v_minp_w, 1);

        // If the local best value is not saturated, just use it, otherwise
        // find the horizontal minimum again the hard way on 32 bits.
        // This is executed rarely.
        if (__unlikely__(local_best_sad == 0xffff)) {
          __m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d;

          v_loval_d = v_sad_d;
          v_loidx_d = _mm_set_epi32(3, 2, 1, 0);
          v_hival_d = _mm_srli_si128(v_loval_d, 8);
          v_hiidx_d = _mm_srli_si128(v_loidx_d, 8);

          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);

          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);
          v_hival_d = _mm_srli_si128(v_loval_d, 4);
          v_hiidx_d = _mm_srli_si128(v_loidx_d, 4);

          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);

          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);

          local_best_sad = _mm_extract_epi32(v_loval_d, 0);
          local_best_idx = _mm_extract_epi32(v_loidx_d, 0);
        }

        // Update the global minimum if the local minimum is smaller
        if (__likely__(local_best_sad < best_sad)) {
          new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx];
          new_best_address = ((const uint8_t **)v_blocka)[local_best_idx];

          best_sad = local_best_sad;
        }
      }
    }

    bmv = new_bmv;
    best_address = new_best_address;

    v_bmv_w = _mm_set1_epi32(bmv.as_int);
#if ARCH_X86_64
    v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
#else
    v_ba_d = _mm_set1_epi32((intptr_t)best_address);
#endif

    if (__unlikely__(best_address == in_what)) {
      (*num00)++;
    }
  }

  *best_mv = bmv.as_mv;
  return best_sad;
}
static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr,
                                         ptrdiff_t src_pitch,
                                         uint8_t *output_ptr,
                                         ptrdiff_t out_pitch,
                                         uint32_t output_height,
                                         const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg64;
  __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
  __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
  __m256i srcReg32b11, srcReg32b12, filtersReg32;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  // create a register with the rounding constant 64 in each 16 bit element,
  // i.e. the byte pattern 64,0,64,0,...,64,0
  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // convert the 16 bit (short) taps to 8 bit (byte) and replicate them in
  // both halves of the 128 bit register
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across the 256 bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across the 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across the 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across the 256 bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));

  // multiply the source and destination strides by two
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  // load 16 bytes 7 times in stride of src_pitch
  srcReg32b1 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr)));
  srcReg32b2 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)));
  srcReg32b3 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)));
  srcReg32b4 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)));
  srcReg32b5 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
  srcReg32b6 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
  srcReg32b7 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));

  // have every two consecutive loads in the same 256 bit register
  srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
                                       _mm256_castsi256_si128(srcReg32b2), 1);
  srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
                                       _mm256_castsi256_si128(srcReg32b3), 1);
  srcReg32b3 = _mm256_inserti128_si256(srcReg32b3,
                                       _mm256_castsi256_si128(srcReg32b4), 1);
  srcReg32b4 = _mm256_inserti128_si256(srcReg32b4,
                                       _mm256_castsi256_si128(srcReg32b5), 1);
  srcReg32b5 = _mm256_inserti128_si256(srcReg32b5,
                                       _mm256_castsi256_si128(srcReg32b6), 1);
  srcReg32b6 = _mm256_inserti128_si256(srcReg32b6,
                                       _mm256_castsi256_si128(srcReg32b7), 1);

  // merge every two consecutive registers except the last one
  srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
  srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);

  // save
  srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
  // save
  srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);
  // save
  srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
  // save
  srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);

  for (i = output_height; i > 1; i -= 2) {
    // load the last 2 loads of 16 bytes and have every two
    // consecutive loads in the same 256 bit register
    srcReg32b8 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
    srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
                                         _mm256_castsi256_si128(srcReg32b8), 1);
    srcReg32b9 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
    srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
                                         _mm256_castsi256_si128(srcReg32b9), 1);

    // merge every two consecutive registers
    // save
    srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
    srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);

    // add and saturate the results together
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);

    // add and saturate the results together
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                                    _mm256_min_epi16(srcReg32b8, srcReg32b12));
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                                    _mm256_max_epi16(srcReg32b8, srcReg32b12));

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);

    // add and saturate the results together
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
                                   _mm256_min_epi16(srcReg32b8, srcReg32b12));
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
                                   _mm256_max_epi16(srcReg32b8, srcReg32b12));

    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);

    // shift by 7 bit each 16 bit
    srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7);
    srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7);

    // shrink to 8 bit each 16 bits; the first lane contains the first
    // convolve result and the second lane contains the second convolve
    // result
    srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);

    src_ptr += src_stride;

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr,
                    _mm256_castsi256_si128(srcReg32b1));

    // save the next 16 bytes
    _mm_store_si128((__m128i *)(output_ptr + out_pitch),
                    _mm256_extractf128_si256(srcReg32b1, 1));

    output_ptr += dst_stride;

    // save part of the registers for next strides
    srcReg32b10 = srcReg32b11;
    srcReg32b1 = srcReg32b3;
    srcReg32b11 = srcReg32b2;
    srcReg32b3 = srcReg32b5;
    srcReg32b2 = srcReg32b4;
    srcReg32b5 = srcReg32b7;
    srcReg32b7 = srcReg32b9;
  }

  if (i > 0) {
    __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
    __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;

    // load the last 16 bytes
    srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));

    // merge the last 2 results together
    srcRegFilt4 = _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7),
                                    srcRegFilt8);
    srcRegFilt7 = _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7),
                                    srcRegFilt8);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
                                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4,
                                    _mm256_castsi256_si128(forthFilters));
    srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
                                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7,
                                    _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
                                    _mm256_castsi256_si128(secondFilters));
    srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
                                    _mm256_castsi256_si128(secondFilters));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
                                    _mm256_castsi256_si128(thirdFilters));
    srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
                                    _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                                 _mm_min_epi16(srcRegFilt4, srcRegFilt6));
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
                                 _mm_min_epi16(srcRegFilt5, srcRegFilt7));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                                 _mm_max_epi16(srcRegFilt4, srcRegFilt6));
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
                                 _mm_max_epi16(srcRegFilt5, srcRegFilt7));

    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                                 _mm256_castsi256_si128(addFilterReg64));
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
                                 _mm256_castsi256_si128(addFilterReg64));

    // shift by 7 bit each 16 bit
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
    srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7);

    // shrink to 8 bit each 16 bits; the low 8 bytes hold the first 8 pixels
    // of the row and the high 8 bytes the last 8
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1);
  }
}
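/* A minimal scalar sketch of what the AVX2 kernel above computes per output
 * pixel: an 8-tap vertical convolution with a +64 rounding constant, an
 * arithmetic shift by 7, and saturation to [0, 255]. The helper name
 * vertical_8tap_ref and its loop structure are illustrative assumptions, not
 * part of libvpx; it also ignores the 16-bit saturating-add ordering tricks
 * (min/max pairing) that the SIMD version needs to stay overflow-safe. */
#include <stddef.h>
#include <stdint.h>

static void vertical_8tap_ref(const uint8_t *src, ptrdiff_t src_pitch,
                              uint8_t *dst, ptrdiff_t dst_pitch,
                              int width, int height, const int16_t *filter) {
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      int sum = 64;  /* rounding constant, matches addFilterReg64 */
      for (int k = 0; k < 8; ++k)
        sum += filter[k] * src[(y + k) * src_pitch + x];
      sum >>= 7;  /* matches _mm256_srai_epi16(..., 7) */
      if (sum < 0) sum = 0;
      if (sum > 255) sum = 255;  /* matches _mm256_packus_epi16 saturation */
      dst[y * dst_pitch + x] = (uint8_t)sum;
    }
  }
}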
static void thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
{
    int i, j;
    Size roi = _src.size();
    roi.width *= _src.channels();
    const short* src = (const short*)_src.data;
    short* dst = (short*)_dst.data;
    size_t src_step = _src.step/sizeof(src[0]);
    size_t dst_step = _dst.step/sizeof(dst[0]);

#if CV_SSE2
    volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
#endif

    if( _src.isContinuous() && _dst.isContinuous() )
    {
        roi.width *= roi.height;
        roi.height = 1;
        src_step = dst_step = roi.width;
    }

#ifdef HAVE_TEGRA_OPTIMIZATION
    if (tegra::thresh_16s(_src, _dst, roi.width, roi.height, thresh, maxval, type))
        return;
#endif

#if defined(HAVE_IPP)
    IppiSize sz = { roi.width, roi.height };
    switch( type )
    {
    case THRESH_TRUNC:
        if (0 <= ippiThreshold_GT_16s_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh))
            return;
        setIppErrorStatus();
        break;
    case THRESH_TOZERO:
        if (0 <= ippiThreshold_LTVal_16s_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh+1, 0))
            return;
        setIppErrorStatus();
        break;
    case THRESH_TOZERO_INV:
        if (0 <= ippiThreshold_GTVal_16s_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh, 0))
            return;
        setIppErrorStatus();
        break;
    }
#endif

    switch( type )
    {
    case THRESH_BINARY:
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
        #if CV_SSE2
            if( useSIMD )
            {
                __m128i thresh8 = _mm_set1_epi16(thresh), maxval8 = _mm_set1_epi16(maxval);
                for( ; j <= roi.width - 16; j += 16 )
                {
                    __m128i v0, v1;
                    v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
                    v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) );
                    v0 = _mm_cmpgt_epi16( v0, thresh8 );
                    v1 = _mm_cmpgt_epi16( v1, thresh8 );
                    v0 = _mm_and_si128( v0, maxval8 );
                    v1 = _mm_and_si128( v1, maxval8 );
                    _mm_storeu_si128((__m128i*)(dst + j), v0 );
                    _mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
                }
            }
        #endif
            for( ; j < roi.width; j++ )
                dst[j] = src[j] > thresh ? maxval : 0;
        }
        break;

    case THRESH_BINARY_INV:
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
        #if CV_SSE2
            if( useSIMD )
            {
                __m128i thresh8 = _mm_set1_epi16(thresh), maxval8 = _mm_set1_epi16(maxval);
                for( ; j <= roi.width - 16; j += 16 )
                {
                    __m128i v0, v1;
                    v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
                    v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) );
                    v0 = _mm_cmpgt_epi16( v0, thresh8 );
                    v1 = _mm_cmpgt_epi16( v1, thresh8 );
                    v0 = _mm_andnot_si128( v0, maxval8 );
                    v1 = _mm_andnot_si128( v1, maxval8 );
                    _mm_storeu_si128((__m128i*)(dst + j), v0 );
                    _mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
                }
            }
        #endif
            for( ; j < roi.width; j++ )
                dst[j] = src[j] <= thresh ? maxval : 0;
        }
        break;

    case THRESH_TRUNC:
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
        #if CV_SSE2
            if( useSIMD )
            {
                __m128i thresh8 = _mm_set1_epi16(thresh);
                for( ; j <= roi.width - 16; j += 16 )
                {
                    __m128i v0, v1;
                    v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
                    v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) );
                    v0 = _mm_min_epi16( v0, thresh8 );
                    v1 = _mm_min_epi16( v1, thresh8 );
                    _mm_storeu_si128((__m128i*)(dst + j), v0 );
                    _mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
                }
            }
        #endif
            for( ; j < roi.width; j++ )
                dst[j] = std::min(src[j], thresh);
        }
        break;

    case THRESH_TOZERO:
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
        #if CV_SSE2
            if( useSIMD )
            {
                __m128i thresh8 = _mm_set1_epi16(thresh);
                for( ; j <= roi.width - 16; j += 16 )
                {
                    __m128i v0, v1;
                    v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
                    v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) );
                    v0 = _mm_and_si128(v0, _mm_cmpgt_epi16(v0, thresh8));
                    v1 = _mm_and_si128(v1, _mm_cmpgt_epi16(v1, thresh8));
                    _mm_storeu_si128((__m128i*)(dst + j), v0 );
                    _mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
                }
            }
        #endif
            for( ; j < roi.width; j++ )
            {
                short v = src[j];
                dst[j] = v > thresh ? v : 0;
            }
        }
        break;

    case THRESH_TOZERO_INV:
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
        #if CV_SSE2
            if( useSIMD )
            {
                __m128i thresh8 = _mm_set1_epi16(thresh);
                for( ; j <= roi.width - 16; j += 16 )
                {
                    __m128i v0, v1;
                    v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
                    v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) );
                    v0 = _mm_andnot_si128(_mm_cmpgt_epi16(v0, thresh8), v0);
                    v1 = _mm_andnot_si128(_mm_cmpgt_epi16(v1, thresh8), v1);
                    _mm_storeu_si128((__m128i*)(dst + j), v0 );
                    _mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
                }
            }
        #endif
            for( ; j < roi.width; j++ )
            {
                short v = src[j];
                dst[j] = v <= thresh ? v : 0;
            }
        }
        break;

    default:
        return CV_Error( CV_StsBadArg, "" );
    }
}
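/* A minimal standalone sketch (not OpenCV API) of the branchless selection
 * idiom the SSE2 paths above rely on: _mm_cmpgt_epi16 yields 0xFFFF in each
 * lane where the compare holds and 0x0000 elsewhere, so ANDing the mask with
 * maxval implements "v > thresh ? maxval : 0" with no branches. The helper
 * name binary_thresh8_s16 is an illustrative assumption. */
#include <emmintrin.h>

static inline __m128i binary_thresh8_s16(__m128i v, __m128i thresh8,
                                         __m128i maxval8)
{
    __m128i mask = _mm_cmpgt_epi16(v, thresh8); /* all-ones where v > thresh */
    return _mm_and_si128(mask, maxval8);        /* maxval where true, else 0 */
}
/* The inverted variant swaps _mm_and_si128 for _mm_andnot_si128(mask, maxval8),
 * which computes "v <= thresh ? maxval : 0". */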
pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep, BYTE *pDst, int dstStep, const prim_size_t *roi)
{
    int lastRow, lastCol;
    BYTE *UData, *VData, *YData;
    int i, nWidth, nHeight, VaddDst, VaddY, VaddU, VaddV;
    __m128i r0, r1, r2, r3, r4, r5, r6, r7;
    __m128i *buffer;

    /* lastRow: set to binary 10 if the last (U,V doubled) line should be skipped;
     * lastCol: set to binary 10 if it's the last column in a line (handles line
     * widths that are not a multiple of four) */
    buffer = (__m128i *)_aligned_malloc(4 * 16, 16);
    YData = (BYTE *)pSrc[0];
    UData = (BYTE *)pSrc[1];
    VData = (BYTE *)pSrc[2];
    nWidth = roi->width;
    nHeight = roi->height;

    if ((lastCol = (nWidth & 3)))
    {
        switch (lastCol)
        {
            case 1:
                r7 = _mm_set_epi32(0, 0, 0, 0xFFFFFFFF);
                break;
            case 2:
                r7 = _mm_set_epi32(0, 0, 0xFFFFFFFF, 0xFFFFFFFF);
                break;
            case 3:
                r7 = _mm_set_epi32(0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
                break;
        }
        _mm_store_si128(buffer + 3, r7);
        lastCol = 1;
    }

    nWidth += 3;
    nWidth = nWidth >> 2;
    lastRow = nHeight & 1;
    nHeight++;
    nHeight = nHeight >> 1;
    VaddDst = (dstStep << 1) - (nWidth << 4);
    VaddY = (srcStep[0] << 1) - (nWidth << 2);
    VaddU = srcStep[1] - (((nWidth << 1) + 2) & 0xFFFC);
    VaddV = srcStep[2] - (((nWidth << 1) + 2) & 0xFFFC);

    while (nHeight-- > 0)
    {
        if (nHeight == 0)
            lastRow <<= 1;

        i = 0;

        do
        {
            if (!(i & 0x01))
            {
                /* The Y, U and V data are stored in different arrays.
                 * We start with processing the U data.
                 *
                 * First we fetch four U values from the array and shuffle them like this:
                 *   0d0d 0c0c 0b0b 0a0a
                 * This does two things: it converts the values to signed words and
                 * duplicates each value, because two pixels always "share" the same
                 * U (and V) data. */
                r0 = _mm_cvtsi32_si128(*(UINT32 *)UData);
                r5 = _mm_set_epi32(0x80038003, 0x80028002, 0x80018001, 0x80008000);
                r0 = _mm_shuffle_epi8(r0, r5);
                UData += 4;
                /* Then we subtract 128 from each value, which gives us D. */
                r3 = _mm_set_epi16(128, 128, 128, 128, 128, 128, 128, 128);
                r0 = _mm_subs_epi16(r0, r3);
                /* We need to do two things with our D, so store it for later use. */
                r2 = r0;
                /* Now we can multiply D by 48 and unpack it to xmm4:xmm0;
                 * this is what we need to compute the G data later on. */
                r4 = r0;
                r7 = _mm_set_epi16(48, 48, 48, 48, 48, 48, 48, 48);
                r0 = _mm_mullo_epi16(r0, r7);
                r4 = _mm_mulhi_epi16(r4, r7);
                r7 = r0;
                r0 = _mm_unpacklo_epi16(r0, r4);
                r4 = _mm_unpackhi_epi16(r7, r4);
                /* To get the B data we need to prepare a second value, D*475. */
                r1 = r2;
                r7 = _mm_set_epi16(475, 475, 475, 475, 475, 475, 475, 475);
                r1 = _mm_mullo_epi16(r1, r7);
                r2 = _mm_mulhi_epi16(r2, r7);
                r7 = r1;
                r1 = _mm_unpacklo_epi16(r1, r2);
                r7 = _mm_unpackhi_epi16(r7, r2);
                /* Now xmm7:xmm1 holds values for 16 pixels:
                 *   aabbccdd
                 *   aabbccdd
                 * but we can only work on four pixels at once, so we save the upper
                 * values for later. */
                _mm_store_si128(buffer + 1, r7);
                /* The U data is now prepared. Preparing the V data works the same
                 * way, just with other coefficients. */
                r2 = _mm_cvtsi32_si128(*(UINT32 *)VData);
                r2 = _mm_shuffle_epi8(r2, r5);
                VData += 4;
                r2 = _mm_subs_epi16(r2, r3);
                r5 = r2;
                /* This is also known as E*403; we need it to compute the R data. */
                r3 = r2;
                r7 = _mm_set_epi16(403, 403, 403, 403, 403, 403, 403, 403);
                r2 = _mm_mullo_epi16(r2, r7);
                r3 = _mm_mulhi_epi16(r3, r7);
                r7 = r2;
                r2 = _mm_unpacklo_epi16(r2, r3);
                r7 = _mm_unpackhi_epi16(r7, r3);
                /* ... and preserve the upper four values for future use. */
                _mm_store_si128(buffer + 2, r7);
                /* This step computes E*120. */
                r3 = r5;
                r7 = _mm_set_epi16(120, 120, 120, 120, 120, 120, 120, 120);
                r3 = _mm_mullo_epi16(r3, r7);
                r5 = _mm_mulhi_epi16(r5, r7);
                r7 = r3;
                r3 = _mm_unpacklo_epi16(r3, r5);
                r7 = _mm_unpackhi_epi16(r7, r5);
                /* Now we complete what we began above:
                 *   (48*D) + (120*E) */
                r0 = _mm_add_epi32(r0, r3);
                r4 = _mm_add_epi32(r4, r7);
                /* ... and store the result to memory. */
                _mm_store_si128(buffer, r4);
            }
            else
            {
                /* Maybe you wondered about the conditional above?
                 * We prepared UV data for eight pixels per line, but can only process
                 * four per iteration, so every second iteration we load the data for
                 * the upper four pixels back from memory. */
                r1 = _mm_load_si128(buffer + 1);
                r2 = _mm_load_si128(buffer + 2);
                r0 = _mm_load_si128(buffer);
            }

            if (++i == nWidth)
                lastCol <<= 1;

            /* We haven't produced any output yet, so let's do so!
             * Fetch four pixels from the Y-data array and shuffle them like this:
             *   00d0 00c0 00b0 00a0
             * to get signed dwords multiplied by 256. */
            r4 = _mm_cvtsi32_si128(*(UINT32 *)YData);
            r7 = _mm_set_epi32(0x80800380, 0x80800280, 0x80800180, 0x80800080);
            r4 = _mm_shuffle_epi8(r4, r7);
            r5 = r4;
            r6 = r4;
            /* Now we can perform the "real" conversion itself and produce output! */
            r4 = _mm_add_epi32(r4, r2);
            r5 = _mm_sub_epi32(r5, r0);
            r6 = _mm_add_epi32(r6, r1);
            /* In the end we only need bytes for the RGB values.
             * Before this we had dwords of data; by shifting left and treating the
             * result as packed words we not only get signed words, but also divide
             * by 256. The data is now ordered like ddx0 ccx0 bbx0 aax0, where x is
             * the least significant byte that we no longer need. */
            r4 = _mm_slli_epi32(r4, 8);
            r5 = _mm_slli_epi32(r5, 8);
            r6 = _mm_slli_epi32(r6, 8);
            /* One thing we still have to handle is the clip() function.
             * We still have signed words, and SSE2 has min/max instructions that
             * respect the sign: fed with our values and zeros, max keeps the zeros
             * where our values are below zero and keeps our values otherwise. */
            r7 = _mm_set_epi32(0, 0, 0, 0);
            r4 = _mm_max_epi16(r4, r7);
            r5 = _mm_max_epi16(r5, r7);
            r6 = _mm_max_epi16(r6, r7);
            /* The same idea, the other way around, limits our values to 255,
             * now using the min instruction and 255s. */
            r7 = _mm_set_epi32(0x00FF0000, 0x00FF0000, 0x00FF0000, 0x00FF0000);
            r4 = _mm_min_epi16(r4, r7);
            r5 = _mm_min_epi16(r5, r7);
            r6 = _mm_min_epi16(r6, r7);
            /* Now we have our bytes.
             * The moment has come to assemble the three channels R, G and B into
             * xrgb dwords. For the red channel we just AND each resulting dword
             * with 00FF0000H. */
            //r7 = _mm_set_epi32(0x00FF0000, 0x00FF0000, 0x00FF0000, 0x00FF0000);
            r4 = _mm_and_si128(r4, r7);
            /* The green channel needs a shuffle to produce this layout:
             *   00d0 00c0 00b0 00a0 */
            r7 = _mm_set_epi32(0x80800E80, 0x80800A80, 0x80800680, 0x80800280);
            r5 = _mm_shuffle_epi8(r5, r7);
            /* ... and the blue channel this one:
             *   000d 000c 000b 000a */
            r7 = _mm_set_epi32(0x8080800E, 0x8080800A, 0x80808006, 0x80808002);
            r6 = _mm_shuffle_epi8(r6, r7);
            /* Finally we OR them together and get:
             *   xrgb xrgb xrgb xrgb */
            r4 = _mm_or_si128(r4, r5);
            r4 = _mm_or_si128(r4, r6);
            /* All that is left to do is writing the data to memory, but this gets a
             * bit more complicated if the width is not a multiple of four and we are
             * at the last column of the line. */
            if (lastCol & 0x02)
            {
                /* Say we only need to convert six pixels in width.
                 * The first four pixels are converted just like any other four, but
                 * if this is the last iteration in the line, lastCol is shifted left
                 * by one (see above), and we land here. During initialisation a mask
                 * was prepared; in this case it looks like
                 *   0000FFFFH 0000FFFFH 0000FFFFH 0000FFFFH */
                r6 = _mm_load_si128(buffer + 3);
                /* We AND our output data with this mask to keep only the valid pixels. */
                r4 = _mm_and_si128(r4, r6);
                /* Then we fetch memory from the destination array ... */
                r5 = _mm_lddqu_si128((__m128i *)pDst);
                /* ... and AND it with the inverse mask, keeping only the pixels that
                 * should not be updated. */
                r6 = _mm_andnot_si128(r6, r5);
                /* ORing the two values together and writing the result back means
                 * only the pixels that should be updated really change. */
                r4 = _mm_or_si128(r4, r6);
            }

            _mm_storeu_si128((__m128i *)pDst, r4);

            if (!(lastRow & 0x02))
            {
                /* Because the UV data is the same for two lines, we can process the
                 * second line right here in the same iteration; we only need to add
                 * offsets to the Y and destination pointers, namely srcStep[0] and
                 * one target scanline. If we don't need to process the second line,
                 * e.g. in the last line of an image with an odd number of lines, we
                 * skip all of this. */
                r4 = _mm_cvtsi32_si128(*(UINT32 *)(YData + srcStep[0]));
                r7 = _mm_set_epi32(0x80800380, 0x80800280, 0x80800180, 0x80800080);
                r4 = _mm_shuffle_epi8(r4, r7);
                r5 = r4;
                r6 = r4;
                r4 = _mm_add_epi32(r4, r2);
                r5 = _mm_sub_epi32(r5, r0);
                r6 = _mm_add_epi32(r6, r1);
                r4 = _mm_slli_epi32(r4, 8);
                r5 = _mm_slli_epi32(r5, 8);
                r6 = _mm_slli_epi32(r6, 8);
                r7 = _mm_set_epi32(0, 0, 0, 0);
                r4 = _mm_max_epi16(r4, r7);
                r5 = _mm_max_epi16(r5, r7);
                r6 = _mm_max_epi16(r6, r7);
                r7 = _mm_set_epi32(0x00FF0000, 0x00FF0000, 0x00FF0000, 0x00FF0000);
                r4 = _mm_min_epi16(r4, r7);
                r5 = _mm_min_epi16(r5, r7);
                r6 = _mm_min_epi16(r6, r7);
                r7 = _mm_set_epi32(0x00FF0000, 0x00FF0000, 0x00FF0000, 0x00FF0000);
                r4 = _mm_and_si128(r4, r7);
                r7 = _mm_set_epi32(0x80800E80, 0x80800A80, 0x80800680, 0x80800280);
                r5 = _mm_shuffle_epi8(r5, r7);
                r7 = _mm_set_epi32(0x8080800E, 0x8080800A, 0x80808006, 0x80808002);
                r6 = _mm_shuffle_epi8(r6, r7);
                r4 = _mm_or_si128(r4, r5);
                r4 = _mm_or_si128(r4, r6);

                if (lastCol & 0x02)
                {
                    r6 = _mm_load_si128(buffer + 3);
                    r4 = _mm_and_si128(r4, r6);
                    r5 = _mm_lddqu_si128((__m128i *)(pDst + dstStep));
                    r6 = _mm_andnot_si128(r6, r5);
                    r4 = _mm_or_si128(r4, r6);
                    /* We shift lastCol back here, because the last column has been
                     * processed and this "special condition" can be released. */
                    lastCol >>= 1;
                }

                _mm_storeu_si128((__m128i *)(pDst + dstStep), r4);
            }

            /* After all of this we advance the destination and Y-data pointers by
             * four pixels. */
            pDst += 16;
            YData += 4;
        }
        while (i < nWidth);

        /* After each pair of lines, advance the pointers by the precomputed
         * per-line offsets. */
        pDst += VaddDst;
        YData += VaddY;
        UData += VaddU;
        VData += VaddV;
    }

    _aligned_free(buffer);
    return PRIMITIVES_SUCCESS;
}
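/* An illustrative scalar equivalent (hypothetical helper, not FreeRDP API) of
 * the fixed-point conversion above. The coefficients 403, 475, 48 and 120 are
 * the Q8 (value/256) forms of the luma/chroma factors the SIMD code applies:
 * R = Y + 403*E/256, G = Y - (48*D + 120*E)/256, B = Y + 475*D/256, with
 * D = U - 128 and E = V - 128. Uses <stdint.h> types in place of the WinPR
 * BYTE/UINT32 typedefs; the shift assumes arithmetic right shift on negative
 * values, matching the SIMD word extraction. */
#include <stdint.h>

static inline uint8_t clamp_u8(int v)
{
    return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static uint32_t yuv_pixel_to_xrgb(uint8_t Y, uint8_t U, uint8_t V)
{
    const int D = U - 128; /* chroma offsets, as in the SIMD code */
    const int E = V - 128;
    const int r = clamp_u8(Y + ((403 * E) >> 8));
    const int g = clamp_u8(Y - ((48 * D + 120 * E) >> 8));
    const int b = clamp_u8(Y + ((475 * D) >> 8));
    /* assemble an xrgb dword, matching the AND/shuffle/OR sequence above */
    return ((uint32_t)r << 16) | ((uint32_t)g << 8) | (uint32_t)b;
}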
static void thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
{
    int i, j;
    Size roi = _src.size();
    roi.width *= _src.channels();
    const short* src = _src.ptr<short>();
    short* dst = _dst.ptr<short>();
    size_t src_step = _src.step/sizeof(src[0]);
    size_t dst_step = _dst.step/sizeof(dst[0]);

#if CV_SSE2
    volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
#endif

    if( _src.isContinuous() && _dst.isContinuous() )
    {
        roi.width *= roi.height;
        roi.height = 1;
        src_step = dst_step = roi.width;
    }

#ifdef HAVE_TEGRA_OPTIMIZATION
    if (tegra::thresh_16s(_src, _dst, roi.width, roi.height, thresh, maxval, type))
        return;
#endif

#if defined(HAVE_IPP)
    CV_IPP_CHECK()
    {
        IppiSize sz = { roi.width, roi.height };
        CV_SUPPRESS_DEPRECATED_START
        switch( type )
        {
        case THRESH_TRUNC:
#ifndef HAVE_IPP_ICV_ONLY
            if (_src.data == _dst.data && ippiThreshold_GT_16s_C1IR(dst, (int)dst_step*sizeof(dst[0]), sz, thresh) >= 0)
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
#endif
            if (ippiThreshold_GT_16s_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh) >= 0)
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
            setIppErrorStatus();
            break;
        case THRESH_TOZERO:
#ifndef HAVE_IPP_ICV_ONLY
            if (_src.data == _dst.data && ippiThreshold_LTVal_16s_C1IR(dst, (int)dst_step*sizeof(dst[0]), sz, thresh + 1, 0) >= 0)
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
#endif
            if (ippiThreshold_LTVal_16s_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh+1, 0) >= 0)
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
            setIppErrorStatus();
            break;
        case THRESH_TOZERO_INV:
#ifndef HAVE_IPP_ICV_ONLY
            if (_src.data == _dst.data && ippiThreshold_GTVal_16s_C1IR(dst, (int)dst_step*sizeof(dst[0]), sz, thresh, 0) >= 0)
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
#endif
            if (ippiThreshold_GTVal_16s_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh, 0) >= 0)
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
            setIppErrorStatus();
            break;
        }
        CV_SUPPRESS_DEPRECATED_END
    }
#endif

    switch( type )
    {
    case THRESH_BINARY:
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
        #if CV_SSE2
            if( useSIMD )
            {
                __m128i thresh8 = _mm_set1_epi16(thresh), maxval8 = _mm_set1_epi16(maxval);
                for( ; j <= roi.width - 16; j += 16 )
                {
                    __m128i v0, v1;
                    v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
                    v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) );
                    v0 = _mm_cmpgt_epi16( v0, thresh8 );
                    v1 = _mm_cmpgt_epi16( v1, thresh8 );
                    v0 = _mm_and_si128( v0, maxval8 );
                    v1 = _mm_and_si128( v1, maxval8 );
                    _mm_storeu_si128((__m128i*)(dst + j), v0 );
                    _mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
                }
            }
        #elif CV_NEON
            int16x8_t v_thresh = vdupq_n_s16(thresh), v_maxval = vdupq_n_s16(maxval);

            for( ; j <= roi.width - 8; j += 8 )
            {
                uint16x8_t v_mask = vcgtq_s16(vld1q_s16(src + j), v_thresh);
                vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_maxval));
            }
        #endif
            for( ; j < roi.width; j++ )
                dst[j] = src[j] > thresh ? maxval : 0;
        }
        break;

    case THRESH_BINARY_INV:
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
        #if CV_SSE2
            if( useSIMD )
            {
                __m128i thresh8 = _mm_set1_epi16(thresh), maxval8 = _mm_set1_epi16(maxval);
                for( ; j <= roi.width - 16; j += 16 )
                {
                    __m128i v0, v1;
                    v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
                    v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) );
                    v0 = _mm_cmpgt_epi16( v0, thresh8 );
                    v1 = _mm_cmpgt_epi16( v1, thresh8 );
                    v0 = _mm_andnot_si128( v0, maxval8 );
                    v1 = _mm_andnot_si128( v1, maxval8 );
                    _mm_storeu_si128((__m128i*)(dst + j), v0 );
                    _mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
                }
            }
        #elif CV_NEON
            int16x8_t v_thresh = vdupq_n_s16(thresh), v_maxval = vdupq_n_s16(maxval);

            for( ; j <= roi.width - 8; j += 8 )
            {
                uint16x8_t v_mask = vcleq_s16(vld1q_s16(src + j), v_thresh);
                vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_maxval));
            }
        #endif
            for( ; j < roi.width; j++ )
                dst[j] = src[j] <= thresh ? maxval : 0;
        }
        break;

    case THRESH_TRUNC:
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
        #if CV_SSE2
            if( useSIMD )
            {
                __m128i thresh8 = _mm_set1_epi16(thresh);
                for( ; j <= roi.width - 16; j += 16 )
                {
                    __m128i v0, v1;
                    v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
                    v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) );
                    v0 = _mm_min_epi16( v0, thresh8 );
                    v1 = _mm_min_epi16( v1, thresh8 );
                    _mm_storeu_si128((__m128i*)(dst + j), v0 );
                    _mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
                }
            }
        #elif CV_NEON
            int16x8_t v_thresh = vdupq_n_s16(thresh);

            for( ; j <= roi.width - 8; j += 8 )
                vst1q_s16(dst + j, vminq_s16(vld1q_s16(src + j), v_thresh));
        #endif
            for( ; j < roi.width; j++ )
                dst[j] = std::min(src[j], thresh);
        }
        break;

    case THRESH_TOZERO:
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
        #if CV_SSE2
            if( useSIMD )
            {
                __m128i thresh8 = _mm_set1_epi16(thresh);
                for( ; j <= roi.width - 16; j += 16 )
                {
                    __m128i v0, v1;
                    v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
                    v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) );
                    v0 = _mm_and_si128(v0, _mm_cmpgt_epi16(v0, thresh8));
                    v1 = _mm_and_si128(v1, _mm_cmpgt_epi16(v1, thresh8));
                    _mm_storeu_si128((__m128i*)(dst + j), v0 );
                    _mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
                }
            }
        #elif CV_NEON
            int16x8_t v_thresh = vdupq_n_s16(thresh);

            for( ; j <= roi.width - 8; j += 8 )
            {
                int16x8_t v_src = vld1q_s16(src + j);
                uint16x8_t v_mask = vcgtq_s16(v_src, v_thresh);
                vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_src));
            }
        #endif
            for( ; j < roi.width; j++ )
            {
                short v = src[j];
                dst[j] = v > thresh ? v : 0;
            }
        }
        break;

    case THRESH_TOZERO_INV:
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
        #if CV_SSE2
            if( useSIMD )
            {
                __m128i thresh8 = _mm_set1_epi16(thresh);
                for( ; j <= roi.width - 16; j += 16 )
                {
                    __m128i v0, v1;
                    v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
                    v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) );
                    v0 = _mm_andnot_si128(_mm_cmpgt_epi16(v0, thresh8), v0);
                    v1 = _mm_andnot_si128(_mm_cmpgt_epi16(v1, thresh8), v1);
                    _mm_storeu_si128((__m128i*)(dst + j), v0 );
                    _mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
                }
            }
        #elif CV_NEON
            int16x8_t v_thresh = vdupq_n_s16(thresh);

            for( ; j <= roi.width - 8; j += 8 )
            {
                int16x8_t v_src = vld1q_s16(src + j);
                uint16x8_t v_mask = vcleq_s16(v_src, v_thresh);
                vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_src));
            }
        #endif
            for( ; j < roi.width; j++ )
            {
                short v = src[j];
                dst[j] = v <= thresh ? v : 0;
            }
        }
        break;

    default:
        return CV_Error( CV_StsBadArg, "" );
    }
}
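/* A scalar reference (illustrative helper, not OpenCV API) for the five
 * threshold modes that the SSE2 and NEON paths above vectorize. It assumes
 * OpenCV's THRESH_* constants are in scope; the fallback of returning v
 * unchanged for unknown types is an assumption of this sketch, whereas the
 * real function raises CV_StsBadArg. */
static inline short thresh_op_s16(short v, short thresh, short maxval, int type)
{
    switch (type)
    {
    case THRESH_BINARY:     return v >  thresh ? maxval : 0;
    case THRESH_BINARY_INV: return v <= thresh ? maxval : 0;
    case THRESH_TRUNC:      return v <  thresh ? v : thresh; /* min(v, thresh) */
    case THRESH_TOZERO:     return v >  thresh ? v : 0;
    case THRESH_TOZERO_INV: return v <= thresh ? v : 0;
    default:                return v;
    }
}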
static INLINE __m128i highbd_clamp_epi16(__m128i u, __m128i zero, __m128i max) { return _mm_max_epi16(_mm_min_epi16(u, max), zero); }
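/* Usage sketch for highbd_clamp_epi16 (assumed context, not library code):
 * clamp eight 16-bit high-bitdepth pixels to [0, (1 << bd) - 1], e.g. with
 * bd = 10 or 12, before storing. Relies on the same INLINE macro the helper
 * above uses, which the surrounding library is assumed to define. */
static INLINE __m128i highbd_clamp_pixels(__m128i u, int bd) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i max = _mm_set1_epi16((short)((1 << bd) - 1));
  return highbd_clamp_epi16(u, zero, max);
}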