SIMD_INLINE __m256i BinomialSum16(const __m256i & ab, const __m256i & cd) { #ifdef SIMD_MADDUBS_ERROR return _mm256_add_epi16(_mm256_maddubs_epi16(_mm256_or_si256(K_ZERO, ab), K8_01_03), _mm256_maddubs_epi16(_mm256_or_si256(K_ZERO, cd), K8_03_01)); #else return _mm256_add_epi16(_mm256_maddubs_epi16(ab, K8_01_03), _mm256_maddubs_epi16(cd, K8_03_01)); #endif }
static INLINE unsigned int masked_sad32xh_avx2( const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, int width, int height) { int x, y; __m256i res = _mm256_setzero_si256(); const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); const __m256i round_scale = _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); for (y = 0; y < height; y++) { for (x = 0; x < width; x += 32) { const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]); const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]); const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]); const __m256i m = _mm256_lddqu_si256((const __m256i *)&m_ptr[x]); const __m256i m_inv = _mm256_sub_epi8(mask_max, m); // Calculate 16 predicted pixels. // Note that the maximum value of any entry of 'pred_l' or 'pred_r' // is 64 * 255, so we have plenty of space to add rounding constants. const __m256i data_l = _mm256_unpacklo_epi8(a, b); const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv); __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l); pred_l = _mm256_mulhrs_epi16(pred_l, round_scale); const __m256i data_r = _mm256_unpackhi_epi8(a, b); const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv); __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r); pred_r = _mm256_mulhrs_epi16(pred_r, round_scale); const __m256i pred = _mm256_packus_epi16(pred_l, pred_r); res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src)); } src_ptr += src_stride; a_ptr += a_stride; b_ptr += b_stride; m_ptr += m_stride; } // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'. res = _mm256_shuffle_epi32(res, 0xd8); res = _mm256_permute4x64_epi64(res, 0xd8); res = _mm256_hadd_epi32(res, res); res = _mm256_hadd_epi32(res, res); int32_t sad = _mm256_extract_epi32(res, 0); return (sad + 31) >> 6; }
template <> SIMD_INLINE void InterpolateX<1>(const __m256i * alpha, __m256i * buffer) { #if defined(_MSC_VER) // Workaround for Visual Studio 2012 compiler bug in release mode: __m256i _buffer = _mm256_or_si256(K_ZERO, _mm256_load_si256(buffer)); #else __m256i _buffer = _mm256_load_si256(buffer); #endif _mm256_store_si256(buffer, _mm256_maddubs_epi16(_buffer, _mm256_load_si256(alpha))); }
static INLINE void variance_kernel_avx2(const __m256i src, const __m256i ref, __m256i *const sse, __m256i *const sum) { const __m256i adj_sub = _mm256_set1_epi16(0xff01); // (1,-1) // unpack into pairs of source and reference values const __m256i src_ref0 = _mm256_unpacklo_epi8(src, ref); const __m256i src_ref1 = _mm256_unpackhi_epi8(src, ref); // subtract adjacent elements using src*1 + ref*-1 const __m256i diff0 = _mm256_maddubs_epi16(src_ref0, adj_sub); const __m256i diff1 = _mm256_maddubs_epi16(src_ref1, adj_sub); const __m256i madd0 = _mm256_madd_epi16(diff0, diff0); const __m256i madd1 = _mm256_madd_epi16(diff1, diff1); // add to the running totals *sum = _mm256_add_epi16(*sum, _mm256_add_epi16(diff0, diff1)); *sse = _mm256_add_epi32(*sse, _mm256_add_epi32(madd0, madd1)); }
static INLINE void comp_mask_pred_line_avx2(const __m256i s0, const __m256i s1, const __m256i a, uint8_t *comp_pred) { const __m256i alpha_max = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); const int16_t round_bits = 15 - AOM_BLEND_A64_ROUND_BITS; const __m256i round_offset = _mm256_set1_epi16(1 << (round_bits)); const __m256i ma = _mm256_sub_epi8(alpha_max, a); const __m256i ssAL = _mm256_unpacklo_epi8(s0, s1); const __m256i aaAL = _mm256_unpacklo_epi8(a, ma); const __m256i ssAH = _mm256_unpackhi_epi8(s0, s1); const __m256i aaAH = _mm256_unpackhi_epi8(a, ma); const __m256i blendAL = _mm256_maddubs_epi16(ssAL, aaAL); const __m256i blendAH = _mm256_maddubs_epi16(ssAH, aaAH); const __m256i roundAL = _mm256_mulhrs_epi16(blendAL, round_offset); const __m256i roundAH = _mm256_mulhrs_epi16(blendAH, round_offset); const __m256i roundA = _mm256_packus_epi16(roundAL, roundAH); _mm256_storeu_si256((__m256i *)(comp_pred), roundA); }
template <> SIMD_INLINE void InterpolateX<3>(const __m256i * alpha, __m256i * buffer) { __m256i src[3], shuffled; src[0] = _mm256_load_si256(buffer + 0); src[1] = _mm256_load_si256(buffer + 1); src[2] = _mm256_load_si256(buffer + 2); shuffled = _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[0], src[0], 0x21), K8_SHUFFLE_X3_00); shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(src[0], K8_SHUFFLE_X3_01)); shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[0], src[1], 0x21), K8_SHUFFLE_X3_02)); _mm256_store_si256(buffer + 0, _mm256_maddubs_epi16(shuffled, _mm256_load_si256(alpha + 0))); shuffled = _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[0], src[1], 0x21), K8_SHUFFLE_X3_10); shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(src[1], K8_SHUFFLE_X3_11)); shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[1], src[2], 0x21), K8_SHUFFLE_X3_12)); _mm256_store_si256(buffer + 1, _mm256_maddubs_epi16(shuffled, _mm256_load_si256(alpha + 1))); shuffled = _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[1], src[2], 0x21), K8_SHUFFLE_X3_20); shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(src[2], K8_SHUFFLE_X3_21)); shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[2], src[2], 0x21), K8_SHUFFLE_X3_22)); _mm256_store_si256(buffer + 2, _mm256_maddubs_epi16(shuffled, _mm256_load_si256(alpha + 2))); }
static void avx2_test (void) { union256i_w s1, s2, res; short res_ref[16]; int i; int fail = 0; for (i = 0; i < 256; i += 16) { s1.x = _mm256_loadu_si256 ((__m256i *) & vals[i]); s2.x = _mm256_loadu_si256 ((__m256i *) & vals[i + 8]); res.x = _mm256_maddubs_epi16 (s1.x, s2.x); compute_pmaddubsw256 (s1.a, s2.a, res_ref); fail += check_union256i_w (res, res_ref); } if (fail != 0) abort (); }
__m256i test_mm256_maddubs_epi16(__m256i a, __m256i b) { // CHECK: @llvm.x86.avx2.pmadd.ub.sw return _mm256_maddubs_epi16(a, b); }
__m256i test_mm256_maddubs_epi16(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_maddubs_epi16 // CHECK: call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %{{.*}}, <32 x i8> %{{.*}}) return _mm256_maddubs_epi16(a, b); }
static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; __m256i firstFilters, secondFilters, thirdFilters, forthFilters; __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; __m256i srcReg32b1, srcReg32b2, filtersReg32; unsigned int i; ptrdiff_t src_stride, dst_stride; // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. filtersReg =_mm_packs_epi16(filtersReg, filtersReg); // have the same data in both lanes of a 256 bit register filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); // duplicate only the first 16 bits (first and second byte) // across 256 bit register firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); // duplicate only the second 16 bits (third and forth byte) // across 256 bit register secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); // duplicate only the third 16 bits (fifth and sixth byte) // across 256 bit register thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); // duplicate only the forth 16 bits (seventh and eighth byte) // across 256 bit register forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2); filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2); filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2); filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2); // multiple the size of the source and destination stride by two src_stride = src_pixels_per_line << 1; dst_stride = output_pitch << 1; for (i = output_height; i > 1; i-=2) { // load the 2 strides of source srcReg32b1 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr - 3))); srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, _mm_loadu_si128((const __m128i *) (src_ptr+src_pixels_per_line-3)), 1); // filter the source buffer srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg); srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt4Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); // add and saturate the results together srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); // filter the source buffer srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt2Reg); srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); // add and saturate the results together srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); // reading 2 strides of the next 16 bytes // (part of it was being read by earlier read) srcReg32b2 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + 5))); srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, _mm_loadu_si128((const __m128i *) (src_ptr+src_pixels_per_line+5)), 1); // add and saturate the results together srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); // filter the source buffer srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); // add and saturate the results together srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); // filter the source buffer srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt2Reg); srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); // add and saturate the results together srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64); srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64); // shift by 7 bit each 16 bit srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7); srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); src_ptr+=src_stride; // save 16 bytes _mm_store_si128((__m128i*)output_ptr, _mm256_castsi256_si128(srcRegFilt32b1_1)); // save the next 16 bits _mm_store_si128((__m128i*)(output_ptr+output_pitch), _mm256_extractf128_si256(srcRegFilt32b1_1, 1)); output_ptr+=dst_stride; } // if the number of strides is odd. // process only 16 bytes if (i > 0) { __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; __m128i srcRegFilt2, srcRegFilt3; srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); // filter the source buffer srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); // add and saturate the results together srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); // filter the source buffer srcRegFilt3= _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); srcRegFilt2= _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); // add and saturate the results together srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2)); // reading the next 16 bytes // (part of it was being read by earlier read) srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); // add and saturate the results together srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2)); // filter the source buffer srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg)); srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters)); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); // add and saturate the results together srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); // filter the source buffer srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg)); srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); // add and saturate the results together srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2)); srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2)); srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg64)); srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg64)); // shift by 7 bit each 16 bit srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); // save 16 bytes _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); } }
static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; __m256i addFilterReg64; __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; __m256i srcReg32b11, srcReg32b12, filtersReg32; __m256i firstFilters, secondFilters, thirdFilters, forthFilters; unsigned int i; ptrdiff_t src_stride, dst_stride; // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the // same data in both lanes of 128 bit register. filtersReg =_mm_packs_epi16(filtersReg, filtersReg); // have the same data in both lanes of a 256 bit register filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); // duplicate only the first 16 bits (first and second byte) // across 256 bit register firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); // duplicate only the second 16 bits (third and forth byte) // across 256 bit register secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); // duplicate only the third 16 bits (fifth and sixth byte) // across 256 bit register thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); // duplicate only the forth 16 bits (seventh and eighth byte) // across 256 bit register forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); // multiple the size of the source and destination stride by two src_stride = src_pitch << 1; dst_stride = out_pitch << 1; // load 16 bytes 7 times in stride of src_pitch srcReg32b1 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr))); srcReg32b2 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch))); srcReg32b3 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2))); srcReg32b4 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3))); srcReg32b5 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4))); srcReg32b6 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5))); srcReg32b7 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); // have each consecutive loads on the same 256 register srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, _mm256_castsi256_si128(srcReg32b2), 1); srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, _mm256_castsi256_si128(srcReg32b3), 1); srcReg32b3 = _mm256_inserti128_si256(srcReg32b3, _mm256_castsi256_si128(srcReg32b4), 1); srcReg32b4 = _mm256_inserti128_si256(srcReg32b4, _mm256_castsi256_si128(srcReg32b5), 1); srcReg32b5 = _mm256_inserti128_si256(srcReg32b5, _mm256_castsi256_si128(srcReg32b6), 1); srcReg32b6 = _mm256_inserti128_si256(srcReg32b6, _mm256_castsi256_si128(srcReg32b7), 1); // merge every two consecutive registers except the last one srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2); // save srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); // save srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4); // save srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); // save srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6); for (i = output_height; i > 1; i-=2) { // load the last 2 loads of 16 bytes and have every two // consecutive loads in the same 256 bit register srcReg32b8 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7))); srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, _mm256_castsi256_si128(srcReg32b8), 1); srcReg32b9 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8))); srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, _mm256_castsi256_si128(srcReg32b9), 1); // merge every two consecutive registers // save srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); // multiply 2 adjacent elements with the filter and add the result srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); // add and saturate the results together srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); // multiply 2 adjacent elements with the filter and add the result srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); // add and saturate the results together srcReg32b10 = _mm256_adds_epi16(srcReg32b10, _mm256_min_epi16(srcReg32b8, srcReg32b12)); srcReg32b10 = _mm256_adds_epi16(srcReg32b10, _mm256_max_epi16(srcReg32b8, srcReg32b12)); // multiply 2 adjacent elements with the filter and add the result srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters); srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6); // multiply 2 adjacent elements with the filter and add the result srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters); srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters); // add and saturate the results together srcReg32b1 = _mm256_adds_epi16(srcReg32b1, _mm256_min_epi16(srcReg32b8, srcReg32b12)); srcReg32b1 = _mm256_adds_epi16(srcReg32b1, _mm256_max_epi16(srcReg32b8, srcReg32b12)); srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64); srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64); // shift by 7 bit each 16 bit srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7); srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1); src_ptr+=src_stride; // save 16 bytes _mm_store_si128((__m128i*)output_ptr, _mm256_castsi256_si128(srcReg32b1)); // save the next 16 bits _mm_store_si128((__m128i*)(output_ptr+out_pitch), _mm256_extractf128_si256(srcReg32b1, 1)); output_ptr+=dst_stride; // save part of the registers for next strides srcReg32b10 = srcReg32b11; srcReg32b1 = srcReg32b3; srcReg32b11 = srcReg32b2; srcReg32b3 = srcReg32b5; srcReg32b2 = srcReg32b4; srcReg32b5 = srcReg32b7; srcReg32b7 = srcReg32b9; } if (i > 0) { __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5; __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8; // load the last 16 bytes srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); // merge the last 2 results together srcRegFilt4 = _mm_unpacklo_epi8( _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); srcRegFilt7 = _mm_unpackhi_epi8( _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); // multiply 2 adjacent elements with the filter and add the result srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), _mm256_castsi256_si128(firstFilters)); srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters)); srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1), _mm256_castsi256_si128(firstFilters)); srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters)); // add and saturate the results together srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7); // multiply 2 adjacent elements with the filter and add the result srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), _mm256_castsi256_si128(secondFilters)); srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3), _mm256_castsi256_si128(secondFilters)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), _mm256_castsi256_si128(thirdFilters)); srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5), _mm256_castsi256_si128(thirdFilters)); // add and saturate the results together srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, _mm_min_epi16(srcRegFilt4, srcRegFilt6)); srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, _mm_min_epi16(srcRegFilt5, srcRegFilt7)); // add and saturate the results together srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, _mm_max_epi16(srcRegFilt4, srcRegFilt6)); srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, _mm_max_epi16(srcRegFilt5, srcRegFilt7)); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg64)); srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg64)); // shift by 7 bit each 16 bit srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); // save 16 bytes _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); } }
template<int part> SIMD_INLINE __m256i SumCol(__m256i a[3]) { return _mm256_add_epi16(_mm256_maddubs_epi16(UnpackU8<part>(a[0], a[1]), K8_01), UnpackU8<part>(a[2])); }
SIMD_INLINE __m256i Average16(const __m256i & s0, const __m256i & s1) { return _mm256_srli_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(s0, K8_01), _mm256_maddubs_epi16(s1, K8_01)), K16_0002), 2); }
SIMD_INLINE void InterpolateX4(const __m256i * alpha, __m256i * buffer) { __m256i src = _mm256_shuffle_epi8(_mm256_load_si256(buffer), K8_SHUFFLE_X4); _mm256_store_si256(buffer, _mm256_maddubs_epi16(src, _mm256_load_si256(alpha))); }
uns cache_block_over_inputs(const u8 *w, const u8 *inputs, const u8 *outputs, const uns w_len, const uns outputs_len) { assert(outputs_len > 0); assert(outputs_len % AVX_U8_VEC_LEN == 0); assert(w_len % AVX_U8_VEC_LEN == 0); __m256i part_results[CACHE_BLOCKING_LEN]; const uns cache_blocking_len = MIN(outputs_len, cache_blocking_len); for (uns index = 0; index < w_len; index += cache_blocking_len) { const uns jndex_end = MIN(w_len, index + cache_blocking_len); for (uns cb_index = 0; cb_index < cache_blocking_len; ++cb_index) { for (uns jndex = index; jndex < jndex_end; jndex += AVX_U8_VEC_LEN) { const __m256i *weight = (__m256i*) &w[jndex + cb_index*w_len]; const __m256i *input = (__m256i*) &input[jndex]; const __m256i sum = _mm256_maddubs_epi16(*weight, *input); __m256i *bigsum = &part_results[cb_index]; // FIXME: When to do bit shifts? *bigsum = _mm256_adds_epi16(*bigsum, sum); } } } for (uns cb_index = 0; cb_index < cache_blocking_len; cb_index += cache_blocking_len) { // _mm256_permute2x128_si256: http://www.felixcloutier.com/x86/VPERM2I128.html // _mm256_shuffle_epi8: https://software.intel.com/en-us/node/582929 // _mm256_hadds_epi16: https://software.intel.com/en-us/node/582799, http://www.felixcloutier.com/x86/PHADDSW.html // _mm256_blendv_epi8: https://software.intel.com/en-us/node/582820 // _mm256_shuffle_epi8: https://software.intel.com/en-us/node/582929 // _mm256_srli_epi16: https://software.intel.com/en-us/node/582887 // _mm256_srai_epi16: https://software.intel.com/en-us/node/582815 // _mm256_setr_epi64x: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_epi64x&expand=4649 // Create 2 128bit parts with 16bit integers. #define SUM_2x128(X, Y) \ const __m256i x##X = part_results[cb_index + X]; \ const __m256i x##Y = part_results[cb_index + Y]; \ __m256i sum##X = _mm256_adds_epi16( _mm256_permute2x128_si256(x##X, x##Y, 0x20), _mm256_permute2x128_si256(x##X, x##Y, 0x31) ) SUM_2x128(0, 1); SUM_2x128(2, 3); SUM_2x128(4, 5); SUM_2x128(6, 7); SUM_2x128(8, 9); SUM_2x128(10, 11); SUM_2x128(12, 13); SUM_2x128(14, 15); #undef SUM_2x128 // Create 4 64bit parts with 16bit integers. #define SUM_4x64(X, Y) \ sum##X = _mm256_adds_epi16(_mm256_permute2x128_si256(_mm256_permute4x64_epi64(sum##X, 0x20), _mm256_permute4x64_epi64(sum##Y, 0x20), 0x20), \ _mm256_permute2x128_si256(_mm256_permute4x64_epi64(sum##X, 0x31), _mm256_permute4x64_epi64(sum##Y, 0x31), 0x20)) SUM_4x64(0, 2); SUM_4x64(4, 6); SUM_4x64(8, 10); SUM_4x64(12, 14); #undef SUM_4x64 // Create 8 32bit parts with 16bit integers. #define SUM_8x32(X, Y) \ sum##X = _mm256_adds_epi16(_mm256_permute2x128_si256(_mm256_permutevar8x32_epi32(x##X, _mm256_setr_epi32(0, 0, 0, 0, 6, 4, 2, 0)), \ _mm256_permutevar8x32_epi32(x##Y, _mm256_setr_epi32(0, 0, 0, 0, 6, 4, 2, 0)), 0x20), \ _mm256_permute2x128_si256(_mm256_permutevar8x32_epi32(x##X, _mm256_setr_epi32(0, 0, 0, 0, 7, 5, 3, 1)), \ _mm256_permutevar8x32_epi32(x##Y, _mm256_setr_epi32(0, 0, 0, 0, 7, 5, 3, 1)), 0x20)) SUM_8x32(0, 4); SUM_8x32(8, 12); #undef SUM_8x32 // Create 16 parts with 16bit integers. sum0 = _mm256_hadds_epi16(sum0, sum8); // Final operations. sum0 = _mm256_max_epi16(sum0, _mm256_setzero_si256()); sum0 = _mm256_srai_epi16(sum0, 8); // FIXME: Add last conversion of 16bit integers to 8bit integers. // stream store, type conversions seem ugly... _mm_stream_ps((float*)&outputs[cb_index], (__m128)_mm256_castsi256_si128(sum0)); } return cache_blocking_len; }