static INLINE __m256i calc_mask_d16_avx2(const __m256i *data_src0, const __m256i *data_src1, const __m256i *round_const, const __m256i *mask_base_16, const __m256i *clip_diff, int round) { const __m256i diffa = _mm256_subs_epu16(*data_src0, *data_src1); const __m256i diffb = _mm256_subs_epu16(*data_src1, *data_src0); const __m256i diff = _mm256_max_epu16(diffa, diffb); const __m256i diff_round = _mm256_srli_epi16(_mm256_adds_epu16(diff, *round_const), round); const __m256i diff_factor = _mm256_srli_epi16(diff_round, DIFF_FACTOR_LOG2); const __m256i diff_mask = _mm256_adds_epi16(diff_factor, *mask_base_16); const __m256i diff_clamp = _mm256_min_epi16(diff_mask, *clip_diff); return diff_clamp; }
int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw) { const int AB_BITS = MAX(10, (int)INTER_BITS); int x1 = 0; __m256i fxy_mask = _mm256_set1_epi32(INTER_TAB_SIZE - 1); __m256i XX = _mm256_set1_epi32(X0), YY = _mm256_set1_epi32(Y0); for (; x1 <= bw - 16; x1 += 16) { __m256i tx0, tx1, ty0, ty1; tx0 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(adelta + x1)), XX); ty0 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(bdelta + x1)), YY); tx1 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(adelta + x1 + 8)), XX); ty1 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(bdelta + x1 + 8)), YY); tx0 = _mm256_srai_epi32(tx0, AB_BITS - INTER_BITS); ty0 = _mm256_srai_epi32(ty0, AB_BITS - INTER_BITS); tx1 = _mm256_srai_epi32(tx1, AB_BITS - INTER_BITS); ty1 = _mm256_srai_epi32(ty1, AB_BITS - INTER_BITS); __m256i fx_ = _mm256_packs_epi32(_mm256_and_si256(tx0, fxy_mask), _mm256_and_si256(tx1, fxy_mask)); __m256i fy_ = _mm256_packs_epi32(_mm256_and_si256(ty0, fxy_mask), _mm256_and_si256(ty1, fxy_mask)); tx0 = _mm256_packs_epi32(_mm256_srai_epi32(tx0, INTER_BITS), _mm256_srai_epi32(tx1, INTER_BITS)); ty0 = _mm256_packs_epi32(_mm256_srai_epi32(ty0, INTER_BITS), _mm256_srai_epi32(ty1, INTER_BITS)); fx_ = _mm256_adds_epi16(fx_, _mm256_slli_epi16(fy_, INTER_BITS)); fx_ = _mm256_permute4x64_epi64(fx_, (3 << 6) + (1 << 4) + (2 << 2) + 0); _mm256_storeu_si256((__m256i*)(xy + x1 * 2), _mm256_unpacklo_epi16(tx0, ty0)); _mm256_storeu_si256((__m256i*)(xy + x1 * 2 + 16), _mm256_unpackhi_epi16(tx0, ty0)); _mm256_storeu_si256((__m256i*)(alpha + x1), fx_); } _mm256_zeroupper(); return x1; }
__m256i test_mm256_adds_epi16(__m256i a, __m256i b) { // CHECK: @llvm.x86.avx2.padds.w return _mm256_adds_epi16(a, b); }
__m256i test_mm256_adds_epi16(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_adds_epi16 // CHECK: call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_adds_epi16(a, b); }
static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; __m256i firstFilters, secondFilters, thirdFilters, forthFilters; __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; __m256i srcReg32b1, srcReg32b2, filtersReg32; unsigned int i; ptrdiff_t src_stride, dst_stride; // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. filtersReg =_mm_packs_epi16(filtersReg, filtersReg); // have the same data in both lanes of a 256 bit register filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); // duplicate only the first 16 bits (first and second byte) // across 256 bit register firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); // duplicate only the second 16 bits (third and forth byte) // across 256 bit register secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); // duplicate only the third 16 bits (fifth and sixth byte) // across 256 bit register thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); // duplicate only the forth 16 bits (seventh and eighth byte) // across 256 bit register forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2); filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2); filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2); filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2); // multiple the size of the source and destination stride by two src_stride = src_pixels_per_line << 1; dst_stride = output_pitch << 1; for (i = output_height; i > 1; i-=2) { // load the 2 strides of source srcReg32b1 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr - 3))); srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, _mm_loadu_si128((const __m128i *) (src_ptr+src_pixels_per_line-3)), 1); // filter the source buffer srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg); srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt4Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); // add and saturate the results together srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); // filter the source buffer srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt2Reg); srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); // add and saturate the results together srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); // reading 2 strides of the next 16 bytes // (part of it was being read by earlier read) srcReg32b2 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + 5))); srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, _mm_loadu_si128((const __m128i *) (src_ptr+src_pixels_per_line+5)), 1); // add and saturate the results together srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); // filter the source buffer srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); // add and saturate the results together srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); // filter the source buffer srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt2Reg); srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); // add and saturate the results together srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64); srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64); // shift by 7 bit each 16 bit srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7); srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); src_ptr+=src_stride; // save 16 bytes _mm_store_si128((__m128i*)output_ptr, _mm256_castsi256_si128(srcRegFilt32b1_1)); // save the next 16 bits _mm_store_si128((__m128i*)(output_ptr+output_pitch), _mm256_extractf128_si256(srcRegFilt32b1_1, 1)); output_ptr+=dst_stride; } // if the number of strides is odd. // process only 16 bytes if (i > 0) { __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; __m128i srcRegFilt2, srcRegFilt3; srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); // filter the source buffer srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); // add and saturate the results together srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); // filter the source buffer srcRegFilt3= _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); srcRegFilt2= _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); // add and saturate the results together srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2)); // reading the next 16 bytes // (part of it was being read by earlier read) srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); // add and saturate the results together srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2)); // filter the source buffer srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg)); srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters)); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); // add and saturate the results together srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); // filter the source buffer srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg)); srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); // add and saturate the results together srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2)); srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2)); srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg64)); srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg64)); // shift by 7 bit each 16 bit srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); // save 16 bytes _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); } }
static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; __m256i addFilterReg64; __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; __m256i srcReg32b11, srcReg32b12, filtersReg32; __m256i firstFilters, secondFilters, thirdFilters, forthFilters; unsigned int i; ptrdiff_t src_stride, dst_stride; // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the // same data in both lanes of 128 bit register. filtersReg =_mm_packs_epi16(filtersReg, filtersReg); // have the same data in both lanes of a 256 bit register filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); // duplicate only the first 16 bits (first and second byte) // across 256 bit register firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); // duplicate only the second 16 bits (third and forth byte) // across 256 bit register secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); // duplicate only the third 16 bits (fifth and sixth byte) // across 256 bit register thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); // duplicate only the forth 16 bits (seventh and eighth byte) // across 256 bit register forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); // multiple the size of the source and destination stride by two src_stride = src_pitch << 1; dst_stride = out_pitch << 1; // load 16 bytes 7 times in stride of src_pitch srcReg32b1 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr))); srcReg32b2 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch))); srcReg32b3 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2))); srcReg32b4 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3))); srcReg32b5 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4))); srcReg32b6 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5))); srcReg32b7 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); // have each consecutive loads on the same 256 register srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, _mm256_castsi256_si128(srcReg32b2), 1); srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, _mm256_castsi256_si128(srcReg32b3), 1); srcReg32b3 = _mm256_inserti128_si256(srcReg32b3, _mm256_castsi256_si128(srcReg32b4), 1); srcReg32b4 = _mm256_inserti128_si256(srcReg32b4, _mm256_castsi256_si128(srcReg32b5), 1); srcReg32b5 = _mm256_inserti128_si256(srcReg32b5, _mm256_castsi256_si128(srcReg32b6), 1); srcReg32b6 = _mm256_inserti128_si256(srcReg32b6, _mm256_castsi256_si128(srcReg32b7), 1); // merge every two consecutive registers except the last one srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2); // save srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); // save srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4); // save srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); // save srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6); for (i = output_height; i > 1; i-=2) { // load the last 2 loads of 16 bytes and have every two // consecutive loads in the same 256 bit register srcReg32b8 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7))); srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, _mm256_castsi256_si128(srcReg32b8), 1); srcReg32b9 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8))); srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, _mm256_castsi256_si128(srcReg32b9), 1); // merge every two consecutive registers // save srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); // multiply 2 adjacent elements with the filter and add the result srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); // add and saturate the results together srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); // multiply 2 adjacent elements with the filter and add the result srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); // add and saturate the results together srcReg32b10 = _mm256_adds_epi16(srcReg32b10, _mm256_min_epi16(srcReg32b8, srcReg32b12)); srcReg32b10 = _mm256_adds_epi16(srcReg32b10, _mm256_max_epi16(srcReg32b8, srcReg32b12)); // multiply 2 adjacent elements with the filter and add the result srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters); srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6); // multiply 2 adjacent elements with the filter and add the result srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters); srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters); // add and saturate the results together srcReg32b1 = _mm256_adds_epi16(srcReg32b1, _mm256_min_epi16(srcReg32b8, srcReg32b12)); srcReg32b1 = _mm256_adds_epi16(srcReg32b1, _mm256_max_epi16(srcReg32b8, srcReg32b12)); srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64); srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64); // shift by 7 bit each 16 bit srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7); srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1); src_ptr+=src_stride; // save 16 bytes _mm_store_si128((__m128i*)output_ptr, _mm256_castsi256_si128(srcReg32b1)); // save the next 16 bits _mm_store_si128((__m128i*)(output_ptr+out_pitch), _mm256_extractf128_si256(srcReg32b1, 1)); output_ptr+=dst_stride; // save part of the registers for next strides srcReg32b10 = srcReg32b11; srcReg32b1 = srcReg32b3; srcReg32b11 = srcReg32b2; srcReg32b3 = srcReg32b5; srcReg32b2 = srcReg32b4; srcReg32b5 = srcReg32b7; srcReg32b7 = srcReg32b9; } if (i > 0) { __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5; __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8; // load the last 16 bytes srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); // merge the last 2 results together srcRegFilt4 = _mm_unpacklo_epi8( _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); srcRegFilt7 = _mm_unpackhi_epi8( _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); // multiply 2 adjacent elements with the filter and add the result srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), _mm256_castsi256_si128(firstFilters)); srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters)); srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1), _mm256_castsi256_si128(firstFilters)); srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters)); // add and saturate the results together srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7); // multiply 2 adjacent elements with the filter and add the result srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), _mm256_castsi256_si128(secondFilters)); srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3), _mm256_castsi256_si128(secondFilters)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), _mm256_castsi256_si128(thirdFilters)); srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5), _mm256_castsi256_si128(thirdFilters)); // add and saturate the results together srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, _mm_min_epi16(srcRegFilt4, srcRegFilt6)); srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, _mm_min_epi16(srcRegFilt5, srcRegFilt7)); // add and saturate the results together srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, _mm_max_epi16(srcRegFilt4, srcRegFilt6)); srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, _mm_max_epi16(srcRegFilt5, srcRegFilt7)); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg64)); srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg64)); // shift by 7 bit each 16 bit srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); // save 16 bytes _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); } }
uns cache_block_over_inputs(const u8 *w, const u8 *inputs, const u8 *outputs, const uns w_len, const uns outputs_len) { assert(outputs_len > 0); assert(outputs_len % AVX_U8_VEC_LEN == 0); assert(w_len % AVX_U8_VEC_LEN == 0); __m256i part_results[CACHE_BLOCKING_LEN]; const uns cache_blocking_len = MIN(outputs_len, cache_blocking_len); for (uns index = 0; index < w_len; index += cache_blocking_len) { const uns jndex_end = MIN(w_len, index + cache_blocking_len); for (uns cb_index = 0; cb_index < cache_blocking_len; ++cb_index) { for (uns jndex = index; jndex < jndex_end; jndex += AVX_U8_VEC_LEN) { const __m256i *weight = (__m256i*) &w[jndex + cb_index*w_len]; const __m256i *input = (__m256i*) &input[jndex]; const __m256i sum = _mm256_maddubs_epi16(*weight, *input); __m256i *bigsum = &part_results[cb_index]; // FIXME: When to do bit shifts? *bigsum = _mm256_adds_epi16(*bigsum, sum); } } } for (uns cb_index = 0; cb_index < cache_blocking_len; cb_index += cache_blocking_len) { // _mm256_permute2x128_si256: http://www.felixcloutier.com/x86/VPERM2I128.html // _mm256_shuffle_epi8: https://software.intel.com/en-us/node/582929 // _mm256_hadds_epi16: https://software.intel.com/en-us/node/582799, http://www.felixcloutier.com/x86/PHADDSW.html // _mm256_blendv_epi8: https://software.intel.com/en-us/node/582820 // _mm256_shuffle_epi8: https://software.intel.com/en-us/node/582929 // _mm256_srli_epi16: https://software.intel.com/en-us/node/582887 // _mm256_srai_epi16: https://software.intel.com/en-us/node/582815 // _mm256_setr_epi64x: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_epi64x&expand=4649 // Create 2 128bit parts with 16bit integers. #define SUM_2x128(X, Y) \ const __m256i x##X = part_results[cb_index + X]; \ const __m256i x##Y = part_results[cb_index + Y]; \ __m256i sum##X = _mm256_adds_epi16( _mm256_permute2x128_si256(x##X, x##Y, 0x20), _mm256_permute2x128_si256(x##X, x##Y, 0x31) ) SUM_2x128(0, 1); SUM_2x128(2, 3); SUM_2x128(4, 5); SUM_2x128(6, 7); SUM_2x128(8, 9); SUM_2x128(10, 11); SUM_2x128(12, 13); SUM_2x128(14, 15); #undef SUM_2x128 // Create 4 64bit parts with 16bit integers. #define SUM_4x64(X, Y) \ sum##X = _mm256_adds_epi16(_mm256_permute2x128_si256(_mm256_permute4x64_epi64(sum##X, 0x20), _mm256_permute4x64_epi64(sum##Y, 0x20), 0x20), \ _mm256_permute2x128_si256(_mm256_permute4x64_epi64(sum##X, 0x31), _mm256_permute4x64_epi64(sum##Y, 0x31), 0x20)) SUM_4x64(0, 2); SUM_4x64(4, 6); SUM_4x64(8, 10); SUM_4x64(12, 14); #undef SUM_4x64 // Create 8 32bit parts with 16bit integers. #define SUM_8x32(X, Y) \ sum##X = _mm256_adds_epi16(_mm256_permute2x128_si256(_mm256_permutevar8x32_epi32(x##X, _mm256_setr_epi32(0, 0, 0, 0, 6, 4, 2, 0)), \ _mm256_permutevar8x32_epi32(x##Y, _mm256_setr_epi32(0, 0, 0, 0, 6, 4, 2, 0)), 0x20), \ _mm256_permute2x128_si256(_mm256_permutevar8x32_epi32(x##X, _mm256_setr_epi32(0, 0, 0, 0, 7, 5, 3, 1)), \ _mm256_permutevar8x32_epi32(x##Y, _mm256_setr_epi32(0, 0, 0, 0, 7, 5, 3, 1)), 0x20)) SUM_8x32(0, 4); SUM_8x32(8, 12); #undef SUM_8x32 // Create 16 parts with 16bit integers. sum0 = _mm256_hadds_epi16(sum0, sum8); // Final operations. sum0 = _mm256_max_epi16(sum0, _mm256_setzero_si256()); sum0 = _mm256_srai_epi16(sum0, 8); // FIXME: Add last conversion of 16bit integers to 8bit integers. // stream store, type conversions seem ugly... _mm_stream_ps((float*)&outputs[cb_index], (__m128)_mm256_castsi256_si128(sum0)); } return cache_blocking_len; }