static INLINE void variance16_kernel_avx2( const uint8_t *const src, const int src_stride, const uint8_t *const ref, const int ref_stride, __m256i *const sse, __m256i *const sum) { const __m128i s0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); const __m128i s1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); const __m128i r0 = _mm_loadu_si128((__m128i const *)(ref + 0 * ref_stride)); const __m128i r1 = _mm_loadu_si128((__m128i const *)(ref + 1 * ref_stride)); const __m256i s = _mm256_inserti128_si256(_mm256_castsi128_si256(s0), s1, 1); const __m256i r = _mm256_inserti128_si256(_mm256_castsi128_si256(r0), r1, 1); variance_kernel_avx2(s, r, sse, sum); }
//! \brief //! Performs a bitwise right shift logical by the specified count //! inline __m256i srli(__m256i arg, int count) { __m128i arg_low = _mm256_castsi256_si128(arg); __m128i arg_hi = _mm256_extractf128_si256(arg, 1); __m128i newlow = _mm_srli_epi32(arg_low, count); __m128i newhi = _mm_srli_epi32(arg_hi, count); __m256i result = _mm256_castsi128_si256(newlow); result = _mm256_insertf128_si256(result, newhi, 1); return result; }
static void sfid_render_cache_rt_write_simd8_bgra_unorm8_xmajor(struct thread *t, const struct sfid_render_cache_args *args) { __m256i argb; const float scale = 255.0f; struct reg src[4]; memcpy(src, &t->grf[args->src], sizeof(src)); const int cpp = 4; const int slice_y = args->rt.minimum_array_element * args->rt.qpitch; const int x = t->grf[1].uw[4]; const int y = t->grf[1].uw[5] + slice_y; void *base = xmajor_offset(args->rt.pixels, x, y, args->rt.stride, cpp); if (gt.blend.enable) { /* Load unorm8 */ __m128i lo = _mm_load_si128(base); __m128i hi = _mm_load_si128(base + 512); __m256i dst_argb = _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1); dst_argb = _mm256_permute4x64_epi64(dst_argb, SWIZZLE(0, 2, 1, 3)); blend_unorm8_argb(src, dst_argb); } gamma_correct(args->rt.format, src); const __m256i r = to_unorm(src[0].reg, scale); const __m256i g = to_unorm(src[1].reg, scale); const __m256i b = to_unorm(src[2].reg, scale); const __m256i a = to_unorm(src[3].reg, scale); argb = _mm256_slli_epi32(a, 8); argb = _mm256_or_si256(argb, r); argb = _mm256_slli_epi32(argb, 8); argb = _mm256_or_si256(argb, g); argb = _mm256_slli_epi32(argb, 8); argb = _mm256_or_si256(argb, b); /* Swizzle two middle pixel pairs so that dword 0-3 and 4-7 * form linear owords of pixels. */ argb = _mm256_permute4x64_epi64(argb, SWIZZLE(0, 2, 1, 3)); __m256i mask = _mm256_permute4x64_epi64(t->mask_q1, SWIZZLE(0, 2, 1, 3)); _mm_maskstore_epi32(base, _mm256_extractf128_si256(mask, 0), _mm256_extractf128_si256(argb, 0)); _mm_maskstore_epi32(base + 512, _mm256_extractf128_si256(mask, 1), _mm256_extractf128_si256(argb, 1)); }
static INLINE void quantize(const __m256i *qp, __m256i *c, const int16_t *iscan_ptr, int log_scale, tran_low_t *qcoeff, tran_low_t *dqcoeff, __m256i *eob) { const __m256i abs_coeff = _mm256_abs_epi32(*c); __m256i q = _mm256_add_epi32(abs_coeff, qp[0]); __m256i q_lo = _mm256_mul_epi32(q, qp[1]); __m256i q_hi = _mm256_srli_epi64(q, 32); const __m256i qp_hi = _mm256_srli_epi64(qp[1], 32); q_hi = _mm256_mul_epi32(q_hi, qp_hi); q_lo = _mm256_srli_epi64(q_lo, 16 - log_scale); q_hi = _mm256_srli_epi64(q_hi, 16 - log_scale); q_hi = _mm256_slli_epi64(q_hi, 32); q = _mm256_or_si256(q_lo, q_hi); const __m256i abs_s = _mm256_slli_epi32(abs_coeff, 1 + log_scale); const __m256i mask = _mm256_cmpgt_epi32(qp[2], abs_s); q = _mm256_andnot_si256(mask, q); __m256i dq = _mm256_mullo_epi32(q, qp[2]); dq = _mm256_srai_epi32(dq, log_scale); q = _mm256_sign_epi32(q, *c); dq = _mm256_sign_epi32(dq, *c); _mm256_storeu_si256((__m256i *)qcoeff, q); _mm256_storeu_si256((__m256i *)dqcoeff, dq); const __m128i isc = _mm_loadu_si128((const __m128i *)iscan_ptr); const __m128i zr = _mm_setzero_si128(); const __m128i lo = _mm_unpacklo_epi16(isc, zr); const __m128i hi = _mm_unpackhi_epi16(isc, zr); const __m256i iscan = _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1); const __m256i zero = _mm256_setzero_si256(); const __m256i zc = _mm256_cmpeq_epi32(dq, zero); const __m256i nz = _mm256_cmpeq_epi32(zc, zero); __m256i cur_eob = _mm256_sub_epi32(iscan, nz); cur_eob = _mm256_and_si256(cur_eob, nz); *eob = _mm256_max_epi32(cur_eob, *eob); }
static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; __m256i firstFilters, secondFilters, thirdFilters, forthFilters; __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; __m256i srcReg32b1, srcReg32b2, filtersReg32; unsigned int i; ptrdiff_t src_stride, dst_stride; // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. filtersReg =_mm_packs_epi16(filtersReg, filtersReg); // have the same data in both lanes of a 256 bit register filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); // duplicate only the first 16 bits (first and second byte) // across 256 bit register firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); // duplicate only the second 16 bits (third and forth byte) // across 256 bit register secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); // duplicate only the third 16 bits (fifth and sixth byte) // across 256 bit register thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); // duplicate only the forth 16 bits (seventh and eighth byte) // across 256 bit register forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2); filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2); filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2); filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2); // multiple the size of the source and destination stride by two src_stride = src_pixels_per_line << 1; dst_stride = output_pitch << 1; for (i = output_height; i > 1; i-=2) { // load the 2 strides of source srcReg32b1 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr - 3))); srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, _mm_loadu_si128((const __m128i *) (src_ptr+src_pixels_per_line-3)), 1); // filter the source buffer srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg); srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt4Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); // add and saturate the results together srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); // filter the source buffer srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt2Reg); srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); // add and saturate the results together srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); // reading 2 strides of the next 16 bytes // (part of it was being read by earlier read) srcReg32b2 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + 5))); srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, _mm_loadu_si128((const __m128i *) (src_ptr+src_pixels_per_line+5)), 1); // add and saturate the results together srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); // filter the source buffer srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); // add and saturate the results together srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); // filter the source buffer srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt2Reg); srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); // add and saturate the results together srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64); srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64); // shift by 7 bit each 16 bit srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7); srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); src_ptr+=src_stride; // save 16 bytes _mm_store_si128((__m128i*)output_ptr, _mm256_castsi256_si128(srcRegFilt32b1_1)); // save the next 16 bits _mm_store_si128((__m128i*)(output_ptr+output_pitch), _mm256_extractf128_si256(srcRegFilt32b1_1, 1)); output_ptr+=dst_stride; } // if the number of strides is odd. // process only 16 bytes if (i > 0) { __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; __m128i srcRegFilt2, srcRegFilt3; srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); // filter the source buffer srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); // add and saturate the results together srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); // filter the source buffer srcRegFilt3= _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); srcRegFilt2= _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); // add and saturate the results together srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2)); // reading the next 16 bytes // (part of it was being read by earlier read) srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); // add and saturate the results together srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2)); // filter the source buffer srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg)); srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters)); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); // add and saturate the results together srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); // filter the source buffer srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg)); srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); // add and saturate the results together srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2)); srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2)); srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg64)); srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg64)); // shift by 7 bit each 16 bit srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); // save 16 bytes _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); } }
static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; __m256i addFilterReg64; __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; __m256i srcReg32b11, srcReg32b12, filtersReg32; __m256i firstFilters, secondFilters, thirdFilters, forthFilters; unsigned int i; ptrdiff_t src_stride, dst_stride; // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the // same data in both lanes of 128 bit register. filtersReg =_mm_packs_epi16(filtersReg, filtersReg); // have the same data in both lanes of a 256 bit register filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); // duplicate only the first 16 bits (first and second byte) // across 256 bit register firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); // duplicate only the second 16 bits (third and forth byte) // across 256 bit register secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); // duplicate only the third 16 bits (fifth and sixth byte) // across 256 bit register thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); // duplicate only the forth 16 bits (seventh and eighth byte) // across 256 bit register forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); // multiple the size of the source and destination stride by two src_stride = src_pitch << 1; dst_stride = out_pitch << 1; // load 16 bytes 7 times in stride of src_pitch srcReg32b1 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr))); srcReg32b2 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch))); srcReg32b3 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2))); srcReg32b4 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3))); srcReg32b5 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4))); srcReg32b6 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5))); srcReg32b7 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); // have each consecutive loads on the same 256 register srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, _mm256_castsi256_si128(srcReg32b2), 1); srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, _mm256_castsi256_si128(srcReg32b3), 1); srcReg32b3 = _mm256_inserti128_si256(srcReg32b3, _mm256_castsi256_si128(srcReg32b4), 1); srcReg32b4 = _mm256_inserti128_si256(srcReg32b4, _mm256_castsi256_si128(srcReg32b5), 1); srcReg32b5 = _mm256_inserti128_si256(srcReg32b5, _mm256_castsi256_si128(srcReg32b6), 1); srcReg32b6 = _mm256_inserti128_si256(srcReg32b6, _mm256_castsi256_si128(srcReg32b7), 1); // merge every two consecutive registers except the last one srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2); // save srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); // save srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4); // save srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); // save srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6); for (i = output_height; i > 1; i-=2) { // load the last 2 loads of 16 bytes and have every two // consecutive loads in the same 256 bit register srcReg32b8 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7))); srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, _mm256_castsi256_si128(srcReg32b8), 1); srcReg32b9 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8))); srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, _mm256_castsi256_si128(srcReg32b9), 1); // merge every two consecutive registers // save srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); // multiply 2 adjacent elements with the filter and add the result srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); // add and saturate the results together srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); // multiply 2 adjacent elements with the filter and add the result srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); // add and saturate the results together srcReg32b10 = _mm256_adds_epi16(srcReg32b10, _mm256_min_epi16(srcReg32b8, srcReg32b12)); srcReg32b10 = _mm256_adds_epi16(srcReg32b10, _mm256_max_epi16(srcReg32b8, srcReg32b12)); // multiply 2 adjacent elements with the filter and add the result srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters); srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6); // multiply 2 adjacent elements with the filter and add the result srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters); srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters); // add and saturate the results together srcReg32b1 = _mm256_adds_epi16(srcReg32b1, _mm256_min_epi16(srcReg32b8, srcReg32b12)); srcReg32b1 = _mm256_adds_epi16(srcReg32b1, _mm256_max_epi16(srcReg32b8, srcReg32b12)); srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64); srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64); // shift by 7 bit each 16 bit srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7); srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1); src_ptr+=src_stride; // save 16 bytes _mm_store_si128((__m128i*)output_ptr, _mm256_castsi256_si128(srcReg32b1)); // save the next 16 bits _mm_store_si128((__m128i*)(output_ptr+out_pitch), _mm256_extractf128_si256(srcReg32b1, 1)); output_ptr+=dst_stride; // save part of the registers for next strides srcReg32b10 = srcReg32b11; srcReg32b1 = srcReg32b3; srcReg32b11 = srcReg32b2; srcReg32b3 = srcReg32b5; srcReg32b2 = srcReg32b4; srcReg32b5 = srcReg32b7; srcReg32b7 = srcReg32b9; } if (i > 0) { __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5; __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8; // load the last 16 bytes srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); // merge the last 2 results together srcRegFilt4 = _mm_unpacklo_epi8( _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); srcRegFilt7 = _mm_unpackhi_epi8( _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); // multiply 2 adjacent elements with the filter and add the result srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), _mm256_castsi256_si128(firstFilters)); srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters)); srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1), _mm256_castsi256_si128(firstFilters)); srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters)); // add and saturate the results together srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7); // multiply 2 adjacent elements with the filter and add the result srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), _mm256_castsi256_si128(secondFilters)); srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3), _mm256_castsi256_si128(secondFilters)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), _mm256_castsi256_si128(thirdFilters)); srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5), _mm256_castsi256_si128(thirdFilters)); // add and saturate the results together srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, _mm_min_epi16(srcRegFilt4, srcRegFilt6)); srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, _mm_min_epi16(srcRegFilt5, srcRegFilt7)); // add and saturate the results together srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, _mm_max_epi16(srcRegFilt4, srcRegFilt6)); srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, _mm_max_epi16(srcRegFilt5, srcRegFilt7)); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg64)); srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg64)); // shift by 7 bit each 16 bit srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); // save 16 bytes _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); } }
void Viterbi::AlignWithOutCellOff(HMMSimd* q, HMMSimd* t,ViterbiMatrix * viterbiMatrix, int maxres, ViterbiResult* result) #endif #endif { // Linear topology of query (and template) HMM: // 1. The HMM HMM has L+2 columns. Columns 1 to L contain // a match state, a delete state and an insert state each. // 2. The Start state is M0, the virtual match state in column i=0 (j=0). (Therefore X[k][0]=ANY) // This column has only a match state and it has only a transitions to the next match state. // 3. The End state is M(L+1), the virtual match state in column i=L+1.(j=L+1) (Therefore X[k][L+1]=ANY) // Column L has no transitions to the delete state: tr[L][M2D]=tr[L][D2D]=0. // 4. Transitions I->D and D->I are ignored, since they do not appear in PsiBlast alignments // (as long as the gap opening penalty d is higher than the best match score S(a,b)). // Pairwise alignment of two HMMs: // 1. Pair-states for the alignment of two HMMs are // MM (Q:Match T:Match) , GD (Q:Gap T:Delete), IM (Q:Insert T:Match), DG (Q:Delelte, T:Match) , MI (Q:Match T:Insert) // 2. Transitions are allowed only between the MM-state and each of the four other states. // Saving space: // The best score ending in pair state XY sXY[i][j] is calculated from left to right (j=1->t->L) // and top to bottom (i=1->q->L). To save space, only the last row of scores calculated is kept in memory. // (The backtracing matrices are kept entirely in memory [O(t->L*q->L)]). // When the calculation has proceeded up to the point where the scores for cell (i,j) are caculated, // sXY[i-1][j'] = sXY[j'] for j'>=j (A below) // sXY[i][j'] = sXY[j'] for j'<j (B below) // sXY[i-1][j-1]= sXY_i_1_j_1 (C below) // sXY[i][j] = sXY_i_j (D below) // j-1 // j // i-1: CAAAAAAAAAAAAAAAAAA // i : BBBBBBBBBBBBBD // Variable declarations const float smin = (this->local ? 0 : -FLT_MAX); //used to distinguish between SW and NW algorithms in maximization const simd_float smin_vec = simdf32_set(smin); const simd_float shift_vec = simdf32_set(shift); // const simd_float one_vec = simdf32_set(1); // 00000001 const simd_int mm_vec = simdi32_set(2); //MM 00000010 const simd_int gd_vec = simdi32_set(3); //GD 00000011 const simd_int im_vec = simdi32_set(4); //IM 00000100 const simd_int dg_vec = simdi32_set(5); //DG 00000101 const simd_int mi_vec = simdi32_set(6); //MI 00000110 const simd_int gd_mm_vec = simdi32_set(8); // 00001000 const simd_int im_mm_vec = simdi32_set(16);// 00010000 const simd_int dg_mm_vec = simdi32_set(32);// 00100000 const simd_int mi_mm_vec = simdi32_set(64);// 01000000 #ifdef VITERBI_SS_SCORE HMM * q_s = q->GetHMM(0); const unsigned char * t_index; if(ss_hmm_mode == HMM::PRED_PRED || ss_hmm_mode == HMM::DSSP_PRED ){ t_index = t->pred_index; }else if(ss_hmm_mode == HMM::PRED_DSSP){ t_index = t->dssp_index; } simd_float * ss_score_vec = (simd_float *) ss_score; #endif #ifdef AVX2 const simd_int shuffle_mask_extract = _mm256_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1); #endif #ifdef VITERBI_CELLOFF const __m128i tmp_vec = _mm_set_epi32(0x40000000,0x00400000,0x00004000,0x00000040);//01000000010000000100000001000000 #ifdef AVX2 const simd_int co_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_vec), tmp_vec, 1); const simd_int float_min_vec = (simd_int) _mm256_set1_ps(-FLT_MAX); const simd_int shuffle_mask_celloff = _mm256_set_epi8( 15, 14, 13, 12, 15, 14, 13, 12, 15, 14, 13, 12, 15, 14, 13, 12, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0); #else // SSE case const simd_int co_vec = tmp_vec; const simd_int float_min_vec = (simd_int) simdf32_set(-FLT_MAX); #endif #endif // AVX2 end int i,j; //query and template match state indices simd_int i2_vec = simdi32_set(0); simd_int j2_vec = simdi32_set(0); simd_float sMM_i_j = simdf32_set(0); simd_float sMI_i_j,sIM_i_j,sGD_i_j,sDG_i_j; simd_float Si_vec; simd_float sMM_i_1_j_1; simd_float sMI_i_1_j_1; simd_float sIM_i_1_j_1; simd_float sGD_i_1_j_1; simd_float sDG_i_1_j_1; simd_float score_vec = simdf32_set(-FLT_MAX); simd_int byte_result_vec = simdi32_set(0); // Initialization of top row, i.e. cells (0,j) for (j=0; j <= t->L; ++j) { const unsigned int index_pos_j = j * 5; sMM_DG_MI_GD_IM_vec[index_pos_j + 0] = simdf32_set(-j*penalty_gap_template); sMM_DG_MI_GD_IM_vec[index_pos_j + 1] = simdf32_set(-FLT_MAX); sMM_DG_MI_GD_IM_vec[index_pos_j + 2] = simdf32_set(-FLT_MAX); sMM_DG_MI_GD_IM_vec[index_pos_j + 3] = simdf32_set(-FLT_MAX); sMM_DG_MI_GD_IM_vec[index_pos_j + 4] = simdf32_set(-FLT_MAX); } // Viterbi algorithm const int queryLength = q->L; for (i=1; i <= queryLength; ++i) // Loop through query positions i { // If q is compared to t, exclude regions where overlap of q with t < min_overlap residues // Initialize cells sMM_i_1_j_1 = simdf32_set(-(i - 1) * penalty_gap_query); // initialize at (i-1,0) sIM_i_1_j_1 = simdf32_set(-FLT_MAX); // initialize at (i-1,jmin-1) sMI_i_1_j_1 = simdf32_set(-FLT_MAX); sDG_i_1_j_1 = simdf32_set(-FLT_MAX); sGD_i_1_j_1 = simdf32_set(-FLT_MAX); // initialize at (i,jmin-1) const unsigned int index_pos_i = 0 * 5; sMM_DG_MI_GD_IM_vec[index_pos_i + 0] = simdf32_set(-i * penalty_gap_query); // initialize at (i,0) sMM_DG_MI_GD_IM_vec[index_pos_i + 1] = simdf32_set(-FLT_MAX); sMM_DG_MI_GD_IM_vec[index_pos_i + 2] = simdf32_set(-FLT_MAX); sMM_DG_MI_GD_IM_vec[index_pos_i + 3] = simdf32_set(-FLT_MAX); sMM_DG_MI_GD_IM_vec[index_pos_i + 4] = simdf32_set(-FLT_MAX); #ifdef AVX2 unsigned long long * sCO_MI_DG_IM_GD_MM_vec = (unsigned long long *) viterbiMatrix->getRow(i); #else unsigned int *sCO_MI_DG_IM_GD_MM_vec = (unsigned int *) viterbiMatrix->getRow(i); #endif const unsigned int start_pos_tr_i_1 = (i - 1) * 7; const unsigned int start_pos_tr_i = (i) * 7; const simd_float q_m2m = simdf32_load((float *) (q->tr + start_pos_tr_i_1 + 2)); // M2M const simd_float q_m2d = simdf32_load((float *) (q->tr + start_pos_tr_i_1 + 3)); // M2D const simd_float q_d2m = simdf32_load((float *) (q->tr + start_pos_tr_i_1 + 4)); // D2M const simd_float q_d2d = simdf32_load((float *) (q->tr + start_pos_tr_i_1 + 5)); // D2D const simd_float q_i2m = simdf32_load((float *) (q->tr + start_pos_tr_i_1 + 6)); // I2m const simd_float q_i2i = simdf32_load((float *) (q->tr + start_pos_tr_i)); // I2I const simd_float q_m2i = simdf32_load((float *) (q->tr + start_pos_tr_i + 1)); // M2I // Find maximum score; global alignment: maxize only over last row and last column const bool findMaxInnerLoop = (local || i == queryLength); const int targetLength = t->L; #ifdef VITERBI_SS_SCORE if(ss_hmm_mode == HMM::NO_SS_INFORMATION){ // set all to log(1.0) = 0.0 memset(ss_score, 0, (targetLength+1)*VECSIZE_FLOAT*sizeof(float)); }else { const float * score; if(ss_hmm_mode == HMM::PRED_PRED){ score = &S33[ (int)q_s->ss_pred[i]][ (int)q_s->ss_conf[i]][0][0]; }else if (ss_hmm_mode == HMM::DSSP_PRED){ score = &S73[ (int)q_s->ss_dssp[i]][0][0]; }else{ score = &S37[ (int)q_s->ss_pred[i]][ (int)q_s->ss_conf[i]][0]; } // access SS scores and write them to the ss_score array for (j = 0; j <= (targetLength*VECSIZE_FLOAT); j++) // Loop through template positions j { ss_score[j] = ssw * score[t_index[j]]; } } #endif for (j=1; j <= targetLength; ++j) // Loop through template positions j { simd_int index_vec; simd_int res_gt_vec; // cache line optimized reading const unsigned int start_pos_tr_j_1 = (j-1) * 7; const unsigned int start_pos_tr_j = (j) * 7; const simd_float t_m2m = simdf32_load((float *) (t->tr+start_pos_tr_j_1+2)); // M2M const simd_float t_m2d = simdf32_load((float *) (t->tr+start_pos_tr_j_1+3)); // M2D const simd_float t_d2m = simdf32_load((float *) (t->tr+start_pos_tr_j_1+4)); // D2M const simd_float t_d2d = simdf32_load((float *) (t->tr+start_pos_tr_j_1+5)); // D2D const simd_float t_i2m = simdf32_load((float *) (t->tr+start_pos_tr_j_1+6)); // I2m const simd_float t_i2i = simdf32_load((float *) (t->tr+start_pos_tr_j)); // I2i const simd_float t_m2i = simdf32_load((float *) (t->tr+start_pos_tr_j+1)); // M2I // Find max value // CALCULATE_MAX6( sMM_i_j, // smin, // sMM_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][M2M], // sGD_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][D2M], // sIM_i_1_j_1 + q->tr[i-1][I2M] + t->tr[j-1][M2M], // sDG_i_1_j_1 + q->tr[i-1][D2M] + t->tr[j-1][M2M], // sMI_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][I2M], // bMM[i][j] // ); // same as sMM_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][M2M] simd_float mm_m2m_m2m_vec = simdf32_add( simdf32_add(sMM_i_1_j_1, q_m2m), t_m2m); // if mm > min { 2 } res_gt_vec = (simd_int)simdf32_gt(mm_m2m_m2m_vec, smin_vec); byte_result_vec = simdi_and(res_gt_vec, mm_vec); sMM_i_j = simdf32_max(smin_vec, mm_m2m_m2m_vec); // same as sGD_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][D2M] simd_float gd_m2m_d2m_vec = simdf32_add( simdf32_add(sGD_i_1_j_1, q_m2m), t_d2m); // if gd > max { 3 } res_gt_vec = (simd_int)simdf32_gt(gd_m2m_d2m_vec, sMM_i_j); index_vec = simdi_and( res_gt_vec, gd_vec); byte_result_vec = simdi_or( index_vec, byte_result_vec); sMM_i_j = simdf32_max(sMM_i_j, gd_m2m_d2m_vec); // same as sIM_i_1_j_1 + q->tr[i-1][I2M] + t->tr[j-1][M2M] simd_float im_m2m_d2m_vec = simdf32_add( simdf32_add(sIM_i_1_j_1, q_i2m), t_m2m); // if im > max { 4 } MAX2(im_m2m_d2m_vec, sMM_i_j, im_vec,byte_result_vec); sMM_i_j = simdf32_max(sMM_i_j, im_m2m_d2m_vec); // same as sDG_i_1_j_1 + q->tr[i-1][D2M] + t->tr[j-1][M2M] simd_float dg_m2m_d2m_vec = simdf32_add( simdf32_add(sDG_i_1_j_1, q_d2m), t_m2m); // if dg > max { 5 } MAX2(dg_m2m_d2m_vec, sMM_i_j, dg_vec,byte_result_vec); sMM_i_j = simdf32_max(sMM_i_j, dg_m2m_d2m_vec); // same as sMI_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][I2M], simd_float mi_m2m_d2m_vec = simdf32_add( simdf32_add(sMI_i_1_j_1, q_m2m), t_i2m); // if mi > max { 6 } MAX2(mi_m2m_d2m_vec, sMM_i_j, mi_vec, byte_result_vec); sMM_i_j = simdf32_max(sMM_i_j, mi_m2m_d2m_vec); // TODO add secondary structure score // calculate amino acid profile-profile scores Si_vec = log2f4(ScalarProd20Vec((simd_float *) q->p[i],(simd_float *) t->p[j])); #ifdef VITERBI_SS_SCORE Si_vec = simdf32_add(ss_score_vec[j], Si_vec); #endif Si_vec = simdf32_add(Si_vec, shift_vec); sMM_i_j = simdf32_add(sMM_i_j, Si_vec); //+ ScoreSS(q,t,i,j) + shift + (Sstruc==NULL? 0: Sstruc[i][j]); const unsigned int index_pos_j = (j * 5); const unsigned int index_pos_j_1 = (j - 1) * 5; const simd_float sMM_j_1 = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j_1 + 0)); const simd_float sGD_j_1 = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j_1 + 3)); const simd_float sIM_j_1 = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j_1 + 4)); const simd_float sMM_j = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j + 0)); const simd_float sDG_j = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j + 1)); const simd_float sMI_j = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j + 2)); sMM_i_1_j_1 = simdf32_load((float *)(sMM_DG_MI_GD_IM_vec + index_pos_j + 0)); sDG_i_1_j_1 = simdf32_load((float *)(sMM_DG_MI_GD_IM_vec + index_pos_j + 1)); sMI_i_1_j_1 = simdf32_load((float *)(sMM_DG_MI_GD_IM_vec + index_pos_j + 2)); sGD_i_1_j_1 = simdf32_load((float *)(sMM_DG_MI_GD_IM_vec + index_pos_j + 3)); sIM_i_1_j_1 = simdf32_load((float *)(sMM_DG_MI_GD_IM_vec + index_pos_j + 4)); // sGD_i_j = max2 // ( // sMM[j-1] + t->tr[j-1][M2D], // MM->GD gap opening in query // sGD[j-1] + t->tr[j-1][D2D], // GD->GD gap extension in query // bGD[i][j] // ); //sMM_DG_GD_MI_IM_vec simd_float mm_gd_vec = simdf32_add(sMM_j_1, t_m2d); // MM->GD gap opening in query simd_float gd_gd_vec = simdf32_add(sGD_j_1, t_d2d); // GD->GD gap extension in query // if mm_gd > gd_dg { 8 } MAX2_SET_MASK(mm_gd_vec, gd_gd_vec,gd_mm_vec, byte_result_vec); sGD_i_j = simdf32_max( mm_gd_vec, gd_gd_vec ); // sIM_i_j = max2 // ( // sMM[j-1] + q->tr[i][M2I] + t->tr[j-1][M2M] , // sIM[j-1] + q->tr[i][I2I] + t->tr[j-1][M2M], // IM->IM gap extension in query // bIM[i][j] // ); simd_float mm_mm_vec = simdf32_add(simdf32_add(sMM_j_1, q_m2i), t_m2m); simd_float im_im_vec = simdf32_add(simdf32_add(sIM_j_1, q_i2i), t_m2m); // IM->IM gap extension in query // if mm_mm > im_im { 16 } MAX2_SET_MASK(mm_mm_vec,im_im_vec, im_mm_vec, byte_result_vec); sIM_i_j = simdf32_max( mm_mm_vec, im_im_vec ); // sDG_i_j = max2 // ( // sMM[j] + q->tr[i-1][M2D], // sDG[j] + q->tr[i-1][D2D], //gap extension (DD) in query // bDG[i][j] // ); simd_float mm_dg_vec = simdf32_add(sMM_j, q_m2d); simd_float dg_dg_vec = simdf32_add(sDG_j, q_d2d); //gap extension (DD) in query // if mm_dg > dg_dg { 32 } MAX2_SET_MASK(mm_dg_vec,dg_dg_vec, dg_mm_vec, byte_result_vec); sDG_i_j = simdf32_max( mm_dg_vec , dg_dg_vec ); // sMI_i_j = max2 // ( // sMM[j] + q->tr[i-1][M2M] + t->tr[j][M2I], // MM->MI gap opening M2I in template // sMI[j] + q->tr[i-1][M2M] + t->tr[j][I2I], // MI->MI gap extension I2I in template // bMI[i][j] // ); simd_float mm_mi_vec = simdf32_add( simdf32_add(sMM_j, q_m2m), t_m2i); // MM->MI gap opening M2I in template simd_float mi_mi_vec = simdf32_add( simdf32_add(sMI_j, q_m2m), t_i2i); // MI->MI gap extension I2I in template // if mm_mi > mi_mi { 64 } MAX2_SET_MASK(mm_mi_vec, mi_mi_vec,mi_mm_vec, byte_result_vec); sMI_i_j = simdf32_max( mm_mi_vec, mi_mi_vec ); // Cell of logic // if (cell_off[i][j]) //shift 10000000100000001000000010000000 -> 01000000010000000100000001000000 //because 10000000000000000000000000000000 = -2147483648 kills cmplt #ifdef VITERBI_CELLOFF #ifdef AVX2 simd_int matrix_vec = _mm256_set1_epi64x(sCO_MI_DG_IM_GD_MM_vec[j]>>1); matrix_vec = _mm256_shuffle_epi8(matrix_vec,shuffle_mask_celloff); #else // if(((sCO_MI_DG_IM_GD_MM_vec[j] >>1) & 0x40404040) > 0){ // std::cout << ((sCO_MI_DG_IM_GD_MM_vec[j] >>1) & 0x40404040 ) << std::endl; // } simd_int matrix_vec = simdi32_set(sCO_MI_DG_IM_GD_MM_vec[j]>>1); #endif simd_int cell_off_vec = simdi_and(matrix_vec, co_vec); simd_int res_eq_co_vec = simdi32_gt(co_vec, cell_off_vec ); // shift is because signed can't be checked here simd_float cell_off_float_min_vec = (simd_float) simdi_andnot(res_eq_co_vec, float_min_vec); // inverse sMM_i_j = simdf32_add(sMM_i_j,cell_off_float_min_vec); // add the cell off vec to sMM_i_j. Set -FLT_MAX to cell off sGD_i_j = simdf32_add(sGD_i_j,cell_off_float_min_vec); sIM_i_j = simdf32_add(sIM_i_j,cell_off_float_min_vec); sDG_i_j = simdf32_add(sDG_i_j,cell_off_float_min_vec); sMI_i_j = simdf32_add(sMI_i_j,cell_off_float_min_vec); #endif simdf32_store((float *)(sMM_DG_MI_GD_IM_vec+index_pos_j + 0), sMM_i_j); simdf32_store((float *)(sMM_DG_MI_GD_IM_vec+index_pos_j + 1), sDG_i_j); simdf32_store((float *)(sMM_DG_MI_GD_IM_vec+index_pos_j + 2), sMI_i_j); simdf32_store((float *)(sMM_DG_MI_GD_IM_vec+index_pos_j + 3), sGD_i_j); simdf32_store((float *)(sMM_DG_MI_GD_IM_vec+index_pos_j + 4), sIM_i_j); // write values back to ViterbiMatrix #ifdef AVX2 /* byte_result_vec 000H 000G 000F 000E 000D 000C 000B 000A */ /* abcdefgh 0000 0000 HGFE 0000 0000 0000 0000 DCBA */ const __m256i abcdefgh = _mm256_shuffle_epi8(byte_result_vec, shuffle_mask_extract); /* abcd 0000 0000 0000 DCBA */ const __m128i abcd = _mm256_castsi256_si128(abcdefgh); /* efgh 0000 0000 HGFE 0000 */ const __m128i efgh = _mm256_extracti128_si256(abcdefgh, 1); _mm_storel_epi64((__m128i*)&sCO_MI_DG_IM_GD_MM_vec[j], _mm_or_si128(abcd, efgh)); #else byte_result_vec = _mm_packs_epi32(byte_result_vec, byte_result_vec); byte_result_vec = _mm_packus_epi16(byte_result_vec, byte_result_vec); int int_result = _mm_cvtsi128_si32(byte_result_vec); sCO_MI_DG_IM_GD_MM_vec[j] = int_result; #endif // Find maximum score; global alignment: maxize only over last row and last column // if(sMM_i_j>score && (par.loc || i==q->L)) { i2=i; j2=j; score=sMM_i_j; } if (findMaxInnerLoop){ // new score is higer // output // 0 0 0 MAX simd_int lookup_mask_hi = (simd_int) simdf32_gt(sMM_i_j,score_vec); // old score is higher // output // MAX MAX MAX 0 simd_int lookup_mask_lo = (simd_int) simdf32_lt(sMM_i_j,score_vec); simd_int curr_pos_j = simdi32_set(j); simd_int new_j_pos_hi = simdi_and(lookup_mask_hi,curr_pos_j); simd_int old_j_pos_lo = simdi_and(lookup_mask_lo,j2_vec); j2_vec = simdi32_add(new_j_pos_hi,old_j_pos_lo); simd_int curr_pos_i = simdi32_set(i); simd_int new_i_pos_hi = simdi_and(lookup_mask_hi,curr_pos_i); simd_int old_i_pos_lo = simdi_and(lookup_mask_lo,i2_vec); i2_vec = simdi32_add(new_i_pos_hi,old_i_pos_lo); score_vec=simdf32_max(sMM_i_j,score_vec); } } //end for j // if global alignment: look for best cell in last column if (!local){ // new score is higer // output // 0 0 0 MAX simd_int lookup_mask_hi = (simd_int) simdf32_gt(sMM_i_j,score_vec); // old score is higher // output // MAX MAX MAX 0 simd_int lookup_mask_lo = (simd_int) simdf32_lt(sMM_i_j,score_vec); simd_int curr_pos_j = simdi32_set(j); simd_int new_j_pos_hi = simdi_and(lookup_mask_hi,curr_pos_j); simd_int old_j_pos_lo = simdi_and(lookup_mask_lo,j2_vec); j2_vec = simdi32_add(new_j_pos_hi,old_j_pos_lo); simd_int curr_pos_i = simdi32_set(i); simd_int new_i_pos_hi = simdi_and(lookup_mask_hi,curr_pos_i); simd_int old_i_pos_lo = simdi_and(lookup_mask_lo,i2_vec); i2_vec = simdi32_add(new_i_pos_hi,old_i_pos_lo); score_vec = simdf32_max(sMM_i_j,score_vec); } // end for j } // end for i for(int seq_index=0; seq_index < maxres; seq_index++){ result->score[seq_index]=((float*)&score_vec)[seq_index]; result->i[seq_index] = ((int*)&i2_vec)[seq_index]; result->j[seq_index] = ((int*)&j2_vec)[seq_index]; // std::cout << seq_index << "\t" << result->score[seq_index] << "\t" << result->i[seq_index] <<"\t" << result->j[seq_index] << std::endl; } // printf("Template=%-12.12s i=%-4i j=%-4i score=%6.3f\n",t->name,i2,j2,score); }
static INLINE void init_one_qp(const __m128i *p, __m256i *qp) { const __m128i zero = _mm_setzero_si128(); const __m128i dc = _mm_unpacklo_epi16(*p, zero); const __m128i ac = _mm_unpackhi_epi16(*p, zero); *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1); }
__m128i l0, l1; __m256i str, mask, res, blockmask; __m256i s2mask, s3mask, s4mask, s5mask; /* _mm256_shuffle_epi8 works on 128-bit lanes, so we need to get * the two 128-bit lanes into big-endian order separately: */ l0 = _mm_loadu_si128((__m128i *)c); l0 = _mm_shuffle_epi8(l0, _mm_setr_epi8(2, 2, 1, 0, 5, 5, 4, 3, 8, 8, 7, 6, 11, 11, 10, 9)); l1 = _mm_loadu_si128((__m128i *)&c[12]); l1 = _mm_shuffle_epi8(l1, _mm_setr_epi8(2, 2, 1, 0, 5, 5, 4, 3, 8, 8, 7, 6, 11, 11, 10, 9)); /* Combine into a single 256-bit register: */ str = _mm256_castsi128_si256(l0); str = _mm256_insertf128_si256(str, l1, 1); /* Mask to pass through only the lower 6 bits of one byte: */ mask = _mm256_set1_epi32(0x3F000000); /* Shift bits by 2, mask in only the first byte: */ res = _mm256_and_si256(_mm256_srli_epi32(str, 2), mask); mask = _mm256_srli_epi32(mask, 8); /* Shift bits by 4, mask in only the second byte: */ res = _mm256_or_si256(res, _mm256_and_si256(_mm256_srli_epi32(str, 4), mask)); mask = _mm256_srli_epi32(mask, 8); /* Shift bits by 6, mask in only the third byte: */ res = _mm256_or_si256(res, _mm256_and_si256(_mm256_srli_epi32(str, 6) , mask));
void aom_highbd_comp_mask_pred_avx2(uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask) { int i = 0; uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); const uint16_t *src0 = invert_mask ? pred : ref; const uint16_t *src1 = invert_mask ? ref : pred; const int stride0 = invert_mask ? width : ref_stride; const int stride1 = invert_mask ? ref_stride : width; const __m256i zero = _mm256_setzero_si256(); if (width == 8) { do { const __m256i s0 = mm256_loadu2_16(src0 + stride0, src0); const __m256i s1 = mm256_loadu2_16(src1 + stride1, src1); const __m128i m_l = _mm_loadl_epi64((const __m128i *)mask); const __m128i m_h = _mm_loadl_epi64((const __m128i *)(mask + 8)); __m256i m = _mm256_castsi128_si256(m_l); m = _mm256_insertf128_si256(m, m_h, 1); const __m256i m_16 = _mm256_unpacklo_epi8(m, zero); const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16); _mm_storeu_si128((__m128i *)(comp_pred), _mm256_castsi256_si128(comp)); _mm_storeu_si128((__m128i *)(comp_pred + width), _mm256_extractf128_si256(comp, 1)); src0 += (stride0 << 1); src1 += (stride1 << 1); mask += (mask_stride << 1); comp_pred += (width << 1); i += 2; } while (i < height); } else if (width == 16) { do { const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src0)); const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src1)); const __m256i m_16 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask)); const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16); _mm256_storeu_si256((__m256i *)comp_pred, comp); src0 += stride0; src1 += stride1; mask += mask_stride; comp_pred += width; i += 1; } while (i < height); } else if (width == 32) { do { const __m256i s0 = _mm256_loadu_si256((const __m256i *)src0); const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src0 + 16)); const __m256i s1 = _mm256_loadu_si256((const __m256i *)src1); const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src1 + 16)); const __m256i m01_16 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask)); const __m256i m23_16 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(mask + 16))); const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m01_16); const __m256i comp1 = highbd_comp_mask_pred_line_avx2(s2, s3, m23_16); _mm256_storeu_si256((__m256i *)comp_pred, comp); _mm256_storeu_si256((__m256i *)(comp_pred + 16), comp1); src0 += stride0; src1 += stride1; mask += mask_stride; comp_pred += width; i += 1; } while (i < height); } }
static INLINE __m256i mm256_loadu2_16(const uint16_t *p0, const uint16_t *p1) { const __m256i d = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1)); return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1); }
static INLINE __m256i xx_loadu2_m128i(const void *hi, const void *lo) { __m128i a0 = _mm_lddqu_si128((const __m128i *)(lo)); __m128i a1 = _mm_lddqu_si128((const __m128i *)(hi)); __m256i a = _mm256_castsi128_si256(a0); return _mm256_inserti128_si256(a, a1, 1); }
_mm256_loadu2_m128i(const __m128i* const hiaddr, const __m128i* const loaddr) { return _mm256_inserti128_si256( _mm256_castsi128_si256(_mm_loadu_si128(loaddr)), _mm_loadu_si128(hiaddr), 1); }
INLINE avxi( const ssei& a, const ssei& b ) : m256(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {}
INLINE explicit avxi( const ssei& a ) : m256(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {}
static inline __m256d gmx_mm256_exp2_pd(__m256d x) { /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */ const __m256d arglimit = _mm256_set1_pd(1022.0); const __m128i expbase = _mm_set1_epi32(1023); const __m256d P2 = _mm256_set1_pd(2.30933477057345225087e-2); const __m256d P1 = _mm256_set1_pd(2.02020656693165307700e1); const __m256d P0 = _mm256_set1_pd(1.51390680115615096133e3); /* Q2 == 1.0 */ const __m256d Q1 = _mm256_set1_pd(2.33184211722314911771e2); const __m256d Q0 = _mm256_set1_pd(4.36821166879210612817e3); const __m256d one = _mm256_set1_pd(1.0); const __m256d two = _mm256_set1_pd(2.0); __m256d valuemask; __m256i iexppart; __m128i iexppart128a, iexppart128b; __m256d fexppart; __m256d intpart; __m256d z, z2; __m256d PolyP, PolyQ; iexppart128a = _mm256_cvtpd_epi32(x); intpart = _mm256_round_pd(x, _MM_FROUND_TO_NEAREST_INT); /* Add exponent bias */ iexppart128a = _mm_add_epi32(iexppart128a, expbase); /* We now want to shift the exponent 52 positions left, but to achieve this we need * to separate the 128-bit register data into two registers (4x64-bit > 128bit) * shift them, and then merge into a single __m256d. * Elements 0/1 should end up in iexppart128a, and 2/3 in iexppart128b. * It doesnt matter what we put in the 2nd/4th position, since that data will be * shifted out and replaced with zeros. */ iexppart128b = _mm_shuffle_epi32(iexppart128a, _MM_SHUFFLE(3, 3, 2, 2)); iexppart128a = _mm_shuffle_epi32(iexppart128a, _MM_SHUFFLE(1, 1, 0, 0)); iexppart128b = _mm_slli_epi64(iexppart128b, 52); iexppart128a = _mm_slli_epi64(iexppart128a, 52); iexppart = _mm256_castsi128_si256(iexppart128a); iexppart = _mm256_insertf128_si256(iexppart, iexppart128b, 0x1); valuemask = _mm256_cmp_pd(arglimit, gmx_mm256_abs_pd(x), _CMP_GE_OQ); fexppart = _mm256_and_pd(valuemask, _mm256_castsi256_pd(iexppart)); z = _mm256_sub_pd(x, intpart); z2 = _mm256_mul_pd(z, z); PolyP = _mm256_mul_pd(P2, z2); PolyP = _mm256_add_pd(PolyP, P1); PolyQ = _mm256_add_pd(z2, Q1); PolyP = _mm256_mul_pd(PolyP, z2); PolyQ = _mm256_mul_pd(PolyQ, z2); PolyP = _mm256_add_pd(PolyP, P0); PolyQ = _mm256_add_pd(PolyQ, Q0); PolyP = _mm256_mul_pd(PolyP, z); z = _mm256_mul_pd(PolyP, gmx_mm256_inv_pd(_mm256_sub_pd(PolyQ, PolyP))); z = _mm256_add_pd(one, _mm256_mul_pd(two, z)); z = _mm256_mul_pd(z, fexppart); return z; }