/*
 * Load one reference row for Coefs(): bytes 0..7 of `row` are widened to
 * eight 16-bit lanes in *taps0, bytes 2..9 in *taps2.
 */
static inline void coefs_load_row(const unsigned char *row,
                                  __m128i *taps0, __m128i *taps2)
{
    const __m128i zero = _mm_setzero_si128();

    /* _mm_loadl_epi64 has no alignment requirement, so bytes 2..9 are
     * fetched directly rather than patched in through a 16-bit pointer
     * cast (which was a misaligned, type-punned read). */
    *taps0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)row), zero);
    *taps2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(row + 2)), zero);
}

/*
 * Coefs - per-lane weighted sum of a 2x2 neighbourhood, 8 results per row.
 *
 * For every output row i in [0, n) and lane x in [0, 8):
 *
 *   out[i][x] = (32 + r[i][x]   * c0[x] + r[i][x+2]   * c1[x]
 *                   + r[i+1][x] * c2[x] + r[i+1][x+2] * c3[x]) >> 6
 *
 * All arithmetic is 16-bit and wraps; the final shift is logical.
 *
 * current_part_ptr    destination; each row receives eight 16-bit results
 *                     (16 bytes) through an aligned store, so the pointer
 *                     and stride must keep every row 16-byte aligned.
 * current_part_stride byte distance between destination rows.
 * ref_part_ptr        reference pixels (bytes); n + 1 rows are read and
 *                     bytes 0..9 of each row are touched.
 * ref_part_stride     byte distance between reference rows.
 * coef_buf            four consecutive vectors of eight 16-bit weights
 *                     (c0..c3), 16-byte aligned.
 * n                   number of output rows; nothing is read or written
 *                     when n <= 0.
 */
void Coefs(unsigned char *current_part_ptr, int current_part_stride,
           unsigned char *ref_part_ptr, int ref_part_stride,
           unsigned char *coef_buf, int n)
{
    const __m128i *coef_ptr = (const __m128i *)coef_buf;
    __m128i rounding, c0, c1, c2, c3;
    __m128i cur0, cur2;
    int i;

    if (n <= 0)
        return;

    /* Loop invariants: rounding term and the four weight vectors. */
    rounding = _mm_set1_epi16(32);
    c0 = _mm_load_si128(coef_ptr + 0);
    c1 = _mm_load_si128(coef_ptr + 1);
    c2 = _mm_load_si128(coef_ptr + 2);
    c3 = _mm_load_si128(coef_ptr + 3);

    coefs_load_row(ref_part_ptr, &cur0, &cur2);
    ref_part_ptr += ref_part_stride;

    for (i = 0; i < n; i++) {
        __m128i next0, next2, acc;

        /* Contribution of the upper row. */
        acc = _mm_add_epi16(rounding, _mm_mullo_epi16(cur0, c0));
        acc = _mm_add_epi16(acc, _mm_mullo_epi16(cur2, c1));

        /* Contribution of the lower row, which becomes the upper row of
         * the next iteration. */
        coefs_load_row(ref_part_ptr, &next0, &next2);
        ref_part_ptr += ref_part_stride;
        acc = _mm_add_epi16(acc, _mm_mullo_epi16(next0, c2));
        acc = _mm_add_epi16(acc, _mm_mullo_epi16(next2, c3));

        acc = _mm_srli_epi16(acc, 6);
        _mm_store_si128((__m128i *)current_part_ptr, acc);
        current_part_ptr += current_part_stride;

        cur0 = next0;
        cur2 = next2;
    }
}
/*
 * Convert `samples` IEEE half-precision values at `src` to single-precision
 * floats at `dst` using the F16C conversion instruction.
 *
 * Neither pointer needs any alignment beyond that of its element type: the
 * bulk loop uses an unaligned 64-bit load and an unaligned 128-bit store,
 * the tail handles one value at a time.
 *
 * Returns `samples` unchanged.  A count of zero or less converts nothing.
 */
static inline long
conv_yHalf_yF (const uint16_t *src, float *dst, long samples)
{
  long n = samples;

  /* Four halves per iteration.  _mm_loadl_epi64 reads exactly 8 bytes with
     no alignment requirement and zeroes the upper half of the register, so
     the data is never accessed through a uint64_t alias. */
  while (n >= 4)
    {
      const __m128i half4 = _mm_loadl_epi64 ((const __m128i *) src);

      _mm_storeu_ps (dst, _mm_cvtph_ps (half4));
      src += 4;
      dst += 4;
      n -= 4;
    }

  /* Remaining 1..3 values: put one half in lane 0 and store only lane 0.
     `n > 0` (rather than `n != 0`) keeps a negative count from looping. */
  while (n > 0)
    {
      const __m128i half1 = _mm_cvtsi32_si128 (*src++);

      _mm_store_ss (dst++, _mm_cvtph_ps (half1));
      n -= 1;
    }

  return samples;
}
// Lowering to pinsrw requires optimization. __m128i test_mm_insert_epi16(__m128i A, short B) { // DAG-LABEL: test_mm_insert_epi16 // DAG: [[x:%.*]] = and i32 %{{.*}}, 7 // DAG: insertelement <8 x i16> %{{.*}}, i32 [[x]] // // ASM-LABEL: test_mm_insert_epi16 // ASM: movw return _mm_insert_epi16(A, B, 8); }
rfx_dwt_2d_encode_block_horiz_sse2(INT16* src, INT16* l, INT16* h, int subband_width)
{
	/*
	 * Horizontal forward DWT pass over subband_width rows.
	 *
	 * Each inner step consumes 16 interleaved samples from src and emits
	 * 8 high-pass coefficients to h and 8 low-pass coefficients to l:
	 *
	 *   h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
	 *   l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
	 *
	 * l and h are written with aligned 128-bit stores.
	 */
	const int last_group = subband_width - 8;
	int row;
	int col;

	for (row = 0; row < subband_width; row++)
	{
		for (col = 0; col < subband_width; col += 8)
		{
			/* Right neighbour of the last even sample: the final group of a
			 * row repeats src[14] instead of reading src[16]. */
			const INT16 right_edge = (col == last_group) ? src[14] : src[16];

			/* Gathering the strided samples is the expensive part of this
			 * routine (more than half of the DWT time per the original note). */
			const __m128i even = _mm_set_epi16(src[14], src[12], src[10], src[8],
			                                   src[6], src[4], src[2], src[0]);
			const __m128i odd = _mm_set_epi16(src[15], src[13], src[11], src[9],
			                                  src[7], src[5], src[3], src[1]);
			const __m128i even_next = _mm_set_epi16(right_edge, src[14], src[12], src[10],
			                                        src[8], src[6], src[4], src[2]);
			__m128i high;
			__m128i high_prev;
			__m128i low;

			/* High-pass coefficients. */
			high = _mm_srai_epi16(_mm_add_epi16(even, even_next), 1);
			high = _mm_srai_epi16(_mm_sub_epi16(odd, high), 1);
			_mm_store_si128((__m128i*) h, high);

			/* h[n - 1]: re-read the freshly stored block shifted back by one
			 * element. For the first group of a row, lane 0 (the element
			 * before h) is replaced with a copy of lane 1, i.e. h[0]. */
			high_prev = _mm_loadu_si128((__m128i*) (h - 1));

			if (col == 0)
			{
				high_prev = _mm_insert_epi16(high_prev, _mm_extract_epi16(high_prev, 1), 0);
			}

			/* Low-pass coefficients. */
			low = _mm_srai_epi16(_mm_add_epi16(high_prev, high), 1);
			low = _mm_add_epi16(low, even);
			_mm_store_si128((__m128i*) l, low);

			src += 16;
			l += 8;
			h += 8;
		}
	}
}
// Right-edge variant of the radius-6 horizontal blur: combines the vector at
// srcp with the six vectors starting 1..6 pixels to its left through a tree
// of mm_avg_epu averages, then writes only the six highest pixels of the
// 16-byte result; the lower part of the destination keeps its old contents.
static FORCE_INLINE void blur_r6_h_right_sse2(const PixelType *srcp, PixelType *dstp) {
    const __m128i px0 = _mm_loadu_si128((const __m128i *)(srcp));
    const __m128i px1 = _mm_loadu_si128((const __m128i *)(srcp - 1));
    const __m128i px2 = _mm_loadu_si128((const __m128i *)(srcp - 2));
    const __m128i px3 = _mm_loadu_si128((const __m128i *)(srcp - 3));
    const __m128i px4 = _mm_loadu_si128((const __m128i *)(srcp - 4));
    const __m128i px5 = _mm_loadu_si128((const __m128i *)(srcp - 5));
    const __m128i px6 = _mm_loadu_si128((const __m128i *)(srcp - 6));

    // Pairwise averages of the neighbours, then fold towards the centre.
    const __m128i avg12 = mm_avg_epu<PixelType>(px1, px2);
    const __m128i avg34 = mm_avg_epu<PixelType>(px3, px4);
    const __m128i avg56 = mm_avg_epu<PixelType>(px5, px6);

    const __m128i avg012 = mm_avg_epu<PixelType>(px0, avg12);
    const __m128i avg3456 = mm_avg_epu<PixelType>(avg34, avg56);
    const __m128i avg0123456 = mm_avg_epu<PixelType>(avg012, avg3456);

    __m128i avg = mm_avg_epu<PixelType>(avg012, avg0123456);

    // This is the right edge. Only the highest six pixels are needed.
    if (sizeof(PixelType) != 1) {
        // 16-bit pixels: keep destination pixels 0 and 1 by copying them into
        // lanes 0 and 1, then store the whole vector.
        int keep = dstp[0];
        avg = _mm_insert_epi16(avg, keep, 0);
        keep = dstp[1];
        avg = _mm_insert_epi16(avg, keep, 1);
        _mm_storeu_si128((__m128i *)(dstp), avg);
        return;
    }

    // 8-bit pixels: keep destination bytes 8 and 9 by copying them into lane 4,
    // then store only the upper 8 bytes of the vector at dstp + 8.
    int keep = *(int16_t *)(dstp + 8);
    avg = _mm_insert_epi16(avg, keep, 4);
    _mm_storeh_pi((__m64 *)(dstp + 8), _mm_castsi128_ps(avg));
}
void aom_convolve8_add_src_hip_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { const int bd = 8; assert(x_step_q4 == 16 && y_step_q4 == 16); assert(!(w & 7)); (void)x_step_q4; (void)y_step_q4; uint16_t temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]; int intermediate_height = h + SUBPEL_TAPS - 1; int i, j; const int center_tap = ((SUBPEL_TAPS - 1) / 2); const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap; const __m128i zero = _mm_setzero_si128(); // Add an offset to account for the "add_src" part of the convolve function. const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3); /* Horizontal filter */ { const __m128i coeffs_x = _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset); // coeffs 0 1 0 1 2 3 2 3 const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); // coeffs 4 5 4 5 6 7 6 7 const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); // coeffs 0 1 0 1 0 1 0 1 const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 4 5 4 5 4 5 4 5 const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 6 7 6 7 6 7 6 7 const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); const __m128i round_const = _mm_set1_epi32((1 << (FILTER_BITS - EXTRAPREC_BITS - 1)) + (1 << (bd + FILTER_BITS - 1))); for (i = 0; i < intermediate_height; ++i) { for (j = 0; j < w; j += 8) { const __m128i data = _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); // Filter even-index pixels const __m128i src_0 = _mm_unpacklo_epi8(data, zero); const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero); const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero); const 
__m128i res_4 = _mm_madd_epi16(src_4, coeff_45); const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero); const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6)); res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const), FILTER_BITS - EXTRAPREC_BITS); // Filter odd-index pixels const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero); const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero); const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero); const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero); const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), _mm_add_epi32(res_3, res_7)); res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const), FILTER_BITS - EXTRAPREC_BITS); // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 __m128i res = _mm_packs_epi32(res_even, res_odd); res = _mm_min_epi16(_mm_max_epi16(res, zero), _mm_set1_epi16(EXTRAPREC_CLAMP_LIMIT(bd) - 1)); _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res); } } } /* Vertical filter */ { const __m128i coeffs_y = _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset); // coeffs 0 1 0 1 2 3 2 3 const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); // coeffs 4 5 4 5 6 7 6 7 const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); // coeffs 0 1 0 1 0 1 0 1 const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 4 5 4 5 4 5 4 5 const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 6 7 6 7 6 7 6 7 const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); const __m128i round_const = 
_mm_set1_epi32((1 << (FILTER_BITS + EXTRAPREC_BITS - 1)) - (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1))); for (i = 0; i < h; ++i) { for (j = 0; j < w; j += 8) { // Filter even-index pixels const uint16_t *data = &temp[i * MAX_SB_SIZE + j]; const __m128i src_0 = _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), *(__m128i *)(data + 1 * MAX_SB_SIZE)); const __m128i src_2 = _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), *(__m128i *)(data + 3 * MAX_SB_SIZE)); const __m128i src_4 = _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), *(__m128i *)(data + 5 * MAX_SB_SIZE)); const __m128i src_6 = _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), *(__m128i *)(data + 7 * MAX_SB_SIZE)); const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), _mm_add_epi32(res_4, res_6)); // Filter odd-index pixels const __m128i src_1 = _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), *(__m128i *)(data + 1 * MAX_SB_SIZE)); const __m128i src_3 = _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), *(__m128i *)(data + 3 * MAX_SB_SIZE)); const __m128i src_5 = _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), *(__m128i *)(data + 5 * MAX_SB_SIZE)); const __m128i src_7 = _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), *(__m128i *)(data + 7 * MAX_SB_SIZE)); const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), _mm_add_epi32(res_5, res_7)); // Rearrange pixels back into the order 0 ... 
7 const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); const __m128i res_lo_round = _mm_srai_epi32( _mm_add_epi32(res_lo, round_const), FILTER_BITS + EXTRAPREC_BITS); const __m128i res_hi_round = _mm_srai_epi32( _mm_add_epi32(res_hi, round_const), FILTER_BITS + EXTRAPREC_BITS); const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit); __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; _mm_storel_epi64(p, res_8bit); } } } }
// Horizontal "expand" import of one source row into wrk->frow.
//
// Every output sample is produced by one _mm_madd_epi16 of two neighbouring
// source pixels against the weight pair (accum, x_add - accum). Rows with
// fewer than 8 source pixels, or with x_add >= 1 << 15, are delegated to
// WebPRescalerImportRowExpand_C.
static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk,
                                         const uint8_t* src) {
  const int x_add = wrk->x_add;
  rescaler_t* frow = wrk->frow;
  const rescaler_t* const frow_end =
      frow + wrk->dst_width * wrk->num_channels;
  int accum = x_add;
  __m128i cur_pixels;

  // SSE2 implementation only works with 16b signed arithmetic at max.
  if (wrk->src_width < 8 || accum >= (1 << 15)) {
    WebPRescalerImportRowExpand_C(wrk, src);
    return;
  }

  assert(!WebPRescalerInputDone(wrk));
  assert(wrk->x_expand);

  if (wrk->num_channels != 4) {
    // Generic path: one 32-bit output per step. cur_pixels holds a window
    // of loaded pixels that is shifted down by one 16-bit lane each time
    // the accumulator underflows; `left` counts the shifts remaining
    // before the window has to be refilled.
    const uint8_t* const src_limit = src + wrk->src_width - 8;
    int left = 7;

    LoadEightPixels_SSE2(src, &cur_pixels);
    src += 7;

    for (;;) {
      const __m128i weights =
          _mm_cvtsi32_si128(((x_add - accum) << 16) | accum);
      const __m128i blended = _mm_madd_epi16(cur_pixels, weights);
      assert(sizeof(*frow) == sizeof(uint32_t));
      WebPUint32ToMem((uint8_t*)frow, _mm_cvtsi128_si32(blended));
      frow += 1;
      if (frow >= frow_end) break;

      accum -= wrk->x_sub;
      if (accum >= 0) continue;

      --left;
      if (left != 0) {
        cur_pixels = _mm_srli_si128(cur_pixels, 2);
      } else if (src <= src_limit) {
        LoadEightPixels_SSE2(src, &cur_pixels);
        src += 7;
        left = 7;
      } else {
        // tail: fewer than 8 pixels remain, feed them in one at a time.
        cur_pixels = _mm_srli_si128(cur_pixels, 2);
        cur_pixels = _mm_insert_epi16(cur_pixels, src[1], 1);
        src += 1;
        left = 1;
      }
      accum += x_add;
    }
  } else {
    // Four-channel path: four outputs (one per channel) per step, with the
    // same weight pair broadcast to every 32-bit lane.
    LoadTwoPixels_SSE2(src, &cur_pixels);
    src += 4;

    for (;;) {
      const __m128i weights =
          _mm_set1_epi32(((x_add - accum) << 16) | accum);
      const __m128i blended = _mm_madd_epi16(cur_pixels, weights);
      _mm_storeu_si128((__m128i*)frow, blended);
      frow += 4;
      if (frow >= frow_end) break;

      accum -= wrk->x_sub;
      if (accum >= 0) continue;

      LoadTwoPixels_SSE2(src, &cur_pixels);
      src += 4;
      accum += x_add;
    }
  }
  assert(accum == 0);
}
void av1_highbd_wiener_convolve_add_src_ssse3( const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bd) { assert(x_step_q4 == 16 && y_step_q4 == 16); assert(!(w & 7)); assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16); (void)x_step_q4; (void)y_step_q4; const uint16_t *const src = CONVERT_TO_SHORTPTR(src8); uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8); DECLARE_ALIGNED(16, uint16_t, temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); int intermediate_height = h + SUBPEL_TAPS - 1; int i, j; const int center_tap = ((SUBPEL_TAPS - 1) / 2); const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap; const __m128i zero = _mm_setzero_si128(); // Add an offset to account for the "add_src" part of the convolve function. const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3); /* Horizontal filter */ { const __m128i coeffs_x = _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset); // coeffs 0 1 0 1 2 3 2 3 const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); // coeffs 4 5 4 5 6 7 6 7 const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); // coeffs 0 1 0 1 0 1 0 1 const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 4 5 4 5 4 5 4 5 const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 6 7 6 7 6 7 6 7 const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); const __m128i round_const = _mm_set1_epi32( (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1))); for (i = 0; i < intermediate_height; ++i) { for (j = 0; j < w; j += 8) { const __m128i data = _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); const __m128i data2 = _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]); // Filter even-index pixels const __m128i 
res_0 = _mm_madd_epi16(data, coeff_01); const __m128i res_2 = _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23); const __m128i res_4 = _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45); const __m128i res_6 = _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67); __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6)); res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const), conv_params->round_0); // Filter odd-index pixels const __m128i res_1 = _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01); const __m128i res_3 = _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23); const __m128i res_5 = _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45); const __m128i res_7 = _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67); __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), _mm_add_epi32(res_3, res_7)); res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const), conv_params->round_0); // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 const __m128i maxval = _mm_set1_epi16((WIENER_CLAMP_LIMIT(conv_params->round_0, bd)) - 1); __m128i res = _mm_packs_epi32(res_even, res_odd); res = _mm_min_epi16(_mm_max_epi16(res, zero), maxval); _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res); } } } /* Vertical filter */ { const __m128i coeffs_y = _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset); // coeffs 0 1 0 1 2 3 2 3 const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); // coeffs 4 5 4 5 6 7 6 7 const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); // coeffs 0 1 0 1 0 1 0 1 const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 4 5 4 5 4 5 4 5 const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 6 7 6 7 6 7 6 7 const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); const __m128i round_const = _mm_set1_epi32((1 << 
(conv_params->round_1 - 1)) - (1 << (bd + conv_params->round_1 - 1))); for (i = 0; i < h; ++i) { for (j = 0; j < w; j += 8) { // Filter even-index pixels const uint16_t *data = &temp[i * MAX_SB_SIZE + j]; const __m128i src_0 = _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), *(__m128i *)(data + 1 * MAX_SB_SIZE)); const __m128i src_2 = _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), *(__m128i *)(data + 3 * MAX_SB_SIZE)); const __m128i src_4 = _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), *(__m128i *)(data + 5 * MAX_SB_SIZE)); const __m128i src_6 = _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), *(__m128i *)(data + 7 * MAX_SB_SIZE)); const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), _mm_add_epi32(res_4, res_6)); // Filter odd-index pixels const __m128i src_1 = _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), *(__m128i *)(data + 1 * MAX_SB_SIZE)); const __m128i src_3 = _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), *(__m128i *)(data + 3 * MAX_SB_SIZE)); const __m128i src_5 = _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), *(__m128i *)(data + 5 * MAX_SB_SIZE)); const __m128i src_7 = _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), *(__m128i *)(data + 7 * MAX_SB_SIZE)); const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), _mm_add_epi32(res_5, res_7)); // Rearrange pixels back into the order 0 ... 
7 const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); const __m128i res_lo_round = _mm_srai_epi32( _mm_add_epi32(res_lo, round_const), conv_params->round_1); const __m128i res_hi_round = _mm_srai_epi32( _mm_add_epi32(res_hi, round_const), conv_params->round_1); const __m128i maxval = _mm_set1_epi16((1 << bd) - 1); __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); res_16bit = _mm_min_epi16(_mm_max_epi16(res_16bit, zero), maxval); __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; _mm_storeu_si128(p, res_16bit); } } } }
rfx_dwt_2d_decode_block_horiz_sse2(INT16* l, INT16* h, INT16* dst, int subband_width)
{
	/*
	 * Horizontal inverse DWT pass over subband_width rows.
	 *
	 * Per row, a first sweep rewrites the low band in place with the even
	 * output samples; a second sweep derives the odd samples from them and
	 * interleaves both into dst:
	 *
	 *   dst[2n]     = l[n] - ((h[n - 1] + h[n] + 1) >> 1)
	 *   dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1)
	 *
	 * l, h and dst are accessed with aligned 128-bit loads and stores.
	 */
	const __m128i one = _mm_set1_epi16(1);
	const int last_group = subband_width - 8;
	INT16* low = l;
	INT16* high = h;
	INT16* out = dst;
	int row;
	int col;

	for (row = 0; row < subband_width; row++)
	{
		/* Even coefficients */
		for (col = 0; col < subband_width; col += 8)
		{
			const __m128i low_n = _mm_load_si128((__m128i*) low);
			const __m128i high_n = _mm_load_si128((__m128i*) high);
			/* h[n - 1]; for the first group lane 0 is replaced with a copy
			 * of lane 1, i.e. h[0]. */
			__m128i high_prev = _mm_loadu_si128((__m128i*) (high - 1));
			__m128i avg;

			if (col == 0)
			{
				high_prev = _mm_insert_epi16(high_prev, _mm_extract_epi16(high_prev, 1), 0);
			}

			avg = _mm_add_epi16(_mm_add_epi16(high_n, high_prev), one);
			avg = _mm_srai_epi16(avg, 1);
			_mm_store_si128((__m128i*) low, _mm_sub_epi16(low_n, avg));

			low += 8;
			high += 8;
		}

		/* Rewind both bands to the start of the row for the second sweep. */
		low -= subband_width;
		high -= subband_width;

		/* Odd coefficients */
		for (col = 0; col < subband_width; col += 8)
		{
			const __m128i high_x2 = _mm_slli_epi16(_mm_load_si128((__m128i*) high), 1);
			const __m128i even = _mm_load_si128((__m128i*) (low));
			/* dst[2n + 2]; for the last group lane 7 is replaced with a copy
			 * of lane 6 instead of using the element past the row. */
			__m128i even_next = _mm_loadu_si128((__m128i*) (low + 1));
			__m128i odd;

			if (col == last_group)
			{
				even_next = _mm_insert_epi16(even_next, _mm_extract_epi16(even_next, 6), 7);
			}

			odd = _mm_srai_epi16(_mm_add_epi16(even_next, even), 1);
			odd = _mm_add_epi16(odd, high_x2);

			/* Interleave even/odd samples into 16 consecutive outputs. */
			_mm_store_si128((__m128i*) out, _mm_unpacklo_epi16(even, odd));
			_mm_store_si128((__m128i*) (out + 8), _mm_unpackhi_epi16(even, odd));

			low += 8;
			high += 8;
			out += 16;
		}
	}
}
/*
 * Striped Smith-Waterman local alignment score using eight signed 16-bit
 * lanes per vector.
 *
 * All cell values are kept biased by -32768 so the whole range of a short
 * is available; the bias is removed from the returned score.
 *
 * query_sequence     not referenced by this routine.
 * query_profile_word score profile read with aligned vector loads; the
 *                    block for database residue r starts at vector
 *                    r * ((query_length + 7) / 8).
 * query_length       query length; determines the number of vectors per
 *                    column.
 * db_sequence        database residues (indices into the profile).
 * db_length          number of database residues.
 * gap_open           value subtracted from H when a gap is opened.
 * gap_extend         value subtracted from E/F when a gap is extended.
 * f_str              f_str->workspace is used as three consecutive arrays
 *                    of (query_length + 7) / 8 vectors (E, and two H
 *                    columns), accessed with aligned loads/stores.
 *
 * Returns the largest H value seen, with the bias removed.
 */
int
smith_waterman_sse2_word(const unsigned char * query_sequence,
                         unsigned short * query_profile_word,
                         const int query_length,
                         const unsigned char * db_sequence,
                         const int db_length,
                         unsigned short gap_open,
                         unsigned short gap_extend,
                         struct f_struct * f_str)
{
    const int stripes = (query_length + 7) / 8;

    /* Penalties broadcast to every lane. */
    const __m128i open_pen = _mm_set1_epi16((short) gap_open);
    const __m128i ext_pen = _mm_set1_epi16((short) gap_extend);

    /* -32768 in every lane, and in lane 0 only (other lanes zero). */
    const __m128i floor_all = _mm_set1_epi16(-32768);
    const __m128i floor_lane0 = _mm_cvtsi32_si128(0x8000);

    __m128i *const workspace = (__m128i *) f_str->workspace;
    __m128i *e_col = workspace;
    __m128i *h_cur = e_col + stripes;     /* column written in this step */
    __m128i *h_prev = h_cur + stripes;    /* column read in this step */

    __m128i best = floor_all;
    __m128i E, F, H;
    short score;
    int col, s;

    /* Reset the E array and the first H column to the biased zero. */
    for (s = 0; s < 2 * stripes; s++)
    {
        _mm_store_si128(workspace + s, floor_all);
    }

    for (col = 0; col < db_length; ++col)
    {
        const __m128i *profile =
            (__m128i *) query_profile_word + db_sequence[col] * stripes;
        __m128i *swap;

        /* bias all elements in F to -32768 */
        F = floor_all;

        /* Diagonal input for the first stripe: last stripe of the previous
         * column shifted up by one lane, with the biased zero entering
         * lane 0. */
        H = _mm_load_si128(h_cur + stripes - 1);
        H = _mm_or_si128(_mm_slli_si128(H, 2), floor_lane0);

        /* The previous column becomes the one to read from. */
        swap = h_prev;
        h_prev = h_cur;
        h_cur = swap;

        for (s = 0; s < stripes; s++)
        {
            E = _mm_load_si128(e_col + s);

            /* add score to H and track the best value seen so far */
            H = _mm_adds_epi16(H, _mm_load_si128(profile + s));
            best = _mm_max_epi16(best, H);

            /* get max from H, E and F, then save H */
            H = _mm_max_epi16(_mm_max_epi16(H, E), F);
            _mm_store_si128(h_cur + s, H);

            /* opening a gap from this cell */
            H = _mm_subs_epi16(H, open_pen);

            /* extend or open: E for the next column, F for the next stripe */
            E = _mm_max_epi16(_mm_subs_epi16(E, ext_pen), H);
            F = _mm_max_epi16(_mm_subs_epi16(F, ext_pen), H);
            _mm_store_si128(e_col + s, E);

            /* load the next h value */
            H = _mm_load_si128(h_prev + s);
        }

        /* The computed F belongs to the end of the column; shift it over by
         * one lane and keep propagating it through the stripes for as long
         * as it can still raise some H value. */
        s = 0;
        F = _mm_or_si128(_mm_slli_si128(F, 2), floor_lane0);

        for (;;)
        {
            H = _mm_load_si128(h_cur + s);
            if (_mm_movemask_epi8(
                    _mm_cmpgt_epi16(F, _mm_subs_epi16(H, open_pen))) == 0x0000)
            {
                break;
            }

            E = _mm_load_si128(e_col + s);

            H = _mm_max_epi16(H, F);
            _mm_store_si128(h_cur + s, H);

            /* update E in case the new H value would change it */
            H = _mm_subs_epi16(H, open_pen);
            E = _mm_max_epi16(E, H);
            _mm_store_si128(e_col + s, E);

            F = _mm_subs_epi16(F, ext_pen);

            s++;
            if (s >= stripes)
            {
                s = 0;
                F = _mm_or_si128(_mm_slli_si128(F, 2), floor_lane0);
            }
        }
    }

    /* Horizontal maximum of the eight lanes, ending up in lane 0. */
    best = _mm_max_epi16(best, _mm_srli_si128(best, 8));
    best = _mm_max_epi16(best, _mm_srli_si128(best, 4));
    best = _mm_max_epi16(best, _mm_srli_si128(best, 2));

    score = _mm_extract_epi16(best, 0);

    /* return largest score biased by 32768 */
    return score + 32768;
}
/*
 * wcsstr - find the first occurrence of wcs2 inside wcs1.
 *
 * Returns a pointer to the start of the first match in wcs1, wcs1 itself
 * when wcs2 is empty, or NULL when wcs2 does not occur.
 *
 * Three implementations are selected at run time through __isa_available:
 *   - above __ISA_AVAILABLE_SSE2: packed string-compare intrinsics
 *     (_mm_cmpistr*),
 *   - exactly __ISA_AVAILABLE_SSE2: SSE2 compare/movemask scanning,
 *   - otherwise: plain character loop.
 * Vector loads are only issued where XMM_PAGE_SAFE() allows it; elsewhere
 * the code falls back to character-at-a-time comparison.
 */
wchar_t * __cdecl wcsstr (
        const wchar_t * wcs1,
        const wchar_t * wcs2
        )
{
    const wchar_t *stmp1, *stmp2;
    __m128i zero, pattern, characters1, characters2;

    // An empty search string matches everything.
    if (0 == *wcs2)
        return (wchar_t *)wcs1;

    if (__isa_available > __ISA_AVAILABLE_SSE2)
    {
        wchar_t c;
        unsigned i;

        // Load XMM with first characters of wcs2.
        if (XMM_PAGE_SAFE(wcs2))
        {
            pattern = _mm_loadu_si128((__m128i*)wcs2);
        }
        else
        {
            // Start from an explicit zero vector (previously produced by
            // XOR-ing the uninitialized variable with itself), then shift
            // the characters in one at a time, padding with zeros once the
            // terminator has been reached.
            pattern = _mm_setzero_si128();
            c = *(stmp2 = wcs2);
            for (i = 0; i < XMM_CHARS; ++i)
            {
                pattern = _mm_srli_si128(pattern, sizeof(wchar_t));
                pattern = _mm_insert_epi16(pattern, c, (XMM_CHARS-1));
                if (0 != c) c = *++stmp2;
            }
        }

        for(;;)
        {
            // Check for partial match, if none step forward and continue.
            if (XMM_PAGE_SAFE(wcs1))
            {
                characters1 = _mm_loadu_si128((__m128i*)wcs1);
                // If no potential match or end found, try next XMMWORD.
                if (_mm_cmpistra(pattern, characters1, f_srch_sub))
                {
                    wcs1 += XMM_CHARS;
                    continue;
                }
                // If end found there was no match.
                else if (!_mm_cmpistrc(pattern, characters1, f_srch_sub))
                {
                    return NULL;
                }

                // Get position of potential match.
                wcs1 += _mm_cmpistri(pattern, characters1, f_srch_sub);
            }
            else
            {
                // If end of string found there was no match.
                if (0 == *wcs1)
                {
                    return NULL;
                }

                // If current character doesn't match first character
                // of search string try next character.
                if (*wcs1 != *wcs2)
                {
                    ++wcs1;
                    continue;
                }
            }

            // Potential match, compare to check for full match.
            stmp1 = wcs1;
            stmp2 = wcs2;
            for (;;)
            {
                // If next XMMWORD is page-safe for each string
                // do a XMMWORD comparison.
                if (XMM_PAGE_SAFE(stmp1) && XMM_PAGE_SAFE(stmp2))
                {
                    characters1 = _mm_loadu_si128((__m128i*)stmp1);
                    characters2 = _mm_loadu_si128((__m128i*)stmp2);

                    // If unequal then no match found.
                    if (!_mm_cmpistro(characters2, characters1, f_srch_sub))
                    {
                        break;
                    }
                    // If end of search string then match found.
                    else if (_mm_cmpistrs(characters2, characters1, f_srch_sub))
                    {
                        return (wchar_t *)wcs1;
                    }

                    stmp1 += XMM_CHARS;
                    stmp2 += XMM_CHARS;
                    continue;
                }
                // Compare next character.
                else
                {
                    // If end of search string then match found.
                    if (0 == *stmp2)
                    {
                        return (wchar_t *)wcs1;
                    }

                    // If unequal then no match found.
                    if (*stmp1 != *stmp2)
                    {
                        break;
                    }

                    // Character matched - try next character.
                    ++stmp1;
                    ++stmp2;
                }
            }

            // Match not found at current position, try next.
            ++wcs1;
        }
    }
    else if (__isa_available == __ISA_AVAILABLE_SSE2)
    {
        unsigned offset, mask;

        // Build search pattern and zero pattern. Search pattern is
        // XMMWORD with the initial character of the search string
        // in every position. Zero pattern has a zero termination
        // character in every position.
        pattern = _mm_cvtsi32_si128(wcs2[0]);
        pattern = _mm_shufflelo_epi16(pattern, 0);
        pattern = _mm_shuffle_epi32(pattern, 0);
        // Explicit zero vector instead of XOR-ing an uninitialized value.
        zero = _mm_setzero_si128();

        // Main loop for searching wcs1.
        for (;;)
        {
            // If XMM check is safe advance wcs1 to the next
            // possible match or end.
            if (XMM_PAGE_SAFE(wcs1))
            {
                characters1 = _mm_loadu_si128((__m128i*)wcs1);
                characters2 = _mm_cmpeq_epi16(characters1, zero);
                characters1 = _mm_cmpeq_epi16(characters1, pattern);
                characters1 = _mm_or_si128(characters1, characters2);
                mask = _mm_movemask_epi8(characters1);

                // If no character match or end found try next XMMWORD.
                if (0 == mask)
                {
                    wcs1 += XMM_CHARS;
                    continue;
                }

                // Advance wcs1 pointer to next possible match or end.
                _BitScanForward(&offset, mask);
                wcs1 += (offset/sizeof(wchar_t));
            }

            // If at the end of wcs1, then no match found.
            if (0 == wcs1[0])
                return NULL;

            // If a first-character match is found compare
            // strings to look for match.
            if (wcs2[0] == wcs1[0])
            {
                stmp1 = wcs1;
                stmp2 = wcs2;
                for (;;)
                {
                    // If aligned as specified advance to next
                    // possible difference or wcs2 end.
                    if (XMM_PAGE_SAFE(stmp2) && XMM_PAGE_SAFE(stmp1))
                    {
                        characters1 = _mm_loadu_si128((__m128i*)stmp1);
                        characters2 = _mm_loadu_si128((__m128i*)stmp2);
                        characters1 = _mm_cmpeq_epi16(characters1, characters2);
                        characters2 = _mm_cmpeq_epi16(characters2, zero);
                        characters1 = _mm_cmpeq_epi16(characters1, zero);
                        characters1 = _mm_or_si128(characters1, characters2);
                        mask = _mm_movemask_epi8(characters1);

                        // If mask is zero there is no difference and
                        // wcs2 does not end in this XMMWORD. Continue
                        // with next XMMWORD.
                        if (0 == mask)
                        {
                            stmp1 += XMM_CHARS;
                            stmp2 += XMM_CHARS;
                            continue;
                        }

                        // Advance string pointers to next significant
                        // character.
                        _BitScanForward(&offset, mask);
                        stmp1 += (offset/sizeof(wchar_t));
                        stmp2 += (offset/sizeof(wchar_t));
                    }

                    // If we've reached the end of wcs2 then a match
                    // has been found.
                    if (0 == stmp2[0])
                        return (wchar_t *)wcs1;

                    // If we've reached a difference then no match
                    // was found.
                    if (stmp1[0] != stmp2[0])
                        break;

                    // Otherwise advance to next character and try
                    // again.
                    ++stmp1;
                    ++stmp2;
                }
            }

            // Current character wasn't a match, try next character.
            ++wcs1;
        }
    }
    else
    {
        // Portable fallback: try every start position in turn.
        const wchar_t *cp = wcs1;
        const wchar_t *s1, *s2;

        while (*cp)
        {
            s1 = cp;
            s2 = wcs2;

            while ( *s1 && *s2 && !(*s1-*s2) )
                s1++, s2++;

            if (!*s2)
                return (wchar_t *) cp;

            cp++;
        }

        return NULL;
    }
}
// Warps eight 8-bit pixels of one row, starting at column x, and stores them
// to dstp + x.
//
// The displacement of each pixel is derived from the edge mask: the horizontal
// component from (left - right) and the vertical one from (above - below),
// both scaled by `depth`. The low 7 bits of each scaled component are kept as
// a blend weight (0..127); the remaining bits select the source position.
// Two horizontally adjacent source bytes are fetched from each of two
// consecutive source rows and blended bilinearly with those weights.
//
// SMAGL is not declared in this block; it is used here both as a shift count
// and as the log2 of the horizontal step between output pixels in the source.
// NOTE(review): one_stride presumably holds interleaved (src_stride, 1) words
// so that madd yields v * stride + h -- confirm against the caller.
static FORCE_INLINE void warp_mmword_u8_sse2(const uint8_t *srcp, const uint8_t *edgep, uint8_t *dstp, int src_stride, int edge_stride, int height, int x, int y, const __m128i &depth, const __m128i &zero, const __m128i &x_limit_min, const __m128i &x_limit_max, const __m128i &y_limit_min, const __m128i &y_limit_max, const __m128i &word_64, const __m128i &word_127, const __m128i &word_128, const __m128i &word_255, const __m128i &one_stride) {
    const int pixel_step = 1 << SMAGL;

    // ---- displacement from the edge mask --------------------------------

    // The first and last rows reuse the current row instead of reading
    // outside the edge plane.
    const uint8_t *edge_px = edgep + x;
    const int up = y ? edge_stride : 0;
    const int down = (y < height - 1) ? edge_stride : 0;

    const __m128i edge_above = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(edge_px - up)), zero);
    const __m128i edge_below = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(edge_px + down)), zero);
    const __m128i edge_left = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(edge_px - 1)), zero);
    const __m128i edge_right = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(edge_px + 1)), zero);

    __m128i dx = _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(edge_left, edge_right), 7), depth);
    __m128i dy = _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(edge_above, edge_below), 7), depth);

    dy = _mm_min_epi16(_mm_max_epi16(dy, y_limit_min), y_limit_max);

    // Fractional parts (blend weights) of the displacement.
    __m128i frac_x = dx;
    __m128i frac_y = dy;

    if (SMAGL) {
        frac_x = _mm_slli_epi16(frac_x, SMAGL);
        frac_y = _mm_slli_epi16(frac_y, SMAGL);
    }

    frac_x = _mm_and_si128(frac_x, word_127);
    frac_y = _mm_and_si128(frac_y, word_127);

    // Integer parts of the displacement.
    dx = _mm_srai_epi16(dx, 7 - SMAGL);
    dy = _mm_srai_epi16(dy, 7 - SMAGL);

    // Make the horizontal displacement absolute by adding the column base.
    const __m128i column_base = _mm_set1_epi32(x << SMAGL);
    dx = _mm_adds_epi16(dx, _mm_packs_epi32(column_base, column_base));

    // Drop the horizontal weight wherever the position gets clamped.
    const __m128i under_max = _mm_cmpgt_epi16(x_limit_max, dx);
    const __m128i under_min = _mm_cmpgt_epi16(x_limit_min, dx);
    frac_x = _mm_andnot_si128(under_min, _mm_and_si128(frac_x, under_max));

    dx = _mm_min_epi16(_mm_max_epi16(dx, x_limit_min), x_limit_max);

    // ---- gather the source pixels ---------------------------------------

    // Combine each (dy, dx) pair into one 32-bit byte offset.
    __m128i offsets_lo = _mm_madd_epi16(_mm_unpacklo_epi16(dy, dx), one_stride);
    __m128i offsets_hi = _mm_madd_epi16(_mm_unpackhi_epi16(dy, dx), one_stride);

    __m128i row0 = _mm_setzero_si128();
    __m128i row1 = _mm_setzero_si128();

    // Pops the lowest 32-bit offset from `vec` and loads the 16-bit
    // (left byte, right byte) pair at that offset from two consecutive
    // source rows into lane `lane` of row0 / row1.
#define WARP_GATHER_PIXEL_PAIR(vec, lane) \
    do { \
        const uint8_t *tap = srcp + _mm_cvtsi128_si32(vec) + (lane) * pixel_step; \
        (vec) = _mm_srli_si128((vec), 4); \
        row0 = _mm_insert_epi16(row0, *(int16_t *)(tap), (lane)); \
        row1 = _mm_insert_epi16(row1, *(int16_t *)(tap + src_stride), (lane)); \
    } while (0)

    WARP_GATHER_PIXEL_PAIR(offsets_lo, 0);
    WARP_GATHER_PIXEL_PAIR(offsets_lo, 1);
    WARP_GATHER_PIXEL_PAIR(offsets_lo, 2);
    WARP_GATHER_PIXEL_PAIR(offsets_lo, 3);
    WARP_GATHER_PIXEL_PAIR(offsets_hi, 4);
    WARP_GATHER_PIXEL_PAIR(offsets_hi, 5);
    WARP_GATHER_PIXEL_PAIR(offsets_hi, 6);
    WARP_GATHER_PIXEL_PAIR(offsets_hi, 7);

#undef WARP_GATHER_PIXEL_PAIR

    // ---- bilinear blend ---------------------------------------------------

    // Horizontal pass: low byte is the left pixel, high byte the right one.
    const __m128i weight_left = _mm_sub_epi16(word_128, frac_x);

    __m128i top = _mm_add_epi16(_mm_mullo_epi16(_mm_and_si128(row0, word_255), weight_left),
                                _mm_mullo_epi16(_mm_srli_epi16(row0, 8), frac_x));
    __m128i bottom = _mm_add_epi16(_mm_mullo_epi16(_mm_and_si128(row1, word_255), weight_left),
                                   _mm_mullo_epi16(_mm_srli_epi16(row1, 8), frac_x));

    top = _mm_srai_epi16(_mm_add_epi16(top, word_64), 7);
    bottom = _mm_srai_epi16(_mm_add_epi16(bottom, word_64), 7);

    // Vertical pass.
    top = _mm_mullo_epi16(top, _mm_sub_epi16(word_128, frac_y));
    bottom = _mm_mullo_epi16(bottom, frac_y);

    __m128i blended = _mm_add_epi16(top, bottom);
    blended = _mm_srai_epi16(_mm_add_epi16(blended, word_64), 7);

    _mm_storel_epi64((__m128i *)(dstp + x), _mm_packus_epi16(blended, blended));
}
/*
 * Byte-precision local alignment score using SSE2 (16 unsigned 8-bit lanes).
 *
 * query_sequence      - not referenced by this routine.
 * query_profile_byte  - score profile; the row for database residue r starts
 *                       at vector index r * ((query_length + 15) / 16).
 * query_length        - number of query residues.
 * db_sequence         - database residues, used as profile row indices.
 * db_length           - number of database residues.
 * bias                - value subtracted (saturating) from every profile
 *                       score after it has been added.
 * gap_open/gap_extend - gap penalties, subtracted with unsigned saturation.
 * f_str               - f_str->workspace must provide room for three
 *                       buffers of (query_length + 15) / 16 vectors each and
 *                       is accessed with aligned loads and stores.
 *
 * Returns the best score found, or 255 when score + bias reaches 255, which
 * signals that the 8-bit arithmetic may have saturated.
 */
int smith_waterman_sse2_byte(const unsigned char * query_sequence, unsigned char * query_profile_byte, const int query_length, const unsigned char * db_sequence, const int db_length, unsigned char bias, unsigned char gap_open, unsigned char gap_extend, struct f_struct * f_str)
{
    const int n_seg = (query_length + 15) / 16;

    __m128i *const work = (__m128i *) f_str->workspace;

    /* Penalties and bias replicated into all 16 byte lanes. */
    const __m128i bias_v    = _mm_set1_epi8 ((char) bias);
    const __m128i gap_open_v = _mm_set1_epi8 ((char) gap_open);
    const __m128i gap_ext_v  = _mm_set1_epi8 ((char) gap_extend);
    const __m128i zero_v     = _mm_setzero_si128 ();

    __m128i best_v = _mm_setzero_si128 ();

    /* Workspace layout: E values, then two H buffers that swap roles. */
    __m128i *e_buf  = work;
    __m128i *h_curr = e_buf + n_seg;
    __m128i *h_prev = h_curr + n_seg;

    int col;
    int seg;
    int result;

    /* Clear the E buffer and the first H buffer. */
    for (seg = 0; seg < 2 * n_seg; seg++) {
        _mm_store_si128 (work + seg, zero_v);
    }

    for (col = 0; col < db_length; ++col) {
        const __m128i *profile_row =
            (__m128i *) query_profile_byte + db_sequence[col] * n_seg;
        __m128i e_v;
        __m128i f_v = _mm_setzero_si128 ();
        __m128i h_v;
        __m128i *swap;

        /* Start from the last segment of the previous column, moved up
         * by one byte lane. */
        h_v = _mm_slli_si128 (_mm_load_si128 (h_curr + n_seg - 1), 1);

        /* The column just written becomes the one we read from. */
        swap   = h_prev;
        h_prev = h_curr;
        h_curr = swap;

        for (seg = 0; seg < n_seg; seg++) {
            e_v = _mm_load_si128 (e_buf + seg);

            /* Diagonal move: add the profile score, remove the bias. */
            h_v = _mm_adds_epu8 (h_v, profile_row[seg]);
            h_v = _mm_subs_epu8 (h_v, bias_v);

            /* Track the best score before E and F are merged in. */
            best_v = _mm_max_epu8 (best_v, h_v);

            h_v = _mm_max_epu8 (h_v, e_v);
            h_v = _mm_max_epu8 (h_v, f_v);
            _mm_store_si128 (h_curr + seg, h_v);

            /* Opening a gap from this cell. */
            h_v = _mm_subs_epu8 (h_v, gap_open_v);

            /* Either extend an existing gap or open a new one. */
            e_v = _mm_max_epu8 (_mm_subs_epu8 (e_v, gap_ext_v), h_v);
            f_v = _mm_max_epu8 (_mm_subs_epu8 (f_v, gap_ext_v), h_v);
            _mm_store_si128 (e_buf + seg, e_v);

            /* H of the previous column feeds the next segment. */
            h_v = _mm_load_si128 (h_prev + seg);
        }

        /*
         * Correction pass: the F values computed above belong to the
         * next lane, so shift them over and keep propagating while any
         * lane of F still exceeds H - gap_open.
         */
        seg = 0;
        f_v = _mm_slli_si128 (f_v, 1);

        for (;;) {
            __m128i excess;

            h_v = _mm_load_si128 (h_curr + seg);

            excess = _mm_subs_epu8 (h_v, gap_open_v);
            excess = _mm_subs_epu8 (f_v, excess);
            excess = _mm_cmpeq_epi8 (excess, zero_v);
            if (_mm_movemask_epi8 (excess) == 0xffff) {
                break;
            }

            e_v = _mm_load_si128 (e_buf + seg);

            h_v = _mm_max_epu8 (h_v, f_v);
            _mm_store_si128 (h_curr + seg, h_v);

            /* The raised H may also raise E. */
            h_v = _mm_subs_epu8 (h_v, gap_open_v);
            e_v = _mm_max_epu8 (e_v, h_v);
            _mm_store_si128 (e_buf + seg, e_v);

            f_v = _mm_subs_epu8 (f_v, gap_ext_v);

            seg++;
            if (seg >= n_seg) {
                seg = 0;
                f_v = _mm_slli_si128 (f_v, 1);
            }
        }
    }

    /* Horizontal maximum of the 16 byte lanes, folded into lane 0. */
    best_v = _mm_max_epu8 (best_v, _mm_srli_si128 (best_v, 8));
    best_v = _mm_max_epu8 (best_v, _mm_srli_si128 (best_v, 4));
    best_v = _mm_max_epu8 (best_v, _mm_srli_si128 (best_v, 2));
    best_v = _mm_max_epu8 (best_v, _mm_srli_si128 (best_v, 1));

    result = _mm_extract_epi16 (best_v, 0) & 0x00ff;

    /* A score this close to the lane maximum may have saturated. */
    if (result + bias >= 255) {
        result = 255;
    }

    return result;
}
inline void FAST::detect9simd( const Image& img, uint8_t threshold, FeatureSetWrapper& features, size_t border ) { #define CHECK_BARRIER(lo, hi, other, flags) \ { \ __m128i diff = _mm_subs_epu8(lo, other); \ __m128i diff2 = _mm_subs_epu8(other, hi); \ __m128i z = _mm_setzero_si128(); \ diff = _mm_cmpeq_epi8(diff, z); \ diff2 = _mm_cmpeq_epi8(diff2, z); \ flags = ~(_mm_movemask_epi8(diff) | (_mm_movemask_epi8(diff2) << 16)); \ } size_t stride; const uint8_t * iptr = img.map( &stride ); int offsets[ 16 ]; make_offsets( offsets, stride ); const size_t tripleStride = 3 * stride; // The compiler refuses to reserve a register for this const __m128i barriers = _mm_set1_epi8( threshold ); // xend is the beginning of the last pixels in the row that need to be processed in the normal way size_t width = img.width(); size_t height = img.height(); size_t xend = width - border - ( width - border ) % 16; size_t aligned_start = ( (int)( border / 16 ) + 1 ) << 4; const uint8_t* im = iptr; im += ( border * stride ); const uint8_t * ptr; for ( size_t y = border; y < height - border; y++ ) { ptr = im + border; for ( size_t x = border; x < aligned_start; x++ ){ if( isCorner9( ptr, offsets, threshold ) ) features( x, y, score9Pixel( ptr, offsets, threshold ) ); ptr++; } for ( size_t x = aligned_start; x < xend; x += 16, ptr += 16 ) { __m128i lo, hi; { const __m128i here = _mm_load_si128( (const __m128i*)ptr ); lo = _mm_subs_epu8( here, barriers ); hi = _mm_adds_epu8( here, barriers ); } uint32_t ans_0, ans_8, possible; { __m128i top = _mm_load_si128( ( const __m128i* )( ptr - tripleStride ) ); __m128i bottom = _mm_load_si128( ( const __m128i* )( ptr + tripleStride ) ); CHECK_BARRIER( lo, hi, top, ans_0 ); CHECK_BARRIER( lo, hi, bottom, ans_8 ); possible = ans_0 | ans_8; if ( !possible ){ continue; } } uint32_t ans_15, ans_1; { __m128i a = _mm_loadu_si128( ( const __m128i* )( ptr - 1 - tripleStride ) ); __m128i c = _mm_insert_epi16( _mm_srli_si128( a, 2 ), *( const uint16_t* ) (ptr + 15 
- tripleStride), 7 ); CHECK_BARRIER( lo, hi, a, ans_15 ); CHECK_BARRIER( lo, hi, c, ans_1 ); // 8 or (15 and 1 ) possible &= ans_8 | (ans_15 & ans_1); if ( !possible ) continue; } uint32_t ans_9, ans_7; { __m128i d = _mm_loadu_si128( ( const __m128i* )( ptr - 1 + tripleStride ) ); __m128i f = _mm_insert_epi16( _mm_srli_si128( d, 2 ), *( const uint16_t* )( ptr + 15 + tripleStride ), 7 ); CHECK_BARRIER( lo, hi, d, ans_9 ); CHECK_BARRIER( lo, hi, f, ans_7 ); possible &= ans_9 | ( ans_0 & ans_1 ); possible &= ans_7 | ( ans_15 & ans_0 ); if ( !possible ) continue; } uint32_t ans_12, ans_4; { __m128i left = _mm_loadu_si128( ( const __m128i* )( ptr - 3 ) ); __m128i right = _mm_loadu_si128( ( const __m128i* )( ptr + 3 ) ); CHECK_BARRIER( lo, hi, left, ans_12 ); CHECK_BARRIER( lo, hi, right, ans_4 ); possible &= ans_12 | ( ans_4 & ( ans_1 | ans_7 ) ); possible &= ans_4 | ( ans_12 & ( ans_9 | ans_15 ) ); if ( !possible ) continue; } uint32_t ans_14, ans_6; { __m128i ul = _mm_loadu_si128( ( const __m128i* ) ( ptr - 2 - 2 * stride ) ); __m128i lr = _mm_loadu_si128( ( const __m128i* ) ( ptr + 2 + 2 * stride ) ); CHECK_BARRIER( lo, hi, ul, ans_14 ); CHECK_BARRIER( lo, hi, lr, ans_6 ); { const unsigned int ans_6_7 = ans_6 & ans_7; possible &= ans_14 | (ans_6_7 & (ans_4 | (ans_8 & ans_9))); possible &= ans_1 | (ans_6_7) | ans_12; } { const unsigned int ans_14_15 = ans_14 & ans_15; possible &= ans_6 | (ans_14_15 & (ans_12 | (ans_0 & ans_1))); possible &= ans_9 | (ans_14_15) | ans_4; } if ( !possible ) continue; } uint32_t ans_10, ans_2; { __m128i ll = _mm_loadu_si128( ( const __m128i* ) (ptr - 2 + 2 * stride) ); __m128i ur = _mm_loadu_si128( ( const __m128i* ) (ptr + 2 - 2 * stride) ); CHECK_BARRIER( lo, hi, ll, ans_10 ); CHECK_BARRIER( lo, hi, ur, ans_2 ); { const unsigned int ans_1_2 = ans_1 & ans_2; possible &= ans_10 | (ans_1_2 & ((ans_0 & ans_15) | ans_4)); possible &= ans_12 | (ans_1_2) | (ans_6 & ans_7); } { const unsigned int ans_9_10 = ans_9 & ans_10; possible &= ans_2 | 
(ans_9_10 & ((ans_7 & ans_8) | ans_12)); possible &= ans_4 | (ans_9_10) | (ans_14 & ans_15); } possible &= ans_8 | ans_14 | ans_2; possible &= ans_0 | ans_10 | ans_6; if ( !possible ) continue; } uint32_t ans_13, ans_5; { __m128i g = _mm_loadu_si128( ( const __m128i* ) (ptr - 3 - stride ) ); __m128i l = _mm_loadu_si128( ( const __m128i* ) (ptr + 3 + stride ) ); CHECK_BARRIER( lo, hi, g, ans_13 ); CHECK_BARRIER( lo, hi, l, ans_5 ); const uint32_t ans_15_0 = ans_15 & ans_0; const uint32_t ans_7_8 = ans_7 & ans_8; { const uint32_t ans_12_13 = ans_12 & ans_13; possible &= ans_5 | (ans_12_13 & ans_14 & ((ans_15_0) | ans_10)); possible &= ans_7 | (ans_1 & ans_2) | (ans_12_13); possible &= ans_2 | (ans_12_13) | (ans_7_8); } { const uint32_t ans_4_5 = ans_4 & ans_5; const uint32_t ans_9_10 = ans_9 & ans_10; possible &= ans_13 | (ans_4_5 & ans_6 & ((ans_7_8) | ans_2)); possible &= ans_15 | (ans_4_5) | (ans_9_10); possible &= ans_10 | (ans_4_5) | (ans_15_0); possible &= ans_15 | (ans_9_10) | (ans_4_5); } possible &= ans_8 | (ans_13 & ans_14) | ans_2; possible &= ans_0 | (ans_5 & ans_6) | ans_10; if ( !possible ) continue; } uint32_t ans_11, ans_3; { __m128i ii = _mm_loadu_si128( ( const __m128i* )( ptr - 3 + stride ) ); __m128i jj = _mm_loadu_si128( ( const __m128i* )( ptr + 3 - stride ) ); CHECK_BARRIER( lo, hi, ii, ans_11 ); CHECK_BARRIER( lo, hi, jj, ans_3 ); { const uint32_t ans_2_3 = ans_2 & ans_3; possible &= ans_11 | (ans_2_3 & ans_4 & ((ans_0 & ans_1) | (ans_5 & ans_6))); possible &= ans_13 | (ans_7 & ans_8) | (ans_2_3); possible &= ans_8 | (ans_2_3) | (ans_13 & ans_14); } { const uint32_t ans_11_12 = ans_11 & ans_12; possible &= ans_3 | (ans_10 & ans_11_12 & ((ans_8 & ans_9) | (ans_13 & ans_14))); possible &= ans_1 | (ans_11_12) | (ans_6 & ans_7); possible &= ans_6 | (ans_0 & ans_1) | (ans_11_12); } { const uint32_t ans_3_4 = ans_3 & ans_4; possible &= ans_9 | (ans_3_4) | (ans_14 & ans_15); possible &= ans_14 | (ans_8 & ans_9) | (ans_3_4); } { const uint32_t 
ans_10_11 = ans_10 & ans_11; possible &= ans_5 | (ans_15 & ans_0) | (ans_10_11); possible &= ans_0 | (ans_10_11) | (ans_5 & ans_6); } if ( !possible ) continue; } possible |= (possible >> 16); //if(possible & 0x0f) //Does this make it faster? { if ( possible & (1 << 0) ) features( x, y, score9Pixel( ptr, offsets, threshold ) ); if ( possible & (1 << 1) ) features( x + 1, y, score9Pixel( ptr + 1, offsets, threshold ) ); if ( possible & (1 << 2) ) features( x + 2, y, score9Pixel( ptr + 2, offsets, threshold ) ); if ( possible & (1 << 3) ) features( x + 3, y, score9Pixel( ptr + 3, offsets, threshold ) ); if ( possible & (1 << 4) ) features( x + 4, y, score9Pixel( ptr + 4, offsets, threshold ) ); if ( possible & (1 << 5) ) features( x + 5, y, score9Pixel( ptr + 5, offsets, threshold ) ); if ( possible & (1 << 6) ) features( x + 6, y, score9Pixel( ptr + 6, offsets, threshold ) ); if ( possible & (1 << 7) ) features( x + 7, y, score9Pixel( ptr + 7, offsets, threshold ) ); } //if(possible & 0xf0) //Does this mak( , fast)r? 
{ if ( possible & (1 << 8) ) features( x + 8, y, score9Pixel( ptr + 8, offsets, threshold ) ); if ( possible & (1 << 9) ) features( x + 9, y, score9Pixel( ptr + 9, offsets, threshold ) ); if ( possible & (1 << 10) ) features( x + 10, y, score9Pixel( ptr + 10, offsets, threshold ) ); if ( possible & (1 << 11) ) features( x + 11, y, score9Pixel( ptr + 11, offsets, threshold ) ); if ( possible & (1 << 12) ) features( x + 12, y, score9Pixel( ptr + 12, offsets, threshold ) ); if ( possible & (1 << 13) ) features( x + 13, y, score9Pixel( ptr + 13, offsets, threshold ) ); if ( possible & (1 << 14) ) features( x + 14, y, score9Pixel( ptr + 14, offsets, threshold ) ); if ( possible & (1 << 15) ) features( x + 15, y, score9Pixel( ptr + 15, offsets, threshold ) ); } } for ( size_t x = xend; x < width - border; x++ ){ if( isCorner9( ptr, offsets, threshold ) ) features( x, y, score9Pixel( ptr, offsets, threshold ) ); ptr++; } im += stride; } img.unmap( iptr ); #undef CHECK_BARRIER }
/*
 * Calculate the Smith-Waterman score.
 *
 * This is basically an SSE2 version of Wozniak's vectored implementation, but
 * without a score table. Further, we assume a fixed database and query size,
 * so *nogap and *b_gap must be pre-allocated (the malloc overhead for very
 * small scans is _huge_).
 *
 * NOTE THE FOLLOWING:
 *
 * 1) seqA must be padded with 7 bytes at the beginning and end. The first
 *    element of seqA should be the first pad byte.
 *
 * 2) seqB must be padded with bytes on the end up to mod 8 characters.
 *    The first element of seqB should be (of course) the first character.
 *
 * 3) seqA and seqB's padding _must_ be different, otherwise our logic will
 *    consider the padding as matches!
 *
 * 4) There is no _mm_max_epu16 prior to SSE 4! We must use the signed max
 *    function. Unfortunately, this limits our maximum score to 2^15 - 1, or
 *    32767. Since bad things happen if we roll over, our caller must ensure
 *    that this will not happen.
 *
 * Parameters:
 *   seqA, lena - first sequence (padded as described above) and its length;
 *                the routine touches lena + 14 entries of nogap/b_gap.
 *   seqB, lenb - second sequence, consumed in groups of 8 characters.
 *   ls_seqA, initbp, is_rna - not referenced by the computation in this
 *                version (ls_seqA and initbp are explicitly voided).
 *
 * Uses names that are not declared in this block: match, mismatch,
 * a_gap_open, a_gap_ext, b_gap_open, b_gap_ext, the nogap and b_gap arrays,
 * and MAX.
 *
 * Returns the largest value seen in any lane of the running score vector
 * (never negative, since every cell is clamped at zero).
 */
static int vect_sw_diff_gap(int8_t *seqA, int lena, int8_t *seqB, int lenb, int8_t *ls_seqA, int initbp, bool is_rna) {
    int i, j, score = 0;
    __m128i v_score, v_zero, v_match, v_mismatch;
    __m128i v_a_gap_ext, v_a_gap_open_ext;
    /*
     * The B-gap vectors are only declared when v_b_gap_open_ext is not a
     * macro. NOTE(review): presumably some builds #define the B-gap names to
     * alias the A-gap ones -- confirm where that macro is defined.
     */
#ifndef v_b_gap_open_ext
    __m128i v_b_gap_ext, v_b_gap_open_ext;
#endif
    __m128i v_a_gap, v_b_gap, v_nogap;
    __m128i v_last_nogap, v_prev_nogap, v_seq_a, v_seq_b;
    __m128i v_tmp;

    /* shut up icc */
    (void)ls_seqA;
    (void)initbp;

    /*
     * Build a vector of eight 16-bit lanes from a[e7] (lane 7) down to a[e0]
     * (lane 0). Note that the macro is not #undef'd at the end of the
     * function.
     */
#define SET16(a, e7, e6, e5, e4, e3, e2, e1, e0) \
    _mm_set_epi16((int16_t)a[e7], (int16_t)a[e6], \
    (int16_t)a[e5], (int16_t)a[e4], \
    (int16_t)a[e3], (int16_t)a[e2], \
    (int16_t)a[e1], (int16_t)a[e0])

    v_score = _mm_setzero_si128();
    v_zero = _mm_setzero_si128();
    /* Broadcast the scalar scoring parameters to all eight lanes. */
    v_match = SET16((&match), 0, 0, 0, 0, 0, 0, 0, 0);
    v_mismatch = SET16((&mismatch), 0, 0, 0, 0, 0, 0, 0, 0);
    v_a_gap_ext = SET16((&a_gap_ext), 0, 0, 0, 0, 0, 0, 0, 0);
    /* "open_ext" vectors hold open + extend, the cost of starting a gap. */
    v_a_gap_open_ext = SET16((&a_gap_open), 0, 0, 0, 0, 0, 0, 0, 0);
    v_a_gap_open_ext = _mm_add_epi16(v_a_gap_open_ext, v_a_gap_ext);
    v_b_gap_ext = SET16((&b_gap_ext), 0, 0, 0, 0, 0, 0, 0, 0);
    v_b_gap_open_ext = SET16((&b_gap_open), 0, 0, 0, 0, 0, 0, 0, 0);
    v_b_gap_open_ext = _mm_add_epi16(v_b_gap_open_ext, v_b_gap_ext);

    /* Reset the row state carried between groups of seqB characters. */
    for (i = 0; i < lena + 14; i++) {
        nogap[i] = 0;
        b_gap[i] = (int16_t)-b_gap_open;
    }

    /* One pass per group of 8 seqB characters (k is the group's start). */
    for (i = 0; i < (lenb + 7)/8; i++) {
        int k = i * 8;

        /* Preload the first seven entries; lane 7 is shifted out first. */
        v_b_gap = SET16(b_gap, 6, 6, 5, 4, 3, 2, 1, 0);
        v_nogap = SET16(nogap, 6, 6, 5, 4, 3, 2, 1, 0);
        v_seq_a = SET16(seqA, 0, 0, 1, 2, 3, 4, 5, 6);
        v_seq_b = SET16(seqB, k+7, k+6, k+5, k+4, k+3, k+2, k+1, k+0);

        /* ext - (open + ext): every lane of the A gap starts at -a_gap_open. */
        v_a_gap = v_a_gap_ext;
        v_a_gap = _mm_sub_epi16(v_a_gap, v_a_gap_open_ext);

        v_last_nogap = _mm_setzero_si128();
        v_prev_nogap = _mm_setzero_si128();

        for (j = 0; j < (lena + 7); j++) {
            /*
             * Advance one step: move every lane up by one and feed entry
             * j + 7 of the stored state / sequence into lane 0.
             */
            v_b_gap = _mm_slli_si128(v_b_gap, 2);
            v_b_gap = _mm_insert_epi16(v_b_gap, b_gap[j+7], 0);

            v_nogap = _mm_slli_si128(v_nogap, 2);
            v_nogap = _mm_insert_epi16(v_nogap, nogap[j+7], 0);

            v_seq_a = _mm_slli_si128(v_seq_a, 2);
            v_seq_a = _mm_insert_epi16(v_seq_a, seqA[j+7], 0);

            /* A gap: max(extend existing gap, open from last no-gap score). */
            v_tmp = _mm_sub_epi16(v_last_nogap, v_a_gap_open_ext);
            v_a_gap = _mm_sub_epi16(v_a_gap, v_a_gap_ext);
            v_a_gap = _mm_max_epi16(v_a_gap, v_tmp);

            /* B gap: same recurrence, opened from the shifted-in no-gap row. */
            v_tmp = _mm_sub_epi16(v_nogap, v_b_gap_open_ext);
            v_b_gap = _mm_sub_epi16(v_b_gap, v_b_gap_ext);
            v_b_gap = _mm_max_epi16(v_b_gap, v_tmp);

            /* compute the score (v_last_nogap is a tmp variable) */
            /* v_tmp = match where the characters are equal, else mismatch. */
            v_last_nogap = _mm_cmpeq_epi16(v_seq_a, v_seq_b);
            v_tmp = _mm_and_si128(v_last_nogap, v_match);
            v_last_nogap = _mm_cmpeq_epi16(v_last_nogap, v_zero);
            v_last_nogap = _mm_and_si128(v_last_nogap, v_mismatch);
            v_tmp = _mm_or_si128(v_tmp, v_last_nogap);

            /* Cell value: max(0, diagonal + score, A gap, B gap). */
            v_last_nogap = _mm_add_epi16(v_prev_nogap, v_tmp);
            v_last_nogap = _mm_max_epi16(v_last_nogap, v_zero);
            v_last_nogap = _mm_max_epi16(v_last_nogap, v_a_gap);
            v_last_nogap = _mm_max_epi16(v_last_nogap, v_b_gap);

            v_prev_nogap = v_nogap;
            v_nogap = v_last_nogap;

            /* Lane 7 leaves the vector on the next shift: save it for the
             * next group of seqB characters. */
            b_gap[j] = (int16_t)_mm_extract_epi16(v_b_gap, 7);
            nogap[j] = (int16_t)_mm_extract_epi16(v_nogap, 7);

            v_score = _mm_max_epi16(v_score, v_last_nogap);
        }
    }

    /*
     * Ugh. Old gcc can't loop and using _mm_store to an int16_t array
     * breaks strict-aliasing rules.
     */
    assert(score == 0);
    /* Horizontal maximum over the eight lanes of v_score. */
    score = MAX(score, _mm_extract_epi16(v_score, 0));
    score = MAX(score, _mm_extract_epi16(v_score, 1));
    score = MAX(score, _mm_extract_epi16(v_score, 2));
    score = MAX(score, _mm_extract_epi16(v_score, 3));
    score = MAX(score, _mm_extract_epi16(v_score, 4));
    score = MAX(score, _mm_extract_epi16(v_score, 5));
    score = MAX(score, _mm_extract_epi16(v_score, 6));
    score = MAX(score, _mm_extract_epi16(v_score, 7));

    return (score);
}
/*
 * Calculate the Smith-Waterman score.
 *
 * This is basically an SSE2 version of Wozniak's vectored implementation, but
 * without a score table. Further, we assume a fixed database and query size,
 * so *nogap and *b_gap must be pre-allocated (the malloc overhead for very
 * small scans is _huge_).
 *
 * NOTE THE FOLLOWING:
 *
 * 1) seqA must be padded with 7 bytes at the beginning and end. The first
 *    element of seqA should be the first pad byte.
 *
 * 2) seqB must be padded with bytes on the end up to mod 8 characters.
 *    The first element of seqB should be (of course) the first character.
 *
 * 3) seqA and seqB's padding _must_ be different, otherwise our logic will
 *    consider the padding as matches!
 *
 * 4) There is no _mm_max_epu16 prior to SSE 4! We must use the signed max
 *    function. Unfortunately, this limits our maximum score to 2^15 - 1, or
 *    32767. Since bad things happen if we roll over, our caller must ensure
 *    that this will not happen.
 *
 * Parameters:
 *   seqA, lena - first sequence (padded as described above) and its length;
 *                the routine touches lena + 14 entries of nogap/b_gap.
 *   seqB, lenb - second sequence, consumed in groups of 8 characters. When
 *                use_colours is set, seqB[0] is handled by the scalar first
 *                row and the vector loop runs over the remaining lenb - 1.
 *   ls_seqA, initbp, is_rna - only read when use_colours is set, where they
 *                are passed to lstocs() for every position of the first row.
 *
 * Uses names that are not declared in this block: match, mismatch,
 * a_gap_open, a_gap_ext, b_gap_open, b_gap_ext, use_colours, lstocs, the
 * nogap and b_gap arrays, and MAX.
 *
 * Returns the largest value seen in any lane of the running score vector
 * (never negative, since every cell is clamped at zero).
 */
static int vect_sw_diff_gap(int8_t *seqA, int lena, int8_t *seqB, int lenb, int8_t *ls_seqA, int initbp, bool is_rna) {
    int i, j, score = 0;
    __m128i v_score, v_zero, v_match, v_mismatch;
    __m128i v_a_gap_ext, v_a_gap_open_ext;
    /*
     * The B-gap vectors are only declared when v_b_gap_open_ext is not a
     * macro. NOTE(review): presumably some builds #define the B-gap names to
     * alias the A-gap ones -- confirm where that macro is defined.
     */
#ifndef v_b_gap_open_ext
    __m128i v_b_gap_ext, v_b_gap_open_ext;
#endif
    __m128i v_a_gap, v_b_gap, v_nogap;
    __m128i v_last_nogap, v_prev_nogap, v_seq_a, v_seq_b;
    __m128i v_tmp;

    /* shut up icc */
    /* (both parameters are in fact used below when use_colours is set) */
    (void)ls_seqA;
    (void)initbp;

    /*
     * Build a vector of eight 16-bit lanes from a[e7] (lane 7) down to a[e0]
     * (lane 0). Note that the macro is not #undef'd at the end of the
     * function.
     */
#define SET16(a, e7, e6, e5, e4, e3, e2, e1, e0) \
    _mm_set_epi16((int16_t)a[e7], (int16_t)a[e6], \
    (int16_t)a[e5], (int16_t)a[e4], \
    (int16_t)a[e3], (int16_t)a[e2], \
    (int16_t)a[e1], (int16_t)a[e0])

    v_score = _mm_setzero_si128();
    v_zero = _mm_setzero_si128();
    /* Broadcast the scalar scoring parameters to all eight lanes. */
    v_match = SET16((&match), 0, 0, 0, 0, 0, 0, 0, 0);
    v_mismatch = SET16((&mismatch), 0, 0, 0, 0, 0, 0, 0, 0);
    v_a_gap_ext = SET16((&a_gap_ext), 0, 0, 0, 0, 0, 0, 0, 0);
    /* "open_ext" vectors hold open + extend, the cost of starting a gap. */
    v_a_gap_open_ext = SET16((&a_gap_open), 0, 0, 0, 0, 0, 0, 0, 0);
    v_a_gap_open_ext = _mm_add_epi16(v_a_gap_open_ext, v_a_gap_ext);
    v_b_gap_ext = SET16((&b_gap_ext), 0, 0, 0, 0, 0, 0, 0, 0);
    v_b_gap_open_ext = SET16((&b_gap_open), 0, 0, 0, 0, 0, 0, 0, 0);
    v_b_gap_open_ext = _mm_add_epi16(v_b_gap_open_ext, v_b_gap_ext);

    /* Reset the row state carried between groups of seqB characters. */
    for (i = 0; i < lena + 14; i++) {
        nogap[i] = 0;
        b_gap[i] = (int16_t)-b_gap_open;
    }

    /*
     * When using colour space reads, we must handle the first row
     * specially. This is because the read will begin with some marker
     * base, which will affect matching against the genome.
     *
     * For 25mer reads, this actually makes things faster, because our
     * vectorised portion becomes evenly divisible by 8 again. Yey.
     */
    if (use_colours) {
        int a_gap, prev_nogap, last_nogap;

        a_gap = -a_gap_open;
        last_nogap = prev_nogap = 0;

        /* Scalar pass over the unpadded part of the row (indices 7..lena+6),
         * using the same recurrences as the vector loop below. */
        for (i = 7; i < (lena + 7); i++) {
            int a, ms;

            a_gap = MAX((last_nogap - a_gap_open - a_gap_ext), (a_gap - a_gap_ext));
            b_gap[i] =(uint16_t)MAX((nogap[i] - b_gap_open - b_gap_ext), (b_gap[i] - b_gap_ext));

            /* Translate the letter-space base before comparing it with the
             * first seqB character. */
            a = lstocs(ls_seqA[i], initbp, is_rna);
            ms = (a == seqB[0]) ? match : mismatch;

            last_nogap = MAX((prev_nogap + ms), 0);
            last_nogap = MAX(last_nogap, a_gap);
            last_nogap = MAX(last_nogap, b_gap[i]);
            prev_nogap = nogap[i];
            nogap[i] = (uint16_t)last_nogap;
            score = MAX(score, last_nogap);
        }

        /* Seed the vector maximum with the first row's best score, then
         * reset the scalar so the final assert(score == 0) holds. */
        v_score = SET16((&score), 0, 0, 0, 0, 0, 0, 0, 0);
        score = 0;

        /* The first seqB character has been consumed. */
        seqB++;
        lenb--;
        assert(lenb != 0);
    }

    /* One pass per group of 8 seqB characters (k is the group's start). */
    for (i = 0; i < (lenb + 7)/8; i++) {
        int k = i * 8;

        /* Preload the first seven entries; lane 7 is shifted out first. */
        v_b_gap = SET16(b_gap, 6, 6, 5, 4, 3, 2, 1, 0);
        v_nogap = SET16(nogap, 6, 6, 5, 4, 3, 2, 1, 0);
        v_seq_a = SET16(seqA, 0, 0, 1, 2, 3, 4, 5, 6);
        v_seq_b = SET16(seqB, k+7, k+6, k+5, k+4, k+3, k+2, k+1, k+0);

        /* ext - (open + ext): every lane of the A gap starts at -a_gap_open. */
        v_a_gap = v_a_gap_ext;
        v_a_gap = _mm_sub_epi16(v_a_gap, v_a_gap_open_ext);

        v_last_nogap = _mm_setzero_si128();
        v_prev_nogap = _mm_setzero_si128();

        for (j = 0; j < (lena + 7); j++) {
            /*
             * Advance one step: move every lane up by one and feed entry
             * j + 7 of the stored state / sequence into lane 0.
             */
            v_b_gap = _mm_slli_si128(v_b_gap, 2);
            v_b_gap = _mm_insert_epi16(v_b_gap, b_gap[j+7], 0);

            v_nogap = _mm_slli_si128(v_nogap, 2);
            v_nogap = _mm_insert_epi16(v_nogap, nogap[j+7], 0);

            v_seq_a = _mm_slli_si128(v_seq_a, 2);
            v_seq_a = _mm_insert_epi16(v_seq_a, seqA[j+7], 0);

            /* A gap: max(extend existing gap, open from last no-gap score). */
            v_tmp = _mm_sub_epi16(v_last_nogap, v_a_gap_open_ext);
            v_a_gap = _mm_sub_epi16(v_a_gap, v_a_gap_ext);
            v_a_gap = _mm_max_epi16(v_a_gap, v_tmp);

            /* B gap: same recurrence, opened from the shifted-in no-gap row. */
            v_tmp = _mm_sub_epi16(v_nogap, v_b_gap_open_ext);
            v_b_gap = _mm_sub_epi16(v_b_gap, v_b_gap_ext);
            v_b_gap = _mm_max_epi16(v_b_gap, v_tmp);

            /* compute the score (v_last_nogap is a tmp variable) */
            /* v_tmp = match where the characters are equal, else mismatch. */
            v_last_nogap = _mm_cmpeq_epi16(v_seq_a, v_seq_b);
            v_tmp = _mm_and_si128(v_last_nogap, v_match);
            v_last_nogap = _mm_cmpeq_epi16(v_last_nogap, v_zero);
            v_last_nogap = _mm_and_si128(v_last_nogap, v_mismatch);
            v_tmp = _mm_or_si128(v_tmp, v_last_nogap);

            /* Cell value: max(0, diagonal + score, A gap, B gap). */
            v_last_nogap = _mm_add_epi16(v_prev_nogap, v_tmp);
            v_last_nogap = _mm_max_epi16(v_last_nogap, v_zero);
            v_last_nogap = _mm_max_epi16(v_last_nogap, v_a_gap);
            v_last_nogap = _mm_max_epi16(v_last_nogap, v_b_gap);

            v_prev_nogap = v_nogap;
            v_nogap = v_last_nogap;

            /* Lane 7 leaves the vector on the next shift: save it for the
             * next group of seqB characters. */
            b_gap[j] = (int16_t)_mm_extract_epi16(v_b_gap, 7);
            nogap[j] = (int16_t)_mm_extract_epi16(v_nogap, 7);

            v_score = _mm_max_epi16(v_score, v_last_nogap);
        }
    }

    /*
     * Ugh. Old gcc can't loop and using _mm_store to an int16_t array
     * breaks strict-aliasing rules.
     */
    assert(score == 0);
    /* Horizontal maximum over the eight lanes of v_score. */
    score = MAX(score, _mm_extract_epi16(v_score, 0));
    score = MAX(score, _mm_extract_epi16(v_score, 1));
    score = MAX(score, _mm_extract_epi16(v_score, 2));
    score = MAX(score, _mm_extract_epi16(v_score, 3));
    score = MAX(score, _mm_extract_epi16(v_score, 4));
    score = MAX(score, _mm_extract_epi16(v_score, 5));
    score = MAX(score, _mm_extract_epi16(v_score, 6));
    score = MAX(score, _mm_extract_epi16(v_score, 7));

    return (score);
}
/*
 * Global alignment score computed with SSE2 on eight 16-bit lanes, with
 * on-the-fly rescaling to stay inside the signed 16-bit range.
 *
 * queryLength - number of query residues; must be > 0 because the derived
 *               vector count (iter) is used as a divisor.
 * profile     - score profile; the row for database residue r starts at
 *               vector index r * iter and is read with aligned loads.
 * dbSeq       - database residues, used as profile row indices.
 * dbLength    - number of database residues.
 * gapOpen     - gap penalty subtracted (saturating) when entering E or F.
 * gapExtend   - only used in the final correction
 *               (queryLength + dbLength) * gapExtend; the vector loop never
 *               subtracts it. NOTE(review): presumably the profile already
 *               has the extension penalty folded in -- confirm with the
 *               profile builder.
 * ceiling     - head-room reserved below 0x7FFF; a column whose F exceeds
 *               0x7FFF - ceiling - gapOpen triggers rescaling.
 * f_str       - f_str->workspace must hold 2 * iter vectors (H followed by
 *               E) and is accessed with aligned loads and stores.
 *
 * Returns OVERFLOW_SCORE if a column still exceeds the ceiling after
 * rescaling; otherwise the final score, corrected by SHORT_BIAS, the gap
 * terms and the accumulated scale.
 *
 * max_epu16, OVERFLOW_SCORE and SHORT_BIAS are not declared in this block.
 */
int global_sse2_word(int queryLength, unsigned short *profile, const unsigned char *dbSeq, int dbLength, unsigned short gapOpen, unsigned short gapExtend, unsigned short ceiling, struct f_struct *f_str) {
    int i, j;
    int score;
    int scale;
    int temp;
    int distance;
    int offset;
    int position;
    int cmp;
    int iter;

    __m128i *pvH;
    __m128i *pvE;

    __m128i vE, vF, vH;
    __m128i vHNext;
    __m128i vFPrev;

    __m128i vGapOpen;
    __m128i vGapExtend;
    __m128i vCeiling;

    __m128i vScale;
    __m128i vScaleAmt;
    __m128i vScaleTmp;

    __m128i vTemp;
    __m128i vNull;

    __m128i *pvScore;

    scale = 0;

    /* Number of 8-lane vectors covering the query. */
    iter = (queryLength + 7) / 8;

    /*
     * Location of the last query residue: vector index `offset`, and the
     * number of one-lane shifts (`position`) needed to bring its lane up to
     * lane 7, where the final score is extracted.
     */
    offset = (queryLength - 1) % iter;
    position = 7 - (queryLength - 1) / iter;

    /* Workspace layout: iter H vectors followed by iter E vectors. */
    pvH = (__m128i *)f_str->workspace;
    pvE = pvH + iter;

    /* Load gap opening penalty to all elements of a constant */
    vGapOpen = _mm_setzero_si128(); /* transferred from Apple Devel smith_waterman_sse2.c fix */
    vGapOpen = _mm_insert_epi16 (vGapOpen, gapOpen, 0);
    vGapOpen = _mm_shufflelo_epi16 (vGapOpen, 0);
    vGapOpen = _mm_shuffle_epi32 (vGapOpen, 0);

    /* Load gap extension penalty to all elements of a constant */
    /* NOTE(review): vGapExtend is never read after being built here. */
    vGapExtend = _mm_setzero_si128(); /* transferred from Apple Devel smith_waterman_sse2.c fix */
    vGapExtend = _mm_insert_epi16 (vGapExtend, gapExtend, 0);
    vGapExtend = _mm_shufflelo_epi16 (vGapExtend, 0);
    vGapExtend = _mm_shuffle_epi32 (vGapExtend, 0);

    /* Generate the ceiling before scaling */
    vTemp = _mm_setzero_si128(); /* transferred from Apple Devel smith_waterman_sse2.c fix */
    vTemp = _mm_insert_epi16 (vTemp, ceiling, 0);
    vTemp = _mm_shufflelo_epi16 (vTemp, 0);
    vTemp = _mm_shuffle_epi32 (vTemp, 0);
    /* vCeiling = 0x7FFF - ceiling - gapOpen in every lane (saturating). */
    vCeiling = _mm_cmpeq_epi16 (vTemp, vTemp);
    vCeiling = _mm_srli_epi16 (vCeiling, 1);
    vCeiling = _mm_subs_epi16 (vCeiling, vTemp);
    vCeiling = _mm_subs_epi16 (vCeiling, vGapOpen);

    /*
     * vNull = 0x8000 in every lane: the most negative signed value. XOR-ing
     * a vector with vNull flips the sign bit, which is how the code below
     * switches between signed (epi16) and unsigned (epu16) arithmetic on the
     * same values.
     */
    vNull = _mm_cmpeq_epi16 (vTemp, vTemp);
    vNull = _mm_slli_epi16 (vNull, 15);
    vScaleAmt = _mm_xor_si128 (vNull, vNull);

    /* Initialize the storage: H = vNull + gapOpen, E = vNull */
    vTemp = _mm_adds_epi16 (vNull, vGapOpen);
    for (i = 0; i < iter; i++) {
        _mm_store_si128 (pvH + i, vTemp);
        _mm_store_si128 (pvE + i, vNull);
    }

    /* initialize F */
    vF = vNull;
    vFPrev = vNull;

    /* load and scale H for the next round */
    /* vTemp has gapOpen in lane 0 only, so only lane 0 of H is raised. */
    vTemp = _mm_srli_si128 (vGapOpen, 14);
    vH = _mm_load_si128 (pvH + iter - 1);
    vH = _mm_adds_epi16 (vH, vTemp);

    for (i = 0; i < dbLength; ++i) {
        /* fetch first data asap. */
        pvScore = (__m128i *) profile + dbSeq[i] * iter;

        vF = vNull;

        vH = _mm_max_epi16 (vH, vFPrev);
        for (j = 0; j < iter; j++) {
            /* correct H from the previous columns F */
            vHNext = _mm_load_si128 (pvH + j);
            vHNext = _mm_max_epi16 (vHNext, vFPrev);

            /* load and correct E value */
            vE = _mm_load_si128 (pvE + j);
            vTemp = _mm_subs_epi16 (vHNext, vGapOpen);
            vE = _mm_max_epi16 (vE, vTemp);
            _mm_store_si128 (pvE + j, vE);

            /* add score to vH */
            vH = _mm_adds_epi16 (vH, *pvScore++);

            /* get max from vH, vE and vF */
            vH = _mm_max_epi16 (vH, vE);
            vH = _mm_max_epi16 (vH, vF);
            _mm_store_si128 (pvH + j, vH);

            /* update vF value */
            vH = _mm_subs_epi16 (vH, vGapOpen);
            vF = _mm_max_epi16 (vF, vH);

            /* load the next h values */
            vH = vHNext;
        }

        /* check if we need to scale before the next round */
        vTemp = _mm_cmpgt_epi16 (vF, vCeiling);
        cmp = _mm_movemask_epi8 (vTemp);

        /* broadcast F values */
        /*
         * Done in unsigned space (after the sign flip) in three doubling
         * steps: F is combined with itself shifted up by 1, 2 and 4 lanes,
         * each time less the scale accumulated over the lanes skipped.
         */
        vF = _mm_xor_si128 (vF, vNull);

        vTemp = _mm_slli_si128 (vF, 2);
        vTemp = _mm_subs_epu16 (vTemp, vScaleAmt);
        vF = max_epu16 (vF, vTemp);

        vTemp = _mm_slli_si128 (vF, 4);
        vScaleTmp = _mm_slli_si128 (vScaleAmt, 2);
        vScaleTmp = _mm_adds_epu16 (vScaleTmp, vScaleAmt);
        vTemp = _mm_subs_epu16 (vTemp, vScaleTmp);
        vF = max_epu16 (vF, vTemp);

        vTemp = _mm_slli_si128 (vScaleTmp, 4);
        vScaleTmp = _mm_adds_epu16 (vScaleTmp, vTemp);
        vTemp = _mm_slli_si128 (vF, 8);
        vTemp = _mm_subs_epu16 (vTemp, vScaleTmp);
        vF = max_epu16 (vF, vTemp);

        /* scale if necessary */
        if (cmp != 0x0000) {
            __m128i vScale1;
            __m128i vScale2;

            /* Amount to remove from each lane, derived from the lane below. */
            vScale = _mm_slli_si128 (vF, 2);
            vScale = _mm_subs_epu16 (vScale, vGapOpen);
            vScale = _mm_subs_epu16 (vScale, vScaleAmt);

            /*
             * Update the per-lane scale difference: add where vScale exceeds
             * its lower neighbour, subtract where the neighbour is larger
             * (the two saturating subtractions split the signed difference).
             */
            vTemp = _mm_slli_si128 (vScale, 2);
            vTemp = _mm_subs_epu16 (vScale, vTemp);
            vScaleAmt = _mm_adds_epu16 (vScaleAmt, vTemp);
            vTemp = _mm_slli_si128 (vScale, 2);
            vTemp = _mm_subs_epu16 (vTemp, vScale);
            vScaleAmt = _mm_subs_epu16 (vScaleAmt, vTemp);

            /* rescale the previous F */
            vF = _mm_subs_epu16 (vF, vScale);

            /* check if we can continue in signed 16-bits */
            vTemp = _mm_xor_si128 (vF, vNull);
            vTemp = _mm_cmpgt_epi16 (vTemp, vCeiling);
            cmp = _mm_movemask_epi8 (vTemp);
            if (cmp != 0x0000) {
                return OVERFLOW_SCORE;
            }

            /*
             * Split vScale into two parts (vScale1 + vScale2 == vScale) so
             * that each can be applied with a signed saturating subtract:
             * vScale2 is capped at 2 * vCeiling, vScale1 is the remainder.
             */
            vTemp = _mm_adds_epi16 (vCeiling, vCeiling);
            vScale1 = _mm_subs_epu16 (vScale, vTemp);
            vScale2 = _mm_subs_epu16 (vScale, vScale1);

            /* scale all the vectors */
            for (j = 0; j < iter; j++) {
                /* load H and E */
                vH = _mm_load_si128 (pvH + j);
                vE = _mm_load_si128 (pvE + j);

                /* subtract the scale from H and E in two saturating steps */
                vH = _mm_subs_epi16 (vH, vScale1);
                vH = _mm_subs_epi16 (vH, vScale2);
                vE = _mm_subs_epi16 (vE, vScale1);
                vE = _mm_subs_epi16 (vE, vScale2);

                /* save the H and E */
                _mm_store_si128 (pvH + j, vH);
                _mm_store_si128 (pvE + j, vE);
            }

            /* Drop the lanes above the one holding the last query residue. */
            vScale = vScaleAmt;
            for (j = 0; j < position; ++j) {
                vScale = _mm_slli_si128 (vScale, 2);
            }

            /* calculate the final scaling amount */
            /* Widen to 32 bits and sum all remaining lanes into lane 0. */
            vTemp = _mm_xor_si128 (vTemp, vTemp);
            vScale1 = _mm_unpacklo_epi16 (vScale, vTemp);
            vScale2 = _mm_unpackhi_epi16 (vScale, vTemp);
            vScale = _mm_add_epi32 (vScale1, vScale2);
            vTemp = _mm_srli_si128 (vScale, 8);
            vScale = _mm_add_epi32 (vScale, vTemp);
            vTemp = _mm_srli_si128 (vScale, 4);
            vScale = _mm_add_epi32 (vScale, vTemp);
            /* Reassemble the 32-bit sum from its two 16-bit halves. */
            scale = (int) (unsigned short) _mm_extract_epi16 (vScale, 0);
            temp = (int) (unsigned short) _mm_extract_epi16 (vScale, 1);
            scale = scale + (temp << 16);
        }

        /* scale the F value for the next round */
        vFPrev = _mm_slli_si128 (vF, 2);
        vFPrev = _mm_subs_epu16 (vFPrev, vScaleAmt);
        vFPrev = _mm_xor_si128 (vFPrev, vNull);

        /* load and scale H for the next round */
        /* Shift up one lane in unsigned space; lane 0 restarts at gapOpen. */
        vH = _mm_load_si128 (pvH + iter - 1);
        vH = _mm_xor_si128 (vH, vNull);
        vH = _mm_slli_si128 (vH, 2);
        vH = _mm_subs_epu16 (vH, vScaleAmt);
        vH = _mm_insert_epi16 (vH, gapOpen, 0);
        vH = _mm_xor_si128 (vH, vNull);
    }

    /* Fetch the cell of the last query residue and move it to lane 7. */
    vH = _mm_load_si128 (pvH + offset);
    vH = _mm_max_epi16 (vH, vFPrev);
    for (j = 0; j < position; ++j) {
        vH = _mm_slli_si128 (vH, 2);
    }
    score = (int) (signed short) _mm_extract_epi16 (vH, 7);
    score = score + SHORT_BIAS;

    /* return largest score */
    /* Undo the gap-open offset, apply the per-residue extension term and
     * add back everything removed by rescaling. */
    distance = (queryLength + dbLength) * gapExtend;
    score = score - (gapOpen * 2) - distance + scale;

    return score;
}
/*
 * global_sse2_byte - 8-bit SSE2 scoring pass over a database sequence.
 *
 * All vector arithmetic is done on 16 unsigned byte lanes with saturating
 * add/subtract.  When any lane of F reaches the working ceiling the stored
 * H/E vectors are shifted down ("scaled") and the amount removed is tracked
 * so it can be added back to the final score.
 *
 * Parameters:
 *   queryLength - query length; split into iter = (queryLength + 15) / 16
 *                 vectors.
 *                 NOTE(review): queryLength <= 0 makes iter 0, and the
 *                 following "% iter" / "/ iter" then divide by zero.
 *   profile     - score profile, read as __m128i vectors; the vectors for
 *                 database symbol c start at index c * iter and iter of them
 *                 are consumed per column.  Dereferenced through a __m128i
 *                 pointer (presumably needs 16-byte alignment - confirm).
 *   dbSeq       - database sequence; each byte is used as a profile index.
 *   dbLength    - number of entries of dbSeq to process.
 *   gapOpen     - gap penalty; its low byte is broadcast to all lanes and the
 *                 full value is used in the final scalar correction.
 *   gapExtend   - only used in the final scalar correction.  A broadcast
 *                 vector is built from it below but never read here.
 *   ceiling     - low byte is broadcast; the working ceiling becomes
 *                 255 - ceiling - gapOpen per lane (saturating).
 *   bias        - low byte is broadcast and subtracted from every H after the
 *                 profile score has been added.
 *   f_str       - f_str->workspace supplies the H vectors (iter of them),
 *                 immediately followed by the E vectors (iter of them).
 *                 Accessed with the aligned load/store intrinsics.
 *
 * Returns OVERFLOW_SCORE if, right after a rescale, some lane of F is still
 * at or above the working ceiling.  Otherwise returns
 *   lane_value - 2 * gapOpen - (queryLength + dbLength) * gapExtend + scale.
 */
int global_sse2_byte(int queryLength,
                     unsigned char *profile,
                     const unsigned char *dbSeq,
                     int dbLength,
                     unsigned short gapOpen,
                     unsigned short gapExtend,
                     unsigned short ceiling,
                     unsigned short bias,
                     struct f_struct *f_str)
{
  int i, j;

  int score;
  int scale;      /* total amount removed by rescaling, re-added at the end */
  int distance;
  int offset;     /* index of the H vector holding the final score */
  int position;   /* byte shifts needed to bring the final lane to lane 15 */

  int dup;        /* 16-bit value with the same byte in both halves */
  int cmp;
  int iter;       /* number of 16-lane vectors covering the query */

  __m128i *pvH;
  __m128i *pvE;

  __m128i vE, vF, vH;
  __m128i vHInit;
  __m128i vHNext;
  __m128i vFPrev;

  __m128i vBias;
  __m128i vGapOpen;
  __m128i vGapExtend;
  __m128i vCeiling;

  __m128i vScale;
  __m128i vScaleAmt;   /* per-lane scale bookkeeping, see the rescale step */
  __m128i vScaleTmp;

  __m128i vTemp;
  __m128i vNull;

  __m128i *pvScore;

  scale = 0;

  iter = (queryLength + 15) / 16;
  /* vector index and lane of query position (queryLength - 1); presumably */
  /* the last query residue in a striped layout - confirm against the      */
  /* profile builder.                                                      */
  offset = (queryLength - 1) % iter;
  position = 15 - (queryLength - 1) / iter;

  /* H vectors first, E vectors directly after them in the same workspace */
  pvH = (__m128i *)f_str->workspace;
  pvE = pvH + iter;

  /* Load the bias to all elements of a constant: duplicate the low byte  */
  /* into a 16-bit word, then replicate that word across the register.    */
  dup = (bias << 8) | (bias & 0x00ff);
  vBias = _mm_setzero_si128();	/* initialize cf Apple Devel smith_waterman_sse2.c */
  vBias = _mm_insert_epi16 (vBias, dup, 0);
  vBias = _mm_shufflelo_epi16 (vBias, 0);
  vBias = _mm_shuffle_epi32 (vBias, 0);

  /* Load gap opening penalty to all elements of a constant */
  dup = (gapOpen << 8) | (gapOpen & 0x00ff);
  vGapOpen = _mm_setzero_si128();	/* initialize cf Apple Devel smith_waterman_sse2.c */
  vGapOpen = _mm_insert_epi16 (vGapOpen, dup, 0);
  vGapOpen = _mm_shufflelo_epi16 (vGapOpen, 0);
  vGapOpen = _mm_shuffle_epi32 (vGapOpen, 0);

  /* Load gap extension penalty to all elements of a constant */
  /* NOTE(review): vGapExtend is not read anywhere below in this function. */
  dup = (gapExtend << 8) | (gapExtend & 0x00ff);
  vGapExtend = _mm_setzero_si128();	/* initialize cf Apple Devel smith_waterman_sse2.c */
  vGapExtend = _mm_insert_epi16 (vGapExtend, dup, 0);
  vGapExtend = _mm_shufflelo_epi16 (vGapExtend, 0);
  vGapExtend = _mm_shuffle_epi32 (vGapExtend, 0);

  /* Generate the ceiling before scaling: 0xff - ceiling - gapOpen in */
  /* every lane, using saturating unsigned subtraction.               */
  dup = (ceiling << 8) | (ceiling & 0x00ff);
  vTemp = _mm_setzero_si128();	/* initialize cf Apple Devel smith_waterman_sse2.c */
  vTemp = _mm_insert_epi16 (vTemp, dup, 0);
  vTemp = _mm_shufflelo_epi16 (vTemp, 0);
  vTemp = _mm_shuffle_epi32 (vTemp, 0);
  vCeiling = _mm_cmpeq_epi8 (vTemp, vTemp);   /* all lanes 0xff */
  vCeiling = _mm_subs_epu8 (vCeiling, vTemp);
  vCeiling = _mm_subs_epu8 (vCeiling, vGapOpen);

  /* since we want to use the full range, zero is redefined as       */
  /* 2 * gapOpen.  the lowest scaled score will be an insert followed */
  /* by a delete.                                                    */
  /* vHInit holds gapOpen in lane 0 and zero in every other lane.    */
  vHInit = _mm_srli_si128 (vGapOpen, 15);

  /* Zero constant, built with setzero; an earlier variant xor-ed vNull */
  /* with itself before it had been initialized.                        */
  vNull = _mm_setzero_si128();	/* initialize cf Apple Devel smith_waterman_sse2.c */
  vScaleAmt = vNull;

  /* Initialize the storage: every H vector to gapOpen, every E to zero */
  for (i = 0; i < iter; i++) {
    _mm_store_si128 (pvH + i, vGapOpen);
    _mm_store_si128 (pvE + i, vNull);
  }

  /* initialize F */
  vF = vNull;
  vFPrev = vNull;

  /* load and scale H for the next round: shift the last H vector up one */
  /* lane and add vHInit twice, so lane 0 starts at 2 * gapOpen.          */
  vH = _mm_load_si128 (pvH + iter - 1);
  vH = _mm_slli_si128 (vH, 1);
  vH = _mm_adds_epu8 (vH, vHInit);
  vH = _mm_adds_epu8 (vH, vHInit);

  /* one pass per database symbol */
  for (i = 0; i < dbLength; ++i) {
    /* fetch first data asap. */
    pvScore = (__m128i *) profile + dbSeq[i] * iter;

    /* F restarts at zero for this column */
    vF = _mm_xor_si128 (vF, vF);

    vH = _mm_max_epu8 (vH, vFPrev);
    for (j = 0; j < iter; j++) {
      /* correct H from the previous columns F */
      vHNext = _mm_load_si128 (pvH + j);
      vHNext = _mm_max_epu8 (vHNext, vFPrev);

      /* load and correct E value */
      vE = _mm_load_si128 (pvE + j);
      vTemp = _mm_subs_epu8 (vHNext, vGapOpen);
      vE = _mm_max_epu8 (vE, vTemp);
      _mm_store_si128 (pvE + j, vE);

      /* add score to vH; the profile entry is then reduced by the bias */
      vH = _mm_adds_epu8 (vH, *pvScore++);
      vH = _mm_subs_epu8 (vH, vBias);

      /* get max from vH, vE and vF */
      vH = _mm_max_epu8 (vH, vE);
      vH = _mm_max_epu8 (vH, vF);
      _mm_store_si128 (pvH + j, vH);

      /* update vF value */
      vH = _mm_subs_epu8 (vH, vGapOpen);
      vF = _mm_max_epu8 (vF, vH);

      /* load the next h values */
      vH = vHNext;
    }

    /* check if we need to scale before the next round: cmp becomes */
    /* non-zero when any lane of F is >= the working ceiling.       */
    vTemp = _mm_subs_epu8 (vCeiling, vF);
    vTemp = _mm_cmpeq_epi8 (vTemp, vNull);
    cmp = _mm_movemask_epi8 (vTemp);

    /* broadcast F values: propagate F upward across lanes in steps of */
    /* 1, 2, 4 and 8 bytes, subtracting the accumulated scale amounts  */
    /* for the lanes crossed and keeping the per-lane maximum.         */
    vTemp = _mm_slli_si128 (vF, 1);
    vTemp = _mm_subs_epu8 (vTemp, vScaleAmt);
    vF = _mm_max_epu8 (vF, vTemp);

    vScaleTmp = _mm_slli_si128 (vScaleAmt, 1);
    vScaleTmp = _mm_adds_epu8 (vScaleTmp, vScaleAmt);
    vTemp = _mm_slli_si128 (vF, 2);
    vTemp = _mm_subs_epu8 (vTemp, vScaleTmp);
    vF = _mm_max_epu8 (vF, vTemp);

    vTemp = _mm_slli_si128 (vScaleTmp, 2);
    vScaleTmp = _mm_adds_epu8 (vScaleTmp, vTemp);
    vTemp = _mm_slli_si128 (vF, 4);
    vTemp = _mm_subs_epu8 (vTemp, vScaleTmp);
    vF = _mm_max_epu8 (vF, vTemp);

    vTemp = _mm_slli_si128 (vScaleTmp, 4);
    vScaleTmp = _mm_adds_epu8 (vScaleTmp, vTemp);
    vTemp = _mm_slli_si128 (vF, 8);
    vTemp = _mm_subs_epu8 (vTemp, vScaleTmp);
    vF = _mm_max_epu8 (vF, vTemp);

    /* scale if necessary */
    if (cmp != 0x0000) {
      /* per-lane amount to remove: F of the lane below, less gapOpen */
      /* and less what has already been accounted for in vScaleAmt    */
      vScale = _mm_slli_si128 (vF, 1);
      vScale = _mm_subs_epu8 (vScale, vGapOpen);
      vScale = _mm_subs_epu8 (vScale, vScaleAmt);

      /* adjust vScaleAmt by the difference between each lane's scale  */
      /* and that of the lane below it; the signed difference is       */
      /* applied as two saturating unsigned steps (positive part       */
      /* added, then negative part subtracted).                        */
      vTemp = _mm_slli_si128 (vScale, 1);
      vTemp = _mm_subs_epu8 (vScale, vTemp);
      vScaleAmt = _mm_adds_epu8 (vScaleAmt, vTemp);

      vTemp = _mm_slli_si128 (vScale, 1);
      vTemp = _mm_subs_epu8 (vTemp, vScale);
      vScaleAmt = _mm_subs_epu8 (vScaleAmt, vTemp);

      /* rescale the previous F */
      vF = _mm_subs_epu8 (vF, vScale);

      /* check if we can continue in 8-bits; if F is still at the */
      /* ceiling after rescaling, give up on the byte version     */
      vTemp = _mm_subs_epu8 (vCeiling, vF);
      vTemp = _mm_cmpeq_epi8 (vTemp, vNull);
      cmp = _mm_movemask_epi8 (vTemp);
      if (cmp != 0x0000) {
        return OVERFLOW_SCORE;
      }

      /* scale all the vectors */
      for (j = 0; j < iter; j++) {
        /* load H and E */
        vH = _mm_load_si128 (pvH + j);
        vE = _mm_load_si128 (pvE + j);

        /* remove the scale amount from H and E (saturating at zero) */
        vH = _mm_subs_epu8 (vH, vScale);
        vE = _mm_subs_epu8 (vE, vScale);

        /* save the H and E */
        _mm_store_si128 (pvH + j, vH);
        _mm_store_si128 (pvE + j, vE);
      }

      /* calculate the final scaling amount: drop the lanes above the   */
      /* final-score lane by shifting, widen the bytes to 16 bits and   */
      /* sum all lanes horizontally into word 0.                        */
      vScale = vScaleAmt;
      for (j = 0; j < position; ++j) {
        vScale = _mm_slli_si128 (vScale, 1);
      }

      vTemp = _mm_unpacklo_epi8 (vScale, vNull);
      vScale = _mm_unpackhi_epi8 (vScale, vNull);
      vScale = _mm_adds_epi16 (vScale, vTemp);
      vTemp = _mm_srli_si128 (vScale, 8);
      vScale = _mm_adds_epi16 (vScale, vTemp);
      vTemp = _mm_srli_si128 (vScale, 4);
      vScale = _mm_adds_epi16 (vScale, vTemp);
      vTemp = _mm_srli_si128 (vScale, 2);
      vScale = _mm_adds_epi16 (vScale, vTemp);

      /* scale is replaced, not accumulated: vScaleAmt carries the history */
      scale = (int) _mm_extract_epi16 (vScale, 0);
    }

    /* scale the F value for the next round */
    vFPrev = _mm_slli_si128 (vF, 1);
    vFPrev = _mm_subs_epu8 (vFPrev, vScaleAmt);

    /* load and scale H for the next round; lane 0, emptied by the */
    /* shift, is refilled from vHInit                              */
    vH = _mm_load_si128 (pvH + iter - 1);
    vH = _mm_slli_si128 (vH, 1);
    vH = _mm_subs_epu8 (vH, vScaleAmt);
    vH = _mm_or_si128 (vH, vHInit);
  }

  /* calculate the max global score: take the H vector at `offset`, */
  /* merge in F, and shift the wanted lane up into lane 15          */
  vH = _mm_load_si128 (pvH + offset);
  vH = _mm_max_epu8 (vH, vF);
  for (j = 0; j < position; ++j) {
    vH = _mm_slli_si128 (vH, 1);
  }
  /* lane 15 is the high byte of 16-bit word 7 */
  score = (int) (unsigned short) _mm_extract_epi16 (vH, 7);
  score >>= 8;

  /* return largest score, undoing the 2 * gapOpen zero offset, the */
  /* per-position gapExtend term and the accumulated scaling        */
  distance = (queryLength + dbLength) * gapExtend;
  score = score - (gapOpen * 2) - distance + scale;

  return score;
}