Example #1
void Coefs(unsigned char *current_part_ptr, int current_part_stride, unsigned char *ref_part_ptr, int ref_part_stride, unsigned char *coef_buf, int n)
{
    static const unsigned short c_32[8] = {32, 32, 32, 32, 32, 32, 32, 32};

    int i;

    __m128i v_row0_0, v_row0_1;
    __m128i v_temp_0, v_temp_1;
    __m128i v_result;

    __m128i vZero = _mm_setzero_si128();
    __m128i v_32 = _mm_loadu_si128((__m128i*)c_32);

    __m128i* coef_ptr = (__m128i*) coef_buf;

    v_row0_0 = _mm_loadl_epi64((__m128i*)ref_part_ptr);
    v_row0_1 = _mm_shufflelo_epi16(v_row0_0, 0xf9);
    v_row0_1 = _mm_insert_epi16(v_row0_1, *(unsigned short*)(ref_part_ptr + 8), 3);
    ref_part_ptr += ref_part_stride;
    // row0: 0 1 2 3 4 5 6 7
    // row1: 2 3 4 5 6 7 8 9

    v_row0_0 = _mm_unpacklo_epi8(v_row0_0, vZero);
    v_row0_1 = _mm_unpacklo_epi8(v_row0_1, vZero);

    for (i = 0; i < n; i++)
    {
        v_row0_0 = _mm_mullo_epi16(v_row0_0, coef_ptr[0]);
        v_row0_1 = _mm_mullo_epi16(v_row0_1, coef_ptr[1]);

        v_result = v_32;
        v_result = _mm_add_epi16(v_result, v_row0_0);
        v_result = _mm_add_epi16(v_result, v_row0_1);

        v_row0_0 = _mm_loadl_epi64((__m128i*)ref_part_ptr);
        v_row0_1 = _mm_shufflelo_epi16(v_row0_0, 0xf9);
        v_row0_1 = _mm_insert_epi16(v_row0_1, *(unsigned short*)(ref_part_ptr + 8), 3);
        ref_part_ptr += ref_part_stride;
        v_row0_0 = _mm_unpacklo_epi8(v_row0_0, vZero);
        v_row0_1 = _mm_unpacklo_epi8(v_row0_1, vZero);
        v_temp_0 = _mm_mullo_epi16(v_row0_0, coef_ptr[2]);
        v_temp_1 = _mm_mullo_epi16(v_row0_1, coef_ptr[3]);

        v_result = _mm_add_epi16(v_result, v_temp_0);
        v_result = _mm_add_epi16(v_result, v_temp_1);
        v_result = _mm_srli_epi16(v_result, 6);

        _mm_store_si128((__m128i*)current_part_ptr, v_result);
        current_part_ptr += current_part_stride;
    }
}
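Unrolled across iterations, each pass reuses the row loaded at the end of the previous one, so output row i blends reference rows i and i+1. A scalar sketch of my reading (a hypothetical helper, not part of the original; coef_buf is viewed as four 8-lane vectors of 16-bit coefficients, and the sums wrap at 16 bits exactly as _mm_mullo_epi16/_mm_add_epi16 do):

#include <stdint.h>

/* out[i][x] = (32 + c0[x]*ref[i][x]   + c1[x]*ref[i][x+2]
 *                 + c2[x]*ref[i+1][x] + c3[x]*ref[i+1][x+2]) >> 6 */
static void coefs_scalar_ref(unsigned char *cur, int cur_stride,
                             const unsigned char *ref, int ref_stride,
                             const unsigned short *coef, int n)
{
    for (int i = 0; i < n; i++) {
        unsigned short *out = (unsigned short *)(cur + (size_t)i * cur_stride);
        const unsigned char *r0 = ref + (size_t)i * ref_stride;
        const unsigned char *r1 = r0 + ref_stride;
        for (int x = 0; x < 8; x++) {
            uint32_t acc = 32
                + coef[0 * 8 + x] * r0[x] + coef[1 * 8 + x] * r0[x + 2]
                + coef[2 * 8 + x] * r1[x] + coef[3 * 8 + x] * r1[x + 2];
            out[x] = (unsigned short)((acc & 0xFFFF) >> 6);  /* 16-bit wrap, then >> 6 */
        }
    }
}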
Example #2
static inline long
conv_yHalf_yF (const uint16_t *src, float *dst, long samples)
{
  const uint64_t *s_vec;
  __v4sf         *d_vec;

  long n = samples;

  s_vec = (const uint64_t *)src;
  d_vec = (__v4sf *)dst;

  while (n >= 4)
    {
      __m128i in_val = _mm_insert_epi64((__m128i)_mm_setzero_ps(), *s_vec++, 0);
      __v4sf out_val = (__v4sf)_mm_cvtph_ps(in_val);
      _mm_storeu_ps((float *)d_vec++, out_val);
      n -= 4;
    }

  src = (const uint16_t *)s_vec;
  dst = (float *)d_vec;

  while (n)
    {
      __m128i in_val = _mm_insert_epi16((__m128i)_mm_setzero_ps(), *src++, 0);
      __v4sf out_val = (__v4sf)_mm_cvtph_ps(in_val);
      _mm_store_ss(dst++, out_val);
      n -= 1;
    }

  return samples;
}
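A minimal usage sketch (mine, not part of the original source): conv_yHalf_yF consumes IEEE 754 binary16 bit patterns, and the wide path needs SSE4.1 (_mm_insert_epi64) plus F16C (_mm_cvtph_ps), e.g. -msse4.1 -mf16c with GCC/clang:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* binary16 constants: 0x3C00 = 1.0, 0x4000 = 2.0, 0xBC00 = -1.0, 0x3800 = 0.5 */
    uint16_t half[5] = { 0x3C00, 0x4000, 0xBC00, 0x3800, 0x0000 };
    float out[5];
    conv_yHalf_yF(half, out, 5);   /* 5 samples exercise both the vector and tail loops */
    for (int i = 0; i < 5; i++)
        printf("%g\n", out[i]);    /* prints 1 2 -1 0.5 0 */
    return 0;
}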
Example #3
// Lowering to pinsrw requires optimization.
__m128i test_mm_insert_epi16(__m128i A, short B) {
  // DAG-LABEL: test_mm_insert_epi16
  // DAG: [[x:%.*]] = and i32 %{{.*}}, 7
  // DAG: insertelement <8 x i16> %{{.*}}, i32 [[x]]
  //
  // ASM-LABEL: test_mm_insert_epi16
  // ASM: movw
  return _mm_insert_epi16(A, B, 8);
}
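The `and i32 %{{.*}}, 7` check above is the point of this test: clang masks the lane selector, so the out-of-range index 8 wraps to lane 0. A small sketch of that equivalence (assuming this clang lowering; other compilers may reject the out-of-range immediate outright):

#include <emmintrin.h>

static int insert_epi16_index_wraps(void) {
  __m128i a = _mm_setzero_si128();
  __m128i oob = _mm_insert_epi16(a, 42, 8);  /* selector 8 & 7 == 0 */
  __m128i in0 = _mm_insert_epi16(a, 42, 0);
  /* bit-identical under the masked lowering */
  return _mm_movemask_epi8(_mm_cmpeq_epi16(oob, in0)) == 0xffff;
}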
Example #4
static void
rfx_dwt_2d_encode_block_horiz_sse2(INT16* src, INT16* l, INT16* h, int subband_width)
{
	int y;
	int n;
	int first;
	__m128i src_2n;
	__m128i src_2n_1;
	__m128i src_2n_2;
	__m128i h_n;
	__m128i h_n_m;
	__m128i l_n;

	for (y = 0; y < subband_width; y++)
	{
		for (n = 0; n < subband_width; n += 8)
		{
			/* The following 3 Set operations consume more than half of the total DWT processing time! */
			src_2n = _mm_set_epi16(src[14], src[12], src[10], src[8], src[6], src[4], src[2], src[0]);
			src_2n_1 = _mm_set_epi16(src[15], src[13], src[11], src[9], src[7], src[5], src[3], src[1]);
			src_2n_2 = _mm_set_epi16(n == subband_width - 8 ? src[14] : src[16],
				src[14], src[12], src[10], src[8], src[6], src[4], src[2]);

			/* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */

			h_n = _mm_add_epi16(src_2n, src_2n_2);
			h_n = _mm_srai_epi16(h_n, 1);
			h_n = _mm_sub_epi16(src_2n_1, h_n);
			h_n = _mm_srai_epi16(h_n, 1);

			_mm_store_si128((__m128i*) h, h_n);

			h_n_m = _mm_loadu_si128((__m128i*) (h - 1));
			if (n == 0)
			{
				first = _mm_extract_epi16(h_n_m, 1);
				h_n_m = _mm_insert_epi16(h_n_m, first, 0);
			}

			/* l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) */

			l_n = _mm_add_epi16(h_n_m, h_n);
			l_n = _mm_srai_epi16(l_n, 1);
			l_n = _mm_add_epi16(l_n, src_2n);

			_mm_store_si128((__m128i*) l, l_n);

			src += 16;
			l += 8;
			h += 8;
		}
	}
}
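A scalar restatement of the lifting step above (a sketch built from the formulas in the comments, including the mirrored boundaries the SSE2 code constructs with _mm_set_epi16 and _mm_insert_epi16):

#include <stdint.h>
typedef int16_t INT16;   /* as in the surrounding source */

/* half = subband_width; src holds 2*half samples per row */
static void dwt_encode_row_scalar(const INT16 *src, INT16 *l, INT16 *h, int half)
{
    for (int n = 0; n < half; n++) {
        /* mirror src[2n+2] at the right edge */
        INT16 s2 = (n == half - 1) ? src[2 * n] : src[2 * n + 2];
        h[n] = (INT16)((src[2 * n + 1] - ((src[2 * n] + s2) >> 1)) >> 1);

        /* mirror h[n-1] at the left edge */
        INT16 hm = (n == 0) ? h[0] : h[n - 1];
        l[n] = (INT16)(src[2 * n] + ((hm + h[n]) >> 1));
    }
}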
Example #5
static FORCE_INLINE void blur_r6_h_right_sse2(const PixelType *srcp, PixelType *dstp) {
    __m128i avg12 = mm_avg_epu<PixelType>(_mm_loadu_si128((const __m128i *)(srcp - 1)), _mm_loadu_si128((const __m128i *)(srcp - 2)));
    __m128i avg34 = mm_avg_epu<PixelType>(_mm_loadu_si128((const __m128i *)(srcp - 3)), _mm_loadu_si128((const __m128i *)(srcp - 4)));
    __m128i avg56 = mm_avg_epu<PixelType>(_mm_loadu_si128((const __m128i *)(srcp - 5)), _mm_loadu_si128((const __m128i *)(srcp - 6)));

    __m128i avg012 = mm_avg_epu<PixelType>(_mm_loadu_si128((const __m128i *)(srcp)), avg12);
    __m128i avg3456 = mm_avg_epu<PixelType>(avg34, avg56);
    __m128i avg0123456 = mm_avg_epu<PixelType>(avg012, avg3456);
    __m128i avg = mm_avg_epu<PixelType>(avg012, avg0123456);
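    // Ignoring pavg's upward rounding at each step, this cascade weights
    // srcp[0] by 3/8, srcp[-1] and srcp[-2] by 3/16 each, and
    // srcp[-3]..srcp[-6] by 1/16 each.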

    // This is the right edge. Only the highest six pixels are needed.
    if (sizeof(PixelType) == 1) {
        int extra_bytes = *(int16_t *)(dstp + 8);
        avg = _mm_insert_epi16(avg, extra_bytes, 4);
        _mm_storeh_pi((__m64 *)(dstp + 8), _mm_castsi128_ps(avg));
    } else {
        int extra_bytes = dstp[0];
        avg = _mm_insert_epi16(avg, extra_bytes, 0);
        extra_bytes = dstp[1];
        avg = _mm_insert_epi16(avg, extra_bytes, 1);
        _mm_storeu_si128((__m128i *)(dstp), avg);
    }
}
Example #6
void aom_convolve8_add_src_hip_sse2(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const int16_t *filter_x, int x_step_q4,
                                    const int16_t *filter_y, int y_step_q4,
                                    int w, int h) {
  const int bd = 8;
  assert(x_step_q4 == 16 && y_step_q4 == 16);
  assert(!(w & 7));
  (void)x_step_q4;
  (void)y_step_q4;

  uint16_t temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE];
  int intermediate_height = h + SUBPEL_TAPS - 1;
  int i, j;
  const int center_tap = ((SUBPEL_TAPS - 1) / 2);
  const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap;

  const __m128i zero = _mm_setzero_si128();
  // Add an offset to account for the "add_src" part of the convolve function.
  const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3);
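  // Lane 3 holds the center tap of the 8-tap filter, so the addition below
  // folds a full-weight (1 << FILTER_BITS) copy of the source pixel into it.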

  /* Horizontal filter */
  {
    const __m128i coeffs_x =
        _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset);

    // coeffs 0 1 0 1 2 3 2 3
    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
    // coeffs 4 5 4 5 6 7 6 7
    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);

    // coeffs 0 1 0 1 0 1 0 1
    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
    // coeffs 2 3 2 3 2 3 2 3
    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
    // coeffs 4 5 4 5 4 5 4 5
    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
    // coeffs 6 7 6 7 6 7 6 7
    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);

    const __m128i round_const =
        _mm_set1_epi32((1 << (FILTER_BITS - EXTRAPREC_BITS - 1)) +
                       (1 << (bd + FILTER_BITS - 1)));

    for (i = 0; i < intermediate_height; ++i) {
      for (j = 0; j < w; j += 8) {
        const __m128i data =
            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);

        // Filter even-index pixels
        const __m128i src_0 = _mm_unpacklo_epi8(data, zero);
        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
        const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero);
        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
        const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero);
        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
        const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero);
        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);

        __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
                                         _mm_add_epi32(res_2, res_6));
        res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const),
                                  FILTER_BITS - EXTRAPREC_BITS);

        // Filter odd-index pixels
        const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
        const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero);
        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
        const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero);
        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
        const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero);
        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);

        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
                                        _mm_add_epi32(res_3, res_7));
        res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const),
                                 FILTER_BITS - EXTRAPREC_BITS);

        // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
        __m128i res = _mm_packs_epi32(res_even, res_odd);
        res = _mm_min_epi16(_mm_max_epi16(res, zero),
                            _mm_set1_epi16(EXTRAPREC_CLAMP_LIMIT(bd) - 1));
        _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res);
      }
    }
  }

  /* Vertical filter */
  {
    const __m128i coeffs_y =
        _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset);

    // coeffs 0 1 0 1 2 3 2 3
    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
    // coeffs 4 5 4 5 6 7 6 7
    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);

    // coeffs 0 1 0 1 0 1 0 1
    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
    // coeffs 2 3 2 3 2 3 2 3
    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
    // coeffs 4 5 4 5 4 5 4 5
    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
    // coeffs 6 7 6 7 6 7 6 7
    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);

    const __m128i round_const =
        _mm_set1_epi32((1 << (FILTER_BITS + EXTRAPREC_BITS - 1)) -
                       (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1)));

    for (i = 0; i < h; ++i) {
      for (j = 0; j < w; j += 8) {
        // Filter even-index pixels
        const uint16_t *data = &temp[i * MAX_SB_SIZE + j];
        const __m128i src_0 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
                               *(__m128i *)(data + 1 * MAX_SB_SIZE));
        const __m128i src_2 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
                               *(__m128i *)(data + 3 * MAX_SB_SIZE));
        const __m128i src_4 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
                               *(__m128i *)(data + 5 * MAX_SB_SIZE));
        const __m128i src_6 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
                               *(__m128i *)(data + 7 * MAX_SB_SIZE));

        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);

        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
                                               _mm_add_epi32(res_4, res_6));

        // Filter odd-index pixels
        const __m128i src_1 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
                               *(__m128i *)(data + 1 * MAX_SB_SIZE));
        const __m128i src_3 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
                               *(__m128i *)(data + 3 * MAX_SB_SIZE));
        const __m128i src_5 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
                               *(__m128i *)(data + 5 * MAX_SB_SIZE));
        const __m128i src_7 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
                               *(__m128i *)(data + 7 * MAX_SB_SIZE));

        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);

        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
                                              _mm_add_epi32(res_5, res_7));

        // Rearrange pixels back into the order 0 ... 7
        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);

        const __m128i res_lo_round = _mm_srai_epi32(
            _mm_add_epi32(res_lo, round_const), FILTER_BITS + EXTRAPREC_BITS);
        const __m128i res_hi_round = _mm_srai_epi32(
            _mm_add_epi32(res_hi, round_const), FILTER_BITS + EXTRAPREC_BITS);

        const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
        __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);

        __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
        _mm_storel_epi64(p, res_8bit);
      }
    }
  }
}
Example #7
static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk,
                                         const uint8_t* src) {
  rescaler_t* frow = wrk->frow;
  const rescaler_t* const frow_end = frow + wrk->dst_width * wrk->num_channels;
  const int x_add = wrk->x_add;
  int accum = x_add;
  __m128i cur_pixels;

  // SSE2 implementation only works with 16b signed arithmetic at max.
  if (wrk->src_width < 8 || accum >= (1 << 15)) {
    WebPRescalerImportRowExpand_C(wrk, src);
    return;
  }

  assert(!WebPRescalerInputDone(wrk));
  assert(wrk->x_expand);
  if (wrk->num_channels == 4) {
    LoadTwoPixels_SSE2(src, &cur_pixels);
    src += 4;
    while (1) {
      const __m128i mult = _mm_set1_epi32(((x_add - accum) << 16) | accum);
      const __m128i out = _mm_madd_epi16(cur_pixels, mult);
      _mm_storeu_si128((__m128i*)frow, out);
      frow += 4;
      if (frow >= frow_end) break;
      accum -= wrk->x_sub;
      if (accum < 0) {
        LoadTwoPixels_SSE2(src, &cur_pixels);
        src += 4;
        accum += x_add;
      }
    }
  } else {
    int left;
    const uint8_t* const src_limit = src + wrk->src_width - 8;
    LoadEightPixels_SSE2(src, &cur_pixels);
    src += 7;
    left = 7;
    while (1) {
      const __m128i mult = _mm_cvtsi32_si128(((x_add - accum) << 16) | accum);
      const __m128i out = _mm_madd_epi16(cur_pixels, mult);
      assert(sizeof(*frow) == sizeof(uint32_t));
      WebPUint32ToMem((uint8_t*)frow, _mm_cvtsi128_si32(out));
      frow += 1;
      if (frow >= frow_end) break;
      accum -= wrk->x_sub;
      if (accum < 0) {
        if (--left) {
          cur_pixels = _mm_srli_si128(cur_pixels, 2);
        } else if (src <= src_limit) {
          LoadEightPixels_SSE2(src, &cur_pixels);
          src += 7;
          left = 7;
        } else {   // tail
          cur_pixels = _mm_srli_si128(cur_pixels, 2);
          cur_pixels = _mm_insert_epi16(cur_pixels, src[1], 1);
          src += 1;
          left = 1;
        }
        accum += x_add;
      }
    }
  }
  assert(accum == 0);
}
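The `((x_add - accum) << 16) | accum` multiplier above packs the two interpolation weights into every 32-bit lane so that a single _mm_madd_epi16 forms the weighted sum of each 16-bit pixel pair. A generic sketch of the idiom (the helper name is mine; as the guard at the top of the function implies, the weights must stay below 1 << 15 for the signed multiplies to be exact):

#include <emmintrin.h>

/* Each 32-bit lane of `pairs` holds two 16-bit values {a0, a1};
 * the corresponding result lane is a0*w0 + a1*w1. */
static __m128i weighted_pair_sum(__m128i pairs, unsigned short w0, unsigned short w1)
{
    const __m128i weights = _mm_set1_epi32((int)(((unsigned)w1 << 16) | w0));
    return _mm_madd_epi16(pairs, weights);
}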
Example #8
void av1_highbd_wiener_convolve_add_src_ssse3(
    const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,
    ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
    const int16_t *filter_y, int y_step_q4, int w, int h,
    const ConvolveParams *conv_params, int bd) {
  assert(x_step_q4 == 16 && y_step_q4 == 16);
  assert(!(w & 7));
  assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
  (void)x_step_q4;
  (void)y_step_q4;

  const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);

  DECLARE_ALIGNED(16, uint16_t,
                  temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
  int intermediate_height = h + SUBPEL_TAPS - 1;
  int i, j;
  const int center_tap = ((SUBPEL_TAPS - 1) / 2);
  const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap;

  const __m128i zero = _mm_setzero_si128();
  // Add an offset to account for the "add_src" part of the convolve function.
  const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3);

  /* Horizontal filter */
  {
    const __m128i coeffs_x =
        _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset);

    // coeffs 0 1 0 1 2 3 2 3
    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
    // coeffs 4 5 4 5 6 7 6 7
    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);

    // coeffs 0 1 0 1 0 1 0 1
    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
    // coeffs 2 3 2 3 2 3 2 3
    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
    // coeffs 4 5 4 5 4 5 4 5
    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
    // coeffs 6 7 6 7 6 7 6 7
    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);

    const __m128i round_const = _mm_set1_epi32(
        (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1)));

    for (i = 0; i < intermediate_height; ++i) {
      for (j = 0; j < w; j += 8) {
        const __m128i data =
            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
        const __m128i data2 =
            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]);

        // Filter even-index pixels
        const __m128i res_0 = _mm_madd_epi16(data, coeff_01);
        const __m128i res_2 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23);
        const __m128i res_4 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45);
        const __m128i res_6 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67);

        __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
                                         _mm_add_epi32(res_2, res_6));
        res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const),
                                  conv_params->round_0);

        // Filter odd-index pixels
        const __m128i res_1 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01);
        const __m128i res_3 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23);
        const __m128i res_5 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45);
        const __m128i res_7 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67);

        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
                                        _mm_add_epi32(res_3, res_7));
        res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const),
                                 conv_params->round_0);

        // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
        const __m128i maxval =
            _mm_set1_epi16((WIENER_CLAMP_LIMIT(conv_params->round_0, bd)) - 1);
        __m128i res = _mm_packs_epi32(res_even, res_odd);
        res = _mm_min_epi16(_mm_max_epi16(res, zero), maxval);
        _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res);
      }
    }
  }

  /* Vertical filter */
  {
    const __m128i coeffs_y =
        _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset);

    // coeffs 0 1 0 1 2 3 2 3
    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
    // coeffs 4 5 4 5 6 7 6 7
    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);

    // coeffs 0 1 0 1 0 1 0 1
    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
    // coeffs 2 3 2 3 2 3 2 3
    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
    // coeffs 4 5 4 5 4 5 4 5
    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
    // coeffs 6 7 6 7 6 7 6 7
    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);

    const __m128i round_const =
        _mm_set1_epi32((1 << (conv_params->round_1 - 1)) -
                       (1 << (bd + conv_params->round_1 - 1)));

    for (i = 0; i < h; ++i) {
      for (j = 0; j < w; j += 8) {
        // Filter even-index pixels
        const uint16_t *data = &temp[i * MAX_SB_SIZE + j];
        const __m128i src_0 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
                               *(__m128i *)(data + 1 * MAX_SB_SIZE));
        const __m128i src_2 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
                               *(__m128i *)(data + 3 * MAX_SB_SIZE));
        const __m128i src_4 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
                               *(__m128i *)(data + 5 * MAX_SB_SIZE));
        const __m128i src_6 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
                               *(__m128i *)(data + 7 * MAX_SB_SIZE));

        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);

        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
                                               _mm_add_epi32(res_4, res_6));

        // Filter odd-index pixels
        const __m128i src_1 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
                               *(__m128i *)(data + 1 * MAX_SB_SIZE));
        const __m128i src_3 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
                               *(__m128i *)(data + 3 * MAX_SB_SIZE));
        const __m128i src_5 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
                               *(__m128i *)(data + 5 * MAX_SB_SIZE));
        const __m128i src_7 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
                               *(__m128i *)(data + 7 * MAX_SB_SIZE));

        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);

        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
                                              _mm_add_epi32(res_5, res_7));

        // Rearrange pixels back into the order 0 ... 7
        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);

        const __m128i res_lo_round = _mm_srai_epi32(
            _mm_add_epi32(res_lo, round_const), conv_params->round_1);
        const __m128i res_hi_round = _mm_srai_epi32(
            _mm_add_epi32(res_hi, round_const), conv_params->round_1);

        const __m128i maxval = _mm_set1_epi16((1 << bd) - 1);
        __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
        res_16bit = _mm_min_epi16(_mm_max_epi16(res_16bit, zero), maxval);

        __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
        _mm_storeu_si128(p, res_16bit);
      }
    }
  }
}
Example #9
static void
rfx_dwt_2d_decode_block_horiz_sse2(INT16* l, INT16* h, INT16* dst, int subband_width)
{
	int y, n;
	INT16* l_ptr = l;
	INT16* h_ptr = h;
	INT16* dst_ptr = dst;
	int first;
	int last;
	__m128i l_n;
	__m128i h_n;
	__m128i h_n_m;
	__m128i tmp_n;
	__m128i dst_n;
	__m128i dst_n_p;
	__m128i dst1;
	__m128i dst2;

	for (y = 0; y < subband_width; y++)
	{
		/* Even coefficients */
		for (n = 0; n < subband_width; n += 8)
		{
			/* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */
			
			l_n = _mm_load_si128((__m128i*) l_ptr);

			h_n = _mm_load_si128((__m128i*) h_ptr);
			h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - 1));

			if (n == 0)
			{
				first = _mm_extract_epi16(h_n_m, 1);
				h_n_m = _mm_insert_epi16(h_n_m, first, 0);
			}
			
			tmp_n = _mm_add_epi16(h_n, h_n_m);
			tmp_n = _mm_add_epi16(tmp_n, _mm_set1_epi16(1));
			tmp_n = _mm_srai_epi16(tmp_n, 1);
			
			dst_n = _mm_sub_epi16(l_n, tmp_n);
			
			_mm_store_si128((__m128i*) l_ptr, dst_n);
			
			l_ptr += 8;
			h_ptr += 8;
		}

		l_ptr -= subband_width;
		h_ptr -= subband_width;
		
		/* Odd coefficients */
		for (n = 0; n < subband_width; n += 8)
		{
			/* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */
			
			h_n = _mm_load_si128((__m128i*) h_ptr);
			
			h_n = _mm_slli_epi16(h_n, 1);
			
			dst_n = _mm_load_si128((__m128i*) (l_ptr));
			dst_n_p = _mm_loadu_si128((__m128i*) (l_ptr + 1));

			if (n == subband_width - 8)
			{
				last = _mm_extract_epi16(dst_n_p, 6);
				dst_n_p = _mm_insert_epi16(dst_n_p, last, 7);
			}
			
			tmp_n = _mm_add_epi16(dst_n_p, dst_n);
			tmp_n = _mm_srai_epi16(tmp_n, 1);
			
			tmp_n = _mm_add_epi16(tmp_n, h_n);
			
			dst1 = _mm_unpacklo_epi16(dst_n, tmp_n);
			dst2 = _mm_unpackhi_epi16(dst_n, tmp_n);
			
			_mm_store_si128((__m128i*) dst_ptr, dst1);
			_mm_store_si128((__m128i*) (dst_ptr + 8), dst2);
			
			l_ptr += 8;
			h_ptr += 8;
			dst_ptr += 16;
		}
	}
}
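The two passes read back as scalar code like this (a sketch of my reading, with the mirrored boundaries built by the _mm_extract_epi16/_mm_insert_epi16 pairs above; the SSE2 version stores the even outputs over l in place before interleaving, while the sketch writes dst directly):

#include <stdint.h>
typedef int16_t INT16;   /* as in the surrounding source */

static void dwt_decode_row_scalar(const INT16 *l, const INT16 *h, INT16 *dst, int half)
{
    /* even outputs: dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1), with h[-1] := h[0] */
    for (int n = 0; n < half; n++) {
        INT16 hm = (n == 0) ? h[0] : h[n - 1];
        dst[2 * n] = (INT16)(l[n] - ((hm + h[n] + 1) >> 1));
    }
    /* odd outputs: dst[2n+1] = (h[n] << 1) + ((dst[2n] + dst[2n+2]) >> 1),
     * mirroring dst[2n+2] past the right edge */
    for (int n = 0; n < half; n++) {
        INT16 e2 = (n == half - 1) ? dst[2 * n] : dst[2 * n + 2];
        dst[2 * n + 1] = (INT16)((h[n] << 1) + ((dst[2 * n] + e2) >> 1));
    }
}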
Example #10
int
smith_waterman_sse2_word(const unsigned char *     query_sequence,
                         unsigned short *    query_profile_word,
                         const int                 query_length,
                         const unsigned char *     db_sequence,
                         const int                 db_length,
                         unsigned short      gap_open,
                         unsigned short      gap_extend,
                         struct f_struct *   f_str)
{
    int     i, j, k;
    short   score;

    int     cmp;
    int     iter = (query_length + 7) / 8;
    

    __m128i *p;
    __m128i *workspace = (__m128i *) f_str->workspace;

    __m128i E, F, H;

    __m128i v_maxscore;
    __m128i v_gapopen;
    __m128i v_gapextend;

    __m128i v_min;
    __m128i v_minimums;
    __m128i v_temp;

    __m128i *pHLoad, *pHStore;
    __m128i *pE;

    __m128i *pScore;

    /* Load gap opening penalty to all elements of a constant */
    v_gapopen = _mm_setzero_si128();	/* Apple Devel */
    v_gapopen = _mm_insert_epi16 (v_gapopen, gap_open, 0);
    v_gapopen = _mm_shufflelo_epi16 (v_gapopen, 0);
    v_gapopen = _mm_shuffle_epi32 (v_gapopen, 0);

    /* Load gap extension penalty to all elements of a constant */
    v_gapextend = _mm_setzero_si128();	/* Apple Devel */
    v_gapextend = _mm_insert_epi16 (v_gapextend, gap_extend, 0);
    v_gapextend = _mm_shufflelo_epi16 (v_gapextend, 0);
    v_gapextend = _mm_shuffle_epi32 (v_gapextend, 0);

    /* load v_maxscore with the zeros.  since we are using signed */
    /*  math, we will bias the maxscore to -32768 so we have the */
    /*  full range of the short. */
    v_maxscore = _mm_setzero_si128();	/* Apple Devel */
    v_maxscore = _mm_cmpeq_epi16 (v_maxscore, v_maxscore);
    v_maxscore = _mm_slli_epi16 (v_maxscore, 15);

    v_minimums = _mm_shuffle_epi32 (v_maxscore, 0);

    v_min = _mm_shuffle_epi32 (v_maxscore, 0);
    v_min = _mm_srli_si128 (v_min, 14);

    /* Zero out the storage vector */
    k = 2 * iter;

    p = workspace;
    for (i = 0; i < k; i++)
    {
        _mm_store_si128 (p++, v_maxscore);
    }

    pE = workspace;
    pHStore = pE + iter;
    pHLoad = pHStore + iter;

    for (i = 0; i < db_length; ++i)
    {
        /* fetch first data asap. */
        pScore = (__m128i *) query_profile_word + db_sequence[i] * iter;

        /* bias all elements in F to -32768 */
        F = _mm_setzero_si128();	/* Apple Devel */
        F = _mm_cmpeq_epi16 (F, F);
        F = _mm_slli_epi16 (F, 15);

        /* load the next h value */
        H = _mm_load_si128 (pHStore + iter - 1);
        H = _mm_slli_si128 (H, 2);
        H = _mm_or_si128 (H, v_min);

        p = pHLoad;
        pHLoad = pHStore;
        pHStore = p;

        for (j = 0; j < iter; j++)
        {
            /* load E values */
            E = _mm_load_si128 (pE + j);

            /* add score to H */
            H = _mm_adds_epi16 (H, *pScore++);

            /* Update highest score encountered this far */
            v_maxscore = _mm_max_epi16 (v_maxscore, H);

            /* get max from H, E and F */
            H = _mm_max_epi16 (H, E);
            H = _mm_max_epi16 (H, F);

            /* save H values */
            _mm_store_si128 (pHStore + j, H);

            /* subtract the gap open penalty from H */
            H = _mm_subs_epi16 (H, v_gapopen);

            /* update E value */
            E = _mm_subs_epi16 (E, v_gapextend);
            E = _mm_max_epi16 (E, H);

            /* update F value */
            F = _mm_subs_epi16 (F, v_gapextend);
            F = _mm_max_epi16 (F, H);

            /* save E values */
            _mm_store_si128 (pE + j, E);

            /* load the next h value */
            H = _mm_load_si128 (pHLoad + j);
        }

        /* reset pointers to the start of the saved data */
        j = 0;
        H = _mm_load_si128 (pHStore + j);

        /*  the computed F value is for the given column.  since */
        /*  we are at the end, we need to shift the F value over */
        /*  to the next column. */
        F = _mm_slli_si128 (F, 2);
        F = _mm_or_si128 (F, v_min);
        v_temp = _mm_subs_epi16 (H, v_gapopen);
        v_temp = _mm_cmpgt_epi16 (F, v_temp);
        cmp  = _mm_movemask_epi8 (v_temp);

        while (cmp != 0x0000) 
        {
            E = _mm_load_si128 (pE + j);

            H = _mm_max_epi16 (H, F);

            /* save H values */
            _mm_store_si128 (pHStore + j, H);

            /* update E in case the new H value would change it */
            H = _mm_subs_epi16 (H, v_gapopen);
            E = _mm_max_epi16 (E, H);
            _mm_store_si128 (pE + j, E);

            /* update F value */
            F = _mm_subs_epi16 (F, v_gapextend);

            j++;
            if (j >= iter)
            {
                j = 0;
                F = _mm_slli_si128 (F, 2);
                F = _mm_or_si128 (F, v_min);
            }
            H = _mm_load_si128 (pHStore + j);

            v_temp = _mm_subs_epi16 (H, v_gapopen);
            v_temp = _mm_cmpgt_epi16 (F, v_temp);
            cmp  = _mm_movemask_epi8 (v_temp);
        }
    }

    /* find largest score in the v_maxscore vector */
    v_temp = _mm_srli_si128 (v_maxscore, 8);
    v_maxscore = _mm_max_epi16 (v_maxscore, v_temp);
    v_temp = _mm_srli_si128 (v_maxscore, 4);
    v_maxscore = _mm_max_epi16 (v_maxscore, v_temp);
    v_temp = _mm_srli_si128 (v_maxscore, 2);
    v_maxscore = _mm_max_epi16 (v_maxscore, v_temp);

    /* extract the largest score */
    score = _mm_extract_epi16 (v_maxscore, 0);

    /* return largest score biased by 32768 */

    /* fix for Mac OSX clang 4.1 */ 
    /*
#ifdef __clang__
    if (score < 0) score += 32768;
    return score;
#else
    */
    return score + 32768;
    /* #endif */
}
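The insert/shufflelo/shuffle sequence used for v_gapopen and v_gapextend is a classic SSE2 word broadcast; it produces the same bits as _mm_set1_epi16. The byte variant later in this file first duplicates the 8-bit value into both halves of a 16-bit word (dup) and then applies the same idiom. A sketch:

#include <emmintrin.h>

static __m128i broadcast_epi16(short v)
{
    __m128i t = _mm_insert_epi16(_mm_setzero_si128(), v, 0);
    t = _mm_shufflelo_epi16(t, 0);   /* word 0 -> words 0..3 */
    t = _mm_shuffle_epi32(t, 0);     /* dword 0 -> dwords 0..3 */
    return t;                        /* equivalent to _mm_set1_epi16(v) */
}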
Example #11
wchar_t * __cdecl wcsstr (
        const wchar_t * wcs1,
        const wchar_t * wcs2
        )
{
    const wchar_t *stmp1, *stmp2;
    __m128i zero, pattern, characters1, characters2;

    // An empty search string matches everything.
    if (0 == *wcs2)
        return (wchar_t *)wcs1;

    if (__isa_available > __ISA_AVAILABLE_SSE2)
    {
        wchar_t c;
        unsigned i;

        // Load XMM with first characters of wcs2.
        if (XMM_PAGE_SAFE(wcs2))
        {
            pattern = _mm_loadu_si128((__m128i*)wcs2);
        }
        else
        {
            pattern = _mm_setzero_si128();
            c = *(stmp2 = wcs2);
            for (i = 0; i < XMM_CHARS; ++i)
            {
                pattern = _mm_srli_si128(pattern, sizeof(wchar_t));
                pattern = _mm_insert_epi16(pattern, c, (XMM_CHARS-1));
                if (0 != c) c = *++stmp2;
            }
        }

        for(;;)
        {
            // Check for partial match, if none step forward and continue.
            if (XMM_PAGE_SAFE(wcs1))
            {
                characters1 = _mm_loadu_si128((__m128i*)wcs1);
                // If no potential match or end found, try next XMMWORD.
                if (_mm_cmpistra(pattern, characters1, f_srch_sub))
                {
                    wcs1 += XMM_CHARS;
                    continue;
                }
                // If end found there was no match.
                else if (!_mm_cmpistrc(pattern, characters1, f_srch_sub))
                {
                    return NULL;
                }

                // Get position of potential match.
                wcs1 += _mm_cmpistri(pattern, characters1, f_srch_sub);
            }
            else
            {
              // If end of string found there was no match.
              if (0 == *wcs1)
              {
                  return NULL;
              }

              // If current character doesn't match first character
              // of search string try next character.
              if (*wcs1 != *wcs2)
              {
                  ++wcs1;
                  continue;
              }
            }

            // Potential match, compare to check for full match.
            stmp1 = wcs1;
            stmp2 = wcs2;
            for (;;)
            {
                // If next XMMWORD is page-safe for each string
                // do a XMMWORD comparison.
                if (XMM_PAGE_SAFE(stmp1) && XMM_PAGE_SAFE(stmp2))
                {
                    characters1 = _mm_loadu_si128((__m128i*)stmp1);
                    characters2 = _mm_loadu_si128((__m128i*)stmp2);

                    // If unequal then no match found.
                    if (!_mm_cmpistro(characters2, characters1, f_srch_sub))
                    {
                        break;
                    }

                    // If end of search string then match found.
                    else if (_mm_cmpistrs(characters2, characters1, f_srch_sub))
                    {
                        return (wchar_t *)wcs1;
                    }

                    stmp1 += XMM_CHARS;
                    stmp2 += XMM_CHARS;
                    continue;
                }

                // Compare next character.
                else
                {
                    // If end of search string then match found.
                    if (0 == *stmp2)
                    {
                        return (wchar_t *)wcs1;
                    }

                    // If unequal then no match found.
                    if (*stmp1 != *stmp2)
                    {
                        break;
                    }

                    // Character matched - try next character.
                    ++stmp1;
                    ++stmp2;
                }
            }

            // Match not found at current position, try next.
            ++wcs1;
        }
    }
    else if (__isa_available == __ISA_AVAILABLE_SSE2)
    {
        unsigned offset, mask;

        // Build search pattern and zero pattern. Search pattern is
        // XMMWORD with the initial character of the search string
        // in every position. Zero pattern has a zero termination
        // character in every position.

        pattern = _mm_cvtsi32_si128(wcs2[0]);
        pattern = _mm_shufflelo_epi16(pattern, 0);
        pattern = _mm_shuffle_epi32(pattern, 0);
        zero = _mm_setzero_si128();

        // Main loop for searching wcs1.

        for (;;)
        {
            // If XMM check is safe advance wcs1 to the next
            // possible match or end.

            if (XMM_PAGE_SAFE(wcs1))
            {
                characters1 = _mm_loadu_si128((__m128i*)wcs1);
                characters2 = _mm_cmpeq_epi16(characters1, zero);
                characters1 = _mm_cmpeq_epi16(characters1, pattern);
                characters1 = _mm_or_si128(characters1, characters2);
                mask = _mm_movemask_epi8(characters1);

                // If no character match or end found try next XMMWORD.

                if (0 == mask)
                {
                    wcs1 += XMM_CHARS;
                    continue;
                }

                // Advance wcs1 pointer to next possible match or end.

                _BitScanForward(&offset, mask);
                wcs1 += (offset/sizeof(wchar_t));
            }

            // If at the end of wcs1, then no match found.

            if (0 == wcs1[0]) return NULL;

            // If a first-character match is found compare
            // strings to look for match.

            if (wcs2[0] == wcs1[0])
            {
                stmp1 = wcs1;
                stmp2 = wcs2;
                for (;;)
                {
                    // If aligned as specified advance to next
                    // possible difference or wcs2 end.

                    if (XMM_PAGE_SAFE(stmp2) && XMM_PAGE_SAFE(stmp1))
                    {
                        characters1 = _mm_loadu_si128((__m128i*)stmp1);
                        characters2 = _mm_loadu_si128((__m128i*)stmp2);
                        characters1 = _mm_cmpeq_epi16(characters1, characters2);
                        characters2 = _mm_cmpeq_epi16(characters2, zero);
                        characters1 = _mm_cmpeq_epi16(characters1, zero);
                        characters1 = _mm_or_si128(characters1, characters2);
                        mask = _mm_movemask_epi8(characters1);

                        // If mask is zero there is no difference and
                        // wcs2 does not end in this XMMWORD. Continue
                        // with next XMMWORD.

                        if (0 == mask)
                        {
                            stmp1 += XMM_CHARS;
                            stmp2 += XMM_CHARS;
                            continue;
                        }

                        // Advance string pointers to next significant
                        // character.

                        _BitScanForward(&offset, mask);
                        stmp1 += (offset/sizeof(wchar_t));
                        stmp2 += (offset/sizeof(wchar_t));
                    }

                    // If we've reached the end of wcs2 then a match
                    // has been found.

                    if (0 == stmp2[0]) return (wchar_t *)wcs1;

                    // If we've reached a difference then no match
                    // was found.

                    if (stmp1[0] != stmp2[0]) break;

                    // Otherwise advance to next character and try
                    // again.

                    ++stmp1;
                    ++stmp2;
                }
            }

            // Current character wasn't a match, try next character.

            ++wcs1;
        }
    }
    else
    {
        const wchar_t *cp = wcs1;
        const wchar_t *s1, *s2;

        while (*cp)
        {
            s1 = cp;
            s2 = wcs2;

            while ( *s1 && *s2 && !(*s1-*s2) )
                s1++, s2++;

            if (!*s2)
                return (wchar_t *) cp;

            cp++;
        }

        return NULL;
    }
}
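XMM_PAGE_SAFE is not shown in this excerpt. A plausible definition (an assumption, not necessarily the CRT's actual macro) checks that a 16-byte load starting at p cannot cross into the next 4 KiB page, which is what makes the speculative unaligned loads above safe even near the end of a string:

#include <stddef.h>
#include <stdint.h>

#define XMM_SIZE   16
#define XMM_CHARS  (XMM_SIZE / sizeof(wchar_t))
#define PAGE_SIZE  4096

/* true if [p, p + 16) stays within one page */
#define XMM_PAGE_SAFE(p) \
    ((((uintptr_t)(p)) & (PAGE_SIZE - 1)) <= (PAGE_SIZE - XMM_SIZE))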
Example #12
static FORCE_INLINE void warp_mmword_u8_sse2(const uint8_t *srcp, const uint8_t *edgep, uint8_t *dstp, int src_stride, int edge_stride, int height, int x, int y, const __m128i &depth, const __m128i &zero, const __m128i &x_limit_min, const __m128i &x_limit_max, const __m128i &y_limit_min, const __m128i &y_limit_max, const __m128i &word_64, const __m128i &word_127, const __m128i &word_128, const __m128i &word_255, const __m128i &one_stride) {
    int SMAG = 1 << SMAGL;

    // calculate displacement

    __m128i above = _mm_loadl_epi64((const __m128i *)(edgep + x - (y ? edge_stride : 0)));
    __m128i below = _mm_loadl_epi64((const __m128i *)(edgep + x + (y < height - 1 ? edge_stride : 0)));

    __m128i left = _mm_loadl_epi64((const __m128i *)(edgep + x - 1));
    __m128i right = _mm_loadl_epi64((const __m128i *)(edgep + x + 1));

    above = _mm_unpacklo_epi8(above, zero);
    below = _mm_unpacklo_epi8(below, zero);
    left = _mm_unpacklo_epi8(left, zero);
    right = _mm_unpacklo_epi8(right, zero);

    __m128i h = _mm_sub_epi16(left, right);
    __m128i v = _mm_sub_epi16(above, below);

    h = _mm_slli_epi16(h, 7);
    v = _mm_slli_epi16(v, 7);

    h = _mm_mulhi_epi16(h, depth);
    v = _mm_mulhi_epi16(v, depth);

    v = _mm_max_epi16(v, y_limit_min);
    v = _mm_min_epi16(v, y_limit_max);

    __m128i remainder_h = h;
    __m128i remainder_v = v;

    if (SMAGL) {
        remainder_h = _mm_slli_epi16(remainder_h, SMAGL);
        remainder_v = _mm_slli_epi16(remainder_v, SMAGL);
    }

    remainder_h = _mm_and_si128(remainder_h, word_127);
    remainder_v = _mm_and_si128(remainder_v, word_127);

    h = _mm_srai_epi16(h, 7 - SMAGL);
    v = _mm_srai_epi16(v, 7 - SMAGL);

    __m128i xx = _mm_set1_epi32(x << SMAGL);
    xx = _mm_packs_epi32(xx, xx);

    h = _mm_adds_epi16(h, xx);

    remainder_h = _mm_and_si128(remainder_h, _mm_cmpgt_epi16(x_limit_max, h));
    remainder_h = _mm_andnot_si128(_mm_cmpgt_epi16(x_limit_min, h), remainder_h);

    h = _mm_max_epi16(h, x_limit_min);
    h = _mm_min_epi16(h, x_limit_max);

    // h and v contain the displacement now.

    __m128i disp_lo = _mm_unpacklo_epi16(v, h);
    __m128i disp_hi = _mm_unpackhi_epi16(v, h);
    disp_lo = _mm_madd_epi16(disp_lo, one_stride);
    disp_hi = _mm_madd_epi16(disp_hi, one_stride);

    __m128i line0 = _mm_setzero_si128();
    __m128i line1 = _mm_setzero_si128();

    int offset = _mm_cvtsi128_si32(disp_lo);
    disp_lo = _mm_srli_si128(disp_lo, 4);
    line0 = _mm_insert_epi16(line0, *(int16_t *)(srcp + offset), 0);
    line1 = _mm_insert_epi16(line1, *(int16_t *)(srcp + offset + src_stride), 0);

    offset = _mm_cvtsi128_si32(disp_lo);
    disp_lo = _mm_srli_si128(disp_lo, 4);
    line0 = _mm_insert_epi16(line0, *(int16_t *)(srcp + offset + 1 * SMAG), 1);
    line1 = _mm_insert_epi16(line1, *(int16_t *)(srcp + offset + src_stride + 1 * SMAG), 1);

    offset = _mm_cvtsi128_si32(disp_lo);
    disp_lo = _mm_srli_si128(disp_lo, 4);
    line0 = _mm_insert_epi16(line0, *(int16_t *)(srcp + offset + 2 * SMAG), 2);
    line1 = _mm_insert_epi16(line1, *(int16_t *)(srcp + offset + src_stride + 2 * SMAG), 2);

    offset = _mm_cvtsi128_si32(disp_lo);
    disp_lo = _mm_srli_si128(disp_lo, 4);
    line0 = _mm_insert_epi16(line0, *(int16_t *)(srcp + offset + 3 * SMAG), 3);
    line1 = _mm_insert_epi16(line1, *(int16_t *)(srcp + offset + src_stride + 3 * SMAG), 3);

    offset = _mm_cvtsi128_si32(disp_hi);
    disp_hi = _mm_srli_si128(disp_hi, 4);
    line0 = _mm_insert_epi16(line0, *(int16_t *)(srcp + offset + 4 * SMAG), 4);
    line1 = _mm_insert_epi16(line1, *(int16_t *)(srcp + offset + src_stride + 4 * SMAG), 4);

    offset = _mm_cvtsi128_si32(disp_hi);
    disp_hi = _mm_srli_si128(disp_hi, 4);
    line0 = _mm_insert_epi16(line0, *(int16_t *)(srcp + offset + 5 * SMAG), 5);
    line1 = _mm_insert_epi16(line1, *(int16_t *)(srcp + offset + src_stride + 5 * SMAG), 5);

    offset = _mm_cvtsi128_si32(disp_hi);
    disp_hi = _mm_srli_si128(disp_hi, 4);
    line0 = _mm_insert_epi16(line0, *(int16_t *)(srcp + offset + 6 * SMAG), 6);
    line1 = _mm_insert_epi16(line1, *(int16_t *)(srcp + offset + src_stride + 6 * SMAG), 6);

    offset = _mm_cvtsi128_si32(disp_hi);
    disp_hi = _mm_srli_si128(disp_hi, 4);
    line0 = _mm_insert_epi16(line0, *(int16_t *)(srcp + offset + 7 * SMAG), 7);
    line1 = _mm_insert_epi16(line1, *(int16_t *)(srcp + offset + src_stride + 7 * SMAG), 7);

    __m128i left0 = _mm_and_si128(line0, word_255);
    __m128i left1 = _mm_and_si128(line1, word_255);

    __m128i right0 = _mm_srli_epi16(line0, 8);
    __m128i right1 = _mm_srli_epi16(line1, 8);

    left0 = _mm_mullo_epi16(left0, _mm_sub_epi16(word_128, remainder_h));
    left1 = _mm_mullo_epi16(left1, _mm_sub_epi16(word_128, remainder_h));

    right0 = _mm_mullo_epi16(right0, remainder_h);
    right1 = _mm_mullo_epi16(right1, remainder_h);

    line0 = _mm_add_epi16(left0, right0);
    line1 = _mm_add_epi16(left1, right1);

    line0 = _mm_add_epi16(line0, word_64);
    line1 = _mm_add_epi16(line1, word_64);

    line0 = _mm_srai_epi16(line0, 7);
    line1 = _mm_srai_epi16(line1, 7);

    line0 = _mm_mullo_epi16(line0, _mm_sub_epi16(word_128, remainder_v));
    line1 = _mm_mullo_epi16(line1, remainder_v);

    __m128i result = _mm_add_epi16(line0, line1);

    result = _mm_add_epi16(result, word_64);

    result = _mm_srai_epi16(result, 7);

    result = _mm_packus_epi16(result, result);

    _mm_storel_epi64((__m128i *)(dstp + x), result);
}
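SSE2 has no gather instruction, so the eight _mm_insert_epi16 pairs above fetch two bytes per lane from the computed offsets; the tail of the function then blends them as a fixed-point bilinear fetch with 7-bit fractions. A scalar sketch of my reading (hypothetical helper; rem_h/rem_v are the per-lane values of remainder_h/remainder_v in [0, 128)):

#include <stdint.h>

static inline uint8_t warp_bilinear_scalar(const uint8_t *srcp, int src_stride,
                                           int offset, int rem_h, int rem_v)
{
    const uint8_t *p = srcp + offset;
    int top = ((128 - rem_h) * p[0]          + rem_h * p[1]              + 64) >> 7;
    int bot = ((128 - rem_h) * p[src_stride] + rem_h * p[src_stride + 1] + 64) >> 7;
    return (uint8_t)(((128 - rem_v) * top + rem_v * bot + 64) >> 7);
}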
Example #13
int
smith_waterman_sse2_byte(const unsigned char *     query_sequence,
                         unsigned char *     query_profile_byte,
                         const int                 query_length,
                         const unsigned char *     db_sequence,
                         const int                 db_length,
                         unsigned char       bias,
                         unsigned char       gap_open,
                         unsigned char       gap_extend,
                         struct f_struct *   f_str)
{
    int     i, j, k;
    int     score;

    int     dup;
    int     cmp;
    int     iter = (query_length + 15) / 16;
    
    __m128i *p;
    __m128i *workspace = (__m128i *) f_str->workspace;

    __m128i E, F, H;

    __m128i v_maxscore;
    __m128i v_bias;
    __m128i v_gapopen;
    __m128i v_gapextend;

    __m128i v_temp;
    __m128i v_zero;

    __m128i *pHLoad, *pHStore;
    __m128i *pE;

    __m128i *pScore;

    /* Load the bias to all elements of a constant */
    dup    = ((short) bias << 8) | bias;
    v_bias = _mm_setzero_si128();
    v_bias = _mm_insert_epi16 (v_bias, dup, 0);
    v_bias = _mm_shufflelo_epi16 (v_bias, 0);
    v_bias = _mm_shuffle_epi32 (v_bias, 0);

    /* Load gap opening penalty to all elements of a constant */
    dup  = ((short) gap_open << 8) | gap_open;
    v_gapopen = _mm_setzero_si128();
    v_gapopen = _mm_insert_epi16 (v_gapopen, dup, 0);
    v_gapopen = _mm_shufflelo_epi16 (v_gapopen, 0);
    v_gapopen = _mm_shuffle_epi32 (v_gapopen, 0);

    /* Load gap extension penalty to all elements of a constant */
    dup  = ((short) gap_extend << 8) | gap_extend;
    v_gapextend = _mm_setzero_si128();
    v_gapextend = _mm_insert_epi16 (v_gapextend, dup, 0);
    v_gapextend = _mm_shufflelo_epi16 (v_gapextend, 0);
    v_gapextend = _mm_shuffle_epi32 (v_gapextend, 0);

    /* initialize the max score */
    /*     v_maxscore = _mm_xor_si128 (v_maxscore, v_maxscore);  - Apple Devel*/
    v_maxscore = _mm_setzero_si128();	/* Apple Devel */

    /* create a constant of all zeros for comparison */
    /* v_zero = _mm_xor_si128 (v_zero, v_zero);   - Apple Devel */
    v_zero = _mm_setzero_si128();	/* Apple Devel */

    /* Zero out the storage vector */
    k = iter * 2;

    p = workspace;
    for (i = 0; i < k; i++)
    {
        _mm_store_si128 (p++, v_maxscore);
    }

    pE = workspace;
    pHStore = pE + iter;
    pHLoad = pHStore + iter;

    for (i = 0; i < db_length; ++i)
    {
        /* fetch first data asap. */
        pScore = (__m128i *) query_profile_byte + db_sequence[i] * iter;

        /* zero out F value. */
        /* F = _mm_xor_si128 (F, F);  -Apple Devel */
        F = _mm_setzero_si128();	/* Apple Devel */

        /* load the next h value */
        H = _mm_load_si128 (pHStore + iter - 1);
        H = _mm_slli_si128 (H, 1);

        p = pHLoad;
        pHLoad = pHStore;
        pHStore = p;

        for (j = 0; j < iter; j++)
        {
            /* load values E. */
            E = _mm_load_si128 (pE + j);

            /* add score to H */
            H = _mm_adds_epu8 (H, *pScore++);
            H = _mm_subs_epu8 (H, v_bias);

            /* Update highest score encountered this far */
            v_maxscore = _mm_max_epu8 (v_maxscore, H);

            /* get max from H, E and F */
            H = _mm_max_epu8 (H, E);
            H = _mm_max_epu8 (H, F);

            /* save H values */
            _mm_store_si128 (pHStore + j, H);

            /* subtract the gap open penalty from H */
            H = _mm_subs_epu8 (H, v_gapopen);

            /* update E value */
            E = _mm_subs_epu8 (E, v_gapextend);
            E = _mm_max_epu8 (E, H);

            /* update F value */
            F = _mm_subs_epu8 (F, v_gapextend);
            F = _mm_max_epu8 (F, H);

            /* save E values */
            _mm_store_si128 (pE + j, E);

            /* load the next h value */
            H = _mm_load_si128 (pHLoad + j);
        }

        /* reset pointers to the start of the saved data */
        j = 0;
        H = _mm_load_si128 (pHStore + j);

        /*  the computed F value is for the given column.  since */
        /*  we are at the end, we need to shift the F value over */
        /*  to the next column. */
        F = _mm_slli_si128 (F, 1);
        v_temp = _mm_subs_epu8 (H, v_gapopen);
        v_temp = _mm_subs_epu8 (F, v_temp);
        v_temp = _mm_cmpeq_epi8 (v_temp, v_zero);
        cmp  = _mm_movemask_epi8 (v_temp);

        while (cmp != 0xffff) 
        {
            E = _mm_load_si128 (pE + j);

            H = _mm_max_epu8 (H, F);

            /* save H values */
            _mm_store_si128 (pHStore + j, H);

            /* update E in case the new H value would change it */
            H = _mm_subs_epu8 (H, v_gapopen);
            E = _mm_max_epu8 (E, H);
            _mm_store_si128 (pE + j, E);

            /* update F value */
            F = _mm_subs_epu8 (F, v_gapextend);

            j++;
            if (j >= iter)
            {
                j = 0;
                F = _mm_slli_si128 (F, 1);
            }
            H = _mm_load_si128 (pHStore + j);

            v_temp = _mm_subs_epu8 (H, v_gapopen);
            v_temp = _mm_subs_epu8 (F, v_temp);
            v_temp = _mm_cmpeq_epi8 (v_temp, v_zero);
            cmp  = _mm_movemask_epi8 (v_temp);
        }
    }

    /* find largest score in the v_maxscore vector */
    v_temp = _mm_srli_si128 (v_maxscore, 8);
    v_maxscore = _mm_max_epu8 (v_maxscore, v_temp);
    v_temp = _mm_srli_si128 (v_maxscore, 4);
    v_maxscore = _mm_max_epu8 (v_maxscore, v_temp);
    v_temp = _mm_srli_si128 (v_maxscore, 2);
    v_maxscore = _mm_max_epu8 (v_maxscore, v_temp);
    v_temp = _mm_srli_si128 (v_maxscore, 1);
    v_maxscore = _mm_max_epu8 (v_maxscore, v_temp);

    /* store in temporary variable */
    score = _mm_extract_epi16 (v_maxscore, 0);
    score = score & 0x00ff;

    /*  check if we might have overflowed */
    if (score + bias >= 255)
    {
        score = 255;
    }

    /* return largest score */
    return score;
}
Example #14
	inline void FAST::detect9simd( const Image& img, uint8_t threshold, FeatureSetWrapper& features, size_t border )
	{
	#define CHECK_BARRIER(lo, hi, other, flags)		\
		{                                               \
			__m128i diff = _mm_subs_epu8(lo, other);	\
			__m128i diff2 = _mm_subs_epu8(other, hi);	\
			__m128i z = _mm_setzero_si128();			\
			diff = _mm_cmpeq_epi8(diff, z);				\
			diff2 = _mm_cmpeq_epi8(diff2, z);			\
			flags = ~(_mm_movemask_epi8(diff) | (_mm_movemask_epi8(diff2) << 16)); \
		}

		size_t stride;
		const uint8_t * iptr = img.map( &stride );

		int offsets[ 16 ];
		make_offsets( offsets, stride );

		const size_t tripleStride = 3 * stride;

		// The compiler refuses to reserve a register for this
		const __m128i barriers = _mm_set1_epi8( threshold  );

		// xend is the beginning of the last pixels in the row that need to be processed in the normal way
		size_t width = img.width();
		size_t height = img.height();
		size_t xend = width - border - ( width - border ) % 16;
		size_t aligned_start = ( (int)( border / 16 ) + 1 ) << 4;


		const uint8_t* im = iptr;
		im += ( border * stride );
		const uint8_t * ptr;

		for ( size_t y = border; y < height - border; y++ ) {
			ptr = im + border;
			for ( size_t x = border; x < aligned_start; x++ ){
				if( isCorner9( ptr, offsets, threshold ) )
					features( x, y, score9Pixel( ptr, offsets, threshold ) );
				ptr++;
			}

			for ( size_t x = aligned_start; x < xend; x += 16, ptr += 16 ) {
				__m128i lo, hi;
				{
					const __m128i here = _mm_load_si128( (const __m128i*)ptr );
					lo = _mm_subs_epu8( here, barriers );
					hi = _mm_adds_epu8( here, barriers );
				}



				uint32_t ans_0, ans_8, possible;
				{
					__m128i top = _mm_load_si128( ( const __m128i* )( ptr - tripleStride ) );
					__m128i bottom = _mm_load_si128( ( const __m128i* )( ptr + tripleStride ) );

					CHECK_BARRIER( lo, hi, top, ans_0 );
					CHECK_BARRIER( lo, hi, bottom, ans_8 );

					possible = ans_0 | ans_8;

					if ( !possible ){
						continue;
					}
				}

				uint32_t ans_15, ans_1;
				{
					__m128i a = _mm_loadu_si128( ( const __m128i* )( ptr - 1 - tripleStride ) );
					__m128i c = _mm_insert_epi16( _mm_srli_si128( a, 2 ), *( const uint16_t* ) (ptr + 15 - tripleStride), 7 );
					CHECK_BARRIER( lo, hi, a, ans_15 );
					CHECK_BARRIER( lo, hi, c, ans_1 );
					// 8 or (15 and 1 )
					possible &= ans_8 | (ans_15 & ans_1);

					if ( !possible )
						continue;
				}

				uint32_t ans_9, ans_7;
				{
					__m128i d = _mm_loadu_si128( ( const __m128i* )( ptr - 1 + tripleStride ) );
					__m128i f = _mm_insert_epi16( _mm_srli_si128( d, 2 ), *( const uint16_t* )( ptr + 15 + tripleStride ), 7 );
					CHECK_BARRIER( lo, hi, d, ans_9 );
					CHECK_BARRIER( lo, hi, f, ans_7 );
					possible &= ans_9 | ( ans_0 & ans_1 );
					possible &= ans_7 | ( ans_15 & ans_0 );

					if ( !possible )
						continue;
				}

				uint32_t ans_12, ans_4;
				{
					__m128i left = _mm_loadu_si128( ( const __m128i* )( ptr - 3 ) );
					__m128i right = _mm_loadu_si128( ( const __m128i* )( ptr + 3 ) );
					CHECK_BARRIER( lo, hi, left, ans_12 );
					CHECK_BARRIER( lo, hi, right, ans_4 );
					possible &= ans_12 | ( ans_4 & ( ans_1 | ans_7 ) );
					possible &= ans_4 | ( ans_12 & ( ans_9 | ans_15 ) );

					if ( !possible )
						continue;
				}

				uint32_t ans_14, ans_6;
				{
					__m128i ul = _mm_loadu_si128( ( const __m128i* ) ( ptr - 2 - 2 * stride ) );
					__m128i lr = _mm_loadu_si128( ( const __m128i* ) ( ptr + 2 + 2 * stride ) );
					CHECK_BARRIER( lo, hi, ul, ans_14 );
					CHECK_BARRIER( lo, hi, lr, ans_6 );
					{
						const unsigned int ans_6_7 = ans_6 & ans_7;
						possible &= ans_14 | (ans_6_7 & (ans_4 | (ans_8 & ans_9)));
						possible &= ans_1 | (ans_6_7) | ans_12;
					}
					{
						const unsigned int ans_14_15 = ans_14 & ans_15;
						possible &= ans_6 | (ans_14_15 & (ans_12 | (ans_0 & ans_1)));
						possible &= ans_9 | (ans_14_15) | ans_4;
					}

					if ( !possible )
						continue;
				}

				uint32_t ans_10, ans_2;
				{
					__m128i ll = _mm_loadu_si128( ( const __m128i* ) (ptr - 2 + 2 * stride) );
					__m128i ur = _mm_loadu_si128( ( const __m128i* ) (ptr + 2 - 2 * stride) );
					CHECK_BARRIER( lo, hi, ll, ans_10 );
					CHECK_BARRIER( lo, hi, ur, ans_2 );
					{
						const unsigned int ans_1_2 = ans_1 & ans_2;
						possible &= ans_10 | (ans_1_2 & ((ans_0 & ans_15) | ans_4));
						possible &= ans_12 | (ans_1_2) | (ans_6 & ans_7);
					}
					{
						const unsigned int ans_9_10 = ans_9 & ans_10;
						possible &= ans_2 | (ans_9_10 & ((ans_7 & ans_8) | ans_12));
						possible &= ans_4 | (ans_9_10) | (ans_14 & ans_15);
					}
					possible &= ans_8 | ans_14 | ans_2;
					possible &= ans_0 | ans_10 | ans_6;

					if ( !possible )
						continue;
				}

				uint32_t ans_13, ans_5;
				{
					__m128i g = _mm_loadu_si128( ( const __m128i* ) (ptr - 3 - stride ) );
					__m128i l = _mm_loadu_si128( ( const __m128i* ) (ptr + 3 + stride ) );
					CHECK_BARRIER( lo, hi, g, ans_13 );
					CHECK_BARRIER( lo, hi, l, ans_5 );
					const uint32_t ans_15_0 = ans_15 & ans_0;
					const uint32_t ans_7_8 = ans_7 & ans_8;
					{
						const uint32_t ans_12_13 = ans_12 & ans_13;
						possible &= ans_5 | (ans_12_13 & ans_14 & ((ans_15_0) | ans_10));
						possible &= ans_7 | (ans_1 & ans_2) | (ans_12_13);
						possible &= ans_2 | (ans_12_13) | (ans_7_8);
					}
					{
						const uint32_t ans_4_5 = ans_4 & ans_5;
						const uint32_t ans_9_10 = ans_9 & ans_10;
						possible &= ans_13 | (ans_4_5 & ans_6 & ((ans_7_8) | ans_2));
						possible &= ans_15 | (ans_4_5) | (ans_9_10);
						possible &= ans_10 | (ans_4_5) | (ans_15_0);
						possible &= ans_15 | (ans_9_10) | (ans_4_5);
					}

					possible &= ans_8 | (ans_13 & ans_14) | ans_2;
					possible &= ans_0 | (ans_5 & ans_6) | ans_10;

					if ( !possible )
						continue;
				}


				uint32_t ans_11, ans_3;
				{
					__m128i ii = _mm_loadu_si128( ( const __m128i* )( ptr - 3 + stride ) );
					__m128i jj = _mm_loadu_si128( ( const __m128i* )( ptr + 3 - stride ) );
					CHECK_BARRIER( lo, hi, ii, ans_11 );
					CHECK_BARRIER( lo, hi, jj, ans_3 );
					{
						const uint32_t ans_2_3 = ans_2 & ans_3;
						possible &= ans_11 | (ans_2_3 & ans_4 & ((ans_0 & ans_1) | (ans_5 & ans_6)));
						possible &= ans_13 | (ans_7 & ans_8) | (ans_2_3);
						possible &= ans_8 | (ans_2_3) | (ans_13 & ans_14);
					}
					{
						const uint32_t ans_11_12 = ans_11 & ans_12;
						possible &= ans_3 | (ans_10 & ans_11_12 & ((ans_8 & ans_9) | (ans_13 & ans_14)));
						possible &= ans_1 | (ans_11_12) | (ans_6 & ans_7);
						possible &= ans_6 | (ans_0 & ans_1) | (ans_11_12);
					}
					{
						const uint32_t ans_3_4 = ans_3 & ans_4;
						possible &= ans_9 | (ans_3_4) | (ans_14 & ans_15);
						possible &= ans_14 | (ans_8 & ans_9) | (ans_3_4);
					}
					{
						const uint32_t ans_10_11 = ans_10 & ans_11;
						possible &= ans_5 | (ans_15 & ans_0) | (ans_10_11);
						possible &= ans_0 | (ans_10_11) | (ans_5 & ans_6);
					}

					if ( !possible )
						continue;

				}

				possible |= (possible >> 16);

				//if(possible & 0x0f) //Does this make it faster?
				{
					if ( possible & (1 << 0) )
						features( x, y, score9Pixel( ptr, offsets, threshold ) );
					if ( possible & (1 << 1) )
						features( x + 1, y, score9Pixel( ptr + 1, offsets, threshold ) );
					if ( possible & (1 << 2) )
						features( x + 2, y, score9Pixel( ptr + 2, offsets, threshold ) );
					if ( possible & (1 << 3) )
						features( x + 3, y, score9Pixel( ptr + 3, offsets, threshold ) );
					if ( possible & (1 << 4) )
						features( x + 4, y, score9Pixel( ptr + 4, offsets, threshold ) );
					if ( possible & (1 << 5) )
						features( x + 5, y, score9Pixel( ptr + 5, offsets, threshold ) );
					if ( possible & (1 << 6) )
						features( x + 6, y, score9Pixel( ptr + 6, offsets, threshold ) );
					if ( possible & (1 << 7) )
						features( x + 7, y, score9Pixel( ptr + 7, offsets, threshold ) );
				}

				//if(possible & 0xf0) //Does this make it faster?
				{
					if ( possible & (1 << 8) )
						features( x + 8, y, score9Pixel( ptr + 8, offsets, threshold ) );
					if ( possible & (1 << 9) )
						features( x + 9, y, score9Pixel( ptr + 9, offsets, threshold ) );
					if ( possible & (1 << 10) )
						features( x + 10, y, score9Pixel( ptr + 10, offsets, threshold ) );
					if ( possible & (1 << 11) )
						features( x + 11, y, score9Pixel( ptr + 11, offsets, threshold ) );
					if ( possible & (1 << 12) )
						features( x + 12, y, score9Pixel( ptr + 12, offsets, threshold ) );
					if ( possible & (1 << 13) )
						features( x + 13, y, score9Pixel( ptr + 13, offsets, threshold ) );
					if ( possible & (1 << 14) )
						features( x + 14, y, score9Pixel( ptr + 14, offsets, threshold ) );
					if ( possible & (1 << 15) )
						features( x + 15, y, score9Pixel( ptr + 15, offsets, threshold ) );
				}
			}

			for ( size_t x = xend; x < width - border; x++ ){
				if( isCorner9( ptr, offsets, threshold ) )
					features( x, y, score9Pixel( ptr, offsets, threshold ) );
				ptr++;
			}
			im += stride;
		}
		img.unmap( iptr );

	#undef CHECK_BARRIER
	}
Example #15
0
/*
 * Calculate the Smith-Waterman score.
 *
 * This is basically an SSE2 version of Wozniak's vectored implementation, but
 * without a score table. Further, we assume a fixed database and query size,
 * so *nogap and *b_gap must be pre-allocated (the malloc overhead for very
 * small scans is _huge_).
 *
 * NOTE THE FOLLOWING:
 *
 *	1) seqA must be padded with 7 bytes at the beginning and end. The first
 *	   element of seqA should be the first pad byte.
 *
 *	2) seqB must be padded with bytes at the end so that its length is
 *	   a multiple of 8 characters.
 *	   The first element of seqB should be (of course) the first character.
 *
 *	3) seqA and seqB's padding _must_ be different, otherwise our logic will
 *	   consider the padding as matches!
 *
 *      4) There is no _mm_max_epu16 prior to SSE 4! We must use the signed max
 *         function. Unfortunately, this limits our maximum score to 2^15 - 1, or
 *         32767. Since bad things happen if we roll over, our caller must ensure
 *         that this will not happen.
 */
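
/*
 * A minimal sketch of the workaround note 4 refers to (added for
 * illustration, not part of the original source; the helper name is
 * hypothetical): on plain SSE2 an unsigned 16-bit max can be emulated by
 * flipping the sign bit of both operands, taking the signed max, and
 * flipping the sign bit back.
 */
static inline __m128i
max_epu16_sse2(__m128i a, __m128i b)
{
	const __m128i sign = _mm_set1_epi16((short)0x8000);
	/* xor with 0x8000 maps unsigned order onto signed order */
	__m128i r = _mm_max_epi16(_mm_xor_si128(a, sign),
	    _mm_xor_si128(b, sign));
	return _mm_xor_si128(r, sign);
}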
static int
vect_sw_diff_gap(int8_t *seqA, int lena, int8_t *seqB, int lenb,
                 int8_t *ls_seqA, int initbp, bool is_rna)
{
  int i, j, score = 0;
  __m128i v_score, v_zero, v_match, v_mismatch;
  __m128i v_a_gap_ext, v_a_gap_open_ext;
#ifndef v_b_gap_open_ext
  __m128i v_b_gap_ext, v_b_gap_open_ext;
#endif
  __m128i v_a_gap, v_b_gap, v_nogap;
  __m128i v_last_nogap, v_prev_nogap, v_seq_a, v_seq_b;
  __m128i v_tmp;

  /* shut up icc */
  (void)ls_seqA;
  (void)initbp;

#define SET16(a, e7, e6, e5, e4, e3, e2, e1, e0)      \
  _mm_set_epi16((int16_t)a[e7], (int16_t)a[e6], \
                (int16_t)a[e5], (int16_t)a[e4], \
                (int16_t)a[e3], (int16_t)a[e2], \
                (int16_t)a[e1], (int16_t)a[e0])

  v_score		 = _mm_setzero_si128();
  v_zero		 = _mm_setzero_si128();
  v_match		 = SET16((&match), 0, 0, 0, 0, 0, 0, 0, 0);
  v_mismatch	 = SET16((&mismatch), 0, 0, 0, 0, 0, 0, 0, 0);
  v_a_gap_ext	 = SET16((&a_gap_ext), 0, 0, 0, 0, 0, 0, 0, 0);
  v_a_gap_open_ext = SET16((&a_gap_open), 0, 0, 0, 0, 0, 0, 0, 0);
  v_a_gap_open_ext = _mm_add_epi16(v_a_gap_open_ext, v_a_gap_ext);
  v_b_gap_ext	 = SET16((&b_gap_ext), 0, 0, 0, 0, 0, 0, 0, 0);
  v_b_gap_open_ext = SET16((&b_gap_open), 0, 0, 0, 0, 0, 0, 0, 0);
  v_b_gap_open_ext = _mm_add_epi16(v_b_gap_open_ext, v_b_gap_ext);

  for (i = 0; i < lena + 14; i++) {
      nogap[i] = 0;
      b_gap[i] = (int16_t)-b_gap_open;
  }

  for (i = 0; i < (lenb + 7)/8; i++) {
      int k = i * 8;

      v_b_gap = SET16(b_gap, 6, 6, 5, 4, 3, 2, 1, 0);
      v_nogap = SET16(nogap, 6, 6, 5, 4, 3, 2, 1, 0);
      v_seq_a = SET16(seqA, 0, 0, 1, 2, 3, 4, 5, 6);
      v_seq_b = SET16(seqB, k+7, k+6, k+5, k+4, k+3, k+2, k+1, k+0);

      v_a_gap = v_a_gap_ext;
      v_a_gap = _mm_sub_epi16(v_a_gap, v_a_gap_open_ext);

      v_last_nogap = _mm_setzero_si128();
      v_prev_nogap = _mm_setzero_si128();

      for (j = 0; j < (lena + 7); j++) {
          v_b_gap = _mm_slli_si128(v_b_gap, 2);
          v_b_gap = _mm_insert_epi16(v_b_gap, b_gap[j+7], 0);

          v_nogap = _mm_slli_si128(v_nogap, 2);
          v_nogap = _mm_insert_epi16(v_nogap, nogap[j+7], 0);

          v_seq_a = _mm_slli_si128(v_seq_a, 2);
          v_seq_a = _mm_insert_epi16(v_seq_a, seqA[j+7], 0);

          v_tmp = _mm_sub_epi16(v_last_nogap, v_a_gap_open_ext);
          v_a_gap = _mm_sub_epi16(v_a_gap, v_a_gap_ext);
          v_a_gap = _mm_max_epi16(v_a_gap, v_tmp);

          v_tmp = _mm_sub_epi16(v_nogap, v_b_gap_open_ext);
          v_b_gap = _mm_sub_epi16(v_b_gap, v_b_gap_ext);
          v_b_gap = _mm_max_epi16(v_b_gap, v_tmp);

          /* compute the score (v_last_nogap is a tmp variable) */
          v_last_nogap = _mm_cmpeq_epi16(v_seq_a, v_seq_b);
          v_tmp = _mm_and_si128(v_last_nogap, v_match);
          v_last_nogap = _mm_cmpeq_epi16(v_last_nogap, v_zero);
          v_last_nogap = _mm_and_si128(v_last_nogap, v_mismatch);
          v_tmp = _mm_or_si128(v_tmp, v_last_nogap);

          v_last_nogap = _mm_add_epi16(v_prev_nogap, v_tmp);
          v_last_nogap = _mm_max_epi16(v_last_nogap, v_zero);
          v_last_nogap = _mm_max_epi16(v_last_nogap, v_a_gap);
          v_last_nogap = _mm_max_epi16(v_last_nogap, v_b_gap);

          v_prev_nogap = v_nogap;
          v_nogap = v_last_nogap;

          b_gap[j] = (int16_t)_mm_extract_epi16(v_b_gap, 7);
          nogap[j] = (int16_t)_mm_extract_epi16(v_nogap, 7);

          v_score = _mm_max_epi16(v_score, v_last_nogap);
      }
  }

  /*
   * Ugh. Old gcc can't loop and using _mm_store to an int16_t array
   * breaks strict-aliasing rules.
   */
  assert(score == 0);
  score = MAX(score, _mm_extract_epi16(v_score, 0));
  score = MAX(score, _mm_extract_epi16(v_score, 1));
  score = MAX(score, _mm_extract_epi16(v_score, 2));
  score = MAX(score, _mm_extract_epi16(v_score, 3));
  score = MAX(score, _mm_extract_epi16(v_score, 4));
  score = MAX(score, _mm_extract_epi16(v_score, 5));
  score = MAX(score, _mm_extract_epi16(v_score, 6));
  score = MAX(score, _mm_extract_epi16(v_score, 7));

  return (score);
}
Example #16
0
/*
 * Calculate the Smith-Waterman score.
 *
 * This is basically an SSE2 version of Wozniak's vectored implementation, but
 * without a score table. Further, we assume a fixed database and query size,
 * so *nogap and *b_gap must be pre-allocated (the malloc overhead for very
 * small scans is _huge_).
 *
 * NOTE THE FOLLOWING:
 *
 *	1) seqA must be padded with 7 bytes at the beginning and end. The first
 *	   element of seqA should be the first pad byte.
 *
 *	2) seqB must be padded with bytes at the end so that its length is
 *	   a multiple of 8 characters.
 *	   The first element of seqB should be (of course) the first character.
 *
 *	3) seqA and seqB's padding _must_ be different, otherwise our logic will
 *	   consider the padding as matches!
 *
 *      4) There is no _mm_max_epu16 prior to SSE 4! We must use the signed max
 *         function. Unfortunately, this limits our maximum score to 2^15 - 1, or
 *         32767. Since bad things happen if we roll over, our caller must ensure
 *         that this will not happen.
 */
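
/*
 * A hedged sketch of the padding contract in notes 1-3 (added for
 * illustration; the helper name and pad values are assumptions, not part
 * of the original source, and residue codes are assumed non-negative).
 * seqA gets 7 pad bytes on each side, seqB is padded at the end to a
 * multiple of 8, and the two pad values differ so padding can never
 * compare equal and be scored as a match.
 */
static void
pad_sequences_sketch(int8_t *padA, const int8_t *a, int lena,
    int8_t *padB, const int8_t *b, int lenb)
{
	int i;

	for (i = 0; i < 7; i++)		/* 7 pad bytes on each side of seqA */
		padA[i] = padA[7 + lena + i] = -1;
	for (i = 0; i < lena; i++)
		padA[7 + i] = a[i];

	for (i = 0; i < lenb; i++)
		padB[i] = b[i];
	while (i % 8 != 0)		/* pad seqB out to a multiple of 8 */
		padB[i++] = -2;
}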
static int
vect_sw_diff_gap(int8_t *seqA, int lena, int8_t *seqB, int lenb,
    int8_t *ls_seqA, int initbp, bool is_rna)
{
	int i, j, score = 0;
	__m128i v_score, v_zero, v_match, v_mismatch;
	__m128i v_a_gap_ext, v_a_gap_open_ext;
#ifndef v_b_gap_open_ext
	__m128i v_b_gap_ext, v_b_gap_open_ext;
#endif
	__m128i v_a_gap, v_b_gap, v_nogap;
	__m128i v_last_nogap, v_prev_nogap, v_seq_a, v_seq_b;
	__m128i v_tmp;

	/* shut up icc */
	(void)ls_seqA;
	(void)initbp;

#define SET16(a, e7, e6, e5, e4, e3, e2, e1, e0)      \
	_mm_set_epi16((int16_t)a[e7], (int16_t)a[e6], \
		      (int16_t)a[e5], (int16_t)a[e4], \
		      (int16_t)a[e3], (int16_t)a[e2], \
		      (int16_t)a[e1], (int16_t)a[e0])

	v_score		 = _mm_setzero_si128();
	v_zero		 = _mm_setzero_si128();
	v_match		 = SET16((&match), 0, 0, 0, 0, 0, 0, 0, 0);
	v_mismatch	 = SET16((&mismatch), 0, 0, 0, 0, 0, 0, 0, 0);
	v_a_gap_ext	 = SET16((&a_gap_ext), 0, 0, 0, 0, 0, 0, 0, 0);
	v_a_gap_open_ext = SET16((&a_gap_open), 0, 0, 0, 0, 0, 0, 0, 0);
	v_a_gap_open_ext = _mm_add_epi16(v_a_gap_open_ext, v_a_gap_ext);
	v_b_gap_ext	 = SET16((&b_gap_ext), 0, 0, 0, 0, 0, 0, 0, 0);
	v_b_gap_open_ext = SET16((&b_gap_open), 0, 0, 0, 0, 0, 0, 0, 0);
	v_b_gap_open_ext = _mm_add_epi16(v_b_gap_open_ext, v_b_gap_ext);

	for (i = 0; i < lena + 14; i++) {
		nogap[i] = 0;
		b_gap[i] = (int16_t)-b_gap_open;
	}

	/*
	 * When using colour space reads, we must handle the first row
	 * specially. This is because the read will begin with some marker
	 * base, which will affect matching against the genome.
	 *
	 * For 25mer reads, this actually makes things faster, because our
	 * vectorised portion becomes evenly divisible by 8 again. Yey.
	 */
	if (use_colours) {
		int a_gap, prev_nogap, last_nogap;

		a_gap = -a_gap_open;
		last_nogap = prev_nogap = 0;
		for (i = 7; i < (lena + 7); i++) {
			int a, ms;

			a_gap = MAX((last_nogap - a_gap_open - a_gap_ext),
			    (a_gap - a_gap_ext));
			b_gap[i] = (uint16_t)MAX((nogap[i] - b_gap_open - b_gap_ext),
			    (b_gap[i] - b_gap_ext));

			a = lstocs(ls_seqA[i], initbp, is_rna);
			ms = (a == seqB[0]) ? match : mismatch;

			last_nogap = MAX((prev_nogap + ms), 0);
			last_nogap = MAX(last_nogap, a_gap);
			last_nogap = MAX(last_nogap, b_gap[i]);
			prev_nogap = nogap[i];
			nogap[i] = (uint16_t)last_nogap;
			score = MAX(score, last_nogap);
		}

		v_score = SET16((&score), 0, 0, 0, 0, 0, 0, 0, 0);
		score = 0;
		seqB++;
		lenb--;

		assert(lenb != 0);
	}

	for (i = 0; i < (lenb + 7)/8; i++) {
		int k = i * 8;

		v_b_gap = SET16(b_gap, 6, 6, 5, 4, 3, 2, 1, 0);
		v_nogap = SET16(nogap, 6, 6, 5, 4, 3, 2, 1, 0);
		v_seq_a = SET16(seqA, 0, 0, 1, 2, 3, 4, 5, 6);
		v_seq_b = SET16(seqB, k+7, k+6, k+5, k+4, k+3, k+2, k+1, k+0);

		v_a_gap = v_a_gap_ext;
		v_a_gap = _mm_sub_epi16(v_a_gap, v_a_gap_open_ext);

		v_last_nogap = _mm_setzero_si128();
		v_prev_nogap = _mm_setzero_si128();

		for (j = 0; j < (lena + 7); j++) {
			v_b_gap = _mm_slli_si128(v_b_gap, 2);
			v_b_gap = _mm_insert_epi16(v_b_gap, b_gap[j+7], 0);

			v_nogap = _mm_slli_si128(v_nogap, 2);
			v_nogap = _mm_insert_epi16(v_nogap, nogap[j+7], 0);

			v_seq_a = _mm_slli_si128(v_seq_a, 2);
			v_seq_a = _mm_insert_epi16(v_seq_a, seqA[j+7], 0);

			v_tmp = _mm_sub_epi16(v_last_nogap, v_a_gap_open_ext);
			v_a_gap = _mm_sub_epi16(v_a_gap, v_a_gap_ext);
			v_a_gap = _mm_max_epi16(v_a_gap, v_tmp);

			v_tmp = _mm_sub_epi16(v_nogap, v_b_gap_open_ext);
			v_b_gap = _mm_sub_epi16(v_b_gap, v_b_gap_ext);
			v_b_gap = _mm_max_epi16(v_b_gap, v_tmp);

			/* compute the score (v_last_nogap is a tmp variable) */
			v_last_nogap = _mm_cmpeq_epi16(v_seq_a, v_seq_b);
			v_tmp = _mm_and_si128(v_last_nogap, v_match);
			v_last_nogap = _mm_cmpeq_epi16(v_last_nogap, v_zero);
			v_last_nogap = _mm_and_si128(v_last_nogap, v_mismatch);
			v_tmp = _mm_or_si128(v_tmp, v_last_nogap);

			v_last_nogap = _mm_add_epi16(v_prev_nogap, v_tmp);
			v_last_nogap = _mm_max_epi16(v_last_nogap, v_zero);
			v_last_nogap = _mm_max_epi16(v_last_nogap, v_a_gap);
			v_last_nogap = _mm_max_epi16(v_last_nogap, v_b_gap);
			
			v_prev_nogap = v_nogap;
			v_nogap = v_last_nogap;

			b_gap[j] = (int16_t)_mm_extract_epi16(v_b_gap, 7);
			nogap[j] = (int16_t)_mm_extract_epi16(v_nogap, 7);

			v_score = _mm_max_epi16(v_score, v_last_nogap);
		}
	}

	/*
	 * Ugh. Old gcc can't loop and using _mm_store to an int16_t array
	 * breaks strict-aliasing rules.
	 */
	assert(score == 0);
	score = MAX(score, _mm_extract_epi16(v_score, 0));
	score = MAX(score, _mm_extract_epi16(v_score, 1));
	score = MAX(score, _mm_extract_epi16(v_score, 2));
	score = MAX(score, _mm_extract_epi16(v_score, 3));
	score = MAX(score, _mm_extract_epi16(v_score, 4));
	score = MAX(score, _mm_extract_epi16(v_score, 5));
	score = MAX(score, _mm_extract_epi16(v_score, 6));
	score = MAX(score, _mm_extract_epi16(v_score, 7));

	return (score);
}
int
global_sse2_word(int                  queryLength,
                 unsigned short      *profile,
                 const unsigned char *dbSeq,
                 int                  dbLength,
                 unsigned short       gapOpen,
                 unsigned short       gapExtend,
                 unsigned short       ceiling,
                 struct f_struct     *f_str)
{
  int     i, j;

  int     score;
  int     scale;
  int     temp;
  int     distance;

  int     offset;
  int     position;

  int     cmp;
  int     iter;
    
  __m128i *pvH;
  __m128i *pvE;

  __m128i vE, vF, vH;
  __m128i vHNext;
  __m128i vFPrev;

  __m128i vGapOpen;
  __m128i vGapExtend;
  __m128i vCeiling;

  __m128i vScale;
  __m128i vScaleAmt;
  __m128i vScaleTmp;

  __m128i vTemp;
  __m128i vNull;

  __m128i *pvScore;

  scale = 0;
  iter = (queryLength + 7) / 8;
  offset = (queryLength - 1) % iter;
  position = 7 - (queryLength - 1) / iter;
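
  /* The query is striped across the eight 16-bit lanes: iter stripes
     cover queryLength cells, offset is the stripe holding the last query
     residue, and position is the number of lane shifts needed to bring
     that residue into the top lane for the final extract. */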

  pvH = (__m128i *)f_str->workspace;
  pvE = pvH + iter;

  /* Load gap opening penalty to all elements of a constant */
  vGapOpen = _mm_setzero_si128();	/* transferred from Apple Devel smith_waterman_sse2.c fix */
  vGapOpen = _mm_insert_epi16 (vGapOpen, gapOpen, 0);
  vGapOpen = _mm_shufflelo_epi16 (vGapOpen, 0);
  vGapOpen = _mm_shuffle_epi32 (vGapOpen, 0);
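  /* (The insert / shufflelo / shuffle_epi32 sequence is the usual SSE2
     broadcast idiom, equivalent to _mm_set1_epi16((short)gapOpen); the
     extension penalty and ceiling below are splatted the same way.) */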

  /* Load gap extension penalty to all elements of a constant */
  vGapExtend = _mm_setzero_si128();	/* transferred from Apple Devel smith_waterman_sse2.c fix */
  vGapExtend = _mm_insert_epi16 (vGapExtend, gapExtend, 0);
  vGapExtend = _mm_shufflelo_epi16 (vGapExtend, 0);
  vGapExtend = _mm_shuffle_epi32 (vGapExtend, 0);

  /* Generate the ceiling before scaling */
  vTemp = _mm_setzero_si128();	/* transferred from Apple Devel smith_waterman_sse2.c fix */
  vTemp = _mm_insert_epi16 (vTemp, ceiling, 0);
  vTemp = _mm_shufflelo_epi16 (vTemp, 0);
  vTemp = _mm_shuffle_epi32 (vTemp, 0);
  vCeiling = _mm_cmpeq_epi16 (vTemp, vTemp);
  vCeiling = _mm_srli_epi16 (vCeiling, 1);
  vCeiling = _mm_subs_epi16 (vCeiling, vTemp);
  vCeiling = _mm_subs_epi16 (vCeiling, vGapOpen);

  vNull = _mm_cmpeq_epi16 (vTemp, vTemp);
  vNull = _mm_slli_epi16 (vNull, 15);
  vScaleAmt = _mm_xor_si128 (vNull, vNull);
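  /* vNull now holds 0x8000 in every 16-bit lane: it is the bias that
     maps unsigned values into the signed range expected by
     _mm_max_epi16, and doubles as the smallest ("minus infinity") cell
     value. vScaleAmt starts out as zero. */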

  /* Zero out the storage vector */
  vTemp = _mm_adds_epi16 (vNull, vGapOpen);
  for (i = 0; i < iter; i++) {
    _mm_store_si128 (pvH + i, vTemp);
    _mm_store_si128 (pvE + i, vNull);
  }

  /* initialize F */
  vF = vNull;
  vFPrev = vNull;

  /* load and scale H for the next round */
  vTemp = _mm_srli_si128 (vGapOpen, 14);
  vH = _mm_load_si128 (pvH + iter - 1);
  vH = _mm_adds_epi16 (vH, vTemp);

  for (i = 0; i < dbLength; ++i) {
    /* fetch first data asap. */
    pvScore = (__m128i *) profile + dbSeq[i] * iter;

    vF = vNull;

    vH = _mm_max_epi16 (vH, vFPrev);
    for (j = 0; j < iter; j++) {
      /* correct H from the previous columns F */
      vHNext = _mm_load_si128 (pvH + j);
      vHNext = _mm_max_epi16 (vHNext, vFPrev);

      /* load and correct E value */
      vE = _mm_load_si128 (pvE + j);
      vTemp = _mm_subs_epi16 (vHNext, vGapOpen);
      vE = _mm_max_epi16 (vE, vTemp);
      _mm_store_si128 (pvE + j, vE);

      /* add score to vH */
      vH = _mm_adds_epi16 (vH, *pvScore++);

      /* get max from vH, vE and vF */
      vH = _mm_max_epi16 (vH, vE);
      vH = _mm_max_epi16 (vH, vF);
      _mm_store_si128 (pvH + j, vH);

      /* update vF value */
      vH = _mm_subs_epi16 (vH, vGapOpen);
      vF = _mm_max_epi16 (vF, vH);

      /* load the next h values */
      vH = vHNext;
    }

    /* check if we need to scale before the next round */
    vTemp = _mm_cmpgt_epi16 (vF, vCeiling);
    cmp  = _mm_movemask_epi8 (vTemp);

    /* broadcast F values */
    vF = _mm_xor_si128 (vF, vNull);

    vTemp  = _mm_slli_si128 (vF, 2);
    vTemp = _mm_subs_epu16 (vTemp, vScaleAmt);
    vF = max_epu16 (vF, vTemp);

    vTemp  = _mm_slli_si128 (vF, 4);
    vScaleTmp = _mm_slli_si128 (vScaleAmt, 2);
    vScaleTmp = _mm_adds_epu16 (vScaleTmp, vScaleAmt);
    vTemp = _mm_subs_epu16 (vTemp, vScaleTmp);
    vF = max_epu16 (vF, vTemp);

    vTemp = _mm_slli_si128 (vScaleTmp, 4);
    vScaleTmp = _mm_adds_epu16 (vScaleTmp, vTemp);
    vTemp  = _mm_slli_si128 (vF, 8);
    vTemp = _mm_subs_epu16 (vTemp, vScaleTmp);
    vF = max_epu16 (vF, vTemp);
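
    /* The three shift/subtract/max passes above form a log-time prefix
       maximum: each doubling pass lets F values flow toward higher
       lanes, discounted by the scale amounts accumulated in vScaleAmt,
       so F is fully propagated in three steps instead of seven. */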

    /* scale if necessary */
    if (cmp != 0x0000) {
      __m128i vScale1;
      __m128i vScale2;

      vScale = _mm_slli_si128 (vF, 2);
      vScale = _mm_subs_epu16 (vScale, vGapOpen);
      vScale = _mm_subs_epu16 (vScale, vScaleAmt);

      vTemp = _mm_slli_si128 (vScale, 2);
      vTemp = _mm_subs_epu16 (vScale, vTemp);
      vScaleAmt = _mm_adds_epu16 (vScaleAmt, vTemp);
      vTemp = _mm_slli_si128 (vScale, 2);
      vTemp = _mm_subs_epu16 (vTemp, vScale);
      vScaleAmt = _mm_subs_epu16 (vScaleAmt, vTemp);

      /* rescale the previous F */
      vF = _mm_subs_epu16 (vF, vScale);

      /* check if we can continue in signed 16-bits */
      vTemp = _mm_xor_si128 (vF, vNull);
      vTemp = _mm_cmpgt_epi16 (vTemp, vCeiling);
      cmp  = _mm_movemask_epi8 (vTemp);
      if (cmp != 0x0000) {
        return OVERFLOW_SCORE;
      }

      vTemp   = _mm_adds_epi16 (vCeiling, vCeiling);
      vScale1 = _mm_subs_epu16 (vScale, vTemp);
      vScale2 = _mm_subs_epu16 (vScale, vScale1);

      /* scale all the vectors */
      for (j = 0; j < iter; j++) {
        /* load H and E */
        vH = _mm_load_si128 (pvH + j);
        vE = _mm_load_si128 (pvE + j);

        /* get max from vH, vE and vF */
        vH = _mm_subs_epi16 (vH, vScale1);
        vH = _mm_subs_epi16 (vH, vScale2);
        vE = _mm_subs_epi16 (vE, vScale1);
        vE = _mm_subs_epi16 (vE, vScale2);

        /* save the H and E */
        _mm_store_si128 (pvH + j, vH);
        _mm_store_si128 (pvE + j, vE);
      }

      vScale = vScaleAmt;
      for (j = 0; j < position; ++j) {
        vScale = _mm_slli_si128 (vScale, 2);
      }

      /* calculate the final scaling amount */
      vTemp   = _mm_xor_si128 (vTemp, vTemp);
      vScale1 = _mm_unpacklo_epi16 (vScale, vTemp);
      vScale2 = _mm_unpackhi_epi16 (vScale, vTemp);
      vScale  = _mm_add_epi32 (vScale1, vScale2);
      vTemp = _mm_srli_si128 (vScale, 8);
      vScale = _mm_add_epi32 (vScale, vTemp);
      vTemp = _mm_srli_si128 (vScale, 4);
      vScale = _mm_add_epi32 (vScale, vTemp);
      scale = (int) (unsigned short) _mm_extract_epi16 (vScale, 0);
      temp  = (int) (unsigned short) _mm_extract_epi16 (vScale, 1);
      scale = scale + (temp << 16);
    }

    /* scale the F value for the next round */
    vFPrev = _mm_slli_si128 (vF, 2);
    vFPrev = _mm_subs_epu16 (vFPrev, vScaleAmt);
    vFPrev = _mm_xor_si128 (vFPrev, vNull);

    /* load and scale H for the next round */
    vH = _mm_load_si128 (pvH + iter - 1);
    vH = _mm_xor_si128 (vH, vNull);
    vH = _mm_slli_si128 (vH, 2);
    vH = _mm_subs_epu16 (vH, vScaleAmt);
    vH = _mm_insert_epi16 (vH, gapOpen, 0);
    vH = _mm_xor_si128 (vH, vNull);
  }

  vH = _mm_load_si128 (pvH + offset);
  vH = _mm_max_epi16 (vH, vFPrev);
  for (j = 0; j < position; ++j) {
    vH = _mm_slli_si128 (vH, 2);
  }
  score = (int) (signed short) _mm_extract_epi16 (vH, 7);
  score = score + SHORT_BIAS;

  /* return largest score */
  distance = (queryLength + dbLength) * gapExtend;
  score = score - (gapOpen * 2) - distance + scale;

  return score;
}
int
global_sse2_byte(int                  queryLength,
                 unsigned char       *profile,
                 const unsigned char *dbSeq,
                 int                  dbLength,
                 unsigned short       gapOpen,
                 unsigned short       gapExtend,
                 unsigned short       ceiling,
                 unsigned short       bias,
                 struct f_struct     *f_str)
{
  int     i, j;

  int     score;
  int     scale;
  int     distance;

  int     offset;
  int     position;

  int     dup;
  int     cmp;
  int     iter;
    
  __m128i *pvH;
  __m128i *pvE;

  __m128i vE, vF, vH;
  __m128i vHInit;
  __m128i vHNext;
  __m128i vFPrev;

  __m128i vBias;
  __m128i vGapOpen;
  __m128i vGapExtend;
  __m128i vCeiling;

  __m128i vScale;
  __m128i vScaleAmt;
  __m128i vScaleTmp;

  __m128i vTemp;
  __m128i vNull;

  __m128i *pvScore;

  scale = 0;
  iter = (queryLength + 15) / 16;
  offset = (queryLength - 1) % iter;
  position = 15 - (queryLength - 1) / iter;

  pvH = (__m128i *)f_str->workspace;
  pvE = pvH + iter;

  /* Load the bias to all elements of a constant */
  dup    = (bias << 8) | (bias & 0x00ff);
  vBias = _mm_setzero_si128();	/* initialize cf Apple Devel smith_waterman_sse2.c */
  vBias = _mm_insert_epi16 (vBias, dup, 0);
  vBias = _mm_shufflelo_epi16 (vBias, 0);
  vBias = _mm_shuffle_epi32 (vBias, 0);
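
  /* dup packs the 8-bit bias into both halves of a 16-bit word, so the
     16-bit broadcast above leaves bias in all sixteen byte lanes
     (equivalent to _mm_set1_epi8((char)bias)); gapOpen, gapExtend and
     ceiling are splatted the same way below. */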

  /* Load gap opening penalty to all elements of a constant */
  dup      = (gapOpen << 8) | (gapOpen & 0x00ff);
  vGapOpen = _mm_setzero_si128();	/* initialize cf Apple Devel smith_waterman_sse2.c */
  vGapOpen = _mm_insert_epi16 (vGapOpen, dup, 0);
  vGapOpen = _mm_shufflelo_epi16 (vGapOpen, 0);
  vGapOpen = _mm_shuffle_epi32 (vGapOpen, 0);

  /* Load gap extension penalty to all elements of a constant */
  dup    = (gapExtend << 8) | (gapExtend & 0x00ff);
  vGapExtend = _mm_setzero_si128();	/* initialize cf Apple Devel smith_waterman_sse2.c */
  vGapExtend = _mm_insert_epi16 (vGapExtend, dup, 0);
  vGapExtend = _mm_shufflelo_epi16 (vGapExtend, 0);
  vGapExtend = _mm_shuffle_epi32 (vGapExtend, 0);

  /* Generate the ceiling before scaling */
  dup    = (ceiling << 8) | (ceiling & 0x00ff);
  vTemp = _mm_setzero_si128();	/* initialize cf Apple Devel smith_waterman_sse2.c */
  vTemp = _mm_insert_epi16 (vTemp, dup, 0);
  vTemp = _mm_shufflelo_epi16 (vTemp, 0);
  vTemp = _mm_shuffle_epi32 (vTemp, 0);
  vCeiling = _mm_cmpeq_epi8 (vTemp, vTemp);
  vCeiling = _mm_subs_epu8 (vCeiling, vTemp);
  vCeiling = _mm_subs_epu8 (vCeiling, vGapOpen);

  /* since we want to use the full range, zero is redefined as */
  /* 2 * gapOpen.  the lowest scaled score will be an insert followed */
  /* by a delete. */
  vHInit = _mm_srli_si128 (vGapOpen, 15);
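  /* vHInit keeps gapOpen in byte lane 0 only (the 15-byte shift discards
     everything else); it is added twice when initializing the
     wrapped-around H below, re-establishing the 2 * gapOpen zero point. */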

  /* vNull = _mm_xor_si128 (vNull, vNull); */
  vNull = _mm_setzero_si128();	/* initialize cf Apple Devel smith_waterman_sse2.c */
  vScaleAmt = vNull;

  /* Zero out the storage vector */
  for (i = 0; i < iter; i++) {
    _mm_store_si128 (pvH + i, vGapOpen);
    _mm_store_si128 (pvE + i, vNull);
  }

  /* initialize F */
  vF = vNull;
  vFPrev = vNull;

  /* load and scale H for the next round */
  vH = _mm_load_si128 (pvH + iter - 1);
  vH = _mm_slli_si128 (vH, 1);
  vH = _mm_adds_epu8 (vH, vHInit);
  vH = _mm_adds_epu8 (vH, vHInit);

  for (i = 0; i < dbLength; ++i) {
    /* fetch first data asap. */
    pvScore = (__m128i *) profile + dbSeq[i] * iter;

    vF = _mm_xor_si128 (vF, vF);

    vH = _mm_max_epu8 (vH, vFPrev);
    for (j = 0; j < iter; j++) {
      /* correct H from the previous columns F */
      vHNext = _mm_load_si128 (pvH + j);
      vHNext = _mm_max_epu8 (vHNext, vFPrev);

      /* load and correct E value */
      vE = _mm_load_si128 (pvE + j);
      vTemp = _mm_subs_epu8 (vHNext, vGapOpen);
      vE = _mm_max_epu8 (vE, vTemp);
      _mm_store_si128 (pvE + j, vE);

      /* add score to vH */
      vH = _mm_adds_epu8 (vH, *pvScore++);
      vH = _mm_subs_epu8 (vH, vBias);

      /* get max from vH, vE and vF */
      vH = _mm_max_epu8 (vH, vE);
      vH = _mm_max_epu8 (vH, vF);
      _mm_store_si128 (pvH + j, vH);

      /* update vF value */
      vH = _mm_subs_epu8 (vH, vGapOpen);
      vF = _mm_max_epu8 (vF, vH);

      /* load the next h values */
      vH = vHNext;
    }

    /* check if we need to scale before the next round */
    vTemp = _mm_subs_epu8 (vCeiling, vF);
    vTemp = _mm_cmpeq_epi8 (vTemp, vNull);
    cmp  = _mm_movemask_epi8 (vTemp);

    /* broadcast F values */
    vTemp  = _mm_slli_si128 (vF, 1);
    vTemp = _mm_subs_epu8 (vTemp, vScaleAmt);
    vF = _mm_max_epu8 (vF, vTemp);

    vScaleTmp = _mm_slli_si128 (vScaleAmt, 1);
    vScaleTmp = _mm_adds_epu8 (vScaleTmp, vScaleAmt);
    vTemp  = _mm_slli_si128 (vF, 2);
    vTemp = _mm_subs_epu8 (vTemp, vScaleTmp);
    vF = _mm_max_epu8 (vF, vTemp);

    vTemp = _mm_slli_si128 (vScaleTmp, 2);
    vScaleTmp = _mm_adds_epu8 (vScaleTmp, vTemp);
    vTemp  = _mm_slli_si128 (vF, 4);
    vTemp = _mm_subs_epu8 (vTemp, vScaleTmp);
    vF = _mm_max_epu8 (vF, vTemp);

    vTemp = _mm_slli_si128 (vScaleTmp, 4);
    vScaleTmp = _mm_adds_epu8 (vScaleTmp, vTemp);
    vTemp  = _mm_slli_si128 (vF, 8);
    vTemp = _mm_subs_epu8 (vTemp, vScaleTmp);
    vF = _mm_max_epu8 (vF, vTemp);

    /* scale if necessary */
    if (cmp != 0x0000) {
      vScale = _mm_slli_si128 (vF, 1);
      vScale = _mm_subs_epu8 (vScale, vGapOpen);
      vScale = _mm_subs_epu8 (vScale, vScaleAmt);

      vTemp = _mm_slli_si128 (vScale, 1);
      vTemp = _mm_subs_epu8 (vScale, vTemp);
      vScaleAmt = _mm_adds_epu8 (vScaleAmt, vTemp);
      vTemp = _mm_slli_si128 (vScale, 1);
      vTemp = _mm_subs_epu8 (vTemp, vScale);
      vScaleAmt = _mm_subs_epu8 (vScaleAmt, vTemp);

      /* rescale the previous F */
      vF = _mm_subs_epu8 (vF, vScale);

      /* check if we can continue in 8-bits */
      vTemp = _mm_subs_epu8 (vCeiling, vF);
      vTemp = _mm_cmpeq_epi8 (vTemp, vNull);
      cmp  = _mm_movemask_epi8 (vTemp);
      if (cmp != 0x0000) {
        return OVERFLOW_SCORE;
      }

      /* scale all the vectors */
      for (j = 0; j < iter; j++) {
        /* load H and E */
        vH = _mm_load_si128 (pvH + j);
        vE = _mm_load_si128 (pvE + j);

        /* get max from vH, vE and vF */
        vH = _mm_subs_epu8 (vH, vScale);
        vE = _mm_subs_epu8 (vE, vScale);

        /* save the H and E */
        _mm_store_si128 (pvH + j, vH);
        _mm_store_si128 (pvE + j, vE);
      }

      /* calculate the final scaling amount */
      vScale = vScaleAmt;
      for (j = 0; j < position; ++j) {
        vScale = _mm_slli_si128 (vScale, 1);
      }
      vTemp = _mm_unpacklo_epi8 (vScale, vNull);
      vScale = _mm_unpackhi_epi8 (vScale, vNull);
      vScale = _mm_adds_epi16 (vScale, vTemp);
      vTemp = _mm_srli_si128 (vScale, 8);
      vScale = _mm_adds_epi16 (vScale, vTemp);
      vTemp = _mm_srli_si128 (vScale, 4);
      vScale = _mm_adds_epi16 (vScale, vTemp);
      vTemp = _mm_srli_si128 (vScale, 2);
      vScale = _mm_adds_epi16 (vScale, vTemp);
      scale = (int) _mm_extract_epi16 (vScale, 0);
    }

    /* scale the F value for the next round */
    vFPrev = _mm_slli_si128 (vF, 1);
    vFPrev = _mm_subs_epu8 (vFPrev, vScaleAmt);

    /* load and scale H for the next round */
    vH = _mm_load_si128 (pvH + iter - 1);
    vH = _mm_slli_si128 (vH, 1);
    vH = _mm_subs_epu8 (vH, vScaleAmt);
    vH = _mm_or_si128 (vH, vHInit);
  }

  /* calculate the max global score */
  vH = _mm_load_si128 (pvH + offset);
  vH = _mm_max_epu8 (vH, vF);
  for (j = 0; j < position; ++j) {
    vH = _mm_slli_si128 (vH, 1);
  }
  score = (int) (unsigned short) _mm_extract_epi16 (vH, 7);
  score >>= 8;

  /* return largest score */
  distance = (queryLength + dbLength) * gapExtend;
  score = score - (gapOpen * 2) - distance + scale;

  return score;
}