Example No. 1
// Denoise a 16x1 vector with a weaker filter.
static INLINE __m128i vp9_denoiser_adj_16x1_sse2(
    const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
    const __m128i k_0, const __m128i k_delta, __m128i acc_diff) {
  __m128i v_running_avg_y = _mm_loadu_si128((__m128i *)(&running_avg_y[0]));
  // Calculate differences.
  const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
  const __m128i v_mc_running_avg_y =
      _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
  const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
  const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
  // Obtain the sign. FF if diff is negative.
  const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
  // Clamp absolute difference to delta to get the adjustment.
  const __m128i adj = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
  // Restore the sign and get positive and negative adjustments.
  __m128i padj, nadj;
  padj = _mm_andnot_si128(diff_sign, adj);
  nadj = _mm_and_si128(diff_sign, adj);
  // Calculate filtered value.
  v_running_avg_y = _mm_subs_epu8(v_running_avg_y, padj);
  v_running_avg_y = _mm_adds_epu8(v_running_avg_y, nadj);
  _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);

  // Accumulate the adjustments.
  acc_diff = _mm_subs_epi8(acc_diff, padj);
  acc_diff = _mm_adds_epi8(acc_diff, nadj);
  return acc_diff;
}
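The returned accumulator packs sixteen per-pixel deltas as signed bytes. A minimal sketch (not part of the original source; sum_acc_diff is a hypothetical helper) of how a caller might reduce it to a scalar sum, by sign-extending to 16-bit lanes and adding horizontally:

#include <emmintrin.h>
#include <stdint.h>

static int sum_acc_diff(__m128i acc_diff) {
  const __m128i zero = _mm_setzero_si128();
  // FF in each lane whose signed byte is negative: a sign mask.
  const __m128i sign = _mm_cmpgt_epi8(zero, acc_diff);
  // Interleaving with the sign mask sign-extends bytes to 16-bit lanes.
  const __m128i lo = _mm_unpacklo_epi8(acc_diff, sign);
  const __m128i hi = _mm_unpackhi_epi8(acc_diff, sign);
  __m128i sum = _mm_add_epi16(lo, hi);
  // Horizontal add of the eight 16-bit lanes.
  sum = _mm_add_epi16(sum, _mm_srli_si128(sum, 8));
  sum = _mm_add_epi16(sum, _mm_srli_si128(sum, 4));
  sum = _mm_add_epi16(sum, _mm_srli_si128(sum, 2));
  return (int16_t)_mm_cvtsi128_si32(sum);  // total fits in 16 bits (16 * +/-128)
}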
Example No. 2
void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest,
                                        int stride) {
  uint8_t abs_diff;
  __m128i d;

  // Prediction data.
  __m128i p0 = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride));
  __m128i p1 = _mm_loadl_epi64((const __m128i *)(dest + 1 * stride));
  __m128i p2 = _mm_loadl_epi64((const __m128i *)(dest + 2 * stride));
  __m128i p3 = _mm_loadl_epi64((const __m128i *)(dest + 3 * stride));
  __m128i p4 = _mm_loadl_epi64((const __m128i *)(dest + 4 * stride));
  __m128i p5 = _mm_loadl_epi64((const __m128i *)(dest + 5 * stride));
  __m128i p6 = _mm_loadl_epi64((const __m128i *)(dest + 6 * stride));
  __m128i p7 = _mm_loadl_epi64((const __m128i *)(dest + 7 * stride));

  p0 = _mm_unpacklo_epi64(p0, p1);
  p2 = _mm_unpacklo_epi64(p2, p3);
  p4 = _mm_unpacklo_epi64(p4, p5);
  p6 = _mm_unpacklo_epi64(p6, p7);

  // Clip diff value to [0, 255] range. Then, do addition or subtraction
  // according to its sign.
  if (diff >= 0) {
    abs_diff = (diff > 255) ? 255 : diff;
    d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);

    p0 = _mm_adds_epu8(p0, d);
    p2 = _mm_adds_epu8(p2, d);
    p4 = _mm_adds_epu8(p4, d);
    p6 = _mm_adds_epu8(p6, d);
  } else {
    abs_diff = (diff < -255) ? 255 : -diff;
    d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);

    p0 = _mm_subs_epu8(p0, d);
    p2 = _mm_subs_epu8(p2, d);
    p4 = _mm_subs_epu8(p4, d);
    p6 = _mm_subs_epu8(p6, d);
  }

  _mm_storel_epi64((__m128i *)(dest + 0 * stride), p0);
  p0 = _mm_srli_si128(p0, 8);
  _mm_storel_epi64((__m128i *)(dest + 1 * stride), p0);

  _mm_storel_epi64((__m128i *)(dest + 2 * stride), p2);
  p2 = _mm_srli_si128(p2, 8);
  _mm_storel_epi64((__m128i *)(dest + 3 * stride), p2);

  _mm_storel_epi64((__m128i *)(dest + 4 * stride), p4);
  p4 = _mm_srli_si128(p4, 8);
  _mm_storel_epi64((__m128i *)(dest + 5 * stride), p4);

  _mm_storel_epi64((__m128i *)(dest + 6 * stride), p6);
  p6 = _mm_srli_si128(p6, 8);
  _mm_storel_epi64((__m128i *)(dest + 7 * stride), p6);
}
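The multiply-and-shuffle idiom above splats one byte into every lane: abs_diff * 0x01010101u replicates the byte across a 32-bit word, and _mm_shuffle_epi32(..., 0) copies that word to all four lanes. A sketch of the simpler SSE2 equivalent (broadcast_byte is illustrative only):

#include <emmintrin.h>
#include <stdint.h>

// Broadcast a single byte to all 16 lanes; _mm_set1_epi8 emits a
// comparable splat sequence.
static __m128i broadcast_byte(uint8_t value) {
  return _mm_set1_epi8((char)value);
}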
Example No. 3
void vp9_add_constant_residual_32x32_sse2(const int16_t diff, uint8_t *dest,
                                          int stride) {
  uint8_t abs_diff;
  __m128i d;
  int i = 8;

  if (diff >= 0) {
    abs_diff = (diff > 255) ? 255 : diff;
    d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);
  } else {
    abs_diff = (diff < -255) ? 255 : -diff;
    d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);
  }

  do {
    // Prediction data.
    __m128i p0 = _mm_load_si128((const __m128i *)(dest + 0 * stride));
    __m128i p1 = _mm_load_si128((const __m128i *)(dest + 0 * stride + 16));
    __m128i p2 = _mm_load_si128((const __m128i *)(dest + 1 * stride));
    __m128i p3 = _mm_load_si128((const __m128i *)(dest + 1 * stride + 16));
    __m128i p4 = _mm_load_si128((const __m128i *)(dest + 2 * stride));
    __m128i p5 = _mm_load_si128((const __m128i *)(dest + 2 * stride + 16));
    __m128i p6 = _mm_load_si128((const __m128i *)(dest + 3 * stride));
    __m128i p7 = _mm_load_si128((const __m128i *)(dest + 3 * stride + 16));

    // Clip diff value to [0, 255] range. Then, do addition or subtraction
    // according to its sign.
    if (diff >= 0) {
      p0 = _mm_adds_epu8(p0, d);
      p1 = _mm_adds_epu8(p1, d);
      p2 = _mm_adds_epu8(p2, d);
      p3 = _mm_adds_epu8(p3, d);
      p4 = _mm_adds_epu8(p4, d);
      p5 = _mm_adds_epu8(p5, d);
      p6 = _mm_adds_epu8(p6, d);
      p7 = _mm_adds_epu8(p7, d);
    } else {
      p0 = _mm_subs_epu8(p0, d);
      p1 = _mm_subs_epu8(p1, d);
      p2 = _mm_subs_epu8(p2, d);
      p3 = _mm_subs_epu8(p3, d);
      p4 = _mm_subs_epu8(p4, d);
      p5 = _mm_subs_epu8(p5, d);
      p6 = _mm_subs_epu8(p6, d);
      p7 = _mm_subs_epu8(p7, d);
    }

    // Store results
    _mm_store_si128((__m128i *)(dest + 0 * stride), p0);
    _mm_store_si128((__m128i *)(dest + 0 * stride + 16), p1);
    _mm_store_si128((__m128i *)(dest + 1 * stride), p2);
    _mm_store_si128((__m128i *)(dest + 1 * stride + 16), p3);
    _mm_store_si128((__m128i *)(dest + 2 * stride), p4);
    _mm_store_si128((__m128i *)(dest + 2 * stride + 16), p5);
    _mm_store_si128((__m128i *)(dest + 3 * stride), p6);
    _mm_store_si128((__m128i *)(dest + 3 * stride + 16), p7);

    dest += 4 * stride;
  } while (--i);
}
Example No. 4
// Denoise a 16x1 vector.
static INLINE __m128i vp9_denoiser_16x1_sse2(
    const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
    const __m128i *k_0, const __m128i *k_4, const __m128i *k_8,
    const __m128i *k_16, const __m128i *l3, const __m128i *l32,
    const __m128i *l21, __m128i acc_diff) {
  // Calculate differences
  const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
  const __m128i v_mc_running_avg_y =
      _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
  __m128i v_running_avg_y;
  const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
  const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
  // Obtain the sign. FF if diff is negative.
  const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, *k_0);
  // Clamp absolute difference to 16 to be used to get mask. Doing this
  // allows us to use _mm_cmpgt_epi8, which operates on signed byte.
  const __m128i clamped_absdiff =
      _mm_min_epu8(_mm_or_si128(pdiff, ndiff), *k_16);
  // Get masks for l2 l1 and l0 adjustments.
  const __m128i mask2 = _mm_cmpgt_epi8(*k_16, clamped_absdiff);
  const __m128i mask1 = _mm_cmpgt_epi8(*k_8, clamped_absdiff);
  const __m128i mask0 = _mm_cmpgt_epi8(*k_4, clamped_absdiff);
  // Get adjustments for l2, l1, and l0.
  __m128i adj2 = _mm_and_si128(mask2, *l32);
  const __m128i adj1 = _mm_and_si128(mask1, *l21);
  const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
  __m128i adj, padj, nadj;

  // Combine the adjustments and get absolute adjustments.
  adj2 = _mm_add_epi8(adj2, adj1);
  adj = _mm_sub_epi8(*l3, adj2);
  adj = _mm_andnot_si128(mask0, adj);
  adj = _mm_or_si128(adj, adj0);

  // Restore the sign and get positive and negative adjustments.
  padj = _mm_andnot_si128(diff_sign, adj);
  nadj = _mm_and_si128(diff_sign, adj);

  // Calculate filtered value.
  v_running_avg_y = _mm_adds_epu8(v_sig, padj);
  v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj);
  _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);

  // Adjustments <=7, and each element in acc_diff can fit in signed
  // char.
  acc_diff = _mm_adds_epi8(acc_diff, padj);
  acc_diff = _mm_subs_epi8(acc_diff, nadj);
  return acc_diff;
}
Example No. 5
static WEBP_INLINE __m128i SubtractAndAccumulate(const __m128i a,
        const __m128i b) {
    // take abs(a-b) in 8b
    const __m128i a_b = _mm_subs_epu8(a, b);
    const __m128i b_a = _mm_subs_epu8(b, a);
    const __m128i abs_a_b = _mm_or_si128(a_b, b_a);
    // zero-extend to 16b
    const __m128i C0 = _mm_cvtepu8_epi16(abs_a_b);
    const __m128i C1 = _mm_cvtepu8_epi16(_mm_srli_si128(abs_a_b, 8));
    // multiply with self
    const __m128i D0 = _mm_madd_epi16(C0, C0);
    const __m128i D1 = _mm_madd_epi16(C1, C1);
    // accumulate
    const __m128i sum = _mm_add_epi32(D0, D1);
    return sum;
}
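SubtractAndAccumulate returns four 32-bit partial sums of squared differences (note that _mm_cvtepu8_epi16 requires SSE4.1). A hypothetical caller, sketched here, would accumulate over rows and then collapse the four lanes to one scalar:

#include <emmintrin.h>

// Reduce the four 32-bit lanes of an accumulator to a single int (sketch).
static int HorizontalAdd32(__m128i sum) {
  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));  // lanes 2,3 into 0,1
  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));  // lane 1 into 0
  return _mm_cvtsi128_si32(sum);
}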
Example No. 6
// input/output is uint8_t
static WEBP_INLINE void GetNotHEV(const __m128i* const p1,
                                  const __m128i* const p0,
                                  const __m128i* const q0,
                                  const __m128i* const q1,
                                  int hev_thresh, __m128i* const not_hev) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i t_1 = MM_ABS(*p1, *p0);
    const __m128i t_2 = MM_ABS(*q1, *q0);

    const __m128i h = _mm_set1_epi8(hev_thresh);
    const __m128i t_3 = _mm_subs_epu8(t_1, h);  // abs(p1 - p0) - hev_thresh
    const __m128i t_4 = _mm_subs_epu8(t_2, h);  // abs(q1 - q0) - hev_thresh

    *not_hev = _mm_or_si128(t_3, t_4);
    *not_hev = _mm_cmpeq_epi8(*not_hev, zero);  // not_hev <= t1 && not_hev <= t2
}
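GetNotHEV leans on a standard SSE2 idiom: there is no unsigned byte comparison instruction, but a <= b can be tested because a saturating subtract yields zero exactly when a <= b. The idiom in isolation (a sketch):

#include <emmintrin.h>

// Per-lane unsigned test a <= b: subs_epu8(a, b) is zero iff a <= b.
static __m128i LessEqual_u8(__m128i a, __m128i b) {
  return _mm_cmpeq_epi8(_mm_subs_epu8(a, b), _mm_setzero_si128());
}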
Example No. 7
/*!
  Compute the image subtraction: \f$ Ires = I1 - I2 \f$.

  \param I1 : The first image.
  \param I2 : The second image.
  \param Ires : \f$ Ires = I1 - I2 \f$
  \param saturate : If true, saturate the result to [0 ; 255] using vpMath::saturate, otherwise underflow may occur.
*/
void
vpImageTools::imageSubtract(const vpImage<unsigned char> &I1,
                            const vpImage<unsigned char> &I2,
                            vpImage<unsigned char> &Ires,
                            const bool saturate)
{
  if ((I1.getHeight() != I2.getHeight()) || (I1.getWidth() != I2.getWidth())) {
    throw (vpException(vpException::dimensionError, "The two images do not have the same size"));
  }

  if ((I1.getHeight() != Ires.getHeight()) || (I1.getWidth() != Ires.getWidth())) {
    Ires.resize(I1.getHeight(), I1.getWidth());
  }

  unsigned char *ptr_I1   = I1.bitmap;
  unsigned char *ptr_I2   = I2.bitmap;
  unsigned char *ptr_Ires = Ires.bitmap;
  unsigned int cpt = 0;

#if VISP_HAVE_SSE2
  if (Ires.getSize() >= 16) {
    for (; cpt <= Ires.getSize() - 16 ; cpt += 16, ptr_I1 += 16, ptr_I2 += 16, ptr_Ires += 16) {
      const __m128i v1   = _mm_loadu_si128( (const __m128i*) ptr_I1);
      const __m128i v2   = _mm_loadu_si128( (const __m128i*) ptr_I2);
      const __m128i vres = saturate ? _mm_subs_epu8(v1, v2) : _mm_sub_epi8(v1, v2);

      _mm_storeu_si128( (__m128i*) ptr_Ires, vres );
    }
  }
#endif

  for (; cpt < Ires.getSize(); cpt++, ++ptr_I1, ++ptr_I2, ++ptr_Ires) {
    *ptr_Ires = saturate ? vpMath::saturate<unsigned char>( (short int) *ptr_I1 - (short int) *ptr_I2 ) : *ptr_I1 - *ptr_I2;
  }
}
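A hypothetical usage sketch (the include paths are assumed; adjust for the installed ViSP version):

#include <visp/vpImage.h>
#include <visp/vpImageTools.h>

// Both inputs must have the same size; Idiff is resized as needed.
void example_subtract(const vpImage<unsigned char> &I1,
                      const vpImage<unsigned char> &I2) {
  vpImage<unsigned char> Idiff;
  vpImageTools::imageSubtract(I1, I2, Idiff, true);  // saturate to [0, 255]
}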
Example No. 8
static int SSE4x4SSE2(const uint8_t* a, const uint8_t* b) {
    const __m128i zero = _mm_setzero_si128();

    // Load values. Note that we read 8 pixels instead of 4,
    // but the a/b buffers are over-allocated to that effect.
    const __m128i a0 = _mm_loadl_epi64((__m128i*)&a[BPS * 0]);
    const __m128i a1 = _mm_loadl_epi64((__m128i*)&a[BPS * 1]);
    const __m128i a2 = _mm_loadl_epi64((__m128i*)&a[BPS * 2]);
    const __m128i a3 = _mm_loadl_epi64((__m128i*)&a[BPS * 3]);
    const __m128i b0 = _mm_loadl_epi64((__m128i*)&b[BPS * 0]);
    const __m128i b1 = _mm_loadl_epi64((__m128i*)&b[BPS * 1]);
    const __m128i b2 = _mm_loadl_epi64((__m128i*)&b[BPS * 2]);
    const __m128i b3 = _mm_loadl_epi64((__m128i*)&b[BPS * 3]);

    // Combine pair of lines and convert to 16b.
    const __m128i a01 = _mm_unpacklo_epi32(a0, a1);
    const __m128i a23 = _mm_unpacklo_epi32(a2, a3);
    const __m128i b01 = _mm_unpacklo_epi32(b0, b1);
    const __m128i b23 = _mm_unpacklo_epi32(b2, b3);
    const __m128i a01s = _mm_unpacklo_epi8(a01, zero);
    const __m128i a23s = _mm_unpacklo_epi8(a23, zero);
    const __m128i b01s = _mm_unpacklo_epi8(b01, zero);
    const __m128i b23s = _mm_unpacklo_epi8(b23, zero);

    // Compute differences; (a-b)^2 = (abs(a-b))^2 = (sat8(a-b) + sat8(b-a))^2
    // TODO(cduvivier): Disassemble and figure out why this is fastest. We don't
    //                  need absolute values, there is no need to do calculation
    //                  in 8bit as we are already in 16bit, ... Yet this is what
    //                  benchmarks the fastest!
    const __m128i d0 = _mm_subs_epu8(a01s, b01s);
    const __m128i d1 = _mm_subs_epu8(b01s, a01s);
    const __m128i d2 = _mm_subs_epu8(a23s, b23s);
    const __m128i d3 = _mm_subs_epu8(b23s, a23s);

    // Square and add them all together.
    const __m128i madd0 = _mm_madd_epi16(d0, d0);
    const __m128i madd1 = _mm_madd_epi16(d1, d1);
    const __m128i madd2 = _mm_madd_epi16(d2, d2);
    const __m128i madd3 = _mm_madd_epi16(d3, d3);
    const __m128i sum0 = _mm_add_epi32(madd0, madd1);
    const __m128i sum1 = _mm_add_epi32(madd2, madd3);
    const __m128i sum2 = _mm_add_epi32(sum0, sum1);

    int32_t tmp[4];
    _mm_storeu_si128((__m128i*)tmp, sum2);
    return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
}
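The identity in the comment holds because at most one of the two saturated differences is nonzero in any lane. A scalar sanity check (SquaredDiff is illustrative only):

#include <stdint.h>

// For unsigned bytes, sat(a-b) and sat(b-a) cannot both be nonzero,
// so sat(a-b)^2 + sat(b-a)^2 == (a-b)^2.
static int SquaredDiff(uint8_t a, uint8_t b) {
  const int p = (a > b) ? (a - b) : 0;  // sat8(a - b)
  const int n = (b > a) ? (b - a) : 0;  // sat8(b - a)
  return p * p + n * n;                 // equals (a - b) * (a - b)
}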
Example No. 9
__m128i test_mm_subs_epu8(__m128i A, __m128i B) {
  // DAG-LABEL: test_mm_subs_epu8
  // DAG: call <16 x i8> @llvm.x86.sse2.psubus.b
  //
  // ASM-LABEL: test_mm_subs_epu8
  // ASM: psubusb
  return _mm_subs_epu8(A, B);
}
Example No. 10
__m64 _m_psubusb(__m64 _MM1, __m64 _MM2)
{
    __m128i lhs = {0}, rhs = {0};
    lhs.m128i_i64[0] = _MM1.m64_i64;

    rhs.m128i_i64[0] = _MM2.m64_i64;

    lhs = _mm_subs_epu8(lhs, rhs);

    _MM1.m64_i64 = lhs.m128i_i64[0];
    return _MM1;
}
Example No. 11
static WEBP_INLINE void ComplexMask(const __m128i* const p1,
                                    const __m128i* const p0,
                                    const __m128i* const q0,
                                    const __m128i* const q1,
                                    int thresh, int ithresh,
                                    __m128i* const mask) {
    const __m128i it = _mm_set1_epi8(ithresh);
    const __m128i diff = _mm_subs_epu8(*mask, it);
    const __m128i thresh_mask = _mm_cmpeq_epi8(diff, _mm_setzero_si128());
    __m128i filter_mask;
    NeedsFilter(p1, p0, q0, q1, thresh, &filter_mask);
    *mask = _mm_and_si128(thresh_mask, filter_mask);
}
Example No. 12
static WEBP_INLINE uint32_t Select_SSE2(uint32_t a, uint32_t b, uint32_t c) {
  int pa_minus_pb;
  const __m128i zero = _mm_setzero_si128();
  const __m128i A0 = _mm_cvtsi32_si128(a);
  const __m128i B0 = _mm_cvtsi32_si128(b);
  const __m128i C0 = _mm_cvtsi32_si128(c);
  const __m128i AC0 = _mm_subs_epu8(A0, C0);
  const __m128i CA0 = _mm_subs_epu8(C0, A0);
  const __m128i BC0 = _mm_subs_epu8(B0, C0);
  const __m128i CB0 = _mm_subs_epu8(C0, B0);
  const __m128i AC = _mm_or_si128(AC0, CA0);
  const __m128i BC = _mm_or_si128(BC0, CB0);
  const __m128i pa = _mm_unpacklo_epi8(AC, zero);  // |a - c|
  const __m128i pb = _mm_unpacklo_epi8(BC, zero);  // |b - c|
  const __m128i diff = _mm_sub_epi16(pb, pa);
  {
    int16_t out[8];
    _mm_storeu_si128((__m128i*)out, diff);
    pa_minus_pb = out[0] + out[1] + out[2] + out[3];
  }
  return (pa_minus_pb <= 0) ? a : b;
}
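A scalar mirror of the vector logic above (Select_C is not part of the original source): per byte channel, sum |b - c| - |a - c| and pick a when the total is non-positive.

#include <stdint.h>
#include <stdlib.h>

static uint32_t Select_C(uint32_t a, uint32_t b, uint32_t c) {
  int diff_sum = 0;  // sum over the four byte channels of |b-c| - |a-c|
  for (int i = 0; i < 32; i += 8) {
    const int va = (int)((a >> i) & 0xff);
    const int vb = (int)((b >> i) & 0xff);
    const int vc = (int)((c >> i) & 0xff);
    diff_sum += abs(vb - vc) - abs(va - vc);
  }
  return (diff_sum <= 0) ? a : b;
}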
Example No. 13
static void NeedsFilter(const __m128i* p1, const __m128i* p0, const __m128i* q0,
                        const __m128i* q1, int thresh, __m128i *mask) {
    __m128i t1 = MM_ABS(*p1, *q1);        // abs(p1 - q1)
    *mask = _mm_set1_epi8(0xFE);
    t1 = _mm_and_si128(t1, *mask);        // set lsb of each byte to zero
    t1 = _mm_srli_epi16(t1, 1);           // abs(p1 - q1) / 2

    *mask = MM_ABS(*p0, *q0);             // abs(p0 - q0)
    *mask = _mm_adds_epu8(*mask, *mask);  // abs(p0 - q0) * 2
    *mask = _mm_adds_epu8(*mask, t1);     // abs(p0 - q0) * 2 + abs(p1 - q1) / 2

    t1 = _mm_set1_epi8(thresh);
    *mask = _mm_subs_epu8(*mask, t1);     // mask <= thresh
    *mask = _mm_cmpeq_epi8(*mask, _mm_setzero_si128());
}
Example No. 14
// input pixels are uint8_t
static WEBP_INLINE void NeedsFilter(const __m128i* const p1,
                                    const __m128i* const p0,
                                    const __m128i* const q0,
                                    const __m128i* const q1,
                                    int thresh, __m128i* const mask) {
    const __m128i m_thresh = _mm_set1_epi8(thresh);
    const __m128i t1 = MM_ABS(*p1, *q1);        // abs(p1 - q1)
    const __m128i kFE = _mm_set1_epi8(0xFE);
    const __m128i t2 = _mm_and_si128(t1, kFE);  // set lsb of each byte to zero
    const __m128i t3 = _mm_srli_epi16(t2, 1);   // abs(p1 - q1) / 2

    const __m128i t4 = MM_ABS(*p0, *q0);        // abs(p0 - q0)
    const __m128i t5 = _mm_adds_epu8(t4, t4);   // abs(p0 - q0) * 2
    const __m128i t6 = _mm_adds_epu8(t5, t3);   // abs(p0-q0)*2 + abs(p1-q1)/2

    const __m128i t7 = _mm_subs_epu8(t6, m_thresh);  // mask <= m_thresh
    *mask = _mm_cmpeq_epi8(t7, _mm_setzero_si128());
}
Example No. 15
static void GF_FUNC_ALIGN VS_CC
proc_8bit_sse2(uint8_t *buff, int bstride, int width, int height, int stride,
               uint8_t *dstp, const uint8_t *srcp, int th, int *enable)
{
    uint8_t *p0 = buff + 16;
    uint8_t *p1 = p0 + bstride;
    uint8_t *p2 = p1 + bstride;
    uint8_t *orig = p0, *end = p2;
    uint8_t threshold = th > 255 ? 255 : (uint8_t)th;

    line_copy8(p0, srcp, width, 1);
    line_copy8(p1, srcp, width, 1);

    __m128i xth = _mm_set1_epi8((int8_t)threshold);

    for (int y = 0; y < height; y++) {
        srcp += stride * (y < height - 1 ? 1 : -1);
        line_copy8(p2, srcp, width, 1);
        uint8_t *coordinates[] = {p0 - 1, p0, p0 + 1,
                                  p1 - 1,     p1 + 1,
                                  p2 - 1, p2, p2 + 1};
        for (int x = 0; x < width; x += 16) {
            __m128i src = _mm_load_si128((__m128i *)(p1 + x));
            __m128i min = src;

            for (int i = 0; i < 8; i++) {
                if (enable[i]) {
                    __m128i target = _mm_loadu_si128((__m128i *)(coordinates[i] + x));
                    min = _mm_min_epu8(target, min);
                }
            }

            __m128i limit = _mm_subs_epu8(src, xth);
            min = _mm_max_epu8(min, limit);
            _mm_store_si128((__m128i *)(dstp + x), min);
        }
        dstp += stride;
        p0 = p1;
        p1 = p2;
        p2 = (p2 == end) ? orig : p2 + bstride;
    }
}
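Per pixel, the tail of the inner loop above floors the neighborhood minimum at src - th (saturating), so one erode pass can darken a pixel by at most th. A scalar mirror (LimitedMin is illustrative only):

#include <stdint.h>

static uint8_t LimitedMin(uint8_t center, uint8_t neighborhood_min, uint8_t th) {
  const uint8_t floor_val = (center > th) ? (uint8_t)(center - th) : 0;  // _mm_subs_epu8
  return (neighborhood_min > floor_val) ? neighborhood_min : floor_val;  // _mm_max_epu8
}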
Example No. 16
void
IntensityDescriptorExtractor::normalizeDescriptor(uint8_t* desc) const
{
  assert(FOVIS_IS_ALIGNED16(_descriptor_brightness_offset));

  // get mean of patch
  uint32_t desc_mean = std::accumulate(desc, desc + _descriptor_len, 0)/_descriptor_len;
  // subtract mean, adding offset so 0 -> 128
  if(desc_mean < 128) {
    std::fill(_descriptor_brightness_offset, _descriptor_brightness_offset+16, 128-desc_mean);
    for(int op=0; op<_brightess_offset_num_sse_ops; op++) {
      ((__m128i*)desc)[op] = _mm_adds_epu8(((__m128i*)desc)[op],
        *(__m128i*)_descriptor_brightness_offset);
    }
  } else if (desc_mean > 128){
    std::fill(_descriptor_brightness_offset, _descriptor_brightness_offset+16, desc_mean-128);
    for(int op=0; op<_brightess_offset_num_sse_ops; op++) {
      ((__m128i*)desc)[op] = _mm_subs_epu8(((__m128i*)desc)[op],
        *(__m128i*)_descriptor_brightness_offset);
    }
  }
}
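The effect of the two SSE branches above, written out per byte (shift_desc is a sketch, not the library's API): shift every descriptor byte by the same saturating offset so the patch mean moves toward 128.

#include <stdint.h>

static void shift_desc(uint8_t *desc, int len, int mean) {
  const int offset = 128 - mean;  // positive brightens, negative darkens
  for (int i = 0; i < len; i++) {
    const int v = desc[i] + offset;
    desc[i] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));  // adds/subs_epu8
  }
}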
Example No. 17
void imageFilterSubFrom_SSE2(unsigned char *dst, unsigned char *src, int length)
{
    int n = length;

    // Compute first few values so we're on a 16-byte boundary in dst
    while( (((long)dst & 0xF) > 0) && (n > 0) ) {
        SUBFROM_PIXEL();
        --n; ++dst; ++src;
    }

    // Do bulk of processing using SSE2 (sub 16 8-bit unsigned integers, with saturation)
    while(n >= 16) {
        __m128i s = _mm_loadu_si128((__m128i*)src);
        __m128i d = _mm_load_si128((__m128i*)dst);
        __m128i r = _mm_subs_epu8(d, s);
        _mm_store_si128((__m128i*)dst, r);

        n -= 16; src += 16; dst += 16;
    }

    // If any bytes are left over, deal with them individually
    ++n;
    BASIC_SUBFROM();
}
Example No. 18
 SIMD_INLINE __m128i FeatureDifference(__m128i value, __m128i lo, __m128i hi)
 {
     return _mm_max_epu8(_mm_subs_epu8(value, hi), _mm_subs_epu8(lo, value));
 }
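A scalar equivalent of FeatureDifference (a sketch): the per-byte distance of value outside the band [lo, hi], zero when the value lies inside it.

#include <stdint.h>

static uint8_t FeatureDifferenceScalar(uint8_t value, uint8_t lo, uint8_t hi) {
  const int above = (value > hi) ? (value - hi) : 0;  // _mm_subs_epu8(value, hi)
  const int below = (lo > value) ? (lo - value) : 0;  // _mm_subs_epu8(lo, value)
  return (uint8_t)((above > below) ? above : below);  // _mm_max_epu8
}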
Example No. 19
static FORCE_INLINE __m128i mm_subs_epu(const __m128i &a, const __m128i &b) {
    if (sizeof(PixelType) == 1)
        return _mm_subs_epu8(a, b);
    else
        return _mm_subs_epu16(a, b);
}
Example No. 20
EXPORT double swps3_alignmentByteSSE( ProfileByte * query, const char * db, int dbLen, Options * options )
{

	/**********************************************************************
	* This version of the code implements the idea presented in
	*
	***********************************************************************
	* Striped Smith-Waterman speeds database searches six times over other
	* SIMD implementations
	*
	* Michael Farrar, Bioinformatics, 23(2), pp. 156-161, 2007
	**********************************************************************/

	int i, j;
	unsigned char MaxScore = 0;
	int segLength = (query->len+15)/16; /* the segment length */

	__m128i * loadOpt  = query->loadOpt;
	__m128i * storeOpt = query->storeOpt;
	__m128i * rD       = query->rD;
	__m128i * current_profile;
	__m128i * swap;

	__m128i vMinimums = _mm_set1_epi32(0);

	__m128i vDelIncr  = _mm_set1_epi8(-options->gapExt);
	__m128i vDelFixed = _mm_set1_epi8(-options->gapOpen);
	__m128i vBias     = _mm_set1_epi8(query->bias);

	__m128i vMaxScore = vMinimums;	/* vMaxScore = [0,0] */

	__m128i vStoreOpt;				/* the new optimal score */
	__m128i vRD;					/* the new row deletion score */
	__m128i vCD = vMinimums;		/* the column deletion score */
	__m128i zero = vMinimums;		/* constant of all zeros */
	__m128i vTmp;
#ifdef DEBUG
	int ii,jj;
#endif

	/* initialize the other arrays used for the dynProg code */
	/*********************************************************/
	for(i=0; LIKELY(i<segLength); i++){
		_mm_store_si128(loadOpt+i,zero);
		_mm_store_si128(storeOpt+i,zero);
		_mm_store_si128(rD+i,zero);
	}

	/* looping through all the columns */
	/***********************************/

	for(j=0; LIKELY(j<dbLen); j++){


		/* compute the opt and cd score depending on the previous column
		 *******************************************************************
		 * set the column deletion score to zero, has to be fixed later on */
		vCD = zero;

		/* set the opt score to the elements computed in the previous column*/
		/* set the low of storeOpt to MaxS[j]                               */
		vStoreOpt = _mm_load_si128(storeOpt+segLength-1);
		vStoreOpt = _mm_slli_si128(vStoreOpt, 1);

		/* compute the current profile, depending on the character in s2 */
		/*****************************************************************/
		current_profile = query->profile + db[j]*segLength;

		/* swap the old optimal score with the new one */
		/***********************************************/
		swap = storeOpt;
		storeOpt = loadOpt;
		loadOpt = swap;

		/* main loop computing the max, precomputing etc. */
		/**************************************************/
		for(i=0; LIKELY(i<segLength); i++){
			vRD = _mm_load_si128(rD+i);
			vRD = _mm_subs_epu8(vRD, vDelIncr);
			vTmp = _mm_load_si128(loadOpt+i);
			vTmp = _mm_subs_epu8(vTmp,vDelFixed);
			vRD = _mm_max_epu8(vRD,vTmp);
			_mm_store_si128(rD+i, vRD);

			/* add the profile the prev. opt */
			vStoreOpt = _mm_adds_epu8(vStoreOpt, *(current_profile+i));
			vStoreOpt = _mm_subs_epu8(vStoreOpt, vBias);

			/* update the maxscore found so far */
			vMaxScore = _mm_max_epu8(vMaxScore, vStoreOpt);

			/* compute the correct opt score of the cell */
			vStoreOpt = _mm_max_epu8(vStoreOpt, vRD);
			vStoreOpt = _mm_max_epu8(vStoreOpt, vCD);

			/* store the opt score of the cell */
			_mm_store_si128(storeOpt+i, vStoreOpt);

			/* precompute cd for next iteration */
			vStoreOpt = _mm_subs_epu8(vStoreOpt, vDelFixed);
			vCD = _mm_subs_epu8(vCD, vDelIncr);
			vCD = _mm_max_epu8(vCD, vStoreOpt);

			/* load precomputed opt for next iteration */
			vStoreOpt = _mm_load_si128(loadOpt+i);
		}


		for(i=0;LIKELY(i<16);++i){
			int k;
		        /* compute the gap extend penalty for the current cell */
		        vCD = _mm_slli_si128(vCD,1);

			for(k=0;LIKELY(k<segLength);++k) {
			   /* compute the current optimal value of the cell */
			   vStoreOpt = _mm_load_si128(storeOpt+k);
			   vStoreOpt = _mm_max_epu8(vStoreOpt,vCD);
			   _mm_store_si128(storeOpt+k,vStoreOpt);

			   /* precompute the scores for the next cell */
			   vStoreOpt = _mm_subs_epu8(vStoreOpt,vDelFixed);
			   vCD = _mm_subs_epu8(vCD, vDelIncr);

			   if(UNLIKELY(_mm_movemask_epi8(_mm_cmpeq_epi8(_mm_subs_epu8(vCD,vStoreOpt),zero)) == 0xFFFF)) goto shortcut;
			}
		}
                shortcut:

#ifdef DEBUG
		debug("%c\t",db[j]);
		for(ii=0; ii<16;++ii) {
		   for(jj=0; jj<segLength;++jj) {
		      if(ii*segLength+jj < query->len)
			debug("%d\t",(int)((unsigned char*)storeOpt)[ii+jj*16]);
		   }
		}
		debug("\n");
#endif

		/* store the new MaxScore for the next line block */
		/**************************************************/

		/* store the element of storeOpt in MaxS */
		vStoreOpt = _mm_load_si128(storeOpt+segLength-1);
	}

	vMaxScore = _mm_max_epu8(vMaxScore, _mm_srli_si128(vMaxScore, 8));
	vMaxScore = _mm_max_epu8(vMaxScore, _mm_srli_si128(vMaxScore, 4));
	vMaxScore = _mm_max_epu8(vMaxScore, _mm_srli_si128(vMaxScore, 2));
	vMaxScore = _mm_max_epu8(vMaxScore, _mm_srli_si128(vMaxScore, 1));
	MaxScore = (unsigned char)_mm_extract_epi16(vMaxScore,0);
	if ((int)MaxScore + (int)query->bias >=255)
		return DBL_MAX;
	return((double)MaxScore);
}
Example No. 21
 SIMD_INLINE __m128i AdjustEdge(const __m128i & count, const __m128i & value, const __m128i & mask, const __m128i & threshold)
 {
     const __m128i inc = _mm_and_si128(mask, Greater8u(count, threshold));
     const __m128i dec = _mm_and_si128(mask, Lesser8u(count, threshold));
     return _mm_subs_epu8(_mm_adds_epu8(value, inc), dec);
 }
Example No. 22
/* Function:  p7_MSVFilter()
 * Synopsis:  Calculates MSV score, vewy vewy fast, in limited precision.
 * Incept:    SRE, Wed Dec 26 15:12:25 2007 [Janelia]
 *
 * Purpose:   Calculates an approximation of the MSV score for sequence
 *            <dsq> of length <L> residues, using optimized profile <om>,
 *            and a preallocated one-row DP matrix <ox>. Return the 
 *            estimated MSV score (in nats) in <ret_sc>.
 *            
 *            Score may overflow (and will, on high-scoring
 *            sequences), but will not underflow.
 *            
 *            The model may be in any mode, because only its match
 *            emission scores will be used. The MSV filter inherently
 *            assumes a multihit local mode, and uses its own special
 *            state transition scores, not the scores in the profile.
 *
 * Args:      dsq     - digital target sequence, 1..L
 *            L       - length of dsq in residues          
 *            om      - optimized profile
 *            ox      - DP matrix
 *            ret_sc  - RETURN: MSV score (in nats)          
 *                      
 * Note:      We misuse the matrix <ox> here, using only a third of the
 *            first dp row, accessing it as <dp[0..Q-1]> rather than
 *            in triplets via <{MDI}MX(q)> macros, since we only need
 *            to store M state values. We know that if <ox> was big
 *            enough for normal DP calculations, it must be big enough
 *            to hold the MSVFilter calculation.
 *
 * Returns:   <eslOK> on success.
 *            <eslERANGE> if the score overflows the limited range; in
 *            this case, this is a high-scoring hit.
 *
 * Throws:    <eslEINVAL> if <ox> allocation is too small.
 */
int
p7_MSVFilter(const ESL_DSQ *dsq, int L, const P7_OPROFILE *om, P7_OMX *ox, float *ret_sc)
{
  register __m128i mpv;            /* previous row values                                       */
  register __m128i xEv;		   /* E state: keeps max for Mk->E as we go                     */
  register __m128i xBv;		   /* B state: splatted vector of B[i-1] for B->Mk calculations */
  register __m128i sv;		   /* temp storage of 1 curr row value in progress              */
  register __m128i biasv;	   /* emission bias in a vector                                 */
  uint8_t  xJ;                     /* special states' scores                                    */
  int i;			   /* counter over sequence positions 1..L                      */
  int q;			   /* counter over vectors 0..nq-1                              */
  int Q        = p7O_NQB(om->M);   /* segment length: # of vectors                              */
  __m128i *dp  = ox->dpb[0];	   /* we're going to use dp[0][0..q..Q-1], not {MDI}MX(q) macros*/
  __m128i *rsc;			   /* will point at om->rbv[x] for residue x[i]                 */

  __m128i xJv;                     /* vector for states score                                   */
  __m128i tjbmv;                   /* vector for cost of moving from either J or N through B to an M state */
  __m128i tecv;                    /* vector for E->C  cost                                     */
  __m128i basev;                   /* offset for scores                                         */
  __m128i ceilingv;                /* saturated simd value used to test for overflow            */
  __m128i tempv;                   /* work vector                                               */

  int cmp;
  int status = eslOK;

  /* Check that the DP matrix is ok for us. */
  if (Q > ox->allocQ16)  ESL_EXCEPTION(eslEINVAL, "DP matrix allocated too small");
  ox->M   = om->M;

  /* Try highly optimized ssv filter first */
  status = p7_SSVFilter(dsq, L, om, ret_sc);
  if (status != eslENORESULT) return status;

  /* Initialization. In offset unsigned arithmetic, -infinity is 0, and 0 is om->base.
   */
  biasv = _mm_set1_epi8((int8_t) om->bias_b); /* yes, you can set1() an unsigned char vector this way */
  for (q = 0; q < Q; q++) dp[q] = _mm_setzero_si128();
  xJ   = 0;

  /* saturate simd register for overflow test */
  ceilingv = _mm_cmpeq_epi8(biasv, biasv);
  basev = _mm_set1_epi8((int8_t) om->base_b);

  tjbmv = _mm_set1_epi8((int8_t) om->tjb_b + (int8_t) om->tbm_b);
  tecv = _mm_set1_epi8((int8_t) om->tec_b);

  xJv = _mm_subs_epu8(biasv, biasv);
  xBv = _mm_subs_epu8(basev, tjbmv);

#if p7_DEBUGGING
  if (ox->debugging)
  {
      uint8_t xB;
      xB = _mm_extract_epi16(xBv, 0);
      xJ = _mm_extract_epi16(xJv, 0);
      p7_omx_DumpMFRow(ox, 0, 0, 0, xJ, xB, xJ);
  }
#endif


  for (i = 1; i <= L; i++)
  {
      rsc = om->rbv[dsq[i]];
      xEv = _mm_setzero_si128();      

      /* Right shifts by 1 byte. 4,8,12,x becomes x,4,8,12. 
       * Because ia32 is little-endian, this means a left byte shift.
       * Zeros shift on automatically, which is our -infinity.
       */
      mpv = _mm_slli_si128(dp[Q-1], 1);   
      for (q = 0; q < Q; q++)
      {
        /* Calculate new MMXo(i,q); don't store it yet, hold it in sv. */
        sv   = _mm_max_epu8(mpv, xBv);
        sv   = _mm_adds_epu8(sv, biasv);
        sv   = _mm_subs_epu8(sv, *rsc);   rsc++;
        xEv  = _mm_max_epu8(xEv, sv);

        mpv   = dp[q];   	  /* Load {MDI}(i-1,q) into mpv */
        dp[q] = sv;       	  /* Do delayed store of M(i,q) now that memory is usable */
      }

      /* test for the overflow condition */
      tempv = _mm_adds_epu8(xEv, biasv);
      tempv = _mm_cmpeq_epi8(tempv, ceilingv);
      cmp = _mm_movemask_epi8(tempv);

      /* Now the "special" states, which start from Mk->E (->C, ->J->B)
       * Use shuffles instead of shifts so when the last max has completed,
       * the last four elements of the simd register will contain the
       * max value.  Then the last shuffle will broadcast the max value
       * to all simd elements.
       */
      tempv = _mm_shuffle_epi32(xEv, _MM_SHUFFLE(2, 3, 0, 1));
      xEv = _mm_max_epu8(xEv, tempv);
      tempv = _mm_shuffle_epi32(xEv, _MM_SHUFFLE(0, 1, 2, 3));
      xEv = _mm_max_epu8(xEv, tempv);
      tempv = _mm_shufflelo_epi16(xEv, _MM_SHUFFLE(2, 3, 0, 1));
      xEv = _mm_max_epu8(xEv, tempv);
      tempv = _mm_srli_si128(xEv, 1);
      xEv = _mm_max_epu8(xEv, tempv);
      xEv = _mm_shuffle_epi32(xEv, _MM_SHUFFLE(0, 0, 0, 0));

      /* immediately detect overflow */
      if (cmp != 0x0000)
      {
        *ret_sc = eslINFINITY;
        return eslERANGE;
      }

      xEv = _mm_subs_epu8(xEv, tecv);
      xJv = _mm_max_epu8(xJv,xEv);
      
      xBv = _mm_max_epu8(basev, xJv);
      xBv = _mm_subs_epu8(xBv, tjbmv);
	  
#if p7_DEBUGGING
      if (ox->debugging)
      {
        uint8_t xB, xE;
        xB = _mm_extract_epi16(xBv, 0);
        xE = _mm_extract_epi16(xEv, 0);
        xJ = _mm_extract_epi16(xJv, 0);
        p7_omx_DumpMFRow(ox, i, xE, 0, xJ, xB, xJ);
      }
#endif
  } /* end loop over sequence residues 1..L */

  xJ = (uint8_t) _mm_extract_epi16(xJv, 0);

  /* finally C->T, and add our missing precision on the NN,CC,JJ back */
  *ret_sc = ((float) (xJ - om->tjb_b) - (float) om->base_b);
  *ret_sc /= om->scale_b;
  *ret_sc -= 3.0; /* that's ~ L \log \frac{L}{L+3}, for our NN,CC,JJ */

  return eslOK;
}
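The inner update above runs in offset unsigned arithmetic (0 is -infinity, om->base_b represents score 0), so underflow saturates instead of wrapping. One match-state cell written scalar (msv_cell is illustrative; rsc stands for the byte fetched from om->rbv):

#include <stdint.h>

static uint8_t msv_cell(uint8_t mpv, uint8_t xB, uint8_t bias, uint8_t rsc) {
  int sv = (mpv > xB) ? mpv : xB;  // _mm_max_epu8(mpv, xBv)
  sv += bias;
  if (sv > 255) sv = 255;          // _mm_adds_epu8(sv, biasv)
  sv -= rsc;
  if (sv < 0) sv = 0;              // _mm_subs_epu8(sv, *rsc): floor at -infinity
  return (uint8_t)sv;
}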
Example No. 23
/* Function:  p7_SSVFilter_longtarget()
 * Synopsis:  Finds windows with SSV scores above some threshold (vewy vewy fast, in limited precision)
 *
 * Purpose:   Calculates an approximation of the SSV (single ungapped diagonal)
 *            score for regions of sequence <dsq> of length <L> residues, using
 *            optimized profile <om>, and a preallocated one-row DP matrix <ox>,
 *            and captures the positions at which such regions exceed the score
 *            required to be significant in the eyes of the calling function,
 *            which depends on the <bg> and <p> (usually p=0.02 for nhmmer).
 *            Note that this variant performs only SSV computations, never
 *            passing through the J state - the score required to pass SSV at
 *            the default threshold (or less restrictive) is sufficient to
 *            pass MSV in essentially all DNA models we've tested.
 *
 *            Above-threshold diagonals are captured into a preallocated list
 *            <windowlist>. Rather than simply capturing positions at which a
 *            score threshold is reached, this function establishes windows
 *            around those high-scoring positions, using scores in <msvdata>.
 *            These windows can be merged by the calling function.
 *
 *
 * Args:      dsq     - digital target sequence, 1..L
 *            L       - length of dsq in residues
 *            om      - optimized profile
 *            ox      - DP matrix
 *            msvdata    - compact representation of substitution scores, for backtracking diagonals
 *            bg         - the background model, required for translating a P-value threshold into a score threshold
 *            P          - p-value below which a region is captured as being above threshold
 *            windowlist - preallocated container for all hits (resized if necessary)
 *
 *
 * Note:      We misuse the matrix <ox> here, using only a third of the
 *            first dp row, accessing it as <dp[0..Q-1]> rather than
 *            in triplets via <{MDI}MX(q)> macros, since we only need
 *            to store M state values. We know that if <ox> was big
 *            enough for normal DP calculations, it must be big enough
 *            to hold the MSVFilter calculation.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEINVAL> if <ox> allocation is too small.
 */
int
p7_SSVFilter_longtarget(const ESL_DSQ *dsq, int L, P7_OPROFILE *om, P7_OMX *ox, const P7_SCOREDATA *msvdata,
                        P7_BG *bg, double P, P7_HMM_WINDOWLIST *windowlist)
{

  register __m128i mpv;            /* previous row values                                       */
  register __m128i xEv;		   /* E state: keeps max for Mk->E for a single iteration       */
  register __m128i xBv;		   /* B state: splatted vector of B[i-1] for B->Mk calculations */
  register __m128i sv;		   /* temp storage of 1 curr row value in progress              */
  register __m128i biasv;	   /* emission bias in a vector                                 */
  uint8_t  xJ;                     /* special states' scores                                    */
  int i;			   /* counter over sequence positions 1..L                      */
  int q;			   /* counter over vectors 0..nq-1                              */
  int Q        = p7O_NQB(om->M);   /* segment length: # of vectors                              */
  __m128i *dp  = ox->dpb[0];	   /* we're going to use dp[0][0..q..Q-1], not {MDI}MX(q) macros*/
  __m128i *rsc;			   /* will point at om->rbv[x] for residue x[i]                 */
  __m128i tecv;                    /* vector for E->C  cost                                     */
  __m128i tjbmv;                   /* vector for J->B move cost + B->M move costs               */
  __m128i basev;                   /* offset for scores                                         */
  __m128i ceilingv;                /* saturated simd value used to test for overflow           */
  __m128i tempv;                   /* work vector                                               */
  int cmp;
  int k;
  int n;
  int end;
  int rem_sc;
  int start;
  int target_end;
  int target_start;
  int max_end;
  int max_sc;
  int sc;
  int pos_since_max;
  float ret_sc;

  union { __m128i v; uint8_t b[16]; } u;


  /*
   * Computing the score required to let P meet the F1 prob threshold
   * In original code, converting from a scaled int MSV
   * score S (the score getting to state E) to a probability goes like this:
   *  usc =  S - om->tec_b - om->tjb_b - om->base_b;
   *  usc /= om->scale_b;
   *  usc -= 3.0;
   *  P = f ( (usc - nullsc) / eslCONST_LOG2 , mu, lambda)
   * and we're computing the threshold usc, so reverse it:
   *  (usc - nullsc) /  eslCONST_LOG2 = inv_f( P, mu, lambda)
   *  usc = nullsc + eslCONST_LOG2 * inv_f( P, mu, lambda)
   *  usc += 3
   *  usc *= om->scale_b
   *  S = usc + om->tec_b + om->tjb_b + om->base_b
   *
   *  Here, I compute threshold with length model based on max_length.  Doesn't
   *  matter much - in any case, both the bg and om models will change with roughly
   *  1 bit for each doubling of the length model, so they offset.
   */
  float nullsc;
  __m128i sc_threshv;
  uint8_t sc_thresh;
  float invP = esl_gumbel_invsurv(P, om->evparam[p7_MMU],  om->evparam[p7_MLAMBDA]);

  /* Check that the DP matrix is ok for us. */
  if (Q > ox->allocQ16)  ESL_EXCEPTION(eslEINVAL, "DP matrix allocated too small");
  ox->M   = om->M;


  p7_bg_SetLength(bg, om->max_length);
  p7_oprofile_ReconfigMSVLength(om, om->max_length);
  p7_bg_NullOne  (bg, dsq, om->max_length, &nullsc);

  sc_thresh = (int) ceil( ( ( nullsc  + (invP * eslCONST_LOG2) + 3.0 )  * om->scale_b ) + om->base_b +  om->tec_b  + om->tjb_b );
  sc_threshv = _mm_set1_epi8((int8_t) 255 - sc_thresh);

  /* Initialization. In offset unsigned  arithmetic, -infinity is 0, and 0 is om->base.
   */
  biasv = _mm_set1_epi8((int8_t) om->bias_b); /* yes, you can set1() an unsigned char vector this way */
  ceilingv = _mm_cmpeq_epi8(biasv, biasv);
  for (q = 0; q < Q; q++) dp[q] = _mm_setzero_si128();
  xJ   = 0;

  basev = _mm_set1_epi8((int8_t) om->base_b);
  tecv = _mm_set1_epi8((int8_t) om->tec_b);
  tjbmv = _mm_set1_epi8((int8_t) om->tjb_b + (int8_t) om->tbm_b);

  xBv = _mm_subs_epu8(basev, tjbmv);

  for (i = 1; i <= L; i++) {
    rsc = om->rbv[dsq[i]];
    xEv = _mm_setzero_si128();

	  /* Right shifts by 1 byte. 4,8,12,x becomes x,4,8,12.
	   * Because ia32 is little-endian, this means a left byte shift.
	   * Zeros shift on automatically, which is our -infinity.
	   */
	  mpv = _mm_slli_si128(dp[Q-1], 1);
	  for (q = 0; q < Q; q++) {
		  /* Calculate new MMXo(i,q); don't store it yet, hold it in sv. */
		  sv   = _mm_max_epu8(mpv, xBv);
		  sv   = _mm_adds_epu8(sv, biasv);
		  sv   = _mm_subs_epu8(sv, *rsc);   rsc++;
		  xEv  = _mm_max_epu8(xEv, sv);

		  mpv   = dp[q];   	  /* Load {MDI}(i-1,q) into mpv */
		  dp[q] = sv;       	  /* Do delayed store of M(i,q) now that memory is usable */
	  }

	  /* test if the pthresh significance threshold has been reached;
	   * note: don't use _mm_cmpgt_epi8, because it's a signed comparison, which won't work on uint8s */
	  tempv = _mm_adds_epu8(xEv, sc_threshv);
	  tempv = _mm_cmpeq_epi8(tempv, ceilingv);
	  cmp = _mm_movemask_epi8(tempv);

	  if (cmp != 0) {  //hit pthresh, so add position to list and reset values

	    //figure out which model state hit threshold
	    end = -1;
	    rem_sc = -1;
	    for (q = 0; q < Q; q++) {  /// Unpack and unstripe, so we can find the state that exceeded pthresh
          u.v = dp[q];
          for (k = 0; k < 16; k++) { // unstripe
            //(q+Q*k+1) is the model position k at which the xE score is found
            if (u.b[k] >= sc_thresh && u.b[k] > rem_sc && (q+Q*k+1) <= om->M) {
              end = (q+Q*k+1);
              rem_sc = u.b[k];
            }
          }
          dp[q] = _mm_set1_epi8(0); // while we're here ... this will cause values to get reset to xB in next dp iteration
	    }

	    //recover the diagonal that hit threshold
	    start = end;
	    target_end = target_start = i;
	    sc = rem_sc;
	    while (rem_sc > om->base_b - om->tjb_b - om->tbm_b) {
	      rem_sc -= om->bias_b -  msvdata->msv_scores[start*om->abc->Kp + dsq[target_start]];
	      --start;
	      --target_start;
	    }
	    start++;
	    target_start++;


	    //extend diagonal further with single diagonal extension
	    k = end+1;
	    n = target_end+1;
	    max_end = target_end;
	    max_sc = sc;
	    pos_since_max = 0;
	    while (k<om->M && n<=L) {
	      sc += om->bias_b -  msvdata->msv_scores[k*om->abc->Kp + dsq[n]];

	      if (sc >= max_sc) {
	        max_sc = sc;
	        max_end = n;
	        pos_since_max=0;
	      } else {
	        pos_since_max++;
	        if (pos_since_max == 5)
	          break;
	      }
	      k++;
	      n++;
	    }

	    end  +=  (max_end - target_end);
	    k    +=  (max_end - target_end);
      target_end = max_end;

      ret_sc = ((float) (max_sc - om->tjb_b) - (float) om->base_b);
      ret_sc /= om->scale_b;
      ret_sc -= 3.0; // that's ~ L \log \frac{L}{L+3}, for our NN,CC,JJ

      p7_hmmwindow_new(windowlist, 0, target_start, k, end, end-start+1 , ret_sc, p7_NOCOMPLEMENT );

      i = target_end; // skip forward
	  }


  } /* end loop over sequence residues 1..L */

  return eslOK;

}
Example No. 24
static int SSE_Nx4SSE2(const uint8_t* a, const uint8_t* b,
                       int num_quads, int do_16) {
    const __m128i zero = _mm_setzero_si128();
    __m128i sum1 = zero;
    __m128i sum2 = zero;

    while (num_quads-- > 0) {
        // Note: for the !do_16 case, we read 16 pixels instead of 8 but that's ok,
        // thanks to buffer over-allocation to that effect.
        const __m128i a0 = _mm_loadu_si128((__m128i*)&a[BPS * 0]);
        const __m128i a1 = _mm_loadu_si128((__m128i*)&a[BPS * 1]);
        const __m128i a2 = _mm_loadu_si128((__m128i*)&a[BPS * 2]);
        const __m128i a3 = _mm_loadu_si128((__m128i*)&a[BPS * 3]);
        const __m128i b0 = _mm_loadu_si128((__m128i*)&b[BPS * 0]);
        const __m128i b1 = _mm_loadu_si128((__m128i*)&b[BPS * 1]);
        const __m128i b2 = _mm_loadu_si128((__m128i*)&b[BPS * 2]);
        const __m128i b3 = _mm_loadu_si128((__m128i*)&b[BPS * 3]);

        // compute clip0(a-b) and clip0(b-a)
        const __m128i a0p = _mm_subs_epu8(a0, b0);
        const __m128i a0m = _mm_subs_epu8(b0, a0);
        const __m128i a1p = _mm_subs_epu8(a1, b1);
        const __m128i a1m = _mm_subs_epu8(b1, a1);
        const __m128i a2p = _mm_subs_epu8(a2, b2);
        const __m128i a2m = _mm_subs_epu8(b2, a2);
        const __m128i a3p = _mm_subs_epu8(a3, b3);
        const __m128i a3m = _mm_subs_epu8(b3, a3);

        // compute |a-b| with 8b arithmetic as clip0(a-b) | clip0(b-a)
        const __m128i diff0 = _mm_or_si128(a0p, a0m);
        const __m128i diff1 = _mm_or_si128(a1p, a1m);
        const __m128i diff2 = _mm_or_si128(a2p, a2m);
        const __m128i diff3 = _mm_or_si128(a3p, a3m);

        // unpack (only four operations, instead of eight)
        const __m128i low0 = _mm_unpacklo_epi8(diff0, zero);
        const __m128i low1 = _mm_unpacklo_epi8(diff1, zero);
        const __m128i low2 = _mm_unpacklo_epi8(diff2, zero);
        const __m128i low3 = _mm_unpacklo_epi8(diff3, zero);

        // multiply with self
        const __m128i low_madd0 = _mm_madd_epi16(low0, low0);
        const __m128i low_madd1 = _mm_madd_epi16(low1, low1);
        const __m128i low_madd2 = _mm_madd_epi16(low2, low2);
        const __m128i low_madd3 = _mm_madd_epi16(low3, low3);

        // collect in a cascading way
        const __m128i low_sum0 = _mm_add_epi32(low_madd0, low_madd1);
        const __m128i low_sum1 = _mm_add_epi32(low_madd2, low_madd3);
        sum1 = _mm_add_epi32(sum1, low_sum0);
        sum2 = _mm_add_epi32(sum2, low_sum1);

        if (do_16) {  // if necessary, process the higher 8 bytes similarly
            const __m128i hi0 = _mm_unpackhi_epi8(diff0, zero);
            const __m128i hi1 = _mm_unpackhi_epi8(diff1, zero);
            const __m128i hi2 = _mm_unpackhi_epi8(diff2, zero);
            const __m128i hi3 = _mm_unpackhi_epi8(diff3, zero);

            const __m128i hi_madd0 = _mm_madd_epi16(hi0, hi0);
            const __m128i hi_madd1 = _mm_madd_epi16(hi1, hi1);
            const __m128i hi_madd2 = _mm_madd_epi16(hi2, hi2);
            const __m128i hi_madd3 = _mm_madd_epi16(hi3, hi3);
            const __m128i hi_sum0 = _mm_add_epi32(hi_madd0, hi_madd1);
            const __m128i hi_sum1 = _mm_add_epi32(hi_madd2, hi_madd3);
            sum1 = _mm_add_epi32(sum1, hi_sum0);
            sum2 = _mm_add_epi32(sum2, hi_sum1);
        }
        a += 4 * BPS;
        b += 4 * BPS;
    }
    {
        int32_t tmp[4];
        const __m128i sum = _mm_add_epi32(sum1, sum2);
        _mm_storeu_si128((__m128i*)tmp, sum);
        return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
    }
}
Example No. 25
void vp9_add_constant_residual_16x16_sse2(const int16_t diff, uint8_t *dest,
                                          int stride) {
  uint8_t abs_diff;
  __m128i d;

  // Prediction data.
  __m128i p0 = _mm_load_si128((const __m128i *)(dest + 0 * stride));
  __m128i p1 = _mm_load_si128((const __m128i *)(dest + 1 * stride));
  __m128i p2 = _mm_load_si128((const __m128i *)(dest + 2 * stride));
  __m128i p3 = _mm_load_si128((const __m128i *)(dest + 3 * stride));
  __m128i p4 = _mm_load_si128((const __m128i *)(dest + 4 * stride));
  __m128i p5 = _mm_load_si128((const __m128i *)(dest + 5 * stride));
  __m128i p6 = _mm_load_si128((const __m128i *)(dest + 6 * stride));
  __m128i p7 = _mm_load_si128((const __m128i *)(dest + 7 * stride));
  __m128i p8 = _mm_load_si128((const __m128i *)(dest + 8 * stride));
  __m128i p9 = _mm_load_si128((const __m128i *)(dest + 9 * stride));
  __m128i p10 = _mm_load_si128((const __m128i *)(dest + 10 * stride));
  __m128i p11 = _mm_load_si128((const __m128i *)(dest + 11 * stride));
  __m128i p12 = _mm_load_si128((const __m128i *)(dest + 12 * stride));
  __m128i p13 = _mm_load_si128((const __m128i *)(dest + 13 * stride));
  __m128i p14 = _mm_load_si128((const __m128i *)(dest + 14 * stride));
  __m128i p15 = _mm_load_si128((const __m128i *)(dest + 15 * stride));

  // Clip diff value to [0, 255] range. Then, do addition or subtraction
  // according to its sign.
  if (diff >= 0) {
    abs_diff = (diff > 255) ? 255 : diff;
    d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);

    p0 = _mm_adds_epu8(p0, d);
    p1 = _mm_adds_epu8(p1, d);
    p2 = _mm_adds_epu8(p2, d);
    p3 = _mm_adds_epu8(p3, d);
    p4 = _mm_adds_epu8(p4, d);
    p5 = _mm_adds_epu8(p5, d);
    p6 = _mm_adds_epu8(p6, d);
    p7 = _mm_adds_epu8(p7, d);
    p8 = _mm_adds_epu8(p8, d);
    p9 = _mm_adds_epu8(p9, d);
    p10 = _mm_adds_epu8(p10, d);
    p11 = _mm_adds_epu8(p11, d);
    p12 = _mm_adds_epu8(p12, d);
    p13 = _mm_adds_epu8(p13, d);
    p14 = _mm_adds_epu8(p14, d);
    p15 = _mm_adds_epu8(p15, d);
  } else {
    abs_diff = (diff < -255) ? 255 : -diff;
    d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);

    p0 = _mm_subs_epu8(p0, d);
    p1 = _mm_subs_epu8(p1, d);
    p2 = _mm_subs_epu8(p2, d);
    p3 = _mm_subs_epu8(p3, d);
    p4 = _mm_subs_epu8(p4, d);
    p5 = _mm_subs_epu8(p5, d);
    p6 = _mm_subs_epu8(p6, d);
    p7 = _mm_subs_epu8(p7, d);
    p8 = _mm_subs_epu8(p8, d);
    p9 = _mm_subs_epu8(p9, d);
    p10 = _mm_subs_epu8(p10, d);
    p11 = _mm_subs_epu8(p11, d);
    p12 = _mm_subs_epu8(p12, d);
    p13 = _mm_subs_epu8(p13, d);
    p14 = _mm_subs_epu8(p14, d);
    p15 = _mm_subs_epu8(p15, d);
  }

  // Store results
  _mm_store_si128((__m128i *)(dest + 0 * stride), p0);
  _mm_store_si128((__m128i *)(dest + 1 * stride), p1);
  _mm_store_si128((__m128i *)(dest + 2 * stride), p2);
  _mm_store_si128((__m128i *)(dest + 3 * stride), p3);
  _mm_store_si128((__m128i *)(dest + 4 * stride), p4);
  _mm_store_si128((__m128i *)(dest + 5 * stride), p5);
  _mm_store_si128((__m128i *)(dest + 6 * stride), p6);
  _mm_store_si128((__m128i *)(dest + 7 * stride), p7);
  _mm_store_si128((__m128i *)(dest + 8 * stride), p8);
  _mm_store_si128((__m128i *)(dest + 9 * stride), p9);
  _mm_store_si128((__m128i *)(dest + 10 * stride), p10);
  _mm_store_si128((__m128i *)(dest + 11 * stride), p11);
  _mm_store_si128((__m128i *)(dest + 12 * stride), p12);
  _mm_store_si128((__m128i *)(dest + 13 * stride), p13);
  _mm_store_si128((__m128i *)(dest + 14 * stride), p14);
  _mm_store_si128((__m128i *)(dest + 15 * stride), p15);
}
Example No. 26
PRBool
gfxAlphaRecovery::RecoverAlphaSSE2(gfxImageSurface* blackSurf,
                                   const gfxImageSurface* whiteSurf)
{
    gfxIntSize size = blackSurf->GetSize();

    if (size != whiteSurf->GetSize() ||
            (blackSurf->Format() != gfxASurface::ImageFormatARGB32 &&
             blackSurf->Format() != gfxASurface::ImageFormatRGB24) ||
            (whiteSurf->Format() != gfxASurface::ImageFormatARGB32 &&
             whiteSurf->Format() != gfxASurface::ImageFormatRGB24))
        return PR_FALSE;

    blackSurf->Flush();
    whiteSurf->Flush();

    unsigned char* blackData = blackSurf->Data();
    unsigned char* whiteData = whiteSurf->Data();

    if ((NS_PTR_TO_UINT32(blackData) & 0xf) != (NS_PTR_TO_UINT32(whiteData) & 0xf) ||
            (blackSurf->Stride() - whiteSurf->Stride()) & 0xf) {
        // Cannot keep these in alignment.
        return PR_FALSE;
    }

    __m128i greenMask = _mm_load_si128((__m128i*)greenMaski);
    __m128i alphaMask = _mm_load_si128((__m128i*)alphaMaski);

    for (PRInt32 i = 0; i < size.height; ++i) {
        PRInt32 j = 0;
        // Loop single pixels until at 16-byte alignment.
        while (NS_PTR_TO_UINT32(blackData) & 0xf && j < size.width) {
            *((PRUint32*)blackData) =
                RecoverPixel(*reinterpret_cast<PRUint32*>(blackData),
                             *reinterpret_cast<PRUint32*>(whiteData));
            blackData += 4;
            whiteData += 4;
            j++;
        }
        // This extra loop allows the compiler to do some more clever register
        // management and makes it about 5% faster than with only the 4 pixel
        // at a time loop.
        for (; j < size.width - 8; j += 8) {
            __m128i black1 = _mm_load_si128((__m128i*)blackData);
            __m128i white1 = _mm_load_si128((__m128i*)whiteData);
            __m128i black2 = _mm_load_si128((__m128i*)(blackData + 16));
            __m128i white2 = _mm_load_si128((__m128i*)(whiteData + 16));

            // Execute the same instructions as described in RecoverPixel, only
            // using an SSE2 packed saturated subtract.
            white1 = _mm_subs_epu8(white1, black1);
            white2 = _mm_subs_epu8(white2, black2);
            white1 = _mm_subs_epu8(greenMask, white1);
            white2 = _mm_subs_epu8(greenMask, white2);
            // Producing the final black pixel in an XMM register and storing
            // that is actually faster than doing a masked store since that
            // does an unaligned storage. We have the black pixel in a register
            // anyway.
            black1 = _mm_andnot_si128(alphaMask, black1);
            black2 = _mm_andnot_si128(alphaMask, black2);
            white1 = _mm_slli_si128(white1, 2);
            white2 = _mm_slli_si128(white2, 2);
            white1 = _mm_and_si128(alphaMask, white1);
            white2 = _mm_and_si128(alphaMask, white2);
            black1 = _mm_or_si128(white1, black1);
            black2 = _mm_or_si128(white2, black2);

            _mm_store_si128((__m128i*)blackData, black1);
            _mm_store_si128((__m128i*)(blackData + 16), black2);
            blackData += 32;
            whiteData += 32;
        }
        for (; j < size.width - 4; j += 4) {
            __m128i black = _mm_load_si128((__m128i*)blackData);
            __m128i white = _mm_load_si128((__m128i*)whiteData);

            white = _mm_subs_epu8(white, black);
            white = _mm_subs_epu8(greenMask, white);
            black = _mm_andnot_si128(alphaMask, black);
            white = _mm_slli_si128(white, 2);
            white = _mm_and_si128(alphaMask, white);
            black = _mm_or_si128(white, black);
            _mm_store_si128((__m128i*)blackData, black);
            blackData += 16;
            whiteData += 16;
        }
        // Loop single pixels until we're done.
        while (j < size.width) {
            *((PRUint32*)blackData) =
                RecoverPixel(*reinterpret_cast<PRUint32*>(blackData),
                             *reinterpret_cast<PRUint32*>(whiteData));
            blackData += 4;
            whiteData += 4;
            j++;
        }
        blackData += blackSurf->Stride() - j * 4;
        whiteData += whiteSurf->Stride() - j * 4;
    }

    blackSurf->MarkDirty();

    return PR_TRUE;
}
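A scalar mirror of the vector path above, sketching the RecoverPixel() the comments refer to, under the assumption that greenMaski holds 0x0000ff00 and alphaMaski 0xff000000 per pixel: alpha is recovered as 255 - (white_green - black_green), and the RGB bytes come from the black surface.

#include <stdint.h>

static uint32_t recover_pixel_sketch(uint32_t black, uint32_t white) {
  const uint32_t g_black = (black >> 8) & 0xff;   // green drawn on black
  const uint32_t g_white = (white >> 8) & 0xff;   // green drawn on white
  const uint32_t alpha = 255u - (g_white - g_black);
  return (alpha << 24) | (black & 0x00ffffffu);   // keep RGB from black
}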
Example No. 27
int
smith_waterman_sse2_byte(const unsigned char *     query_sequence,
                         unsigned char *     query_profile_byte,
                         const int                 query_length,
                         const unsigned char *     db_sequence,
                         const int                 db_length,
                         unsigned char       bias,
                         unsigned char       gap_open,
                         unsigned char       gap_extend,
                         struct f_struct *   f_str)
{
    int     i, j, k;
    int     score;

    int     dup;
    int     cmp;
    int     iter = (query_length + 15) / 16;
    
    __m128i *p;
    __m128i *workspace = (__m128i *) f_str->workspace;

    __m128i E, F, H;

    __m128i v_maxscore;
    __m128i v_bias;
    __m128i v_gapopen;
    __m128i v_gapextend;

    __m128i v_temp;
    __m128i v_zero;

    __m128i *pHLoad, *pHStore;
    __m128i *pE;

    __m128i *pScore;

    /* Load the bias to all elements of a constant */
    dup    = ((short) bias << 8) | bias;
    v_bias = _mm_setzero_si128();
    v_bias = _mm_insert_epi16 (v_bias, dup, 0);
    v_bias = _mm_shufflelo_epi16 (v_bias, 0);
    v_bias = _mm_shuffle_epi32 (v_bias, 0);

    /* Load gap opening penalty to all elements of a constant */
    dup  = ((short) gap_open << 8) | gap_open;
    v_gapopen = _mm_setzero_si128();
    v_gapopen = _mm_insert_epi16 (v_gapopen, dup, 0);
    v_gapopen = _mm_shufflelo_epi16 (v_gapopen, 0);
    v_gapopen = _mm_shuffle_epi32 (v_gapopen, 0);

    /* Load gap extension penalty to all elements of a constant */
    dup  = ((short) gap_extend << 8) | gap_extend;
    v_gapextend = _mm_setzero_si128();
    v_gapextend = _mm_insert_epi16 (v_gapextend, dup, 0);
    v_gapextend = _mm_shufflelo_epi16 (v_gapextend, 0);
    v_gapextend = _mm_shuffle_epi32 (v_gapextend, 0);

    /* initialize the max score */
    /*     v_maxscore = _mm_xor_si128 (v_maxscore, v_maxscore);  - Apple Devel*/
    v_maxscore = _mm_setzero_si128();	/* Apple Devel */

    /* create a constant of all zeros for comparison */
    /* v_zero = _mm_xor_si128 (v_zero, v_zero);   - Apple Devel */
    v_zero = _mm_setzero_si128();	/* Apple Devel */

    /* Zero out the storage vector */
    k = iter * 2;

    p = workspace;
    for (i = 0; i < k; i++)
    {
        _mm_store_si128 (p++, v_maxscore);
    }

    pE = workspace;
    pHStore = pE + iter;
    pHLoad = pHStore + iter;

    for (i = 0; i < db_length; ++i)
    {
        /* fetch first data asap. */
        pScore = (__m128i *) query_profile_byte + db_sequence[i] * iter;

        /* zero out F value. */
        /* F = _mm_xor_si128 (F, F);  -Apple Devel */
        F = _mm_setzero_si128();	/* Apple Devel */

        /* load the next h value */
        H = _mm_load_si128 (pHStore + iter - 1);
        H = _mm_slli_si128 (H, 1);

        p = pHLoad;
        pHLoad = pHStore;
        pHStore = p;

        for (j = 0; j < iter; j++)
        {
            /* load values E. */
            E = _mm_load_si128 (pE + j);

            /* add score to H */
            H = _mm_adds_epu8 (H, *pScore++);
            H = _mm_subs_epu8 (H, v_bias);

            /* Update the highest score encountered so far */
            v_maxscore = _mm_max_epu8 (v_maxscore, H);

            /* get max from H, E and F */
            H = _mm_max_epu8 (H, E);
            H = _mm_max_epu8 (H, F);

            /* save H values */
            _mm_store_si128 (pHStore + j, H);

            /* subtract the gap open penalty from H */
            H = _mm_subs_epu8 (H, v_gapopen);

            /* update E value */
            E = _mm_subs_epu8 (E, v_gapextend);
            E = _mm_max_epu8 (E, H);

            /* update F value */
            F = _mm_subs_epu8 (F, v_gapextend);
            F = _mm_max_epu8 (F, H);

            /* save E values */
            _mm_store_si128 (pE + j, E);

            /* load the next h value */
            H = _mm_load_si128 (pHLoad + j);
        }

        /* reset pointers to the start of the saved data */
        j = 0;
        H = _mm_load_si128 (pHStore + j);

        /*  the computed F value is for the given column.  since */
        /*  we are at the end, we need to shift the F value over */
        /*  to the next column. */
        F = _mm_slli_si128 (F, 1);
        v_temp = _mm_subs_epu8 (H, v_gapopen);
        v_temp = _mm_subs_epu8 (F, v_temp);
        v_temp = _mm_cmpeq_epi8 (v_temp, v_zero);
        cmp  = _mm_movemask_epi8 (v_temp);

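        /*  Lazy-F loop: keep sweeping the column while some lane's F   */
        /*  could still improve a stored H value (i.e. cmp != 0xffff).  */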
        while (cmp != 0xffff) 
        {
            E = _mm_load_si128 (pE + j);

            H = _mm_max_epu8 (H, F);

            /* save H values */
            _mm_store_si128 (pHStore + j, H);

            /* update E in case the new H value would change it */
            H = _mm_subs_epu8 (H, v_gapopen);
            E = _mm_max_epu8 (E, H);
            _mm_store_si128 (pE + j, E);

            /* update F value */
            F = _mm_subs_epu8 (F, v_gapextend);

            j++;
            if (j >= iter)
            {
                j = 0;
                F = _mm_slli_si128 (F, 1);
            }
            H = _mm_load_si128 (pHStore + j);

            v_temp = _mm_subs_epu8 (H, v_gapopen);
            v_temp = _mm_subs_epu8 (F, v_temp);
            v_temp = _mm_cmpeq_epi8 (v_temp, v_zero);
            cmp  = _mm_movemask_epi8 (v_temp);
        }
    }

    /* find largest score in the v_maxscore vector */
    v_temp = _mm_srli_si128 (v_maxscore, 8);
    v_maxscore = _mm_max_epu8 (v_maxscore, v_temp);
    v_temp = _mm_srli_si128 (v_maxscore, 4);
    v_maxscore = _mm_max_epu8 (v_maxscore, v_temp);
    v_temp = _mm_srli_si128 (v_maxscore, 2);
    v_maxscore = _mm_max_epu8 (v_maxscore, v_temp);
    v_temp = _mm_srli_si128 (v_maxscore, 1);
    v_maxscore = _mm_max_epu8 (v_maxscore, v_temp);

    /* store in temporary variable */
    score = _mm_extract_epi16 (v_maxscore, 0);
    score = score & 0x00ff;

    /*  check if we might have overflowed */
    if (score + bias >= 255)
    {
        score = 255;
    }

    /* return largest score */
    return score;
}
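The three broadcast blocks above splat a byte into all 16 lanes via insert_epi16 plus two shuffles, a common hand-rolled SSE2 idiom. For reference, a minimal sketch of the single-intrinsic equivalent (splat_u8 is an illustrative name; compilers expand _mm_set1_epi8 to a comparable sequence):

#include <emmintrin.h>

/* One-intrinsic equivalent of the insert/shufflelo/shuffle_epi32
   broadcast used above to replicate bias/gap_open/gap_extend. */
static __m128i splat_u8(unsigned char b)
{
    return _mm_set1_epi8((char) b);
}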
Example no. 28
template<int patternSize>
void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bool nonmax_suppression)
{
    Mat img = _img.getMat();
    const int K = patternSize/2, N = patternSize + K + 1;
#if CV_SSE2
    const int quarterPatternSize = patternSize/4;
    (void)quarterPatternSize;
#endif
    int i, j, k, pixel[25];
    makeOffsets(pixel, (int)img.step, patternSize);

    keypoints.clear();

    threshold = std::min(std::max(threshold, 0), 255);

#if CV_SSE2
    __m128i delta = _mm_set1_epi8(-128), t = _mm_set1_epi8((char)threshold), K16 = _mm_set1_epi8((char)K);
    (void)K16;
    (void)delta;
    (void)t;
#endif
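    // threshold_tab classifies the centered difference d = x - v (stored at
    // index d + 255): 1 if x is darker than v by more than `threshold`,
    // 2 if brighter by more than `threshold`, 0 otherwise.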
    uchar threshold_tab[512];
    for( i = -255; i <= 255; i++ )
        threshold_tab[i+255] = (uchar)(i < -threshold ? 1 : i > threshold ? 2 : 0);

    AutoBuffer<uchar> _buf((img.cols+16)*3*(sizeof(int) + sizeof(uchar)) + 128);
    uchar* buf[3];
    buf[0] = _buf; buf[1] = buf[0] + img.cols; buf[2] = buf[1] + img.cols;
    int* cpbuf[3];
    cpbuf[0] = (int*)alignPtr(buf[2] + img.cols, sizeof(int)) + 1;
    cpbuf[1] = cpbuf[0] + img.cols + 1;
    cpbuf[2] = cpbuf[1] + img.cols + 1;
    memset(buf[0], 0, img.cols*3);

    for(i = 3; i < img.rows-2; i++)
    {
        const uchar* ptr = img.ptr<uchar>(i) + 3;
        uchar* curr = buf[(i - 3)%3];
        int* cornerpos = cpbuf[(i - 3)%3];
        memset(curr, 0, img.cols);
        int ncorners = 0;

        if( i < img.rows - 3 )
        {
            j = 3;
    #if CV_SSE2
            if( patternSize == 16 )
            {
                for(; j < img.cols - 16 - 3; j += 16, ptr += 16)
                {
                    __m128i m0, m1;
                    __m128i v0 = _mm_loadu_si128((const __m128i*)ptr);
                    __m128i v1 = _mm_xor_si128(_mm_subs_epu8(v0, t), delta);
                    v0 = _mm_xor_si128(_mm_adds_epu8(v0, t), delta);

                    __m128i x0 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[0])), delta);
                    __m128i x1 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[quarterPatternSize])), delta);
                    __m128i x2 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[2*quarterPatternSize])), delta);
                    __m128i x3 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[3*quarterPatternSize])), delta);
                    m0 = _mm_and_si128(_mm_cmpgt_epi8(x0, v0), _mm_cmpgt_epi8(x1, v0));
                    m1 = _mm_and_si128(_mm_cmpgt_epi8(v1, x0), _mm_cmpgt_epi8(v1, x1));
                    m0 = _mm_or_si128(m0, _mm_and_si128(_mm_cmpgt_epi8(x1, v0), _mm_cmpgt_epi8(x2, v0)));
                    m1 = _mm_or_si128(m1, _mm_and_si128(_mm_cmpgt_epi8(v1, x1), _mm_cmpgt_epi8(v1, x2)));
                    m0 = _mm_or_si128(m0, _mm_and_si128(_mm_cmpgt_epi8(x2, v0), _mm_cmpgt_epi8(x3, v0)));
                    m1 = _mm_or_si128(m1, _mm_and_si128(_mm_cmpgt_epi8(v1, x2), _mm_cmpgt_epi8(v1, x3)));
                    m0 = _mm_or_si128(m0, _mm_and_si128(_mm_cmpgt_epi8(x3, v0), _mm_cmpgt_epi8(x0, v0)));
                    m1 = _mm_or_si128(m1, _mm_and_si128(_mm_cmpgt_epi8(v1, x3), _mm_cmpgt_epi8(v1, x0)));
                    m0 = _mm_or_si128(m0, m1);
                    int mask = _mm_movemask_epi8(m0);
                    if( mask == 0 )
                        continue;
                    if( (mask & 255) == 0 )
                    {
                        j -= 8;
                        ptr -= 8;
                        continue;
                    }

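                    // c0/c1 count runs of consecutive "brighter"/"darker"
                    // circle pixels: subtracting an all-ones comparison mask
                    // increments a lane, and-ing with the mask zeroes lanes
                    // where the run breaks; max0/max1 keep the longest runs.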
                    __m128i c0 = _mm_setzero_si128(), c1 = c0, max0 = c0, max1 = c0;
                    for( k = 0; k < N; k++ )
                    {
                        __m128i x = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(ptr + pixel[k])), delta);
                        m0 = _mm_cmpgt_epi8(x, v0);
                        m1 = _mm_cmpgt_epi8(v1, x);

                        c0 = _mm_and_si128(_mm_sub_epi8(c0, m0), m0);
                        c1 = _mm_and_si128(_mm_sub_epi8(c1, m1), m1);

                        max0 = _mm_max_epu8(max0, c0);
                        max1 = _mm_max_epu8(max1, c1);
                    }

                    max0 = _mm_max_epu8(max0, max1);
                    int m = _mm_movemask_epi8(_mm_cmpgt_epi8(max0, K16));

                    for( k = 0; m > 0 && k < 16; k++, m >>= 1 )
                        if(m & 1)
                        {
                            cornerpos[ncorners++] = j+k;
                            if(nonmax_suppression)
                                curr[j+k] = (uchar)cornerScore<patternSize>(ptr+k, pixel, threshold);
                        }
                }
            }
    #endif
            for( ; j < img.cols - 3; j++, ptr++ )
            {
                int v = ptr[0];
                const uchar* tab = &threshold_tab[0] - v + 255;
                int d = tab[ptr[pixel[0]]] | tab[ptr[pixel[8]]];

                if( d == 0 )
                    continue;

                d &= tab[ptr[pixel[2]]] | tab[ptr[pixel[10]]];
                d &= tab[ptr[pixel[4]]] | tab[ptr[pixel[12]]];
                d &= tab[ptr[pixel[6]]] | tab[ptr[pixel[14]]];

                if( d == 0 )
                    continue;

                d &= tab[ptr[pixel[1]]] | tab[ptr[pixel[9]]];
                d &= tab[ptr[pixel[3]]] | tab[ptr[pixel[11]]];
                d &= tab[ptr[pixel[5]]] | tab[ptr[pixel[13]]];
                d &= tab[ptr[pixel[7]]] | tab[ptr[pixel[15]]];

                if( d & 1 )
                {
                    int vt = v - threshold, count = 0;

                    for( k = 0; k < N; k++ )
                    {
                        int x = ptr[pixel[k]];
                        if(x < vt)
                        {
                            if( ++count > K )
                            {
                                cornerpos[ncorners++] = j;
                                if(nonmax_suppression)
                                    curr[j] = (uchar)cornerScore<patternSize>(ptr, pixel, threshold);
                                break;
                            }
                        }
                        else
                            count = 0;
                    }
                }

                if( d & 2 )
                {
                    int vt = v + threshold, count = 0;

                    for( k = 0; k < N; k++ )
                    {
                        int x = ptr[pixel[k]];
                        if(x > vt)
                        {
                            if( ++count > K )
                            {
                                cornerpos[ncorners++] = j;
                                if(nonmax_suppression)
                                    curr[j] = (uchar)cornerScore<patternSize>(ptr, pixel, threshold);
                                break;
                            }
                        }
                        else
                            count = 0;
                    }
                }
            }
        }

        cornerpos[-1] = ncorners;

        if( i == 3 )
            continue;

        const uchar* prev = buf[(i - 4 + 3)%3];
        const uchar* pprev = buf[(i - 5 + 3)%3];
        cornerpos = cpbuf[(i - 4 + 3)%3];
        ncorners = cornerpos[-1];

        for( k = 0; k < ncorners; k++ )
        {
            j = cornerpos[k];
            int score = prev[j];
            if( !nonmax_suppression ||
               (score > prev[j+1] && score > prev[j-1] &&
                score > pprev[j-1] && score > pprev[j] && score > pprev[j+1] &&
                score > curr[j-1] && score > curr[j] && score > curr[j+1]) )
            {
                keypoints.push_back(KeyPoint((float)j, (float)(i-1), 7.f, -1, (float)score));
            }
        }
    }
}
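FAST_t is OpenCV's internal worker; applications reach it through the public cv::FAST entry point. A minimal usage sketch under that assumption (input.png is a placeholder path):

#include <vector>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/features2d.hpp>

int main()
{
    // Load a grayscale image and detect FAST corners with non-max suppression.
    cv::Mat img = cv::imread("input.png", cv::IMREAD_GRAYSCALE);
    if (img.empty())
        return 1;
    std::vector<cv::KeyPoint> kps;
    cv::FAST(img, kps, /*threshold=*/20, /*nonmaxSuppression=*/true);
    return 0;
}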
Example no. 29
template <> SIMD_INLINE __m128i OperationBinary8u<SimdOperationBinary8uSaturatedSubtraction>(const __m128i & a, const __m128i & b)
{
    return _mm_subs_epu8(a, b);
}
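The wrapper above is a one-line exposure of the intrinsic this page collects examples of. For reference, a scalar sketch of the per-byte semantics of _mm_subs_epu8 (subs_epu8_scalar is an illustrative helper, not part of the Simd library):

#include <stdint.h>

/* Per-byte behaviour of _mm_subs_epu8: unsigned a - b, clamped at 0
   instead of wrapping around. */
static uint8_t subs_epu8_scalar(uint8_t a, uint8_t b)
{
    return (uint8_t) (a > b ? a - b : 0);
}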
/* Striped Smith-Waterman
   Record the highest score of each reference position.
   Return the alignment score and ending position of the best alignment, 2nd best alignment, etc.
   Gap begin and gap extension are different.
   weight_match > 0, all other weights < 0.
   The returned positions are 0-based.
 */
static alignment_end* sw_sse2_byte (const int8_t* ref,
							 int8_t ref_dir,	// 0: forward ref; 1: reverse ref
							 int32_t refLen,
							 int32_t readLen,
							 const uint8_t weight_gapO, /* will be used as - */
							 const uint8_t weight_gapE, /* will be used as - */
							 const __m128i* vProfile,
							 uint8_t terminate,	/* the best alignment score: used to terminate
												   the matrix calculation when locating the
												   alignment beginning point. If this score
												   is set to 0, it will not be used */
	 						 uint8_t bias,  /* Shift 0 point to a positive value. */
							 int32_t maskLen) {

#define max16(m, vm) (vm) = _mm_max_epu8((vm), _mm_srli_si128((vm), 8)); \
					  (vm) = _mm_max_epu8((vm), _mm_srli_si128((vm), 4)); \
					  (vm) = _mm_max_epu8((vm), _mm_srli_si128((vm), 2)); \
					  (vm) = _mm_max_epu8((vm), _mm_srli_si128((vm), 1)); \
					  (m) = _mm_extract_epi16((vm), 0)
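	/* max16 folds (vm) in half four times (8-, 4-, 2-, then 1-byte shifts),
	   leaving the maximum of all 16 unsigned bytes in the low byte, which
	   _mm_extract_epi16 reads out. */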

	uint8_t max = 0;		                     /* the max alignment score */
	int32_t end_read = readLen - 1;
	int32_t end_ref = -1; /* 0-based best alignment ending point; -1 means no alignment yet. */
	int32_t segLen = (readLen + 15) / 16; /* number of segments */

	/* array to record the largest score of each reference position */
	uint8_t* maxColumn = (uint8_t*) calloc(refLen, 1);

	/* array to record the alignment read ending position of the largest score of each reference position */
	int32_t* end_read_column = (int32_t*) calloc(refLen, sizeof(int32_t));

	/* Define 16 byte 0 vector. */
	__m128i vZero = _mm_set1_epi32(0);

	__m128i* pvHStore = (__m128i*) calloc(segLen, sizeof(__m128i));
	__m128i* pvHLoad = (__m128i*) calloc(segLen, sizeof(__m128i));
	__m128i* pvE = (__m128i*) calloc(segLen, sizeof(__m128i));
	__m128i* pvHmax = (__m128i*) calloc(segLen, sizeof(__m128i));

	int32_t i, j;
	/* 16 byte insertion begin vector */
	__m128i vGapO = _mm_set1_epi8(weight_gapO);

	/* 16 byte insertion extension vector */
	__m128i vGapE = _mm_set1_epi8(weight_gapE);

	/* 16 byte bias vector */
	__m128i vBias = _mm_set1_epi8(bias);

	__m128i vMaxScore = vZero; /* Trace the highest score of the whole SW matrix. */
	__m128i vMaxMark = vZero; /* Trace the highest score till the previous column. */
	__m128i vTemp;
	int32_t edge, begin = 0, end = refLen, step = 1;
//	int32_t distance = readLen * 2 / 3;
//	int32_t distance = readLen / 2;
//	int32_t distance = readLen;

	/* outer loop to process the reference sequence */
	if (ref_dir == 1) {
		begin = refLen - 1;
		end = -1;
		step = -1;
	}
	for (i = begin; LIKELY(i != end); i += step) {
		int32_t cmp;
		__m128i e, vF = vZero, vMaxColumn = vZero; /* Initialize F to 0; any errors to vH values will be corrected in the Lazy_F loop. */
//		max16(maxColumn[i], vMaxColumn);
//		fprintf(stderr, "middle[%d]: %d\n", i, maxColumn[i]);

		__m128i vH = pvHStore[segLen - 1];
		vH = _mm_slli_si128 (vH, 1); /* Shift the 128-bit value in vH left by 1 byte. */
		const __m128i* vP = vProfile + ref[i] * segLen; /* Right part of the vProfile */

		/* Swap the 2 H buffers. */
		__m128i* pv = pvHLoad;
		pvHLoad = pvHStore;
		pvHStore = pv;

		/* inner loop to process the query sequence */
		for (j = 0; LIKELY(j < segLen); ++j) {
			vH = _mm_adds_epu8(vH, _mm_load_si128(vP + j));
			vH = _mm_subs_epu8(vH, vBias); /* vH will be always > 0 */
	//	max16(maxColumn[i], vH);
	//	fprintf(stderr, "H[%d]: %d\n", i, maxColumn[i]);
//	int8_t* t;
//	int32_t ti;
//for (t = (int8_t*)&vH, ti = 0; ti < 16; ++ti) fprintf(stderr, "%d\t", *t++);

			/* Get max from vH, vE and vF. */
			e = _mm_load_si128(pvE + j);
			vH = _mm_max_epu8(vH, e);
			vH = _mm_max_epu8(vH, vF);
			vMaxColumn = _mm_max_epu8(vMaxColumn, vH);

	//	max16(maxColumn[i], vMaxColumn);
	//	fprintf(stderr, "middle[%d]: %d\n", i, maxColumn[i]);
//	for (t = (int8_t*)&vMaxColumn, ti = 0; ti < 16; ++ti) fprintf(stderr, "%d\t", *t++);

			/* Save vH values. */
			_mm_store_si128(pvHStore + j, vH);

			/* Update vE value. */
			vH = _mm_subs_epu8(vH, vGapO); /* saturation arithmetic, result >= 0 */
			e = _mm_max_epu8(e, vH);
			e = _mm_subs_epu8(e, vGapE);
			_mm_store_si128(pvE + j, e);

			/* Update vF value. */
			vF = _mm_max_epu8(vF, vH);
			vF = _mm_subs_epu8(vF, vGapE);

			/* Load the next vH. */
			vH = _mm_load_si128(pvHLoad + j);
		}

		/* Lazy_F loop: revised to disallow an adjacent insertion followed by a deletion, so E(i, j) is not updated here (approach learned from SWPS3) */
        /* reset pointers to the start of the saved data */
        j = 0;
        vH = _mm_load_si128 (pvHStore + j);

        /*  the computed vF value is for the given column.  since */
        /*  we are at the end, we need to shift the vF value over */
        /*  to the next column. */
        vF = _mm_slli_si128 (vF, 1);
        vTemp = _mm_subs_epu8 (vH, vGapO);
		vTemp = _mm_subs_epu8 (vF, vTemp);
		vTemp = _mm_cmpeq_epi8 (vTemp, vZero);
		cmp  = _mm_movemask_epi8 (vTemp);

        while (cmp != 0xffff)
        {
            vH = _mm_max_epu8 (vH, vF);
			vMaxColumn = _mm_max_epu8(vMaxColumn, vH);
            _mm_store_si128 (pvHStore + j, vH);
            vF = _mm_subs_epu8 (vF, vGapE);
            j++;
            if (j >= segLen)
            {
                j = 0;
                vF = _mm_slli_si128 (vF, 1);
            }
            vH = _mm_load_si128 (pvHStore + j);

            vTemp = _mm_subs_epu8 (vH, vGapO);
            vTemp = _mm_subs_epu8 (vF, vTemp);
            vTemp = _mm_cmpeq_epi8 (vTemp, vZero);
            cmp  = _mm_movemask_epi8 (vTemp);
        }

		vMaxScore = _mm_max_epu8(vMaxScore, vMaxColumn);
		vTemp = _mm_cmpeq_epi8(vMaxMark, vMaxScore);
		cmp = _mm_movemask_epi8(vTemp);
		if (cmp != 0xffff) {
			uint8_t temp;
			vMaxMark = vMaxScore;
			max16(temp, vMaxScore);
			vMaxScore = vMaxMark;

			if (LIKELY(temp > max)) {
				max = temp;
				if (max + bias >= 255) break;	//overflow
				end_ref = i;

				/* Store the column with the highest alignment score in order to trace the alignment ending position on read. */
				for (j = 0; LIKELY(j < segLen); ++j) pvHmax[j] = pvHStore[j];
			}
		}

		/* Record the max score of current column. */
		max16(maxColumn[i], vMaxColumn);
//		fprintf(stderr, "maxColumn[%d]: %d\n", i, maxColumn[i]);
		if (maxColumn[i] == terminate) break;
	}

	/* Trace the alignment ending position on read. */
	uint8_t *t = (uint8_t*)pvHmax;
	int32_t column_len = segLen * 16;
	for (i = 0; LIKELY(i < column_len); ++i, ++t) {
		int32_t temp;
		if (*t == max) {
			temp = i / 16 + i % 16 * segLen;
			if (temp < end_read) end_read = temp;
		}
	}

	free(pvHmax);
	free(pvE);
	free(pvHLoad);
	free(pvHStore);

	/* Find the most possible 2nd best alignment. */
	alignment_end* bests = (alignment_end*) calloc(2, sizeof(alignment_end));
	bests[0].score = max + bias >= 255 ? 255 : max;
	bests[0].ref = end_ref;
	bests[0].read = end_read;

	bests[1].score = 0;
	bests[1].ref = 0;
	bests[1].read = 0;

	edge = (end_ref - maskLen) > 0 ? (end_ref - maskLen) : 0;
	for (i = 0; i < edge; i ++) {
//			fprintf (stderr, "maxColumn[%d]: %d\n", i, maxColumn[i]);
		if (maxColumn[i] > bests[1].score) {
			bests[1].score = maxColumn[i];
			bests[1].ref = i;
		}
	}
	edge = (end_ref + maskLen) > refLen ? refLen : (end_ref + maskLen);
	for (i = edge + 1; i < refLen; i ++) {
//			fprintf (stderr, "refLen: %d\tmaxColumn[%d]: %d\n", refLen, i, maxColumn[i]);
		if (maxColumn[i] > bests[1].score) {
			bests[1].score = maxColumn[i];
			bests[1].ref = i;
		}
	}

	free(maxColumn);
	free(end_read_column);
	return bests;
}
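The ending-position trace above walks pvHmax byte by byte; because the profile is striped, byte i maps to read position i/16 + (i%16)*segLen. A standalone sketch of that mapping (not part of the library), using segLen = 4 as an example:

#include <stdio.h>

int main(void)
{
    const int segLen = 4;  /* e.g. readLen in 49..64 gives (readLen+15)/16 = 4 */
    /* Byte i of the concatenated segment vectors corresponds to read
       position i/16 + (i%16)*segLen, exactly as in the trace loop above. */
    for (int i = 0; i < 20; ++i)
        printf("byte %2d -> read position %2d\n", i, i / 16 + i % 16 * segLen);
    return 0;
}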