コード例 #1
0
ファイル: mmintrin64.c プロジェクト: Fluffiest/splayer
__m64 _m_psubb(__m64 _MM1, __m64 _MM2)
{
	__m128i lhs = {0}, rhs = {0};
	lhs.m128i_i64[0] = _MM1.m64_i64;

	rhs.m128i_i64[0] = _MM2.m64_i64;

	lhs = _mm_sub_epi8(lhs, rhs);

	_MM1.m64_i64 = lhs.m128i_i64[0];
	return _MM1;
}
コード例 #2
0
ファイル: adddiff.cpp プロジェクト: tp7/masktools
static void adddiff_sse2_t(Byte *pDst, ptrdiff_t dst_pitch, const Byte *pSrc, ptrdiff_t src_pitch, int width, int height)
{
    int mod32_width = (width / 32) * 32;
    auto pDst2 = pDst;
    auto pSrc2 = pSrc;
    auto v128 = _mm_set1_epi32(0x80808080);

    for ( int j = 0; j < height; ++j ) {
        for ( int i = 0; i < mod32_width; i+=32 ) {
            _mm_prefetch(reinterpret_cast<const char*>(pDst)+i+128, _MM_HINT_T0);
            _mm_prefetch(reinterpret_cast<const char*>(pSrc)+i+128, _MM_HINT_T0);

            auto dst = simd_load_si128<mem_mode>(pDst+i);
            auto dst2 = simd_load_si128<mem_mode>(pDst+i+16);
            auto src = simd_load_si128<mem_mode>(pSrc+i);
            auto src2 = simd_load_si128<mem_mode>(pSrc+i+16);

            auto dstsub = _mm_sub_epi8(dst, v128);
            auto dstsub2 = _mm_sub_epi8(dst2, v128);

            auto srcsub = _mm_sub_epi8(src, v128);
            auto srcsub2 = _mm_sub_epi8(src2, v128);

            auto added = _mm_adds_epi8(dstsub, srcsub);
            auto added2 = _mm_adds_epi8(dstsub2, srcsub2);

            auto result = _mm_add_epi8(added, v128);
            auto result2 = _mm_add_epi8(added2, v128);

            simd_store_si128<mem_mode>(pDst+i, result);
            simd_store_si128<mem_mode>(pDst+i+16, result2);
        }
        pDst += dst_pitch;
        pSrc += src_pitch;
    }

    if (width > mod32_width) {
        adddiff_c(pDst2 + mod32_width, dst_pitch, pSrc2 + mod32_width, src_pitch, width - mod32_width, height);
    }
}
コード例 #3
0
ファイル: demod_soft.c プロジェクト: srsLTE/srsLTE
void demod_16qam_lte_b_sse(const cf_t *symbols, int8_t *llr, int nsymbols) {
  float *symbolsPtr = (float*) symbols;
  __m128i *resultPtr = (__m128i*) llr;
  __m128 symbol1, symbol2, symbol3, symbol4;
  __m128i symbol_i1, symbol_i2, symbol_i3, symbol_i4, symbol_i, symbol_abs, symbol_12, symbol_34;
  __m128i offset = _mm_set1_epi8(2*SCALE_BYTE_CONV_QAM16/sqrt(10));
  __m128i result1n, result1a, result2n, result2a;
  __m128 scale_v = _mm_set1_ps(-SCALE_BYTE_CONV_QAM16);

  __m128i shuffle_negated_1 = _mm_set_epi8(0xff,0xff,7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0);
  __m128i shuffle_abs_1     = _mm_set_epi8(7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0,0xff,0xff);

  __m128i shuffle_negated_2 = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8);
  __m128i shuffle_abs_2     = _mm_set_epi8(15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8,0xff,0xff);

  for (int i=0;i<nsymbols/8;i++) {
    symbol1   = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    symbol2   = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    symbol3   = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    symbol4   = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    symbol_i1 = _mm_cvtps_epi32(_mm_mul_ps(symbol1, scale_v));
    symbol_i2 = _mm_cvtps_epi32(_mm_mul_ps(symbol2, scale_v));
    symbol_i3 = _mm_cvtps_epi32(_mm_mul_ps(symbol3, scale_v));
    symbol_i4 = _mm_cvtps_epi32(_mm_mul_ps(symbol4, scale_v));
    symbol_12  = _mm_packs_epi32(symbol_i1, symbol_i2);
    symbol_34  = _mm_packs_epi32(symbol_i3, symbol_i4);
    symbol_i   = _mm_packs_epi16(symbol_12, symbol_34);

    symbol_abs  = _mm_abs_epi8(symbol_i);
    symbol_abs  = _mm_sub_epi8(symbol_abs, offset);

    result1n = _mm_shuffle_epi8(symbol_i, shuffle_negated_1);
    result1a = _mm_shuffle_epi8(symbol_abs, shuffle_abs_1);

    result2n = _mm_shuffle_epi8(symbol_i, shuffle_negated_2);
    result2a = _mm_shuffle_epi8(symbol_abs, shuffle_abs_2);

    _mm_store_si128(resultPtr, _mm_or_si128(result1n, result1a)); resultPtr++;
    _mm_store_si128(resultPtr, _mm_or_si128(result2n, result2a)); resultPtr++;

  }
  // Demodulate last symbols
  for (int i=8*(nsymbols/8);i<nsymbols;i++) {
    short yre = (int8_t) (SCALE_BYTE_CONV_QAM16*crealf(symbols[i]));
    short yim = (int8_t) (SCALE_BYTE_CONV_QAM16*cimagf(symbols[i]));

    llr[4*i+0] = -yre;
    llr[4*i+1] = -yim;
    llr[4*i+2] = abs(yre)-2*SCALE_BYTE_CONV_QAM16/sqrt(10);
    llr[4*i+3] = abs(yim)-2*SCALE_BYTE_CONV_QAM16/sqrt(10);
  }
}
コード例 #4
0
ファイル: lossless_enc_sse41.c プロジェクト: 03050903/godot
static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
  int i;
  const __m128i kCstShuffle = _mm_set_epi8(-1, 13, -1, 13, -1, 9, -1, 9,
                                           -1,  5, -1,  5, -1, 1, -1, 1);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
    const __m128i in_0g0g = _mm_shuffle_epi8(in, kCstShuffle);
    const __m128i out = _mm_sub_epi8(in, in_0g0g);
    _mm_storeu_si128((__m128i*)&argb_data[i], out);
  }
  // fallthrough and finish off with plain-C
  VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);
}
コード例 #5
0
ファイル: lossless_enc_sse2.c プロジェクト: 03050903/godot
static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
    const __m128i A = _mm_srli_epi16(in, 8);     // 0 a 0 g
    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // 0g0g
    const __m128i out = _mm_sub_epi8(in, C);
    _mm_storeu_si128((__m128i*)&argb_data[i], out);
  }
  // fallthrough and finish off with plain-C
  VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);
}
コード例 #6
0
ファイル: lossless_enc_sse2.c プロジェクト: Achraf33/opencv
// Predictor0: ARGB_BLACK.
static void PredictorSub0_SSE2(const uint32_t* in, const uint32_t* upper,
                               int num_pixels, uint32_t* out) {
  int i;
  const __m128i black = _mm_set1_epi32(ARGB_BLACK);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    const __m128i res = _mm_sub_epi8(src, black);
    _mm_storeu_si128((__m128i*)&out[i], res);
  }
  if (i != num_pixels) {
    VP8LPredictorsSub_C[0](in + i, upper + i, num_pixels - i, out + i);
  }
}
コード例 #7
0
// Denoise a 16x1 vector.
static INLINE __m128i vp9_denoiser_16x1_sse2(
    const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
    const __m128i *k_0, const __m128i *k_4, const __m128i *k_8,
    const __m128i *k_16, const __m128i *l3, const __m128i *l32,
    const __m128i *l21, __m128i acc_diff) {
  // Calculate differences
  const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
  const __m128i v_mc_running_avg_y =
      _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
  __m128i v_running_avg_y;
  const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
  const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
  // Obtain the sign. FF if diff is negative.
  const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, *k_0);
  // Clamp absolute difference to 16 to be used to get mask. Doing this
  // allows us to use _mm_cmpgt_epi8, which operates on signed byte.
  const __m128i clamped_absdiff =
      _mm_min_epu8(_mm_or_si128(pdiff, ndiff), *k_16);
  // Get masks for l2 l1 and l0 adjustments.
  const __m128i mask2 = _mm_cmpgt_epi8(*k_16, clamped_absdiff);
  const __m128i mask1 = _mm_cmpgt_epi8(*k_8, clamped_absdiff);
  const __m128i mask0 = _mm_cmpgt_epi8(*k_4, clamped_absdiff);
  // Get adjustments for l2, l1, and l0.
  __m128i adj2 = _mm_and_si128(mask2, *l32);
  const __m128i adj1 = _mm_and_si128(mask1, *l21);
  const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
  __m128i adj, padj, nadj;

  // Combine the adjustments and get absolute adjustments.
  adj2 = _mm_add_epi8(adj2, adj1);
  adj = _mm_sub_epi8(*l3, adj2);
  adj = _mm_andnot_si128(mask0, adj);
  adj = _mm_or_si128(adj, adj0);

  // Restore the sign and get positive and negative adjustments.
  padj = _mm_andnot_si128(diff_sign, adj);
  nadj = _mm_and_si128(diff_sign, adj);

  // Calculate filtered value.
  v_running_avg_y = _mm_adds_epu8(v_sig, padj);
  v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj);
  _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);

  // Adjustments <=7, and each element in acc_diff can fit in signed
  // char.
  acc_diff = _mm_adds_epi8(acc_diff, padj);
  acc_diff = _mm_subs_epi8(acc_diff, nadj);
  return acc_diff;
}
コード例 #8
0
ファイル: dsp.lossless_sse2.c プロジェクト: Antranilan/Sparky
static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
  const __m128i mask = _mm_set1_epi32(0x0000ff00);
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
    const __m128i in_00g0 = _mm_and_si128(in, mask);     // 00g0|00g0|...
    const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8);  // 0g00|0g00|...
    const __m128i in_000g = _mm_srli_epi32(in_00g0, 8);  // 000g|000g|...
    const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g);
    const __m128i out = _mm_sub_epi8(in, in_0g0g);
    _mm_storeu_si128((__m128i*)&argb_data[i], out);
  }
  // fallthrough and finish off with plain-C
  VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);
}
コード例 #9
0
static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
                                            int src_stride,
                                            const uint8_t *a_ptr, int a_stride,
                                            const uint8_t *b_ptr, int b_stride,
                                            const uint8_t *m_ptr, int m_stride,
                                            int width, int height) {
  int x, y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));

  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x += 16) {
      const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
      const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
      const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
      const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]);
      const __m128i m_inv = _mm_sub_epi8(mask_max, m);

      // Calculate 16 predicted pixels.
      // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
      // is 64 * 255, so we have plenty of space to add rounding constants.
      const __m128i data_l = _mm_unpacklo_epi8(a, b);
      const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv);
      __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
      pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);

      const __m128i data_r = _mm_unpackhi_epi8(a, b);
      const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv);
      __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
      pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);

      const __m128i pred = _mm_packus_epi16(pred_l, pred_r);
      res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));
    }

    src_ptr += src_stride;
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
  // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'.
  int32_t sad =
      _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8));
  return (sad + 31) >> 6;
}
コード例 #10
0
ファイル: lossless_enc_sse2.c プロジェクト: Achraf33/opencv
// Predictor5: avg2(avg2(L, TR), T)
static void PredictorSub5_SSE2(const uint32_t* in, const uint32_t* upper,
                               int num_pixels, uint32_t* out) {
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
    const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);
    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    __m128i avg, pred, res;
    Average2_m128i(&L, &TR, &avg);
    Average2_m128i(&avg, &T, &pred);
    res = _mm_sub_epi8(src, pred);
    _mm_storeu_si128((__m128i*)&out[i], res);
  }
  if (i != num_pixels) {
    VP8LPredictorsSub_C[5](in + i, upper + i, num_pixels - i, out + i);
  }
}
コード例 #11
0
ファイル: exponent_sse2.c プロジェクト: buserror/aften
int
exponent_sum_square_error_sse2(uint8_t *exp0, uint8_t *exp1, int ncoefs)
{
    int i, err;
    int exp_error = 0;
    union {
        __m128i v;
        int32_t res[4];
    } ures;
    __m128i vzero = _mm_setzero_si128();
    __m128i vres = vzero;

    for (i = 0; i < (ncoefs & ~15); i+=16) {
        __m128i vexp = _mm_loadu_si128((__m128i*)&exp0[i]);
        __m128i vexp2 = _mm_loadu_si128((__m128i*)&exp1[i]);
#if 0
        //safer but needed?
        __m128i vexphi = _mm_unpackhi_epi8(vexp, vzero);
        __m128i vexp2hi = _mm_unpackhi_epi8(vexp2, vzero);
        __m128i vexplo = _mm_unpacklo_epi8(vexp, vzero);
        __m128i vexp2lo = _mm_unpacklo_epi8(vexp2, vzero);
        __m128i verrhi = _mm_sub_epi16(vexphi, vexp2hi);
        __m128i verrlo = _mm_sub_epi16(vexplo, vexp2lo);
#else
        __m128i verr = _mm_sub_epi8(vexp, vexp2);
        __m128i vsign = _mm_cmplt_epi8(verr, vzero);
        __m128i verrhi = _mm_unpackhi_epi8(verr, vsign);
        __m128i verrlo = _mm_unpacklo_epi8(verr, vsign);
#endif
        verrhi = _mm_madd_epi16(verrhi, verrhi);
        verrlo = _mm_madd_epi16(verrlo, verrlo);
        verrhi = _mm_add_epi32(verrhi, verrlo);
        vres = _mm_add_epi32(vres, verrhi);
    }
    _mm_store_si128(&ures.v, vres);
    ures.res[0]+=ures.res[1];
    ures.res[2]+=ures.res[3];
    exp_error += ures.res[0]+ures.res[2];
    for (; i < ncoefs; ++i) {
        err = exp0[i] - exp1[i];
        exp_error += (err * err);
    }
    return exp_error;
}
コード例 #12
0
ファイル: dec_sse2.c プロジェクト: 8l/insieme
// Applies filter on 4 pixels (p1, p0, q0 and q1)
static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
                                  __m128i* const q0, __m128i* const q1,
                                  const __m128i* const mask, int hev_thresh) {
    const __m128i sign_bit = _mm_set1_epi8(0x80);
    const __m128i k64 = _mm_set1_epi8(0x40);
    const __m128i zero = _mm_setzero_si128();
    __m128i not_hev;
    __m128i t1, t2, t3;

    // compute hev mask
    GetNotHEV(p1, p0, q0, q1, hev_thresh, &not_hev);

    // convert to signed values
    FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);

    t1 = _mm_subs_epi8(*p1, *q1);        // p1 - q1
    t1 = _mm_andnot_si128(not_hev, t1);  // hev(p1 - q1)
    t2 = _mm_subs_epi8(*q0, *p0);        // q0 - p0
    t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 1 * (q0 - p0)
    t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 2 * (q0 - p0)
    t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 3 * (q0 - p0)
    t1 = _mm_and_si128(t1, *mask);       // mask filter values we don't care about

    t2 = _mm_set1_epi8(3);
    t3 = _mm_set1_epi8(4);
    t2 = _mm_adds_epi8(t1, t2);        // 3 * (q0 - p0) + (p1 - q1) + 3
    t3 = _mm_adds_epi8(t1, t3);        // 3 * (q0 - p0) + (p1 - q1) + 4
    SignedShift8b(&t2);                // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
    SignedShift8b(&t3);                // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
    *p0 = _mm_adds_epi8(*p0, t2);      // p0 += t2
    *q0 = _mm_subs_epi8(*q0, t3);      // q0 -= t3
    FLIP_SIGN_BIT2(*p0, *q0);

    // this is equivalent to signed (a + 1) >> 1 calculation
    t2 = _mm_add_epi8(t3, sign_bit);
    t3 = _mm_avg_epu8(t2, zero);
    t3 = _mm_sub_epi8(t3, k64);

    t3 = _mm_and_si128(not_hev, t3);   // if !hev
    *q1 = _mm_subs_epi8(*q1, t3);      // q1 -= t3
    *p1 = _mm_adds_epi8(*p1, t3);      // p1 += t3
    FLIP_SIGN_BIT2(*p1, *q1);
}
コード例 #13
0
static INLINE unsigned int masked_sad8xh_ssse3(
    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
    int height) {
  int y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));

  for (y = 0; y < height; y += 2) {
    const __m128i src = _mm_unpacklo_epi64(
        _mm_loadl_epi64((const __m128i *)src_ptr),
        _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
    const __m128i a0 = _mm_loadl_epi64((const __m128i *)a_ptr);
    const __m128i a1 = _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]);
    const __m128i b0 = _mm_loadl_epi64((const __m128i *)b_ptr);
    const __m128i b1 = _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]);
    const __m128i m =
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr),
                           _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride]));
    const __m128i m_inv = _mm_sub_epi8(mask_max, m);

    const __m128i data_l = _mm_unpacklo_epi8(a0, b0);
    const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv);
    __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
    pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);

    const __m128i data_r = _mm_unpacklo_epi8(a1, b1);
    const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv);
    __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
    pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);

    const __m128i pred = _mm_packus_epi16(pred_l, pred_r);
    res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));

    src_ptr += src_stride * 2;
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }
  int32_t sad =
      _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8));
  return (sad + 31) >> 6;
}
コード例 #14
0
ファイル: ScanlineFilter.cpp プロジェクト: cdyk/imgcompbench
void
filterScanlinesSSE( unsigned char* filtered,
                    unsigned char* image,
                    unsigned int WIDTH,
                    unsigned int HEIGHT )
{
    int blocks = 3*WIDTH/16;

    // Create move-mask for last block of each scanline
    __m128i mask = _mm_cmplt_epi8( _mm_set_epi8( 15, 14, 13, 12, 11, 10, 9, 8,
                                                  7,  6,  5,  4,  3,  2, 1, 0 ),
                                   _mm_set1_epi8( 3*WIDTH-16*blocks ) );
    {
        const unsigned char* in = image;
        unsigned char* out = filtered;
        *out++ = 0;
        for(int b=0; b<blocks; b++ ) {
            _mm_storeu_si128( (__m128i*)out, _mm_lddqu_si128( (__m128i const*)in ) );
            in += 16;
            out += 16;
        }
        _mm_maskmoveu_si128( _mm_lddqu_si128( (__m128i const*)in ), mask, (char*)out );
    }

    for( unsigned int j=1; j<HEIGHT; j++ ) {
        const unsigned char* in = image + 3*WIDTH*(j-1);
        unsigned char* out = filtered + (3*WIDTH+1)*j;
        *out++ = 2;
        for(int b=0; b<blocks; b++ ) {
            __m128i _t0 = _mm_lddqu_si128( (__m128i const*)in );
            __m128i _t1 = _mm_lddqu_si128( (__m128i const*)(in + 3*WIDTH ) );

            _mm_storeu_si128( (__m128i*)out,
                              _mm_sub_epi8( _t1, _t0 ) );
            in += 16;
            out += 16;
        }
        _mm_maskmoveu_si128( _mm_lddqu_si128( (__m128i const*)in ),
                             mask,
                             (char*)out );

    }
}
コード例 #15
0
static INLINE unsigned int masked_sad4xh_ssse3(
    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
    int height) {
  int y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));

  for (y = 0; y < height; y += 2) {
    // Load two rows at a time, this seems to be a bit faster
    // than four rows at a time in this case.
    const __m128i src = _mm_unpacklo_epi32(
        _mm_cvtsi32_si128(*(uint32_t *)src_ptr),
        _mm_cvtsi32_si128(*(uint32_t *)&src_ptr[src_stride]));
    const __m128i a =
        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)a_ptr),
                           _mm_cvtsi32_si128(*(uint32_t *)&a_ptr[a_stride]));
    const __m128i b =
        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)b_ptr),
                           _mm_cvtsi32_si128(*(uint32_t *)&b_ptr[b_stride]));
    const __m128i m =
        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)m_ptr),
                           _mm_cvtsi32_si128(*(uint32_t *)&m_ptr[m_stride]));
    const __m128i m_inv = _mm_sub_epi8(mask_max, m);

    const __m128i data = _mm_unpacklo_epi8(a, b);
    const __m128i mask = _mm_unpacklo_epi8(m, m_inv);
    __m128i pred_16bit = _mm_maddubs_epi16(data, mask);
    pred_16bit = xx_roundn_epu16(pred_16bit, AOM_BLEND_A64_ROUND_BITS);

    const __m128i pred = _mm_packus_epi16(pred_16bit, _mm_setzero_si128());
    res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));

    src_ptr += src_stride * 2;
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }
  // At this point, the SAD is stored in lane 0 of 'res'
  int32_t sad = _mm_cvtsi128_si32(res);
  return (sad + 31) >> 6;
}
コード例 #16
0
ファイル: lossless_enc_sse2.c プロジェクト: Achraf33/opencv
static void PredictorSub11_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
    const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    __m128i pa, pb;
    GetSumAbsDiff32_SSE2(&T, &TL, &pa);   // pa = sum |T-TL|
    GetSumAbsDiff32_SSE2(&L, &TL, &pb);   // pb = sum |L-TL|
    {
      const __m128i mask = _mm_cmpgt_epi32(pb, pa);
      const __m128i A = _mm_and_si128(mask, L);
      const __m128i B = _mm_andnot_si128(mask, T);
      const __m128i pred = _mm_or_si128(A, B);    // pred = (L > T)? L : T
      const __m128i res = _mm_sub_epi8(src, pred);
      _mm_storeu_si128((__m128i*)&out[i], res);
    }
  }
  if (i != num_pixels) {
    VP8LPredictorsSub_C[11](in + i, upper + i, num_pixels - i, out + i);
  }
}
コード例 #17
0
static void GradientPredictDirect(const uint8_t* const row,
                                  const uint8_t* const top,
                                  uint8_t* const out, int length) {
  const int max_pos = length & ~7;
  int i;
  const __m128i zero = _mm_setzero_si128();
  for (i = 0; i < max_pos; i += 8) {
    const __m128i A0 = _mm_loadl_epi64((const __m128i*)&row[i - 1]);
    const __m128i B0 = _mm_loadl_epi64((const __m128i*)&top[i]);
    const __m128i C0 = _mm_loadl_epi64((const __m128i*)&top[i - 1]);
    const __m128i D = _mm_loadl_epi64((const __m128i*)&row[i]);
    const __m128i A1 = _mm_unpacklo_epi8(A0, zero);
    const __m128i B1 = _mm_unpacklo_epi8(B0, zero);
    const __m128i C1 = _mm_unpacklo_epi8(C0, zero);
    const __m128i E = _mm_add_epi16(A1, B1);
    const __m128i F = _mm_sub_epi16(E, C1);
    const __m128i G = _mm_packus_epi16(F, zero);
    const __m128i H = _mm_sub_epi8(D, G);
    _mm_storel_epi64((__m128i*)(out + i), H);
  }
  for (; i < length; ++i) {
    out[i] = row[i] - GradientPredictorC(row[i - 1], top[i], top[i - 1]);
  }
}
コード例 #18
0
ファイル: SimdAvx2Ann.cpp プロジェクト: ashh87/Simd
		template <> __m128i Invert<true>(__m128i value)
		{
			return _mm_sub_epi8(Sse2::K_INV_ZERO, value);
		}
コード例 #19
0
ファイル: fast.cpp プロジェクト: AnnaPetrovicheva/opencv
void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bool nonmax_suppression)
{
    Mat img = _img.getMat();
    const int K = patternSize/2, N = patternSize + K + 1;
#if CV_SSE2
    const int quarterPatternSize = patternSize/4;
    (void)quarterPatternSize;
#endif
    int i, j, k, pixel[25];
    makeOffsets(pixel, (int)img.step, patternSize);

    keypoints.clear();

    threshold = std::min(std::max(threshold, 0), 255);

#if CV_SSE2
    __m128i delta = _mm_set1_epi8(-128), t = _mm_set1_epi8((char)threshold), K16 = _mm_set1_epi8((char)K);
    (void)K16;
    (void)delta;
    (void)t;
#endif
    uchar threshold_tab[512];
    for( i = -255; i <= 255; i++ )
        threshold_tab[i+255] = (uchar)(i < -threshold ? 1 : i > threshold ? 2 : 0);

    AutoBuffer<uchar> _buf((img.cols+16)*3*(sizeof(int) + sizeof(uchar)) + 128);
    uchar* buf[3];
    buf[0] = _buf; buf[1] = buf[0] + img.cols; buf[2] = buf[1] + img.cols;
    int* cpbuf[3];
    cpbuf[0] = (int*)alignPtr(buf[2] + img.cols, sizeof(int)) + 1;
    cpbuf[1] = cpbuf[0] + img.cols + 1;
    cpbuf[2] = cpbuf[1] + img.cols + 1;
    memset(buf[0], 0, img.cols*3);

    for(i = 3; i < img.rows-2; i++)
    {
        const uchar* ptr = img.ptr<uchar>(i) + 3;
        uchar* curr = buf[(i - 3)%3];
        int* cornerpos = cpbuf[(i - 3)%3];
        memset(curr, 0, img.cols);
        int ncorners = 0;

        if( i < img.rows - 3 )
        {
            j = 3;
    #if CV_SSE2
            if( patternSize == 16 )
            {
                for(; j < img.cols - 16 - 3; j += 16, ptr += 16)
                {
                    __m128i m0, m1;
                    __m128i v0 = _mm_loadu_si128((const __m128i*)ptr);
                    __m128i v1 = _mm_xor_si128(_mm_subs_epu8(v0, t), delta);
                    v0 = _mm_xor_si128(_mm_adds_epu8(v0, t), delta);

                    __m128i x0 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[0])), delta);
                    __m128i x1 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[quarterPatternSize])), delta);
                    __m128i x2 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[2*quarterPatternSize])), delta);
                    __m128i x3 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[3*quarterPatternSize])), delta);
                    m0 = _mm_and_si128(_mm_cmpgt_epi8(x0, v0), _mm_cmpgt_epi8(x1, v0));
                    m1 = _mm_and_si128(_mm_cmpgt_epi8(v1, x0), _mm_cmpgt_epi8(v1, x1));
                    m0 = _mm_or_si128(m0, _mm_and_si128(_mm_cmpgt_epi8(x1, v0), _mm_cmpgt_epi8(x2, v0)));
                    m1 = _mm_or_si128(m1, _mm_and_si128(_mm_cmpgt_epi8(v1, x1), _mm_cmpgt_epi8(v1, x2)));
                    m0 = _mm_or_si128(m0, _mm_and_si128(_mm_cmpgt_epi8(x2, v0), _mm_cmpgt_epi8(x3, v0)));
                    m1 = _mm_or_si128(m1, _mm_and_si128(_mm_cmpgt_epi8(v1, x2), _mm_cmpgt_epi8(v1, x3)));
                    m0 = _mm_or_si128(m0, _mm_and_si128(_mm_cmpgt_epi8(x3, v0), _mm_cmpgt_epi8(x0, v0)));
                    m1 = _mm_or_si128(m1, _mm_and_si128(_mm_cmpgt_epi8(v1, x3), _mm_cmpgt_epi8(v1, x0)));
                    m0 = _mm_or_si128(m0, m1);
                    int mask = _mm_movemask_epi8(m0);
                    if( mask == 0 )
                        continue;
                    if( (mask & 255) == 0 )
                    {
                        j -= 8;
                        ptr -= 8;
                        continue;
                    }

                    __m128i c0 = _mm_setzero_si128(), c1 = c0, max0 = c0, max1 = c0;
                    for( k = 0; k < N; k++ )
                    {
                        __m128i x = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(ptr + pixel[k])), delta);
                        m0 = _mm_cmpgt_epi8(x, v0);
                        m1 = _mm_cmpgt_epi8(v1, x);

                        c0 = _mm_and_si128(_mm_sub_epi8(c0, m0), m0);
                        c1 = _mm_and_si128(_mm_sub_epi8(c1, m1), m1);

                        max0 = _mm_max_epu8(max0, c0);
                        max1 = _mm_max_epu8(max1, c1);
                    }

                    max0 = _mm_max_epu8(max0, max1);
                    int m = _mm_movemask_epi8(_mm_cmpgt_epi8(max0, K16));

                    for( k = 0; m > 0 && k < 16; k++, m >>= 1 )
                        if(m & 1)
                        {
                            cornerpos[ncorners++] = j+k;
                            if(nonmax_suppression)
                                curr[j+k] = (uchar)cornerScore<patternSize>(ptr+k, pixel, threshold);
                        }
                }
            }
    #endif
            for( ; j < img.cols - 3; j++, ptr++ )
            {
                int v = ptr[0];
                const uchar* tab = &threshold_tab[0] - v + 255;
                int d = tab[ptr[pixel[0]]] | tab[ptr[pixel[8]]];

                if( d == 0 )
                    continue;

                d &= tab[ptr[pixel[2]]] | tab[ptr[pixel[10]]];
                d &= tab[ptr[pixel[4]]] | tab[ptr[pixel[12]]];
                d &= tab[ptr[pixel[6]]] | tab[ptr[pixel[14]]];

                if( d == 0 )
                    continue;

                d &= tab[ptr[pixel[1]]] | tab[ptr[pixel[9]]];
                d &= tab[ptr[pixel[3]]] | tab[ptr[pixel[11]]];
                d &= tab[ptr[pixel[5]]] | tab[ptr[pixel[13]]];
                d &= tab[ptr[pixel[7]]] | tab[ptr[pixel[15]]];

                if( d & 1 )
                {
                    int vt = v - threshold, count = 0;

                    for( k = 0; k < N; k++ )
                    {
                        int x = ptr[pixel[k]];
                        if(x < vt)
                        {
                            if( ++count > K )
                            {
                                cornerpos[ncorners++] = j;
                                if(nonmax_suppression)
                                    curr[j] = (uchar)cornerScore<patternSize>(ptr, pixel, threshold);
                                break;
                            }
                        }
                        else
                            count = 0;
                    }
                }

                if( d & 2 )
                {
                    int vt = v + threshold, count = 0;

                    for( k = 0; k < N; k++ )
                    {
                        int x = ptr[pixel[k]];
                        if(x > vt)
                        {
                            if( ++count > K )
                            {
                                cornerpos[ncorners++] = j;
                                if(nonmax_suppression)
                                    curr[j] = (uchar)cornerScore<patternSize>(ptr, pixel, threshold);
                                break;
                            }
                        }
                        else
                            count = 0;
                    }
                }
            }
        }

        cornerpos[-1] = ncorners;

        if( i == 3 )
            continue;

        const uchar* prev = buf[(i - 4 + 3)%3];
        const uchar* pprev = buf[(i - 5 + 3)%3];
        cornerpos = cpbuf[(i - 4 + 3)%3];
        ncorners = cornerpos[-1];

        for( k = 0; k < ncorners; k++ )
        {
            j = cornerpos[k];
            int score = prev[j];
            if( !nonmax_suppression ||
               (score > prev[j+1] && score > prev[j-1] &&
                score > pprev[j-1] && score > pprev[j] && score > pprev[j+1] &&
                score > curr[j-1] && score > curr[j] && score > curr[j+1]) )
            {
                keypoints.push_back(KeyPoint((float)j, (float)(i-1), 7.f, -1, (float)score));
            }
        }
    }
コード例 #20
0
}bool validate_utf8_sse(const char *src, size_t len) {
  const char *end = src + len;
  while (src + 16 < end) {
    __m128i chunk = _mm_loadu_si128((const __m128i *)(src));

    int asciiMask = _mm_movemask_epi8(chunk);
    if (!asciiMask) {
      src += 16;
      continue;
    }

    __m128i chunk_signed = _mm_add_epi8(chunk, _mm_set1_epi8(0x80));
    __m128i cond2 =
        _mm_cmplt_epi8(_mm_set1_epi8(0xc2 - 1 - 0x80), chunk_signed);
    __m128i state = _mm_set1_epi8((char)(0x0 | 0x80));
    state = _mm_blendv_epi8(state, _mm_set1_epi8((char)(0x2 | 0xc0)), cond2);

    __m128i cond3 =
        _mm_cmplt_epi8(_mm_set1_epi8(0xe0 - 1 - 0x80), chunk_signed);

    state = _mm_blendv_epi8(state, _mm_set1_epi8((char)(0x3 | 0xe0)), cond3);
    __m128i mask3 = _mm_slli_si128(cond3, 1);

    __m128i cond4 =
        _mm_cmplt_epi8(_mm_set1_epi8(0xf0 - 1 - 0x80), chunk_signed);

    // Fall back to the scalar processing
    if (_mm_movemask_epi8(cond4)) {
      break;
    }

    __m128i count = _mm_and_si128(state, _mm_set1_epi8(0x7));

    __m128i count_sub1 = _mm_subs_epu8(count, _mm_set1_epi8(0x1));

    __m128i counts = _mm_add_epi8(count, _mm_slli_si128(count_sub1, 1));

    __m128i shifts = count_sub1;
    shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 1));
    counts = _mm_add_epi8(
        counts, _mm_slli_si128(_mm_subs_epu8(counts, _mm_set1_epi8(0x2)), 2));
    shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 2));

    if (asciiMask ^ _mm_movemask_epi8(_mm_cmpgt_epi8(counts, _mm_set1_epi8(0))))
      return false; // error
    shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 4));

    if (_mm_movemask_epi8(_mm_cmpgt_epi8(
            _mm_sub_epi8(_mm_slli_si128(counts, 1), counts), _mm_set1_epi8(1))))
      return false; // error

    shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 8));

    __m128i mask = _mm_and_si128(state, _mm_set1_epi8(0xf8));
    shifts =
        _mm_and_si128(shifts, _mm_cmplt_epi8(counts, _mm_set1_epi8(2))); // <=1

    chunk =
        _mm_andnot_si128(mask, chunk); // from now on, we only have usefull bits

    shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 1),
                             _mm_srli_si128(_mm_slli_epi16(shifts, 7), 1));

    __m128i chunk_right = _mm_slli_si128(chunk, 1);

    __m128i chunk_low = _mm_blendv_epi8(
        chunk,
        _mm_or_si128(chunk, _mm_and_si128(_mm_slli_epi16(chunk_right, 6),
                                          _mm_set1_epi8(0xc0))),
        _mm_cmpeq_epi8(counts, _mm_set1_epi8(1)));

    __m128i chunk_high =
        _mm_and_si128(chunk, _mm_cmpeq_epi8(counts, _mm_set1_epi8(2)));

    shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 2),
                             _mm_srli_si128(_mm_slli_epi16(shifts, 6), 2));
    chunk_high = _mm_srli_epi32(chunk_high, 2);

    shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 4),
                             _mm_srli_si128(_mm_slli_epi16(shifts, 5), 4));
    chunk_high = _mm_or_si128(
        chunk_high, _mm_and_si128(_mm_and_si128(_mm_slli_epi32(chunk_right, 4),
                                                _mm_set1_epi8(0xf0)),
                                  mask3));
    int c = _mm_extract_epi16(counts, 7);
    int source_advance = !(c & 0x0200) ? 16 : !(c & 0x02) ? 15 : 14;

    __m128i high_bits = _mm_and_si128(chunk_high, _mm_set1_epi8(0xf8));
    if (!_mm_testz_si128(
            mask3,
            _mm_or_si128(_mm_cmpeq_epi8(high_bits, _mm_set1_epi8(0x00)),
                         _mm_cmpeq_epi8(high_bits, _mm_set1_epi8(0xd8)))))
      return false;

    shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 8),
                             _mm_srli_si128(_mm_slli_epi16(shifts, 4), 8));

    chunk_high = _mm_slli_si128(chunk_high, 1);

    __m128i shuf =
        _mm_add_epi8(shifts, _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5,
                                          4, 3, 2, 1, 0));

    chunk_low = _mm_shuffle_epi8(chunk_low, shuf);
    chunk_high = _mm_shuffle_epi8(chunk_high, shuf);
    __m128i utf16_low = _mm_unpacklo_epi8(chunk_low, chunk_high);
    __m128i utf16_high = _mm_unpackhi_epi8(chunk_low, chunk_high);

    if (_mm_cmpestrc(_mm_cvtsi64_si128(0xfdeffdd0fffffffe), 4, utf16_high, 8,
                     _SIDD_UWORD_OPS | _SIDD_CMP_RANGES) |
        _mm_cmpestrc(_mm_cvtsi64_si128(0xfdeffdd0fffffffe), 4, utf16_low, 8,
                     _SIDD_UWORD_OPS | _SIDD_CMP_RANGES)) {
      return false;
    }

    src += source_advance;
  }
  return validate_utf8(src, end - src);
}
コード例 #21
0
ファイル: turbodecoder_sse.c プロジェクト: HankW507/srsLTE
/* Computes the horizontal MAX from 8 16-bit integers using the minpos_epu16 SSE4.1 instruction */
static inline int16_t hMax(__m128i buffer)
{
  __m128i tmp1 = _mm_sub_epi8(_mm_set1_epi16(0x7FFF), buffer);
  __m128i tmp3 = _mm_minpos_epu16(tmp1);
  return (int16_t)(_mm_cvtsi128_si32(tmp3));
}
コード例 #22
0
ファイル: demod_soft.c プロジェクト: srsLTE/srsLTE
void demod_64qam_lte_b_sse(const cf_t *symbols, int8_t *llr, int nsymbols)
{
  float *symbolsPtr = (float*) symbols;
  __m128i *resultPtr = (__m128i*) llr;
  __m128 symbol1, symbol2, symbol3, symbol4;
  __m128i symbol_i1, symbol_i2, symbol_i3, symbol_i4, symbol_i, symbol_abs, symbol_abs2,symbol_12, symbol_34;
  __m128i offset1 = _mm_set1_epi8(4*SCALE_BYTE_CONV_QAM64/sqrt(42));
  __m128i offset2 = _mm_set1_epi8(2*SCALE_BYTE_CONV_QAM64/sqrt(42));
  __m128 scale_v = _mm_set1_ps(-SCALE_BYTE_CONV_QAM64);
  __m128i result11, result12, result13, result22, result21,result23, result31, result32, result33;

  __m128i shuffle_negated_1 = _mm_set_epi8(0xff,0xff,5,4,0xff,0xff,0xff,0xff,3,2,0xff,0xff,0xff,0xff,1,0);
  __m128i shuffle_negated_2 = _mm_set_epi8(11,10,0xff,0xff,0xff,0xff,9,8,0xff,0xff,0xff,0xff,7,6,0xff,0xff);
  __m128i shuffle_negated_3 = _mm_set_epi8(0xff,0xff,0xff,0xff,15,14,0xff,0xff,0xff,0xff,13,12,0xff,0xff,0xff,0xff);

  __m128i shuffle_abs_1 = _mm_set_epi8(5,4,0xff,0xff,0xff,0xff,3,2,0xff,0xff,0xff,0xff,1,0,0xff,0xff);
  __m128i shuffle_abs_2 = _mm_set_epi8(0xff,0xff,0xff,0xff,9,8,0xff,0xff,0xff,0xff,7,6,0xff,0xff,0xff,0xff);
  __m128i shuffle_abs_3 = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,0xff,0xff,13,12,0xff,0xff,0xff,0xff,11,10);

  __m128i shuffle_abs2_1 = _mm_set_epi8(0xff,0xff,0xff,0xff,3,2,0xff,0xff,0xff,0xff,1,0,0xff,0xff,0xff,0xff);
  __m128i shuffle_abs2_2 = _mm_set_epi8(0xff,0xff,9,8,0xff,0xff,0xff,0xff,7,6,0xff,0xff,0xff,0xff,5,4);
  __m128i shuffle_abs2_3 = _mm_set_epi8(15,14,0xff,0xff,0xff,0xff,13,12,0xff,0xff,0xff,0xff,11,10,0xff,0xff);

  for (int i=0;i<nsymbols/8;i++) {
    symbol1   = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    symbol2   = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    symbol3   = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    symbol4   = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    symbol_i1 = _mm_cvtps_epi32(_mm_mul_ps(symbol1, scale_v));
    symbol_i2 = _mm_cvtps_epi32(_mm_mul_ps(symbol2, scale_v));
    symbol_i3 = _mm_cvtps_epi32(_mm_mul_ps(symbol3, scale_v));
    symbol_i4 = _mm_cvtps_epi32(_mm_mul_ps(symbol4, scale_v));
    symbol_12  = _mm_packs_epi32(symbol_i1, symbol_i2);
    symbol_34  = _mm_packs_epi32(symbol_i3, symbol_i4);
    symbol_i   = _mm_packs_epi16(symbol_12, symbol_34);

    symbol_abs  = _mm_abs_epi8(symbol_i);
    symbol_abs  = _mm_sub_epi8(symbol_abs, offset1);
    symbol_abs2 = _mm_sub_epi8(_mm_abs_epi8(symbol_abs), offset2);

    result11 = _mm_shuffle_epi8(symbol_i, shuffle_negated_1);
    result12 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_1);
    result13 = _mm_shuffle_epi8(symbol_abs2, shuffle_abs2_1);

    result21 = _mm_shuffle_epi8(symbol_i, shuffle_negated_2);
    result22 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_2);
    result23 = _mm_shuffle_epi8(symbol_abs2, shuffle_abs2_2);

    result31 = _mm_shuffle_epi8(symbol_i, shuffle_negated_3);
    result32 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_3);
    result33 = _mm_shuffle_epi8(symbol_abs2, shuffle_abs2_3);

    _mm_store_si128(resultPtr, _mm_or_si128(_mm_or_si128(result11, result12),result13)); resultPtr++;
    _mm_store_si128(resultPtr, _mm_or_si128(_mm_or_si128(result21, result22),result23)); resultPtr++;
    _mm_store_si128(resultPtr, _mm_or_si128(_mm_or_si128(result31, result32),result33)); resultPtr++;

  }
  for (int i=8*(nsymbols/8);i<nsymbols;i++) {
    float yre = (int8_t) (SCALE_BYTE_CONV_QAM64*crealf(symbols[i]));
    float yim = (int8_t) (SCALE_BYTE_CONV_QAM64*cimagf(symbols[i]));

    llr[6*i+0] = -yre;
    llr[6*i+1] = -yim;
    llr[6*i+2] = abs(yre)-4*SCALE_BYTE_CONV_QAM64/sqrt(42);
    llr[6*i+3] = abs(yim)-4*SCALE_BYTE_CONV_QAM64/sqrt(42);
    llr[6*i+4] = abs(llr[6*i+2])-2*SCALE_BYTE_CONV_QAM64/sqrt(42);
    llr[6*i+5] = abs(llr[6*i+3])-2*SCALE_BYTE_CONV_QAM64/sqrt(42);
  }
}
コード例 #23
0
ファイル: Sobel.cpp プロジェクト: Yanzqing/BHumanCodeRelease
void Sobel::sobelSSE(const Image1D& srcImage, SobelImage& destImage)
{
  ASSERT(srcImage.width % 16 == 0);
  ASSERT(srcImage.height >= 3);
  ASSERT(srcImage.yStart >= 0);
  ASSERT(srcImage.yStart <= srcImage.height);

  destImage.setResolution(srcImage.width, srcImage.height);
  destImage.yStart = srcImage.yStart;

  if(srcImage.yStart >= srcImage.height)
    return;

  // a b c    0 1 2
  // d e f    3 4 5
  // g h i    6 7 8
  __m128i valA, valB, valC, valD, valF, valG, valH, valI;
  __m128i sumX;
  __m128i sumY;
  __m128i tmp;

  __m128i zeros = _mm_setzero_si128();

  __m128i* pDestImg;
  __m128i* pDestImgLineEnd;

  // Fill top line
  for(pDestImg = reinterpret_cast<__m128i*>(destImage[destImage.yStart]), pDestImgLineEnd = reinterpret_cast<__m128i*>(destImage[destImage.yStart + 1]);
      pDestImg < pDestImgLineEnd; ++pDestImg)
  {
    *pDestImg = zeros;
  }


  int lastRow = destImage.height - 1;
  const Image1D::Pixel* p0 = srcImage[destImage.yStart];
  const Image1D::Pixel* p1 = srcImage[destImage.yStart + 1];
  const Image1D::Pixel* p2 = srcImage[destImage.yStart + 2];
  const Image1D::Pixel* p0LineEnd;

  for(int y = destImage.yStart + 1; y < lastRow; ++y)
  {
    for(p0LineEnd = srcImage[y], pDestImg = reinterpret_cast<__m128i*>(destImage[y]); p0 < p0LineEnd;
        p0 += 16, p1 += 16, p2 += 16, pDestImg += 2)
    {
      // laod values
      valA = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p0 - 1));
      valB = _mm_load_si128(reinterpret_cast<const __m128i*>(p0));
      valC = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p0 + 1));

      valD = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p1 - 1));
      valF = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p1 + 1));

      valG = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p2 - 1));
      valH = _mm_load_si128(reinterpret_cast<const __m128i*>(p2));
      valI = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p2 + 1));

      sumX = _mm_avg_epu8(valA, valG); // sumX = (a + g) / 2
      sumX = _mm_avg_epu8(sumX, valD); // sumX = (sumX + d) / 2
      sumX = _mm_avg_epu8(sumX, zeros); // sumX = sumX / 2 with average, because there is no 8 bit shift
      tmp = _mm_avg_epu8(valC, valI); // tmp = (c + i) / 2
      tmp = _mm_avg_epu8(tmp, valF); // tnp = (tmp + f) / 2
      tmp = _mm_avg_epu8(tmp, zeros); // tnp = tnp / 2 with average, because there is no 8 bit shift
      sumX = _mm_sub_epi8(sumX, tmp);

      sumY = _mm_avg_epu8(valA, valC); // sumX = (a + c) / 2
      sumY = _mm_avg_epu8(sumY, valB); // sumX = (sumX + b) / 2
      sumY = _mm_avg_epu8(sumY, zeros); // sumX = sumX / 2 with average, because there is no 8 bit shift
      tmp = _mm_avg_epu8(valG, valI); // tmp = (g + i) / 2
      tmp = _mm_avg_epu8(tmp, valH); // tnp = (tmp + h) / 2
      tmp = _mm_avg_epu8(tmp, zeros); // tnp = tnp / 2 with average, because there is no 8 bit shift
      sumY = _mm_sub_epi8(sumY, tmp);

      *pDestImg = _mm_unpacklo_epi8(sumX, sumY);
      *(pDestImg + 1) = _mm_unpackhi_epi8(sumX, sumY);
    }
  }

  // Fill bottom line
  for(pDestImg = reinterpret_cast<__m128i*>(destImage[destImage.height - 1]), pDestImgLineEnd = reinterpret_cast<__m128i*>(destImage[destImage.height]);
      pDestImg < pDestImgLineEnd; ++pDestImg)
  {
    *pDestImg = zeros;
  }

  // Fill right and left border
  for(int y = destImage.yStart; y < destImage.height - 1; y++)
  {
    destImage[y]->index = 0;
    (destImage[y + 1] - 1)->index = 0;
  }
}
コード例 #24
0
mlib_status
__mlib_VectorSumAbsDiff_S8_Sat(
    mlib_d64 *z,
    const mlib_s8 *x,
    const mlib_s8 *y,
    mlib_s32 n)
{
    if (n <= 0)
        return (MLIB_FAILURE);

    mlib_s32 i, nstep, ax, ay, n1, n2, n3, diff, sum = 0;
    mlib_s8 *px = (mlib_s8 *)x, *py = (mlib_s8 *)y;
    __m128i zero, xbuf, ybuf, zbuf, mext, mbuf;
    zero = _mm_setzero_si128();
    zbuf = zero;

    nstep = 16 / sizeof (mlib_s8);
    ax = (mlib_addr)x & 15;
    ay = (mlib_addr)y & 15;
    n1 = ((16 - ax) & 15) / sizeof (mlib_s8);
    n2 = (n - n1) / nstep;
    n3 = n - n1 - n2 * nstep;

    if (n2 < 1) {
        for (i = 0; i < n; i++) {
            diff = (mlib_s32)(*px++) - (*py++);
            sum += ABS_VALUE(diff);
        }
        *z = sum;
    } else {
        for (i = 0; i < n1; i++) {
            diff = (mlib_s32)(*px++) - (*py++);
            sum += ABS_VALUE(diff);
        }
        if (ax == ay) {
            for (i = 0; i < n2; i++) {
                xbuf = _mm_load_si128((__m128i *)px);
                ybuf = _mm_load_si128((__m128i *)py);
                mext = _mm_cmpgt_epi8(ybuf, xbuf);
                mbuf = _mm_sub_epi8(xbuf, ybuf);
                mbuf = _mm_xor_si128(mbuf, mext);
                mbuf = _mm_sub_epi8(mbuf, mext);
                mbuf = _mm_sad_epu8(mbuf, zero);
                zbuf = _mm_add_epi64(zbuf, mbuf);
                px += nstep;
                py += nstep;
            }
        } else {
            for (i = 0; i < n2; i++) {
                xbuf = _mm_load_si128((__m128i *)px);
                ybuf = _mm_loadu_si128((__m128i *)py);
                mext = _mm_cmpgt_epi8(ybuf, xbuf);
                mbuf = _mm_sub_epi8(xbuf, ybuf);
                mbuf = _mm_xor_si128(mbuf, mext);
                mbuf = _mm_sub_epi8(mbuf, mext);
                mbuf = _mm_sad_epu8(mbuf, zero);
                zbuf = _mm_add_epi64(zbuf, mbuf);
                px += nstep;
                py += nstep;
            }
        }
        for (i = 0; i < n3; i++) {
            diff = (mlib_s32)(*px++) - (*py++);
            sum += ABS_VALUE(diff);
        }

        mlib_d64 dsum = sum;
        long long pz[2];
        _mm_storeu_si128((__m128i *)pz, zbuf);
        dsum += pz[0];
        dsum += pz[1];
        *z = dsum;
    }
    return (MLIB_SUCCESS);
}
コード例 #25
0
EB_ERRORTYPE GatherSaoStatisticsLcu_OnlyEo_90_45_135_16bit_SSE2_INTRIN(
    EB_U16                   *inputSamplePtr,       // input parameter, source Picture Ptr
    EB_U32                   inputStride,           // input parameter, source stride
    EB_U16                   *reconSamplePtr,       // input parameter, deblocked Picture Ptr
    EB_U32                   reconStride,           // input parameter, deblocked stride
    EB_U32                   lcuWidth,              // input parameter, LCU width
    EB_U32                   lcuHeight,             // input parameter, LCU height
    EB_S32                   eoDiff[SAO_EO_TYPES][SAO_EO_CATEGORIES + 1],    // output parameter, used to store Edge Offset diff, eoDiff[SAO_EO_TYPES] [SAO_EO_CATEGORIES]
    EB_U16                   eoCount[SAO_EO_TYPES][SAO_EO_CATEGORIES + 1])   // output parameter, used to store Edge Offset count, eoCount[SAO_EO_TYPES] [SAO_EO_CATEGORIES]
    // output parameter, used to store Edge Offset count, eoCount[SAO_EO_TYPES] [SAO_EO_CATEGORIES]
{
#define boShift 5

    EB_ERRORTYPE return_error = EB_ErrorNone;
    EB_U64 count_x, count_y;
    EB_S32 diff;
    __m128i xmm0, xmm_1, xmm_N1, xmm_N3, xmm_N4, xmm_skip_mask, xmm9, xmm10, xmm11, xmm12, xmm13, xmm15;
    __m128i xmm_temp_input1, xmm_temp_input2, xmm_temp_recon1, xmm_temp_recon2, xmm_diff1, xmm_diff2;
    __m128i xmm_sign_1, xmm_sign_1a, xmm_sign_1b, xmm_sign_2a, xmm_sign_2b, xmm_sign_2, xmm_eoIndex;

    xmm0 = _mm_setzero_si128();
    xmm12 = _mm_setzero_si128();
    xmm15 = _mm_set1_epi16(0x0001);
    xmm_N1 = _mm_set1_epi8((signed char)0xFF);
    xmm_N3 = _mm_set1_epi8((signed char)0xFD);
    xmm_N4 = _mm_set1_epi8((signed char)0xFC);
    xmm_1 = _mm_sub_epi8(xmm0, xmm_N1);

    // Initialize SAO Arrays
    EB_ALIGN(16) EB_S8 rTemp[512] = { 0 };
    EB_U64 reconStrideTemp;

    lcuHeight -= 2;                          
    inputSamplePtr += inputStride + 1;       
    reconSamplePtr++;                        

    if (lcuWidth == 16) {

        xmm_skip_mask = _mm_srli_si128(xmm_N1, 2);
        for (count_y = 0; count_y < lcuHeight; ++count_y) {

            xmm_temp_recon1 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride));
            xmm_temp_recon2 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride + 8));
            xmm_temp_input1 = _mm_loadu_si128((__m128i *)(inputSamplePtr));
            xmm_temp_input2 = _mm_loadu_si128((__m128i *)(inputSamplePtr + 8));
            xmm_diff1 = _mm_sub_epi16(xmm_temp_input1, xmm_temp_recon1);
            xmm_diff2 = _mm_sub_epi16(xmm_temp_input2, xmm_temp_recon2);

            xmm_diff2 = _mm_slli_si128(xmm_diff2, 4); //skip last 2 samples
            xmm_diff2 = _mm_srli_si128(xmm_diff2, 4); //skip last 2 samples

            // EO-90
            MACRO_CALC_EO_INDEX(reconSamplePtr, reconSamplePtr+2*reconStride)
            xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 2 samples
            MACRO_GATHER_EO(OFFSET_EO_DIFF_1, OFFSET_EO_COUNT_1)

            // EO-135
            MACRO_CALC_EO_INDEX(reconSamplePtr-1, reconSamplePtr+2*reconStride+1)
            xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 2 samples
            MACRO_GATHER_EO(OFFSET_EO_DIFF_2, OFFSET_EO_COUNT_2)

           // EO-45
           MACRO_CALC_EO_INDEX(reconSamplePtr+1, reconSamplePtr+2*reconStride-1)
           xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 2 samples
           MACRO_GATHER_EO(OFFSET_EO_DIFF_3, OFFSET_EO_COUNT_3)
           
           inputSamplePtr += inputStride;
           reconSamplePtr += reconStride;
        }
        lcuWidth = 2;
    }
    else if (lcuWidth == 28) {

        xmm_skip_mask = _mm_srli_si128(xmm_N1, 6);

        for (count_y = 0; count_y < lcuHeight; ++count_y) {
            //----------- 0-15 -----------
            xmm_temp_recon1 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride));
            xmm_temp_recon2 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride + 8));
            xmm_temp_input1 = _mm_loadu_si128((__m128i *)(inputSamplePtr));
            xmm_temp_input2 = _mm_loadu_si128((__m128i *)(inputSamplePtr + 8));
            xmm_diff1 = _mm_sub_epi16(xmm_temp_input1, xmm_temp_recon1);
            xmm_diff2 = _mm_sub_epi16(xmm_temp_input2, xmm_temp_recon2);

            // EO-90
            MACRO_CALC_EO_INDEX(reconSamplePtr, reconSamplePtr+2*reconStride)
            MACRO_GATHER_EO(OFFSET_EO_DIFF_1, OFFSET_EO_COUNT_1)

            // EO-135
            MACRO_CALC_EO_INDEX(reconSamplePtr-1, reconSamplePtr+2*reconStride+1)
            MACRO_GATHER_EO(OFFSET_EO_DIFF_2, OFFSET_EO_COUNT_2)

            // EO-45
            MACRO_CALC_EO_INDEX(reconSamplePtr+1, reconSamplePtr+2*reconStride-1)
            MACRO_GATHER_EO(OFFSET_EO_DIFF_3, OFFSET_EO_COUNT_3)
            
            //----------- 16-25 -----------
            xmm_temp_recon1 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride + 16));
            xmm_temp_recon2 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride + 24));
            xmm_temp_input1 = _mm_loadu_si128((__m128i *)(inputSamplePtr + 16));
            xmm_temp_input2 = _mm_loadu_si128((__m128i *)(inputSamplePtr + 24));
            xmm_diff1 = _mm_sub_epi16(xmm_temp_input1, xmm_temp_recon1);
            xmm_diff2 = _mm_sub_epi16(xmm_temp_input2, xmm_temp_recon2);

            xmm_diff2 = _mm_slli_si128(xmm_diff2, 12); //skip last 6 samples
            xmm_diff2 = _mm_srli_si128(xmm_diff2, 12); //skip last 6 samples

            // EO-90
            MACRO_CALC_EO_INDEX(reconSamplePtr+16, reconSamplePtr+2*reconStride+16)
            xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 6 samples
            MACRO_GATHER_EO(OFFSET_EO_DIFF_1, OFFSET_EO_COUNT_1)

            // EO-135
            MACRO_CALC_EO_INDEX(reconSamplePtr+15, reconSamplePtr+2*reconStride+17)
            xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 6 samples
            MACRO_GATHER_EO(OFFSET_EO_DIFF_2, OFFSET_EO_COUNT_2)

            // EO-45
            MACRO_CALC_EO_INDEX(reconSamplePtr+17, reconSamplePtr+2*reconStride+15)
            xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 6 samples
            MACRO_GATHER_EO(OFFSET_EO_DIFF_3, OFFSET_EO_COUNT_3)

            inputSamplePtr += inputStride;
            reconSamplePtr += reconStride;
        }
        lcuWidth = 6;
    }
    else if (lcuWidth == 56) {

        xmm_skip_mask = _mm_srli_si128(xmm_N1, 10);
        lcuWidth -= 8;
        inputStride -= lcuWidth;
        reconStrideTemp = reconStride - lcuWidth;

        for (count_y = 0; count_y < lcuHeight; ++count_y) {
            for (count_x = 0; count_x < lcuWidth; count_x += 16) {
                //----------- 0-15 -----------
                xmm_temp_recon1 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride));
                xmm_temp_recon2 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride + 8));
                xmm_temp_input1 = _mm_loadu_si128((__m128i *)(inputSamplePtr));
                xmm_temp_input2 = _mm_loadu_si128((__m128i *)(inputSamplePtr + 8));
                xmm_diff1 = _mm_sub_epi16(xmm_temp_input1, xmm_temp_recon1);
                xmm_diff2 = _mm_sub_epi16(xmm_temp_input2, xmm_temp_recon2);

                // EO-90
                MACRO_CALC_EO_INDEX(reconSamplePtr, reconSamplePtr + 2 * reconStride)
                MACRO_GATHER_EO(OFFSET_EO_DIFF_1, OFFSET_EO_COUNT_1)

                // EO-135
                MACRO_CALC_EO_INDEX(reconSamplePtr - 1, reconSamplePtr + 2 * reconStride + 1)
                MACRO_GATHER_EO(OFFSET_EO_DIFF_2, OFFSET_EO_COUNT_2)

                // EO-45
                MACRO_CALC_EO_INDEX(reconSamplePtr + 1, reconSamplePtr + 2 * reconStride - 1)
                MACRO_GATHER_EO(OFFSET_EO_DIFF_3, OFFSET_EO_COUNT_3)

                inputSamplePtr += 16;
                reconSamplePtr += 16;                 
            }
            //----------- 48-53 -----------
            xmm_temp_recon1 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride));
            xmm_temp_input1 = _mm_loadu_si128((__m128i *)(inputSamplePtr));
            xmm_diff1 = _mm_sub_epi16(xmm_temp_input1, xmm_temp_recon1);

            xmm_diff1 = _mm_slli_si128(xmm_diff1, 4); //skip last 10 samples
            xmm_diff1 = _mm_srli_si128(xmm_diff1, 4); //skip last 10 samples

            // EO-90
            MACRO_CALC_EO_INDEX_HALF(reconSamplePtr, reconSamplePtr+2*reconStride)
            xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 10 samples
            MACRO_GATHER_EO_HALF(OFFSET_EO_DIFF_1, OFFSET_EO_COUNT_1)

            // EO-135
            MACRO_CALC_EO_INDEX_HALF(reconSamplePtr-1, reconSamplePtr+2*reconStride+1)
            xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 10 samples
            MACRO_GATHER_EO_HALF(OFFSET_EO_DIFF_2, OFFSET_EO_COUNT_2)

            // EO-45
            MACRO_CALC_EO_INDEX_HALF(reconSamplePtr+1, reconSamplePtr+2*reconStride-1)
            xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 10 samples
            MACRO_GATHER_EO_HALF(OFFSET_EO_DIFF_3, OFFSET_EO_COUNT_3)

            inputSamplePtr += inputStride;
            reconSamplePtr += reconStrideTemp;
        }
        lcuWidth = 10;
    }
    else {

        lcuWidth -= 16;
        inputStride -= lcuWidth;
        reconStrideTemp = reconStride - lcuWidth;
        xmm_skip_mask = _mm_srli_si128(xmm_N1, 2);

        for (count_y = 0; count_y < lcuHeight; ++count_y) {
            for (count_x = 0; count_x < lcuWidth; count_x += 16) {
                //----------- 0-15 -----------
                xmm_temp_recon1 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride));
                xmm_temp_recon2 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride + 8));
                xmm_temp_input1 = _mm_loadu_si128((__m128i *)(inputSamplePtr));
                xmm_temp_input2 = _mm_loadu_si128((__m128i *)(inputSamplePtr + 8));
                xmm_diff1 = _mm_sub_epi16(xmm_temp_input1, xmm_temp_recon1);
                xmm_diff2 = _mm_sub_epi16(xmm_temp_input2, xmm_temp_recon2);

                //EO-90
                MACRO_CALC_EO_INDEX(reconSamplePtr, reconSamplePtr + 2 * reconStride)
                MACRO_GATHER_EO(OFFSET_EO_DIFF_1, OFFSET_EO_COUNT_1)

                //EO-135
                MACRO_CALC_EO_INDEX(reconSamplePtr - 1, reconSamplePtr + 2 * reconStride + 1)
                MACRO_GATHER_EO(OFFSET_EO_DIFF_2, OFFSET_EO_COUNT_2)

                //EO-45
                MACRO_CALC_EO_INDEX(reconSamplePtr + 1, reconSamplePtr + 2 * reconStride - 1)
                MACRO_GATHER_EO(OFFSET_EO_DIFF_3, OFFSET_EO_COUNT_3)

                inputSamplePtr += 16;
                reconSamplePtr += 16;
            }
            //----------- 48-61 -----------
            xmm_temp_recon1 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride));
            xmm_temp_recon2 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride + 8));
            xmm_temp_input1 = _mm_loadu_si128((__m128i *)(inputSamplePtr));
            xmm_temp_input2 = _mm_loadu_si128((__m128i *)(inputSamplePtr + 8));
            xmm_diff1 = _mm_sub_epi16(xmm_temp_input1, xmm_temp_recon1);
            xmm_diff2 = _mm_sub_epi16(xmm_temp_input2, xmm_temp_recon2);

            xmm_diff2 = _mm_slli_si128(xmm_diff2, 4); //skip last 2 samples
            xmm_diff2 = _mm_srli_si128(xmm_diff2, 4); //skip last 2 samples

            // EO-90
            MACRO_CALC_EO_INDEX(reconSamplePtr, reconSamplePtr+2*reconStride)
            xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 2 samples
            MACRO_GATHER_EO(OFFSET_EO_DIFF_1, OFFSET_EO_COUNT_1)

            // EO-135
            MACRO_CALC_EO_INDEX(reconSamplePtr-1, reconSamplePtr+2*reconStride+1)
            xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 2 samples
            MACRO_GATHER_EO(OFFSET_EO_DIFF_2, OFFSET_EO_COUNT_2)

            // EO-45
            MACRO_CALC_EO_INDEX(reconSamplePtr+1, reconSamplePtr+2*reconStride-1)
            xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 2 samples
            MACRO_GATHER_EO(OFFSET_EO_DIFF_3, OFFSET_EO_COUNT_3)

            inputSamplePtr += inputStride;
            reconSamplePtr += reconStrideTemp;
        }
        lcuWidth = 2;
    }

    lcuWidth = (EB_U16)lcuWidth * (EB_U16)lcuHeight;

    MACRO_SAVE_EO(OFFSET_EO_DIFF_1, OFFSET_EO_COUNT_1, 1)
    MACRO_SAVE_EO(OFFSET_EO_DIFF_2, OFFSET_EO_COUNT_2, 2)
    MACRO_SAVE_EO(OFFSET_EO_DIFF_3, OFFSET_EO_COUNT_3, 3)
                                    
    return return_error;
}
コード例 #26
0
void vec_i8_cnt_dosage2(const int8_t *p, int8_t *out, size_t n, int8_t val,
	int8_t missing, int8_t missing_substitute)
{
#ifdef COREARRAY_SIMD_SSE2

	// header 1, 16-byte aligned
	size_t h = (16 - ((size_t)out & 0x0F)) & 0x0F;
	for (; (n > 0) && (h > 0); n--, h--, p+=2)
	{
		*out ++ = ((p[0] == missing) || (p[1] == missing)) ?
			missing_substitute :
			(p[0]==val ? 1 : 0) + (p[1]==val ? 1 : 0);
	}

	// body, SSE2
	const __m128i val16  = _mm_set1_epi8(val);
	const __m128i miss16 = _mm_set1_epi8(missing);
	const __m128i sub16  = _mm_set1_epi8(missing_substitute);
	const __m128i mask   = _mm_set1_epi16(0x00FF);

#   ifdef COREARRAY_SIMD_AVX2

	// header 2, 32-byte aligned
	if ((n >= 16) && ((size_t)out & 0x10))
	{
		__m128i w1 = MM_LOADU_128((__m128i const*)p); p += 16;
		__m128i w2 = MM_LOADU_128((__m128i const*)p); p += 16;

		__m128i v1 = _mm_packus_epi16(_mm_and_si128(w1, mask), _mm_and_si128(w2, mask));
		__m128i v2 = _mm_packus_epi16(_mm_srli_epi16(w1, 8), _mm_srli_epi16(w2, 8));

		__m128i c = _mm_setzero_si128();
		c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v1, val16));
		c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v2, val16));

		w1 = _mm_cmpeq_epi8(v1, miss16);
		w2 = _mm_cmpeq_epi8(v2, miss16);
		__m128i w  = _mm_or_si128(w1, w2);
		c = _mm_or_si128(_mm_and_si128(w, sub16), _mm_andnot_si128(w, c));

		_mm_store_si128((__m128i *)out, c);
		n -= 16; out += 16;
	}

	const __m256i val32  = _mm256_set1_epi8(val);
	const __m256i miss32 = _mm256_set1_epi8(missing);
	const __m256i sub32  = _mm256_set1_epi8(missing_substitute);
	const __m256i mask2  = _mm256_set1_epi16(0x00FF);

	for (; n >= 32; n-=32)
	{
		__m256i w1 = MM_LOADU_256((__m256i const*)p); p += 32;
		__m256i w2 = MM_LOADU_256((__m256i const*)p); p += 32;

		__m256i v1 = _mm256_packus_epi16(_mm256_and_si256(w1, mask2), _mm256_and_si256(w2, mask2));
		__m256i v2 = _mm256_packus_epi16(_mm256_srli_epi16(w1, 8), _mm256_srli_epi16(w2, 8));

		__m256i c = _mm256_setzero_si256();
		c = _mm256_sub_epi8(c, _mm256_cmpeq_epi8(v1, val32));
		c = _mm256_sub_epi8(c, _mm256_cmpeq_epi8(v2, val32));

		w1 = _mm256_cmpeq_epi8(v1, miss32);
		w2 = _mm256_cmpeq_epi8(v2, miss32);
		__m256i w = _mm256_or_si256(w1, w2);
		c = _mm256_or_si256(_mm256_and_si256(w, sub32), _mm256_andnot_si256(w, c));

		c = _mm256_permute4x64_epi64(c, 0xD8);
		_mm256_store_si256((__m256i *)out, c);
		out += 32;
	}

#   endif

	// SSE2 only
	for (; n >= 16; n-=16)
	{
		__m128i w1 = MM_LOADU_128((__m128i const*)p); p += 16;
		__m128i w2 = MM_LOADU_128((__m128i const*)p); p += 16;

		__m128i v1 = _mm_packus_epi16(_mm_and_si128(w1, mask), _mm_and_si128(w2, mask));
		__m128i v2 = _mm_packus_epi16(_mm_srli_epi16(w1, 8), _mm_srli_epi16(w2, 8));

		__m128i c = _mm_setzero_si128();
		c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v1, val16));
		c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v2, val16));

		w1 = _mm_cmpeq_epi8(v1, miss16);
		w2 = _mm_cmpeq_epi8(v2, miss16);
		__m128i w = _mm_or_si128(w1, w2);
		c = _mm_or_si128(_mm_and_si128(w, sub16), _mm_andnot_si128(w, c));

		_mm_store_si128((__m128i *)out, c);
		out += 16;
	}

#endif

	// tail
	for (; n > 0; n--, p+=2)
	{
		*out ++ = ((p[0] == missing) || (p[1] == missing)) ?
			missing_substitute :
			(p[0]==val ? 1 : 0) + (p[1]==val ? 1 : 0);
	}
}
コード例 #27
0
// this function performs precise calculations
void PreOver_SSE2(void* dest, const void* source1, const void* source2, size_t size)
{
	static const size_t stride = sizeof(__m128i)*4;
	static const u32 PSD = 64;

	static const __m128i round = _mm_set1_epi16(128);
	static const __m128i lomask = _mm_set1_epi32(0x00FF00FF);

	assert(source1 != NULL && source2 != NULL && dest != NULL);
	assert(size % stride == 0);

	const __m128i* source128_1 = reinterpret_cast<const __m128i*>(source1);
	const __m128i* source128_2 = reinterpret_cast<const __m128i*>(source2);
	__m128i*	   dest128 = reinterpret_cast<__m128i*>(dest);	
	
	__m128i d, s, a, rb, ag, t;

	// TODO: dynamic prefetch schedluing distance? needs to be optimized (R.N)

	for(size_t k = 0, length = size/stride; k < length; ++k)	
	{
		// TODO: put prefetch between calculations?(R.N)
		_mm_prefetch(reinterpret_cast<const s8*>(source128_1+PSD), _MM_HINT_NTA);
		_mm_prefetch(reinterpret_cast<const s8*>(source128_2+PSD), _MM_HINT_NTA);	

		// work on entire cacheline before next prefetch
		for(int n = 0; n < 4; ++n, ++dest128, ++source128_1, ++source128_2)
		{
			// TODO: assembly optimization use PSHUFD on moves before calculations, lower latency than MOVDQA (R.N) http://software.intel.com/en-us/articles/fast-simd-integer-move-for-the-intel-pentiumr-4-processor/

			// TODO: load entire cacheline at the same time? are there enough registers? 32 bit mode (special compile for 64bit?) (R.N)
			s = _mm_load_si128(source128_1);		// AABGGRR
			d = _mm_load_si128(source128_2);		// AABGGRR
						
			// PRELERP(S, D) = S+D - ((S*D[A]+0x80)>>8)+(S*D[A]+0x80))>>8
			// T = S*D[A]+0x80 => PRELERP(S,D) = S+D - ((T>>8)+T)>>8

			// set alpha to lo16 from dest_
			a = _mm_srli_epi32(d, 24);			// 000000AA	
			rb = _mm_slli_epi32(a, 16);			// 00AA0000
			a = _mm_or_si128(rb, a);			// 00AA00AA

			rb = _mm_and_si128(lomask, s);		// 00BB00RR		
			rb = _mm_mullo_epi16(rb, a);		// BBBBRRRR	
			rb = _mm_add_epi16(rb, round);		// BBBBRRRR
			t = _mm_srli_epi16(rb, 8);			// 00BB00RR	
			t = _mm_add_epi16(t, rb);
			rb = _mm_srli_epi16(t, 8);

			ag = _mm_srli_epi16(s, 8); 			// 00AA00GG		
			ag = _mm_mullo_epi16(ag, a);		// AAAAGGGG		
			ag = _mm_add_epi16(ag, round);
			t = _mm_srli_epi16(ag, 8);
			t = _mm_add_epi16(t, ag);
			ag = _mm_andnot_si128(lomask, t);	// AA00GG00		
					
			rb = _mm_or_si128(rb, ag);			// AABGGRR		pack
					
			rb = _mm_sub_epi8(s, rb);			// sub S-[(D[A]*S)/255]
			d = _mm_add_epi8(d, rb);			// add D+[S-(D[A]*S)/255]

			_mm_store_si128(dest128, d);
		}
	}		
}
コード例 #28
0
ファイル: sse2-psubb-1.c プロジェクト: 0day-ci/gcc
test (__m128i s1, __m128i s2)
{
  return _mm_sub_epi8 (s1, s2); 
}
コード例 #29
0
void alphaBlendSSE_8u(Mat& src1, Mat& src2, Mat& alpha, Mat& dest)
{
	if(dest.empty())dest.create(src1.size(),CV_8U);

	const int imsize = (src1.size().area()/16);
	uchar* s1 = src1.data;
	uchar* s2 = src2.data;
	uchar* a = alpha.data;
	uchar* d = dest.data;

	const __m128i zero = _mm_setzero_si128();
	const __m128i amax = _mm_set1_epi8(char(255));
	int i=0;
	if(s1==d)
	{
		for(;i<imsize;++i)
		{
			__m128i ms1h = _mm_load_si128((__m128i*)(s1));
			__m128i ms2h = _mm_load_si128((__m128i*)(s2));
			__m128i mah = _mm_load_si128((__m128i*)(a));
			__m128i imah = _mm_sub_epi8(amax,mah);

			__m128i ms1l = _mm_unpacklo_epi8(ms1h, zero);
			ms1h = _mm_unpackhi_epi8(ms1h, zero);

			__m128i ms2l = _mm_unpacklo_epi8(ms2h, zero);
			ms2h = _mm_unpackhi_epi8(ms2h, zero);

			__m128i mal = _mm_unpacklo_epi8(mah, zero);
			mah = _mm_unpackhi_epi8(mah, zero);

			__m128i imal = _mm_unpacklo_epi8(imah, zero);
			imah = _mm_unpackhi_epi8(imah, zero);

			ms1l = _mm_mullo_epi16(ms1l,mal);
			ms2l = _mm_mullo_epi16(ms2l,imal);
			ms1l = _mm_add_epi16(ms1l,ms2l);
			//ms1l = _mm_srli_epi16(ms1l,8);
			ms1l = _mm_srai_epi16(ms1l,8);

			ms1h = _mm_mullo_epi16(ms1h,mah);
			ms2h = _mm_mullo_epi16(ms2h,imah);
			ms1h = _mm_add_epi16(ms1h,ms2h);
			//ms1h = _mm_srli_epi16(ms1h,8);
			ms1h = _mm_srai_epi16(ms1h,8);

			_mm_stream_si128((__m128i*)s1,_mm_packs_epi16(ms1l,ms1h));

			s1+=16;
			s2+=16;
			a+=16;
		}
	}
	else
	{
		for(;i<imsize;++i)
		{
			__m128i ms1h = _mm_load_si128((__m128i*)(s1));
			__m128i ms2h = _mm_load_si128((__m128i*)(s2));
			__m128i mah = _mm_load_si128((__m128i*)(a));
			__m128i imah = _mm_sub_epi8(amax,mah);

			__m128i ms1l = _mm_unpacklo_epi8(ms1h, zero);
			ms1h = _mm_unpackhi_epi8(ms1h, zero);

			__m128i ms2l = _mm_unpacklo_epi8(ms2h, zero);
			ms2h = _mm_unpackhi_epi8(ms2h, zero);

			__m128i mal = _mm_unpacklo_epi8(mah, zero);
			mah = _mm_unpackhi_epi8(mah, zero);

			__m128i imal = _mm_unpacklo_epi8(imah, zero);
			imah = _mm_unpackhi_epi8(imah, zero);

			ms1l = _mm_mullo_epi16(ms1l,mal);
			ms2l = _mm_mullo_epi16(ms2l,imal);
			ms1l = _mm_add_epi16(ms1l,ms2l);
			//ms1l = _mm_srli_epi16(ms1l,8);
			ms1l = _mm_srai_epi16(ms1l,8);

			ms1h = _mm_mullo_epi16(ms1h,mah);
			ms2h = _mm_mullo_epi16(ms2h,imah);
			ms1h = _mm_add_epi16(ms1h,ms2h);
			//ms1h = _mm_srli_epi16(ms1h,8);
			ms1h = _mm_srai_epi16(ms1h,8);

			_mm_store_si128((__m128i*)d,_mm_packs_epi16(ms1l,ms1h));

			s1+=16;
			s2+=16;
			a+=16;
			d+=16;
		}
	}

	{
		uchar* s1 = src1.data;
		uchar* s2 = src2.data;
		uchar* a = alpha.data;
		uchar* d = dest.data;
		for(int n=i*16;n<src1.size().area();n++)
		{
			d[n] = (a[n]*s1[n] + (255-a[n])*s2[n])>>8;
		}
	}
}
コード例 #30
0
ファイル: cgp_sse.c プロジェクト: kacer/coco
/**
 * Calculate output of given chromosome and inputs using SSE instructions
 * @param chr
 * @param inputs
 * @param outputs
 */
void cgp_get_output_sse(ga_chr_t chromosome,
    __m128i_aligned inputs[CGP_INPUTS], __m128i_aligned outputs[CGP_OUTPUTS])
{
#ifdef SSE2
    assert(CGP_OUTPUTS == 1);
    assert(CGP_ROWS == 4);
    assert(CGP_LBACK == 1);

    // previous and currently computed column
    register __m128i prev0, prev1, prev2, prev3;
    register __m128i current0, current1, current2, current3;

    // 0xFF constant
    static __m128i_aligned FF;
    FF = _mm_set1_epi8(0xFF);

    cgp_genome_t genome = (cgp_genome_t) chromosome->genome;

    /* if primary output is connected to primary input, skip evaluation

    This cannot happen - CGP does not generate circuits like that

    if (genome->outputs[0] < CGP_INPUTS) {
        int i = genome->outputs[0];
        _mm_store_si128(&outputs[0], inputs[i]);
        return;
    }
    */

#ifdef TEST_EVAL_SSE2
    for (int i = 0; i < CGP_INPUTS; i++) {
        unsigned char *_tmp = (unsigned char*) &inputs[i];
        printf("I: %2d = " UCFMT16 "\n", i, UCVAL16(0));
    }
#endif

    int offset = -CGP_ROWS;

    for (int x = 0; x < CGP_COLS; x++) {
        for (int y = 0; y < CGP_ROWS; y++) {
            int idx = cgp_node_index(x, y);
            cgp_node_t *n = &(genome->nodes[idx]);

            // skip inactive blocks
            if (!n->is_active) continue;

            register __m128i A;
            register __m128i B;
            register __m128i Y;
            register __m128i TMP;
            register __m128i mask;

            LOAD_INPUT(A, n->inputs[0]);
            LOAD_INPUT(B, n->inputs[1]);

            switch (n->function) {
                case c255:
                    Y = FF;
                    break;

                case identity:
                    Y = A;
                    break;

                case inversion:
                    Y = _mm_sub_epi8(FF, A);
                    break;

                case b_or:
                    Y = _mm_or_si128(A, B);
                    break;

                case b_not1or2:
                    // we don't have NOT instruction, we need to XOR with FF
                    Y = _mm_xor_si128(FF, A);
                    Y = _mm_or_si128(Y, B);
                    break;

                case b_and:
                    Y = _mm_and_si128(A, B);
                    break;

                case b_nand:
                    Y = _mm_and_si128(A, B);
                    Y = _mm_xor_si128(FF, Y);
                    break;

                case b_xor:
                    Y = _mm_xor_si128(A, B);
                    break;

                case rshift1:
                    // no SR instruction for 8bit data, we need to shift
                    // 16 bits and apply mask
                    // IN : [ 1 2 3 4 5 6 7 8 | A B C D E F G H]
                    // SHR: [ 0 1 2 3 4 5 6 7 | 8 A B C D E F G]
                    // MSK: [ 0 1 2 3 4 5 6 7 | 0 A B C D E F G]
                    mask = _mm_set1_epi8(0x7F);
                    Y = _mm_srli_epi16(A, 1);
                    Y = _mm_and_si128(Y, mask);
                    break;

                case rshift2:
                    // similar to rshift1
                    // IN : [ 1 2 3 4 5 6 7 8 | A B C D E F G H]
                    // SHR: [ 0 0 1 2 3 4 5 6 | 7 8 A B C D E F]
                    // MSK: [ 0 0 1 2 3 4 5 6 | 0 0 A B C D E F]
                    mask = _mm_set1_epi8(0x3F);
                    Y = _mm_srli_epi16(A, 2);
                    Y = _mm_and_si128(Y, mask);
                    break;

                case swap:
                    // SWAP(A, B) (((A & 0x0F) << 4) | ((B & 0x0F)))
                    // Shift A left by 4 bits
                    // IN : [ 1 2 3 4 5 6 7 8 | A B C D E F G H]
                    // SHL: [ 5 6 7 8 A B C D | E F G H 0 0 0 0]
                    // MSK: [ 5 6 7 8 0 0 0 0 | E F G H 0 0 0 0]
                    mask = _mm_set1_epi8(0xF0);
                    TMP = _mm_slli_epi16(A, 4);
                    TMP = _mm_and_si128(TMP, mask);

                    // Mask B
                    // IN : [ 1 2 3 4 5 6 7 8 | A B C D E F G H]
                    // MSK: [ 0 0 0 0 5 6 7 8 | 0 0 0 0 E F G H]
                    mask = _mm_set1_epi8(0x0F);
                    Y = _mm_and_si128(B, mask);

                    // Combine
                    Y = _mm_or_si128(Y, TMP);
                    break;

                case add:
                    Y = _mm_add_epi8(A, B);
                    break;

                case add_sat:
                    Y = _mm_adds_epu8(A, B);
                    break;

                case avg:
                    // shift right first, then add, to avoid overflow
                    mask = _mm_set1_epi8(0x7F);
                    TMP = _mm_srli_epi16(A, 1);
                    TMP = _mm_and_si128(TMP, mask);

                    Y = _mm_srli_epi16(B, 1);
                    Y = _mm_and_si128(Y, mask);

                    Y = _mm_add_epi8(Y, TMP);
                    break;

                case max:
                    Y = _mm_max_epu8(A, B);
                    break;

                case min:
                    Y = _mm_min_epu8(A, B);
                    break;
            }


#ifdef TEST_EVAL_SSE2
            __m128i _tmpval = Y;
            unsigned char *_tmp = (unsigned char*) &_tmpval;
            printf("N: %2d = " UCFMT16 "\n", idx + CGP_INPUTS, UCVAL16(0));

            bool mismatch = false;
            for (int i = 1; i < 16; i++) {
                if (_tmp[i] != _tmp[0]) {
                    fprintf(stderr,
                        "Value mismatch on index %2d (%u instead of %u)\n",
                        i, _tmp[i], _tmp[0]);
                    mismatch = true;
                }
            }
            if (mismatch) {
                abort();
            }
#endif

            if (idx + CGP_INPUTS == genome->outputs[0]) {
                _mm_store_si128(&outputs[0], Y);
#ifndef TEST_EVAL_SSE2
                return;
#endif
            }

            ASSIGN_CURRENT(y, Y);

        } // end of column

        offset += CGP_ROWS;
        prev0 = current0;
        prev1 = current1;
        prev2 = current2;
        prev3 = current3;
    } // end of row

#ifdef TEST_EVAL_SSE2
    for (int i = 0; i < CGP_OUTPUTS; i++) {
        unsigned char *_tmp = (unsigned char*) &outputs[i];
        printf("O: %2d = " UCFMT16 "\n", i, UCVAL16(0));
    }
#endif


#endif
}