예제 #1
// Undoes the Paeth filter on one row of 3-byte pixels, in place.
//
//   row_info  only row_info->rowbytes (row length in bytes) is read.
//   row       filtered bytes on entry; rewritten pixel by pixel.
//   prev_row  the row above; read only.
//
// Each pixel is widened to 16-bit lanes so the predictor differences keep
// their sign.  Loads and stores move 4 bytes although a pixel is 3 bytes
// wide, so the accesses for the last pixel reach past the end of both rows.
// NOTE(review): presumably callers pad both row buffers - confirm.
void
png_read_filter_row_paeth3_sse(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_size_t i;
   png_bytep rp = row;
   png_const_bytep prp = prev_row;
   __m128i npix = _mm_cvtsi32_si128(*(uint32_t*)rp);
   __m128i ppix = _mm_setzero_si128();           // Same as 'a' in C version.
   __m128i prppix = _mm_setzero_si128();         // Same as 'c' in C version.
   const __m128i zero = _mm_setzero_si128();

   for (i = 0; i < row_info->rowbytes; i += 3, rp += 3, prp += 3)
   {
      __m128i prpix = _mm_cvtsi32_si128(*(uint32_t*)prp);  // Same as 'b' in C ver.
      __m128i pix, pa, pb, pc, temp;

      prpix = _mm_unpacklo_epi8(prpix, zero);
      temp = _mm_sub_epi16(prpix, prppix);  // p = b - c
      pc = _mm_sub_epi16(ppix, prppix);     // pc = a - c

#ifndef __SSSE3__
      // No 16-bit abs before SSSE3: abs(v) == max(v, -v).
      pa = _mm_max_epi16(temp, _mm_sub_epi16(prppix, prpix));
      pb = _mm_max_epi16(pc, _mm_sub_epi16(prppix, ppix));
      temp = _mm_add_epi16(temp, pc);
      pc = _mm_max_epi16(temp, _mm_sub_epi16(zero, temp));
#else
      pa = _mm_abs_epi16(temp);             // pa = abs(p)
      pb = _mm_abs_epi16(pc);               // pb = abs(pc)
      temp = _mm_add_epi16(temp, pc);
      pc = _mm_abs_epi16(temp);             // pc = abs(p + pc)
#endif

      temp = _mm_cmplt_epi16(pb, pa);       // if (pb < pa) pa = pb, a = b
      pa = _mm_andnot_si128(temp, pa);
      pa = _mm_or_si128(pa, _mm_and_si128(temp, pb));
      ppix = _mm_andnot_si128(temp, ppix);
      ppix = _mm_or_si128(ppix, _mm_and_si128(temp, prpix));

      pix = npix;
      // Fetch the next filtered pixel now: the 4-byte store at the end of
      // this iteration overwrites its first byte.
      npix = _mm_cvtsi32_si128(*(uint32_t*)(rp + 3));
      temp = _mm_cmplt_epi16(pc, pa);       // if (pc < pa) a = c
      ppix = _mm_andnot_si128(temp, ppix);
      ppix = _mm_or_si128(ppix, _mm_and_si128(temp, prppix));

      pix = _mm_unpacklo_epi8(pix, zero);
      prppix = prpix;
      ppix = _mm_add_epi16(ppix, pix);

      // Keep only the low byte of every lane (addition modulo 256) so the
      // saturating pack below cannot clamp.
      ppix = _mm_slli_epi16(ppix, 8);
      ppix = _mm_srli_epi16(ppix, 8);
      pix = _mm_packus_epi16(ppix, zero);
      *(uint32_t*)rp = _mm_cvtsi128_si32(pix);
   }
}
예제 #2
// Compiler test wrapper around _mm_abs_epi16.  The directive comments inside
// the body describe the IR expected for the call: a subtraction of the input
// from zero, a signed greater-than-zero compare of the input, and a select
// between the input and its negation.  They are matched textually, so they
// must stay exactly as written.
__m128i test_mm_abs_epi16(__m128i a) {
  // CHECK-LABEL: test_mm_abs_epi16
  // CHECK: [[SUB:%.+]] = sub <8 x i16> zeroinitializer, [[A:%.+]]
  // CHECK: [[CMP:%.+]] = icmp sgt <8 x i16> [[A]], zeroinitializer
  // CHECK: %{{.*}} = select <8 x i1> [[CMP]], <8 x i16> [[A]], <8 x i16> [[SUB]]
  return _mm_abs_epi16(a);
예제 #3
// Fills a block of 8-bit pixels at dst with predict_unclipped() results,
// packed to bytes with unsigned saturation.
//
//   pred_buf_q3  input samples, read as rows of CFL_BUF_LINE_I128 vectors.
//   dst          output block; *dst is read first and broadcast as dc_q0.
//   dst_stride   distance in bytes between output rows.
//   alpha_q3     scale factor; its sign and (magnitude << 9) are passed to
//                predict_unclipped().
//   width        4 uses _mm_storeh_epi32, any other width below 16 stores
//                8 bytes, 16 and above store 16 bytes, and exactly 32 stores
//                a second 16 bytes.
//   height       number of rows; at least one row is always processed
//                (do/while).
static INLINE void cfl_predict_lbd_ssse3(const int16_t *pred_buf_q3,
                                         uint8_t *dst, int dst_stride,
                                         int alpha_q3, int width, int height) {
  const __m128i alpha_sign = _mm_set1_epi16(alpha_q3);
  const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);
  const __m128i dc_q0 = _mm_set1_epi16(*dst);
  __m128i *row = (__m128i *)pred_buf_q3;
  const __m128i *row_end = row + height * CFL_BUF_LINE_I128;
  do {
    __m128i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
    if (width < 16) {
      res = _mm_packus_epi16(res, res);
      // The two stores are alternatives: running the 8-byte store for a
      // 4-wide block would write past the block's right edge.
      if (width == 4) {
        _mm_storeh_epi32((__m128i *)dst, res);
      } else {
        _mm_storel_epi64((__m128i *)dst, res);
      }
    } else {
      __m128i next = predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
      res = _mm_packus_epi16(res, next);
      _mm_storeu_si128((__m128i *)dst, res);
      if (width == 32) {
        res = predict_unclipped(row + 2, alpha_q12, alpha_sign, dc_q0);
        next = predict_unclipped(row + 3, alpha_q12, alpha_sign, dc_q0);
        res = _mm_packus_epi16(res, next);
        _mm_storeu_si128((__m128i *)(dst + 16), res);
      }
    }
    dst += dst_stride;
  } while ((row += CFL_BUF_LINE_I128) < row_end);
}
예제 #4
// Scales eight 16-bit samples by alpha and adds the DC value, without
// clamping the result.
//
//   input       address of the eight samples (unaligned load).
//   alpha_q12   magnitude of the scale factor, one copy per lane.
//   alpha_sign  value whose sign is the sign of the scale factor.
//   dc_q0       value added to every lane.
//
// The multiply runs on magnitudes (_mm_mulhrs_epi16 of |sample| and
// alpha_q12) and the combined sign is applied afterwards.  _mm_sign_epi16
// also zeroes a lane whose sign operand is zero, so a zero sample or a zero
// alpha_sign yields dc_q0 alone.
static INLINE __m128i predict_unclipped(const __m128i *input, __m128i alpha_q12,
                                        __m128i alpha_sign, __m128i dc_q0) {
  __m128i ac_q3 = _mm_loadu_si128(input);
  // Sign of (alpha * sample) per lane.
  __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3);
  __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12);
  scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign);
  return _mm_add_epi16(scaled_luma_q0, dc_q0);
}
예제 #5
// Masked sum of absolute differences for 4-pixel-wide blocks of 16-bit
// samples, processing two rows per iteration.
//
// Per pixel the prediction is
//   (a * m + b * (mask_max - m) + round) >> AOM_BLEND_A64_ROUND_BITS
// with mask_max = 1 << AOM_BLEND_A64_ROUND_BITS, and the result accumulates
// |prediction - src|.
//
//   src8, a8, b8  sample buffers, converted with CONVERT_TO_SHORTPTR.
//   m_ptr         8-bit mask, 4 bytes read per row.
//   *_stride      row pitch of each buffer, in elements.
//   height        number of rows; rows are consumed in pairs, so an odd
//                 height reads one row beyond the block.
//
// Returns (sad + 31) >> 6.
static INLINE unsigned int highbd_masked_sad4xh_ssse3(
    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
    int height) {
  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
  int y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m128i round_const =
      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
  const __m128i one = _mm_set1_epi16(1);

  for (y = 0; y < height; y += 2) {
    // Row y goes in the low 64 bits, row y + 1 in the high 64 bits.
    const __m128i src = _mm_unpacklo_epi64(
        _mm_loadl_epi64((const __m128i *)src_ptr),
        _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
    const __m128i a =
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)a_ptr),
                           _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]));
    const __m128i b =
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)b_ptr),
                           _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]));
    // Zero-extend mask to 16 bits
    const __m128i m = _mm_unpacklo_epi8(
        _mm_unpacklo_epi32(
            _mm_cvtsi32_si128(*(const uint32_t *)m_ptr),
            _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])),
        _mm_setzero_si128());
    const __m128i m_inv = _mm_sub_epi16(mask_max, m);

    // Interleave (a, b) with (m, m_inv) so one madd yields a*m + b*m_inv.
    const __m128i data_l = _mm_unpacklo_epi16(a, b);
    const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
    __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
    pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
                            AOM_BLEND_A64_ROUND_BITS);

    const __m128i data_r = _mm_unpackhi_epi16(a, b);
    const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
    __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
    pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
                            AOM_BLEND_A64_ROUND_BITS);

    const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
    const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
    // madd with ones widens and pairwise-adds into four 32-bit partial sums.
    res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));

    src_ptr += src_stride * 2;
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }
  // Reduce the four partial sums to one.
  res = _mm_hadd_epi32(res, res);
  res = _mm_hadd_epi32(res, res);
  int sad = _mm_cvtsi128_si32(res);
  return (sad + 31) >> 6;
}
예제 #6
// Masked sum of absolute differences over a width x height block of 16-bit
// samples, eight pixels per step.
//
// Per pixel the prediction is
//   (a * m + b * (mask_max - m) + round) >> AOM_BLEND_A64_ROUND_BITS
// with mask_max = 1 << AOM_BLEND_A64_ROUND_BITS, and the result accumulates
// |prediction - src|.
//
//   src8, a8, b8  sample buffers, converted with CONVERT_TO_SHORTPTR.
//   m_ptr         8-bit mask, 8 bytes read per step.
//   *_stride      row pitch of each buffer, in elements.
//   width         columns; the inner loop advances by 8, so a width that is
//                 not a multiple of 8 reads past the row end.
//   height        rows.
//
// Returns (sad + 31) >> 6.
static INLINE unsigned int highbd_masked_sad_ssse3(
    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
    int width, int height) {
  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
  int x, y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m128i round_const =
      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
  const __m128i one = _mm_set1_epi16(1);

  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x += 8) {
      const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
      const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
      const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
      // Zero-extend mask to 16 bits
      const __m128i m = _mm_unpacklo_epi8(
          _mm_loadl_epi64((const __m128i *)&m_ptr[x]), _mm_setzero_si128());
      const __m128i m_inv = _mm_sub_epi16(mask_max, m);

      // Interleave (a, b) with (m, m_inv) so one madd yields a*m + b*m_inv.
      const __m128i data_l = _mm_unpacklo_epi16(a, b);
      const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
      __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
      pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
                              AOM_BLEND_A64_ROUND_BITS);

      const __m128i data_r = _mm_unpackhi_epi16(a, b);
      const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
      __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
      pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
                              AOM_BLEND_A64_ROUND_BITS);

      // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
      // so it is safe to do signed saturation here.
      const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
      // There is no 16-bit SAD instruction, so we have to synthesize
      // an 8-element SAD. We do this by storing 4 32-bit partial SADs,
      // and accumulating them at the end
      const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
      res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));
    }

    src_ptr += src_stride;
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
  // At this point, we have four 32-bit partial SADs stored in 'res'.
  res = _mm_hadd_epi32(res, res);
  res = _mm_hadd_epi32(res, res);
  int sad = _mm_cvtsi128_si32(res);
  return (sad + 31) >> 6;
}
예제 #7
// Writes three 16-bit soft values per received sample for one symbol.
//
//   frame_parms   only N_RB_DL is read, to locate the symbol in the buffers.
//   rxdataF_comp  received samples; only index [0] is used.
//   ulsch_llr     output buffer; the write pointer is reset to it only when
//                 symbol == 0.
//   ul_ch_mag     first threshold, same layout as rxdataF_comp.
//   ul_ch_magb    second threshold, same layout as rxdataF_comp.
//   symbol        symbol index; the offset is symbol * N_RB_DL * 12.
//   nb_rb         the loop processes nb_rb * 3 vectors of eight 16-bit lanes.
//
// llrU, mmtmpU1 and mmtmpU2 are not declared in this block.
// NOTE(review): presumably file-scope state shared between calls - confirm.
void ulsch_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms,
		     int **rxdataF_comp,
		     short *ulsch_llr,
		     int **ul_ch_mag,
		     int **ul_ch_magb,
		     unsigned char symbol,
		     unsigned short nb_rb) {

  __m128i *rxF=(__m128i*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)];
  __m128i *ch_mag,*ch_magb;
  int j=0,i;
  //  unsigned char symbol_mod;

  // Start of a new output sequence: rewind the write pointer.
  if (symbol == 0)
    llrU = ulsch_llr;

  //  symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol;

  ch_mag =(__m128i*)&ul_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)];
  ch_magb =(__m128i*)&ul_ch_magb[0][(symbol*frame_parms->N_RB_DL*12)];

  for (i=0;i<(nb_rb*3);i++) {

    // mmtmpU1 = |rx| - ch_mag and mmtmpU2 = |mmtmpU1| - ch_magb, both with
    // saturating subtraction.
    mmtmpU1 = _mm_abs_epi16(rxF[i]);
    mmtmpU1  = _mm_subs_epi16(mmtmpU1,ch_mag[i]);
    mmtmpU2 = _mm_abs_epi16(mmtmpU1);
    mmtmpU2 = _mm_subs_epi16(mmtmpU2,ch_magb[i]);

    // Emit, lane by lane, the raw sample followed by the two derived values.
    // NOTE(review): no increment of llrU is visible in this excerpt; confirm
    // it advances by 3 per lane, otherwise every lane overwrites the same
    // three entries.
    for (j=0;j<8;j++) {
      llrU[0] = ((short *)&rxF[i])[j];
      llrU[1] = ((short *)&mmtmpU1)[j];
      llrU[2] = ((short *)&mmtmpU2)[j];



예제 #8
// Sum of absolute transformed differences over 16 consecutive 8-bit pixels.
//
//   org, cur  the two blocks; bytes [0..15] of each are read.
//
// The 16 differences (cur - org) go through four rounds of pairwise
// horizontal add/subtract, the absolute values of the results are summed,
// and the function returns (sum + 1) >> 1.
static unsigned satd_8bit_4x4_avx2(const kvz_pixel *org, const kvz_pixel *cur)
{

  __m128i original = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)org));
  __m128i current = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)cur));

  __m128i diff_lo = _mm_sub_epi16(current, original);

  original = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(org + 8)));
  current = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(cur + 8)));

  __m128i diff_hi = _mm_sub_epi16(current, original);

  // Four butterfly rounds built from horizontal add / subtract.
  __m128i row0 = _mm_hadd_epi16(diff_lo, diff_hi);
  __m128i row1 = _mm_hsub_epi16(diff_lo, diff_hi);

  __m128i row2 = _mm_hadd_epi16(row0, row1);
  __m128i row3 = _mm_hsub_epi16(row0, row1);

  row0 = _mm_hadd_epi16(row2, row3);
  row1 = _mm_hsub_epi16(row2, row3);

  row2 = _mm_hadd_epi16(row0, row1);
  row3 = _mm_hsub_epi16(row0, row1);

  //Abs and sum
  row2 = _mm_abs_epi16(row2);
  row3 = _mm_abs_epi16(row3);

  row3 = _mm_add_epi16(row2, row3);

  // Fold the eight 16-bit partial sums down into lane 0.
  row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, KVZ_PERMUTE(2, 3, 0, 1) ));
  row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, KVZ_PERMUTE(1, 0, 1, 0) ));
  row3 = _mm_add_epi16(row3, _mm_shufflelo_epi16(row3, KVZ_PERMUTE(1, 0, 1, 0) ));

  unsigned sum = _mm_extract_epi16(row3, 0);
  unsigned satd = (sum + 1) >> 1;

  return satd;
}
예제 #9
// Returns |x| for 16-bit lanes.
//
// Uses the SSSE3 instruction when the compiler targets SSSE3 and an SSE2
// sequence otherwise.  In both paths a lane holding -32768 stays -32768,
// because +32768 does not fit in 16 bits.
static __m128i abs_i16(__m128i x) {
#if defined(__SSSE3__)
    return _mm_abs_epi16(x);
#else
    // Read this all as, return x<0 ? -x : x.
    // To negate two's complement, you flip all the bits then add 1.
    __m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128());
    x = _mm_xor_si128(x, is_negative);                      // Flip negative lanes.
    x = _mm_add_epi16(x, _mm_srli_epi16(is_negative, 15));  // +1 to negative lanes, else +0.
    return x;
#endif
}
예제 #10
// Builds a histogram of quantized transform-coefficient magnitudes over a
// range of blocks and hands it to VP8SetHistogramData().
//
//   ref, pred    source and prediction pixels; block j is located at offset
//                VP8DspScan[j] in both.
//   start_block  first block index (inclusive).
//   end_block    last block index (exclusive).
//   histo        receives the result through VP8SetHistogramData().
//
// For every block the 16 outputs of VP8FTransform() are mapped to
// bin = min(abs(coeff) >> 3, MAX_COEFF_THRESH) and the matching counter is
// incremented.
static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
                             int start_block, int end_block,
                             VP8Histogram* const histo) {
    const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
    int j;
    int distribution[MAX_COEFF_THRESH + 1] = { 0 };
    for (j = start_block; j < end_block; ++j) {
        int16_t out[16];
        int k;

        VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);

        // Convert coefficients to bin (within out[]).
        {
            // Load.
            const __m128i out0 = _mm_loadu_si128((__m128i*)&out[0]);
            const __m128i out1 = _mm_loadu_si128((__m128i*)&out[8]);
            // v = abs(out) >> 3
            const __m128i abs0 = _mm_abs_epi16(out0);
            const __m128i abs1 = _mm_abs_epi16(out1);
            const __m128i v0 = _mm_srai_epi16(abs0, 3);
            const __m128i v1 = _mm_srai_epi16(abs1, 3);
            // bin = min(v, MAX_COEFF_THRESH)
            const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);
            const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);
            // Store.
            _mm_storeu_si128((__m128i*)&out[0], bin0);
            _mm_storeu_si128((__m128i*)&out[8], bin1);
        }

        // Convert coefficients to bin.
        for (k = 0; k < 16; ++k) {
            ++distribution[out[k]];
        }
    }
    VP8SetHistogramData(distribution, histo);
}
예제 #11
/* Returns |x| for 16-bit lanes.
 *
 * Uses the SSSE3 instruction when the compiler targets SSSE3 and an SSE2
 * sequence otherwise.  In both paths a lane holding -32768 stays -32768,
 * because +32768 does not fit in 16 bits.
 */
static __m128i abs_i16(__m128i x) {
#if defined(__SSSE3__)
   return _mm_abs_epi16(x);
#else
   /* Read this all as, return x<0 ? -x : x.
    * To negate two's complement, you flip all the bits then add 1.
    */
   __m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128());

   /* Flip negative lanes. */
   x = _mm_xor_si128(x, is_negative);

   /* +1 to negative lanes, else +0. */
   x = _mm_sub_epi16(x, is_negative);
   return x;
#endif
}
예제 #12
파일: demod_soft.c 프로젝트: srsLTE/srsLTE
// Soft-demodulates complex symbols into four 16-bit values per symbol.
//
//   symbols   input symbols, read as interleaved (re, im) floats with
//             aligned 16-byte loads.
//   llr       output, 4 shorts per symbol, written with aligned 16-byte
//             stores in the vector loop.
//   nsymbols  number of symbols.
//
// Per symbol the output is { -re, -im, |re| - offset, |im| - offset } after
// scaling by SCALE_SHORT_CONV_QAM16, with
// offset = 2 * SCALE_SHORT_CONV_QAM16 / sqrt(10).  Groups of four symbols
// use the vector path; the remaining 0..3 symbols use the scalar tail.
void demod_16qam_lte_s_sse(const cf_t *symbols, short *llr, int nsymbols) {
  float *symbolsPtr = (float*) symbols;
  __m128i *resultPtr = (__m128i*) llr;
  __m128 symbol1, symbol2; 
  __m128i symbol_i1, symbol_i2, symbol_i, symbol_abs;
  __m128i offset = _mm_set1_epi16(2*SCALE_SHORT_CONV_QAM16/sqrt(10));
  __m128i result11, result12, result22, result21; 
  // The negative scale folds the sign flip of the first two outputs into the
  // float-to-integer conversion.
  __m128 scale_v = _mm_set1_ps(-SCALE_SHORT_CONV_QAM16);
  // Byte shuffles placing each symbol's (re, im) pair and its offset
  // magnitude pair in adjacent 32-bit slots; 0xff yields a zero byte.
  __m128i shuffle_negated_1 = _mm_set_epi8(0xff,0xff,0xff,0xff,7,6,5,4,0xff,0xff,0xff,0xff,3,2,1,0);
  __m128i shuffle_abs_1 = _mm_set_epi8(7,6,5,4,0xff,0xff,0xff,0xff,3,2,1,0,0xff,0xff,0xff,0xff);

  __m128i shuffle_negated_2 = _mm_set_epi8(0xff,0xff,0xff,0xff,15,14,13,12,0xff,0xff,0xff,0xff,11,10,9,8);
  __m128i shuffle_abs_2 = _mm_set_epi8(15,14,13,12,0xff,0xff,0xff,0xff,11,10,9,8,0xff,0xff,0xff,0xff);

  for (int i=0;i<nsymbols/4;i++) {
    symbol1   = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    symbol2   = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    symbol_i1 = _mm_cvtps_epi32(_mm_mul_ps(symbol1, scale_v));
    symbol_i2 = _mm_cvtps_epi32(_mm_mul_ps(symbol2, scale_v));
    symbol_i  = _mm_packs_epi32(symbol_i1, symbol_i2);
    symbol_abs  = _mm_abs_epi16(symbol_i);
    symbol_abs  = _mm_sub_epi16(symbol_abs, offset);
    result11 = _mm_shuffle_epi8(symbol_i, shuffle_negated_1);  
    result12 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_1);  

    result21 = _mm_shuffle_epi8(symbol_i, shuffle_negated_2);  
    result22 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_2);  

    _mm_store_si128(resultPtr, _mm_or_si128(result11, result12)); resultPtr++;
    _mm_store_si128(resultPtr, _mm_or_si128(result21, result22)); resultPtr++;
  }
  // Demodulate last symbols 
  for (int i=4*(nsymbols/4);i<nsymbols;i++) {
    short yre = (short) (SCALE_SHORT_CONV_QAM16*crealf(symbols[i]));
    short yim = (short) (SCALE_SHORT_CONV_QAM16*cimagf(symbols[i]));
    llr[4*i+0] = -yre;
    llr[4*i+1] = -yim;
    llr[4*i+2] = abs(yre)-2*SCALE_SHORT_CONV_QAM16/sqrt(10);
    llr[4*i+3] = abs(yim)-2*SCALE_SHORT_CONV_QAM16/sqrt(10);    
  }
}
예제 #13
// Writes two 16-bit soft values per received sample for one symbol.
//
//   frame_parms   only N_RB_DL is read, to locate the symbol in the buffers.
//   rxdataF_comp  received samples; only index [0] is used.
//   ulsch_llr     output buffer; the write pointer is reset to it only when
//                 symbol == 0.
//   ul_ch_mag     threshold, same layout as rxdataF_comp.
//   symbol        symbol index; the offset is symbol * N_RB_DL * 12.
//   nb_rb         the loop processes nb_rb * 3 vectors of eight 16-bit lanes.
//
// llr128U and mmtmpU0 are not declared in this block.
// NOTE(review): presumably file-scope state shared between calls - confirm.
void ulsch_16qam_llr(LTE_DL_FRAME_PARMS *frame_parms,
		     int **rxdataF_comp,
		     short *ulsch_llr,
		     int **ul_ch_mag,
		     unsigned char symbol,
		     unsigned short nb_rb) {

  __m128i *rxF=(__m128i*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)];
  __m128i *ch_mag;
  int i;
  //  unsigned char symbol_mod;

  //  printf("ulsch_rx.c: ulsch_16qam_llr: symbol %d\n",symbol);

  // Start of a new output sequence: rewind the write pointer.
  if (symbol == 0)
    llr128U = (__m128i*)&ulsch_llr[0];

  //  symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol;

  ch_mag =(__m128i*)&ul_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)];

  for (i=0;i<(nb_rb*3);i++) {

    // mmtmpU0 = |rx| - ch_mag, with saturating subtraction.
    mmtmpU0 = _mm_abs_epi16(rxF[i]);
    //    print_shorts("tmp0",&tmp0);

    mmtmpU0 = _mm_subs_epi16(mmtmpU0,ch_mag[i]);

    // Interleave each raw lane with its derived value: low four lanes into
    // llr128U[0], high four into llr128U[1].
    // NOTE(review): no increment of llr128U is visible in this excerpt;
    // confirm it advances by 2 per iteration, otherwise every iteration
    // overwrites the same two vectors.
    llr128U[0] = _mm_unpacklo_epi16(rxF[i],mmtmpU0);
    llr128U[1] = _mm_unpackhi_epi16(rxF[i],mmtmpU0);

    //    print_bytes("rxF[i]",&rxF[i]);
    //    print_bytes("rxF[i+1]",&rxF[i+1]);


예제 #14
// Fills a block of 16-bit pixels at dst with predict_unclipped() results
// clamped to [0, highbd_max_epi16(bd)].
//
//   pred_buf_q3  input samples, read as rows of CFL_BUF_LINE_I128 vectors.
//   dst          output block; *dst is read first and broadcast as dc_q0.
//   dst_stride   distance in pixels between output rows.
//   alpha_q3     scale factor; its sign and (magnitude << 9) are passed to
//                predict_unclipped().
//   bd           bit depth, used only to derive the upper clamp.
//   width        4 stores 4 pixels, any other width stores 8; 16 and above
//                add pixels 8..15; exactly 32 adds pixels 16..31.
//   height       number of rows; at least one row is always processed
//                (do/while).
static INLINE void cfl_predict_hbd_ssse3(const int16_t *pred_buf_q3,
                                         uint16_t *dst, int dst_stride,
                                         int alpha_q3, int bd, int width,
                                         int height) {
  const __m128i alpha_sign = _mm_set1_epi16(alpha_q3);
  const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);
  const __m128i dc_q0 = _mm_set1_epi16(*dst);
  const __m128i max = highbd_max_epi16(bd);
  const __m128i zeros = _mm_setzero_si128();
  __m128i *row = (__m128i *)pred_buf_q3;
  const __m128i *row_end = row + height * CFL_BUF_LINE_I128;
  do {
    __m128i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
    res = highbd_clamp_epi16(res, zeros, max);
    if (width == 4) {
      _mm_storel_epi64((__m128i *)dst, res);
    } else {
      _mm_storeu_si128((__m128i *)dst, res);
    }
    if (width >= 16) {
      const __m128i res_1 =
          predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
      _mm_storeu_si128(((__m128i *)dst) + 1,
                       highbd_clamp_epi16(res_1, zeros, max));
    }
    if (width == 32) {
      const __m128i res_2 =
          predict_unclipped(row + 2, alpha_q12, alpha_sign, dc_q0);
      _mm_storeu_si128((__m128i *)(dst + 16),
                       highbd_clamp_epi16(res_2, zeros, max));
      const __m128i res_3 =
          predict_unclipped(row + 3, alpha_q12, alpha_sign, dc_q0);
      _mm_storeu_si128((__m128i *)(dst + 24),
                       highbd_clamp_epi16(res_3, zeros, max));
    }
    dst += dst_stride;
  } while ((row += CFL_BUF_LINE_I128) < row_end);
}
예제 #15
// Adds the absolute values of the eight 16-bit lanes of *ver_row into the
// four 32-bit lanes of *accumulate.
//
//   accumulate  running 32-bit sums, updated in place.
//   ver_row     eight signed 16-bit values; not modified.
//
// The multiply-add against a vector of ones widens each absolute value to
// 32 bits and sums adjacent pairs in a single instruction.
INLINE static void haddwd_accumulate_avx2(__m128i *accumulate, __m128i *ver_row)
{
  __m128i abs_value = _mm_abs_epi16(*ver_row);
  *accumulate = _mm_add_epi32(*accumulate, _mm_madd_epi16(abs_value, _mm_set1_epi16(1)));
}
예제 #16
파일: demod_soft.c 프로젝트: srsLTE/srsLTE
// Soft-demodulates complex symbols into six 16-bit values per symbol.
//
//   symbols   input symbols, read as interleaved (re, im) floats with
//             aligned 16-byte loads.
//   llr       output, 6 shorts per symbol, written with aligned 16-byte
//             stores in the vector loop.
//   nsymbols  number of symbols.
//
// With S = SCALE_SHORT_CONV_QAM64 the per-symbol output is
//   { -re, -im, |re| - 4S/sqrt(42), |im| - 4S/sqrt(42),
//     |out2| - 2S/sqrt(42), |out3| - 2S/sqrt(42) }.
// Groups of four symbols use the vector path; the remaining 0..3 symbols use
// the scalar tail.
void demod_64qam_lte_s_sse(const cf_t *symbols, short *llr, int nsymbols) 
{
  float *symbolsPtr = (float*) symbols;
  __m128i *resultPtr = (__m128i*) llr;
  __m128 symbol1, symbol2; 
  __m128i symbol_i1, symbol_i2, symbol_i, symbol_abs, symbol_abs2;
  __m128i offset1 = _mm_set1_epi16(4*SCALE_SHORT_CONV_QAM64/sqrt(42));
  __m128i offset2 = _mm_set1_epi16(2*SCALE_SHORT_CONV_QAM64/sqrt(42));
  // The negative scale folds the sign flip of the first two outputs into the
  // float-to-integer conversion.
  __m128 scale_v = _mm_set1_ps(-SCALE_SHORT_CONV_QAM64);
  __m128i result11, result12, result13, result22, result21,result23, result31, result32, result33; 

  // Byte shuffles spreading four symbols (three 32-bit pairs each) over
  // three output vectors; 0xff yields a zero byte so the pieces can be ORed.
  __m128i shuffle_negated_1 = _mm_set_epi8(7,6,5,4,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,3,2,1,0);
  __m128i shuffle_negated_2 = _mm_set_epi8(0xff,0xff,0xff,0xff,11,10,9,8,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
  __m128i shuffle_negated_3 = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,15,14,13,12,0xff,0xff,0xff,0xff);

  __m128i shuffle_abs_1 = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,3,2,1,0,0xff,0xff,0xff,0xff);
  __m128i shuffle_abs_2 = _mm_set_epi8(11,10,9,8,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,7,6,5,4);
  __m128i shuffle_abs_3 = _mm_set_epi8(0xff,0xff,0xff,0xff,15,14,13,12,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);

  __m128i shuffle_abs2_1 = _mm_set_epi8(0xff,0xff,0xff,0xff,3,2,1,0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
  __m128i shuffle_abs2_2 = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,7,6,5,4,0xff,0xff,0xff,0xff);
  __m128i shuffle_abs2_3 = _mm_set_epi8(15,14,13,12,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,11,10,9,8);

  for (int i=0;i<nsymbols/4;i++) {
    symbol1   = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    symbol2   = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    symbol_i1 = _mm_cvtps_epi32(_mm_mul_ps(symbol1, scale_v));
    symbol_i2 = _mm_cvtps_epi32(_mm_mul_ps(symbol2, scale_v));
    symbol_i  = _mm_packs_epi32(symbol_i1, symbol_i2);
    symbol_abs  = _mm_abs_epi16(symbol_i);
    symbol_abs  = _mm_sub_epi16(symbol_abs, offset1);
    symbol_abs2 = _mm_sub_epi16(_mm_abs_epi16(symbol_abs), offset2);
    result11 = _mm_shuffle_epi8(symbol_i, shuffle_negated_1);  
    result12 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_1);  
    result13 = _mm_shuffle_epi8(symbol_abs2, shuffle_abs2_1);  

    result21 = _mm_shuffle_epi8(symbol_i, shuffle_negated_2);  
    result22 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_2);  
    result23 = _mm_shuffle_epi8(symbol_abs2, shuffle_abs2_2);  

    result31 = _mm_shuffle_epi8(symbol_i, shuffle_negated_3);  
    result32 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_3);  
    result33 = _mm_shuffle_epi8(symbol_abs2, shuffle_abs2_3);  

    _mm_store_si128(resultPtr, _mm_or_si128(_mm_or_si128(result11, result12),result13)); resultPtr++;
    _mm_store_si128(resultPtr, _mm_or_si128(_mm_or_si128(result21, result22),result23)); resultPtr++;
    _mm_store_si128(resultPtr, _mm_or_si128(_mm_or_si128(result31, result32),result33)); resultPtr++;
  }
  // Demodulate last symbols
  for (int i=4*(nsymbols/4);i<nsymbols;i++) {
    // The values are already truncated to short, so keep them as shorts: the
    // integer abs() below then receives an integer argument.
    short yre = (short) (SCALE_SHORT_CONV_QAM64*crealf(symbols[i]));
    short yim = (short) (SCALE_SHORT_CONV_QAM64*cimagf(symbols[i]));

    llr[6*i+0] = -yre;
    llr[6*i+1] = -yim;
    llr[6*i+2] = abs(yre)-4*SCALE_SHORT_CONV_QAM64/sqrt(42);
    llr[6*i+3] = abs(yim)-4*SCALE_SHORT_CONV_QAM64/sqrt(42);
    llr[6*i+4] = abs(llr[6*i+2])-2*SCALE_SHORT_CONV_QAM64/sqrt(42);
    llr[6*i+5] = abs(llr[6*i+3])-2*SCALE_SHORT_CONV_QAM64/sqrt(42);        
  }
}
예제 #17
/* Horizontally interpolates n pixels of image row y into line->buffer.
 *
 * image  source; bits and rowstride are read.
 * line   destination; buffer receives one 128-bit vector per pair of output
 *        pixels (aligned stores) and y is recorded in line->y.
 * y      source row index.
 * x      fixed-point position of the first output pixel.
 * ux     fixed-point step between output pixels.
 * n      number of output pixels.
 *
 * Two output pixels are produced per iteration.  When n is odd, the last
 * pixel is handled by jumping back into the loop body with the second
 * pixel's input zeroed.
 */
static void
ssse3_fetch_horizontal (bits_image_t *image, line_t *line,
			int y, pixman_fixed_t x, pixman_fixed_t ux, int n)
{
    uint32_t *bits = image->bits + y * image->rowstride;
    __m128i vx = _mm_set_epi16 (
	- (x + 1), x, - (x + 1), x,
	- (x + ux + 1), x + ux,  - (x + ux + 1), x + ux);
    __m128i vux = _mm_set_epi16 (
	- 2 * ux, 2 * ux, - 2 * ux, 2 * ux,
	- 2 * ux, 2 * ux, - 2 * ux, 2 * ux);
    __m128i vaddc = _mm_set_epi16 (1, 0, 1, 0, 1, 0, 1, 0);
    __m128i *b = (__m128i *)line->buffer;
    __m128i vrl0, vrl1;

    while ((n -= 2) >= 0)
    {
	__m128i vw, vr, s;

	vrl1 = _mm_loadl_epi64 (
	    (__m128i *)(bits + pixman_fixed_to_int (x + ux)));
	/* vrl1: R1, L1 */

    final_pixel:
	vrl0 = _mm_loadl_epi64 (
	    (__m128i *)(bits + pixman_fixed_to_int (x)));
	/* vrl0: R0, L0 */

	/* The weights are based on vx which is a vector of 
	 *    - (x + 1), x, - (x + 1), x,
	 *          - (x + ux + 1), x + ux, - (x + ux + 1), x + ux
	 * so the 16 bit weights end up like this:
	 *    iw0, w0, iw0, w0, iw1, w1, iw1, w1
	 * and after shifting and packing, we get these bytes:
	 *    iw0, w0, iw0, w0, iw1, w1, iw1, w1,
	 *        iw0, w0, iw0, w0, iw1, w1, iw1, w1,
	 * which means the first and the second input pixel 
	 * have to be interleaved like this:
	 *    la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
	 *        lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
	 * before maddubsw can be used.
	 */

	vw = _mm_add_epi16 (
	    vaddc, _mm_srli_epi16 (vx, 16 - BILINEAR_INTERPOLATION_BITS));
	/* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1
	 */

	vw = _mm_packus_epi16 (vw, vw);
	/* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1,
	 *         iw0, w0, iw0, w0, iw1, w1, iw1, w1
	 */
	vx = _mm_add_epi16 (vx, vux);

	x += 2 * ux;

	vr = _mm_unpacklo_epi16 (vrl1, vrl0);
	/* vr: rar0, rar1, rgb0, rgb1, lar0, lar1, lgb0, lgb1 */

	s = _mm_shuffle_epi32 (vr, _MM_SHUFFLE (1, 0, 3, 2));
	/* s:  lar0, lar1, lgb0, lgb1, rar0, rar1, rgb0, rgb1 */

	vr = _mm_unpackhi_epi8 (vr, s);
	/* vr: la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
	 *         lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
	 */

	vr = _mm_maddubs_epi16 (vr, vw);

	/* When the weight is 0, the inverse weight is
	 * 128 which can't be represented in a signed byte.
	 * As a result maddubsw computes the following:
	 *     r = l * -128 + r * 0
	 * rather than the desired
	 *     r = l * 128 + r * 0
	 * We fix this by taking the absolute value of the
	 * result.
	 */
	vr = _mm_abs_epi16 (vr);

	/* vr: A0, R0, A1, R1, G0, B0, G1, B1 */
	_mm_store_si128 (b++, vr);
    }

    /* An odd n leaves exactly one pixel: run the loop body once more with
     * no second pixel.  The loop condition then fails and n is no longer -1,
     * so this branch is not taken again.
     */
    if (n == -1)
    {
	vrl1 = _mm_setzero_si128();
	goto final_pixel;
    }

    line->y = y;
}
예제 #18
// Quantizes a block of 16 coefficients and writes back the dequantized
// values.
//
//   in       16 input coefficients; overwritten with out * Q.
//   out      receives the 16 quantized levels in zigzag order.
//   sharpen  optional 16 values added to abs(in) before quantizing; may be
//            NULL.
//   mtx      provides iq_ (inverse quantizer), q_ (quantizer) and bias_.
//
// Returns non-zero when at least one quantized level is non-zero.
static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
                                       const uint16_t* const sharpen,
                                       const VP8Matrix* const mtx) {
    const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
    const __m128i zero = _mm_setzero_si128();
    __m128i out0, out8;
    __m128i packed_out;

    // Load all inputs.
    // TODO(cduvivier): Make variable declarations and allocations aligned so that
    //                  we can use _mm_load_si128 instead of _mm_loadu_si128.
    __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
    __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
    const __m128i iq0 = _mm_loadu_si128((const __m128i*)&mtx->iq_[0]);
    const __m128i iq8 = _mm_loadu_si128((const __m128i*)&mtx->iq_[8]);
    const __m128i q0 = _mm_loadu_si128((const __m128i*)&mtx->q_[0]);
    const __m128i q8 = _mm_loadu_si128((const __m128i*)&mtx->q_[8]);

    // coeff = abs(in)
    __m128i coeff0 = _mm_abs_epi16(in0);
    __m128i coeff8 = _mm_abs_epi16(in8);

    // coeff = abs(in) + sharpen
    if (sharpen != NULL) {
        const __m128i sharpen0 = _mm_loadu_si128((const __m128i*)&sharpen[0]);
        const __m128i sharpen8 = _mm_loadu_si128((const __m128i*)&sharpen[8]);
        coeff0 = _mm_add_epi16(coeff0, sharpen0);
        coeff8 = _mm_add_epi16(coeff8, sharpen8);
    }

    // out = (coeff * iQ + B) >> QFIX
    {
        // doing calculations with 32b precision (QFIX=17)
        // out = (coeff * iQ)
        const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
        const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
        const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
        const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
        __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
        __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
        __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
        __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
        // out = (coeff * iQ + B)
        const __m128i bias_00 = _mm_loadu_si128((const __m128i*)&mtx->bias_[0]);
        const __m128i bias_04 = _mm_loadu_si128((const __m128i*)&mtx->bias_[4]);
        const __m128i bias_08 = _mm_loadu_si128((const __m128i*)&mtx->bias_[8]);
        const __m128i bias_12 = _mm_loadu_si128((const __m128i*)&mtx->bias_[12]);
        out_00 = _mm_add_epi32(out_00, bias_00);
        out_04 = _mm_add_epi32(out_04, bias_04);
        out_08 = _mm_add_epi32(out_08, bias_08);
        out_12 = _mm_add_epi32(out_12, bias_12);
        // out = QUANTDIV(coeff, iQ, B, QFIX)
        out_00 = _mm_srai_epi32(out_00, QFIX);
        out_04 = _mm_srai_epi32(out_04, QFIX);
        out_08 = _mm_srai_epi32(out_08, QFIX);
        out_12 = _mm_srai_epi32(out_12, QFIX);

        // pack result as 16b
        out0 = _mm_packs_epi32(out_00, out_04);
        out8 = _mm_packs_epi32(out_08, out_12);

        // if (coeff > 2047) coeff = 2047
        out0 = _mm_min_epi16(out0, max_coeff_2047);
        out8 = _mm_min_epi16(out8, max_coeff_2047);
    }

    // put sign back
    out0 = _mm_sign_epi16(out0, in0);
    out8 = _mm_sign_epi16(out8, in8);

    // in = out * Q
    in0 = _mm_mullo_epi16(out0, q0);
    in8 = _mm_mullo_epi16(out8, q8);

    _mm_storeu_si128((__m128i*)&in[0], in0);
    _mm_storeu_si128((__m128i*)&in[8], in8);

    // zigzag the output before storing it. The re-ordering is:
    //    0 1 2 3 4 5 6 7 | 8  9 10 11 12 13 14 15
    // -> 0 1 4[8]5 2 3 6 | 9 12 13 10 [7]11 14 15
    // There's only two misplaced entries ([8] and [7]) that are crossing the
    // reg's boundaries.
    // We use pshufb instead of pshuflo/pshufhi.
    {
        const __m128i kCst_lo = PSHUFB_CST(0, 1, 4, -1, 5, 2, 3, 6);
        const __m128i kCst_7 = PSHUFB_CST(-1, -1, -1, -1, 7, -1, -1, -1);
        const __m128i tmp_lo = _mm_shuffle_epi8(out0, kCst_lo);
        const __m128i tmp_7 = _mm_shuffle_epi8(out0, kCst_7);  // extract #7
        const __m128i kCst_hi = PSHUFB_CST(1, 4, 5, 2, -1, 3, 6, 7);
        const __m128i kCst_8 = PSHUFB_CST(-1, -1, -1, 0, -1, -1, -1, -1);
        const __m128i tmp_hi = _mm_shuffle_epi8(out8, kCst_hi);
        const __m128i tmp_8 = _mm_shuffle_epi8(out8, kCst_8);  // extract #8
        const __m128i out_z0 = _mm_or_si128(tmp_lo, tmp_8);
        const __m128i out_z8 = _mm_or_si128(tmp_hi, tmp_7);
        _mm_storeu_si128((__m128i*)&out[0], out_z0);
        _mm_storeu_si128((__m128i*)&out[8], out_z8);
        // Saturating pack to bytes keeps non-zero levels non-zero, so a
        // single byte compare covers all 16 levels.
        packed_out = _mm_packs_epi16(out_z0, out_z8);
    }

    // detect if all 'out' values are zeroes or not
    return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff);
}
예제 #19
// Hadamard transform
// Returns the difference between the weighted sums of the absolute values of
// the transformed coefficients of inA and inB (inA's sum minus inB's sum).
// Four rows of four bytes are used from each input, consecutive rows being
// BPS bytes apart; w[] supplies the 16 weights (w[0..7] and w[8..15]).
static int TTransform(const uint8_t* inA, const uint8_t* inB,
                      const uint16_t* const w) {
    __m128i tmp_0, tmp_1, tmp_2, tmp_3;

    // Each stage below is its own block: the stages re-declare the same
    // temporaries (a0..a3, b0..b3, transpose*), which C only permits in
    // separate scopes. Results are handed from stage to stage via tmp_0..3.

    // Load, combine and transpose inputs.
    {
        const __m128i inA_0 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 0]);
        const __m128i inA_1 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 1]);
        const __m128i inA_2 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 2]);
        const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]);
        const __m128i inB_0 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 0]);
        const __m128i inB_1 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 1]);
        const __m128i inB_2 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 2]);
        const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]);

        // Combine inA and inB (we'll do two transforms in parallel).
        const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0);
        const __m128i inAB_1 = _mm_unpacklo_epi8(inA_1, inB_1);
        const __m128i inAB_2 = _mm_unpacklo_epi8(inA_2, inB_2);
        const __m128i inAB_3 = _mm_unpacklo_epi8(inA_3, inB_3);
        // Low 8 bytes of each register (the upper 8 bytes hold columns 4..7
        // of the loaded rows and are not used):
        // a00 b00 a01 b01 a02 b02 a03 b03
        // a10 b10 a11 b11 a12 b12 a13 b13
        // a20 b20 a21 b21 a22 b22 a23 b23
        // a30 b30 a31 b31 a32 b32 a33 b33

        // Transpose the two 4x4, discarding the unused upper halves.
        const __m128i transpose0_0 = _mm_unpacklo_epi8(inAB_0, inAB_2);
        const __m128i transpose0_1 = _mm_unpacklo_epi8(inAB_1, inAB_3);
        // a00 a20  b00 b20  a01 a21  b01 b21  a02 a22  b02 b22  a03 a23  b03 b23
        // a10 a30  b10 b30  a11 a31  b11 b31  a12 a32  b12 b32  a13 a33  b13 b33
        const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);
        const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);
        // a00 a10 a20 a30  b00 b10 b20 b30  a01 a11 a21 a31  b01 b11 b21 b31
        // a02 a12 a22 a32  b02 b12 b22 b32  a03 a13 a23 a33  b03 b13 b23 b33

        // Convert to 16b.
        tmp_0 = _mm_cvtepu8_epi16(transpose1_0);
        tmp_1 = _mm_cvtepu8_epi16(_mm_srli_si128(transpose1_0, 8));
        tmp_2 = _mm_cvtepu8_epi16(transpose1_1);
        tmp_3 = _mm_cvtepu8_epi16(_mm_srli_si128(transpose1_1, 8));
        // a00 a10 a20 a30   b00 b10 b20 b30
        // a01 a11 a21 a31   b01 b11 b21 b31
        // a02 a12 a22 a32   b02 b12 b22 b32
        // a03 a13 a23 a33   b03 b13 b23 b33
    }

    // Horizontal pass and subsequent transpose.
    {
        // Calculate a and b (two 4x4 at once).
        const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
        const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
        const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
        const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
        const __m128i b0 = _mm_add_epi16(a0, a1);
        const __m128i b1 = _mm_add_epi16(a3, a2);
        const __m128i b2 = _mm_sub_epi16(a3, a2);
        const __m128i b3 = _mm_sub_epi16(a0, a1);
        // a00 a01 a02 a03   b00 b01 b02 b03
        // a10 a11 a12 a13   b10 b11 b12 b13
        // a20 a21 a22 a23   b20 b21 b22 b23
        // a30 a31 a32 a33   b30 b31 b32 b33

        // Transpose the two 4x4.
        const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1);
        const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3);
        const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1);
        const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3);
        // a00 a10 a01 a11   a02 a12 a03 a13
        // a20 a30 a21 a31   a22 a32 a23 a33
        // b00 b10 b01 b11   b02 b12 b03 b13
        // b20 b30 b21 b31   b22 b32 b23 b33
        const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
        const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
        const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
        const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
        // a00 a10 a20 a30 a01 a11 a21 a31
        // b00 b10 b20 b30 b01 b11 b21 b31
        // a02 a12 a22 a32 a03 a13 a23 a33
        // b02 b12 b22 b32 b03 b13 b23 b33
        tmp_0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
        tmp_1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
        tmp_2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
        tmp_3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
        // a00 a10 a20 a30   b00 b10 b20 b30
        // a01 a11 a21 a31   b01 b11 b21 b31
        // a02 a12 a22 a32   b02 b12 b22 b32
        // a03 a13 a23 a33   b03 b13 b23 b33
    }

    // Vertical pass and difference of weighted sums.
    {
        // Load all inputs.
        const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]);
        const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]);

        // Calculate a and b (two 4x4 at once).
        const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
        const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
        const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
        const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
        const __m128i b0 = _mm_add_epi16(a0, a1);
        const __m128i b1 = _mm_add_epi16(a3, a2);
        const __m128i b2 = _mm_sub_epi16(a3, a2);
        const __m128i b3 = _mm_sub_epi16(a0, a1);

        // Separate the transforms of inA and inB: the low 64 bits of each
        // b* register belong to inA, the high 64 bits to inB.
        __m128i A_b0 = _mm_unpacklo_epi64(b0, b1);
        __m128i A_b2 = _mm_unpacklo_epi64(b2, b3);
        __m128i B_b0 = _mm_unpackhi_epi64(b0, b1);
        __m128i B_b2 = _mm_unpackhi_epi64(b2, b3);

        A_b0 = _mm_abs_epi16(A_b0);
        A_b2 = _mm_abs_epi16(A_b2);
        B_b0 = _mm_abs_epi16(B_b0);
        B_b2 = _mm_abs_epi16(B_b2);

        // weighted sums
        A_b0 = _mm_madd_epi16(A_b0, w_0);
        A_b2 = _mm_madd_epi16(A_b2, w_8);
        B_b0 = _mm_madd_epi16(B_b0, w_0);
        B_b2 = _mm_madd_epi16(B_b2, w_8);
        A_b0 = _mm_add_epi32(A_b0, A_b2);
        B_b0 = _mm_add_epi32(B_b0, B_b2);

        // difference of weighted sums
        A_b2 = _mm_sub_epi32(A_b0, B_b0);
        // cascading summation of the differences: two horizontal adds fold
        // the four 32-bit lanes into lane 0.
        B_b0 = _mm_hadd_epi32(A_b2, A_b2);
        B_b2 = _mm_hadd_epi32(B_b0, B_b0);
        return _mm_cvtsi128_si32(B_b2);
    }
}
예제 #20
파일: enc_sse41.c 프로젝트: 93i/godot
// Hadamard transform
// Returns the difference between the weighted sums of the absolute values of
// the transformed coefficients of inA and inB (inA's sum minus inB's sum).
// w[] contains a row-major 4 by 4 symmetric matrix.
static int TTransform_SSE41(const uint8_t* inA, const uint8_t* inB,
                            const uint16_t* const w) {
  int32_t sum[4];
  __m128i tmp_0, tmp_1, tmp_2, tmp_3;

  // Each stage below is its own block: the stages re-declare the same
  // temporaries (a0..a3, b0..b3), which C only permits in separate scopes.
  // Results are handed from stage to stage via tmp_0..tmp_3.

  // Load and combine inputs.
  {
    const __m128i inA_0 = _mm_loadu_si128((const __m128i*)&inA[BPS * 0]);
    const __m128i inA_1 = _mm_loadu_si128((const __m128i*)&inA[BPS * 1]);
    const __m128i inA_2 = _mm_loadu_si128((const __m128i*)&inA[BPS * 2]);
    // In SSE4.1, with gcc 4.8 at least (maybe other versions),
    // _mm_loadu_si128 is faster than _mm_loadl_epi64. But for the last lump
    // of inA and inB, _mm_loadl_epi64 is still used not to have an out of
    // bound read.
    const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]);
    const __m128i inB_0 = _mm_loadu_si128((const __m128i*)&inB[BPS * 0]);
    const __m128i inB_1 = _mm_loadu_si128((const __m128i*)&inB[BPS * 1]);
    const __m128i inB_2 = _mm_loadu_si128((const __m128i*)&inB[BPS * 2]);
    const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]);

    // Combine inA and inB (we'll do two transforms in parallel): the first
    // four bytes of each inA row are followed by the first four of inB.
    const __m128i inAB_0 = _mm_unpacklo_epi32(inA_0, inB_0);
    const __m128i inAB_1 = _mm_unpacklo_epi32(inA_1, inB_1);
    const __m128i inAB_2 = _mm_unpacklo_epi32(inA_2, inB_2);
    const __m128i inAB_3 = _mm_unpacklo_epi32(inA_3, inB_3);
    tmp_0 = _mm_cvtepu8_epi16(inAB_0);
    tmp_1 = _mm_cvtepu8_epi16(inAB_1);
    tmp_2 = _mm_cvtepu8_epi16(inAB_2);
    tmp_3 = _mm_cvtepu8_epi16(inAB_3);
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33
  }

  // Vertical pass first to avoid a transpose (vertical and horizontal passes
  // are commutative because w/kWeightY is symmetric) and subsequent transpose.
  {
    // Calculate a and b (two 4x4 at once).
    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
    const __m128i b0 = _mm_add_epi16(a0, a1);
    const __m128i b1 = _mm_add_epi16(a3, a2);
    const __m128i b2 = _mm_sub_epi16(a3, a2);
    const __m128i b3 = _mm_sub_epi16(a0, a1);
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33

    // Transpose the two 4x4.
    VP8Transpose_2_4x4_16b(&b0, &b1, &b2, &b3, &tmp_0, &tmp_1, &tmp_2, &tmp_3);
  }

  // Horizontal pass and difference of weighted sums.
  {
    // Load all inputs.
    const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]);
    const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]);

    // Calculate a and b (two 4x4 at once).
    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
    const __m128i b0 = _mm_add_epi16(a0, a1);
    const __m128i b1 = _mm_add_epi16(a3, a2);
    const __m128i b2 = _mm_sub_epi16(a3, a2);
    const __m128i b3 = _mm_sub_epi16(a0, a1);

    // Separate the transforms of inA and inB.
    __m128i A_b0 = _mm_unpacklo_epi64(b0, b1);
    __m128i A_b2 = _mm_unpacklo_epi64(b2, b3);
    __m128i B_b0 = _mm_unpackhi_epi64(b0, b1);
    __m128i B_b2 = _mm_unpackhi_epi64(b2, b3);

    A_b0 = _mm_abs_epi16(A_b0);
    A_b2 = _mm_abs_epi16(A_b2);
    B_b0 = _mm_abs_epi16(B_b0);
    B_b2 = _mm_abs_epi16(B_b2);

    // weighted sums
    A_b0 = _mm_madd_epi16(A_b0, w_0);
    A_b2 = _mm_madd_epi16(A_b2, w_8);
    B_b0 = _mm_madd_epi16(B_b0, w_0);
    B_b2 = _mm_madd_epi16(B_b2, w_8);
    A_b0 = _mm_add_epi32(A_b0, A_b2);
    B_b0 = _mm_add_epi32(B_b0, B_b2);

    // difference of weighted sums
    A_b2 = _mm_sub_epi32(A_b0, B_b0);
    // Spill the four 32-bit partial differences; they are added up in scalar
    // code below.
    _mm_storeu_si128((__m128i*)&sum[0], A_b2);
  }
  return sum[0] + sum[1] + sum[2] + sum[3];
}
/*****************************************************************************
 * This function utilises 3 properties of the cost function lookup tables,   *
 * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in       *
 * vp9_encoder.c.                                                            *
 * For the joint cost:                                                       *
 *   - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3]           *
 * For the component costs:                                                  *
 *   - For all i: mvsadcost[0][i] == mvsadcost[1][i]                         *
 *         (Equal costs for both components)                                 *
 *   - For all i: mvsadcost[0][i] == mvsadcost[0][-i]                        *
 *         (Cost function is even)                                           *
 * If these do not hold, then this function cannot be used without           *
 * modification, in which case you can revert to using the C implementation, *
 * which does not rely on these properties.                                  *
 *****************************************************************************/
// SIMD diamond search. Starting from ref_mv clamped to the x->mv_{row,col}
// min/max bounds, each step evaluates cfg->searches_per_step candidate
// positions four at a time and keeps the one with the lowest
// (SAD + motion-vector cost). The winning vector is written to *best_mv and
// its cost (best_sad) is returned; *num00 is reset to 0 on entry.
// NOTE(review): this listing is missing lines (closing braces, #else/#endif
// directives and the tails of several statements). The gaps that can be seen
// from here are flagged with NOTE(review) markers below; do not compile this
// text as-is.
int vp9_diamond_search_sad_avx(const MACROBLOCK *x,
                               const search_site_config *cfg,
                               MV *ref_mv, MV *best_mv, int search_param,
                               int sad_per_bit, int *num00,
                               const vp9_variance_fn_ptr_t *fn_ptr,
                               const MV *center_mv) {
  // Search bounds, packed as (row, col) pairs and broadcast to all four lanes.
  const int_mv maxmv = pack_int_mv(x->mv_row_max, x->mv_col_max);
  const __m128i v_max_mv_w = _mm_set1_epi32(maxmv.as_int);
  const int_mv minmv = pack_int_mv(x->mv_row_min, x->mv_col_min);
  const __m128i v_min_mv_w = _mm_set1_epi32(minmv.as_int);

  const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit);

  // Only entries 0 and 1 of the joint cost table are needed: the asserts
  // further down check that entries 2 and 3 equal entry 1.
  const __m128i v_joint_cost_0_d = _mm_set1_epi32(x->nmvjointsadcost[0]);
  const __m128i v_joint_cost_1_d = _mm_set1_epi32(x->nmvjointsadcost[1]);

  // search_param determines the length of the initial step and hence the number
  // of iterations.
  // 0 = initial step (MAX_FIRST_STEP) pel
  // 1 = (MAX_FIRST_STEP/2) pel,
  // 2 = (MAX_FIRST_STEP/4) pel...
  const       MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param];
  const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param];
  const int tot_steps = cfg->total_steps - search_param;

  // center_mv scaled down by 8 (>> 3); the MV cost is measured relative to it.
  const int_mv fcenter_mv = pack_int_mv(center_mv->row >> 3,
                                        center_mv->col >> 3);
  const __m128i vfcmv = _mm_set1_epi32(fcenter_mv.as_int);

  const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row);
  const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col);

  int_mv bmv = pack_int_mv(ref_row, ref_col);
  int_mv new_bmv = bmv;
  __m128i v_bmv_w = _mm_set1_epi32(bmv.as_int);

  const int what_stride = x->plane[0].src.stride;
  const int in_what_stride = x->e_mbd.plane[0].pre[0].stride;
  const uint8_t *const what = x->plane[0].src.buf;
  const uint8_t *const in_what = x->e_mbd.plane[0].pre[0].buf +
                                 ref_row * in_what_stride + ref_col;

  // Work out the start point for the search
  const uint8_t *best_address = in_what;
  const uint8_t *new_best_address = best_address;
  // NOTE(review): no #else/#endif is visible for this #if. v_ba_q is only used
  // with the 64-bit offsets and v_ba_d with the 32-bit offsets below, so the
  // two declarations look like the two arms of the conditional - confirm.
#if ARCH_X86_64
  __m128i v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
  __m128i v_ba_d = _mm_set1_epi32((intptr_t)best_address);

  unsigned int best_sad;

  int i;
  int j;
  int step;

  // Check the prerequisite cost function properties that are easy to check
  // in an assert. See the function-level documentation for details on all
  // prerequisites.
  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]);
  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]);

  // Check the starting position
  best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride);
  best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit);

  *num00 = 0;

  // i indexes ss_mv/ss_os across all steps; j counts candidates within a step.
  for (i = 0, step = 0; step < tot_steps; step++) {
    for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) {
      __m128i v_sad_d;
      __m128i v_cost_d;
      __m128i v_outside_d;
      __m128i v_inside_d;
      __m128i v_diff_mv_w;
      // NOTE(review): v_blocka is declared twice with no #else/#endif visible;
      // presumably [2] is the 64-bit-pointer arm and [1] the 32-bit arm.
#if ARCH_X86_64
      __m128i v_blocka[2];
      __m128i v_blocka[1];

      // Compute the candidate motion vectors
      const __m128i v_ss_mv_w = _mm_loadu_si128((const __m128i*)&ss_mv[i]);
      const __m128i v_these_mv_w = _mm_add_epi16(v_bmv_w, v_ss_mv_w);
      // Clamp them to the search bounds
      __m128i v_these_mv_clamp_w = v_these_mv_w;
      v_these_mv_clamp_w = _mm_min_epi16(v_these_mv_clamp_w, v_max_mv_w);
      v_these_mv_clamp_w = _mm_max_epi16(v_these_mv_clamp_w, v_min_mv_w);
      // The ones that did not change are inside the search area
      v_inside_d = _mm_cmpeq_epi32(v_these_mv_clamp_w, v_these_mv_w);

      // If none of them are inside, then move on
      // NOTE(review): the body of this `if` and its closing brace are not
      // visible; per the comment above it should skip this group of four.
      if (__likely__(_mm_test_all_zeros(v_inside_d, v_inside_d))) {

      // The inverse mask indicates which of the MVs are outside
      v_outside_d = _mm_xor_si128(v_inside_d, _mm_set1_epi8(0xff));
      // Shift right to keep the sign bit clear, we will use this later
      // to set the cost to the maximum value.
      v_outside_d = _mm_srli_epi32(v_outside_d, 1);

      // Compute the difference MV
      v_diff_mv_w = _mm_sub_epi16(v_these_mv_clamp_w, vfcmv);
      // We utilise the fact that the cost function is even, and use the
      // absolute difference. This allows us to use unsigned indexes later
      // and reduces cache pressure somewhat as only a half of the table
      // is ever referenced.
      v_diff_mv_w = _mm_abs_epi16(v_diff_mv_w);

      // Compute the SIMD pointer offsets.
#if ARCH_X86_64  //  sizeof(intptr_t) == 8
        // Load the offsets
        __m128i v_bo10_q = _mm_loadu_si128((const __m128i*)&ss_os[i+0]);
        __m128i v_bo32_q = _mm_loadu_si128((const __m128i*)&ss_os[i+2]);
        // Set the ones falling outside to zero
        // NOTE(review): the next statement is truncated - its second argument
        // (the mask for the two low offsets) and closing ");" are missing.
        v_bo10_q = _mm_and_si128(v_bo10_q,
        v_bo32_q = _mm_and_si128(v_bo32_q,
                                 _mm_unpackhi_epi32(v_inside_d, v_inside_d));
        // Compute the candidate addresses
        v_blocka[0] = _mm_add_epi64(v_ba_q, v_bo10_q);
        v_blocka[1] = _mm_add_epi64(v_ba_q, v_bo32_q);
#else  // ARCH_X86 //  sizeof(intptr_t) == 4
        __m128i v_bo_d = _mm_loadu_si128((const __m128i*)&ss_os[i]);
        v_bo_d = _mm_and_si128(v_bo_d, v_inside_d);
        v_blocka[0] = _mm_add_epi32(v_ba_d, v_bo_d);
      // NOTE(review): the #endif closing the conditional above is not visible.

      // NOTE(review): this call is truncated after in_what_stride. v_sad_d is
      // read below without any visible assignment, so the missing argument is
      // presumably the output for the four SADs - confirm.
      fn_ptr->sdx4df(what, what_stride,
                     (const uint8_t **)&v_blocka[0], in_what_stride,

      // Look up the component cost of the residual motion vector
        // Each 32-bit lane of v_diff_mv_w holds one candidate as two 16-bit
        // halves; even 16-bit elements are read as rows, odd ones as columns.
        const int32_t row0 = _mm_extract_epi16(v_diff_mv_w, 0);
        const int32_t col0 = _mm_extract_epi16(v_diff_mv_w, 1);
        const int32_t row1 = _mm_extract_epi16(v_diff_mv_w, 2);
        const int32_t col1 = _mm_extract_epi16(v_diff_mv_w, 3);
        const int32_t row2 = _mm_extract_epi16(v_diff_mv_w, 4);
        const int32_t col2 = _mm_extract_epi16(v_diff_mv_w, 5);
        const int32_t row3 = _mm_extract_epi16(v_diff_mv_w, 6);
        const int32_t col3 = _mm_extract_epi16(v_diff_mv_w, 7);

        // Note: This is a use case for vpgather in AVX2
        const uint32_t cost0 = x->nmvsadcost[0][row0] + x->nmvsadcost[0][col0];
        const uint32_t cost1 = x->nmvsadcost[0][row1] + x->nmvsadcost[0][col1];
        const uint32_t cost2 = x->nmvsadcost[0][row2] + x->nmvsadcost[0][col2];
        const uint32_t cost3 = x->nmvsadcost[0][row3] + x->nmvsadcost[0][col3];

        __m128i v_cost_10_d, v_cost_32_d;

        // Pack the four scalar costs back into one vector, lane k = cost k.
        v_cost_10_d = _mm_cvtsi32_si128(cost0);
        v_cost_10_d = _mm_insert_epi32(v_cost_10_d, cost1, 1);

        v_cost_32_d = _mm_cvtsi32_si128(cost2);
        v_cost_32_d = _mm_insert_epi32(v_cost_32_d, cost3, 1);

        v_cost_d = _mm_unpacklo_epi64(v_cost_10_d, v_cost_32_d);

      // Now add in the joint cost
        // NOTE(review): both declarations below are truncated - the remaining
        // arguments of _mm_cmpeq_epi32 and _mm_blendv_epi8 are missing.
        const __m128i v_sel_d = _mm_cmpeq_epi32(v_diff_mv_w,
        const __m128i v_joint_cost_d = _mm_blendv_epi8(v_joint_cost_1_d,
        v_cost_d = _mm_add_epi32(v_cost_d, v_joint_cost_d);

      // Multiply by sad_per_bit
      v_cost_d = _mm_mullo_epi32(v_cost_d, v_spb_d);
      // ROUND_POWER_OF_TWO(v_cost_d, 8)
      v_cost_d = _mm_add_epi32(v_cost_d, _mm_set1_epi32(0x80));
      v_cost_d = _mm_srai_epi32(v_cost_d, 8);
      // Add the cost to the sad
      v_sad_d = _mm_add_epi32(v_sad_d, v_cost_d);

      // Make the motion vectors outside the search area have max cost
      // by or'ing in the comparison mask, this way the minimum search won't
      // pick them.
      v_sad_d = _mm_or_si128(v_sad_d, v_outside_d);

      // Find the minimum value and index horizontally in v_sad_d
        // Try speculatively on 16 bits, so we can use the minpos intrinsic
        const __m128i v_sad_w = _mm_packus_epi32(v_sad_d, v_sad_d);
        const __m128i v_minp_w = _mm_minpos_epu16(v_sad_w);

        uint32_t local_best_sad = _mm_extract_epi16(v_minp_w, 0);
        uint32_t local_best_idx = _mm_extract_epi16(v_minp_w, 1);

        // If the local best value is not saturated, just use it, otherwise
        // find the horizontal minimum again the hard way on 32 bits.
        // This is executed rarely.
        if (__unlikely__(local_best_sad == 0xffff)) {
          __m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d;

          // Two compare-and-blend rounds: first lanes {0,1} against {2,3},
          // then lane 0 against lane 1, carrying the lane indices along.
          v_loval_d = v_sad_d;
          v_loidx_d = _mm_set_epi32(3, 2, 1, 0);
          v_hival_d = _mm_srli_si128(v_loval_d, 8);
          v_hiidx_d = _mm_srli_si128(v_loidx_d, 8);

          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);

          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);
          v_hival_d = _mm_srli_si128(v_loval_d, 4);
          v_hiidx_d = _mm_srli_si128(v_loidx_d, 4);

          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);

          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);

          local_best_sad = _mm_extract_epi32(v_loval_d, 0);
          local_best_idx = _mm_extract_epi32(v_loidx_d, 0);
        // NOTE(review): the closing brace of the `if` above is not visible.

        // Update the global minimum if the local minimum is smaller
        if (__likely__(local_best_sad < best_sad)) {
          new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx];
          new_best_address = ((const uint8_t **)v_blocka)[local_best_idx];

          best_sad = local_best_sad;
    // NOTE(review): the closing braces of the `if`, of any enclosing block and
    // of the inner `for` are not visible; judging by the indentation, the
    // statements below run once per step, after the inner loop.

    bmv = new_bmv;
    best_address = new_best_address;

    v_bmv_w = _mm_set1_epi32(bmv.as_int);
    // NOTE(review): as with the declarations near the top, no #else/#endif is
    // visible for this #if.
#if ARCH_X86_64
    v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
    v_ba_d = _mm_set1_epi32((intptr_t)best_address);

    // NOTE(review): the body of this `if` is not visible. *num00 is zeroed
    // before the loops and never updated in the visible code, so this is
    // presumably where it is changed - confirm.
    if (__unlikely__(best_address == in_what)) {

  *best_mv = bmv.as_mv;
  return best_sad;
예제 #22
__m128i test_mm_abs_epi16(__m128i a) {
  // CHECK-LABEL: test_mm_abs_epi16
  // CHECK: call <8 x i16> @llvm.x86.ssse3.pabs.w.128
  return _mm_abs_epi16(a);