int64_t av1_highbd_block_error_sse2(tran_low_t *coeff, tran_low_t *dqcoeff,
                                     intptr_t block_size, int64_t *ssz,
                                     int bps) {
  int i, j, test;
  uint32_t temp[4];
  __m128i max, min, cmp0, cmp1, cmp2, cmp3;
  int64_t error = 0, sqcoeff = 0;
  const int shift = 2 * (bps - 8);
  const int rounding = shift > 0 ? 1 << (shift - 1) : 0;

  for (i = 0; i < block_size; i += 8) {
    // Load the data into xmm registers
    __m128i mm_coeff = _mm_load_si128((__m128i *)(coeff + i));
    __m128i mm_coeff2 = _mm_load_si128((__m128i *)(coeff + i + 4));
    __m128i mm_dqcoeff = _mm_load_si128((__m128i *)(dqcoeff + i));
    __m128i mm_dqcoeff2 = _mm_load_si128((__m128i *)(dqcoeff + i + 4));
    // Check if any values require more than 15 bits
    max = _mm_set1_epi32(0x3fff);
    min = _mm_set1_epi32(0xffffc000);
    cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max),
                         _mm_cmplt_epi32(mm_coeff, min));
    cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max),
                         _mm_cmplt_epi32(mm_coeff2, min));
    cmp2 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff, max),
                         _mm_cmplt_epi32(mm_dqcoeff, min));
    cmp3 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff2, max),
                         _mm_cmplt_epi32(mm_dqcoeff2, min));
    test = _mm_movemask_epi8(
        _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3)));

    if (!test) {
      __m128i mm_diff, error_sse2, sqcoeff_sse2;
      mm_coeff = _mm_packs_epi32(mm_coeff, mm_coeff2);
      mm_dqcoeff = _mm_packs_epi32(mm_dqcoeff, mm_dqcoeff2);
      mm_diff = _mm_sub_epi16(mm_coeff, mm_dqcoeff);
      error_sse2 = _mm_madd_epi16(mm_diff, mm_diff);
      sqcoeff_sse2 = _mm_madd_epi16(mm_coeff, mm_coeff);
      _mm_storeu_si128((__m128i *)temp, error_sse2);
      error = error + temp[0] + temp[1] + temp[2] + temp[3];
      _mm_storeu_si128((__m128i *)temp, sqcoeff_sse2);
      sqcoeff += temp[0] + temp[1] + temp[2] + temp[3];
    } else {
      for (j = 0; j < 8; j++) {
        const int64_t diff = coeff[i + j] - dqcoeff[i + j];
        error += diff * diff;
        sqcoeff += (int64_t)coeff[i + j] * (int64_t)coeff[i + j];
      }
    }
  }
  assert(error >= 0 && sqcoeff >= 0);
  error = (error + rounding) >> shift;
  sqcoeff = (sqcoeff + rounding) >> shift;

  *ssz = sqcoeff;
  return error;
}
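For reference, a minimal scalar sketch of the quantity the SSE2 routine above computes (the upstream C fallback av1_highbd_block_error_c may differ in detail; this is derived from the vector and fallback paths above):

static int64_t highbd_block_error_ref(const tran_low_t *coeff,
                                      const tran_low_t *dqcoeff,
                                      intptr_t block_size, int64_t *ssz,
                                      int bps) {
  int64_t error = 0, sqcoeff = 0;
  const int shift = 2 * (bps - 8);
  const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
  intptr_t i;
  for (i = 0; i < block_size; i++) {
    const int64_t diff = coeff[i] - dqcoeff[i];
    error += diff * diff;
    sqcoeff += (int64_t)coeff[i] * coeff[i];
  }
  *ssz = (sqcoeff + rounding) >> shift;
  return (error + rounding) >> shift;
}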
int searchSIMDTree(int32_t **tree, int *fanout, int levels, int32_t value) {
    int iLevel = 0;
    int lOffset = 0;
    int pOffset = 0;
    int32_t cmpmask = 0;
    int32_t eqmask = 0;

    // Broadcast the 32-bit search key to all four lanes.
    __m128i key = _mm_cvtsi32_si128(value);
    key = _mm_shuffle_epi32(key, _MM_SHUFFLE(0,0,0,0));

    while (iLevel < levels) {
        int f = fanout[iLevel];
        pOffset = lOffset;
        lOffset *= f - 1;
        int iter = 0;
        int position = 0;
        while (iter < f/4) {
            __m128i delimiters = _mm_load_si128((__m128i const*)&tree[iLevel][lOffset + iter*4]);
            __m128i compare = _mm_cmpgt_epi32(key, delimiters);
            cmpmask = _mm_movemask_ps(_mm_castsi128_ps(compare));
            cmpmask ^= 0x0F;  // set bits for lanes where key <= delimiter
            if (cmpmask) {
                position = _bit_scan_forward(cmpmask);
                break;
            }
            iter++;
        }
        int offset = lOffset + iter*4 + position;
        lOffset = offset + pOffset;
        iLevel++;
    }
    return lOffset;
}
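The movemask idiom at the heart of the loop above, isolated as a sketch; _bit_scan_forward is an ICC/GCC intrinsic, so the portable __builtin_ctz is used here instead (an assumption, not the original code):

#include <emmintrin.h>

/* Return the index (0-3) of the first 32-bit lane of delims that key is
 * NOT greater than, or -1 if key is greater than all four. */
static int first_not_greater(__m128i key, __m128i delims)
{
    int gt = _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpgt_epi32(key, delims)));
    int not_gt = gt ^ 0x0F;                 /* invert the four lane bits */
    return not_gt ? __builtin_ctz(not_gt) : -1;
}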
Example #3
static inline __m128i hardlight_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                          const __m128i& sa, const __m128i& da) {
    // if (2 * sc <= sa)
    __m128i tmp1 = _mm_slli_epi32(sc, 1);
    __m128i cmp1 = _mm_cmpgt_epi32(tmp1, sa);
    __m128i rc1 = _mm_mullo_epi16(sc, dc);                // sc * dc;
    rc1 = _mm_slli_epi32(rc1, 1);                         // 2 * sc * dc
    rc1 = _mm_andnot_si128(cmp1, rc1);

    // else
    tmp1 = _mm_mullo_epi16(sa, da);
    __m128i tmp2 = Multiply32_SSE2(_mm_sub_epi32(da, dc),
                                   _mm_sub_epi32(sa, sc));
    tmp2 = _mm_slli_epi32(tmp2, 1);
    __m128i rc2 = _mm_sub_epi32(tmp1, tmp2);
    rc2 = _mm_and_si128(cmp1, rc2);

    __m128i rc = _mm_or_si128(rc1, rc2);

    __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da);
    tmp1 = _mm_mullo_epi16(sc, ida);
    __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa);
    tmp2 = _mm_mullo_epi16(dc, isa);
    rc = _mm_add_epi32(rc, tmp1);
    rc = _mm_add_epi32(rc, tmp2);
    return clamp_div255round_SSE2(rc);
}
Example #4
static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
  int i, j;
  __m128i L = _mm_cvtsi32_si128(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    __m128i pa;
    GetSumAbsDiff32(&T, &TL, &pa);   // pa = sum |T-TL|
    for (j = 0; j < 4; ++j) {
      const __m128i L_lo = _mm_unpacklo_epi32(L, L);
      const __m128i TL_lo = _mm_unpacklo_epi32(TL, L);
      const __m128i pb = _mm_sad_epu8(L_lo, TL_lo);  // pb = sum |L-TL|
      const __m128i mask = _mm_cmpgt_epi32(pb, pa);
      const __m128i A = _mm_and_si128(mask, L);
      const __m128i B = _mm_andnot_si128(mask, T);
      const __m128i pred = _mm_or_si128(A, B);    // pred = (L > T)? L : T
      L = _mm_add_epi8(src, pred);
      out[i + j] = _mm_cvtsi128_si32(L);
      // Shift the pre-computed value for the next iteration.
      T = _mm_srli_si128(T, 4);
      TL = _mm_srli_si128(TL, 4);
      src = _mm_srli_si128(src, 4);
      pa = _mm_srli_si128(pa, 4);
    }
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);
  }
}
Example #5
void vFindMax(__m128i *pixels, int n)
{
  __m128i vIdx,vMax;
  int i;
  vIdx = _mm_setzero_si128();
  vMax = _mm_set_epi32(INT_MIN,INT_MIN,INT_MIN,INT_MIN);
  for(i = 0; i < n; i++)
    {
      __m128i v = _mm_load_si128(pixels+i);
      __m128i vCmp = _mm_cmpgt_epi32(v, vMax);
      /* max value (_mm_max_epi32 requires SSE4.1) */
      vMax = _mm_max_epi32(vMax, v);

      __m128i vBdxIdx = _mm_set_epi32(i, i, i, i);

      /* blend: take index i in the lanes where v became the new max */
      __m128 t0 = _mm_and_ps(_mm_castsi128_ps(vBdxIdx), _mm_castsi128_ps(vCmp));
      __m128 t1 = _mm_andnot_ps(_mm_castsi128_ps(vCmp), _mm_castsi128_ps(vIdx));
      /* max index */
      vIdx = _mm_castps_si128(_mm_or_ps(t0, t1));
    }
  int indices[4];
  int values[4];
  _mm_store_si128((__m128i*)indices, vIdx);
  _mm_store_si128((__m128i*)values, vMax);
  printf("SSE:\n");
  for(i=0;i<4;i++)
    {
      printf("%d:max=%d,idx=%d\n",i,values[i],indices[i]);
      //int idx = 4*indices[i] + i;
      //int *sArr = (int*)pixels;
      //printf("sArr[%d]=%d\n",idx,sArr[idx]);
    }
}
static inline __m128i saturated_add_SSE2(const __m128i& a, const __m128i& b) {
    __m128i sum = _mm_add_epi32(a, b);
    __m128i cmp = _mm_cmpgt_epi32(sum, _mm_set1_epi32(255));

    sum = _mm_or_si128(_mm_and_si128(cmp, _mm_set1_epi32(255)),
                       _mm_andnot_si128(cmp, sum));
    return sum;
}
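The and/andnot/or combination used above is the standard SSE2 select idiom; a variable blend only becomes a single instruction with SSE4.1's _mm_blendv_epi8. A reusable sketch:

/* Per-lane mask ? a : b, where each mask lane is all-ones or all-zeros
 * (e.g. the output of _mm_cmpgt_epi32). */
static inline __m128i select_epi32(const __m128i& mask, const __m128i& a,
                                   const __m128i& b) {
    return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));
}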
Example #7
__m128i test_mm_cmpgt_epi32(__m128i A, __m128i B) {
  // DAG-LABEL: test_mm_cmpgt_epi32
  // DAG: icmp sgt <4 x i32>
  //
  // ASM-LABEL: test_mm_cmpgt_epi32
  // ASM: pcmpgtd
  return _mm_cmpgt_epi32(A, B);
}
    SIMDValue SIMDInt32x4Operation::OpGreaterThan(const SIMDValue& aValue, const SIMDValue& bValue)
    {
        X86SIMDValue x86Result;
        X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
        X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);
        x86Result.m128i_value = _mm_cmpgt_epi32(tmpaValue.m128i_value, tmpbValue.m128i_value); // compare a > b?

        return X86SIMDValue::ToSIMDValue(x86Result);
    }
static inline __m128i clamp_signed_byte_SSE2(const __m128i& n) {
    __m128i cmp1 = _mm_cmplt_epi32(n, _mm_setzero_si128());
    __m128i cmp2 = _mm_cmpgt_epi32(n, _mm_set1_epi32(255));
    __m128i ret = _mm_and_si128(cmp2, _mm_set1_epi32(255));

    __m128i cmp = _mm_or_si128(cmp1, cmp2);
    ret = _mm_or_si128(_mm_and_si128(cmp, ret), _mm_andnot_si128(cmp, n));

    return ret;
}
Example #10
__m64 _m_pcmpgtd(__m64 _MM1, __m64 _MM2)
{
    __m128i lhs = {0}, rhs = {0};
    lhs.m128i_i64[0] = _MM1.m64_i64;

    rhs.m128i_i64[0] = _MM2.m64_i64;

    lhs = _mm_cmpgt_epi32(lhs, rhs);

    _MM1.m64_i64 = lhs.m128i_i64[0];
    return _MM1;
}
Example #11
/* resample code ---------------------------------------------------------------
* resample code
* args   : char   *code     I   code
*          int    len       I   code length (len < 2^(31-FPBIT))
*          double coff      I   initial code offset (chip)
*          int    smax      I   maximum correlator space (sample) 
*          double ci        I   code sampling interval (chip)
*          int    n         I   number of samples
*          short  *rcode    O   resampling code
* return : double               code remainder
*-----------------------------------------------------------------------------*/
extern double rescode(const short *code, int len, double coff, int smax, 
                      double ci, int n, short *rcode)
{
    short *p;

#if !defined(SSE2_ENABLE)
    coff-=smax*ci;
    coff-=floor(coff/len)*len; /* 0<=coff<len */

    for (p=rcode;p<rcode+n+2*smax;p++,coff+=ci) {
        if (coff>=len) coff-=len;
        *p=code[(int)coff];
    }
    return coff-smax*ci;

#else
    int i,index[4],x[4],nbit,scale;
    __m128i xmm1,xmm2,xmm3,xmm4,xmm5;

    coff-=smax*ci;
    coff-=floor(coff/len)*len; /* 0<=coff<len */

    for (i=len,nbit=31;i;i>>=1,nbit--) ;
    nbit-=1;
    scale=1<<nbit; /* scale factor */

    for (i=0;i<4;i++,coff+=ci) {
        x[i]=(int)(coff*scale+0.5);
    }
    xmm1=_mm_loadu_si128((__m128i *)x);
    xmm2=_mm_set1_epi32(len*scale-1);
    xmm3=_mm_set1_epi32(len*scale);
    xmm4=_mm_set1_epi32((int)(ci*4*scale+0.5));

    for (p=rcode;p<rcode+n+2*smax;p+=4) {

        xmm5=_mm_cmpgt_epi32(xmm1,xmm2);
        xmm5=_mm_and_si128(xmm5,xmm3);
        xmm1=_mm_sub_epi32(xmm1,xmm5);
        xmm5=_mm_srai_epi32(xmm1,nbit);
        _mm_storeu_si128((__m128i *)index,xmm5);
        p[0]=code[index[0]];
        p[1]=code[index[1]];
        p[2]=code[index[2]];
        p[3]=code[index[3]];
        xmm1=_mm_add_epi32(xmm1,xmm4);
    }
    coff+=ci*(n+2*smax)-4*ci;
    coff-=floor(coff/len)*len;
    return coff-smax*ci;
#endif
}
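A hypothetical call to rescode (the lengths, offsets, and intervals below are illustrative assumptions, not values from the original):

static double rescode_example(void)
{
    static short code[1023];           /* input code, e.g. 1023-chip PRN */
    static short rcode[4096 + 2 * 4];  /* n + 2*smax output samples */
    /* initial offset 0.5 chip, smax = 4 samples, ci = 0.25 chip, n = 4096 */
    return rescode(code, 1023, 0.5, 4, 0.25, 4096, rcode);
}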
/* constant-time doubling in GF(2^128) */
static __m128i gf128_mul2(const __m128i x)
{
    const __m128i REDPOLY = _mm_set_epi64x(0, 0x87);
    const __m128i ZERO = _mm_setzero_si128();
    __m128i x2;

    /* all-ones in every lane iff the top (127th) bit of x is set */
    __m128i mask = _mm_cmpgt_epi32(ZERO, x);
    mask = _mm_shuffle_epi32(mask, 0xff);

    /* 128-bit left shift by one: shift each 64-bit half, carry the bit
     * crossing the middle via a byte-shifted copy */
    x2 = _mm_or_si128(_mm_slli_epi64(x, 1),
                      _mm_srli_epi64(_mm_slli_si128(x, 8), 63));

    /* conditional reduction by the field polynomial, without branching */
    return _mm_xor_si128(x2, _mm_and_si128(REDPOLY, mask));
}
Example #13
static inline __m128i lighten_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                        const __m128i& sa, const __m128i& da) {
    __m128i sd = _mm_mullo_epi16(sc, da);
    __m128i ds = _mm_mullo_epi16(dc, sa);

    __m128i cmp = _mm_cmpgt_epi32(sd, ds);

    __m128i tmp = _mm_add_epi32(sc, dc);
    __m128i ret1 = _mm_sub_epi32(tmp, SkDiv255Round_SSE2(ds));
    __m128i ret2 = _mm_sub_epi32(tmp, SkDiv255Round_SSE2(sd));
    __m128i ret = _mm_or_si128(_mm_and_si128(cmp, ret1),
                               _mm_andnot_si128(cmp, ret2));
    return ret;
}
Example #14
static inline __m128i clamp_div255round_SSE2(const __m128i& prod) {
    // test if > 0
    __m128i cmp1 = _mm_cmpgt_epi32(prod, _mm_setzero_si128());
    // test if < 255*255
    __m128i cmp2 = _mm_cmplt_epi32(prod, _mm_set1_epi32(255*255));

    __m128i ret = _mm_setzero_si128();

    // if value >= 255*255, value = 255
    ret = _mm_andnot_si128(cmp2,  _mm_set1_epi32(255));

    __m128i div = SkDiv255Round_SSE2(prod);

    // test if > 0 && < 255*255
    __m128i cmp = _mm_and_si128(cmp1, cmp2);

    ret = _mm_or_si128(_mm_and_si128(cmp, div), _mm_andnot_si128(cmp, ret));

    return ret;
}
Example #15
// Portable version overlay_byte() is in SkXfermode.cpp.
static inline __m128i overlay_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                        const __m128i& sa, const __m128i& da) {
    __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da);
    __m128i tmp1 = _mm_mullo_epi16(sc, ida);
    __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa);
    __m128i tmp2 = _mm_mullo_epi16(dc, isa);
    __m128i tmp = _mm_add_epi32(tmp1, tmp2);

    __m128i cmp = _mm_cmpgt_epi32(_mm_slli_epi32(dc, 1), da);
    __m128i rc1 = _mm_slli_epi32(sc, 1);                        // 2 * sc
    rc1 = Multiply32_SSE2(rc1, dc);                             // *dc

    __m128i rc2 = _mm_mullo_epi16(sa, da);                      // sa * da
    __m128i tmp3 = _mm_slli_epi32(_mm_sub_epi32(da, dc), 1);    // 2 * (da - dc)
    tmp3 = Multiply32_SSE2(tmp3, _mm_sub_epi32(sa, sc));        // * (sa - sc)
    rc2 = _mm_sub_epi32(rc2, tmp3);

    __m128i rc = _mm_or_si128(_mm_andnot_si128(cmp, rc1),
                              _mm_and_si128(cmp, rc2));
    return clamp_div255round_SSE2(_mm_add_epi32(rc, tmp));
}
Example #16
static void PredictorSub11_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
    const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    __m128i pa, pb;
    GetSumAbsDiff32_SSE2(&T, &TL, &pa);   // pa = sum |T-TL|
    GetSumAbsDiff32_SSE2(&L, &TL, &pb);   // pb = sum |L-TL|
    {
      const __m128i mask = _mm_cmpgt_epi32(pb, pa);
      const __m128i A = _mm_and_si128(mask, L);
      const __m128i B = _mm_andnot_si128(mask, T);
      const __m128i pred = _mm_or_si128(A, B);    // pred = (L > T)? L : T
      const __m128i res = _mm_sub_epi8(src, pred);
      _mm_storeu_si128((__m128i*)&out[i], res);
    }
  }
  if (i != num_pixels) {
    VP8LPredictorsSub_C[11](in + i, upper + i, num_pixels - i, out + i);
  }
}
Example #17
inline FORCE_INLINE __m128i mm_cvtps_ph(__m128 x)
{
	__m128 magic = _mm_castsi128_ps(_mm_set1_epi32((uint32_t)15 << 23));
	__m128i inf = _mm_set1_epi32((uint32_t)255UL << 23);
	__m128i f16inf = _mm_set1_epi32((uint32_t)31UL << 23);
	__m128i sign_mask = _mm_set1_epi32(0x80000000UL);
	__m128i round_mask = _mm_set1_epi32(~0x0FFFU);

	__m128i ret_0x7E00 = _mm_set1_epi32(0x7E00);
	__m128i ret_0x7C00 = _mm_set1_epi32(0x7C00);

	__m128i f, sign, ge_inf, eq_inf;

	f = _mm_castps_si128(x);
	sign = _mm_and_si128(f, sign_mask);
	f = _mm_xor_si128(f, sign);

	ge_inf = _mm_cmpgt_epi32(f, inf);
	eq_inf = _mm_cmpeq_epi32(f, inf);

	f = _mm_and_si128(f, round_mask);
	f = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(f), magic));
	f = _mm_sub_epi32(f, round_mask);

	f = mm_min_epi32(f, f16inf);
	f = _mm_srli_epi32(f, 13);

	f = mm_blendv_ps(ret_0x7E00, f, ge_inf);
	f = mm_blendv_ps(ret_0x7C00, f, eq_inf);

	sign = _mm_srli_epi32(sign, 16);
	f = _mm_or_si128(f, sign);

	f = mm_packus_epi32(f, _mm_setzero_si128());
	return f;
}
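On F16C-capable CPUs the whole routine above reduces to the native conversion; a minimal sketch, assuming immintrin.h and a compiler flag enabling F16C:

#include <immintrin.h>

static inline __m128i mm_cvtps_ph_native(__m128 x)
{
	/* round-to-nearest, matching the magic-number rounding above */
	return _mm_cvtps_ph(x, _MM_FROUND_TO_NEAREST_INT);
}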
Example #18
static void GF_FUNC_ALIGN VS_CC
proc_16bit_sse2(convolution_t *ch, uint8_t *buff, int bstride, int width,
                int height, int stride, uint8_t *d, const uint8_t *s)
{
    const uint16_t *srcp = (uint16_t *)s;
    uint16_t *dstp = (uint16_t *)d;
    stride /= 2;
    bstride /= 2;

    uint16_t *p0 = (uint16_t *)buff + 8;
    uint16_t *p1 = p0 + bstride;
    uint16_t *p2 = p1 + bstride;
    uint16_t *p3 = p2 + bstride;
    uint16_t *p4 = p3 + bstride;
    uint16_t *orig = p0, *end = p4;

    line_copy16(p0, srcp + 2 * stride, width, 2);
    line_copy16(p1, srcp + stride, width, 2);
    line_copy16(p2, srcp, width, 2);
    srcp += stride;
    line_copy16(p3, srcp, width, 2);

    __m128i zero = _mm_setzero_si128();
    __m128 rdiv = _mm_set1_ps((float)ch->rdiv);
    __m128 bias = _mm_set1_ps((float)ch->bias);
    __m128i max = _mm_set1_epi32(0xFFFF);
    __m128 matrix[25];
    for (int i = 0; i < 25; i++) {
        matrix[i] = _mm_set1_ps((float)ch->m[i]);
    }

    for (int y = 0; y < height; y++) {
        srcp += stride * (y < height - 2 ? 1 : -1);
        line_copy16(p4, srcp, width, 2);
        uint16_t *array[] = {
            p0 - 2, p0 - 1, p0, p0 + 1, p0 + 2,
            p1 - 2, p1 - 1, p1, p1 + 1, p1 + 2,
            p2 - 2, p2 - 1, p2, p2 + 1, p2 + 2,
            p3 - 2, p3 - 1, p3, p3 + 1, p3 + 2,
            p4 - 2, p4 - 1, p4, p4 + 1, p4 + 2
        };
        
        for (int x = 0; x < width; x += 8) {
            __m128 sum[2] = { _mm_setzero_ps(), _mm_setzero_ps() };

            for (int i = 0; i < 25; i++) {
                __m128i xmm0 = _mm_loadu_si128((__m128i *)(array[i] + x));
                __m128 xmm1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(xmm0, zero));
                __m128 xmm2 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(xmm0, zero));
                xmm1 = _mm_mul_ps(xmm1, matrix[i]);
                xmm2 = _mm_mul_ps(xmm2, matrix[i]);
                sum[0] = _mm_add_ps(sum[0], xmm1);
                sum[1] = _mm_add_ps(sum[1], xmm2);
            }

            __m128i sumi[2];
            for (int i = 0; i < 2; i++) {
                sum[i]  = _mm_mul_ps(sum[i], rdiv);
                sum[i]  = _mm_add_ps(sum[i], bias);
                if (!ch->saturate) {
                    sum[i] = mm_abs_ps(sum[i]);
                }
                sumi[i] = _mm_cvtps_epi32(sum[i]);
                sumi[i] = mm_min_epi32(sumi[i], max);
                __m128i mask = _mm_cmpgt_epi32(sumi[i], zero);
                sumi[i] = _mm_and_si128(sumi[i], mask);
            }

            sumi[0] = mm_cast_epi32(sumi[0], sumi[1]);

            _mm_store_si128((__m128i *)(dstp + x), sumi[0]);
        }
        dstp += stride;
        p0 = p1;
        p1 = p2;
        p2 = p3;
        p3 = p4;
        p4 = (p4 == end) ? orig : p4 + bstride;
    }
}
static inline __m128i _mm_max_epi32_rpl(__m128i a, __m128i b) {
    __m128i mask = _mm_cmpgt_epi32(a, b);
    a = _mm_and_si128(a, mask);
    b = _mm_andnot_si128(mask, b);
    return _mm_or_si128(a, b);
}
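The same compare-and-blend construction yields the matching minimum; a sketch under the same SSE2-only assumption:

static inline __m128i _mm_min_epi32_rpl(__m128i a, __m128i b) {
    __m128i mask = _mm_cmpgt_epi32(a, b);   /* lanes where a > b */
    return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a));
}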
Example #20
RETi CMPGT(const __m128i x, const __m128i y) { return _mm_cmpgt_epi32(x, y); }
mlib_status
__mlib_VectorSumAbsDiff_S32_Sat(
    mlib_d64 *z,
    const mlib_s32 *x,
    const mlib_s32 *y,
    mlib_s32 n)
{
    if (n <= 0)
        return (MLIB_FAILURE);

    mlib_s32 i, nstep, ax, ay, n1, n2, n3;
    mlib_s32 *px = (mlib_s32 *)x, *py = (mlib_s32 *)y;
    __m128i zero, xbuf, ybuf, zbuf, xlo, xhi, mext;
    mlib_d64 dsum = 0.0;
    zero = _mm_setzero_si128();
    zbuf = zero;

    nstep = 16 / sizeof (mlib_s32);
    ax = (mlib_addr)x & 15;
    ay = (mlib_addr)y & 15;
    n1 = ((16 - ax) & 15) / sizeof (mlib_s32);
    n2 = (n - n1) / nstep;
    n3 = n - n1 - n2 * nstep;

    if (n2 < 1) {
        for (i = 0; i < n; i++) {
            dsum += mlib_fabs((mlib_d64)(*px++) - (*py++));
        }
        *z = dsum;
    } else {
        for (i = 0; i < n1; i++) {
            dsum += mlib_fabs((mlib_d64)(*px++) - (*py++));
        }
        if (ax == ay) {
            for (i = 0; i < n2; i++) {
                xbuf = _mm_load_si128((__m128i *)px);
                ybuf = _mm_load_si128((__m128i *)py);
                mext = _mm_cmpgt_epi32(ybuf, xbuf);
                xbuf = _mm_sub_epi32(xbuf, ybuf);
                xbuf = _mm_xor_si128(xbuf, mext);
                xbuf = _mm_sub_epi32(xbuf, mext);
                xlo = _mm_unpacklo_epi32(xbuf, zero);
                xhi = _mm_unpackhi_epi32(xbuf, zero);
                zbuf = _mm_add_epi64(zbuf, xlo);
                zbuf = _mm_add_epi64(zbuf, xhi);
                px += nstep;
                py += nstep;
            }
        } else {
            for (i = 0; i < n2; i++) {
                xbuf = _mm_load_si128((__m128i *)px);
                ybuf = _mm_loadu_si128((__m128i *)py);
                mext = _mm_cmpgt_epi32(ybuf, xbuf);
                xbuf = _mm_sub_epi32(xbuf, ybuf);
                xbuf = _mm_xor_si128(xbuf, mext);
                xbuf = _mm_sub_epi32(xbuf, mext);
                xlo = _mm_unpacklo_epi32(xbuf, zero);
                xhi = _mm_unpackhi_epi32(xbuf, zero);
                zbuf = _mm_add_epi64(zbuf, xlo);
                zbuf = _mm_add_epi64(zbuf, xhi);
                px += nstep;
                py += nstep;
            }
        }
        for (i = 0; i < n3; i++) {
            dsum += mlib_fabs((mlib_d64)(*px++) - (*py++));
        }

        long long pz[2];
        _mm_storeu_si128((__m128i *)pz, zbuf);
        dsum += pz[0];
        dsum += pz[1];
        *z = dsum;
    }
    return (MLIB_SUCCESS);
}
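The cmpgt/xor/sub sequence in both inner loops above is a branch-free per-lane absolute difference; isolated as a sketch:

/* |x - y| per 32-bit lane: a compare mask is all-ones (-1), and in two's
 * complement (d ^ -1) - (-1) == -d, so the xor/sub pair negates d exactly
 * in the lanes where y > x and leaves it untouched elsewhere. */
static inline __m128i abs_diff_epi32(__m128i x, __m128i y)
{
    __m128i mask = _mm_cmpgt_epi32(y, x);
    __m128i d = _mm_sub_epi32(x, y);
    return _mm_sub_epi32(_mm_xor_si128(d, mask), mask);
}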
void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
                                int skip_block, const int16_t *zbin_ptr,
                                const int16_t *round_ptr,
                                const int16_t *quant_ptr,
                                const int16_t *quant_shift_ptr,
                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
                                const int16_t *scan, const int16_t *iscan) {
  int i, j, non_zero_regs = (int)count / 4, eob_i = -1;
  __m128i zbins[2];
  __m128i nzbins[2];

  zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1],
                           (int)zbin_ptr[0]);
  zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]);

  nzbins[0] = _mm_setzero_si128();
  nzbins[1] = _mm_setzero_si128();
  nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
  nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);

  (void)scan;

  memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));

  if (!skip_block) {
    // Pre-scan pass
    for (i = ((int)count / 4) - 1; i >= 0; i--) {
      __m128i coeffs, cmp1, cmp2;
      int test;
      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
      cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
      cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
      cmp1 = _mm_and_si128(cmp1, cmp2);
      test = _mm_movemask_epi8(cmp1);
      if (test == 0xffff)
        non_zero_regs--;
      else
        break;
    }

    // Quantization pass:
    for (i = 0; i < non_zero_regs; i++) {
      __m128i coeffs, coeffs_sign, tmp1, tmp2;
      int test;
      int abs_coeff[4];
      int coeff_sign[4];

      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
      coeffs_sign = _mm_srai_epi32(coeffs, 31);
      coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
      tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
      tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
      tmp1 = _mm_or_si128(tmp1, tmp2);
      test = _mm_movemask_epi8(tmp1);
      _mm_storeu_si128((__m128i *)abs_coeff, coeffs);
      _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);

      for (j = 0; j < 4; j++) {
        if (test & (1 << (4 * j))) {
          int k = 4 * i + j;
          const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0];
          const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;
          const uint32_t abs_qcoeff =
              (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
          qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
          dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
          if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
        }
      }
    }
  }
  *eob_ptr = eob_i + 1;
}
Example #23
static inline __m128i softlight_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                          const __m128i& sa, const __m128i& da) {
    __m128i tmp1, tmp2, tmp3;

    // int m = da ? dc * 256 / da : 0;
    __m128i cmp = _mm_cmpeq_epi32(da, _mm_setzero_si128());
    __m128i m = _mm_slli_epi32(dc, 8);
    __m128 x = _mm_cvtepi32_ps(m);
    __m128 y = _mm_cvtepi32_ps(da);
    m = _mm_cvttps_epi32(_mm_div_ps(x, y));
    m = _mm_andnot_si128(cmp, m);

    // if (2 * sc <= sa)
    tmp1 = _mm_slli_epi32(sc, 1);                      // 2 * sc
    __m128i cmp1 = _mm_cmpgt_epi32(tmp1, sa);
    tmp1 = _mm_sub_epi32(tmp1, sa);                    // 2 * sc - sa
    tmp2 = _mm_sub_epi32(_mm_set1_epi32(256), m);      // 256 - m
    tmp1 = Multiply32_SSE2(tmp1, tmp2);
    tmp1 = _mm_srai_epi32(tmp1, 8);
    tmp1 = _mm_add_epi32(sa, tmp1);
    tmp1 = Multiply32_SSE2(dc, tmp1);
    __m128i rc1 = _mm_andnot_si128(cmp1, tmp1);

    // else if (4 * dc <= da)
    tmp2 = _mm_slli_epi32(dc, 2);                      // dc * 4
    __m128i cmp2 = _mm_cmpgt_epi32(tmp2, da);
    __m128i i = _mm_slli_epi32(m, 2);                  // 4 * m
    __m128i j = _mm_add_epi32(i, _mm_set1_epi32(256)); // 4 * m + 256
    __m128i k = Multiply32_SSE2(i, j);                 // 4 * m * (4 * m + 256)
    __m128i t = _mm_sub_epi32(m, _mm_set1_epi32(256)); // m - 256
    i = Multiply32_SSE2(k, t);                         // 4 * m * (4 * m + 256) * (m - 256)
    i = _mm_srai_epi32(i, 16);                         // >> 16
    j = Multiply32_SSE2(_mm_set1_epi32(7), m);         // 7 * m
    tmp2 = _mm_add_epi32(i, j);
    i = Multiply32_SSE2(dc, sa);                       // dc * sa
    j = _mm_slli_epi32(sc, 1);                         // 2 * sc
    j = _mm_sub_epi32(j, sa);                          // 2 * sc - sa
    j = Multiply32_SSE2(da, j);                        // da * (2 * sc - sa)
    tmp2 = Multiply32_SSE2(j, tmp2);                   // * tmp
    tmp2 = _mm_srai_epi32(tmp2, 8);                    // >> 8
    tmp2 = _mm_add_epi32(i, tmp2);
    cmp = _mm_andnot_si128(cmp2, cmp1);
    __m128i rc2 = _mm_and_si128(cmp, tmp2);
    __m128i rc = _mm_or_si128(rc1, rc2);

    // else
    tmp3 = sqrt_unit_byte_SSE2(m);
    tmp3 = _mm_sub_epi32(tmp3, m);
    tmp3 = Multiply32_SSE2(j, tmp3);                   // j = da * (2 * sc - sa)
    tmp3 = _mm_srai_epi32(tmp3, 8);
    tmp3 = _mm_add_epi32(i, tmp3);                     // i = dc * sa
    cmp = _mm_and_si128(cmp1, cmp2);
    __m128i rc3 = _mm_and_si128(cmp, tmp3);
    rc = _mm_or_si128(rc, rc3);

    tmp1 = _mm_sub_epi32(_mm_set1_epi32(255), da);     // 255 - da
    tmp1 = _mm_mullo_epi16(sc, tmp1);
    tmp2 = _mm_sub_epi32(_mm_set1_epi32(255), sa);     // 255 - sa
    tmp2 = _mm_mullo_epi16(dc, tmp2);
    rc = _mm_add_epi32(rc, tmp1);
    rc = _mm_add_epi32(rc, tmp2);
    return clamp_div255round_SSE2(rc);
}
Example #24
int sse_auction_search(int *pr, int *P, int *ai0, int *ai1, int *a0, int *a1, int nodes, int arcs, int s, int t)
{
	int i __attribute__ ((aligned (16))) = 0;	
	int j __attribute__ ((aligned (16))) = t;
	int k __attribute__ ((aligned (16))) = 0;
	int m __attribute__ ((aligned (16))) = 0;	
	int maxla __attribute__ ((aligned (32))) = 0;
	int argmaxla __attribute__ ((aligned (16))) = 0;
	int cost __attribute__ ((aligned (16))) = 0;
	int length __attribute__ ((aligned (16))) = 1;
	int path_cost __attribute__ ((aligned (16))) = 0;
	
	int32_t tmp1, tmp2; /* signed: compared against -1 and maxla below */
	int cost_tab[nodes+1];

	__m128i a0sse, a1sse, ai0sse, ai1sse, ai1sse1, I, J, K, M, then;
	__m128i ARCS, MNODES, INFINITE, NEGINF, prsse, Psse, MAXLA, ARGMAXLA, LA, mask1, mask2, mask3, COST;
			
	for(i = 0; i <= nodes; i++) {
		cost_tab[i] = 0;
	}

	if(check_s_t(s, t, P, nodes) != 0) {
		return 1;
	}

	while(P[s] == INF) {
		k = -1;	
		m = -1;

		//printf("j = %d\n", j);

		J = _mm_set1_epi32(j);			// current value of j
		K = _mm_set1_epi32(-1);			// start index into the edge-cost array
		M = _mm_set1_epi32(-1);			// end index into the edge-cost array
		MNODES = _mm_set1_epi32(nodes-1);	// node count minus 1 (to detect the end of the array)
		ARCS = _mm_set1_epi32(arcs);		// number of edges
	
		/* compute k, m */
		for(i = 0; i < nodes; i+=4) {
			ai0sse = _mm_load_si128((__m128i*) &ai0[i]);	// load ai0 (node numbers)
			ai1sse = _mm_load_si128((__m128i*) &ai1[i]);	// load ai1 (indices into the edge array)
			ai1sse1 = _mm_set_epi32(ai1[i+4],ai1[i+3],ai1[i+2],ai1[i+1]);	// load ai1 indices shifted by 1
			mask1 = _mm_cmpeq_epi32(J, ai0sse);				// test j == ai0[i]
			K = _mm_or_si128(_mm_and_si128(mask1,ai1sse), _mm_andnot_si128(mask1,K));	// update K
			I = _mm_set_epi32(i+3, i+2, i+1, i);						// current values of i
			mask2 = _mm_cmplt_epi32(I, MNODES);				// test i < nodes-1
			mask3 = _mm_and_si128(mask1,mask2);				// AND of conditions 1 and 2
			then = _mm_or_si128(_mm_and_si128(mask2,ai1sse1), _mm_andnot_si128(mask2,ARCS));	// m = ai1[i+1] or arcs
			M = _mm_or_si128(_mm_and_si128(mask3,then), _mm_andnot_si128(mask3,M));		// update M
		}
	
		for(i = 0; i < nodes; i++) {
			if(ai0[i] == j) {
				k = ai1[i];		// k - start index of the edges leaving j
				//printf("i = %d ", i);
				if(i < nodes - 1) {
					m = ai1[i+1];
				}
				else {
					m = arcs;
				}
			}
		}


		/* store k, m */
		for(i = 0; i < 4; i++) {
			tmp1 = get_from_m128i(K,i);
			tmp2 = get_from_m128i(M,i);
			if(tmp1 != -1) {
				k = tmp1;
			}
			if(tmp2 != -1) {
				m = tmp2;
			}
		}
		//printf("K,M: %d %d\n", k, m);
		
		/* choose the optimal edge */
		if(k != -1) {		
			INFINITE = _mm_set1_epi32(INF);		// "infinity" value
			NEGINF = _mm_set1_epi32(0-INF);		// value -INF
			COST = _mm_set1_epi32(cost);		// cost of the chosen edge
			MAXLA = _mm_set1_epi32(0-INF);		// maximum value of la = pr[a0[i]] - a1[i]
			ARGMAXLA = _mm_set1_epi32(-1);		// index with the largest la
			for(i = k; i < m; i+=4) {
				a1sse = _mm_set_epi32(a1[i],a1[i+1],a1[i+2],a1[i+3]);				// load a1
				a0sse = _mm_set_epi32(a0[i],a0[i+1],a0[i+2],a0[i+3]);				// load a0
				prsse = _mm_set_epi32(pr[a0[i]],pr[a0[i+1]],pr[a0[i+2]],pr[a0[i+3]]);		// load pr
				Psse = _mm_set_epi32(P[a0[i]],P[a0[i+1]],P[a0[i+2]],P[a0[i+3]]);		// load P
				mask1 = _mm_cmpgt_epi32(_mm_set1_epi32(m),_mm_set_epi32(i,i+1,i+2,i+3));	// is this the last pass
				prsse = _mm_or_si128(_mm_and_si128(mask1,prsse), _mm_andnot_si128(mask1,NEGINF));	// mask off out-of-range lanes
				LA = _mm_sub_epi32(prsse, a1sse);		// la = pr[a0[i]] - a1[i]
				then = _mm_max_epi32(LA,MAXLA);			// maximum of la and maxla
				mask1 = _mm_cmpeq_epi32(Psse,INFINITE);		// is P[i] == INF
				mask2 = _mm_and_si128(mask1,_mm_cmpgt_epi32(LA,MAXLA));		// P[i] == INF and LA > MAXLA
				MAXLA = _mm_or_si128(_mm_and_si128(mask1,then), _mm_andnot_si128(mask1,MAXLA));		// update maxla
				ARGMAXLA = _mm_or_si128(_mm_and_si128(mask2,a0sse), _mm_andnot_si128(mask2,ARGMAXLA));	// update argmaxla
				COST = _mm_or_si128(_mm_and_si128(mask2,a1sse), _mm_andnot_si128(mask2,COST));		// update cost
			}
		}
	
		/* store maxla, argmaxla, cost */
		maxla = 0 - INF;
		for(i = 0; i < 4; i++) {
			tmp1 = get_from_m128i(MAXLA,i);
			if(tmp1 > maxla) {
				argmaxla = get_from_m128i(ARGMAXLA,i);
				maxla = tmp1;
				cost = get_from_m128i(COST,i);
			}
		}
		//printf("COST: %d, PATH_COST: %d\n", cost, path_cost);
		//printf("pr[j] = %d, maxla = %d, argmaxla = %d\n", pr[j], maxla, argmaxla);

		/* shorten the path */
		if(pr[j] > maxla || maxla == -INF) {
			
			/* update the price */
			pr[j] = maxla;

			/* a single-element path is not shortened */
			if(j != t) {

				/* update the path */
				P[j] = INF;
				length = length - 1;
				path_cost = path_cost - cost_tab[length];
				cost_tab[length] = 0;
			
				/* go back to the previous vertex on the path (j); k is cut off */
				k = j;
				for(i = 0; i < nodes; i++) {
					if(P[i] == length - 1) {
						j = i;
						break;
					}
				}
			}
		}
		/* extend the path */
		else {
			P[argmaxla] = length;
			j = argmaxla;
			path_cost = path_cost + cost;
			cost_tab[length] = cost;
			length = length + 1;

			/* the path reached the start vertex => done */
			if(argmaxla == s)
			{
				printf("dlugosc sciezki: %d\n", path_cost);
				return 0;
			}
		}
	}
	return 0;
}
static void GF_FUNC_ALIGN VS_CC
proc_16bit_sse2(convolution_hv_t *ch, uint8_t *buff, int bstride, int width,
                int height, int stride, uint8_t *d, const uint8_t *s)
{
    const uint16_t *srcp = (uint16_t *)s;
    uint16_t *dstp = (uint16_t *)d;
    stride /= 2;
    bstride /= 2;

    uint16_t *p0 = (uint16_t *)buff + 8;
    uint16_t *p1 = p0 + bstride;
    uint16_t *p2 = p1 + bstride;
    uint16_t *p3 = p2 + bstride;
    uint16_t *p4 = p3 + bstride;
    uint16_t *orig = p0, *end = p4;

    line_copy16(p0, srcp + 2 * stride, width, 2);
    line_copy16(p1, srcp + stride, width, 2);
    line_copy16(p2, srcp, width, 2);
    srcp += stride;
    line_copy16(p3, srcp, width, 2);

    __m128i zero = _mm_setzero_si128();
    __m128i all1 = _mm_cmpeq_epi32(zero, zero);
    __m128i one = _mm_srli_epi32(all1, 31);
    __m128 rdiv_h = _mm_set1_ps((float)ch->rdiv_h);
    __m128 rdiv_v = _mm_set1_ps((float)ch->rdiv_v);
    __m128 bias = _mm_set1_ps((float)ch->bias);

    __m128i matrix_h[5];
    __m128i matrix_v[5];
    int sign_h[5];
    int sign_v[5];
    for (int i = 0; i < 5; i++) {
        sign_h[i] = ch->m_h[i] < 0 ? 1 : 0;
        sign_v[i] = ch->m_v[i] < 0 ? 1 : 0;
        uint16_t val = sign_h[i] ? (uint16_t)(ch->m_h[i] * -1) : (uint16_t)ch->m_h[i];
        matrix_h[i] = _mm_set1_epi16((int16_t)val);
        val = sign_v[i] ? (uint16_t)(ch->m_v[i] * -1) : (uint16_t)ch->m_v[i];
        matrix_v[i] = _mm_set1_epi16((int16_t)val);
    }

    for (int y = 0; y < height; y++) {
        srcp += stride * (y < height - 2 ? 1 : -1);
        line_copy16(p4, srcp, width, 2);

        for (int x = 0; x < width; x += 8) {
            uint16_t *array[] = {
                p0 + x, p1 + x, p2 + x, p3 + x, p4 + x,
                p2 + x - 2, p2 + x - 1, dstp + x, p2 + x + 1, p2 + x + 2
            };

            for (int j = 0; j < 2; j++) {
                __m128i *matrix = j == 0 ? matrix_v : matrix_h;
                int *sign = j == 0 ? sign_v : sign_h;
                __m128 rdiv = j == 0 ? rdiv_v : rdiv_h;
                __m128i sum[2];
                sum[0] = _mm_setzero_si128();
                sum[1] = _mm_setzero_si128();

                for (int i = 0; i < 5; i++) {
                    __m128i xmm0, xmm1, xmm2;

                    xmm0 = _mm_loadu_si128((__m128i *)array[i + j * 5]);

                    xmm1 = _mm_mullo_epi16(xmm0, matrix[i]);
                    xmm0 = _mm_mulhi_epu16(xmm0, matrix[i]);
                    xmm2 = _mm_unpacklo_epi16(xmm1, xmm0);
                    xmm0 = _mm_unpackhi_epi16(xmm1, xmm0);

                    if (sign[i]) {
                        xmm2 = _mm_add_epi32(one, _mm_xor_si128(xmm2, all1));
                        xmm0 = _mm_add_epi32(one, _mm_xor_si128(xmm0, all1));
                    }
                    sum[0] = _mm_add_epi32(sum[0], xmm2);
                    sum[1] = _mm_add_epi32(sum[1], xmm0);
                }

                for (int i = 0; i < 2; i++) {
                    __m128 sumfp;
                    __m128i mask, temp;
                    sumfp = _mm_cvtepi32_ps(sum[i]);
                    sumfp = _mm_mul_ps(sumfp, rdiv);
                    if (j == 1) {
                        sumfp = _mm_add_ps(sumfp, bias);
                    }
                    sum[i] = _mm_cvttps_epi32(sumfp);

                    temp = _mm_srli_epi32(all1, 16);
                    mask = _mm_cmplt_epi32(sum[i], temp);
                    sum[i] = _mm_or_si128(_mm_and_si128(sum[i], mask),
                                          _mm_andnot_si128(mask, temp));
                    mask = _mm_cmpgt_epi32(sum[i], zero);
                    if (ch->saturate) {
                        sum[i] = _mm_and_si128(mask, sum[i]);
                    } else {
                        temp = _mm_add_epi32(one, _mm_xor_si128(sum[i], all1));
                        sum[i] = _mm_or_si128(_mm_and_si128(mask, sum[i]),
                                              _mm_andnot_si128(mask, temp));
                    }
                }

                sum[0] = mm_cast_epi32(sum[0], sum[1]);

                _mm_store_si128((__m128i *)(dstp + x), sum[0]);
            }
        }
        dstp += stride;
        p0 = p1;
        p1 = p2;
        p2 = p3;
        p3 = p4;
        p4 = (p4 == end) ? orig : p4 + bstride;
    }
}
Example #26
void spu_interpreter::CG(SPUThread& CPU, spu_opcode_t op)
{
	const auto a = _mm_xor_si128(CPU.GPR[op.ra].vi, _mm_set1_epi32(0x7fffffff));
	const auto b = _mm_xor_si128(CPU.GPR[op.rb].vi, _mm_set1_epi32(0x80000000));
	CPU.GPR[op.rt].vi = _mm_srli_epi32(_mm_cmpgt_epi32(b, a), 31);
}
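A scalar sketch of the bias trick CG uses above: the carry-out of a + b equals (b > ~a) as an unsigned compare, and XOR-ing the operands with 0x7fffffff and 0x80000000 turns that into the signed compare that pcmpgtd implements:

#include <stdint.h>

static inline uint32_t carry_out(uint32_t a, uint32_t b)
{
	/* carry iff a + b >= 2^32, i.e. b >= (2^32 - a) = ~a + 1, i.e. b > ~a */
	return (uint32_t)(b > (uint32_t)~a);
}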
Example #27
__m128i test_mm_cmpgt_epi32(__m128i A, __m128i B) {
  // CHECK-LABEL: test_mm_cmpgt_epi32
  // CHECK: icmp sgt <4 x i32>
  return _mm_cmpgt_epi32(A, B);
}