Example 1
static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    LOAD_ZERO;
    vec_s16 *pv1 = (vec_s16*)v1;
    vec_s16 *pv2 = (vec_s16*)v2;
    vec_s16 *pv3 = (vec_s16*)v3;
    register vec_s16 muls = {mul,mul,mul,mul,mul,mul,mul,mul};
    register vec_s16 t0, t1, i0, i1;
    register vec_s16 i2 = pv2[0], i3 = pv3[0];
    register vec_s32 res = zero_s32v;
    register vec_u8 align = vec_lvsl(0, v2);
    int32_t ires;
    order >>= 4;
    do {
        t0 = vec_perm(i2, pv2[1], align);
        i2 = pv2[2];
        t1 = vec_perm(pv2[1], i2, align);
        i0 = pv1[0];
        i1 = pv1[1];
        res = vec_msum(t0, i0, res);
        res = vec_msum(t1, i1, res);
        t0 = vec_perm(i3, pv3[1], align);
        i3 = pv3[2];
        t1 = vec_perm(pv3[1], i3, align);
        pv1[0] = vec_mladd(t0, muls, i0);
        pv1[1] = vec_mladd(t1, muls, i1);
        pv1 += 2;
        pv2 += 2;
        pv3 += 2;
    } while(--order);
    res = vec_splat(vec_sums(res, zero_s32v), 3);
    vec_ste(res, 0, &ires);
    return ires;
}
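For reference, a minimal scalar sketch of what this routine computes (my reading of the vector code above, assuming order is a multiple of 16 as the loop requires): it returns the dot product of v1 and v2 while replacing v1 with v1 + mul * v3 in place. The _ref name is illustrative, not from the source.

/* Scalar sketch of the routine above: dot product of v1 and v2, with
 * v1[i] updated to v1[i] + mul * v3[i] as a side effect. */
static int32_t scalarproduct_and_madd_int16_ref(int16_t *v1, const int16_t *v2,
                                                const int16_t *v3, int order, int mul)
{
    int32_t res = 0;
    for (int i = 0; i < order; i++) {
        res   += v1[i] * v2[i];
        v1[i] += mul * v3[i];
    }
    return res;
}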
Example 2
void rgbaint_t::blend(const rgbaint_t& other, UINT8 factor)
{
	const VECU32 shift = vec_splat_u32(-16);
	const VECS32 scale1 = { factor, factor, factor, factor };
	const VECS32 scale2 = { 0x100 - factor, 0x100 - factor, 0x100 - factor, 0x100 - factor, };

	VECU32 temp = vec_msum((VECU16)m_value, (VECU16)vec_rl(scale1, shift), vec_splat_u32(0));
	temp = vec_msum((VECU16)other.m_value, (VECU16)vec_rl(scale2, shift), temp);

	m_value = vec_msum((VECU16)m_value, (VECU16)scale1, vec_mulo((VECU16)other.m_value, (VECU16)scale2));
	m_value = vec_add(vec_sl(temp, shift), (VECU32)m_value);
	sra(8);
}
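A scalar sketch of the arithmetic this method performs, assuming each 32-bit lane of m_value holds one colour channel (the helper name below is illustrative, not part of the MAME API):

/* Per-channel blend: a[i] = (a[i] * factor + b[i] * (0x100 - factor)) >> 8. */
static void blend_scalar(int32_t a[4], const int32_t b[4], uint8_t factor)
{
    for (int i = 0; i < 4; i++)
        a[i] = (a[i] * factor + b[i] * (0x100 - factor)) >> 8;
}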
Example 3
 template <bool align> SIMD_INLINE void AddSquareDifference(const uint8_t * src, ptrdiff_t step, const v128_u8 & mask, v128_u32 & sum)
 {
     const v128_u8 a = Load<align>(src - step);
     const v128_u8 b = Load<align>(src + step);
     const v128_u8 d = vec_and(AbsDifferenceU8(a, b), mask);
     sum = vec_msum(d, d, sum);
 }
Example 4
        template <bool align, SimdCompareType compareType> void ConditionalCount8u(const uint8_t * src, size_t stride, size_t width, size_t height, uint8_t value, uint32_t * count)
        {
            assert(width >= A);
            if (align)
                assert(Aligned(src) && Aligned(stride));

            size_t alignedWidth = AlignLo(width, QA);
            size_t bodyWidth = AlignLo(width, A);
            v128_u8 tailMask = ShiftLeft(K8_01, A - width + alignedWidth);
            v128_u8 _value = SIMD_VEC_SET1_EPI8(value);
            v128_u32 counts[4] = { K32_00000000, K32_00000000, K32_00000000, K32_00000000 };
            for (size_t row = 0; row < height; ++row)
            {
                size_t col = 0;
                for (; col < alignedWidth; col += QA)
                {
                    ConditionalCount8u<align, compareType>(src, col, _value, counts[0]);
                    ConditionalCount8u<align, compareType>(src, col + A, _value, counts[1]);
                    ConditionalCount8u<align, compareType>(src, col + 2 * A, _value, counts[2]);
                    ConditionalCount8u<align, compareType>(src, col + 3 * A, _value, counts[3]);
                }
                for (; col < bodyWidth; col += A)
                    ConditionalCount8u<align, compareType>(src, col, _value, counts[0]);
                if (alignedWidth != width)
                {
                    const v128_u8 mask = vec_and(Compare8u<compareType>(Load<false>(src + width - A), _value), tailMask);
                    counts[0] = vec_msum(mask, K8_01, counts[0]);
                }
                src += stride;
            }
            counts[0] = vec_add(vec_add(counts[0], counts[1]), vec_add(counts[2], counts[3]));
            *count = ExtractSum(counts[0]);
        }
Example 5
static int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i, s = 0;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix);
    vector unsigned int sv = (vector unsigned int) vec_splat_u32(0);
    vector signed int sum;

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned pixels. */
        vector unsigned char pixl = vec_ld(0,  pix);
        vector unsigned char pixr = vec_ld(15, pix);
        vector unsigned char pixv = vec_perm(pixl, pixr, perm);

        /* Square the values, and add them to our sum. */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s. */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}
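The scalar equivalent, for comparison (a sketch of the same sum of squares over a 16x16 block; the _ref name is illustrative):

/* Scalar sketch of pix_norm1: sum of squares of a 16x16 block of bytes. */
static int pix_norm1_ref(const uint8_t *pix, int line_size)
{
    int s = 0;
    for (int i = 0; i < 16; i++) {
        for (int j = 0; j < 16; j++)
            s += pix[j] * pix[j];
        pix += line_size;
    }
    return s;
}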
Example 6
int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char *tv;
    vector unsigned char pixv;
    vector unsigned int sv;
    vector signed int sum;
    
    sv = (vector unsigned int)vec_splat_u32(0);
    
    s = 0;
    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels */
        tv = (vector unsigned char *) pix;
        pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));

        /* Square the values, and add them to our sum */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}
Example 7
        template <bool align, SimdCompareType compareType> void ConditionalCount16i(const uint8_t * src, size_t stride, size_t width, size_t height, int16_t value, uint32_t * count)
        {
            assert(width >= HA);
            if (align)
                assert(Aligned(src) && Aligned(stride));

            size_t alignedWidth = AlignLo(width, DA);
            size_t bodyWidth = Simd::AlignLo(width, HA);
            v128_u16 tailMask = ShiftLeft(K16_0001, HA - width + alignedWidth);
            v128_s16 _value = SIMD_VEC_SET1_EPI16(value);
            v128_u32 counts[4] = { K32_00000000, K32_00000000, K32_00000000, K32_00000000 };
            for (size_t row = 0; row < height; ++row)
            {
                const int16_t * s = (const int16_t *)src;
                size_t col = 0;
                for (; col < alignedWidth; col += DA)
                {
                    ConditionalCount16i<align, compareType>(s, col, _value, counts[0]);
                    ConditionalCount16i<align, compareType>(s, col + HA, _value, counts[1]);
                    ConditionalCount16i<align, compareType>(s, col + 2 * HA, _value, counts[2]);
                    ConditionalCount16i<align, compareType>(s, col + 3 * HA, _value, counts[3]);
                }
                for (; col < bodyWidth; col += HA)
                    ConditionalCount16i<align, compareType>(s, col, _value, counts[0]);
                if (alignedWidth != width)
                {
                    const v128_u16 mask = vec_and((v128_u16)Compare16i<compareType>(Load<false>(s + width - HA), _value), tailMask);
                    counts[0] = vec_msum(mask, K16_0001, counts[0]);
                }
                src += stride;
            }
            counts[0] = vec_add(vec_add(counts[0], counts[1]), vec_add(counts[2], counts[3]));
            *count = ExtractSum(counts[0]);
        }
Example 8
static int32_t scalarproduct_int16_altivec(const int16_t * v1, const int16_t * v2, int order, const int shift)
{
    int i;
    LOAD_ZERO;
    register vec_s16 vec1, *pv;
    register vec_s32 res = vec_splat_s32(0), t;
    register vec_u32 shifts;
    int32_t ires;

    shifts = zero_u32v;
    if(shift & 0x10) shifts = vec_add(shifts, vec_sl(vec_splat_u32(0x08), vec_splat_u32(0x1)));
    if(shift & 0x08) shifts = vec_add(shifts, vec_splat_u32(0x08));
    if(shift & 0x04) shifts = vec_add(shifts, vec_splat_u32(0x04));
    if(shift & 0x02) shifts = vec_add(shifts, vec_splat_u32(0x02));
    if(shift & 0x01) shifts = vec_add(shifts, vec_splat_u32(0x01));

    for(i = 0; i < order; i += 8){
        pv = (vec_s16*)v1;
        vec1 = vec_perm(pv[0], pv[1], vec_lvsl(0, v1));
        t = vec_msum(vec1, vec_ld(0, v2), zero_s32v);
        t = vec_sr(t, shifts);
        res = vec_sums(t, res);
        v1 += 8;
        v2 += 8;
    }
    res = vec_splat(res, 3);
    vec_ste(res, 0, &ires);
    return ires;
}
Example 9
static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
                                     int size)
{
    int i, size16 = size >> 4;
    vector signed char vpix1;
    vector signed short vpix2, vdiff, vpix1l, vpix1h;
    union {
        vector signed int vscore;
        int32_t score[4];
    } u = { .vscore = vec_splat_s32(0) };

    while (size16) {
        // score += (pix1[i] - pix2[i]) * (pix1[i] - pix2[i]);
        // load pix1 and the first batch of pix2

        vpix1 = vec_unaligned_load(pix1);
        vpix2 = vec_unaligned_load(pix2);
        pix2 += 8;
        // unpack
        vpix1h = vec_unpackh(vpix1);
        vdiff  = vec_sub(vpix1h, vpix2);
        vpix1l = vec_unpackl(vpix1);
        // load another batch from pix2
        vpix2    = vec_unaligned_load(pix2);
        u.vscore = vec_msum(vdiff, vdiff, u.vscore);
        vdiff    = vec_sub(vpix1l, vpix2);
        u.vscore = vec_msum(vdiff, vdiff, u.vscore);
        pix1    += 16;
        pix2    += 8;
        size16--;
    }
    u.vscore = vec_sums(u.vscore, vec_splat_s32(0));

    size %= 16;
    for (i = 0; i < size; i++)
        u.score[3] += (pix1[i] - pix2[i]) * (pix1[i] - pix2[i]);

    return u.score[3];
}
#endif /* HAVE_ALTIVEC */

av_cold void ff_svq1enc_init_ppc(SVQ1EncContext *c)
{
#if HAVE_ALTIVEC
    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_altivec;
#endif /* HAVE_ALTIVEC */
}
Example 10
void rgbaint_t::blend(const rgbaint_t& other, UINT8 factor)
{
	const VECU32 shift = vec_splat_u32(-16);
	const VECS32 scale1 = { factor, factor, factor, factor };
	const VECS32 scale2 = { 0x100 - factor, 0x100 - factor, 0x100 - factor, 0x100 - factor, };

	VECU32 temp = vec_msum(VECU16(m_value), VECU16(vec_rl(scale1, shift)), vec_splat_u32(0));
	temp = vec_msum(VECU16(other.m_value), VECU16(vec_rl(scale2, shift)), temp);

#if defined __LITTLE_ENDIAN__
	m_value = VECS32(vec_msum(VECU16(m_value), VECU16(scale1), vec_mule(VECU16(other.m_value), VECU16(scale2))));
#else
	m_value = VECS32(vec_msum(VECU16(m_value), VECU16(scale1), vec_mulo(VECU16(other.m_value), VECU16(scale2))));
#endif
	m_value = VECS32(vec_add(vec_sl(temp, shift), VECU32(m_value)));
	sra_imm(8);
}
Example 11
static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
                                     int size) {
    int i, size16;
    vector signed char vpix1;
    vector signed short vpix2, vdiff, vpix1l,vpix1h;
    union { vector signed int vscore;
            int32_t score[4];
          } u;
    u.vscore = vec_splat_s32(0);
//
//XXX lazy way, fix it later

#define vec_unaligned_load(b) \
    vec_perm(vec_ld(0,b),vec_ld(15,b),vec_lvsl(0, b))

    size16 = size >> 4;
    while(size16) {
//        score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
        //load pix1 and the first batch of pix2

        vpix1 = vec_unaligned_load(pix1);
        vpix2 = vec_unaligned_load(pix2);
        pix2 += 8;
        //unpack
        vpix1h = vec_unpackh(vpix1);
        vdiff  = vec_sub(vpix1h, vpix2);
        vpix1l = vec_unpackl(vpix1);
        // load another batch from pix2
        vpix2 = vec_unaligned_load(pix2);
        u.vscore = vec_msum(vdiff, vdiff, u.vscore);
        vdiff  = vec_sub(vpix1l, vpix2);
        u.vscore = vec_msum(vdiff, vdiff, u.vscore);
        pix1 += 16;
        pix2 += 8;
        size16--;
    }
    u.vscore = vec_sums(u.vscore, vec_splat_s32(0));

    size %= 16;
    for (i = 0; i < size; i++) {
        u.score[3] += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
    }
    return u.score[3];
}
Example 12
/**
 * Sum of Squared Errors for an 8x8 block.
 * AltiVec-enhanced.
 * It's the pix_abs8x8_altivec code above w/ squaring added.
 */
int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3,t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;
    
    sum = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);

    
    for (i = 0; i < 8; i++) {
        /* Read potentially unaligned pixels into t1 and t2.
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /*
          Since we want to use unsigned chars, we can take advantage
          of the fact that abs(a-b)^2 = (a-b)^2.
        */
        
        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);
        
        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);
        
        pix1 += line_size;
        pix2 += line_size;
    }
    
    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);
    
    return s;
}
Example 13
static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
                                                    const int16_t *v2,
                                                    const int16_t *v3,
                                                    int order, int mul)
{
    LOAD_ZERO;
    vec_s16 *pv1 = (vec_s16 *) v1;
    register vec_s16 muls = { mul, mul, mul, mul, mul, mul, mul, mul };
    register vec_s16 t0, t1, i0, i1, i4, i2, i3;
    register vec_s32 res = zero_s32v;
#if HAVE_BIGENDIAN
    register vec_u8 align = vec_lvsl(0, v2);
    i2 = vec_ld(0, v2);
    i3 = vec_ld(0, v3);
#endif
    int32_t ires;

    order >>= 4;
    do {
        GET_T(t0,t1,v2,i1,i2);
        i0     = pv1[0];
        i1     = pv1[1];
        res    = vec_msum(t0, i0, res);
        res    = vec_msum(t1, i1, res);
        GET_T(t0,t1,v3,i4,i3);
        pv1[0] = vec_mladd(t0, muls, i0);
        pv1[1] = vec_mladd(t1, muls, i1);
        pv1   += 2;
        v2    += 16;
        v3    += 16;
    } while (--order);
    res = vec_splat(vec_sums(res, zero_s32v), 3);
    vec_ste(res, 0, &ires);

    return ires;
}
Example 14
static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
                                           int order)
{
    int i;
    LOAD_ZERO;
    register vec_s16 vec1;
    register vec_s32 res = vec_splat_s32(0), t;
    int32_t ires;

    for(i = 0; i < order; i += 8){
        vec1 = vec_unaligned_load(v1);
        t = vec_msum(vec1, vec_ld(0, v2), zero_s32v);
        res = vec_sums(t, res);
        v1 += 8;
        v2 += 8;
    }
    res = vec_splat(res, 3);
    vec_ste(res, 0, &ires);
    return ires;
}
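For comparison, a scalar sketch of the same dot product (assuming, as the vector loop does, that order is a multiple of 8; the _ref name is illustrative):

/* Scalar sketch: plain dot product of two int16 arrays. */
static int32_t scalarproduct_int16_ref(const int16_t *v1, const int16_t *v2, int order)
{
    int32_t res = 0;
    for (int i = 0; i < order; i++)
        res += v1[i] * v2[i];
    return res;
}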
Example 15
signed int
scalarproduct_int16_vsx (const signed short *v1, const signed short *v2,
			 int order)
{
  int i;
  LOAD_ZERO;
  register vec_s16 vec1;
  register vec_s32 res = vec_splat_s32 (0), t;
  signed int ires;

  for (i = 0; i < order; i += 8) {
    vec1 = vec_vsx_ld (0, v1);
    t    = vec_msum (vec1, vec_vsx_ld (0, v2), zero_s32v);
    res  = vec_sums (t, res);
    v1  += 8;
    v2  += 8;
  }
  res = vec_splat (res, 3);
  vec_ste (res, 0, &ires);

  return ires;
}
Example 16
static int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i;
    int s;
    __vector zero = __vzero();
/*
    vector unsigned char *tv;
    vector unsigned char pixv;
    vector unsigned int sv;
    vector signed int sum;	
*/

    __vector *tv;
    __vector pixv;
    __vector sv;
    __vector sum;

    sv = __vzero();

    s = 0;
    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels */
        //tv = (vector unsigned char *) pix;
        tv = (__vector*) pix;
        //pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));
        pixv = __vperm(tv[0], tv[1], __lvsl(pix,0));

        /* Square the values, and add them to our sum */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    
    vec_ste(sum, 0, &s);

    return s;
}
Example 17
static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
                                           int order)
{
    int i;
    LOAD_ZERO;
    const vec_s16 *pv;
    register vec_s16 vec1;
    register vec_s32 res = vec_splat_s32(0), t;
    int32_t ires;

    for(i = 0; i < order; i += 8){
        pv = (const vec_s16*)v1;
        vec1 = vec_perm(pv[0], pv[1], vec_lvsl(0, v1));
        t = vec_msum(vec1, vec_ld(0, v2), zero_s32v);
        res = vec_sums(t, res);
        v1 += 8;
        v2 += 8;
    }
    res = vec_splat(res, 3);
    vec_ste(res, 0, &ires);
    return ires;
}
Example 18
        template <bool align, SimdCompareType compareType> void ConditionalSquareSum(const uint8_t * src, size_t srcStride, size_t width, size_t height,
            const uint8_t * mask, size_t maskStride, uint8_t value, uint64_t * sum)
        {
            assert(width >= A);
            if (align)
                assert(Aligned(src) && Aligned(srcStride) && Aligned(mask) && Aligned(maskStride));

            size_t alignedWidth = AlignLo(width, QA);
            size_t bodyWidth = AlignLo(width, A);
            v128_u8 tailMask = ShiftLeft(K8_FF, A - width + alignedWidth);
            v128_u8 _value = SetU8(value);
            *sum = 0;
            for (size_t row = 0; row < height; ++row)
            {
                size_t col = 0;
                v128_u32 sums[4] = { K32_00000000, K32_00000000, K32_00000000, K32_00000000 };
                for (; col < alignedWidth; col += QA)
                {
                    ConditionalSquareSum<align, compareType>(src, mask, col, _value, sums[0]);
                    ConditionalSquareSum<align, compareType>(src, mask, col + A, _value, sums[1]);
                    ConditionalSquareSum<align, compareType>(src, mask, col + 2 * A, _value, sums[2]);
                    ConditionalSquareSum<align, compareType>(src, mask, col + 3 * A, _value, sums[3]);
                }
                sums[0] = vec_add(vec_add(sums[0], sums[1]), vec_add(sums[2], sums[3]));
                for (; col < bodyWidth; col += A)
                    ConditionalSquareSum<align, compareType>(src, mask, col, _value, sums[0]);
                if (alignedWidth != width)
                {
                    const v128_u8 _mask = Compare8u<compareType>(Load<false>(mask + width - A), _value);
                    const v128_u8 _src = vec_and(vec_and(Load<false>(src + width - A), _mask), tailMask);
                    sums[0] = vec_msum(_src, _src, sums[0]);
                }
                *sum += ExtractSum(sums[0]);
                src += srcStride;
                mask += maskStride;
            }
        }
Example 19
 SIMD_INLINE void LaplaceAbsSum(v128_u8 a[3][3], v128_u32 sums[2])
 {
     sums[0] = vec_msum(ConditionalAbs<true>(Laplace<0>(a)), K16_0001, sums[0]);
     sums[1] = vec_msum(ConditionalAbs<true>(Laplace<1>(a)), K16_0001, sums[1]);
 }
Example 20
vector signed int
test_msum_si (vector signed short vss2, vector signed short vss3,
	   vector signed int vsi2)
{
  return vec_msum (vss2, vss3, vsi2);
}
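Written out element by element, the signed-short form of vec_msum used here behaves like the following scalar sketch (modulo arithmetic, no saturation; the helper name is illustrative):

/* Each 32-bit lane i of the result is
 * c[i] + a[2*i] * b[2*i] + a[2*i+1] * b[2*i+1]. */
static void msum_s16_scalar(const int16_t a[8], const int16_t b[8],
                            const int32_t c[4], int32_t out[4])
{
    for (int i = 0; i < 4; i++)
        out[i] = c[i] + a[2 * i] * b[2 * i] + a[2 * i + 1] * b[2 * i + 1];
}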
Example 21
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{ return v_int32x4(vec_msum(a.val, b.val, c.val)); }
Example 22
int field_dct_best_altivec(FIELD_DCT_BEST_PDECL)
{
    /*
     * calculate prediction error (cur-pred) for top (blk0)
     * and bottom field (blk1)
     */
    double r, d;
    int sumtop, sumbot, sumsqtop, sumsqbot, sumbottop;
    int topvar, botvar;
    int whichdct;

    int i;
    vector unsigned char ct, pt, cb, pb;
    vector unsigned char *ctp, *ptp, *cbp, *pbp;
    unsigned int offset, stride2;
    vector signed short cur, pred;
    vector signed short dift, difb;
    vector signed int vsumtop, vsumbot, vsumsqtop, vsumsqbot, vsumbottop;
    vector signed int t0, t1, t2, t3;
    vector signed int zero;
    union {
	vector signed int v;
	struct {
	    signed int top;
	    signed int bot;
	    signed int sqtop;
	    signed int sqbot;
	} sum;
	struct {
	    signed int pad[3];
	    signed int sum;
	} bottop;
    } vo;


    AMBER_START;


#ifdef ALTIVEC_VERIFY
    if (NOT_VECTOR_ALIGNED(cur_lum_mb))
	mjpeg_error_exit1("field_dct_best: cur_lum_mb %% 16 != 0, (%d)\n",
	    cur_lum_mb);

    if (NOT_VECTOR_ALIGNED(pred_lum_mb))
	mjpeg_error_exit1("field_dct_best: pred_lum_mb %% 16 != 0, (%d)\n",
	    pred_lum_mb);

    if (NOT_VECTOR_ALIGNED(stride))
	mjpeg_error_exit1("field_dct_best: stride %% 16 != 0, (%d)\n", stride);
#endif


    zero = vec_splat_s32(0);
    vsumtop = vec_splat_s32(0);
    vsumbot = vec_splat_s32(0);
    vsumsqtop = vec_splat_s32(0);
    vsumsqbot = vec_splat_s32(0);
    vsumbottop = vec_splat_s32(0);

    ctp = (vector unsigned char*) cur_lum_mb;
    ptp = (vector unsigned char*) pred_lum_mb;
    cbp = (vector unsigned char*)(cur_lum_mb + stride);
    pbp = (vector unsigned char*)(pred_lum_mb + stride);
    offset = 0;
    stride2 = stride << 1;

#if 1
    ct = vec_ld(offset, ctp);
    pt = vec_ld(offset, ptp);
    cb = vec_ld(offset, cbp);
    pb = vec_ld(offset, pbp);

    i = 16/2 - 1;
    do {
	cur = (vector signed short)vec_mergeh(vu8(zero), ct);
	pred = (vector signed short)vec_mergeh(vu8(zero), pt);
	dift = vec_sub(cur, pred);

	cur = (vector signed short)vec_mergeh(vu8(zero), cb);
	pred = (vector signed short)vec_mergeh(vu8(zero), pb);
	difb = vec_sub(cur, pred);

	vsumtop = vec_sum4s(dift, vsumtop);
	vsumbot = vec_sum4s(difb, vsumbot);

	vsumsqtop = vec_msum(dift, dift, vsumsqtop);
	vsumsqbot = vec_msum(difb, difb, vsumsqbot);

	vsumbottop = vec_msum(dift, difb, vsumbottop);

	cur = (vector signed short)vec_mergel(vu8(zero), ct);
	pred = (vector signed short)vec_mergel(vu8(zero), pt);
	dift = vec_sub(cur, pred);

	cur = (vector signed short)vec_mergel(vu8(zero), cb);
	pred = (vector signed short)vec_mergel(vu8(zero), pb);
	difb = vec_sub(cur, pred);

	offset += stride2;
	ct = vec_ld(offset, ctp);
	pt = vec_ld(offset, ptp);
	cb = vec_ld(offset, cbp);
	pb = vec_ld(offset, pbp);

	vsumtop = vec_sum4s(dift, vsumtop);
	vsumbot = vec_sum4s(difb, vsumbot);

	vsumsqtop = vec_msum(dift, dift, vsumsqtop);
	vsumsqbot = vec_msum(difb, difb, vsumsqbot);

	vsumbottop = vec_msum(dift, difb, vsumbottop);
    } while (--i);
    cur = (vector signed short)vec_mergeh(vu8(zero), ct);
    pred = (vector signed short)vec_mergeh(vu8(zero), pt);
    dift = vec_sub(cur, pred);

    cur = (vector signed short)vec_mergeh(vu8(zero), cb);
    pred = (vector signed short)vec_mergeh(vu8(zero), pb);
    difb = vec_sub(cur, pred);

    vsumtop = vec_sum4s(dift, vsumtop);
    vsumbot = vec_sum4s(difb, vsumbot);

    vsumsqtop = vec_msum(dift, dift, vsumsqtop);
    vsumsqbot = vec_msum(difb, difb, vsumsqbot);

    vsumbottop = vec_msum(dift, difb, vsumbottop);

    cur = (vector signed short)vec_mergel(vu8(zero), ct);
    pred = (vector signed short)vec_mergel(vu8(zero), pt);
    dift = vec_sub(cur, pred);

    cur = (vector signed short)vec_mergel(vu8(zero), cb);
    pred = (vector signed short)vec_mergel(vu8(zero), pb);
    difb = vec_sub(cur, pred);

    vsumtop = vec_sum4s(dift, vsumtop);
    vsumbot = vec_sum4s(difb, vsumbot);

    vsumsqtop = vec_msum(dift, dift, vsumsqtop);
    vsumsqbot = vec_msum(difb, difb, vsumsqbot);

    vsumbottop = vec_msum(dift, difb, vsumbottop);
#else
    for (i = 0; i < 16/2; i++) { /* {{{ */
	ct = vec_ld(offset, ctp);
	pt = vec_ld(offset, ptp);
	cb = vec_ld(offset, cbp);
	pb = vec_ld(offset, pbp);

	cur = (vector signed short)vec_mergeh(vu8(zero), ct);
	pred = (vector signed short)vec_mergeh(vu8(zero), pt);
	dift = vec_sub(cur, pred);

	cur = (vector signed short)vec_mergeh(vu8(zero), cb);
	pred = (vector signed short)vec_mergeh(vu8(zero), pb);
	difb = vec_sub(cur, pred);

	vsumtop = vec_sum4s(dift, vsumtop);
	vsumbot = vec_sum4s(difb, vsumbot);

	vsumsqtop = vec_msum(dift, dift, vsumsqtop);
	vsumsqbot = vec_msum(difb, difb, vsumsqbot);

	vsumbottop = vec_msum(dift, difb, vsumbottop);

	cur = (vector signed short)vec_mergel(vu8(zero), ct);
	pred = (vector signed short)vec_mergel(vu8(zero), pt);
	dift = vec_sub(cur, pred);

	cur = (vector signed short)vec_mergel(vu8(zero), cb);
	pred = (vector signed short)vec_mergel(vu8(zero), pb);
	difb = vec_sub(cur, pred);

	vsumtop = vec_sum4s(dift, vsumtop);
	vsumbot = vec_sum4s(difb, vsumbot);

	vsumsqtop = vec_msum(dift, dift, vsumsqtop);
	vsumsqbot = vec_msum(difb, difb, vsumsqbot);

	vsumbottop = vec_msum(dift, difb, vsumbottop);

	offset += stride2;
    } /* }}} */
#endif

    /* transpose [sumtop, sumbot, sumsqtop, sumsqbot] {{{ */
    t0 = vec_mergel(vsumtop, vsumsqtop);
    t1 = vec_mergeh(vsumtop, vsumsqtop);
    t2 = vec_mergel(vsumbot, vsumsqbot);
    t3 = vec_mergeh(vsumbot, vsumsqbot);
    vsumtop = vec_mergeh(t1, t3);
    vsumbot = vec_mergel(t1, t3);
    vsumsqtop = vec_mergeh(t0, t2);
    vsumsqbot = vec_mergel(t0, t2);
    /* }}} */

    /* sum final values for sumtop, sumbot, sumsqtop, sumsqbot */
    vsumtop = vec_add(vsumtop, vsumbot);
    vsumsqtop = vec_add(vsumsqtop, vsumsqbot);
    vo.v = vec_add(vsumtop, vsumsqtop);

    sumtop = vo.sum.top;
    sumbot = vo.sum.bot;
    sumsqtop = vo.sum.sqtop;
    sumsqbot = vo.sum.sqbot;

    vsumbottop = vec_sums(vsumbottop, zero);

    vo.v = vsumbottop;


    /* Calculate variances for the top and bottom fields.  If they have
       the same sign, estimate the correlation; if it is high enough, use
       the frame DCT, otherwise use the field DCT.
     */
    whichdct = FIELD_DCT;
    r = 0.0;
    topvar = sumsqtop-sumtop*sumtop/128;
    botvar = sumsqbot-sumbot*sumbot/128;
    if (!((topvar > 0) ^ (botvar > 0)))
    {
	sumbottop = vo.bottop.sum;

	d = ((double) topvar) * ((double)botvar);
	r = (sumbottop-(sumtop*sumbot)/128);
	if (r > (0.5 * sqrt(d)))
	    whichdct = FRAME_DCT;
    }

    AMBER_STOP;

    return whichdct;
}
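The decision at the end of this routine reduces to a correlation test on the two fields' prediction errors; here is a self-contained scalar sketch of just that test (the function name and the 0/1 return value are mine; the original returns FIELD_DCT or FRAME_DCT):

#include <math.h>

/* Sketch of the final decision above: with 128 samples per field, topvar and
 * botvar are scaled variances and r a scaled covariance of the top/bottom
 * prediction errors.  Returns 1 for frame DCT, 0 for field DCT. */
static int prefer_frame_dct(int sumtop, int sumbot, int sumsqtop,
                            int sumsqbot, int sumbottop)
{
    int topvar = sumsqtop - sumtop * sumtop / 128;
    int botvar = sumsqbot - sumbot * sumbot / 128;
    if ((topvar > 0) == (botvar > 0)) {
        double d = (double) topvar * (double) botvar;
        double r = sumbottop - (sumtop * sumbot) / 128;
        if (r > 0.5 * sqrt(d))
            return 1;
    }
    return 0;
}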
Example 23
 template <bool align> void AbsDifferenceSums3Masked(const v128_u8 & current, const uint8_t * background, const v128_u8 & mask, v128_u32 sums[3])
 {
     sums[0] = vec_msum(AbsDifferenceU8(current, vec_and(mask, Load<false>(background - 1))), K8_01, sums[0]);
     sums[1] = vec_msum(AbsDifferenceU8(current, vec_and(mask, Load<align>(background))), K8_01, sums[1]);
     sums[2] = vec_msum(AbsDifferenceU8(current, vec_and(mask, Load<false>(background + 1))), K8_01, sums[2]);
 }
Example 24
 template <bool align> void AbsDifferenceSumMasked(const uint8_t * a, const uint8_t *b, size_t offset, const v128_u8 & mask, v128_u32 & sum)
 {
     const v128_u8 _a = vec_and(Load<align>(a + offset), mask);
     const v128_u8 _b = vec_and(Load<align>(b + offset), mask);
     sum = vec_msum(AbsDifferenceU8(_a, _b), K8_01, sum);
 }
Example 25
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{ return v_int32x4(vec_msum(a.val, b.val, vec_int4_z)); }
Example 26
 template <bool align, SimdCompareType compareType> void ConditionalSquareSum(const uint8_t * src, const uint8_t * mask, size_t offset, const v128_u8 & value, v128_u32 & sum)
 {
     const v128_u8 _mask = Compare8u<compareType>(Load<align>(mask + offset), value);
     const v128_u8 _src = vec_and(Load<align>(src + offset), _mask);
     sum = vec_msum(_src, _src, sum);
 }
Example 27
static int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    const vector unsigned char permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0};
    vector unsigned char perm1 = vec_lvsl(0, pix1);
    vector unsigned char perm2 = vec_lvsl(0, pix2);
    vector unsigned char t1, t2, t3,t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        vector unsigned char pix1l = vec_ld( 0, pix1);
        vector unsigned char pix1r = vec_ld(15, pix1);
        vector unsigned char pix2l = vec_ld( 0, pix2);
        vector unsigned char pix2r = vec_ld(15, pix2);
        t1 = vec_and(vec_perm(pix1l, pix1r, perm1), permclear);
        t2 = vec_and(vec_perm(pix2l, pix2r, perm2), permclear);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

static int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i;
    int s;
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix);
    vector unsigned char pixv;
    vector unsigned int sv;
    vector signed int sum;

    sv = (vector unsigned int)vec_splat_u32(0);

    s = 0;
    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels */
        vector unsigned char pixl = vec_ld( 0, pix);
        vector unsigned char pixr = vec_ld(15, pix);
        pixv = vec_perm(pixl, pixr, perm);

        /* Square the values, and add them to our sum */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}
Example 28
/**
 * Sum of Squared Errors for an 8x8 block.
 * AltiVec-enhanced.
 * It's the sad8_altivec code above w/ squaring added.
 */
static int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    const vector unsigned char permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0};
    vector unsigned char perm1 = vec_lvsl(0, pix1);
    vector unsigned char perm2 = vec_lvsl(0, pix2);
    vector unsigned char t1, t2, t3,t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        vector unsigned char pix1l = vec_ld( 0, pix1);
        vector unsigned char pix1r = vec_ld(15, pix1);
        vector unsigned char pix2l = vec_ld( 0, pix2);
        vector unsigned char pix2r = vec_ld(15, pix2);
        t1 = vec_and(vec_perm(pix1l, pix1r, perm1), permclear);
        t2 = vec_and(vec_perm(pix2l, pix2r, perm2), permclear);

        /* Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a-b)^2 = (a-b)^2. */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}

/**
 * Sum of Squared Errors for a 16x16 block.
 * AltiVec-enhanced.
 * It's the sad16_altivec code above w/ squaring added.
 */
static int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix2);
    vector unsigned char t1, t2, t3,t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        vector unsigned char pix2l = vec_ld( 0, pix2);
        vector unsigned char pix2r = vec_ld(15, pix2);
        t1 = vec_ld(0, pix1);
        t2 = vec_perm(pix2l, pix2r, perm);

        /* Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a-b)^2 = (a-b)^2. */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}
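For reference, a scalar sketch of what sse16 computes (sum of squared byte differences over a 16-pixel-wide block of height h; the _ref name is illustrative):

/* Scalar sketch of sse16: sum of squared differences, 16 columns, h rows. */
static int sse16_ref(const uint8_t *pix1, const uint8_t *pix2, int line_size, int h)
{
    int s = 0;
    for (int i = 0; i < h; i++) {
        for (int j = 0; j < 16; j++) {
            int d = pix1[j] - pix2[j];
            s += d * d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}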
Example 29
 template <bool align, SimdCompareType compareType> void ConditionalCount8u(const uint8_t * src, size_t offset, const v128_u8 & value, v128_u32 & count)
 {
     const v128_u8 _src = Load<align>(src + offset);
     const v128_u8 mask = vec_and(Compare8u<compareType>(_src, value), K8_01);
     count = vec_msum(mask, K8_01, count);
 }
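The counting trick here relies on the unsigned-byte form of vec_msum: the compare yields 0xFF per matching byte, the AND with K8_01 turns that into 1, and vec_msum then adds each group of four bytes into the corresponding 32-bit lane of count. A scalar sketch of that last step (the helper name is illustrative):

/* Unsigned-byte vec_msum: lane i of the accumulator gains the sum of the four
 * products a[4*i+j] * b[4*i+j]; with b = K8_01 and a in {0,1}, this simply
 * counts matching bytes. */
static void msum_u8_scalar(const uint8_t a[16], const uint8_t b[16], uint32_t c[4])
{
    for (int i = 0; i < 4; i++)
        for (int j = 0; j < 4; j++)
            c[i] += (uint32_t) a[4 * i + j] * b[4 * i + j];
}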
Example 30
 template <bool align, SimdCompareType compareType> void ConditionalCount16i(const int16_t * src, size_t offset, const v128_s16 & value, v128_u32 & count)
 {
     const v128_s16 _src = Load<align>(src + offset);
     const v128_u16 mask = vec_and((v128_u16)Compare16i<compareType>(_src, value), K16_0001);
     count = vec_msum(mask, K16_0001, count);
 }