static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
                                                    const int16_t *v2,
                                                    const int16_t *v3,
                                                    int order, int mul)
{
    LOAD_ZERO;
    vec_s16 *pv1 = (vec_s16 *) v1;
    vec_s16 *pv2 = (vec_s16 *) v2;
    vec_s16 *pv3 = (vec_s16 *) v3;
    register vec_s16 muls = { mul, mul, mul, mul, mul, mul, mul, mul };
    register vec_s16 t0, t1, i0, i1;
    register vec_s16 i2 = pv2[0], i3 = pv3[0];
    register vec_s32 res = zero_s32v;
    register vec_u8 align = vec_lvsl(0, v2);
    int32_t ires;

    order >>= 4;
    do {
        t0 = vec_perm(i2, pv2[1], align);
        i2 = pv2[2];
        t1 = vec_perm(pv2[1], i2, align);
        i0 = pv1[0];
        i1 = pv1[1];
        res = vec_msum(t0, i0, res);
        res = vec_msum(t1, i1, res);
        t0 = vec_perm(i3, pv3[1], align);
        i3 = pv3[2];
        t1 = vec_perm(pv3[1], i3, align);
        pv1[0] = vec_mladd(t0, muls, i0);
        pv1[1] = vec_mladd(t1, muls, i1);
        pv1 += 2;
        pv2 += 2;
        pv3 += 2;
    } while (--order);
    res = vec_splat(vec_sums(res, zero_s32v), 3);
    vec_ste(res, 0, &ires);
    return ires;
}
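/* For reference: a minimal scalar sketch (the name is illustrative, not from
 * the source above) of what scalarproduct_and_madd_int16 computes -- the dot
 * product of v1 and v2, while v1 is updated in place with v1[i] += v3[i] * mul.
 * 'order' is the element count (a multiple of 16 in the AltiVec version). */
static int32_t scalarproduct_and_madd_int16_scalar(int16_t *v1, const int16_t *v2,
                                                   const int16_t *v3, int order,
                                                   int mul)
{
    int32_t res = 0;
    while (order--) {
        res   += *v1 * *v2++;  /* accumulate the dot product           */
        *v1++ += *v3++ * mul;  /* ... and the multiply-add into v1     */
    }
    return res;
}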
void rgbaint_t::blend(const rgbaint_t& other, UINT8 factor)
{
    const VECU32 shift = vec_splat_u32(-16);
    const VECS32 scale1 = { factor, factor, factor, factor };
    const VECS32 scale2 = { 0x100 - factor, 0x100 - factor, 0x100 - factor, 0x100 - factor };
    VECU32 temp = vec_msum((VECU16)m_value, (VECU16)vec_rl(scale1, shift), vec_splat_u32(0));
    temp = vec_msum((VECU16)other.m_value, (VECU16)vec_rl(scale2, shift), temp);
    m_value = vec_msum((VECU16)m_value, (VECU16)scale1, vec_mulo((VECU16)other.m_value, (VECU16)scale2));
    m_value = vec_add(vec_sl(temp, shift), (VECU32)m_value);
    sra(8);
}
template <bool align> SIMD_INLINE void AddSquareDifference(const uint8_t * src, ptrdiff_t step,
    const v128_u8 & mask, v128_u32 & sum)
{
    const v128_u8 a = Load<align>(src - step);
    const v128_u8 b = Load<align>(src + step);
    const v128_u8 d = vec_and(AbsDifferenceU8(a, b), mask);
    sum = vec_msum(d, d, sum);
}
template <bool align, SimdCompareType compareType>
void ConditionalCount8u(const uint8_t * src, size_t stride, size_t width, size_t height,
    uint8_t value, uint32_t * count)
{
    assert(width >= A);
    if (align)
        assert(Aligned(src) && Aligned(stride));

    size_t alignedWidth = AlignLo(width, QA);
    size_t bodyWidth = AlignLo(width, A);
    v128_u8 tailMask = ShiftLeft(K8_01, A - width + alignedWidth);
    v128_u8 _value = SIMD_VEC_SET1_EPI8(value);
    v128_u32 counts[4] = { K32_00000000, K32_00000000, K32_00000000, K32_00000000 };
    for (size_t row = 0; row < height; ++row)
    {
        size_t col = 0;
        for (; col < alignedWidth; col += QA)
        {
            ConditionalCount8u<align, compareType>(src, col, _value, counts[0]);
            ConditionalCount8u<align, compareType>(src, col + A, _value, counts[1]);
            ConditionalCount8u<align, compareType>(src, col + 2 * A, _value, counts[2]);
            ConditionalCount8u<align, compareType>(src, col + 3 * A, _value, counts[3]);
        }
        for (; col < bodyWidth; col += A)
            ConditionalCount8u<align, compareType>(src, col, _value, counts[0]);
        if (alignedWidth != width)
        {
            const v128_u8 mask = vec_and(Compare8u<compareType>(Load<false>(src + width - A), _value), tailMask);
            counts[0] = vec_msum(mask, K8_01, counts[0]);
        }
        src += stride;
    }
    counts[0] = vec_add(vec_add(counts[0], counts[1]), vec_add(counts[2], counts[3]));
    *count = ExtractSum(counts[0]);
}
static int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i, s = 0;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix);
    vector unsigned int sv = (vector unsigned int) vec_splat_u32(0);
    vector signed int sum;

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned pixels. */
        vector unsigned char pixl = vec_ld(0,  pix);
        vector unsigned char pixr = vec_ld(15, pix);
        vector unsigned char pixv = vec_perm(pixl, pixr, perm);

        /* Square the values, and add them to our sum. */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s. */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}
int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char *tv;
    vector unsigned char pixv;
    vector unsigned int sv;
    vector signed int sum;

    sv = (vector unsigned int)vec_splat_u32(0);
    s = 0;
    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels */
        tv = (vector unsigned char *) pix;
        pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));

        /* Square the values, and add them to our sum */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}
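/* A minimal scalar sketch (illustrative name, not from the sources above) of
 * what the two pix_norm1 variants compute: the sum of squared pixel values
 * over a 16x16 block, stepping one row of 16 bytes per line_size stride. */
static int pix_norm1_scalar(const uint8_t *pix, int line_size)
{
    int s = 0;
    for (int i = 0; i < 16; i++) {
        for (int j = 0; j < 16; j++)
            s += pix[j] * pix[j];   /* square each pixel and accumulate */
        pix += line_size;
    }
    return s;
}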
template <bool align, SimdCompareType compareType>
void ConditionalCount16i(const uint8_t * src, size_t stride, size_t width, size_t height,
    int16_t value, uint32_t * count)
{
    assert(width >= HA);
    if (align)
        assert(Aligned(src) && Aligned(stride));

    size_t alignedWidth = AlignLo(width, DA);
    size_t bodyWidth = Simd::AlignLo(width, HA);
    v128_u16 tailMask = ShiftLeft(K16_0001, HA - width + alignedWidth);
    v128_s16 _value = SIMD_VEC_SET1_EPI16(value);
    v128_u32 counts[4] = { K32_00000000, K32_00000000, K32_00000000, K32_00000000 };
    for (size_t row = 0; row < height; ++row)
    {
        const int16_t * s = (const int16_t *)src;
        size_t col = 0;
        for (; col < alignedWidth; col += DA)
        {
            ConditionalCount16i<align, compareType>(s, col, _value, counts[0]);
            ConditionalCount16i<align, compareType>(s, col + HA, _value, counts[1]);
            ConditionalCount16i<align, compareType>(s, col + 2 * HA, _value, counts[2]);
            ConditionalCount16i<align, compareType>(s, col + 3 * HA, _value, counts[3]);
        }
        for (; col < bodyWidth; col += HA)
            ConditionalCount16i<align, compareType>(s, col, _value, counts[0]);
        if (alignedWidth != width)
        {
            const v128_u16 mask = vec_and((v128_u16)Compare16i<compareType>(Load<false>(s + width - HA), _value), tailMask);
            counts[0] = vec_msum(mask, K16_0001, counts[0]);
        }
        src += stride;
    }
    counts[0] = vec_add(vec_add(counts[0], counts[1]), vec_add(counts[2], counts[3]));
    *count = ExtractSum(counts[0]);
}
static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
                                           int order, const int shift)
{
    int i;
    LOAD_ZERO;
    register vec_s16 vec1, *pv;
    register vec_s32 res = vec_splat_s32(0), t;
    register vec_u32 shifts;
    int32_t ires;

    /* Build the shift-count vector from immediates: vec_splat_u32() only
     * accepts 5-bit immediates, so the shift amount is assembled bit by bit
     * (16 is produced as 8 << 1). */
    shifts = zero_u32v;
    if (shift & 0x10) shifts = vec_add(shifts, vec_sl(vec_splat_u32(0x08), vec_splat_u32(0x1)));
    if (shift & 0x08) shifts = vec_add(shifts, vec_splat_u32(0x08));
    if (shift & 0x04) shifts = vec_add(shifts, vec_splat_u32(0x04));
    if (shift & 0x02) shifts = vec_add(shifts, vec_splat_u32(0x02));
    if (shift & 0x01) shifts = vec_add(shifts, vec_splat_u32(0x01));

    for (i = 0; i < order; i += 8) {
        pv = (vec_s16 *) v1;
        vec1 = vec_perm(pv[0], pv[1], vec_lvsl(0, v1));
        t = vec_msum(vec1, vec_ld(0, v2), zero_s32v);
        t = vec_sr(t, shifts);
        res = vec_sums(t, res);
        v1 += 8;
        v2 += 8;
    }
    res = vec_splat(res, 3);
    vec_ste(res, 0, &ires);
    return ires;
}
static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
                                     int size)
{
    int i, size16 = size >> 4;
    vector signed char vpix1;
    vector signed short vpix2, vdiff, vpix1l, vpix1h;
    union {
        vector signed int vscore;
        int32_t score[4];
    } u = { .vscore = vec_splat_s32(0) };

    while (size16) {
        // score += (pix1[i] - pix2[i]) * (pix1[i] - pix2[i]);
        // load pix1 and the first batch of pix2
        // (vec_unaligned_load is the vec_ld/vec_perm helper macro defined
        //  in the older version of this function, shown further below)
        vpix1 = vec_unaligned_load(pix1);
        vpix2 = vec_unaligned_load(pix2);
        pix2 += 8;
        // unpack
        vpix1h = vec_unpackh(vpix1);
        vdiff  = vec_sub(vpix1h, vpix2);
        vpix1l = vec_unpackl(vpix1);
        // load another batch from pix2
        vpix2 = vec_unaligned_load(pix2);
        u.vscore = vec_msum(vdiff, vdiff, u.vscore);
        vdiff = vec_sub(vpix1l, vpix2);
        u.vscore = vec_msum(vdiff, vdiff, u.vscore);
        pix1 += 16;
        pix2 += 8;
        size16--;
    }
    u.vscore = vec_sums(u.vscore, vec_splat_s32(0));

    size %= 16;
    for (i = 0; i < size; i++)
        u.score[3] += (pix1[i] - pix2[i]) * (pix1[i] - pix2[i]);

    return u.score[3];
}
#endif /* HAVE_ALTIVEC */

av_cold void ff_svq1enc_init_ppc(SVQ1EncContext *c)
{
#if HAVE_ALTIVEC
    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_altivec;
#endif /* HAVE_ALTIVEC */
}
void rgbaint_t::blend(const rgbaint_t& other, UINT8 factor)
{
    const VECU32 shift = vec_splat_u32(-16);
    const VECS32 scale1 = { factor, factor, factor, factor };
    const VECS32 scale2 = { 0x100 - factor, 0x100 - factor, 0x100 - factor, 0x100 - factor };
    VECU32 temp = vec_msum(VECU16(m_value), VECU16(vec_rl(scale1, shift)), vec_splat_u32(0));
    temp = vec_msum(VECU16(other.m_value), VECU16(vec_rl(scale2, shift)), temp);
#if defined __LITTLE_ENDIAN__
    m_value = VECS32(vec_msum(VECU16(m_value), VECU16(scale1), vec_mule(VECU16(other.m_value), VECU16(scale2))));
#else
    m_value = VECS32(vec_msum(VECU16(m_value), VECU16(scale1), vec_mulo(VECU16(other.m_value), VECU16(scale2))));
#endif
    m_value = VECS32(vec_add(vec_sl(temp, shift), VECU32(m_value)));
    sra_imm(8);
}
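/* A scalar sketch (not from MAME; the name is illustrative) of the blend the
 * two rgbaint_t::blend versions above implement: each 32-bit channel becomes
 * (a * factor + b * (0x100 - factor)) >> 8. The vector code has no 32x32
 * multiply, so it splits each product into 16-bit halves with
 * vec_msum/vec_mule/vec_mulo and recombines the high halves via vec_rl. */
static void blend_scalar(int32_t rgba_a[4], const int32_t rgba_b[4], uint8_t factor)
{
    for (int i = 0; i < 4; i++)
        rgba_a[i] = (rgba_a[i] * factor + rgba_b[i] * (0x100 - factor)) >> 8;
}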
static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
                                     int size)
{
    int i, size16;
    vector signed char vpix1;
    vector signed short vpix2, vdiff, vpix1l, vpix1h;
    union {
        vector signed int vscore;
        int32_t score[4];
    } u;
    u.vscore = vec_splat_s32(0);

// XXX lazy way, fix it later
#define vec_unaligned_load(b) \
    vec_perm(vec_ld(0, b), vec_ld(15, b), vec_lvsl(0, b))

    size16 = size >> 4;
    while (size16) {
        // score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
        // load pix1 and the first batch of pix2
        vpix1 = vec_unaligned_load(pix1);
        vpix2 = vec_unaligned_load(pix2);
        pix2 += 8;
        // unpack
        vpix1h = vec_unpackh(vpix1);
        vdiff  = vec_sub(vpix1h, vpix2);
        vpix1l = vec_unpackl(vpix1);
        // load another batch from pix2
        vpix2 = vec_unaligned_load(pix2);
        u.vscore = vec_msum(vdiff, vdiff, u.vscore);
        vdiff = vec_sub(vpix1l, vpix2);
        u.vscore = vec_msum(vdiff, vdiff, u.vscore);
        pix1 += 16;
        pix2 += 8;
        size16--;
    }
    u.vscore = vec_sums(u.vscore, vec_splat_s32(0));

    size %= 16;
    for (i = 0; i < size; i++) {
        u.score[3] += (pix1[i] - pix2[i]) * (pix1[i] - pix2[i]);
    }
    return u.score[3];
}
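/* Scalar reference (a sketch mirroring the comment inside the loop above):
 * the sum of squared differences between an int8_t block and an int16_t
 * block of 'size' elements. */
static int ssd_int8_vs_int16_scalar(const int8_t *pix1, const int16_t *pix2,
                                    int size)
{
    int score = 0;
    for (int i = 0; i < size; i++)
        score += (pix1[i] - pix2[i]) * (pix1[i] - pix2[i]);
    return score;
}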
/**
 * Sum of Squared Errors for a 8x8 block.
 * AltiVec-enhanced.
 * It's the pix_abs8x8_altivec code above w/ squaring added.
 */
int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);
    permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);

    for (i = 0; i < 8; i++) {
        /* Read potentially unaligned pixels into t1 and t2.
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /* Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a-b)^2 = (a-b)^2. */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}
static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
                                                    const int16_t *v2,
                                                    const int16_t *v3,
                                                    int order, int mul)
{
    LOAD_ZERO;
    vec_s16 *pv1 = (vec_s16 *) v1;
    register vec_s16 muls = { mul, mul, mul, mul, mul, mul, mul, mul };
    register vec_s16 t0, t1, i0, i1, i4, i2, i3;
    register vec_s32 res = zero_s32v;
#if HAVE_BIGENDIAN
    register vec_u8 align = vec_lvsl(0, v2);
    i2 = vec_ld(0, v2);
    i3 = vec_ld(0, v3);
#endif
    int32_t ires;

    order >>= 4;
    do {
        /* GET_T is an endian-dependent helper macro (defined elsewhere in
           the file) that yields two aligned vectors from the source. */
        GET_T(t0, t1, v2, i1, i2);
        i0  = pv1[0];
        i1  = pv1[1];
        res = vec_msum(t0, i0, res);
        res = vec_msum(t1, i1, res);
        GET_T(t0, t1, v3, i4, i3);
        pv1[0] = vec_mladd(t0, muls, i0);
        pv1[1] = vec_mladd(t1, muls, i1);
        pv1 += 2;
        v2  += 16;
        v3  += 16;
    } while (--order);
    res = vec_splat(vec_sums(res, zero_s32v), 3);
    vec_ste(res, 0, &ires);
    return ires;
}
static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
                                           int order)
{
    int i;
    LOAD_ZERO;
    register vec_s16 vec1;
    register vec_s32 res = vec_splat_s32(0), t;
    int32_t ires;

    for (i = 0; i < order; i += 8) {
        /* vec_unaligned_load: the vec_ld/vec_perm helper macro shown with
           ssd_int8_vs_int16_altivec above. */
        vec1 = vec_unaligned_load(v1);
        t = vec_msum(vec1, vec_ld(0, v2), zero_s32v);
        res = vec_sums(t, res);
        v1 += 8;
        v2 += 8;
    }
    res = vec_splat(res, 3);
    vec_ste(res, 0, &ires);
    return ires;
}
signed int
scalarproduct_int16_vsx (const signed short *v1, const signed short *v2,
                         int order)
{
  int i;
  LOAD_ZERO;
  register vec_s16 vec1;
  register vec_s32 res = vec_splat_s32 (0), t;
  signed int ires;

  for (i = 0; i < order; i += 8)
    {
      vec1 = vec_vsx_ld (0, v1);
      t = vec_msum (vec1, vec_vsx_ld (0, v2), zero_s32v);
      res = vec_sums (t, res);
      v1 += 8;
      v2 += 8;
    }
  res = vec_splat (res, 3);
  vec_ste (res, 0, &ires);
  return ires;
}
static int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i;
    int s;
    __vector zero = __vzero();

    /* vector unsigned char *tv;
       vector unsigned char pixv;
       vector unsigned int sv;
       vector signed int sum; */
    __vector *tv;
    __vector pixv;
    __vector sv;
    __vector sum;

    sv = __vzero();
    s = 0;
    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels */
        //tv = (vector unsigned char *) pix;
        tv = (__vector *) pix;
        //pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));
        pixv = __vperm(tv[0], tv[1], __lvsl(pix, 0));

        /* Square the values, and add them to our sum */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}
static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
                                           int order)
{
    int i;
    LOAD_ZERO;
    const vec_s16 *pv;
    register vec_s16 vec1;
    register vec_s32 res = vec_splat_s32(0), t;
    int32_t ires;

    for (i = 0; i < order; i += 8) {
        pv = (const vec_s16 *) v1;
        vec1 = vec_perm(pv[0], pv[1], vec_lvsl(0, v1));
        t = vec_msum(vec1, vec_ld(0, v2), zero_s32v);
        res = vec_sums(t, res);
        v1 += 8;
        v2 += 8;
    }
    res = vec_splat(res, 3);
    vec_ste(res, 0, &ires);
    return ires;
}
template <bool align, SimdCompareType compareType>
void ConditionalSquareSum(const uint8_t * src, size_t srcStride, size_t width, size_t height,
    const uint8_t * mask, size_t maskStride, uint8_t value, uint64_t * sum)
{
    assert(width >= A);
    if (align)
        assert(Aligned(src) && Aligned(srcStride) && Aligned(mask) && Aligned(maskStride));

    size_t alignedWidth = AlignLo(width, QA);
    size_t bodyWidth = AlignLo(width, A);
    v128_u8 tailMask = ShiftLeft(K8_FF, A - width + alignedWidth);
    v128_u8 _value = SetU8(value);
    *sum = 0;
    for (size_t row = 0; row < height; ++row)
    {
        size_t col = 0;
        v128_u32 sums[4] = { K32_00000000, K32_00000000, K32_00000000, K32_00000000 };
        for (; col < alignedWidth; col += QA)
        {
            ConditionalSquareSum<align, compareType>(src, mask, col, _value, sums[0]);
            ConditionalSquareSum<align, compareType>(src, mask, col + A, _value, sums[1]);
            ConditionalSquareSum<align, compareType>(src, mask, col + 2 * A, _value, sums[2]);
            ConditionalSquareSum<align, compareType>(src, mask, col + 3 * A, _value, sums[3]);
        }
        sums[0] = vec_add(vec_add(sums[0], sums[1]), vec_add(sums[2], sums[3]));
        for (; col < bodyWidth; col += A)
            ConditionalSquareSum<align, compareType>(src, mask, col, _value, sums[0]);
        if (alignedWidth != width)
        {
            const v128_u8 _mask = Compare8u<compareType>(Load<false>(mask + width - A), _value);
            const v128_u8 _src = vec_and(vec_and(Load<false>(src + width - A), _mask), tailMask);
            sums[0] = vec_msum(_src, _src, sums[0]);
        }
        *sum += ExtractSum(sums[0]);
        src += srcStride;
        mask += maskStride;
    }
}
SIMD_INLINE void LaplaceAbsSum(v128_u8 a[3][3], v128_u32 sums[2])
{
    sums[0] = vec_msum(ConditionalAbs<true>(Laplace<0>(a)), K16_0001, sums[0]);
    sums[1] = vec_msum(ConditionalAbs<true>(Laplace<1>(a)), K16_0001, sums[1]);
}
vector signed int
test_msum_si (vector signed short vss2, vector signed short vss3,
              vector signed int vsi2)
{
  return vec_msum (vss2, vss3, vsi2);
}
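/* For orientation: a scalar model (a sketch, not part of the test above) of
 * what vec_msum does for signed short inputs. Each 32-bit lane i of the
 * result is c[i] plus the products of the two 16-bit element pairs that
 * share that lane. */
void msum_s16_model(const int16_t a[8], const int16_t b[8],
                    const int32_t c[4], int32_t out[4])
{
    for (int i = 0; i < 4; i++)
        out[i] = c[i] + a[2 * i] * b[2 * i] + a[2 * i + 1] * b[2 * i + 1];
}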
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{ return v_int32x4(vec_msum(a.val, b.val, c.val)); }
int field_dct_best_altivec(FIELD_DCT_BEST_PDECL)
{
    /*
     * calculate prediction error (cur-pred) for top (blk0)
     * and bottom field (blk1)
     */
    double r, d;
    int sumtop, sumbot, sumsqtop, sumsqbot, sumbottop;
    int topvar, botvar;
    int whichdct;

    int i;
    vector unsigned char ct, pt, cb, pb;
    vector unsigned char *ctp, *ptp, *cbp, *pbp;
    unsigned int offset, stride2;
    vector signed short cur, pred;
    vector signed short dift, difb;
    vector signed int vsumtop, vsumbot, vsumsqtop, vsumsqbot, vsumbottop;
    vector signed int t0, t1, t2, t3;
    vector signed int zero;
    union {
        vector signed int v;
        struct {
            signed int top;
            signed int bot;
            signed int sqtop;
            signed int sqbot;
        } sum;
        struct {
            signed int pad[3];
            signed int sum;
        } bottop;
    } vo;

    AMBER_START;

#ifdef ALTIVEC_VERIFY
    if (NOT_VECTOR_ALIGNED(cur_lum_mb))
        mjpeg_error_exit1("field_dct_best: cur_lum_mb %% 16 != 0, (%d)\n", cur_lum_mb);
    if (NOT_VECTOR_ALIGNED(pred_lum_mb))
        mjpeg_error_exit1("field_dct_best: pred_lum_mb %% 16 != 0, (%d)\n", pred_lum_mb);
    if (NOT_VECTOR_ALIGNED(stride))
        mjpeg_error_exit1("field_dct_best: stride %% 16 != 0, (%d)\n", stride);
#endif

    zero = vec_splat_s32(0);
    vsumtop = vec_splat_s32(0);
    vsumbot = vec_splat_s32(0);
    vsumsqtop = vec_splat_s32(0);
    vsumsqbot = vec_splat_s32(0);
    vsumbottop = vec_splat_s32(0);

    ctp = (vector unsigned char*) cur_lum_mb;
    ptp = (vector unsigned char*) pred_lum_mb;
    cbp = (vector unsigned char*)(cur_lum_mb + stride);
    pbp = (vector unsigned char*)(pred_lum_mb + stride);
    offset = 0;
    stride2 = stride << 1;

#if 1
    ct = vec_ld(offset, ctp);
    pt = vec_ld(offset, ptp);
    cb = vec_ld(offset, cbp);
    pb = vec_ld(offset, pbp);

    i = 16/2 - 1;
    do {
        cur = (vector signed short)vec_mergeh(vu8(zero), ct);
        pred = (vector signed short)vec_mergeh(vu8(zero), pt);
        dift = vec_sub(cur, pred);

        cur = (vector signed short)vec_mergeh(vu8(zero), cb);
        pred = (vector signed short)vec_mergeh(vu8(zero), pb);
        difb = vec_sub(cur, pred);

        vsumtop = vec_sum4s(dift, vsumtop);
        vsumbot = vec_sum4s(difb, vsumbot);
        vsumsqtop = vec_msum(dift, dift, vsumsqtop);
        vsumsqbot = vec_msum(difb, difb, vsumsqbot);
        vsumbottop = vec_msum(dift, difb, vsumbottop);

        cur = (vector signed short)vec_mergel(vu8(zero), ct);
        pred = (vector signed short)vec_mergel(vu8(zero), pt);
        dift = vec_sub(cur, pred);

        cur = (vector signed short)vec_mergel(vu8(zero), cb);
        pred = (vector signed short)vec_mergel(vu8(zero), pb);
        difb = vec_sub(cur, pred);

        offset += stride2;
        ct = vec_ld(offset, ctp);
        pt = vec_ld(offset, ptp);
        cb = vec_ld(offset, cbp);
        pb = vec_ld(offset, pbp);

        vsumtop = vec_sum4s(dift, vsumtop);
        vsumbot = vec_sum4s(difb, vsumbot);
        vsumsqtop = vec_msum(dift, dift, vsumsqtop);
        vsumsqbot = vec_msum(difb, difb, vsumsqbot);
        vsumbottop = vec_msum(dift, difb, vsumbottop);
    } while (--i);

    cur = (vector signed short)vec_mergeh(vu8(zero), ct);
    pred = (vector signed short)vec_mergeh(vu8(zero), pt);
    dift = vec_sub(cur, pred);

    cur = (vector signed short)vec_mergeh(vu8(zero), cb);
    pred = (vector signed short)vec_mergeh(vu8(zero), pb);
    difb = vec_sub(cur, pred);

    vsumtop = vec_sum4s(dift, vsumtop);
    vsumbot = vec_sum4s(difb, vsumbot);
    vsumsqtop = vec_msum(dift, dift, vsumsqtop);
    vsumsqbot = vec_msum(difb, difb, vsumsqbot);
    vsumbottop = vec_msum(dift, difb, vsumbottop);

    cur = (vector signed short)vec_mergel(vu8(zero), ct);
    pred = (vector signed short)vec_mergel(vu8(zero), pt);
    dift = vec_sub(cur, pred);

    cur = (vector signed short)vec_mergel(vu8(zero), cb);
    pred = (vector signed short)vec_mergel(vu8(zero), pb);
    difb = vec_sub(cur, pred);

    vsumtop = vec_sum4s(dift, vsumtop);
    vsumbot = vec_sum4s(difb, vsumbot);
    vsumsqtop = vec_msum(dift, dift, vsumsqtop);
    vsumsqbot = vec_msum(difb, difb, vsumsqbot);
    vsumbottop = vec_msum(dift, difb, vsumbottop);
#else
    for (i = 0; i < 16/2; i++) { /* {{{ */
        ct = vec_ld(offset, ctp);
        pt = vec_ld(offset, ptp);
        cb = vec_ld(offset, cbp);
        pb = vec_ld(offset, pbp);

        cur = (vector signed short)vec_mergeh(vu8(zero), ct);
        pred = (vector signed short)vec_mergeh(vu8(zero), pt);
        dift = vec_sub(cur, pred);

        cur = (vector signed short)vec_mergeh(vu8(zero), cb);
        pred = (vector signed short)vec_mergeh(vu8(zero), pb);
        difb = vec_sub(cur, pred);

        vsumtop = vec_sum4s(dift, vsumtop);
        vsumbot = vec_sum4s(difb, vsumbot);
        vsumsqtop = vec_msum(dift, dift, vsumsqtop);
        vsumsqbot = vec_msum(difb, difb, vsumsqbot);
        vsumbottop = vec_msum(dift, difb, vsumbottop);

        cur = (vector signed short)vec_mergel(vu8(zero), ct);
        pred = (vector signed short)vec_mergel(vu8(zero), pt);
        dift = vec_sub(cur, pred);

        cur = (vector signed short)vec_mergel(vu8(zero), cb);
        pred = (vector signed short)vec_mergel(vu8(zero), pb);
        difb = vec_sub(cur, pred);

        vsumtop = vec_sum4s(dift, vsumtop);
        vsumbot = vec_sum4s(difb, vsumbot);
        vsumsqtop = vec_msum(dift, dift, vsumsqtop);
        vsumsqbot = vec_msum(difb, difb, vsumsqbot);
        vsumbottop = vec_msum(dift, difb, vsumbottop);

        offset += stride2;
    } /* }}} */
#endif

    /* transpose [sumtop, sumbot, sumsqtop, sumsqbot] {{{ */
    t0 = vec_mergel(vsumtop, vsumsqtop);
    t1 = vec_mergeh(vsumtop, vsumsqtop);
    t2 = vec_mergel(vsumbot, vsumsqbot);
    t3 = vec_mergeh(vsumbot, vsumsqbot);
    vsumtop = vec_mergeh(t1, t3);
    vsumbot = vec_mergel(t1, t3);
    vsumsqtop = vec_mergeh(t0, t2);
    vsumsqbot = vec_mergel(t0, t2);
    /* }}} */

    /* sum final values for sumtop, sumbot, sumsqtop, sumsqbot */
    vsumtop = vec_add(vsumtop, vsumbot);
    vsumsqtop = vec_add(vsumsqtop, vsumsqbot);
    vo.v = vec_add(vsumtop, vsumsqtop);

    sumtop = vo.sum.top;
    sumbot = vo.sum.bot;
    sumsqtop = vo.sum.sqtop;
    sumsqbot = vo.sum.sqbot;

    vsumbottop = vec_sums(vsumbottop, zero);
    vo.v = vsumbottop;

    /* Calculate variances for top and bottom. If they're of similar sign,
       estimate the correlation; if it's good, use frame DCT, otherwise
       use field. */
    whichdct = FIELD_DCT;
    r = 0.0;
    topvar = sumsqtop - sumtop * sumtop / 128;
    botvar = sumsqbot - sumbot * sumbot / 128;
    if (!((topvar > 0) ^ (botvar > 0))) {
        sumbottop = vo.bottop.sum;

        d = ((double) topvar) * ((double) botvar);
        r = (sumbottop - (sumtop * sumbot) / 128);
        if (r > (0.5 * sqrt(d)))
            whichdct = FRAME_DCT;
    }

    AMBER_STOP;

    return whichdct;
}
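/* A scalar sketch (not from mjpegtools; the name is illustrative) of the
 * statistics the function above accumulates over the 16x16 luma macroblock:
 * per-field sums and sums of squares of the prediction error, plus the
 * top*bottom cross sum that feeds the correlation test (128 = 16x8 samples
 * per field). Top field = even rows, bottom field = odd rows. */
void field_dct_stats_scalar(const uint8_t *cur, const uint8_t *pred, int stride,
                            int *sumtop, int *sumbot, int *sumsqtop,
                            int *sumsqbot, int *sumbottop)
{
    *sumtop = *sumbot = *sumsqtop = *sumsqbot = *sumbottop = 0;
    for (int row = 0; row < 16; row += 2) {
        for (int col = 0; col < 16; col++) {
            int dt = cur[row * stride + col]       - pred[row * stride + col];
            int db = cur[(row + 1) * stride + col] - pred[(row + 1) * stride + col];
            *sumtop    += dt;
            *sumbot    += db;
            *sumsqtop  += dt * dt;
            *sumsqbot  += db * db;
            *sumbottop += dt * db;
        }
    }
}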
template <bool align> void AbsDifferenceSums3Masked(const v128_u8 & current, const uint8_t * background,
    const v128_u8 & mask, v128_u32 sums[3])
{
    /* Only the unshifted pointer can be an aligned load; the +/-1 neighbors
       are always unaligned. */
    sums[0] = vec_msum(AbsDifferenceU8(current, vec_and(mask, Load<false>(background - 1))), K8_01, sums[0]);
    sums[1] = vec_msum(AbsDifferenceU8(current, vec_and(mask, Load<align>(background))), K8_01, sums[1]);
    sums[2] = vec_msum(AbsDifferenceU8(current, vec_and(mask, Load<false>(background + 1))), K8_01, sums[2]);
}
template <bool align> void AbsDifferenceSumMasked(const uint8_t * a, const uint8_t * b, size_t offset,
    const v128_u8 & mask, v128_u32 & sum)
{
    const v128_u8 _a = vec_and(Load<align>(a + offset), mask);
    const v128_u8 _b = vec_and(Load<align>(b + offset), mask);
    sum = vec_msum(AbsDifferenceU8(_a, _b), K8_01, sum);
}
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{ return v_int32x4(vec_msum(a.val, b.val, vec_int4_z)); }
template <bool align, SimdCompareType compareType>
void ConditionalSquareSum(const uint8_t * src, const uint8_t * mask, size_t offset,
    const v128_u8 & value, v128_u32 & sum)
{
    const v128_u8 _mask = Compare8u<compareType>(Load<align>(mask + offset), value);
    const v128_u8 _src = vec_and(Load<align>(src + offset), _mask);
    sum = vec_msum(_src, _src, sum);
}
static int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    const vector unsigned char permclear =
        (vector unsigned char)
        { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 };
    vector unsigned char perm1 = vec_lvsl(0, pix1);
    vector unsigned char perm2 = vec_lvsl(0, pix2);
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int) vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2.
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        vector unsigned char pix1l = vec_ld(0,  pix1);
        vector unsigned char pix1r = vec_ld(15, pix1);
        vector unsigned char pix2l = vec_ld(0,  pix2);
        vector unsigned char pix2r = vec_ld(15, pix2);
        t1 = vec_and(vec_perm(pix1l, pix1r, perm1), permclear);
        t2 = vec_and(vec_perm(pix2l, pix2r, perm2), permclear);

        /* Calculate a sum of abs differences vector. */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

static int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i;
    int s;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix);
    vector unsigned char pixv;
    vector unsigned int sv;
    vector signed int sum;

    sv = (vector unsigned int) vec_splat_u32(0);

    s = 0;
    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels. */
        vector unsigned char pixl = vec_ld(0,  pix);
        vector unsigned char pixr = vec_ld(15, pix);
        pixv = vec_perm(pixl, pixr, perm);

        /* Square the values, and add them to our sum. */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s. */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}
/**
 * Sum of Squared Errors for a 8x8 block.
 * AltiVec-enhanced.
 * It's the sad8_altivec code above w/ squaring added.
 */
static int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    const vector unsigned char permclear =
        (vector unsigned char)
        { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 };
    vector unsigned char perm1 = vec_lvsl(0, pix1);
    vector unsigned char perm2 = vec_lvsl(0, pix2);
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int) vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2.
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        vector unsigned char pix1l = vec_ld(0,  pix1);
        vector unsigned char pix1r = vec_ld(15, pix1);
        vector unsigned char pix2l = vec_ld(0,  pix2);
        vector unsigned char pix2r = vec_ld(15, pix2);
        t1 = vec_and(vec_perm(pix1l, pix1r, perm1), permclear);
        t2 = vec_and(vec_perm(pix2l, pix2r, perm2), permclear);

        /* Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */

        /* Calculate abs differences vector. */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum. */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s. */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}

/**
 * Sum of Squared Errors for a 16x16 block.
 * AltiVec-enhanced.
 * It's the sad16_altivec code above w/ squaring added.
 */
static int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix2);
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int) vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2. */
        vector unsigned char pix2l = vec_ld(0,  pix2);
        vector unsigned char pix2r = vec_ld(15, pix2);
        t1 = vec_ld(0, pix1);
        t2 = vec_perm(pix2l, pix2r, perm);

        /* Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */

        /* Calculate abs differences vector. */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum. */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s. */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}
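/* Scalar reference (a sketch, not from the file above; the name is
 * illustrative) for sse16: the sum of squared differences over h rows of
 * 16 pixels. sse8 is the same with an 8-pixel row, which is why the vector
 * version masks out the upper half of each 16-byte load. */
static int sse16_scalar(const uint8_t *pix1, const uint8_t *pix2,
                        int line_size, int h)
{
    int s = 0;
    for (int i = 0; i < h; i++) {
        for (int j = 0; j < 16; j++) {
            int d = pix1[j] - pix2[j];
            s += d * d;                 /* abs(a-b)^2 == (a-b)^2 */
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}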
template <bool align, SimdCompareType compareType>
void ConditionalCount8u(const uint8_t * src, size_t offset, const v128_u8 & value, v128_u32 & count)
{
    const v128_u8 _src = Load<align>(src + offset);
    const v128_u8 mask = vec_and(Compare8u<compareType>(_src, value), K8_01);
    count = vec_msum(mask, K8_01, count);
}
template <bool align, SimdCompareType compareType>
void ConditionalCount16i(const int16_t * src, size_t offset, const v128_s16 & value, v128_u32 & count)
{
    const v128_s16 _src = Load<align>(src + offset);
    const v128_u16 mask = vec_and((v128_u16)Compare16i<compareType>(_src, value), K16_0001);
    count = vec_msum(mask, K16_0001, count);
}
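/* A scalar sketch (not from the Simd library) of the counting idiom the two
 * helpers above use: the compare produces an all-ones mask per element, the
 * AND with 1 turns each match into a single 1, and vec_msum accumulates the
 * ones into 32-bit lanes. Equality stands in here for the compareType
 * template parameter. */
uint32_t conditional_count16i_scalar(const int16_t *src, size_t size, int16_t value)
{
    uint32_t count = 0;
    for (size_t i = 0; i < size; i++)
        count += (src[i] == value) ? 1 : 0;   /* one sample compare type */
    return count;
}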