static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2, int size) { int i, size16 = size >> 4; vector signed char vpix1; vector signed short vpix2, vdiff, vpix1l, vpix1h; union { vector signed int vscore; int32_t score[4]; } u = { .vscore = vec_splat_s32(0) }; while (size16) { // score += (pix1[i] - pix2[i]) * (pix1[i] - pix2[i]); // load pix1 and the first batch of pix2 vpix1 = vec_unaligned_load(pix1); vpix2 = vec_unaligned_load(pix2); pix2 += 8; // unpack vpix1h = vec_unpackh(vpix1); vdiff = vec_sub(vpix1h, vpix2); vpix1l = vec_unpackl(vpix1); // load another batch from pix2 vpix2 = vec_unaligned_load(pix2); u.vscore = vec_msum(vdiff, vdiff, u.vscore); vdiff = vec_sub(vpix1l, vpix2); u.vscore = vec_msum(vdiff, vdiff, u.vscore); pix1 += 16; pix2 += 8; size16--; } u.vscore = vec_sums(u.vscore, vec_splat_s32(0)); size %= 16; for (i = 0; i < size; i++) u.score[3] += (pix1[i] - pix2[i]) * (pix1[i] - pix2[i]); return u.score[3]; } #endif /* HAVE_ALTIVEC */ av_cold void ff_svq1enc_init_ppc(SVQ1EncContext *c) { #if HAVE_ALTIVEC c->ssd_int8_vs_int16 = ssd_int8_vs_int16_altivec; #endif /* HAVE_ALTIVEC */ }
static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2, int size) { int i, size16; vector signed char vpix1; vector signed short vpix2, vdiff, vpix1l,vpix1h; union { vector signed int vscore; int32_t score[4]; } u; u.vscore = vec_splat_s32(0); // //XXX lazy way, fix it later #define vec_unaligned_load(b) \ vec_perm(vec_ld(0,b),vec_ld(15,b),vec_lvsl(0, b)); size16 = size >> 4; while(size16) { // score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]); //load pix1 and the first batch of pix2 vpix1 = vec_unaligned_load(pix1); vpix2 = vec_unaligned_load(pix2); pix2 += 8; //unpack vpix1h = vec_unpackh(vpix1); vdiff = vec_sub(vpix1h, vpix2); vpix1l = vec_unpackl(vpix1); // load another batch from pix2 vpix2 = vec_unaligned_load(pix2); u.vscore = vec_msum(vdiff, vdiff, u.vscore); vdiff = vec_sub(vpix1l, vpix2); u.vscore = vec_msum(vdiff, vdiff, u.vscore); pix1 += 16; pix2 += 8; size16--; } u.vscore = vec_sums(u.vscore, vec_splat_s32(0)); size %= 16; for (i = 0; i < size; i++) { u.score[3] += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]); } return u.score[3]; }
/**
 * Dot product of two int16_t arrays, AltiVec version.
 *
 * Eight elements are consumed per iteration, so when order is not a
 * multiple of 8 the loop still reads a full vector for the last group.
 * v1 goes through vec_unaligned_load(); v2 is fetched with vec_ld(), which
 * ignores the low four address bits, so v2 is expected to be 16-byte
 * aligned.
 *
 * @param v1    first operand
 * @param v2    second operand (16-byte aligned)
 * @param order number of elements to multiply and accumulate
 * @return the accumulated sum of v1[i] * v2[i]; the running total is kept
 *         with vec_sums(), i.e. with saturating 32-bit additions
 */
static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
                                           int order)
{
    LOAD_ZERO;
    vec_s16 lhs;
    vec_s32 products;
    vec_s32 total = vec_splat_s32(0);
    int32_t result;
    int remaining;

    for (remaining = order; remaining > 0; remaining -= 8) {
        lhs = vec_unaligned_load(v1);

        /* Multiply element-wise and add adjacent pairs into four lanes. */
        products = vec_msum(lhs, vec_ld(0, v2), zero_s32v);

        /* Fold the four lanes plus the running total into element 3. */
        total = vec_sums(products, total);

        v1 += 8;
        v2 += 8;
    }

    /* vec_ste() picks the element to store from the target address, so
     * replicate the total into every element before storing it. */
    total = vec_splat(total, 3);
    vec_ste(total, 0, &result);

    return result;
}