/* Compute the four DC coefficients of an 8x8 chroma residual (pix1 - pix2)
 * and apply the 2x2 DC Hadamard transform in place (AltiVec/VSX).
 *
 * NOTE(review): the vec_vsx_ld / vec_vsx_st pair touches 16 bytes at dct,
 * i.e. 8 bytes past dct[4].  The `mask` permute merges the upper half back
 * from the freshly loaded old contents, so those bytes are rewritten with
 * their original values — confirm callers always provide 16 accessible
 * bytes at dct (x264's aligned DCT buffers appear to guarantee this). */
void x264_sub8x8_dct_dc_altivec( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 )
{
    vec_s16_t diff[2];
    vec_s32_t sum[2];
    vec_s32_t zero32 = vec_splat_s32(0);
    /* select bytes 0-7 of the first operand and bytes 8-15 (indices
     * 0x18-0x1F) of the second operand */
    vec_u8_t mask = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
                      0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F };

    /* residuals for the top and bottom 8x4 halves of the 8x8 block */
    pix_diff( &pix1[0], &pix2[0], diff, 0 );
    pix_diff( &pix1[4*FENC_STRIDE], &pix2[4*FDEC_STRIDE], diff, 1 );

    /* two rounds of sum4s+pack reduce each 4x4 quadrant to one DC value,
     * leaving the four DCs in the low 4 lanes of diff[0] */
    sum[0] = vec_sum4s( diff[0], zero32 );
    sum[1] = vec_sum4s( diff[1], zero32 );
    diff[0] = vec_packs( sum[0], sum[1] );
    sum[0] = vec_sum4s( diff[0], zero32 );
    diff[0] = vec_packs( sum[0], zero32 );

    /* merge with the existing upper 8 bytes and store back (see NOTE) */
    diff[1] = vec_vsx_ld( 0, dct );
    diff[0] = vec_perm( diff[0], diff[1], mask );
    vec_vsx_st( diff[0], 0, dct );

    /* 2x2 DC transform */
    int d0 = dct[0] + dct[1];
    int d1 = dct[2] + dct[3];
    int d2 = dct[0] - dct[1];
    int d3 = dct[2] - dct[3];
    dct[0] = d0 + d1;
    dct[1] = d0 - d1;
    dct[2] = d2 + d3;
    dct[3] = d2 - d3;
}
static int pix_sum_altivec(uint8_t * pix, int line_size) { const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); vector unsigned char perm = vec_lvsl(0, pix); vector unsigned char t1; vector unsigned int sad; vector signed int sumdiffs; int i; int s; sad = (vector unsigned int)vec_splat_u32(0); for (i = 0; i < 16; i++) { /* Read the potentially unaligned 16 pixels into t1 */ vector unsigned char pixl = vec_ld( 0, pix); vector unsigned char pixr = vec_ld(15, pix); t1 = vec_perm(pixl, pixr, perm); /* Add each 4 pixel group together and put 4 results into sad */ sad = vec_sum4s(t1, sad); pix += line_size; } /* Sum up the four partial sums, and put the result into s */ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); sumdiffs = vec_splat(sumdiffs, 3); vec_ste(sumdiffs, 0, &s); return s; }
/* SAD of a 16x16 block against the vertical half-pel interpolation
 * avg(pix2[x], pix2[x + line_size]).  vec_avg rounds up, matching the
 * reference half-pel rounding.  Software-pipelined: each row's pix3
 * vector is reused as the next row's pix2 vector, saving one unaligned
 * load per iteration — do not reorder the pix2v = pix3v carry. */
int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned char zero =
        (const vector unsigned char) vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    sad = (vector unsigned int) vec_splat_u32(0);

    /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, each
       time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]
       Split the pixel vectors into shorts */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    for (i = 0; i < 16; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15] */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2v = pix3v;   /* this row's pix3 becomes next row's pix2 */
        pix3 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
    return s;
}
/* Pack the sign bits of the eight 16-bit lanes of `a` into an integer:
 * bit i of the result is set iff lane i is negative. */
inline int v_signmask(const v_int16x8& a)
{
    /* per-lane shift amounts: lane i contributes bit (1 << i) */
    static const vec_ushort8 slm = {0, 1, 2, 3, 4, 5, 6, 7};
    vec_short8 sv = vec_sr(a.val, vec_ushort8_sp(15));  /* 0 or 1 per lane */
    sv = vec_sl(sv, slm);                               /* position each bit */
    vec_int4 svi = vec_int4_z;
    /* the per-lane bits are disjoint, so summing them acts as a bitwise OR */
    svi = vec_sums(vec_sum4s(sv, svi), svi);
    return vec_extract(svi, 3);
}
uint32_t quant_h263_inter_altivec_c(int16_t *coeff, int16_t *data, const uint32_t quant, const uint16_t *mpeg_quant_matrices) { vector unsigned char zerovec; vector unsigned short mult; vector unsigned short quant_m_2; vector unsigned short quant_d_2; vector unsigned short sum_short; vector signed short acLevel; vector unsigned int even; vector unsigned int odd; vector bool short m2_mask; vector bool short zero_mask; uint32_t result; #ifdef DEBUG if(((unsigned)coeff) & 0x15) fprintf(stderr, "quant_h263_inter_altivec_c:incorrect align, coeff: %lx\n", (long)coeff); #endif /* initialisation stuff */ zerovec = vec_splat_u8(0); *((unsigned short*)&mult) = (unsigned short)multipliers[quant]; mult = vec_splat(mult, 0); *((unsigned short*)&quant_m_2) = (unsigned short)quant; quant_m_2 = vec_splat(quant_m_2, 0); quant_m_2 = vec_sl(quant_m_2, vec_splat_u16(1)); *((unsigned short*)&quant_d_2) = (unsigned short)quant; quant_d_2 = vec_splat(quant_d_2, 0); quant_d_2 = vec_sr(quant_d_2, vec_splat_u16(1)); sum_short = (vector unsigned short)zerovec; /* Quantize */ QUANT_H263_INTER_ALTIVEC(); QUANT_H263_INTER_ALTIVEC(); QUANT_H263_INTER_ALTIVEC(); QUANT_H263_INTER_ALTIVEC(); QUANT_H263_INTER_ALTIVEC(); QUANT_H263_INTER_ALTIVEC(); QUANT_H263_INTER_ALTIVEC(); QUANT_H263_INTER_ALTIVEC(); /* Calculate the return value */ even = (vector unsigned int)vec_sum4s((vector signed short)sum_short, (vector signed int)zerovec); even = (vector unsigned int)vec_sums((vector signed int)even, (vector signed int)zerovec); even = vec_splat(even, 3); vec_ste(even, 0, &result); return result; }
/** Mask **/
/* Pack the sign (top) bits of the sixteen byte lanes of `a` into an
 * integer: bit i of the result is set iff byte i has its MSB set. */
inline int v_signmask(const v_uint8x16& a)
{
    vec_uchar16 sv  = vec_sr(a.val, vec_uchar16_sp(7));  /* 0 or 1 per byte */
    /* within each 8-byte half, byte j contributes bit (1 << j) */
    static const vec_uchar16 slm = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
    sv = vec_sl(sv, slm);
    /* sum4s collapses each 4-byte group; bits are disjoint so add == OR */
    vec_uint4 sv4 = vec_sum4s(sv, vec_uint4_z);
    /* shift word 2 and 3 left by 8 so the high half lands in bits 8-15 */
    static const vec_uint4 slm4 = {0, 0, 8, 8};
    sv4 = vec_sl(sv4, slm4);
    return vec_extract(vec_sums((vec_int4) sv4, vec_int4_z), 3);
}
int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) { int i; int s __attribute__((aligned(16))); const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); vector unsigned char *tv; vector unsigned char pix1v, pix2v, pix2iv, avgv, t5; vector unsigned int sad; vector signed int sumdiffs; s = 0; sad = (vector unsigned int)vec_splat_u32(0); for(i=0;i<16;i++) { /* Read unaligned pixels into our vectors. The vectors are as follows: pix1v: pix1[0]-pix1[15] pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16] */ tv = (vector unsigned char *) pix1; pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); tv = (vector unsigned char *) &pix2[0]; pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); tv = (vector unsigned char *) &pix2[1]; pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1])); /* Calculate the average vector */ avgv = vec_avg(pix2v, pix2iv); /* Calculate a sum of abs differences vector */ t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); /* Add each 4 pixel group together and put 4 results into sad */ sad = vec_sum4s(t5, sad); pix1 += line_size; pix2 += line_size; } /* Sum up the four partial sums, and put the result into s */ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); sumdiffs = vec_splat(sumdiffs, 3); vec_ste(sumdiffs, 0, &s); return s; }
int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) { int i; int s __attribute__((aligned(16))); const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; vector unsigned char t1, t2, t3,t4, t5; vector unsigned int sad; vector signed int sumdiffs; sad = (vector unsigned int)vec_splat_u32(0); permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0); for(i=0;i<8;i++) { /* Read potentially unaligned pixels into t1 and t2 Since we're reading 16 pixels, and actually only want 8, mask out the last 8 pixels. The 0s don't change the sum. */ perm1 = vec_lvsl(0, pix1); pix1v = (vector unsigned char *) pix1; perm2 = vec_lvsl(0, pix2); pix2v = (vector unsigned char *) pix2; t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear); t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear); /* Calculate a sum of abs differences vector */ t3 = vec_max(t1, t2); t4 = vec_min(t1, t2); t5 = vec_sub(t3, t4); /* Add each 4 pixel group together and put 4 results into sad */ sad = vec_sum4s(t5, sad); pix1 += line_size; pix2 += line_size; } /* Sum up the four partial sums, and put the result into s */ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); sumdiffs = vec_splat(sumdiffs, 3); vec_ste(sumdiffs, 0, &s); return s; }
/* Sum of absolute differences between two width x height pixel blocks.
 * Rows are processed 16 pixels at a time with AltiVec, and a scalar tail
 * loop handles widths that are not multiples of 16. */
static unsigned reg_sad_altivec(const kvz_pixel * const data1, const kvz_pixel * const data2,
                                const int width, const int height, const unsigned stride1, const unsigned stride2)
{
  vector unsigned int vsad = {0,0,0,0}, vzero = {0,0,0,0};
  vector signed int sumdiffs;
  int tmpsad, sad = 0;
  int y, x;

  for (y = 0; y < height; ++y) {
    vector unsigned char perm1, perm2;

    /* alignment permutes for this row; x advances in multiples of 16, so
       one permute per row stays valid for every chunk of the row */
    perm1 = vec_lvsl(0, &data1[y * stride1]);
    perm2 = vec_lvsl(0, &data2[y * stride2]);

    for (x = 0; x <= width-16; x+=16) {
      vector unsigned char t1, t2, t3, t4, t5;
      vector unsigned char *current, *previous;

      /* NOTE(review): current[1]/previous[1] read up to 16 bytes past the
         chunk; harmless for aligned rows but presumably relies on buffer
         padding when unaligned — confirm allocation guarantees. */
      current = (vector unsigned char *) &data1[y * stride1 + x];
      previous = (vector unsigned char *) &data2[y * stride2 + x];

      t1  = vec_perm(current[0], current[1], perm1 );  /* align current vector  */
      t2  = vec_perm(previous[0], previous[1], perm2 );/* align previous vector */
      t3  = vec_max(t1, t2 );      /* find largest of two           */
      t4  = vec_min(t1, t2 );      /* find smaller of two           */
      t5  = vec_sub(t3, t4);       /* find absolute difference      */
      vsad = vec_sum4s(t5, vsad);  /* accumulate sum of differences */
    }

    /* scalar tail for the remaining (width % 16) pixels of the row */
    for (; x < width; ++x) {
      sad += abs(data1[y * stride1 + x] - data2[y * stride2 + x]);
    }
  }

  sumdiffs = vec_sums((vector signed int) vsad, (vector signed int) vzero);
  /* copy vector sum into unaligned result */
  sumdiffs = vec_splat( sumdiffs, 3);
  vec_ste( sumdiffs, 0, &tmpsad );
  sad += tmpsad;

  return sad;
}
/* SAD of a 16xh block against the horizontal half-pel interpolation
 * avg(pix2[x], pix2[x+1]); vec_avg rounds up.  pix1 is loaded with a
 * plain vec_ld and so is presumably 16-byte aligned by the caller —
 * TODO confirm against the dsputil contract.  The first parameter `v`
 * (codec context) is unused here. */
static int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned char zero =
        (const vector unsigned char) vec_splat_u8(0);
    /* perm1 aligns pix2[0..15]; perm2 = perm1 + 1 aligns pix2[1..16]
       from the same pair of aligned loads */
    vector unsigned char perm1 = vec_lvsl(0, pix2);
    vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
    vector unsigned char pix2l, pix2r;
    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    s = 0;
    sad = (vector unsigned int) vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0] - pix1[15]
           pix2v: pix2[0] - pix2[15]  pix2iv: pix2[1] - pix2[16] */
        pix1v  = vec_ld( 0, pix1);
        pix2l  = vec_ld( 0, pix2);
        pix2r  = vec_ld(16, pix2);
        pix2v  = vec_perm(pix2l, pix2r, perm1);
        pix2iv = vec_perm(pix2l, pix2r, perm2);

        /* Calculate the average vector. */
        avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector. */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) { int i; int s __attribute__((aligned(16))); const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); vector unsigned char perm1, perm2, *pix1v, *pix2v; vector unsigned char t1, t2, t3,t4, t5; vector unsigned int sad; vector signed int sumdiffs; sad = (vector unsigned int)vec_splat_u32(0); for(i=0;i<16;i++) { /* Read potentially unaligned pixels into t1 and t2 */ perm1 = vec_lvsl(0, pix1); pix1v = (vector unsigned char *) pix1; perm2 = vec_lvsl(0, pix2); pix2v = (vector unsigned char *) pix2; t1 = vec_perm(pix1v[0], pix1v[1], perm1); t2 = vec_perm(pix2v[0], pix2v[1], perm2); /* Calculate a sum of abs differences vector */ t3 = vec_max(t1, t2); t4 = vec_min(t1, t2); t5 = vec_sub(t3, t4); /* Add each 4 pixel group together and put 4 results into sad */ sad = vec_sum4s(t5, sad); pix1 += line_size; pix2 += line_size; } /* Sum up the four partial sums, and put the result into s */ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); sumdiffs = vec_splat(sumdiffs, 3); vec_ste(sumdiffs, 0, &s); return s; }
/* Plain SAD of a 16xh block.  pix1 is loaded with an unadorned vec_ld
 * and so is presumably 16-byte aligned by the caller — TODO confirm
 * against the dsputil contract; pix2 may be unaligned.  The first
 * parameter `v` (codec context) is unused here. */
static int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix2);
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int) vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2.
           vec_ld(15, ...) fetches the aligned block containing
           pix2 + 15, giving perm the full 0..31 byte window. */
        vector unsigned char pix2l = vec_ld( 0, pix2);
        vector unsigned char pix2r = vec_ld(15, pix2);
        t1 = vec_ld(0, pix1);
        t2 = vec_perm(pix2l, pix2r, perm);

        /* Calculate a sum of abs differences vector. */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
/* Bidirectional SAD for MPEG B-frame motion estimation: for each pixel,
 * form the forward prediction ((pf+pfa+pfb+pfc+2)>>2), the backward
 * prediction ((pb+pba+pbb+pbc+2)>>2) — each a half-pel average selected
 * by hxf/hyf and hxb/hyb — average the two with round-up, and accumulate
 * |prediction - p2|.
 *
 * The loop is software-pipelined: loads for the NEXT row (vfb/vfc, l0/l1,
 * vbb/vbc) are issued inside the current iteration to hide load latency,
 * with a rotated prologue before the loop and a final unrolled iteration
 * after it.  Statement order is therefore load-latency-critical — do not
 * reorder. */
int bsad_altivec(BSAD_PDECL)
{
    int i;
    uint8_t *pfy, *pby;
    vector unsigned char l0, l1, lR;
    vector unsigned char permF0, permF1, permB0, permB1;
    vector unsigned char vf, vfa, vfb, vfc;
    vector unsigned char vb, vba, vbb, vbc;
    vector unsigned short tH, tL, fH, fL, bH, bL;
    vector unsigned char zero;
    vector unsigned short one, two;
    vector unsigned char max, min, dif;
    vector unsigned int sum;
    /* overlay a vector with a struct so the final scalar sum (element 3
       of the vec_sums result) can be read without vec_ste */
    union {
        vector signed int v;
        struct {
            signed int pad[3];
            signed int sum;
        } s;
    } vo;

#ifdef ALTIVEC_VERIFY
    if (NOT_VECTOR_ALIGNED(p2))
        mjpeg_error_exit1("bsad: p2 %% 16 != 0, (0x%X)", p2);
    if (NOT_VECTOR_ALIGNED(rowstride))
        mjpeg_error_exit1("bsad: rowstride %% 16 != 0, (%d)", rowstride);
    if (hxf != 0 && hxf != 1)
        mjpeg_error_exit1("bsad: hxf != [0|1], (hxf=%d)", hxf);
    if (hyf != 0 && hyf != 1)
        mjpeg_error_exit1("bsad: hyf != [0|1], (hyf=%d)", hyf);
    if (hxb != 0 && hxb != 1)
        mjpeg_error_exit1("bsad: hxb != [0|1], (hxb=%d)", hxb);
    if (hyb != 0 && hyb != 1)
        mjpeg_error_exit1("bsad: hyb != [0|1], (hyb=%d)", hyb);
#endif

    if (h != 8 && h != 16)
        mjpeg_error_exit1("bsad: h != [8|16], (%d)", h);

    AMBER_START;

    /* start loading first set */
    vfb = vec_ld(0, pf);   /* use vfb & vfc as temp for vf & vfa */
    vfc = vec_ld(16, pf);

    /* pfy/pby point at the row below when vertical half-pel is selected */
    pfy = pf + (rowstride * hyf);
    l0 = vec_ld(0, pfy);
    l1 = vec_ld(16, pfy);

    pby = pb + (rowstride * hyb);

    zero = vec_splat_u8(0);
    one = vec_splat_u16(1);
    two = vec_splat_u16(2);
    sum = vec_splat_u32(0);

    /* permF1/permB1 = alignment permute shifted by hxf/hxb, selecting the
       horizontally-offset neighbour from the same 32-byte load pair */
    permF0 = vec_lvsl(0, pf);
    permF1 = vec_lvsl(hxf, (unsigned char*)0);
    permF1 = vec_splat(permF1, 0);
    permF1 = vec_add(permF0, permF1);

    permB0 = vec_lvsl(0, pb);
    permB1 = vec_lvsl(hxb, (unsigned char*)0);
    permB1 = vec_splat(permB1, 0);
    permB1 = vec_add(permB0, permB1);

    i = h - 1;
    do { /* while (--i) */

        vf = vec_perm(vfb, vfc, permF0);
        vfa = vec_perm(vfb, vfc, permF1);
        vfb = vec_perm(l0, l1, permF0);
        vfc = vec_perm(l0, l1, permF1);

        vbb = vec_ld(0, pb);   /* use vbb & vbc as temp for vb & vba */
        vbc = vec_ld(16, pb);

        l0 = vec_ld(0, pby);
        l1 = vec_ld(16, pby);

        pb += rowstride;
        pby += rowstride;

        /* widen forward bytes to shorts and sum the four taps */
        /* (unsigned short[]) pf[0-7] */
        fH = vu16(vec_mergeh(zero, vf));

        /* (unsigned short[]) pf[8-15] */
        fL = vu16(vec_mergel(zero, vf));

        /* (unsigned short[]) pfa[0-7] */
        tH = vu16(vec_mergeh(zero, vfa));

        /* (unsigned short[]) pfa[8-15] */
        tL = vu16(vec_mergel(zero, vfa));

        /* pf[i] + pfa[i] */
        fH = vec_add(fH, tH);
        fL = vec_add(fL, tL);

        /* (unsigned short[]) pfb[0-7] */
        tH = vu16(vec_mergeh(zero, vfb));

        /* (unsigned short[]) pfb[8-15] */
        tL = vu16(vec_mergel(zero, vfb));

        /* (pf[i]+pfa[i]) + pfb[i] */
        fH = vec_add(fH, tH);
        fL = vec_add(fL, tL);

        /* (unsigned short[]) pfc[0-7] */
        tH = vu16(vec_mergeh(zero, vfc));

        /* (unsigned short[]) pfc[8-15] */
        tL = vu16(vec_mergel(zero, vfc));

        /* (pf[i]+pfa[i]+pfb[i]) + pfc[i] */
        fH = vec_add(fH, tH);
        fL = vec_add(fL, tL);

        /* (pf[i]+pfa[i]+pfb[i]+pfc[i]) + 2 */
        fH = vec_add(fH, two);
        fL = vec_add(fL, two);

        /* (pf[i]+pfa[i]+pfb[i]+pfc[i]+2) >> 2 */
        fH = vec_sra(fH, two);
        fL = vec_sra(fL, two);

        lR = vec_ld(0, p2);
        p2 += rowstride;

        vb = vec_perm(vbb, vbc, permB0);
        vba = vec_perm(vbb, vbc, permB1);
        vbb = vec_perm(l0, l1, permB0);
        vbc = vec_perm(l0, l1, permB1);

        /* prefetch next row's forward data while the backward math runs */
        pf += rowstride;
        vfb = vec_ld(0, pf);   /* use vfb & vfc as temp for vf & vfa */
        vfc = vec_ld(16, pf);

        pfy += rowstride;
        l0 = vec_ld(0, pfy);
        l1 = vec_ld(16, pfy);

        /* widen backward bytes to shorts and sum the four taps */
        /* (unsigned short[]) pb[0-7] */
        bH = vu16(vec_mergeh(zero, vb));

        /* (unsigned short[]) pb[8-15] */
        bL = vu16(vec_mergel(zero, vb));

        /* (unsigned short[]) pba[0-7] */
        tH = vu16(vec_mergeh(zero, vba));

        /* (unsigned short[]) pba[8-15] */
        tL = vu16(vec_mergel(zero, vba));

        /* pb[i] + pba[i] */
        bH = vec_add(bH, tH);
        bL = vec_add(bL, tL);

        /* (unsigned short[]) pbb[0-7] */
        tH = vu16(vec_mergeh(zero, vbb));

        /* (unsigned short[]) pbb[8-15] */
        tL = vu16(vec_mergel(zero, vbb));

        /* (pb[i]+pba[i]) + pbb[i] */
        bH = vec_add(bH, tH);
        bL = vec_add(bL, tL);

        /* (unsigned short[]) pbc[0-7] */
        tH = vu16(vec_mergeh(zero, vbc));

        /* (unsigned short[]) pbc[8-15] */
        tL = vu16(vec_mergel(zero, vbc));

        /* (pb[i]+pba[i]+pbb[i]) + pbc[i] */
        bH = vec_add(bH, tH);
        bL = vec_add(bL, tL);

        /* (pb[i]+pba[i]+pbb[i]+pbc[i]) + 2 */
        bH = vec_add(bH, two);
        bL = vec_add(bL, two);

        /* (pb[i]+pba[i]+pbb[i]+pbc[i]+2) >> 2 */
        bH = vec_sra(bH, two);
        bL = vec_sra(bL, two);

        /* average forward and backward predictions with round-up */
        /* ((pf[i]+pfa[i]+pfb[i]+pfc[i]+2)>>2) +
         * ((pb[i]+pba[i]+pbb[i]+pbc[i]+2)>>2)   */
        tH = vec_add(fH, bH);
        tL = vec_add(fL, bL);

        /* (((pf[i]+pfa[i]+pfb[i]+pfc[i]+2)>>2)+
         *  ((pb[i]+pba[i]+pbb[i]+pbc[i]+2)>>2)) + 1 */
        tH = vec_add(tH, one);
        tL = vec_add(tL, one);

        /* (((pf[i]+pfa[i]+pfb[i]+pfc[i]+2)>>2)+
         *  ((pb[i]+pba[i]+pbb[i]+pbc[i]+2)>>2)+1) >> 1 */
        tH = vec_sra(tH, one);
        tL = vec_sra(tL, one);

        /* d = abs( ((((pf[i]+pfa[i]+pfb[i]+pfc[i]+2)>>2)+
         *            ((pb[i]+pba[i]+pbb[i]+pbc[i]+2)>>2)+1)>>1) - p2[i] ) */
        tH = vu16(vec_packsu(tH, tL));
        min = vec_min(vu8(tH), lR);
        max = vec_max(vu8(tH), lR);
        dif = vec_sub(max, min);

        /* sum += d */
        sum = vec_sum4s(dif, sum);
    } while (--i);

    /* epilogue: final iteration, identical math but no next-row prefetch */
    vf = vec_perm(vfb, vfc, permF0);
    vfa = vec_perm(vfb, vfc, permF1);
    vfb = vec_perm(l0, l1, permF0);
    vfc = vec_perm(l0, l1, permF1);

    vbb = vec_ld(0, pb);   /* use vbb & vbc as temp for vb & vba */
    vbc = vec_ld(16, pb);

    l0 = vec_ld(0, pby);
    l1 = vec_ld(16, pby);

    /* (unsigned short[]) pf[0-7] */
    fH = vu16(vec_mergeh(zero, vf));

    /* (unsigned short[]) pf[8-15] */
    fL = vu16(vec_mergel(zero, vf));

    /* (unsigned short[]) pfa[0-7] */
    tH = vu16(vec_mergeh(zero, vfa));

    /* (unsigned short[]) pfa[8-15] */
    tL = vu16(vec_mergel(zero, vfa));

    /* pf[i] + pfa[i] */
    fH = vec_add(fH, tH);
    fL = vec_add(fL, tL);

    /* (unsigned short[]) pfb[0-7] */
    tH = vu16(vec_mergeh(zero, vfb));

    /* (unsigned short[]) pfb[8-15] */
    tL = vu16(vec_mergel(zero, vfb));

    /* (pf[i]+pfa[i]) + pfb[i] */
    fH = vec_add(fH, tH);
    fL = vec_add(fL, tL);

    /* (unsigned short[]) pfc[0-7] */
    tH = vu16(vec_mergeh(zero, vfc));

    /* (unsigned short[]) pfc[8-15] */
    tL = vu16(vec_mergel(zero, vfc));

    /* (pf[i]+pfa[i]+pfb[i]) + pfc[i] */
    fH = vec_add(fH, tH);
    fL = vec_add(fL, tL);

    /* (pf[i]+pfa[i]+pfb[i]+pfc[i]) + 2 */
    fH = vec_add(fH, two);
    fL = vec_add(fL, two);

    /* (pf[i]+pfa[i]+pfb[i]+pfc[i]+2) >> 2 */
    fH = vec_sra(fH, two);
    fL = vec_sra(fL, two);

    lR = vec_ld(0, p2);

    vb = vec_perm(vbb, vbc, permB0);
    vba = vec_perm(vbb, vbc, permB1);
    vbb = vec_perm(l0, l1, permB0);
    vbc = vec_perm(l0, l1, permB1);

    /* (unsigned short[]) pb[0-7] */
    bH = vu16(vec_mergeh(zero, vb));

    /* (unsigned short[]) pb[8-15] */
    bL = vu16(vec_mergel(zero, vb));

    /* (unsigned short[]) pba[0-7] */
    tH = vu16(vec_mergeh(zero, vba));

    /* (unsigned short[]) pba[8-15] */
    tL = vu16(vec_mergel(zero, vba));

    /* pb[i] + pba[i] */
    bH = vec_add(bH, tH);
    bL = vec_add(bL, tL);

    /* (unsigned short[]) pbb[0-7] */
    tH = vu16(vec_mergeh(zero, vbb));

    /* (unsigned short[]) pbb[8-15] */
    tL = vu16(vec_mergel(zero, vbb));

    /* (pb[i]+pba[i]) + pbb[i] */
    bH = vec_add(bH, tH);
    bL = vec_add(bL, tL);

    /* (unsigned short[]) pbc[0-7] */
    tH = vu16(vec_mergeh(zero, vbc));

    /* (unsigned short[]) pbc[8-15] */
    tL = vu16(vec_mergel(zero, vbc));

    /* (pb[i]+pba[i]+pbb[i]) + pbc[i] */
    bH = vec_add(bH, tH);
    bL = vec_add(bL, tL);

    /* (pb[i]+pba[i]+pbb[i]+pbc[i]) + 2 */
    bH = vec_add(bH, two);
    bL = vec_add(bL, two);

    /* (pb[i]+pba[i]+pbb[i]+pbc[i]+2) >> 2 */
    bH = vec_sra(bH, two);
    bL = vec_sra(bL, two);

    /* ((pf[i]+pfa[i]+pfb[i]+pfc[i]+2)>>2) +
     * ((pb[i]+pba[i]+pbb[i]+pbc[i]+2)>>2)   */
    tH = vec_add(fH, bH);
    tL = vec_add(fL, bL);

    /* (((pf[i]+pfa[i]+pfb[i]+pfc[i]+2)>>2)+
     *  ((pb[i]+pba[i]+pbb[i]+pbc[i]+2)>>2)) + 1 */
    tH = vec_add(tH, one);
    tL = vec_add(tL, one);

    /* (((pf[i]+pfa[i]+pfb[i]+pfc[i]+2)>>2)+
     *  ((pb[i]+pba[i]+pbb[i]+pbc[i]+2)>>2)+1) >> 1 */
    tH = vec_sra(tH, one);
    tL = vec_sra(tL, one);

    /* d = abs( ((((pf[i]+pfa[i]+pfb[i]+pfc[i]+2)>>2)+
     *            ((pb[i]+pba[i]+pbb[i]+pbc[i]+2)>>2)+1)>>1) - p2[i] ) */
    tH = vu16(vec_packsu(tH, tL));
    min = vec_min(vu8(tH), lR);
    max = vec_max(vu8(tH), lR);
    dif = vec_sub(max, min);

    /* sum += d */
    sum = vec_sum4s(dif, sum);

    /* sum all parts of difference into one 32 bit quantity */
    vo.v = vec_sums(vs32(sum), vs32(zero));

    AMBER_STOP;
    return vo.s.sum;
}
/* MPEG-2 intra-block inverse quantization:
 *   dst[i] = clamp(sign(src[i]) * ((|src[i]| * qmat[i] * mquant) >> 4),
 *                  -2048, 2047)
 * The DC term dst[0] is instead src[0] << (3 - dc_prec), and mismatch
 * control toggles the LSB of dst[63] when the coefficient sum is even.
 *
 * The "#if 1" path is software-pipelined (next block's vec_ld issued
 * inside the loop, plus an unrolled final iteration); the "#else" path
 * is the straightforward reference loop.  Statement order in the
 * pipelined path is load-latency-critical — do not reorder. */
void iquant_intra_m2_altivec(IQUANT_INTRA_PDECL)
{
    int i;
    vector signed short vsrc;
    uint16_t *qmat;
    vector unsigned short vqmat;
    vector unsigned short vmquant;
    vector bool short ltzero;
    vector signed short val, t0;
    vector signed short zero;
    vector unsigned int four;
    vector signed short min, max;
    vector signed int vsum;
    int sum;
    int offset, offset2;
    int16_t dst0;
    /* union overlays: splat a scalar mquant into a vector, and read the
       scalar sum out of element 3 of the vec_sums result */
    union {
        vector unsigned short vu16;
        unsigned short mquant;
        vector signed int vs32;
        struct {
            signed int pad[3];
            signed int sum;
        } s;
    } vu;
#ifdef ALTIVEC_DST
    DataStreamControl dsc;
#endif

#ifdef ALTIVEC_VERIFY /* {{{ */
    if (NOT_VECTOR_ALIGNED(wsp->intra_q_mat))
        mjpeg_error_exit1("iquant_intra_m2: wsp->intra_q_mat %% 16 != 0, (%d)",
            wsp->intra_q_mat);

    if (NOT_VECTOR_ALIGNED(src))
        mjpeg_error_exit1("iquant_intra_m2: src %% 16 != 0, (%d)", src);

    if (NOT_VECTOR_ALIGNED(dst))
        mjpeg_error_exit1("iquant_intra_m2: dst %% 16 != 0, (%d)", dst);

    for (i = 0; i < 64; i++)
        if (src[i] < -256 || src[i] > 255)
            mjpeg_error_exit1("iquant_intra_m2: -256 > src[%i] > 255, (%d)",
                i, src[i]);
#endif /* }}} */

    AMBER_START;

    dst0 = src[0] << (3 - dc_prec);

    qmat = (uint16_t*)wsp->intra_q_mat;

#ifdef ALTIVEC_DST
    dsc.control = DATA_STREAM_CONTROL(64/8,1,0);
    vec_dst(src, dsc.control, 0);
    vec_dst(qmat, dsc.control, 1);
#endif

    /* vmquant = (vector unsigned short)(mquant); */
    vu.mquant = (unsigned short)mquant;
    vmquant = vec_splat(vu.vu16, 0);

    vsum = vec_splat_s32(0);
    zero = vec_splat_s16(0);
    four = vec_splat_u32(4);
    /* max = (2047); min = (-2048); {{{ */
    vu8(max) = vec_splat_u8(0x7);
    t0 = vec_splat_s16(-1); /* 0xffff */
    vu8(max) = vec_mergeh(vu8(max), vu8(t0)); /* 0x07ff == 2047 */
    min = vec_sub(t0, max);
    /* }}} */
    offset = 0;

#if 1
    vsrc = vec_ld(offset, (signed short*)src);
    vqmat = vec_ld(offset, (unsigned short*)qmat);
    i = (64/8) - 1;
    do {
        /* intra_q[i] * mquant */
        vu16(vqmat) = vec_mulo(vu8(vqmat), vu8(vmquant));

        /* save sign */
        ltzero = vec_cmplt(vsrc, zero);

        /* val = abs(src) */
        t0 = vec_sub(zero, vsrc);
        val = vec_max(t0, vsrc);

        /* val = (src * quant) >> 4 — even/odd 32-bit products, shifted,
           packed and re-interleaved back into 16-bit lanes */
        vs32(t0) = vec_mule(val, vs16(vqmat));
        vs32(val) = vec_mulo(val, vs16(vqmat));
        vs32(t0) = vec_sra(vs32(t0), four);
        vs16(t0) = vec_pack(vs32(t0), vs32(t0));
        vs32(val) = vec_sra(vs32(val), four);
        vs16(val) = vec_pack(vs32(val), vs32(val));
        val = vec_mergeh(vs16(t0), vs16(val));

        /* pipelined load of the next 8 coefficients */
        offset2 = offset;
        offset += 8*sizeof(int16_t);
        vsrc = vec_ld(offset, (signed short*)src);
        vqmat = vec_ld(offset, (unsigned short*)qmat);

        /* restore sign */
        t0 = vec_sub(zero, val);
        val = vec_sel(val, t0, ltzero);

        /* val = (val > 2047) ? ((val < -2048) ? -2048 : val); */
        val = vec_min(val, max);
        val = vec_max(val, min);

        vsum = vec_sum4s(val, vsum);

        vec_st(val, offset2, dst);
    } while (--i);

    /* unrolled final iteration (no further loads) */
    /* intra_q[i] * mquant */
    vu16(vqmat) = vec_mulo(vu8(vqmat), vu8(vmquant));

    /* save sign */
    ltzero = vec_cmplt(vsrc, zero);

    /* val = abs(src) */
    t0 = vec_sub(zero, vsrc);
    val = vec_max(t0, vsrc);

    /* val = (src * quant) >> 4 */
    vs32(t0) = vec_mule(val, vs16(vqmat));
    vs32(val) = vec_mulo(val, vs16(vqmat));
    vs32(t0) = vec_sra(vs32(t0), four);
    vs16(t0) = vec_pack(vs32(t0), vs32(t0));
    vs32(val) = vec_sra(vs32(val), four);
    vs16(val) = vec_pack(vs32(val), vs32(val));
    val = vec_mergeh(vs16(t0), vs16(val));

    /* restore sign */
    t0 = vec_sub(zero, val);
    val = vec_sel(val, t0, ltzero);

    /* val = (val > 2047) ? ((val < -2048) ? -2048 : val); */
    val = vec_min(val, max);
    val = vec_max(val, min);

    vsum = vec_sum4s(val, vsum);

    vec_st(val, offset, dst);
#else /* {{{ */
    i = (64/8);
    do {
        vsrc = vec_ld(offset, (signed short*)src);
        vqmat = vec_ld(offset, (unsigned short*)qmat);

        /* intra_q[i] * mquant */
        vu16(vqmat) = vec_mulo(vu8(vqmat), vu8(vmquant));

        /* save sign */
        ltzero = vec_cmplt(vsrc, zero);

        /* val = abs(src) */
        t0 = vec_sub(zero, vsrc);
        val = vec_max(t0, vsrc);

        /* val = (src * quant) >> 4 */
        vs32(t0) = vec_mule(val, vs16(vqmat));
        vs32(val) = vec_mulo(val, vs16(vqmat));
        vs32(t0) = vec_sra(vs32(t0), four);
        vs16(t0) = vec_pack(vs32(t0), vs32(t0));
        vs32(val) = vec_sra(vs32(val), four);
        vs16(val) = vec_pack(vs32(val), vs32(val));
        val = vec_mergeh(vs16(t0), vs16(val));

        /* restore sign */
        t0 = vec_sub(zero, val);
        val = vec_sel(val, t0, ltzero);

        /* val = (val > 2047) ? ((val < -2048) ? -2048 : val); */
        val = vec_min(val, max);
        val = vec_max(val, min);

        vsum = vec_sum4s(val, vsum);

        vec_st(val, offset, dst);

        offset += 8*sizeof(int16_t);
    } while (--i);
    /* }}} */
#endif

    /* scalar coefficient sum for mismatch control; swap the vector DC
       contribution for the true DC term dst0 */
    vu.vs32 = vec_sums(vsum, vs32(zero));
    sum = vu.s.sum;
    sum -= dst[0];
    sum += dst0;

    dst[0] = dst0;

    /* mismatch control: force the coefficient sum odd via dst[63]'s LSB */
#if 1
    dst[63] ^= !(sum & 1);
#else
    if ((sum & 1) == 0)
        dst[63] ^= 1;
#endif

    AMBER_STOP;
}
/* SAD of an 8xh block.  Each row is loaded as 16 bytes and the unwanted
 * upper 8 bytes are masked to zero, so they contribute nothing to the sum.
 * NOTE(review): perm1/perm2 are computed once from the initial pointers;
 * this presumably assumes line_size keeps the 16-byte alignment offset
 * constant across rows — confirm against the callers' stride contract.
 * The first parameter `v` (codec context) is unused here. */
static int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    const vector unsigned char permclear =
        (vector unsigned char)
        {255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0};
    vector unsigned char perm1 = vec_lvsl(0, pix1);
    vector unsigned char perm2 = vec_lvsl(0, pix2);
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int) vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2.
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        vector unsigned char pix1l = vec_ld( 0, pix1);
        vector unsigned char pix1r = vec_ld(15, pix1);
        vector unsigned char pix2l = vec_ld( 0, pix2);
        vector unsigned char pix2r = vec_ld(15, pix2);
        t1 = vec_and(vec_perm(pix1l, pix1r, perm1), permclear);
        t2 = vec_and(vec_perm(pix2l, pix2r, perm2), permclear);

        /* Calculate a sum of abs differences vector. */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

/* Sum of squared pixel values over a 16x16 block (used for variance /
 * norm computations).  Rows may be unaligned. */
static int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i;
    int s;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix);
    vector unsigned char pixv;
    vector unsigned int sv;
    vector signed int sum;

    sv = (vector unsigned int) vec_splat_u32(0);

    s = 0;
    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels. */
        vector unsigned char pixl = vec_ld( 0, pix);
        vector unsigned char pixr = vec_ld(15, pix);
        pixv = vec_perm(pixl, pixr, perm);

        /* Square the values, and add them to our sum
           (vec_msum multiplies each byte by itself and accumulates). */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s. */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}
/* SAD of a 16xh block against the 2D (diagonal) half-pel interpolation
 * (pix2[x] + pix2[x+1] + pix2[x+stride] + pix2[x+stride+1] + 2) >> 2.
 * The averaging is done by hand in 16-bit lanes because vec_avg's
 * round-up would give wrong results for the 4-tap case (see comment in
 * the loop).  Pipelined: each row's t3/t4 partial sums are carried into
 * the next iteration as t1/t2, halving the interpolation work.
 * pix1 is loaded with a plain vec_ld and so is presumably 16-byte
 * aligned by the caller — TODO confirm.  `v` (codec context) is unused. */
static int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    uint8_t *pix3 = pix2 + line_size;
    const vector unsigned char zero =
        (const vector unsigned char) vec_splat_u8(0);
    const vector unsigned short two =
        (const vector unsigned short) vec_splat_u16(2);
    vector unsigned char avgv, t5;
    vector unsigned char perm1 = vec_lvsl(0, pix2);
    vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
    vector unsigned char pix2l, pix2r;
    vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
    vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
    vector unsigned short avghv, avglv;
    vector unsigned short t1, t2, t3, t4;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int) vec_splat_u32(0);

    s = 0;

    /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, as well
       as some splitting, and vector addition each time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0] - pix2[15] pix2iv: pix2[1] - pix2[16]
       Split the pixel vectors into shorts. */
    pix2l  = vec_ld( 0, pix2);
    pix2r  = vec_ld(16, pix2);
    pix2v  = vec_perm(pix2l, pix2r, perm1);
    pix2iv = vec_perm(pix2l, pix2r, perm2);

    pix2hv  = (vector unsigned short) vec_mergeh(zero, pix2v);
    pix2lv  = (vector unsigned short) vec_mergel(zero, pix2v);
    pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
    pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
    t1 = vec_add(pix2hv, pix2ihv);
    t2 = vec_add(pix2lv, pix2ilv);

    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0] - pix1[15]
           pix3v: pix3[0] - pix3[15] pix3iv: pix3[1] - pix3[16] */
        pix1v  = vec_ld(0, pix1);

        pix2l  = vec_ld( 0, pix3);
        pix2r  = vec_ld(16, pix3);
        pix3v  = vec_perm(pix2l, pix2r, perm1);
        pix3iv = vec_perm(pix2l, pix2r, perm2);

        /* Note that AltiVec does have vec_avg, but this works on vector
           pairs and rounds up. We could do avg(avg(a, b), avg(c, d)),
           but the rounding would mean that, for example,
           avg(3, 0, 0, 1) = 2, when it should be 1. Instead, we have to
           split the pixel vectors into vectors of shorts and do the
           averaging by hand. */

        /* Split the pixel vectors into shorts. */
        pix3hv  = (vector unsigned short) vec_mergeh(zero, pix3v);
        pix3lv  = (vector unsigned short) vec_mergel(zero, pix3v);
        pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
        pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);

        /* Do the averaging on them. */
        t3 = vec_add(pix3hv, pix3ihv);
        t4 = vec_add(pix3lv, pix3ilv);

        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);

        /* Pack the shorts back into a result. */
        avgv = vec_pack(avghv, avglv);

        /* Calculate a sum of abs differences vector. */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix3 += line_size;
        /* Transfer the calculated values for pix3 into pix2. */
        t1 = t3;
        t2 = t4;
    }
    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
/*
 * Decide between frame DCT and field DCT for a 16x16 luma macroblock by
 * comparing top-field and bottom-field prediction-error statistics.
 *
 * Accumulates, over the 8 line pairs of the macroblock, the sums,
 * sums-of-squares, and cross products of the top-field and bottom-field
 * prediction errors (cur - pred), then estimates the top/bottom correlation.
 * If the fields are well correlated, FRAME_DCT is chosen; otherwise
 * FIELD_DCT.
 *
 * Parameters come from FIELD_DCT_BEST_PDECL (defined elsewhere); the code
 * reads cur_lum_mb, pred_lum_mb and stride, all of which must be 16-byte
 * aligned (checked when ALTIVEC_VERIFY is defined).
 *
 * Returns FIELD_DCT or FRAME_DCT.
 */
int field_dct_best_altivec(FIELD_DCT_BEST_PDECL)
{
    /*
     * calculate prediction error (cur-pred) for top (blk0)
     * and bottom field (blk1)
     */
    double r, d;
    int sumtop, sumbot, sumsqtop, sumsqbot, sumbottop;
    int topvar, botvar;
    int whichdct;
    int i;

    vector unsigned char ct, pt, cb, pb;
    vector unsigned char *ctp, *ptp, *cbp, *pbp;
    unsigned int offset, stride2;
    vector signed short cur, pred;
    vector signed short dift, difb;
    vector signed int vsumtop, vsumbot, vsumsqtop, vsumsqbot, vsumbottop;
    vector signed int t0, t1, t2, t3;
    vector signed int zero;
    /* Union used to read individual 32-bit lanes of the accumulator
       vectors back into scalar ints without unaligned stores. */
    union {
        vector signed int v;
        struct {
            signed int top;
            signed int bot;
            signed int sqtop;
            signed int sqbot;
        } sum;
        struct {
            signed int pad[3];
            signed int sum;
        } bottop;
    } vo;

    AMBER_START;

#ifdef ALTIVEC_VERIFY
    if (NOT_VECTOR_ALIGNED(cur_lum_mb))
        mjpeg_error_exit1("field_dct_best: cur_lum_mb %% 16 != 0, (%d)\n", cur_lum_mb);

    if (NOT_VECTOR_ALIGNED(pred_lum_mb))
        mjpeg_error_exit1("field_dct_best: pred_lum_mb %% 16 != 0, (%d)\n", pred_lum_mb);

    if (NOT_VECTOR_ALIGNED(stride))
        mjpeg_error_exit1("field_dct_best: stride %% 16 != 0, (%d)\n", stride);
#endif

    zero = vec_splat_s32(0);
    vsumtop = vec_splat_s32(0);
    vsumbot = vec_splat_s32(0);
    vsumsqtop = vec_splat_s32(0);
    vsumsqbot = vec_splat_s32(0);
    vsumbottop = vec_splat_s32(0);

    /* Top field rows start at the base pointers, bottom field rows one
       line below; stepping by 2*stride visits each field's rows. */
    ctp = (vector unsigned char*) cur_lum_mb;
    ptp = (vector unsigned char*) pred_lum_mb;
    cbp = (vector unsigned char*)(cur_lum_mb + stride);
    pbp = (vector unsigned char*)(pred_lum_mb + stride);
    offset = 0;
    stride2 = stride << 1;

#if 1
    /* Software-pipelined version: loads for the next line pair are issued
       in the middle of processing the current one. The first loads happen
       before the loop; the final iteration's arithmetic is peeled out
       after it (hence the 16/2 - 1 trip count). */
    ct = vec_ld(offset, ctp);
    pt = vec_ld(offset, ptp);
    cb = vec_ld(offset, cbp);
    pb = vec_ld(offset, pbp);

    i = 16/2 - 1;
    do {
        /* High half: zero-extend bytes to shorts, form field differences. */
        cur = (vector signed short)vec_mergeh(vu8(zero), ct);
        pred = (vector signed short)vec_mergeh(vu8(zero), pt);
        dift = vec_sub(cur, pred);
        cur = (vector signed short)vec_mergeh(vu8(zero), cb);
        pred = (vector signed short)vec_mergeh(vu8(zero), pb);
        difb = vec_sub(cur, pred);
        /* Accumulate sum, sum of squares and top*bottom cross product. */
        vsumtop = vec_sum4s(dift, vsumtop);
        vsumbot = vec_sum4s(difb, vsumbot);
        vsumsqtop = vec_msum(dift, dift, vsumsqtop);
        vsumsqbot = vec_msum(difb, difb, vsumsqbot);
        vsumbottop = vec_msum(dift, difb, vsumbottop);

        /* Low half of the same line pair. */
        cur = (vector signed short)vec_mergel(vu8(zero), ct);
        pred = (vector signed short)vec_mergel(vu8(zero), pt);
        dift = vec_sub(cur, pred);
        cur = (vector signed short)vec_mergel(vu8(zero), cb);
        pred = (vector signed short)vec_mergel(vu8(zero), pb);
        difb = vec_sub(cur, pred);

        /* Kick off the next line pair's loads early to hide latency. */
        offset += stride2;
        ct = vec_ld(offset, ctp);
        pt = vec_ld(offset, ptp);
        cb = vec_ld(offset, cbp);
        pb = vec_ld(offset, pbp);

        vsumtop = vec_sum4s(dift, vsumtop);
        vsumbot = vec_sum4s(difb, vsumbot);
        vsumsqtop = vec_msum(dift, dift, vsumsqtop);
        vsumsqbot = vec_msum(difb, difb, vsumsqbot);
        vsumbottop = vec_msum(dift, difb, vsumbottop);
    } while (--i);

    /* Epilogue: process the final (already-loaded) line pair. */
    cur = (vector signed short)vec_mergeh(vu8(zero), ct);
    pred = (vector signed short)vec_mergeh(vu8(zero), pt);
    dift = vec_sub(cur, pred);
    cur = (vector signed short)vec_mergeh(vu8(zero), cb);
    pred = (vector signed short)vec_mergeh(vu8(zero), pb);
    difb = vec_sub(cur, pred);
    vsumtop = vec_sum4s(dift, vsumtop);
    vsumbot = vec_sum4s(difb, vsumbot);
    vsumsqtop = vec_msum(dift, dift, vsumsqtop);
    vsumsqbot = vec_msum(difb, difb, vsumsqbot);
    vsumbottop = vec_msum(dift, difb, vsumbottop);

    cur = (vector signed short)vec_mergel(vu8(zero), ct);
    pred = (vector signed short)vec_mergel(vu8(zero), pt);
    dift = vec_sub(cur, pred);
    cur = (vector signed short)vec_mergel(vu8(zero), cb);
    pred = (vector signed short)vec_mergel(vu8(zero), pb);
    difb = vec_sub(cur, pred);
    vsumtop = vec_sum4s(dift, vsumtop);
    vsumbot = vec_sum4s(difb, vsumbot);
    vsumsqtop = vec_msum(dift, dift, vsumsqtop);
    vsumsqbot = vec_msum(difb, difb, vsumsqbot);
    vsumbottop = vec_msum(dift, difb, vsumbottop);
#else
    /* Straightforward (non-pipelined) reference version of the loop above. */
    for (i = 0; i < 16/2; i++) { /* {{{ */
        ct = vec_ld(offset, ctp);
        pt = vec_ld(offset, ptp);
        cb = vec_ld(offset, cbp);
        pb = vec_ld(offset, pbp);

        cur = (vector signed short)vec_mergeh(vu8(zero), ct);
        pred = (vector signed short)vec_mergeh(vu8(zero), pt);
        dift = vec_sub(cur, pred);
        cur = (vector signed short)vec_mergeh(vu8(zero), cb);
        pred = (vector signed short)vec_mergeh(vu8(zero), pb);
        difb = vec_sub(cur, pred);
        vsumtop = vec_sum4s(dift, vsumtop);
        vsumbot = vec_sum4s(difb, vsumbot);
        vsumsqtop = vec_msum(dift, dift, vsumsqtop);
        vsumsqbot = vec_msum(difb, difb, vsumsqbot);
        vsumbottop = vec_msum(dift, difb, vsumbottop);

        cur = (vector signed short)vec_mergel(vu8(zero), ct);
        pred = (vector signed short)vec_mergel(vu8(zero), pt);
        dift = vec_sub(cur, pred);
        cur = (vector signed short)vec_mergel(vu8(zero), cb);
        pred = (vector signed short)vec_mergel(vu8(zero), pb);
        difb = vec_sub(cur, pred);
        vsumtop = vec_sum4s(dift, vsumtop);
        vsumbot = vec_sum4s(difb, vsumbot);
        vsumsqtop = vec_msum(dift, dift, vsumsqtop);
        vsumsqbot = vec_msum(difb, difb, vsumsqbot);
        vsumbottop = vec_msum(dift, difb, vsumbottop);

        offset += stride2;
    } /* }}} */
#endif

    /* transpose [sumtop, sumbot, sumsqtop, sumsqbot] {{{ */
    t0 = vec_mergel(vsumtop, vsumsqtop);
    t1 = vec_mergeh(vsumtop, vsumsqtop);
    t2 = vec_mergel(vsumbot, vsumsqbot);
    t3 = vec_mergeh(vsumbot, vsumsqbot);
    vsumtop = vec_mergeh(t1, t3);
    vsumbot = vec_mergel(t1, t3);
    vsumsqtop = vec_mergeh(t0, t2);
    vsumsqbot = vec_mergel(t0, t2);
    /* }}} */

    /* sum final values for sumtop, sumbot, sumsqtop, sumsqbot */
    vsumtop = vec_add(vsumtop, vsumbot);
    vsumsqtop = vec_add(vsumsqtop, vsumsqbot);
    vo.v = vec_add(vsumtop, vsumsqtop);

    sumtop = vo.sum.top;
    sumbot = vo.sum.bot;
    sumsqtop = vo.sum.sqtop;
    sumsqbot = vo.sum.sqbot;

    /* vec_sums leaves the full reduction in the last (index 3) lane. */
    vsumbottop = vec_sums(vsumbottop, zero);
    vo.v = vsumbottop;

    /* Calculate Variances top and bottom. If they're of similar sign
       estimate correlation if its good use frame DCT otherwise use field. */
    whichdct = FIELD_DCT;
    r = 0.0;
    /* 128 = number of samples per field (16x16 block / 2 fields). */
    topvar = sumsqtop-sumtop*sumtop/128;
    botvar = sumsqbot-sumbot*sumbot/128;
    if (!((topvar > 0) ^ (botvar > 0))) {
        sumbottop = vo.bottop.sum;

        d = ((double) topvar) * ((double)botvar);
        r = (sumbottop-(sumtop*sumbot)/128);
        if (r > (0.5 * sqrt(d)))
            whichdct = FRAME_DCT;
    }

    AMBER_STOP;

    return whichdct;
}
/** Reduce **/ inline short v_reduce_sum(const v_int16x8& a) { const vec_int4 zero = vec_int4_z; return saturate_cast<short>(vec_extract(vec_sums(vec_sum4s(a.val, zero), zero), 3)); }
/*
 * Produce 2x2- and 4x4-subsampled (box-averaged, rounded) versions of a
 * luma image in one pass.
 *
 * Parameters come from SUBSAMPLE_IMAGE_PDECL (defined elsewhere); the code
 * reads image, sub22_image, sub44_image and rowstride. image, sub22_image
 * and sub44_image must be 16-byte aligned and rowstride a multiple of 64
 * (checked when ALTIVEC_VERIFY is defined). The image height is inferred
 * from the gap between image and sub22_image -- assumes sub22_image is
 * allocated immediately after the source image; TODO confirm with callers.
 *
 * Each outer iteration consumes 4 source rows; each inner iteration
 * consumes a 64-pixel-wide strip, emitting 2 rows x 32 pixels of the 2x2
 * image and 1 row x 16 pixels of the 4x4 image. Loads for the next strip
 * are issued before the current strip's arithmetic to hide memory latency.
 * Note: stride4 is computed but not used below -- presumably left over;
 * verify before removing.
 */
void subsample_image_altivec(SUBSAMPLE_IMAGE_PDECL)
{
    int i, ii, j, stride1, stride2, stride3, stride4, halfstride;
    unsigned char *pB, *pB2, *pB4;
    vector unsigned char l0, l1, l2, l3;
    vector unsigned short s0, s1, s2, s3;
    vector unsigned short s22_0, s22_1, s22_2, s22_3;
    vector unsigned short s44, s44_0, s44_1;
    vector unsigned short zero, two;
#ifdef ALTIVEC_DST
    DataStreamControl dsc;
#endif

#ifdef ALTIVEC_VERIFY
    if (NOT_VECTOR_ALIGNED(image))
        mjpeg_error_exit1("subsample_image: %s %% %d != 0, (%d)", "image", 16, image);
    if (NOT_VECTOR_ALIGNED(sub22_image))
        mjpeg_error_exit1("subsample_image: %s %% %d != 0, (%d)", "sub22_image", 16, sub22_image);
    if (NOT_VECTOR_ALIGNED(sub44_image))
        mjpeg_error_exit1("subsample_image: %s %% %d != 0, (%d)", "sub44_image", 16, sub44_image);
    if ((rowstride & 63) != 0)
        mjpeg_error_exit1("subsample_image: %s %% %d != 0, (%d)", "rowstride", 64, rowstride);
#endif

    AMBER_START;

    pB = image;

#ifdef ALTIVEC_DST
    /* Set up a hardware data-stream prefetch along the source rows. */
    dsc.control = DATA_STREAM_CONTROL(6,4,0);
    dsc.block.stride = rowstride;
    vec_dst(pB, dsc.control, 0);
#endif

    pB2 = sub22_image;
    pB4 = sub44_image;

    j = ((unsigned long)(pB2 - pB) / rowstride) >> 2; /* height/4 */

    stride1 = rowstride;
    stride2 = stride1 + stride1;
    stride3 = stride2 + stride1;
    stride4 = stride2 + stride2;
    halfstride = stride1 >> 1; /* /2 */

    ii = rowstride >> 6; /* rowstride/16/4 */

    zero = vec_splat_u16(0);
    two = vec_splat_u16(2);

    do {
        i = ii;
        do {
            /* Chunk 1 of 4: rows 0-3, bytes 0-15 of the strip. */
            l0 = vec_ld(0, pB);
            l1 = vec_ld(stride1, pB);
            l2 = vec_ld(stride2, pB);
            l3 = vec_ld(stride3, pB);
            pB += 16;
#ifdef ALTIVEC_DST
            vec_dst(pB + (16 * 3), dsc.control, 0);
#endif
            /* l0 = 0x[00,01,02,03,04,05,06,07,08,09,0A,0B,0C,0D,0E,0F] */
            /* l1 = 0x[10,11,12,13,14,15,16,17,18,19,1A,1B,1C,1D,1E,1F] */
            /* l2 = 0x[20,21,22,23,24,25,26,27,28,29,2A,2B,2C,2D,2E,2F] */
            /* l3 = 0x[30,31,32,33,34,35,36,37,38,39,3A,3B,3C,3D,3E,3F] */

            /* Interleave rows so that vec_sum4s over bytes adds each
               horizontal pair from row N with the pair below it in
               row N+1 -- one instruction per four 2x2 boxes. */
            /* s0 = 0x[00,01, 02,03, 04,05, 06,07, ] */
            /*       [ 10,11, 12,13, 14,15, 16,17] */
            s0 = vu16(vec_mergeh(vu16(l0), vu16(l1)));
            /* s0 = 0x[00+01+10+11,02+03+12+13,04+05+14+15,06+07+16+17] */
            s0 = vu16(vec_sum4s(vu8(s0), vu32(zero)));
            /* s1 = 0x[08,09, 0A,0B, 0C,0D, 0E,0F, ] */
            /*       [ 18,19, 1A,1B, 1C,1D, 1E,1F] */
            s1 = vu16(vec_mergel(vu16(l0), vu16(l1)));
            /* s1 = 0x[08+09+18+19,0A+0B+1A+1B,0C+0D+1C+1D,0E+0F+1E+1F] */
            s1 = vu16(vec_sum4s(vu8(s1), vu32(zero)));
            /* s2 = 0x[20,21, 22,23, 24,25, 26,27, ] */
            /*       [ 30,31, 32,33, 34,35, 36,37] */
            s2 = vu16(vec_mergeh(vu16(l2), vu16(l3)));
            /* s2 = 0x[20+21+30+31,22+23+32+33,24+25+34+35,26+27+36+37] */
            s2 = vu16(vec_sum4s(vu8(s2), vu32(zero)));
            /* s3 = 0x[28,29, 2A,2B, 2C,2D, 2E,2F, ] */
            /*       [ 38,39, 3A,3B, 3C,3D, 3E,3F] */
            s3 = vu16(vec_mergel(vu16(l2), vu16(l3)));
            /* s3 = 0x[28+29+38+39,2A+2B+3A+3B,2C+2D+3C+3D,2E+2F+3E+3F] */
            s3 = vu16(vec_sum4s(vu8(s3), vu32(zero)));

            /* start loading next block */
            l0 = vec_ld(0, pB);
            l1 = vec_ld(stride1, pB);
            l2 = vec_ld(stride2, pB);
            l3 = vec_ld(stride3, pB);
            pB += 16;

            /* s0 = 0x[00+01+10+11, 02+03+12+13, 04+05+14+15, 06+07+16+17] */
            /* s1 = 0x[08+09+18+19, 0A+0B+1A+1B, 0C+0D+1C+1D, 0E+0F+1E+1F] */
            /* s2 = 0x[20+21+30+31, 22+23+32+33, 24+25+34+35, 26+27+36+37] */
            /* s3 = 0x[28+29+38+39, 2A+2B+3A+3B, 2C+2D+3C+3D, 2E+2F+3E+3F] */

            /* s22_0 = 0x[ 00, 02, 04, 06, 08, 0A, 0C, 0E] */
            s22_0 = vec_packsu(vu32(s0), vu32(s1));
            /* s22_1 = 0x[ 20, 22, 24, 26, 28, 2A, 2C, 2E] */
            s22_1 = vec_packsu(vu32(s2), vu32(s3));

            /* Rounded average of each 2x2 box: (sum + 2) >> 2. */
            /* (pB[i]+pB[i+1]+pN[i]+pN[i+1]) + 2 */
            s22_0 = vec_add(s22_0, two);
            /* (pNN[i]+pNN[i+1]+pNNN[i]+pNNN[i+1]) + 2 */
            s22_1 = vec_add(s22_1, two);
            /* (pB[i]+pB[i+1]+pN[i]+pN[i+1]+2) >> 2 */
            s22_0 = vec_sra(s22_0, two);
            /* (pNN[i]+pNN[i+1]+pNNN[i]+pNNN[i+1]+2) >> 2 */
            s22_1 = vec_sra(s22_1, two);

            /* The 4x4 result is built by box-averaging the 2x2 results:
               add vertical pairs, then sum horizontal pairs via vec_sum4s. */
            /* s22_0 = 0x[ 00, 02, 04, 06, 08, 0A, 0C, 0E] */
            /* s22_1 = 0x[ 20, 22, 24, 26, 28, 2A, 2C, 2E] */
            /* s44_0 = 0x[00+20,02+22,04+24,06+26,08+28,0A+2A,0C+2C,0E+2E] */
            s44_0 = vec_add(s22_0, s22_1);
            /* s44_0 = 0x[00+20+02+22, 04+24+06+26, 08+28+0A+2A, 0C+2C+0E+2E] */
            s44_0 = vu16(vec_sum4s(vs16(s44_0), vs32(zero)));

            /* Chunk 2 of 4: same computation on bytes 16-31. - - - - - - */
            s0 = vu16(vec_mergeh(vu16(l0), vu16(l1)));
            s0 = vu16(vec_sum4s(vu8(s0), vu32(zero)));
            s1 = vu16(vec_mergel(vu16(l0), vu16(l1)));
            s1 = vu16(vec_sum4s(vu8(s1), vu32(zero)));
            s2 = vu16(vec_mergeh(vu16(l2), vu16(l3)));
            s2 = vu16(vec_sum4s(vu8(s2), vu32(zero)));
            s3 = vu16(vec_mergel(vu16(l2), vu16(l3)));
            s3 = vu16(vec_sum4s(vu8(s3), vu32(zero)));

            /* start loading next l[0-3] */
            l0 = vec_ld(0, pB);
            l1 = vec_ld(stride1, pB);
            l2 = vec_ld(stride2, pB);
            l3 = vec_ld(stride3, pB);
            pB += 16;

            s22_2 = vec_packsu(vu32(s0), vu32(s1));
            s22_3 = vec_packsu(vu32(s2), vu32(s3));
            s22_2 = vec_add(s22_2, two);
            s22_3 = vec_add(s22_3, two);
            s22_2 = vec_sra(s22_2, two);
            s22_3 = vec_sra(s22_3, two);
            s44_1 = vec_add(s22_2, s22_3);
            s44_1 = vu16(vec_sum4s(vs16(s44_1), vs32(zero)));

            /* store s22 block (two output rows: top row at offset 0,
               row below at halfstride) */
            s22_0 = vu16(vec_packsu(s22_0, s22_2));
            s22_1 = vu16(vec_packsu(s22_1, s22_3));
            vec_st(vu8(s22_0), 0, pB2);
            vec_st(vu8(s22_1), halfstride, pB2);
            pB2 += 16;

            /* Chunk 3 of 4: bytes 32-47. - - - - - - - - - - - - - - - */
            s0 = vu16(vec_mergeh(vu16(l0), vu16(l1)));
            s0 = vu16(vec_sum4s(vu8(s0), vu32(zero)));
            s1 = vu16(vec_mergel(vu16(l0), vu16(l1)));
            s1 = vu16(vec_sum4s(vu8(s1), vu32(zero)));
            s2 = vu16(vec_mergeh(vu16(l2), vu16(l3)));
            s2 = vu16(vec_sum4s(vu8(s2), vu32(zero)));
            s3 = vu16(vec_mergel(vu16(l2), vu16(l3)));
            s3 = vu16(vec_sum4s(vu8(s3), vu32(zero)));

            /* starting loading next l[0-3] */
            l0 = vec_ld(0, pB);
            l1 = vec_ld(stride1, pB);
            l2 = vec_ld(stride2, pB);
            l3 = vec_ld(stride3, pB);
            pB += 16;

            s22_0 = vec_packsu(vu32(s0), vu32(s1));
            s22_1 = vec_packsu(vu32(s2), vu32(s3));
            s22_0 = vec_add(s22_0, two);
            s22_1 = vec_add(s22_1, two);
            s22_0 = vec_sra(s22_0, two);
            s22_1 = vec_sra(s22_1, two);

            /* Finish the first half of the 4x4 row: round the raw sums
               accumulated in chunks 1-2 and keep them in s44. */
            s44 = vec_packsu(vu32(s44_0), vu32(s44_1));
            s44 = vec_add(s44, two);
            s44 = vec_sra(s44, two);

            s44_0 = vec_add(s22_0, s22_1);
            s44_0 = vu16(vec_sum4s(vs16(s44_0), vs32(zero)));

            /* Chunk 4 of 4: bytes 48-63 (no further loads needed). - - - */
            s0 = vu16(vec_mergeh(vu16(l0), vu16(l1)));
            s0 = vu16(vec_sum4s(vu8(s0), vu32(zero)));
            s1 = vu16(vec_mergel(vu16(l0), vu16(l1)));
            s1 = vu16(vec_sum4s(vu8(s1), vu32(zero)));
            s2 = vu16(vec_mergeh(vu16(l2), vu16(l3)));
            s2 = vu16(vec_sum4s(vu8(s2), vu32(zero)));
            s3 = vu16(vec_mergel(vu16(l2), vu16(l3)));
            s3 = vu16(vec_sum4s(vu8(s3), vu32(zero)));

            s22_2 = vec_packsu(vu32(s0), vu32(s1));
            s22_3 = vec_packsu(vu32(s2), vu32(s3));
            s22_2 = vec_add(s22_2, two);
            s22_3 = vec_add(s22_3, two);
            s22_2 = vec_sra(s22_2, two);
            s22_3 = vec_sra(s22_3, two);
            s44_1 = vec_add(s22_2, s22_3);
            s44_1 = vu16(vec_sum4s(vs16(s44_1), vs32(zero)));

            /* store s22 block */
            s22_0 = vu16(vec_packsu(s22_0, s22_2));
            s22_1 = vu16(vec_packsu(s22_1, s22_3));
            vec_st(vu8(s22_0), 0, pB2);
            vec_st(vu8(s22_1), halfstride, pB2);
            pB2 += 16;

            /* pack all four s44 chunks */
            s44_0 = vec_packsu(vu32(s44_0), vu32(s44_1));
            s44_0 = vec_add(s44_0, two);
            s44_0 = vec_sra(s44_0, two);
            s44 = vu16(vec_packsu(s44, s44_0));
            vec_st(vu8(s44), 0, pB4);
            pB4 += 16;
        } while (--i);

        /* pB has advanced one full row; skip the remaining 3 source rows.
           pB2 has written one sub22 row; skip the second one. */
        pB += stride3;
        pB2 += halfstride;
    } while (--j);

#ifdef ALTIVEC_DST
    vec_dss(0);
#endif

    AMBER_STOP;
}