static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
                                                    const int16_t *v2,
                                                    const int16_t *v3,
                                                    int order, int mul)
{
    LOAD_ZERO;
    vec_s16 *pv1 = (vec_s16*)v1;
    vec_s16 *pv2 = (vec_s16*)v2;
    vec_s16 *pv3 = (vec_s16*)v3;
    register vec_s16 muls = {mul,mul,mul,mul,mul,mul,mul,mul};
    register vec_s16 t0, t1, i0, i1;
    register vec_s16 i2 = pv2[0], i3 = pv3[0];
    register vec_s32 res = zero_s32v;
    register vec_u8 align = vec_lvsl(0, v2);
    int32_t ires;

    order >>= 4;
    do {
        t0 = vec_perm(i2, pv2[1], align);
        i2 = pv2[2];
        t1 = vec_perm(pv2[1], i2, align);
        i0 = pv1[0];
        i1 = pv1[1];
        res = vec_msum(t0, i0, res);
        res = vec_msum(t1, i1, res);
        t0 = vec_perm(i3, pv3[1], align);
        i3 = pv3[2];
        t1 = vec_perm(pv3[1], i3, align);
        pv1[0] = vec_mladd(t0, muls, i0);
        pv1[1] = vec_mladd(t1, muls, i1);
        pv1 += 2;
        pv2 += 2;
        pv3 += 2;
    } while (--order);
    res = vec_splat(vec_sums(res, zero_s32v), 3);
    vec_ste(res, 0, &ires);

    return ires;
}
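/* For reference, a minimal scalar sketch of the contract the AltiVec routine
 * above implements (illustration only, not part of the original source):
 * return the dot product of v1 and v2 while updating v1 in place with v3
 * scaled by mul. order is assumed to be a multiple of 16, matching the
 * vector version's unrolling. */
static int32_t scalarproduct_and_madd_int16_ref(int16_t *v1, const int16_t *v2,
                                                const int16_t *v3, int order,
                                                int mul)
{
    int32_t res = 0;
    int i;
    for (i = 0; i < order; i++) {
        res   += v1[i] * v2[i];   /* accumulate dot product */
        v1[i] += v3[i] * mul;     /* multiply-add back into v1 */
    }
    return res;
}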
int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    vector unsigned char *tv;
    vector unsigned char pixv;
    vector unsigned int sv;
    vector signed int sum;

    sv = (vector unsigned int) vec_splat_u32(0);

    s = 0;
    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels */
        tv = (vector unsigned char *) pix;
        pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));

        /* Square the values, and add them to our sum */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}
static int pix_sum_altivec(uint8_t *pix, int line_size)
{
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix);
    vector unsigned char t1;
    vector unsigned int sad;
    vector signed int sumdiffs;
    int i;
    int s;

    sad = (vector unsigned int) vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned 16 pixels into t1 */
        vector unsigned char pixl = vec_ld( 0, pix);
        vector unsigned char pixr = vec_ld(15, pix);
        t1 = vec_perm(pixl, pixr, perm);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t1, sad);

        pix += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
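/* The vec_ld(0, pix) / vec_ld(15, pix) pair above is the classic AltiVec
 * misaligned-load idiom: vec_ld silently truncates the address to a 16-byte
 * boundary, so the two loads fetch the aligned vectors straddling pix, and
 * vec_perm with a vec_lvsl mask shifts the 16 wanted bytes into place.
 * A standalone sketch of the idiom (illustrative helper, not from the
 * original sources): */
#include <altivec.h>
#include <stdint.h>

static inline vector unsigned char load_unaligned16(const uint8_t *src)
{
    vector unsigned char left  = vec_ld(0,  src);  /* aligned vector at or before src */
    vector unsigned char right = vec_ld(15, src);  /* aligned vector covering src[15] */
    return vec_perm(left, right, vec_lvsl(0, src));
}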
static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
                                           int order, const int shift)
{
    int i;
    LOAD_ZERO;
    register vec_s16 vec1, *pv;
    register vec_s32 res = vec_splat_s32(0), t;
    register vec_u32 shifts;
    int32_t ires;

    /* Build a vector whose four elements all equal 'shift'.
       vec_splat_u32 only takes a 5-bit immediate, so the value is
       composed one bit at a time. */
    shifts = zero_u32v;
    if (shift & 0x10)
        shifts = vec_add(shifts, vec_sl(vec_splat_u32(0x08), vec_splat_u32(0x1)));
    if (shift & 0x08)
        shifts = vec_add(shifts, vec_splat_u32(0x08));
    if (shift & 0x04)
        shifts = vec_add(shifts, vec_splat_u32(0x04));
    if (shift & 0x02)
        shifts = vec_add(shifts, vec_splat_u32(0x02));
    if (shift & 0x01)
        shifts = vec_add(shifts, vec_splat_u32(0x01));

    for (i = 0; i < order; i += 8) {
        pv = (vec_s16*)v1;
        vec1 = vec_perm(pv[0], pv[1], vec_lvsl(0, v1));
        t = vec_msum(vec1, vec_ld(0, v2), zero_s32v);
        t = vec_sr(t, shifts);
        res = vec_sums(t, res);
        v1 += 8;
        v2 += 8;
    }
    res = vec_splat(res, 3);
    vec_ste(res, 0, &ires);

    return ires;
}
uint32_t sad8_altivec_c(const uint8_t *cur, const uint8_t *ref,
                        const uint32_t stride)
{
    uint32_t result = 0;

    register vector unsigned int sad;
    register vector unsigned char c;
    register vector unsigned char r;

    /* initialize */
    sad = vec_splat_u32(0);

    /* Perform sad operations */
    SAD8();
    SAD8();
    SAD8();
    SAD8();
    SAD8();
    SAD8();
    SAD8();
    SAD8();

    /* finish addition, add the first 2 together */
    sad = vec_and(sad, (vector unsigned int) vec_pack(vec_splat_u16(-1), vec_splat_u16(0)));
    sad = (vector unsigned int) vec_sums((vector signed int) sad, vec_splat_s32(0));
    sad = vec_splat(sad, 3);
    vec_ste(sad, 0, &result);

    return result;
}
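/* SAD8() is defined elsewhere in the Xvid sources. Its job: load one row of
 * each 8-pixel block, accumulate the absolute differences into sad, and step
 * both pointers by stride. A plausible shape, offered only as a hypothetical
 * reconstruction (the vec_and masking after the loop above discards the
 * contribution of the unwanted upper half of each 16-byte load): */
#define SAD8()                                                           \
    c   = vec_perm(vec_ld(0, cur), vec_ld(16, cur), vec_lvsl(0, cur));   \
    r   = vec_perm(vec_ld(0, ref), vec_ld(16, ref), vec_lvsl(0, ref));   \
    sad = vec_sum4s(vec_sub(vec_max(c, r), vec_min(c, r)), sad);         \
    cur += stride;                                                       \
    ref += stride;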
uint32_t sse8_16bit_altivec_c(const int16_t *b1, const int16_t *b2,
                              const uint32_t stride)
{
    register vector signed short b1_vec;
    register vector signed short b2_vec;
    register vector signed short diff;
    register vector signed int sum;
    uint32_t result;

    /* initialize */
    sum = vec_splat_s32(0);

    SSE8_16BIT();
    SSE8_16BIT();
    SSE8_16BIT();
    SSE8_16BIT();
    SSE8_16BIT();
    SSE8_16BIT();
    SSE8_16BIT();
    SSE8_16BIT();

    /* sum the vector */
    sum = vec_sums(sum, vec_splat_s32(0));
    sum = vec_splat(sum, 3);

    vec_ste(sum, 0, (int*)&result);

    /* and return */
    return result;
}
static int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i, s = 0;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix);
    vector unsigned int sv = (vector unsigned int) vec_splat_u32(0);
    vector signed int sum;

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned pixels. */
        vector unsigned char pixl = vec_ld(0,  pix);
        vector unsigned char pixr = vec_ld(15, pix);
        vector unsigned char pixv = vec_perm(pixl, pixr, perm);

        /* Square the values, and add them to our sum. */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s. */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}
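/* Scalar reference for pix_norm1 (illustration only, not from the original
 * sources): the sum of squared pixel values over a 16x16 block. */
static int pix_norm1_ref(const uint8_t *pix, int line_size)
{
    int s = 0, i, j;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j++)
            s += pix[j] * pix[j];
        pix += line_size;
    }
    return s;
}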
static av_always_inline
void put_vp8_epel_h_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
                                 uint8_t *src, ptrdiff_t src_stride,
                                 int h, int mx, int w, int is6tap)
{
    LOAD_H_SUBPEL_FILTER(mx-1);
    vec_u8 align_vec0, align_vec8, permh0, permh8, filt;
    vec_u8 perm_6tap0, perm_6tap8, perml0, perml8;
    vec_u8 a, b, pixh, pixl, outer;
    vec_s16 f16h, f16l;
    vec_s32 filth, filtl;

    vec_u8 perm_inner6 = { 1,2,3,4, 2,3,4,5, 3,4,5,6, 4,5,6,7 };
    vec_u8 perm_inner4 = { 0,1,2,3, 1,2,3,4, 2,3,4,5, 3,4,5,6 };
    vec_u8 perm_inner  = is6tap ? perm_inner6 : perm_inner4;
    vec_u8 perm_outer  = { 4,9, 0,5, 5,10, 1,6, 6,11, 2,7, 7,12, 3,8 };
    vec_s32 c64 = vec_sl(vec_splat_s32(1), vec_splat_u32(6));
    vec_u16 c7  = vec_splat_u16(7);

    align_vec0 = vec_lvsl( -is6tap-1, src);
    align_vec8 = vec_lvsl(8-is6tap-1, src);

    permh0     = vec_perm(align_vec0, align_vec0, perm_inner);
    permh8     = vec_perm(align_vec8, align_vec8, perm_inner);
    perm_inner = vec_add(perm_inner, vec_splat_u8(4));
    perml0     = vec_perm(align_vec0, align_vec0, perm_inner);
    perml8     = vec_perm(align_vec8, align_vec8, perm_inner);
    perm_6tap0 = vec_perm(align_vec0, align_vec0, perm_outer);
    perm_6tap8 = vec_perm(align_vec8, align_vec8, perm_outer);

    while (h --> 0) {
        FILTER_H(f16h, 0);

        if (w == 16) {
            FILTER_H(f16l, 8);
            filt = vec_packsu(f16h, f16l);
            vec_st(filt, 0, dst);
        } else {
            filt = vec_packsu(f16h, f16h);
            vec_ste((vec_u32)filt, 0, (uint32_t*)dst);
            if (w == 8)
                vec_ste((vec_u32)filt, 4, (uint32_t*)dst);
        }
        src += src_stride;
        dst += dst_stride;
    }
}
int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned char zero =
        (const vector unsigned char) vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    sad = (vector unsigned int) vec_splat_u32(0);

    /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, each
       time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15] */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    for (i = 0; i < 16; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15] */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2v = pix3v;
        pix3 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
void b()
{
  z = vec_add (x, y);

  /* Make sure the predicates accept correct argument types.  */

  int1 = vec_all_in (f, g);
  int1 = vec_all_ge (f, g);
  int1 = vec_all_eq (c, d);
  int1 = vec_all_ne (s, t);
  int1 = vec_any_eq (i, j);
  int1 = vec_any_ge (f, g);
  int1 = vec_all_ngt (f, g);
  int1 = vec_any_ge (c, d);
  int1 = vec_any_ge (s, t);
  int1 = vec_any_ge (i, j);
  int1 = vec_any_ge (c, d);
  int1 = vec_any_ge (s, t);
  int1 = vec_any_ge (i, j);

  vec_mtvscr (i);
  vec_dssall ();
  s = (vector signed short) vec_mfvscr ();
  vec_dss (3);

  vec_dst (pi, int1 + int2, 3);
  vec_dstst (pi, int1 + int2, 3);
  vec_dststt (pi, int1 + int2, 3);
  vec_dstt (pi, int1 + int2, 3);

  uc = (vector unsigned char) vec_lvsl (int1 + 69, (signed int *) pi);
  uc = (vector unsigned char) vec_lvsr (int1 + 69, (signed int *) pi);

  c = vec_lde (int1, (signed char *) pi);
  s = vec_lde (int1, (signed short *) pi);
  i = vec_lde (int1, (signed int *) pi);
  i = vec_ldl (int1, pi);
  i = vec_ld (int1, pi);

  vec_st (i, int2, pi);
  vec_ste (c, int2, (signed char *) pi);
  vec_ste (s, int2, (signed short *) pi);
  vec_ste (i, int2, (signed int *) pi);
  vec_stl (i, int2, pi);
}
static void test ()
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  vector unsigned char vuc = {15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0};
  vector signed char vsc = {7,6,5,4,3,2,1,0,-1,-2,-3,-4,-5,-6,-7,-8};
  vector unsigned short vus = {7,6,5,4,3,2,1,0};
  vector signed short vss = {3,2,1,0,-1,-2,-3,-4};
  vector unsigned int vui = {3,2,1,0};
  vector signed int vsi = {1,0,-1,-2};
  vector float vf = {3.0,2.0,1.0,0.0};
#else
  vector unsigned char vuc = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
  vector signed char vsc = {-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7};
  vector unsigned short vus = {0,1,2,3,4,5,6,7};
  vector signed short vss = {-4,-3,-2,-1,0,1,2,3};
  vector unsigned int vui = {0,1,2,3};
  vector signed int vsi = {-2,-1,0,1};
  vector float vf = {0.0,1.0,2.0,3.0};
#endif

  vec_ste (vuc, 9*1, (unsigned char *)svuc);
  vec_ste (vsc, 14*1, (signed char *)svsc);
  vec_ste (vus, 7*2, (unsigned short *)svus);
  vec_ste (vss, 1*2, (signed short *)svss);
  vec_ste (vui, 3*4, (unsigned int *)svui);
  vec_ste (vsi, 2*4, (signed int *)svsi);
  vec_ste (vf, 0*4, (float *)svf);

  check_arrays ();
}
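/* vec_ste derives both the store address and the source lane from the
 * effective address: the address is truncated to element alignment, and the
 * lane index is (address mod 16) / sizeof(element). That is why the offsets
 * above (9*1, 7*2, 3*4, ...) each deposit exactly one chosen lane into the
 * destination array. A minimal sketch (hypothetical function; assumes dst is
 * 16-byte aligned): */
static void store_third_lane (signed int *dst)
{
  vector signed int v = {10, 20, 30, 40};
  vec_ste (v, 2*4, dst);   /* effective address dst+8 selects lane 2 */
}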
void r_dimpatchD_ALTIVEC(const DCanvas *const cvs, argb_t color, int alpha,
                         int x1, int y1, int w, int h)
{
    int x, y, i;
    argb_t *line;
    int invAlpha = 256 - alpha;

    int dpitch = cvs->pitch / sizeof(argb_t);
    line = (argb_t *)cvs->buffer + y1 * dpitch;

    int batches = w / 4;
    int remainder = w & 3;

    // AltiVec temporaries:
    const vu16 zero = {0, 0, 0, 0, 0, 0, 0, 0};
    const vu16 upper8mask = {0, 0xff, 0xff, 0xff, 0, 0xff, 0xff, 0xff};
    const vu16 blendAlpha = {0, alpha, alpha, alpha, 0, alpha, alpha, alpha};
    const vu16 blendInvAlpha = {0, invAlpha, invAlpha, invAlpha,
                                0, invAlpha, invAlpha, invAlpha};
    const vu16 blendColor = {0, RPART(color), GPART(color), BPART(color),
                             0, RPART(color), GPART(color), BPART(color)};
    const vu16 blendMult = vec_mladd(blendColor, blendAlpha, zero);

    for (y = y1; y < y1 + h; y++)
    {
        // AltiVec optimize the bulk in batches of 4 colors:
        for (i = 0, x = x1; i < batches; ++i, x += 4)
        {
            const vu32 input = {line[x + 0], line[x + 1], line[x + 2], line[x + 3]};
            const vu32 output = (vu32)blend4vs1_altivec(input, blendMult,
                                                        blendInvAlpha, upper8mask);
            vec_ste(output, 0, &line[x]);
            vec_ste(output, 4, &line[x]);
            vec_ste(output, 8, &line[x]);
            vec_ste(output, 12, &line[x]);
        }

        if (remainder)
        {
            // Pick up the remainder:
            for (; x < x1 + w; x++)
            {
                line[x] = alphablend1a(line[x], color, alpha);
            }
        }

        line += dpitch;
    }
}
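/* blend4vs1_altivec is defined elsewhere in Odamex. Judging by the scalar
 * fallback alphablend1a, each channel should become
 * (channel*invAlpha + color*alpha) >> 8, and blendMult already carries
 * color*alpha. A hypothetical sketch of such a helper, assuming big-endian
 * ARGB lane order (not the original code): */
static inline vu32 blend4vs1_altivec(vu32 input, vu16 blendMult,
                                     vu16 blendInvAlpha, vu16 upper8mask)
{
    const vector unsigned char zero = vec_splat_u8(0);
    // widen the four packed pixels to 16-bit channels, two pixels per vector
    vu16 hi = (vu16)vec_mergeh(zero, (vector unsigned char)input);
    vu16 lo = (vu16)vec_mergel(zero, (vector unsigned char)input);
    hi = vec_and(hi, upper8mask);   // drop the alpha lanes
    lo = vec_and(lo, upper8mask);
    // channel = (channel*invAlpha + color*alpha) >> 8
    hi = vec_sr(vec_mladd(hi, blendInvAlpha, blendMult), vec_splat_u16(8));
    lo = vec_sr(vec_mladd(lo, blendInvAlpha, blendMult), vec_splat_u16(8));
    // narrow back to packed bytes
    return (vu32)vec_pack(hi, lo);
}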
uint32_t sad16_altivec_c(vector unsigned char *cur,
                         vector unsigned char *ref,
                         uint32_t stride,
                         const uint32_t best_sad)
{
    vector unsigned char perm;
    vector unsigned char t1, t2;
    vector unsigned int sad;
    vector unsigned int sumdiffs;
    vector unsigned int best_vec;
    uint32_t result;

#ifdef DEBUG
    /* print alignment errors if DEBUG is on */
    if (((unsigned long) cur) & 0xf)
        fprintf(stderr, "sad16_altivec:incorrect align, cur: %lx\n", (long) cur);
    if (stride & 0xf)
        fprintf(stderr, "sad16_altivec:incorrect align, stride: %lu\n", stride);
#endif

    /* initialization */
    sad = vec_splat_u32(0);
    sumdiffs = sad;
    stride >>= 4;
    perm = vec_lvsl(0, (unsigned char *) ref);
    *((uint32_t*)&best_vec) = best_sad;
    best_vec = vec_splat(best_vec, 0);

    /* perform sum of differences between current and previous */
    SAD16();
    SAD16();
    SAD16();
    SAD16();
    SAD16();
    SAD16();
    SAD16();
    SAD16();
    SAD16();
    SAD16();
    SAD16();
    SAD16();
    SAD16();
    SAD16();
    SAD16();
    SAD16();

bail:
    /* the SAD16() macro presumably branches here for early termination
       once the running sum exceeds best_sad */
    /* copy vector sum into unaligned result */
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, (uint32_t*) &result);

    return result;
}
uint32_t quant_h263_inter_altivec_c(int16_t *coeff, int16_t *data,
                                    const uint32_t quant,
                                    const uint16_t *mpeg_quant_matrices)
{
    vector unsigned char zerovec;
    vector unsigned short mult;
    vector unsigned short quant_m_2;
    vector unsigned short quant_d_2;
    vector unsigned short sum_short;
    vector signed short acLevel;

    vector unsigned int even;
    vector unsigned int odd;

    vector bool short m2_mask;
    vector bool short zero_mask;

    uint32_t result;

#ifdef DEBUG
    if (((unsigned) coeff) & 0xf)
        fprintf(stderr, "quant_h263_inter_altivec_c:incorrect align, coeff: %lx\n", (long) coeff);
#endif

    /* initialisation stuff */
    zerovec = vec_splat_u8(0);
    *((unsigned short*)&mult) = (unsigned short) multipliers[quant];
    mult = vec_splat(mult, 0);
    *((unsigned short*)&quant_m_2) = (unsigned short) quant;
    quant_m_2 = vec_splat(quant_m_2, 0);
    quant_m_2 = vec_sl(quant_m_2, vec_splat_u16(1));
    *((unsigned short*)&quant_d_2) = (unsigned short) quant;
    quant_d_2 = vec_splat(quant_d_2, 0);
    quant_d_2 = vec_sr(quant_d_2, vec_splat_u16(1));
    sum_short = (vector unsigned short) zerovec;

    /* Quantize */
    QUANT_H263_INTER_ALTIVEC();
    QUANT_H263_INTER_ALTIVEC();
    QUANT_H263_INTER_ALTIVEC();
    QUANT_H263_INTER_ALTIVEC();
    QUANT_H263_INTER_ALTIVEC();
    QUANT_H263_INTER_ALTIVEC();
    QUANT_H263_INTER_ALTIVEC();
    QUANT_H263_INTER_ALTIVEC();

    /* Calculate the return value */
    even = (vector unsigned int) vec_sum4s((vector signed short) sum_short, (vector signed int) zerovec);
    even = (vector unsigned int) vec_sums((vector signed int) even, (vector signed int) zerovec);
    even = vec_splat(even, 3);
    vec_ste(even, 0, &result);

    return result;
}
/* Store less than a vector to an unaligned location in memory */
static inline void
StoreUnalignedLess (vector unsigned char v, const guchar *where, int n)
{
    int i;
    vector unsigned char permuteVector = vec_lvsr(0, where);
    v = vec_perm(v, v, permuteVector);
    for (i = 0; i < n; i++)
        vec_ste(v, i, CONST_BUFFER(where));
}
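/* Because vec_ste selects its source lane from the effective address, the
 * vec_lvsr rotation above pre-positions byte i of v so the per-byte stores
 * land in order at where..where+n-1 regardless of alignment. A usage sketch
 * (illustrative buffer and function name, not from the original source): */
static void demo_store_less (guchar *dest)
{
    vector unsigned char v =
        {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
    StoreUnalignedLess (v, dest + 3, 5);   /* writes bytes 0..4 at dest+3 */
}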
uint32_t sad16bi_altivec_c(vector unsigned char *cur,
                           vector unsigned char *ref1,
                           vector unsigned char *ref2,
                           uint32_t stride)
{
    vector unsigned char t1, t2;
    vector unsigned char mask1, mask2;
    vector unsigned char sad;
    vector unsigned int sum;
    uint32_t result;

#ifdef DEBUG
    /* print alignment errors if this is on */
    if ((long) cur & 0xf)
        fprintf(stderr, "sad16bi_altivec:incorrect align, cur: %lx\n", (long) cur);
    if (stride & 0xf)
        fprintf(stderr, "sad16bi_altivec:incorrect align, stride: %lu\n", stride);
#endif

    /* Initialisation stuff */
    stride >>= 4;
    mask1 = vec_lvsl(0, (unsigned char*) ref1);
    mask2 = vec_lvsl(0, (unsigned char*) ref2);
    sad = vec_splat_u8(0);
    sum = (vector unsigned int) sad;

    SAD16BI();
    SAD16BI();
    SAD16BI();
    SAD16BI();
    SAD16BI();
    SAD16BI();
    SAD16BI();
    SAD16BI();
    SAD16BI();
    SAD16BI();
    SAD16BI();
    SAD16BI();
    SAD16BI();
    SAD16BI();
    SAD16BI();
    SAD16BI();

    sum = (vector unsigned int) vec_sums((vector signed int) sum, vec_splat_s32(0));
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, (uint32_t*) &result);

    return result;
}
/**
 * Sum of Squared Errors for an 8x8 block.
 * AltiVec-enhanced.
 * It's the pix_abs8x8_altivec code above w/ squaring added.
 */
int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int) vec_splat_u32(0);

    permclear = (vector unsigned char)
        AVV(255,255,255,255,255,255,255,255, 0, 0, 0, 0, 0, 0, 0, 0);

    for (i = 0; i < 8; i++) {
        /* Read potentially unaligned pixels into t1 and t2.
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /* Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a-b)^2 = (a-b)^2. */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}
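/* Scalar reference for sse8 (illustration only, not from the original
 * sources): the sum of squared differences over an 8x8 block. */
static int sse8_ref(const uint8_t *pix1, const uint8_t *pix2, int line_size)
{
    int s = 0, i, j;
    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            int d = pix1[j] - pix2[j];
            s += d * d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}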
int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned char zero =
        (const vector unsigned char) vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    s = 0;
    sad = (vector unsigned int) vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix2v: pix2[0]-pix2[15]  pix2iv: pix2[1]-pix2[16] */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix2[0];
        pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

        tv = (vector unsigned char *) &pix2[1];
        pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
/*
 * This function assumes cur is 8 bytes aligned, stride is 16 bytes
 * aligned and ref is unaligned
 */
unsigned long
sad8_altivec(const vector unsigned char *cur,
             const vector unsigned char *ref,
             unsigned long stride)
{
    vector unsigned char t1, t2, t3, t4, t5, tp;
    vector unsigned int sad;
    vector signed int sumdiffs;
    vector unsigned char perm_cur;
    vector unsigned char perm_ref1, perm_ref2;
    unsigned long result;
    ZERODEF;

#ifdef DEBUG
    if (((unsigned long) cur) & 0x7)
        fprintf(stderr, "sad8_altivec:incorrect align, cur: %lx\n", (unsigned long) cur);
//  if (((unsigned long) ref) & 0x7)
//      fprintf(stderr, "sad8_altivec:incorrect align, ref: %lx\n", (unsigned long) ref);
    if (stride & 0xf)
        fprintf(stderr, "sad8_altivec:incorrect align, stride: %lx\n", stride);
#endif

    perm_cur = get_perm((((unsigned long) cur) >> 3) & 0x01);
    perm_ref1 = vec_lvsl(0, (unsigned char *) ref);
    perm_ref2 = get_perm(0);

    /* initialization */
    sad = (vector unsigned int) (ZEROVEC);
    stride >>= 4;

    /* perform sum of differences between current and previous */
    SAD8();
    SAD8();
    SAD8();
    SAD8();

    /* sum all parts of difference into one 32 bit quantity */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) ZEROVEC);

    /* copy vector sum into unaligned result */
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, (int *) &result);

    return (result);
}
int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int) vec_splat_u32(0);

    permclear = (vector unsigned char)
        AVV(255,255,255,255,255,255,255,255, 0, 0, 0, 0, 0, 0, 0, 0);

    for (i = 0; i < 8; i++) {
        /* Read potentially unaligned pixels into t1 and t2.
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
static int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2,
                            int line_size, int h)
{
    int i;
    int s;
    const vector unsigned char zero =
        (const vector unsigned char) vec_splat_u8(0);
    vector unsigned char perm1 = vec_lvsl(0, pix2);
    vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
    vector unsigned char pix2l, pix2r;
    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    s = 0;
    sad = (vector unsigned int) vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix2v: pix2[0]-pix2[15]  pix2iv: pix2[1]-pix2[16] */
        pix1v = vec_ld( 0, pix1);
        pix2l = vec_ld( 0, pix2);
        pix2r = vec_ld(16, pix2);
        pix2v  = vec_perm(pix2l, pix2r, perm1);
        pix2iv = vec_perm(pix2l, pix2r, perm2);

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
static unsigned reg_sad_altivec(const kvz_pixel *const data1,
                                const kvz_pixel *const data2,
                                const int width, const int height,
                                const unsigned stride1, const unsigned stride2)
{
    vector unsigned int vsad = {0, 0, 0, 0}, vzero = {0, 0, 0, 0};
    vector signed int sumdiffs;
    int tmpsad, sad = 0;

    int y, x;

    for (y = 0; y < height; ++y) {
        vector unsigned char perm1, perm2;

        perm1 = vec_lvsl(0, &data1[y * stride1]);
        perm2 = vec_lvsl(0, &data2[y * stride2]);

        for (x = 0; x <= width - 16; x += 16) {
            vector unsigned char t1, t2, t3, t4, t5;
            vector unsigned char *current, *previous;

            current  = (vector unsigned char *) &data1[y * stride1 + x];
            previous = (vector unsigned char *) &data2[y * stride2 + x];

            t1 = vec_perm(current[0], current[1], perm1);   /* align current vector */
            t2 = vec_perm(previous[0], previous[1], perm2); /* align previous vector */
            t3 = vec_max(t1, t2);                           /* find largest of two */
            t4 = vec_min(t1, t2);                           /* find smaller of two */
            t5 = vec_sub(t3, t4);                           /* find absolute difference */
            vsad = vec_sum4s(t5, vsad);                     /* accumulate sum of differences */
        }

        /* scalar tail for widths that are not a multiple of 16 */
        for (; x < width; ++x) {
            sad += abs(data1[y * stride1 + x] - data2[y * stride2 + x]);
        }
    }

    sumdiffs = vec_sums((vector signed int) vsad, (vector signed int) vzero);
    /* copy vector sum into unaligned result */
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &tmpsad);
    sad += tmpsad;

    return sad;
}
static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
                                           int order)
{
    int i;
    LOAD_ZERO;
    register vec_s16 vec1;
    register vec_s32 res = vec_splat_s32(0), t;
    int32_t ires;

    for (i = 0; i < order; i += 8) {
        vec1 = vec_unaligned_load(v1);
        t = vec_msum(vec1, vec_ld(0, v2), zero_s32v);
        res = vec_sums(t, res);
        v1 += 8;
        v2 += 8;
    }
    res = vec_splat(res, 3);
    vec_ste(res, 0, &ires);

    return ires;
}
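/* Scalar reference (illustration only, not from the original sources): the
 * plain dot product that the scalarproduct_int16 variants in this section
 * vectorize; order is assumed to be a multiple of 8. */
static int32_t scalarproduct_int16_ref(const int16_t *v1, const int16_t *v2,
                                       int order)
{
    int32_t res = 0;
    int i;
    for (i = 0; i < order; i++)
        res += v1[i] * v2[i];
    return res;
}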
int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int) vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
signed int
scalarproduct_int16_vsx (const signed short *v1, const signed short *v2,
                         int order)
{
  int i;
  LOAD_ZERO;
  register vec_s16 vec1;
  register vec_s32 res = vec_splat_s32 (0), t;
  signed int ires;

  for (i = 0; i < order; i += 8)
    {
      vec1 = vec_vsx_ld (0, v1);
      t = vec_msum (vec1, vec_vsx_ld (0, v2), zero_s32v);
      res = vec_sums (t, res);
      v1 += 8;
      v2 += 8;
    }
  res = vec_splat (res, 3);
  vec_ste (res, 0, &ires);

  return ires;
}
static int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i;
    int s;
    __vector zero = __vzero();
    /* vector unsigned char *tv;
       vector unsigned char pixv;
       vector unsigned int sv;
       vector signed int sum; */
    __vector *tv;
    __vector pixv;
    __vector sv;
    __vector sum;

    sv = __vzero();

    s = 0;
    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels */
        //tv = (vector unsigned char *) pix;
        tv = (__vector*) pix;
        //pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));
        pixv = __vperm(tv[0], tv[1], __lvsl(pix, 0));

        /* Square the values, and add them to our sum */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}
static int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2,
                         int line_size, int h)
{
    int i;
    int s;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix2);
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int) vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        vector unsigned char pix2l = vec_ld( 0, pix2);
        vector unsigned char pix2r = vec_ld(15, pix2);
        t1 = vec_ld(0, pix1);
        t2 = vec_perm(pix2l, pix2r, perm);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
                                           int order)
{
    int i;
    LOAD_ZERO;
    const vec_s16 *pv;
    register vec_s16 vec1;
    register vec_s32 res = vec_splat_s32(0), t;
    int32_t ires;

    for (i = 0; i < order; i += 8) {
        pv = (const vec_s16*)v1;
        vec1 = vec_perm(pv[0], pv[1], vec_lvsl(0, v1));
        t = vec_msum(vec1, vec_ld(0, v2), zero_s32v);
        res = vec_sums(t, res);
        v1 += 8;
        v2 += 8;
    }
    res = vec_splat(res, 3);
    vec_ste(res, 0, &ires);

    return ires;
}
static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
                                                    const int16_t *v2,
                                                    const int16_t *v3,
                                                    int order, int mul)
{
    LOAD_ZERO;
    vec_s16 *pv1 = (vec_s16 *) v1;
    register vec_s16 muls = { mul, mul, mul, mul, mul, mul, mul, mul };
    register vec_s16 t0, t1, i0, i1, i4, i2, i3;
    register vec_s32 res = zero_s32v;
#if HAVE_BIGENDIAN
    register vec_u8 align = vec_lvsl(0, v2);
    i2 = vec_ld(0, v2);
    i3 = vec_ld(0, v3);
#endif
    int32_t ires;

    order >>= 4;
    do {
        GET_T(t0, t1, v2, i1, i2);
        i0 = pv1[0];
        i1 = pv1[1];
        res = vec_msum(t0, i0, res);
        res = vec_msum(t1, i1, res);
        GET_T(t0, t1, v3, i4, i3);
        pv1[0] = vec_mladd(t0, muls, i0);
        pv1[1] = vec_mladd(t1, muls, i1);
        pv1 += 2;
        v2  += 16;
        v3  += 16;
    } while (--order);
    res = vec_splat(vec_sums(res, zero_s32v), 3);
    vec_ste(res, 0, &ires);

    return ires;
}
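/* GET_T() is a macro defined earlier in the same FFmpeg file. On big-endian
 * builds it must realign the misaligned stream with vec_perm and the carried
 * vector; on little-endian builds VSX unaligned loads do the job directly.
 * A plausible shape, offered as a hedged reconstruction rather than the
 * verbatim original: */
#if HAVE_BIGENDIAN
#define GET_T(t0, t1, src, a, b) do {      \
    a  = vec_ld(16, src);                  \
    t0 = vec_perm(b, a, align);            \
    b  = vec_ld(32, src);                  \
    t1 = vec_perm(a, b, align);            \
} while (0)
#else
#define GET_T(t0, t1, src, a, b) do {      \
    t0 = vec_vsx_ld(0, src);               \
    t1 = vec_vsx_ld(16, src);              \
} while (0)
#endif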
static void ProjectDlightTexture_altivec( void ) {
    int     i, l;
    vec_t   origin0, origin1, origin2;
    float   texCoords0, texCoords1;
    vector float floatColorVec0, floatColorVec1;
    vector float modulateVec, colorVec, zero;
    vector short colorShort;
    vector signed int colorInt;
    vector unsigned char floatColorVecPerm, modulatePerm, colorChar;
    vector unsigned char vSel = VECCONST_UINT8(0x00, 0x00, 0x00, 0xff,
                                               0x00, 0x00, 0x00, 0xff,
                                               0x00, 0x00, 0x00, 0xff,
                                               0x00, 0x00, 0x00, 0xff);
    float   *texCoords;
    byte    *colors;
    byte    clipBits[SHADER_MAX_VERTEXES];
    float   texCoordsArray[SHADER_MAX_VERTEXES][2];
    byte    colorArray[SHADER_MAX_VERTEXES][4];
    unsigned hitIndexes[SHADER_MAX_INDEXES];
    int     numIndexes;
    float   scale;
    float   radius;
    vec3_t  floatColor;
    float   modulate = 0.0f;

    if ( !backEnd.refdef.num_dlights ) {
        return;
    }

    // There has to be a better way to do this so that floatColor
    // and/or modulate are already 16-byte aligned.
    floatColorVecPerm = vec_lvsl(0,(float *)floatColor);
    modulatePerm = vec_lvsl(0,(float *)&modulate);
    modulatePerm = (vector unsigned char)vec_splat((vector unsigned int)modulatePerm,0);
    zero = (vector float)vec_splat_s8(0);

    for ( l = 0 ; l < backEnd.refdef.num_dlights ; l++ ) {
        dlight_t    *dl;

        if ( !( tess.dlightBits & ( 1 << l ) ) ) {
            continue;   // this surface definitely doesn't have any of this light
        }
        texCoords = texCoordsArray[0];
        colors = colorArray[0];

        dl = &backEnd.refdef.dlights[l];
        origin0 = dl->transformed[0];
        origin1 = dl->transformed[1];
        origin2 = dl->transformed[2];
        radius = dl->radius;
        scale = 1.0f / radius;

        if(r_greyscale->integer)
        {
            float luminance;

            luminance = LUMA(dl->color[0], dl->color[1], dl->color[2]) * 255.0f;
            floatColor[0] = floatColor[1] = floatColor[2] = luminance;
        }
        else if(r_greyscale->value)
        {
            float luminance;

            luminance = LUMA(dl->color[0], dl->color[1], dl->color[2]) * 255.0f;
            floatColor[0] = LERP(dl->color[0] * 255.0f, luminance, r_greyscale->value);
            floatColor[1] = LERP(dl->color[1] * 255.0f, luminance, r_greyscale->value);
            floatColor[2] = LERP(dl->color[2] * 255.0f, luminance, r_greyscale->value);
        }
        else
        {
            floatColor[0] = dl->color[0] * 255.0f;
            floatColor[1] = dl->color[1] * 255.0f;
            floatColor[2] = dl->color[2] * 255.0f;
        }
        floatColorVec0 = vec_ld(0, floatColor);
        floatColorVec1 = vec_ld(11, floatColor);
        floatColorVec0 = vec_perm(floatColorVec0,floatColorVec0,floatColorVecPerm);

        for ( i = 0 ; i < tess.numVertexes ; i++, texCoords += 2, colors += 4 ) {
            int clip = 0;
            vec_t dist0, dist1, dist2;

            dist0 = origin0 - tess.xyz[i][0];
            dist1 = origin1 - tess.xyz[i][1];
            dist2 = origin2 - tess.xyz[i][2];

            backEnd.pc.c_dlightVertexes++;

            texCoords0 = 0.5f + dist0 * scale;
            texCoords1 = 0.5f + dist1 * scale;

            if( !r_dlightBacks->integer && // dist . tess.normal[i]
                ( dist0 * tess.normal[i][0] +
                dist1 * tess.normal[i][1] +
                dist2 * tess.normal[i][2] ) < 0.0f ) {
                clip = 63;
            } else {
                if ( texCoords0 < 0.0f ) {
                    clip |= 1;
                } else if ( texCoords0 > 1.0f ) {
                    clip |= 2;
                }
                if ( texCoords1 < 0.0f ) {
                    clip |= 4;
                } else if ( texCoords1 > 1.0f ) {
                    clip |= 8;
                }
                texCoords[0] = texCoords0;
                texCoords[1] = texCoords1;

                // modulate the strength based on the height and color
                if ( dist2 > radius ) {
                    clip |= 16;
                    modulate = 0.0f;
                } else if ( dist2 < -radius ) {
                    clip |= 32;
                    modulate = 0.0f;
                } else {
                    dist2 = Q_fabs(dist2);
                    if ( dist2 < radius * 0.5f ) {
                        modulate = 1.0f;
                    } else {
                        modulate = 2.0f * (radius - dist2) * scale;
                    }
                }
            }
            clipBits[i] = clip;

            modulateVec = vec_ld(0,(float *)&modulate);
            modulateVec = vec_perm(modulateVec,modulateVec,modulatePerm);
            colorVec = vec_madd(floatColorVec0,modulateVec,zero);
            colorInt = vec_cts(colorVec,0);                  // RGBx
            colorShort = vec_pack(colorInt,colorInt);        // RGBxRGBx
            colorChar = vec_packsu(colorShort,colorShort);   // RGBxRGBxRGBxRGBx
            colorChar = vec_sel(colorChar,vSel,vSel);        // RGBARGBARGBARGBA replace alpha with 255
            vec_ste((vector unsigned int)colorChar,0,(unsigned int *)colors);   // store color
        }

        // build a list of triangles that need light
        numIndexes = 0;
        for ( i = 0 ; i < tess.numIndexes ; i += 3 ) {
            int a, b, c;

            a = tess.indexes[i];
            b = tess.indexes[i+1];
            c = tess.indexes[i+2];
            if ( clipBits[a] & clipBits[b] & clipBits[c] ) {
                continue;   // not lighted
            }
            hitIndexes[numIndexes] = a;
            hitIndexes[numIndexes+1] = b;
            hitIndexes[numIndexes+2] = c;
            numIndexes += 3;
        }

        if ( !numIndexes ) {
            continue;
        }

        qglEnableClientState( GL_TEXTURE_COORD_ARRAY );
        qglTexCoordPointer( 2, GL_FLOAT, 0, texCoordsArray[0] );

        qglEnableClientState( GL_COLOR_ARRAY );
        qglColorPointer( 4, GL_UNSIGNED_BYTE, 0, colorArray );

        GL_Bind( tr.dlightImage );
        // include GLS_DEPTHFUNC_EQUAL so alpha tested surfaces don't add light
        // where they aren't rendered
        if ( dl->additive ) {
            GL_State( GLS_SRCBLEND_ONE | GLS_DSTBLEND_ONE | GLS_DEPTHFUNC_EQUAL );
        } else {
            GL_State( GLS_SRCBLEND_DST_COLOR | GLS_DSTBLEND_ONE | GLS_DEPTHFUNC_EQUAL );
        }
        R_DrawElements( numIndexes, hitIndexes );
        backEnd.pc.c_totalIndexes += numIndexes;
        backEnd.pc.c_dlightIndexes += numIndexes;
    }
}