static inline vec_uint4 GENBX(vec_uint4 a, vec_uint4 b, vec_uint4 c) { return vec_and(vec_or(vec_cmpgt(a, b), vec_and(vec_cmpeq(a, b), c)), vec_splat_u32(1)); }
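A hedged scalar sketch (not part of the original code) of what GENBX computes per 32-bit lane: a 1 when a > b, or when a == b and bit 0 of the carry-in c is set, i.e. a "generate borrow extended" style bit for multi-word comparisons. The helper name below is illustrative.

#include <stdint.h>

/* Illustrative scalar equivalent of one lane of GENBX above. */
static inline uint32_t genbx_scalar(uint32_t a, uint32_t b, uint32_t c)
{
    return (a > b || (a == b && (c & 1u))) ? 1u : 0u;
}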
template <bool align, bool first, bool increment> SIMD_INLINE void InterferenceChangeMasked(const Loader<align> & statisticSrc, v128_s16 value, v128_s16 saturation, const Loader<align> & maskSrc, v128_u8 index, v128_u8 tailMask, Storer<align> & statisticDst) { v128_u8 mask = vec_and(vec_cmpeq(Load<align, first>(maskSrc), index), tailMask); InterferenceChange<align, first, increment>(statisticSrc, vec_and(value, (v128_s16)UnpackLoU8(mask, mask)), saturation, statisticDst); InterferenceChange<align, false, increment>(statisticSrc, vec_and(value, (v128_s16)UnpackHiU8(mask, mask)), saturation, statisticDst); }
static inline void do_recursion(w128_t *r, w128_t *a, w128_t * b, w128_t * lung) { const vector unsigned char sl1 = ALTI_SL1; const vector unsigned char sl1_perm = ALTI_SL1_PERM; const vector unsigned int sl1_msk = ALTI_SL1_MSK; const vector unsigned char sr1 = ALTI_SR; const vector unsigned char sr1_perm = ALTI_SR_PERM; const vector unsigned int sr1_msk = ALTI_SR_MSK; const vector unsigned char perm = ALTI_PERM; const vector unsigned int msk1 = ALTI_MSK; vector unsigned int z = a->s; vector unsigned int w = lung->s; vector unsigned int x = vec_perm(w, (vector unsigned int)perm, perm); vector unsigned int y = vec_perm(z, (vector unsigned int)sl1_perm, sl1_perm); y = vec_sll(y, sl1); y = vec_and(y, sl1_msk); w = vec_xor(x, b->s); w = vec_xor(w, y); x = vec_perm(w, (vector unsigned int)sr1_perm, sr1_perm); x = vec_srl(x, sr1); x = vec_and(x, sr1_msk); y = vec_and(w, msk1); z = vec_xor(z, y); r->s = vec_xor(z, x); lung->s = w; }
void gimp_composite_multiply_rgba8_rgba8_rgba8_altivec (GimpCompositeContext *ctx) { const guchar *A = ctx->A; const guchar *B = ctx->B; guchar *D = ctx->D; guint length = ctx->n_pixels; vector unsigned char a,b,d,alpha_a,alpha_b,alpha; vector unsigned short al,ah; while (length >= 4) { a=LoadUnaligned(A); b=LoadUnaligned(B); al=vec_mule(a,b); al=vec_add(al,ox0080); ah=vec_mulo(a,b); ah=vec_add(ah,ox0080); al=vec_add(al,vec_sr(al,ox0008)); ah=vec_add(ah,vec_sr(ah,ox0008)); d=vec_perm((vector unsigned char)al,(vector unsigned char)ah,combine_high_bytes); alpha_a=vec_and(a, alphamask); alpha_b=vec_and(b, alphamask); alpha=vec_min(alpha_a, alpha_b); d=vec_andc(d, alphamask); d=vec_or(d, alpha); StoreUnaligned(d, D); A+=16; B+=16; D+=16; length-=4; } /* process last pixels */ length = length*4; a=LoadUnalignedLess(A, length); b=LoadUnalignedLess(B, length); al=vec_mule(a,b); al=vec_add(al,ox0080); ah=vec_mulo(a,b); ah=vec_add(ah,ox0080); al=vec_add(al,vec_sr(al,ox0008)); ah=vec_add(ah,vec_sr(ah,ox0008)); d=vec_perm((vector unsigned char)al,(vector unsigned char)ah,combine_high_bytes); alpha_a=vec_and(a, alphamask); alpha_b=vec_and(b, alphamask); alpha=vec_min(alpha_a, alpha_b); d=vec_andc(d, alphamask); d=vec_or(d, alpha); StoreUnalignedLess(d, D, length); }
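A hedged scalar sketch (not from the GIMP source) of the per-channel arithmetic used above: t = a*b + 128 followed by (t + (t >> 8)) >> 8 is the usual exact substitute for rounding a*b/255 on 8-bit data, and the alpha channel is taken as min(alpha_a, alpha_b). The helper name is illustrative.

#include <stdint.h>

/* Illustrative scalar multiply of two 8-bit channel values with /255 rounding. */
static inline uint8_t multiply_channel(uint8_t a, uint8_t b)
{
    unsigned t = (unsigned)a * b + 128;
    return (uint8_t)((t + (t >> 8)) >> 8);
}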
static inline vec_u8_t h264_deblock_mask( register vec_u8_t p0, register vec_u8_t p1, register vec_u8_t q0, register vec_u8_t q1, register vec_u8_t alpha, register vec_u8_t beta ) { register vec_u8_t mask; register vec_u8_t tempmask; mask = diff_lt_altivec(p0, q0, alpha); tempmask = diff_lt_altivec(p1, p0, beta); mask = vec_and(mask, tempmask); tempmask = diff_lt_altivec(q1, q0, beta); mask = vec_and(mask, tempmask); return mask; }
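Assuming diff_lt_altivec(x, y, t) yields an all-ones byte where |x - y| < t, a hedged scalar sketch of the mask computed above is the standard H.264 per-edge filter condition; the helper name below is illustrative.

#include <stdlib.h>

/* Illustrative per-pixel equivalent: 0xFF mirrors an all-ones lane of the vector mask. */
static inline unsigned char deblock_mask_scalar(int p0, int p1, int q0, int q1,
                                                int alpha, int beta)
{
    int filter = abs(p0 - q0) < alpha && abs(p1 - p0) < beta && abs(q1 - q0) < beta;
    return filter ? 0xFF : 0x00;
}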
/** * Sum of Squared Errors for a 8x8 block. * AltiVec-enhanced. * It's the pix_abs8x8_altivec code above w/ squaring added. */ int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size) { int i; int s __attribute__((aligned(16))); const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; vector unsigned char t1, t2, t3,t4, t5; vector unsigned int sum; vector signed int sumsqr; sum = (vector unsigned int)vec_splat_u32(0); permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0); for(i=0;i<8;i++) { /* Read potentially unaligned pixels into t1 and t2 Since we're reading 16 pixels, and actually only want 8, mask out the last 8 pixels. The 0s don't change the sum. */ perm1 = vec_lvsl(0, pix1); pix1v = (vector unsigned char *) pix1; perm2 = vec_lvsl(0, pix2); pix2v = (vector unsigned char *) pix2; t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear); t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear); /* Since we want to use unsigned chars, we can take advantage of the fact that abs(a-b)^2 = (a-b)^2. */ /* Calculate abs differences vector */ t3 = vec_max(t1, t2); t4 = vec_min(t1, t2); t5 = vec_sub(t3, t4); /* Square the values and add them to our sum */ sum = vec_msum(t5, t5, sum); pix1 += line_size; pix2 += line_size; } /* Sum up the four partial sums, and put the result into s */ sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); sumsqr = vec_splat(sumsqr, 3); vec_ste(sumsqr, 0, &s); return s; }
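For reference, a hedged scalar version (not part of the source) of the same 8x8 sum of squared errors, handy for checking sse8_altivec:

#include <stdint.h>

static int sse8_scalar(const uint8_t *pix1, const uint8_t *pix2, int line_size)
{
    int sum = 0;
    for (int y = 0; y < 8; y++) {
        for (int x = 0; x < 8; x++) {
            int d = pix1[x] - pix2[x];
            sum += d * d; /* abs(a-b)^2 == (a-b)^2, as noted in the vector code */
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}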
void gimp_composite_difference_rgba8_rgba8_rgba8_altivec (GimpCompositeContext *ctx) { const guchar *A = ctx->A; const guchar *B = ctx->B; guchar *D = ctx->D; guint length = ctx->n_pixels; vector unsigned char a,b,d,e,alpha_a,alpha_b; while (length >= 4) { a=LoadUnaligned(A); b=LoadUnaligned(B); alpha_a=vec_and(a, alphamask); alpha_b=vec_and(b, alphamask); d=vec_min(alpha_a, alpha_b); a=vec_andc(a, alphamask); a=vec_adds(a, d); b=vec_andc(b, alphamask); d=vec_subs(a, b); e=vec_subs(b, a); d=vec_add(d,e); StoreUnaligned(d, D); A+=16; B+=16; D+=16; length-=4; } /* process last pixels */ length = length*4; a=LoadUnalignedLess(A, length); b=LoadUnalignedLess(B, length); alpha_a=vec_and(a,alphamask); alpha_b=vec_and(b,alphamask); d=vec_min(alpha_a,alpha_b); a=vec_andc(a,alphamask); a=vec_adds(a,d); b=vec_andc(b,alphamask); d=vec_subs(a,b); e=vec_subs(b, a); d=vec_add(d,e); StoreUnalignedLess(d, D, length); }
vector signed int test1_and (vector bool int x, vector signed int y) { vector signed int *foo; *foo += vec_and (x, y); return *foo; }
vector unsigned int test6_and (vector unsigned int x, vector unsigned int y) { vector unsigned int *foo; *foo += vec_and (x, y); return *foo; }
template <bool align, bool increment> void InterferenceChange(int16_t * statistic, size_t stride, size_t width, size_t height, uint8_t value, int16_t saturation) { assert(width >= HA); if(align) assert(Aligned(statistic) && Aligned(stride, HA)); size_t alignedWidth = Simd::AlignLo(width, HA); v128_s16 tailMask = (v128_s16)ShiftLeft(K16_FFFF, HA - width + alignedWidth); v128_s16 _value = SetI16(value); v128_s16 _saturation = SetI16(saturation); for(size_t row = 0; row < height; ++row) { Loader<align> statisticSrc(statistic); Storer<align> statisticDst(statistic); InterferenceChange<align, true, increment>(statisticSrc, _value, _saturation, statisticDst); for(size_t col = HA; col < alignedWidth; col += HA) InterferenceChange<align, false, increment>(statisticSrc, _value, _saturation, statisticDst); Flush(statisticDst); if(alignedWidth != width) { Loader<false> statisticSrc(statistic + width - HA); Storer<false> statisticDst(statistic + width - HA); InterferenceChange<false, true, increment>(statisticSrc, vec_and(_value, tailMask), _saturation, statisticDst); Flush(statisticDst); } statistic += stride; } }
// out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0) static inline vec_u8_t h264_deblock_q1(register vec_u8_t p0, register vec_u8_t p1, register vec_u8_t p2, register vec_u8_t q0, register vec_u8_t tc0) { register vec_u8_t average = vec_avg(p0, q0); register vec_u8_t temp; register vec_u8_t uncliped; register vec_u8_t ones; register vec_u8_t max; register vec_u8_t min; register vec_u8_t newp1; temp = vec_xor(average, p2); average = vec_avg(average, p2); /*avg(p2, avg(p0, q0)) */ ones = vec_splat_u8(1); temp = vec_and(temp, ones); /*(p2^avg(p0, q0)) & 1 */ uncliped = vec_subs(average, temp); /*(p2+((p0+q0+1)>>1))>>1 */ max = vec_adds(p1, tc0); min = vec_subs(p1, tc0); newp1 = vec_max(min, uncliped); newp1 = vec_min(max, newp1); return newp1; }
vector unsigned short test6_and (vector unsigned short x, vector unsigned short y) { vector unsigned short *foo; *foo += vec_and (x, y); return *foo; }
// out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0) static inline vector unsigned char h264_deblock_q1(register vector unsigned char p0, register vector unsigned char p1, register vector unsigned char p2, register vector unsigned char q0, register vector unsigned char tc0) { register vector unsigned char average = vec_avg(p0, q0); register vector unsigned char temp; register vector unsigned char uncliped; register vector unsigned char ones; register vector unsigned char max; register vector unsigned char min; register vector unsigned char newp1; temp = vec_xor(average, p2); average = vec_avg(average, p2); /*avg(p2, avg(p0, q0)) */ ones = vec_splat_u8(1); temp = vec_and(temp, ones); /*(p2^avg(p0, q0)) & 1 */ uncliped = vec_subs(average, temp); /*(p2+((p0+q0+1)>>1))>>1 */ max = vec_adds(p1, tc0); min = vec_subs(p1, tc0); newp1 = vec_max(min, uncliped); newp1 = vec_min(max, newp1); return newp1; }
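A hedged scalar sketch (not from the source) of the rounding trick in h264_deblock_q1: vec_avg rounds up, so avg(p2, avg(p0, q0)) overshoots by 1 exactly when its two operands differ in bit 0, and subtracting ((p2 ^ avg(p0, q0)) & 1) restores the truncating form (p2 + ((p0 + q0 + 1) >> 1)) >> 1 before clipping to [p1 - tc0, p1 + tc0].

#include <stdint.h>

static inline uint8_t deblock_q1_scalar(uint8_t p0, uint8_t p1, uint8_t p2,
                                        uint8_t q0, uint8_t tc0)
{
    int avg_p0q0  = (p0 + q0 + 1) >> 1;
    int unclipped = (p2 + avg_p0q0) >> 1;
    int lo = p1 - tc0 < 0   ? 0   : p1 - tc0;  /* vec_subs saturates at 0   */
    int hi = p1 + tc0 > 255 ? 255 : p1 + tc0;  /* vec_adds saturates at 255 */
    if (unclipped < lo) return (uint8_t)lo;
    if (unclipped > hi) return (uint8_t)hi;
    return (uint8_t)unclipped;
}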
uint32_t sad8_altivec_c(const uint8_t * cur, const uint8_t *ref, const uint32_t stride) { uint32_t result = 0; register vector unsigned int sad; register vector unsigned char c; register vector unsigned char r; /* initialize */ sad = vec_splat_u32(0); /* Perform sad operations */ SAD8(); SAD8(); SAD8(); SAD8(); SAD8(); SAD8(); SAD8(); SAD8(); /* finish addition, add the first 2 together */ sad = vec_and(sad, (vector unsigned int)vec_pack(vec_splat_u16(-1),vec_splat_u16(0))); sad = (vector unsigned int)vec_sums((vector signed int)sad, vec_splat_s32(0)); sad = vec_splat(sad,3); vec_ste(sad, 0, &result); return result; }
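A hedged scalar reference (not part of the source) for the 8x8 sum of absolute differences that sad8_altivec_c accumulates; the SAD8() macro above appears to handle one row of 8 pixels per invocation.

#include <stdint.h>
#include <stdlib.h>

static uint32_t sad8_scalar(const uint8_t *cur, const uint8_t *ref, uint32_t stride)
{
    uint32_t sad = 0;
    for (int y = 0; y < 8; y++) {
        for (int x = 0; x < 8; x++)
            sad += (uint32_t)abs((int)cur[x] - (int)ref[x]);
        cur += stride;
        ref += stride;
    }
    return sad;
}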
template <bool align> SIMD_INLINE void AddSquareDifference(const uint8_t * src, ptrdiff_t step, const v128_u8 & mask, v128_u32 & sum) { const v128_u8 a = Load<align>(src - step); const v128_u8 b = Load<align>(src + step); const v128_u8 d = vec_and(AbsDifferenceU8(a, b), mask); sum = vec_msum(d, d, sum); }
void ConditionalCount8u(const uint8_t * src, size_t stride, size_t width, size_t height, uint8_t value, uint32_t * count) { assert(width >= A); if (align) assert(Aligned(src) && Aligned(stride)); size_t alignedWidth = AlignLo(width, QA); size_t bodyWidth = AlignLo(width, A); v128_u8 tailMask = ShiftLeft(K8_01, A - width + alignedWidth); v128_u8 _value = SIMD_VEC_SET1_EPI8(value); v128_u32 counts[4] = { K32_00000000, K32_00000000, K32_00000000, K32_00000000 }; for (size_t row = 0; row < height; ++row) { size_t col = 0; for (; col < alignedWidth; col += QA) { ConditionalCount8u<align, compareType>(src, col, _value, counts[0]); ConditionalCount8u<align, compareType>(src, col + A, _value, counts[1]); ConditionalCount8u<align, compareType>(src, col + 2 * A, _value, counts[2]); ConditionalCount8u<align, compareType>(src, col + 3 * A, _value, counts[3]); } for (; col < bodyWidth; col += A) ConditionalCount8u<align, compareType>(src, col, _value, counts[0]); if (alignedWidth != width) { const v128_u8 mask = vec_and(Compare8u<compareType>(Load<false>(src + width - A), _value), tailMask); counts[0] = vec_msum(mask, K8_01, counts[0]); } src += stride; } counts[0] = vec_add(vec_add(counts[0], counts[1]), vec_add(counts[2], counts[3])); *count = ExtractSum(counts[0]); }
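A hedged scalar sketch (not from the Simd source) of what ConditionalCount8u accumulates, shown here with equality as the compare; other SimdCompareType values only change the predicate. The helper name is illustrative.

#include <stdint.h>
#include <stddef.h>

static void conditional_count_8u_scalar(const uint8_t *src, size_t stride,
                                        size_t width, size_t height,
                                        uint8_t value, uint32_t *count)
{
    uint32_t n = 0;
    for (size_t row = 0; row < height; ++row) {
        for (size_t col = 0; col < width; ++col)
            n += (src[col] == value); /* predicate selected by compareType */
        src += stride;
    }
    *count = n;
}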
vector signed short test1_and (vector bool short x, vector signed short y) { vector signed short *foo; *foo += vec_and (x, y); return *foo; }
void ConditionalCount16i(const uint8_t * src, size_t stride, size_t width, size_t height, int16_t value, uint32_t * count) { assert(width >= HA); if (align) assert(Aligned(src) && Aligned(stride)); size_t alignedWidth = AlignLo(width, DA); size_t bodyWidth = Simd::AlignLo(width, HA); v128_u16 tailMask = ShiftLeft(K16_0001, HA - width + alignedWidth); v128_s16 _value = SIMD_VEC_SET1_EPI16(value); v128_u32 counts[4] = { K32_00000000, K32_00000000, K32_00000000, K32_00000000 }; for (size_t row = 0; row < height; ++row) { const int16_t * s = (const int16_t *)src; size_t col = 0; for (; col < alignedWidth; col += DA) { ConditionalCount16i<align, compareType>(s, col, _value, counts[0]); ConditionalCount16i<align, compareType>(s, col + HA, _value, counts[1]); ConditionalCount16i<align, compareType>(s, col + 2 * HA, _value, counts[2]); ConditionalCount16i<align, compareType>(s, col + 3 * HA, _value, counts[3]); } for (; col < bodyWidth; col += HA) ConditionalCount16i<align, compareType>(s, col, _value, counts[0]); if (alignedWidth != width) { const v128_u16 mask = vec_and((v128_u16)Compare16i<compareType>(Load<false>(s + width - HA), _value), tailMask); counts[0] = vec_msum(mask, K16_0001, counts[0]); } src += stride; } counts[0] = vec_add(vec_add(counts[0], counts[1]), vec_add(counts[2], counts[3])); *count = ExtractSum(counts[0]); }
template <bool align> void AbsDifferenceSums3x3Masked(const uint8_t *current, size_t currentStride, const uint8_t *background, size_t backgroundStride, const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sums) { assert(height > 2 && width >= A + 2); if (align) assert(Aligned(background) && Aligned(backgroundStride)); width -= 2; height -= 2; current += 1 + currentStride; background += 1 + backgroundStride; mask += 1 + maskStride; size_t bodyWidth = AlignLo(width, A); v128_u8 tailMask = ShiftLeft(K8_FF, A - width + bodyWidth); v128_u8 _index = SetU8(index); for (size_t i = 0; i < 9; ++i) sums[i] = 0; for (size_t row = 0; row < height; ++row) { v128_u32 _sums[9]; for (size_t i = 0; i < 9; ++i) _sums[i] = K32_00000000; for (size_t col = 0; col < bodyWidth; col += A) { const v128_u8 _mask = LoadMaskU8<false>(mask + col, _index); const v128_u8 _current = vec_and(Load<false>(current + col), _mask); AbsDifferenceSums3x3Masked<align>(_current, background + col, backgroundStride, _mask, _sums); } if (width - bodyWidth) { const v128_u8 _mask = vec_and(LoadMaskU8<false>(mask + width - A, _index), tailMask); const v128_u8 _current = vec_and(Load<false>(current + width - A), _mask); AbsDifferenceSums3x3Masked<false>(_current, background + width - A, backgroundStride, _mask, _sums); } for (size_t i = 0; i < 9; ++i) sums[i] += ExtractSum(_sums[i]); current += currentStride; background += backgroundStride; mask += maskStride; } }
void ConditionalSquareGradientSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, const uint8_t * mask, size_t maskStride, uint8_t value, uint64_t * sum) { assert(width >= A + 2 && height >= 3); if (align) assert(Aligned(src) && Aligned(srcStride) && Aligned(mask) && Aligned(maskStride)); src += srcStride; mask += maskStride; height -= 2; size_t bodyWidth = Simd::AlignLo(width - 1, A); v128_u8 noseMask = ShiftRight(K8_FF, 1); v128_u8 tailMask = ShiftLeft(K8_FF, A - width + 1 + bodyWidth); size_t alignedWidth = Simd::AlignLo(bodyWidth - A, DA); v128_u8 _value = SetU8(value); *sum = 0; for (size_t row = 0; row < height; ++row) { v128_u32 sums[4] = { K32_00000000, K32_00000000, K32_00000000, K32_00000000 }; { const v128_u8 _mask = vec_and(Compare8u<compareType>(Load<false>(mask + 1), _value), noseMask); AddSquareDifference<false>(src + 1, 1, _mask, sums[0]); AddSquareDifference<false>(src + 1, srcStride, _mask, sums[1]); } size_t col = A; for (; col < alignedWidth; col += DA) { ConditionalSquareGradientSum<align, compareType>(src, srcStride, mask, col, _value, sums); ConditionalSquareGradientSum<align, compareType>(src, srcStride, mask, col + A, _value, sums + 2); } for (; col < bodyWidth; col += A) ConditionalSquareGradientSum<align, compareType>(src, srcStride, mask, col, _value, sums); if (bodyWidth != width - 1) { size_t offset = width - A - 1; const v128_u8 _mask = vec_and(Compare8u<compareType>(Load<false>(mask + offset), _value), tailMask); AddSquareDifference<false>(src + offset, 1, _mask, sums[0]); AddSquareDifference<false>(src + offset, srcStride, _mask, sums[1]); } sums[0] = vec_add(vec_add(sums[0], sums[1]), vec_add(sums[2], sums[3])); *sum += ExtractSum(sums[0]); src += srcStride; mask += maskStride; } }
static inline vector unsigned char h264_deblock_mask ( register vector unsigned char p0, register vector unsigned char p1, register vector unsigned char q0, register vector unsigned char q1, register vector unsigned char alpha, register vector unsigned char beta) { register vector unsigned char mask; register vector unsigned char tempmask; mask = diff_lt_altivec(p0, q0, alpha); tempmask = diff_lt_altivec(p1, p0, beta); mask = vec_and(mask, tempmask); tempmask = diff_lt_altivec(q1, q0, beta); mask = vec_and(mask, tempmask); return mask; }
int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) { int i; int s __attribute__((aligned(16))); const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; vector unsigned char t1, t2, t3,t4, t5; vector unsigned int sad; vector signed int sumdiffs; sad = (vector unsigned int)vec_splat_u32(0); permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0); for(i=0;i<8;i++) { /* Read potentially unaligned pixels into t1 and t2 Since we're reading 16 pixels, and actually only want 8, mask out the last 8 pixels. The 0s don't change the sum. */ perm1 = vec_lvsl(0, pix1); pix1v = (vector unsigned char *) pix1; perm2 = vec_lvsl(0, pix2); pix2v = (vector unsigned char *) pix2; t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear); t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear); /* Calculate a sum of abs differences vector */ t3 = vec_max(t1, t2); t4 = vec_min(t1, t2); t5 = vec_sub(t3, t4); /* Add each 4 pixel group together and put 4 results into sad */ sad = vec_sum4s(t5, sad); pix1 += line_size; pix2 += line_size; } /* Sum up the four partial sums, and put the result into s */ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); sumdiffs = vec_splat(sumdiffs, 3); vec_ste(sumdiffs, 0, &s); return s; }
__SIMDd _SIMD_and_pd(__SIMDd a, __SIMDd b) { #ifdef USE_SSE return _mm_and_pd(a,b); #elif defined USE_AVX return _mm256_and_pd(a,b); #elif defined USE_IBM return vec_and(a,b); #endif }
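A hedged usage sketch for the wrapper above (the helper name and the mask argument are illustrative, not part of the library): ANDing packed doubles with a mask whose lanes hold 0x7FFFFFFFFFFFFFFF clears the sign bits, giving a branch-free fabs().

/* Illustrative: abs_mask must hold 0x7FFFFFFFFFFFFFFF in every lane. */
__SIMDd simd_fabs_pd(__SIMDd x, __SIMDd abs_mask)
{
    return _SIMD_and_pd(x, abs_mask);
}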
template <bool align> v128_u8 LbpEstimate(const uint8_t * src, ptrdiff_t stride) { v128_u8 threshold = Load<false>(src); v128_u8 lbp = K8_00; lbp = vec_or(lbp, vec_and(GreaterOrEqual(Load<align>(src - 1 - stride), threshold), K8_01)); lbp = vec_or(lbp, vec_and(GreaterOrEqual(Load<false>(src - stride), threshold), K8_02)); lbp = vec_or(lbp, vec_and(GreaterOrEqual(Load<false>(src + 1 - stride), threshold), K8_04)); lbp = vec_or(lbp, vec_and(GreaterOrEqual(Load<false>(src + 1 ), threshold), K8_08)); lbp = vec_or(lbp, vec_and(GreaterOrEqual(Load<false>(src + 1 + stride), threshold), K8_10)); lbp = vec_or(lbp, vec_and(GreaterOrEqual(Load<false>(src + stride), threshold), K8_20)); lbp = vec_or(lbp, vec_and(GreaterOrEqual(Load<align>(src - 1 + stride), threshold), K8_40)); lbp = vec_or(lbp, vec_and(GreaterOrEqual(Load<align>(src - 1 ), threshold), K8_80)); return lbp; }
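A hedged scalar sketch (not from the source) of the 8-neighbour local binary pattern built by LbpEstimate: each neighbour that is >= the centre pixel sets one bit, in the same order as the K8_01..K8_80 constants.

#include <stdint.h>
#include <stddef.h>

static uint8_t lbp_scalar(const uint8_t *src, ptrdiff_t stride)
{
    uint8_t c = src[0], lbp = 0;
    lbp |= (src[-1 - stride] >= c) ? 0x01 : 0;
    lbp |= (src[    -stride] >= c) ? 0x02 : 0;
    lbp |= (src[ 1 - stride] >= c) ? 0x04 : 0;
    lbp |= (src[ 1         ] >= c) ? 0x08 : 0;
    lbp |= (src[ 1 + stride] >= c) ? 0x10 : 0;
    lbp |= (src[     stride] >= c) ? 0x20 : 0;
    lbp |= (src[-1 + stride] >= c) ? 0x40 : 0;
    lbp |= (src[-1         ] >= c) ? 0x80 : 0;
    return lbp;
}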
void ConditionalSquareSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, const uint8_t * mask, size_t maskStride, uint8_t value, uint64_t * sum) { assert(width >= A); if (align) assert(Aligned(src) && Aligned(srcStride) && Aligned(mask) && Aligned(maskStride)); size_t alignedWidth = AlignLo(width, QA); size_t bodyWidth = AlignLo(width, A); v128_u8 tailMask = ShiftLeft(K8_FF, A - width + alignedWidth); v128_u8 _value = SetU8(value); *sum = 0; for (size_t row = 0; row < height; ++row) { size_t col = 0; v128_u32 sums[4] = { K32_00000000, K32_00000000, K32_00000000, K32_00000000 }; for (; col < alignedWidth; col += QA) { ConditionalSquareSum<align, compareType>(src, mask, col, _value, sums[0]); ConditionalSquareSum<align, compareType>(src, mask, col + A, _value, sums[1]); ConditionalSquareSum<align, compareType>(src, mask, col + 2 * A, _value, sums[2]); ConditionalSquareSum<align, compareType>(src, mask, col + 3 * A, _value, sums[3]); } sums[0] = vec_add(vec_add(sums[0], sums[1]), vec_add(sums[2], sums[3])); for (; col < bodyWidth; col += A) ConditionalSquareSum<align, compareType>(src, mask, col, _value, sums[0]); if (alignedWidth != width) { const v128_u8 _mask = Compare8u<compareType>(Load<false>(mask + width - A), _value); const v128_u8 _src = vec_and(vec_and(Load<false>(src + width - A), _mask), tailMask); sums[0] = vec_msum(_src, _src, sums[0]); } *sum += ExtractSum(sums[0]); src += srcStride; mask += maskStride; } }
template <bool align> void AbsDifferenceSums3x3(const uint8_t * current, size_t currentStride, const uint8_t * background, size_t backgroundStride, size_t width, size_t height, uint64_t * sums) { assert(height > 2 && width >= A + 2); if (align) assert(Aligned(background) && Aligned(backgroundStride)); width -= 2; height -= 2; current += 1 + currentStride; background += 1 + backgroundStride; size_t alignedWidth = AlignLo(width, DA); size_t bodyWidth = AlignLo(width, A); v128_u8 tailMask = ShiftLeft(K8_FF, A - width + bodyWidth); memset(sums, 0, 9 * sizeof(uint64_t)); for (size_t row = 0; row < height; ++row) { v128_u32 _sums[2][9]; memset(_sums, 0, 18 * sizeof(v128_u32)); size_t col = 0; for (; col < alignedWidth; col += DA) { AbsDifferenceSums3x3<align>(Load<false>(current + col), background + col, backgroundStride, _sums[0]); AbsDifferenceSums3x3<align>(Load<false>(current + col + A), background + col + A, backgroundStride, _sums[0]); } for (; col < bodyWidth; col += A) AbsDifferenceSums3x3<align>(Load<false>(current + col), background + col, backgroundStride, _sums[0]); if (width - bodyWidth) { const v128_u8 _current = vec_and(tailMask, Load<false>(current + width - A)); AbsDifferenceSums3x3Masked<false>(_current, background + width - A, backgroundStride, tailMask, _sums[0]); } for (size_t i = 0; i < 9; ++i) sums[i] += ExtractSum(vec_add(_sums[0][i], _sums[1][i])); current += currentStride; background += backgroundStride; } }
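A hedged scalar sketch (not from the source) of what the nine sums mean: absolute-difference totals between the current image and the background shifted by each offset of the 3x3 neighbourhood. The row-major index order used below is an assumption.

#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>

static void abs_difference_sums_3x3_scalar(const uint8_t *current, size_t currentStride,
                                           const uint8_t *background, size_t backgroundStride,
                                           size_t width, size_t height, uint64_t *sums)
{
    for (int i = 0; i < 9; ++i)
        sums[i] = 0;
    for (size_t y = 1; y + 1 < height; ++y)
        for (size_t x = 1; x + 1 < width; ++x)
            for (int dy = -1; dy <= 1; ++dy)
                for (int dx = -1; dx <= 1; ++dx)
                    sums[(dy + 1) * 3 + (dx + 1)] +=
                        (uint64_t)abs((int)current[y * currentStride + x] -
                                      (int)background[(y + dy) * backgroundStride + (x + dx)]);
}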
template <bool align> void AbsDifferenceSumMasked( const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum) { assert(width >= A); if (align) { assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride)); assert(Aligned(mask) && Aligned(maskStride)); } size_t alignedWidth = AlignLo(width, QA); size_t bodyWidth = AlignLo(width, A); v128_u8 tailMask = ShiftLeft(K8_FF, A - width + bodyWidth); v128_u8 _index = SetU8(index); *sum = 0; for (size_t row = 0; row < height; ++row) { size_t col = 0; v128_u32 sums[4] = { K32_00000000, K32_00000000, K32_00000000, K32_00000000 }; for (; col < alignedWidth; col += QA) { AbsDifferenceSumMasked<align>(a, b, mask, col, _index, sums[0]); AbsDifferenceSumMasked<align>(a, b, mask, col + A, _index, sums[1]); AbsDifferenceSumMasked<align>(a, b, mask, col + 2 * A, _index, sums[2]); AbsDifferenceSumMasked<align>(a, b, mask, col + 3 * A, _index, sums[3]); } sums[0] = vec_add(vec_add(sums[0], sums[1]), vec_add(sums[2], sums[3])); for (; col < bodyWidth; col += A) AbsDifferenceSumMasked<align>(a, b, mask, col, _index, sums[0]); if (width - bodyWidth) { const v128_u8 _mask = vec_and(tailMask, LoadMaskU8<false>(mask + width - A, _index)); AbsDifferenceSumMasked<false>(a, b, width - A, _mask, sums[0]); } *sum += ExtractSum(sums[0]); a += aStride; b += bStride; mask += maskStride; } }
static void vorbis_inverse_coupling_altivec(float *mag, float *ang, intptr_t blocksize) { int i; vector float m, a; vector bool int t0, t1; const vector unsigned int v_31 = //XXX vec_add(vec_add(vec_splat_u32(15),vec_splat_u32(15)),vec_splat_u32(1)); for (i = 0; i < blocksize; i += 4) { m = vec_ld(0, mag+i); a = vec_ld(0, ang+i); t0 = vec_cmple(m, (vector float)vec_splat_u32(0)); t1 = vec_cmple(a, (vector float)vec_splat_u32(0)); a = vec_xor(a, (vector float) vec_sl((vector unsigned int)t0, v_31)); t0 = (vector bool int)vec_and(a, t1); t1 = (vector bool int)vec_andc(a, t1); a = vec_sub(m, (vector float)t1); m = vec_add(m, (vector float)t0); vec_stl(a, 0, ang+i); vec_stl(m, 0, mag+i); } }
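For comparison, a hedged scalar version of the same magnitude/angle coupling inversion; the branch structure below is what the vector code reproduces branch-free with sign flips and select masks.

#include <stdint.h>

static void vorbis_inverse_coupling_scalar(float *mag, float *ang, intptr_t blocksize)
{
    for (intptr_t i = 0; i < blocksize; i++) {
        float m = mag[i], a = ang[i];
        if (m > 0.0f) {
            if (a > 0.0f) { ang[i] = m - a; }
            else          { ang[i] = m; mag[i] = m + a; }
        } else {
            if (a > 0.0f) { ang[i] = m + a; }
            else          { ang[i] = m; mag[i] = m - a; }
        }
    }
}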
void pix_background :: processYUVAltivec(imageStruct &image) { register int h,w,i,j,width; int pixsize = image.xsize * image.ysize * image.csize; h = image.ysize; w = image.xsize/8; width = image.xsize/8; //check to see if the buffer isn't 16byte aligned (highly unlikely) if (image.ysize*image.xsize % 16 != 0){ error("image not properly aligned for Altivec - try something SD or HD maybe?"); return; } union{ unsigned short s[8]; vector unsigned short v; }shortBuffer; if(m_savedImage.xsize!=image.xsize || m_savedImage.ysize!=image.ysize || m_savedImage.format!=image.format)m_reset=1; m_savedImage.xsize=image.xsize; m_savedImage.ysize=image.ysize; m_savedImage.setCsizeByFormat(image.format); m_savedImage.reallocate(); if (m_reset){ memcpy(m_savedImage.data,image.data,pixsize); m_reset = 0; } register vector unsigned short UVres1, Yres1, UVres2, Yres2;//interleave; register vector unsigned short hiImage, loImage; register vector unsigned short Yrange, UVrange, Yblank,UVblank,blank; register vector bool short Ymasklo,Ymaskhi, UVmaskhi; register vector unsigned short Yhi,Ylo,UVhi,UVlo; register vector unsigned char one = vec_splat_u8(1); register vector unsigned short sone = vec_splat_u16(1); register vector unsigned int Uhi, Ulo, Vhi, Vlo,Ures,Vres; register vector bool int Umasklo, Umaskhi, Vmaskhi, Vmasklo; vector unsigned char *inData = (vector unsigned char*) image.data; vector unsigned char *rightData = (vector unsigned char*) m_savedImage.data; shortBuffer.s[0] = m_Yrange; Yrange = shortBuffer.v; Yrange = vec_splat(Yrange,0); shortBuffer.s[0] = 128; shortBuffer.s[1] = 0; shortBuffer.s[2] = 128; shortBuffer.s[3] = 0; shortBuffer.s[4] = 128; shortBuffer.s[5] = 0; shortBuffer.s[6] = 128; shortBuffer.s[7] = 0; blank = shortBuffer.v; shortBuffer.s[0] = 0; Yblank = shortBuffer.v; Yblank = vec_splat(Yblank,0); shortBuffer.s[0] = 128; UVblank = shortBuffer.v; UVblank = vec_splat(UVblank,0); shortBuffer.s[0] = m_Urange; shortBuffer.s[1] = m_Vrange; shortBuffer.s[2] = m_Urange; shortBuffer.s[3] = m_Vrange; shortBuffer.s[4] = m_Urange; shortBuffer.s[5] = m_Vrange; shortBuffer.s[6] = m_Urange; shortBuffer.s[7] = m_Vrange; UVrange = shortBuffer.v; //setup the cache prefetch -- A MUST!!! UInt32 prefetchSize = GetPrefetchConstant( 16, 1, 256 ); #ifndef PPC970 vec_dst( inData, prefetchSize, 0 ); vec_dst( rightData, prefetchSize, 1 ); vec_dst( inData+32, prefetchSize, 2 ); vec_dst( rightData+32, prefetchSize, 3 ); #endif //PPC970 for ( i=0; i<h; i++){ for (j=0; j<w; j++) { #ifndef PPC970 //this function is probably memory bound on most G4's -- what else is new? 
vec_dst( inData, prefetchSize, 0 ); vec_dst( rightData, prefetchSize, 1 ); vec_dst( inData+32, prefetchSize, 2 ); vec_dst( rightData+32, prefetchSize, 3 ); #endif //separate the U and V from Y UVres1 = (vector unsigned short)vec_mule(one,inData[0]); UVres2 = (vector unsigned short)vec_mule(one,rightData[0]); //vec_mulo Y * 1 to short vector Y Y Y Y shorts Yres1 = (vector unsigned short)vec_mulo(one,inData[0]); Yres2 = (vector unsigned short)vec_mulo(one,rightData[0]); Yhi = vec_adds(Yres2,Yrange); Ylo = vec_subs(Yres2,Yrange); //go to ints for comparison UVhi = vec_adds(UVres2,UVrange); UVlo = vec_subs(UVres2,UVrange); Uhi = vec_mule(sone,UVhi); Ulo = vec_mule(sone,UVlo); Vhi = vec_mulo(sone,UVhi); Vlo = vec_mulo(sone,UVlo); Ures = vec_mule(sone,UVres1); Vres = vec_mulo(sone,UVres1); Umasklo = vec_cmpgt(Ures,Ulo); Umaskhi = vec_cmplt(Ures,Uhi); Vmasklo = vec_cmpgt(Vres,Vlo); Vmaskhi = vec_cmplt(Vres,Vhi); Umaskhi = vec_and(Umaskhi,Umasklo); Vmaskhi = vec_and(Vmaskhi,Vmasklo); Umasklo = vec_and(Umaskhi,Vmaskhi); Vmasklo = vec_and(Umaskhi,Vmaskhi); hiImage = (vector unsigned short)vec_mergeh(Umasklo,Vmasklo); loImage = (vector unsigned short)vec_mergel(Umasklo,Vmasklo); //pack it back down to bool short UVmaskhi = (vector bool short)vec_packsu(hiImage,loImage); Ymasklo = vec_cmpgt(Yres1,Ylo); Ymaskhi = vec_cmplt(Yres1,Yhi); Ymaskhi = vec_and(Ymaskhi,Ymasklo); Ymaskhi = vec_and(Ymaskhi,UVmaskhi); UVmaskhi = vec_and(Ymaskhi,UVmaskhi); //bitwise comparison and move using the result of the comparison as a mask Yres1 = vec_sel(Yres1,Yblank,Ymaskhi); //UVres1 = vec_sel(UVres1,UVres2,UVmaskhi); UVres1 = vec_sel(UVres1,UVblank,UVmaskhi); //merge the Y and UV back together hiImage = vec_mergeh(UVres1,Yres1); loImage = vec_mergel(UVres1,Yres1); //pack it back down to unsigned char to store inData[0] = vec_packsu(hiImage,loImage); inData++; rightData++; } #ifndef PPC970 vec_dss(0); vec_dss(1); vec_dss(2); vec_dss(3); #endif } }
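A hedged scalar sketch (not from the GEM source) of the per-pixel keying the loop above performs on UYVY data: a pixel whose Y, U and V all fall strictly within the configured ranges of the saved background frame is blanked (Y = 0, U = V = 128); everything else is passed through. The helper name is illustrative and saturation at the 0/255 boundaries is ignored here.

static void background_key_scalar(unsigned char y, unsigned char u, unsigned char v,
                                  unsigned char bgY, unsigned char bgU, unsigned char bgV,
                                  int Yrange, int Urange, int Vrange,
                                  unsigned char *outY, unsigned char *outU, unsigned char *outV)
{
    int isBackground = y > bgY - Yrange && y < bgY + Yrange &&
                       u > bgU - Urange && u < bgU + Urange &&
                       v > bgV - Vrange && v < bgV + Vrange;
    *outY = isBackground ? 0   : y;
    *outU = isBackground ? 128 : u;
    *outV = isBackground ? 128 : v;
}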
int main () { vector float fa = {1.0, 2.0, 3.0, -4.0}; vector float fb = {-2.0, -3.0, -4.0, -5.0}; vector float fc = vec_cpsgn (fa, fb); vector long long la = {5L, 14L}; vector long long lb = {3L, 86L}; vector long long lc = vec_and (la, lb); vector bool long long ld = {0, -1}; vector long long le = vec_and (la, ld); vector long long lf = vec_and (ld, lb); vector unsigned long long ua = {5L, 14L}; vector unsigned long long ub = {3L, 86L}; vector unsigned long long uc = vec_and (ua, ub); vector bool long long ud = {0, -1}; vector unsigned long long ue = vec_and (ua, ud); vector unsigned long long uf = vec_and (ud, ub); vector long long lg = vec_andc (la, lb); vector long long lh = vec_andc (la, ld); vector long long li = vec_andc (ld, lb); vector unsigned long long ug = vec_andc (ua, ub); vector unsigned long long uh = vec_andc (ua, ud); vector unsigned long long ui = vec_andc (ud, ub); vector double da = {1.0, -4.0}; vector double db = {-2.0, 5.0}; vector double dc = vec_cpsgn (da, db); vector long long lj = vec_mergeh (la, lb); vector long long lk = vec_mergeh (la, ld); vector long long ll = vec_mergeh (ld, la); vector unsigned long long uj = vec_mergeh (ua, ub); vector unsigned long long uk = vec_mergeh (ua, ud); vector unsigned long long ul = vec_mergeh (ud, ua); vector long long lm = vec_mergel (la, lb); vector long long ln = vec_mergel (la, ld); vector long long lo = vec_mergel (ld, la); vector unsigned long long um = vec_mergel (ua, ub); vector unsigned long long un = vec_mergel (ua, ud); vector unsigned long long uo = vec_mergel (ud, ua); vector long long lp = vec_nor (la, lb); vector long long lq = vec_nor (la, ld); vector long long lr = vec_nor (ld, la); vector unsigned long long up = vec_nor (ua, ub); vector unsigned long long uq = vec_nor (ua, ud); vector unsigned long long ur = vec_nor (ud, ua); vector long long ls = vec_or (la, lb); vector long long lt = vec_or (la, ld); vector long long lu = vec_or (ld, la); vector unsigned long long us = vec_or (ua, ub); vector unsigned long long ut = vec_or (ua, ud); vector unsigned long long uu = vec_or (ud, ua); vector unsigned char ca = {0,4,8,1,5,9,2,6,10,3,7,11,15,12,14,13}; vector long long lv = vec_perm (la, lb, ca); vector unsigned long long uv = vec_perm (ua, ub, ca); vector long long lw = vec_sel (la, lb, lc); vector long long lx = vec_sel (la, lb, uc); vector long long ly = vec_sel (la, lb, ld); vector unsigned long long uw = vec_sel (ua, ub, lc); vector unsigned long long ux = vec_sel (ua, ub, uc); vector unsigned long long uy = vec_sel (ua, ub, ld); vector long long lz = vec_xor (la, lb); vector long long l0 = vec_xor (la, ld); vector long long l1 = vec_xor (ld, la); vector unsigned long long uz = vec_xor (ua, ub); vector unsigned long long u0 = vec_xor (ua, ud); vector unsigned long long u1 = vec_xor (ud, ua); int ia = vec_all_eq (ua, ub); int ib = vec_all_ge (ua, ub); int ic = vec_all_gt (ua, ub); int id = vec_all_le (ua, ub); int ie = vec_all_lt (ua, ub); int ig = vec_all_ne (ua, ub); int ih = vec_any_eq (ua, ub); int ii = vec_any_ge (ua, ub); int ij = vec_any_gt (ua, ub); int ik = vec_any_le (ua, ub); int il = vec_any_lt (ua, ub); int im = vec_any_ne (ua, ub); vector int sia = {9, 16, 25, 36}; vector int sib = {-8, -27, -64, -125}; vector int sic = vec_mergee (sia, sib); vector int sid = vec_mergeo (sia, sib); vector unsigned int uia = {9, 16, 25, 36}; vector unsigned int uib = {8, 27, 64, 125}; vector unsigned int uic = vec_mergee (uia, uib); vector unsigned int uid = vec_mergeo (uia, uib); vector bool int bia = {0, -1, 
-1, 0}; vector bool int bib = {-1, -1, 0, -1}; vector bool int bic = vec_mergee (bia, bib); vector bool int bid = vec_mergeo (bia, bib); vector unsigned int uie = vec_packsu (ua, ub); vector long long l2 = vec_cntlz (la); vector unsigned long long u2 = vec_cntlz (ua); vector int sie = vec_cntlz (sia); vector unsigned int uif = vec_cntlz (uia); vector short ssa = {20, -40, -60, 80, 100, -120, -140, 160}; vector short ssb = vec_cntlz (ssa); vector unsigned short usa = {81, 72, 63, 54, 45, 36, 27, 18}; vector unsigned short usb = vec_cntlz (usa); vector signed char sca = {-4, 3, -9, 15, -31, 31, 0, 0, 1, 117, -36, 99, 98, 97, 96, 95}; vector signed char scb = vec_cntlz (sca); vector unsigned char cb = vec_cntlz (ca); vector double dd = vec_xl (0, &y); vec_xst (dd, 0, &z); vector double de = vec_round (dd); vector double df = vec_splat (de, 0); vector double dg = vec_splat (de, 1); vector long long l3 = vec_splat (l2, 0); vector long long l4 = vec_splat (l2, 1); vector unsigned long long u3 = vec_splat (u2, 0); vector unsigned long long u4 = vec_splat (u2, 1); vector bool long long l5 = vec_splat (ld, 0); vector bool long long l6 = vec_splat (ld, 1); vector long long l7 = vec_div (l3, l4); vector unsigned long long u5 = vec_div (u3, u4); vector long long l8 = vec_mul (l3, l4); vector unsigned long long u6 = vec_mul (u3, u4); vector double dh = vec_ctf (la, -2); vector double di = vec_ctf (ua, 2); vector long long l9 = vec_cts (dh, -2); vector unsigned long long u7 = vec_ctu (di, 2); return 0; }