// NEON 2:1 mipmap downsampler for 32-bit (4 bytes/pixel) images.
// Each output pixel is the per-byte average of a 2x2 source block, computed
// with two truncating halving adds (vhaddq_u8), so it can differ from an
// exactly rounded average by at most 1 per step.  About twice as fast as the
// generic scalar path.
//
// width/height are the SOURCE dimensions; target receives a
// (width/2) x (height/2) image.  source and target must not overlap.
void MipMap_32_neon( int width, int height, unsigned char *source, unsigned char *target )
{
    // The vector loop consumes 8 source pixels (32 bytes) per iteration, so it
    // is only correct when the input width is a multiple of 8.  Fall back to
    // the scalar path otherwise (the previous guard only checked width < 8 and
    // would read past the row and overrun the output on widths such as 12).
    if( width < 8 || (width % 8) != 0 )
    {
        MipMap_32_generic(width, height, source, target);
        return;
    }
    int newwidth = width / 2;
    int newheight = height / 2;
    int stride = width * 4;          // source row length in bytes
    unsigned char *s = target;       // output write cursor
    unsigned char *t = source;       // even source row
    unsigned char *u = t+stride;     // odd source row (one row below t)
    int y, x;
    for( y = 0; y < newheight; y++ )
    {
        for( x = 0; x < newwidth; x+=4 )
        {
            uint8x16_t a0, a1, a2, a3;
            // memcpy keeps the loads alignment-safe; compilers lower these
            // to plain vld1q loads.
            memcpy(&a0, t, 16);
            memcpy(&a1, t+16, 16);
            memcpy(&a2, u, 16);
            memcpy(&a3, u+16, 16);
            // average first and second scan lines
            a0 = vhaddq_u8(a0, a2);
            a1 = vhaddq_u8(a1, a3);
            // de-interleave even/odd pixels; each pixel is one 32-bit lane,
            // so vuzpq_u32 puts even-x pixels in val[0] and odd-x in val[1]
            uint32x4x2_t z = vuzpq_u32(vreinterpretq_u32_u8(a0), vreinterpretq_u32_u8(a1));
            // average even and odd x pixels
            a0 = vhaddq_u8(vreinterpretq_u8_u32(z.val[0]), vreinterpretq_u8_u32(z.val[1]));
            memcpy(s, &a0, 16);
            s+=16;        // wrote 4 output pixels
            t+=32;        // consumed 8 source pixels from each input row
            u+=32;
        }
        // t/u already advanced one full row inside the inner loop;
        // add one more stride to skip the odd source row.
        t += stride;
        u += stride;
    }
}
/* Smoke test for the vhaddq_u8 intrinsic: exercises only that the intrinsic
 * accepts two uint8x16_t operands and produces a uint8x16_t result.  The
 * operands are uninitialized and the result is discarded -- only the types
 * and the instruction selection matter here. */
void test_vhaddQu8 (void)
{
  uint8x16_t result;
  uint8x16_t lhs;
  uint8x16_t rhs;

  result = vhaddq_u8 (lhs, rhs);
}
// D45 (45-degree down-right) intra predictor for a 16x16 block, NEON version.
// The top row is smoothed with a 3-tap filter -- a truncating halving add of
// above[i] and above[i+2] followed by a rounding halving add with above[i+1] --
// and each successive predicted row is that filtered row slid one pixel to the
// left, with lanes past the top-right corner filled by replicated above[15].
// The left-neighbour pixels are not used by this mode.
void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) {
  (void)left;
  const uint8x16_t top = vld1q_u8(above);                 // above[0..15]
  const uint8x16_t fill = vld1q_dup_u8(above + 15);       // replicated corner
  const uint8x16_t shift1 = vextq_u8(top, fill, 1);       // above[i+1]
  const uint8x16_t shift2 = vextq_u8(top, fill, 2);       // above[i+2]
  // ((above[i] + above[i+2]) >> 1 + above[i+1] + 1) >> 1
  uint8x16_t pred = vrhaddq_u8(vhaddq_u8(top, shift2), shift1);
  uint8_t *out = dst;
  int row = 0;
  while (row < 15) {
    vst1q_u8(out, pred);
    pred = vextq_u8(pred, fill, 1);   // slide one pixel left for the next row
    out += stride;
    ++row;
  }
  vst1q_u8(out, pred);                // final row, no further shift needed
}
// about twice as fast as generic
//
// NEON 2:1 mipmap downsampler for 24-bit (3 bytes/pixel) images: each output
// pixel is the per-channel average of a 2x2 source block, using truncating
// halving adds (vhaddq_u8).  width/height are the SOURCE dimensions; target
// receives a (width/2) x (height/2) image.
//
// NOTE(review): the vector loop consumes 32 source pixels (96 bytes) per
// iteration, but the guard only rejects width < 32 -- widths >= 32 that are
// not a multiple of 32 would appear to over-read/over-write the rows; confirm
// against the callers' allowed widths.
void MipMap_24_neon( int width, int height, unsigned char *source, unsigned char *target )
{
    // fall back to the scalar implementation for narrow images
    if(width < 32) { MipMap_24_generic(width, height, source, target); return; }
    int newwidth = width / 2;
    int newheight = height / 2;
    int stride = width * 3;          // source row length in bytes
    unsigned char *s = target;       // output write cursor
    unsigned char *t = source;       // even source row
    unsigned char *u = t+stride;     // odd source row (one row below t)
    int y, x;
    for( y = 0; y < newheight; y++ )
    {
        for( x = 0; x < newwidth; x+=16 )
        {
            // load 96 bytes (32 pixels) from each of the two source rows;
            // memcpy keeps the loads alignment-safe
            uint8x16_t a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, aa, ab;
            memcpy(&a0, t, 16); memcpy(&a1, t+16, 16); memcpy(&a2, t+32, 16);
            memcpy(&a3, t+48, 16); memcpy(&a4, t+64, 16); memcpy(&a5, t+80, 16);
            memcpy(&a6, u, 16); memcpy(&a7, u+16, 16); memcpy(&a8, u+32, 16);
            memcpy(&a9, u+48, 16); memcpy(&aa, u+64, 16); memcpy(&ab, u+80, 16);
            // average first and second scan lines
            a0 = vhaddq_u8(a0, a6);
            a1 = vhaddq_u8(a1, a7);
            a2 = vhaddq_u8(a2, a8);
            a3 = vhaddq_u8(a3, a9);
            a4 = vhaddq_u8(a4, aa);
            a5 = vhaddq_u8(a5, ab);
            // at this point, we have averaged the two scanlines,
            // in 24bit it's a bit of a pain to average the pixels
            // because they are aligned to 3 bytes
#if 0
            // simple readable version finishing without neon (slower)
            uint8_t b[96], c[96];
            memcpy(b, &a0, 16); memcpy(b+16, &a1, 16); memcpy(b+32, &a2, 16);
            memcpy(b+48, &a3, 16); memcpy(b+64, &a4, 16); memcpy(b+80, &a5, 16);
            int i,j;
            for( i=0; i<16; i++)
                for( j=0; j<3; j++)
                    s[3*i+j] = (b[3*2*i+j] + b[3*(2*i+1)+j]) / 2;
#else
            // full neon version with swizzel (ugly but fast)
            //
            // Strategy: vtbl2/vtbl4 table lookups gather the 3-byte pixels of
            // the even and odd x positions into two separate vectors, which
            // are then averaged with one vhaddq_u8.  z is a 32-byte scratch
            // window over two consecutive 16-byte source vectors; z2 aliases
            // its low 16 bytes and z3 its high 16 bytes, so the memcpy calls
            // below slide the window along the 96-byte row.  The lookup-index
            // constants (sNl/sNh) are therefore order-dependent on the
            // surrounding memcpy calls -- do not reorder these statements.
            uint8x8_t r0, r1;
            uint8x8x4_t z;
            uint8x8x2_t *z2 = (uint8x8x2_t*)&z, *z3 = (uint8x8x2_t*)&z+1;
            // byte indices gathered for output bytes 0..47 (aNN = byte NN of
            // the averaged row); first line = even pixels, second = odd:
            // a00 a01 a02 a06 a07 a08 a0c a0d a0e a12 a13 a14 a18 a19 a1a a1e
            // a03 a04 a05 a09 a0a a0b a0f a10 a11 a15 a16 a17 a1b a1c a1d a21
#define int8x16_to_8x8x2(v) ((int8x8x2_t) { vget_low_s8(v), vget_high_s8(v) })
            uint8x8_t s0l = {0, 1, 2, 6, 7, 8, 12, 13};
            memcpy(&z, &a0, sizeof a0);
            r0 = vtbl2_u8(*z2, s0l);
            memcpy(z3, &a1, sizeof a1);
            // indices >= 16 select from the second vector in the z window
            uint8x8_t s0h = {14, 16+2, 16+3, 16+4, 16+8, 16+9, 16+10, 16+14};
            r1 = vtbl4_u8(z, s0h);
            a6 = vcombine_u8 (r0, r1);
            uint8x8_t s1l = {3, 4, 5, 9, 10, 11, 15, 16+0};
            r0 = vtbl4_u8(z, s1l);
            memcpy(&z, &a2, sizeof a2);
            uint8x8_t s1h = {16+1, 16+5, 16+6, 16+7, 16+11, 16+12, 16+13, 1};
            r1 = vtbl4_u8(z, s1h);
            a7 = vcombine_u8 (r0, r1);
            // average even and odd pixels -> first 16 output bytes
            a0 = vhaddq_u8(a6, a7);
            // a1f a20 a24 a25 a26 a2a a2b a2c a30 a31 a32 a36 a37 a38 a3c a3d
            // a22 a23 a27 a28 a29 a2d a2e a2f a33 a34 a35 a39 a3a a3b a3f a40
            uint8x8_t s2l = {16+15, 0, 4, 5, 6, 10, 11, 12};
            r0 = vtbl4_u8(z, s2l);
            uint8x8_t s2h = {0, 1, 2, 6, 7, 8, 12, 13};
            memcpy(z3, &a3, sizeof a3);
            r1 = vtbl2_u8(*z3, s2h);
            a6 = vcombine_u8 (r0, r1);
            uint8x8_t s3l = {2, 3, 7, 8, 9, 13, 14, 15};
            r0 = vtbl2_u8(*z2, s3l);
            memcpy(&z, &a4, sizeof a4);
            uint8x8_t s3h = {16+3, 16+4, 16+5, 16+9, 16+10, 16+11, 16+15, 0};
            r1 = vtbl4_u8(z, s3h);
            a7 = vcombine_u8 (r0, r1);
            // middle 16 output bytes
            a1 = vhaddq_u8(a6, a7);
            // a3e a42 a43 a44 a48 a49 a4a a4e a4f a50 a54 a55 a56 a5a a5b a5c
            // a41 a45 a46 a47 a4b a4c a4d a51 a52 a53 a57 a58 a59 a5d a5e a5f
            uint8x8_t s4l = {16+14, 2, 3, 4, 8, 9, 10, 14};
            r0 = vtbl4_u8(z, s4l);
            memcpy(z3, &a5, sizeof a5);
            uint8x8_t s4h = {15, 16+0, 16+4, 16+5, 16+6, 16+10, 16+11, 16+12};
            r1 = vtbl4_u8(z, s4h);
            a6 = vcombine_u8 (r0, r1);
            uint8x8_t s5l = {1, 5, 6, 7, 11, 12, 13, 16+1};
            r0 = vtbl4_u8(z, s5l);
            uint8x8_t s5h = {2, 3, 7, 8, 9, 13, 14, 15};
            r1 = vtbl2_u8(*z3, s5h);
            a7 = vcombine_u8 (r0, r1);
            // last 16 output bytes
            a2 = vhaddq_u8(a6, a7);
            memcpy(s, &a0, 16); memcpy(s+16, &a1, 16); memcpy(s+32, &a2, 16);
#endif
            // wrote 16 output pixels (48 bytes); consumed 32 source pixels
            // (96 bytes) from each input row
            s+=48; t+=96; u+=96;
        }
        // t/u already advanced one full row inside the inner loop;
        // add one more stride to skip the odd source row
        t += stride; u += stride;
    }
}
    // (tail of a function whose definition starts above this view)
    }
    return out;
}

#ifdef __ARM_NEON__
#include <arm_neon.h>

// NEON 2:1 decimator for 8-bit single-channel images: each destination byte is
// the halving-add average of the corresponding 2x2 source block.  vld2q_u8
// de-interleaves 32 source bytes into even columns (val[0]) and odd columns
// (val[1]); the two rows are cross-paired and averaged with three truncating
// halving adds.
//
// NOTE(review): each iteration stores 16 output bytes at dest + x while x
// advances only by 8, so consecutive stores overlap by 8 bytes -- the
// overlapping bytes are recomputed from the same source data, so the result
// looks correct but half the work is redundant.  The final iteration also
// reads/writes up to 16/8 bytes beyond destwidth; presumably the strides
// guarantee that padding exists -- confirm against the callers.
void neon_decimate2(uint8_t * __restrict dest, int destwidth, int destheight, int deststride,
                    uint8_t * __restrict src, int srcwidth, int srcheight, int srcstride)
{
    for (int y = 0; y < destheight; y++) {
        for (int x = 0; x < destwidth; x+=8) {
            // de-interleaved even/odd columns of the two source rows
            uint8x16x2_t row0 = vld2q_u8(src + 2*x);
            uint8x16x2_t row1 = vld2q_u8(src + 2*x + srcstride);
            // cross-pair the rows, then average the two partial sums
            uint8x16_t sum0 = vhaddq_u8(row0.val[0], row1.val[1]);
            uint8x16_t sum1 = vhaddq_u8(row1.val[0], row0.val[1]);
            uint8x16_t sum = vhaddq_u8(sum0, sum1);
            vst1q_u8(dest + x, sum);
        }
        src += 2*srcstride;   // consume two source rows per destination row
        dest += deststride;
    }
}

// 3:1 decimator (continues beyond this view)
void neon_decimate3(uint8_t * __restrict dest, int destwidth, int destheight, int deststride,
                    uint8_t * __restrict src, int srcwidth, int srcheight, int srcstride)
{
    for (int y = 0; y < destheight; y++) {
        for (int x = 0; x < destwidth; x+=8) {
            uint8x16x3_t row0 = vld3q_u8(src + 3*x);