/* Exercises the vtbl4_u8 intrinsic: one 4-register byte table and one index
   vector go in, one 8-byte vector comes out.  The operands are intentionally
   left unset and the result is never read -- presumably this function only
   exists so the compiler has to emit the intrinsic (TODO confirm against the
   test harness directives, which are not visible here).  */
void test_vtbl4u8 (void)
{
  uint8x8x4_t table_regs;
  uint8x8_t lane_indices;
  uint8x8_t looked_up = vtbl4_u8 (table_regs, lane_indices);
}
// Repacks 'num_pixels' 32-bit pixels from 'src' into 3-byte pixels at 'dst'.
// Eight pixels (32 input bytes) are handled per vector iteration, producing
// 24 output bytes through three table lookups. The lookup indices come from
// kBGRShuffle, which is defined elsewhere (presumably they select the B, G, R
// bytes and drop alpha -- confirm against the table). The final
// 'num_pixels & 7' pixels are delegated to VP8LConvertBGRAToBGR_C.
static void ConvertBGRAToBGR(const uint32_t* src, int num_pixels,
                             uint8_t* dst) {
  const int tail_count = num_pixels & 7;
  const uint32_t* const simd_end = src + (num_pixels & ~7);
  const uint8x8_t idx_out0 = vld1_u8(kBGRShuffle[0]);
  const uint8x8_t idx_out1 = vld1_u8(kBGRShuffle[1]);
  const uint8x8_t idx_out2 = vld1_u8(kBGRShuffle[2]);

  while (src < simd_end) {
    // Byte view of the current group of 8 pixels: four 8-byte loads cover it.
    const uint8_t* const in = (const uint8_t*)src;
    uint8x8x4_t group;
    INIT_VECTOR4(group,
                 vld1_u8(in + 0),
                 vld1_u8(in + 8),
                 vld1_u8(in + 16),
                 vld1_u8(in + 24));
    vst1_u8(dst + 0, vtbl4_u8(group, idx_out0));
    vst1_u8(dst + 8, vtbl4_u8(group, idx_out1));
    vst1_u8(dst + 16, vtbl4_u8(group, idx_out2));
    src += 8;
    dst += 24;  // 8 pixels * 3 bytes
  }

  // Left-overs that do not fill a whole group of 8.
  VP8LConvertBGRAToBGR_C(src, tail_count, dst);
}
// about twice as fast as generic void MipMap_24_neon( int width, int height, unsigned char *source, unsigned char *target ) { if(width < 32) { MipMap_24_generic(width, height, source, target); return; } int newwidth = width / 2; int newheight = height / 2; int stride = width * 3; unsigned char *s = target; unsigned char *t = source; unsigned char *u = t+stride; int y, x; for( y = 0; y < newheight; y++ ) { for( x = 0; x < newwidth; x+=16 ) { uint8x16_t a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, aa, ab; memcpy(&a0, t, 16); memcpy(&a1, t+16, 16); memcpy(&a2, t+32, 16); memcpy(&a3, t+48, 16); memcpy(&a4, t+64, 16); memcpy(&a5, t+80, 16); memcpy(&a6, u, 16); memcpy(&a7, u+16, 16); memcpy(&a8, u+32, 16); memcpy(&a9, u+48, 16); memcpy(&aa, u+64, 16); memcpy(&ab, u+80, 16); // average first and second scan lines a0 = vhaddq_u8(a0, a6); a1 = vhaddq_u8(a1, a7); a2 = vhaddq_u8(a2, a8); a3 = vhaddq_u8(a3, a9); a4 = vhaddq_u8(a4, aa); a5 = vhaddq_u8(a5, ab); // at this point, we have averaged the two scanlines, // in 24bit it's a bit of a pain to average the pixels // because they are aligned to 3 bytes #if 0 // simple readable version finishing without neon (slower) uint8_t b[96], c[96]; memcpy(b, &a0, 16); memcpy(b+16, &a1, 16); memcpy(b+32, &a2, 16); memcpy(b+48, &a3, 16); memcpy(b+64, &a4, 16); memcpy(b+80, &a5, 16); int i,j; for( i=0; i<16; i++) for( j=0; j<3; j++) s[3*i+j] = (b[3*2*i+j] + b[3*(2*i+1)+j]) / 2; #else // full neon version with swizzel (ugly but fast) uint8x8_t r0, r1; uint8x8x4_t z; uint8x8x2_t *z2 = (uint8x8x2_t*)&z, *z3 = (uint8x8x2_t*)&z+1; // a00 a01 a02 a06 a07 a08 a0c a0d a0e a12 a13 a14 a18 a19 a1a a1e // a03 a04 a05 a09 a0a a0b a0f a10 a11 a15 a16 a17 a1b a1c a1d a21 #define int8x16_to_8x8x2(v) ((int8x8x2_t) { vget_low_s8(v), vget_high_s8(v) }) uint8x8_t s0l = {0, 1, 2, 6, 7, 8, 12, 13}; memcpy(&z, &a0, sizeof a0); r0 = vtbl2_u8(*z2, s0l); memcpy(z3, &a1, sizeof a1); uint8x8_t s0h = {14, 16+2, 16+3, 16+4, 16+8, 16+9, 16+10, 16+14}; r1 = vtbl4_u8(z, 
s0h); a6 = vcombine_u8 (r0, r1); uint8x8_t s1l = {3, 4, 5, 9, 10, 11, 15, 16+0}; r0 = vtbl4_u8(z, s1l); memcpy(&z, &a2, sizeof a2); uint8x8_t s1h = {16+1, 16+5, 16+6, 16+7, 16+11, 16+12, 16+13, 1}; r1 = vtbl4_u8(z, s1h); a7 = vcombine_u8 (r0, r1); a0 = vhaddq_u8(a6, a7); // a1f a20 a24 a25 a26 a2a a2b a2c a30 a31 a32 a36 a37 a38 a3c a3d // a22 a23 a27 a28 a29 a2d a2e a2f a33 a34 a35 a39 a3a a3b a3f a40 uint8x8_t s2l = {16+15, 0, 4, 5, 6, 10, 11, 12}; r0 = vtbl4_u8(z, s2l); uint8x8_t s2h = {0, 1, 2, 6, 7, 8, 12, 13}; memcpy(z3, &a3, sizeof a3); r1 = vtbl2_u8(*z3, s2h); a6 = vcombine_u8 (r0, r1); uint8x8_t s3l = {2, 3, 7, 8, 9, 13, 14, 15}; r0 = vtbl2_u8(*z2, s3l); memcpy(&z, &a4, sizeof a4); uint8x8_t s3h = {16+3, 16+4, 16+5, 16+9, 16+10, 16+11, 16+15, 0}; r1 = vtbl4_u8(z, s3h); a7 = vcombine_u8 (r0, r1); a1 = vhaddq_u8(a6, a7); // a3e a42 a43 a44 a48 a49 a4a a4e a4f a50 a54 a55 a56 a5a a5b a5c // a41 a45 a46 a47 a4b a4c a4d a51 a52 a53 a57 a58 a59 a5d a5e a5f uint8x8_t s4l = {16+14, 2, 3, 4, 8, 9, 10, 14}; r0 = vtbl4_u8(z, s4l); memcpy(z3, &a5, sizeof a5); uint8x8_t s4h = {15, 16+0, 16+4, 16+5, 16+6, 16+10, 16+11, 16+12}; r1 = vtbl4_u8(z, s4h); a6 = vcombine_u8 (r0, r1); uint8x8_t s5l = {1, 5, 6, 7, 11, 12, 13, 16+1}; r0 = vtbl4_u8(z, s5l); uint8x8_t s5h = {2, 3, 7, 8, 9, 13, 14, 15}; r1 = vtbl2_u8(*z3, s5h); a7 = vcombine_u8 (r0, r1); a2 = vhaddq_u8(a6, a7); memcpy(s, &a0, 16); memcpy(s+16, &a1, 16); memcpy(s+32, &a2, 16); #endif s+=48; t+=96; u+=96; } t += stride; u += stride; } }
/* Returns the vtbl4_u8 table lookup of IDX in the 4-register byte table
   TAB; the arguments are forwarded to the intrinsic unchanged.  */
uint8x8_t
tbl_testu8_4 (uint8x8x4_t tab, uint8x8_t idx)
{
  const uint8x8_t looked_up = vtbl4_u8 (tab, idx);
  return looked_up;
}
// Forwards both arguments to the vtbl4_u8 intrinsic and returns its result.
// The two directive comments inside the body are matched against the
// generated assembly by the test tooling, so their text and order must stay
// exactly as written.
uint8x8_t test_vtbl4_u8(uint8x8x4_t a, uint8x8_t b) {
  // CHECK: test_vtbl4_u8
  const uint8x8_t looked_up = vtbl4_u8(a, b);
  return looked_up;
  // CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
}