/*
 * Compile-level exercise of the vuzpq_u32 intrinsic.
 *
 * The two operands are left uninitialized and the result is never read;
 * presumably only successful compilation / instruction selection of the
 * intrinsic matters here -- confirm against the test harness directives.
 */
void test_vuzpQu32 (void)
{
  uint32x4_t lhs;
  uint32x4_t rhs;
  uint32x4x2_t unzipped = vuzpq_u32 (lhs, rhs);
}
// about twice as fast as generic
//
// Halve a 4-bytes-per-pixel image in both dimensions by averaging each
// 2x2 block of source pixels, channel by channel.
//
//   width, height : dimensions of the source image in pixels
//   source        : source pixels, rows tightly packed (width * 4 bytes each)
//   target        : receives (width / 2) * (height / 2) tightly packed pixels
//
// Widths below 8 are delegated to MipMap_32_generic.  For wider images the
// bulk of every row is processed four output pixels at a time with NEON and
// any remaining output pixels (when width / 2 is not a multiple of 4) are
// finished by a scalar loop, so no bytes outside the source rows are read and
// no bytes outside the target row are written.
//
// Averaging is done in two truncating halving steps (vertical, then
// horizontal); the scalar tail uses the same arithmetic so that every pixel
// of the output is rounded identically.
void MipMap_32_neon( int width, int height, unsigned char *source, unsigned char *target )
{
    if(width < 8) {
        MipMap_32_generic(width, height, source, target);
        return;
    }

    int newwidth = width / 2;
    int newheight = height / 2;
    size_t stride = (size_t)width * 4;

    // number of output pixels per row that can be produced 4 at a time
    int vecwidth = newwidth & ~3;

    unsigned char *dst = target;

    int y, x;
    for( y = 0; y < newheight; y++ ) {
        // derive the row pointers from y so odd widths cannot make them drift
        const unsigned char *row0 = source + (size_t)y * 2 * stride;
        const unsigned char *row1 = row0 + stride;

        for( x = 0; x < vecwidth; x+=4 ) {
            // 8 source pixels from each of the two scan lines
            uint8x16_t a0 = vld1q_u8(row0);
            uint8x16_t a1 = vld1q_u8(row0 + 16);
            uint8x16_t a2 = vld1q_u8(row1);
            uint8x16_t a3 = vld1q_u8(row1 + 16);

            // average first and second scan lines
            a0 = vhaddq_u8(a0, a2);
            a1 = vhaddq_u8(a1, a3);

            // repack: val[0] gets the even x pixels, val[1] the odd x pixels
            uint32x4x2_t z = vuzpq_u32(vreinterpretq_u32_u8(a0),
                                       vreinterpretq_u32_u8(a1));

            // average even and odd x pixels
            a0 = vhaddq_u8(vreinterpretq_u8_u32(z.val[0]),
                           vreinterpretq_u8_u32(z.val[1]));

            vst1q_u8(dst, a0);

            dst += 16;
            row0 += 32;
            row1 += 32;
        }

        // scalar tail for the output pixels the vector loop could not cover
        for( ; x < newwidth; x++ ) {
            int c;
            for( c = 0; c < 4; c++ ) {
                unsigned int even = (unsigned int)(row0[c] + row1[c]) >> 1;
                unsigned int odd = (unsigned int)(row0[4 + c] + row1[4 + c]) >> 1;
                dst[c] = (unsigned char)((even + odd) >> 1);
            }

            dst += 4;
            row0 += 8;
            row1 += 8;
        }
    }
}
// Codegen test for the vuzpq_u32 intrinsic: the directives below expect the
// call to lower to a uzp1 / uzp2 pair operating on .4s vectors.
uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b)
{
  // CHECK-LABEL: test_vuzpq_u32
  return vuzpq_u32(a, b);
  // CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
  // CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
}