/*
 * Exercises the vzip_u32 intrinsic on two uint32x2_t operands and assigns
 * the uint32x2x2_t result to a local.
 *
 * The operands are never initialized and the result is never read, so this
 * is presumably a compile-only test of the intrinsic -- TODO confirm against
 * the test harness before "fixing" the uninitialized reads.
 */
void test_vzipu32 (void)
{
  uint32x2_t lhs_uint32x2_t;
  uint32x2_t rhs_uint32x2_t;
  uint32x2x2_t zipped_uint32x2x2_t;

  zipped_uint32x2x2_t = vzip_u32 (lhs_uint32x2_t, rhs_uint32x2_t);
}
/*
 * Returns vzip_u32(a, b) unchanged so the generated code for the intrinsic
 * can be inspected.
 *
 * The line comments inside the body are pattern directives for an external
 * checking tool (they look like LLVM FileCheck syntax -- confirm against the
 * file's RUN line). Each of the two instruction patterns accepts either an
 * `ins` lane-insert form or a zip1/zip2 form. Every directive must stay on
 * its own line: a line comment runs to end of line, so joining these lines
 * would comment out the return statement and the closing brace.
 */
uint32x2x2_t test_vzip_u32(uint32x2_t a, uint32x2_t b) {
  // CHECK-LABEL: test_vzip_u32
  return vzip_u32(a, b);
  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}}
  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}}
}
/* Routine optimized for shuffling a buffer for a type size of 8 bytes. */ shuffle8_neon(uint8_t * const dest, const uint8_t* const src, const size_t vectorizable_elements, const size_t total_elements ) { size_t i, j, k, l; static const size_t bytesoftype = 8; uint8x8x2_t r0[4]; uint16x4x2_t r1[4]; uint32x2x2_t r2[4]; for( i = 0, k = 0; i<vectorizable_elements*bytesoftype; i += 64, k++) { /* Load and interleave groups of 8 bytes (64 bytes) to the structure r0 */ for( j = 0; j < 4; j++) { r0[j] = vzip_u8(vld1_u8(src + i + (2 * j) * 8), vld1_u8(src + i + (2 * j + 1) * 8) ); } /* Interleave 16 bytes */ for( j = 0; j < 2; j++) { for( l = 0; l < 2; l++) { r1[j*2+l] = vzip_u16(vreinterpret_u16_u8(r0[j * 2].val[l]), vreinterpret_u16_u8(r0[j * 2 + 1].val[l]) ); } } /* Interleave 32 bytes */ for( j = 0; j < 2; j++) { for( l = 0; l < 2; l++) { r2[j*2+l] = vzip_u32(vreinterpret_u32_u16(r1[j].val[l]), vreinterpret_u32_u16(r1[j + 2].val[l]) ); } } /* Store the results in the destination vector */ for( j = 0; j < 4; j++) { for( l = 0; l < 2; l++) { vst1_u8(dest + k*8 + (j*2+l)*total_elements, vreinterpret_u8_u32(r2[j] .val[l])); } } } }
/* Routine optimized for unshuffling a buffer for a type size of 16 bytes. */ unshuffle16_neon(uint8_t * const dest, const uint8_t* const src, const size_t vectorizable_elements, const size_t total_elements ) { size_t i, j, k, l, m; static const size_t bytesoftype = 16; uint8x8x2_t r0[8]; uint16x4x2_t r1[8]; uint32x2x2_t r2[8]; for( i = 0, k = 0; i<vectorizable_elements*bytesoftype; i += 128, k++) { /* Load and interleave groups of 16 bytes (128 bytes) to the structure r0*/ for( j = 0; j < 8; j++) { r0[j] = vzip_u8(vld1_u8(src + (2 * j) * total_elements + k * 8), vld1_u8(src + (2 * j + 1) * total_elements + k * 8) ); } /* Interleave 16 bytes */ for( j = 0; j < 4; j++) { for( l = 0; l < 2; l++) { r1[2*j+l] = vzip_u16(vreinterpret_u16_u8(r0[2 * j].val[l]), vreinterpret_u16_u8(r0[2 * j + 1].val[l]) ); } } /* Interleave 32 bytes */ for( j = 0; j < 2; j++) { for( l = 0; l < 2; l++) { for( m = 0; m < 2; m++) { r2[j*2+l+4*m] = vzip_u32(vreinterpret_u32_u16(r1[j + 4 * m].val[l]), vreinterpret_u32_u16(r1[j + 2 + 4 * m].val[l]) ); } } } /* Store the results in the destination vector */ for( j = 0; j < 4; j++) { for( l = 0; l < 2; l++) { for( m = 0; m < 2; m++) { vst1_u8(dest + i + (4*j+m+2*l)*8, vreinterpret_u8_u32(r2[j + 4 * m] .val[l])); } } } } }