int main(void) { uint8_t v1_init[8] = {1, 1, 1, 1, 1, 1, 1, 1}; uint8_t v2_init[8] = {2, 2, 2, 2, 2, 2, 2, 2}; uint8x8_t v1 = vld1_u8 (v1_init); uint8x8_t v2 = vld1_u8 (v2_init); uint8x8x2_t vd1, vd2; union {uint8x8_t v; uint8_t buf[8];} d1, d2, d3, d4; int i; uint8_t odd, even; vd1 = vzip_u8(v1, vdup_n_u8(0)); vd2 = vzip_u8(v2, vdup_n_u8(0)); vst1_u8(d1.buf, vd1.val[0]); vst1_u8(d2.buf, vd1.val[1]); vst1_u8(d3.buf, vd2.val[0]); vst1_u8(d4.buf, vd2.val[1]); #ifdef __ARMEL__ odd = 1; even = 0; #else odd = 0; even = 1; #endif for (i = 0; i < 8; i++) if ((i % 2 == even && d4.buf[i] != 2) || (i % 2 == odd && d4.buf[i] != 0)) abort (); return 0; }
/*
 * Calls vzip_u8 once on two uint8x8_t operands.
 *
 * The operands are never assigned and the result is never read, so this
 * is presumably a compile-only test where just the generated code is
 * inspected -- confirm against the harness that builds this file.
 */
void test_vzipu8 (void)
{
  uint8x8_t lhs;
  uint8x8_t rhs;
  uint8x8x2_t zipped = vzip_u8 (lhs, rhs);
}
// Returns vzip_u8(a, b) unchanged.
// The comment lines inside the body look like FileCheck directives matched
// against the compiler output (a zip1 and a zip2 on .8b operands); their text
// must be kept exactly as written, each on its own line, so that they do not
// comment out the surrounding code.
uint8x8x2_t test_vzip_u8(uint8x8_t a, uint8x8_t b) {
  // CHECK-LABEL: test_vzip_u8
  return vzip_u8(a, b);
  // CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  // CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
/* Routine optimized for shuffling a buffer for a type size of 8 bytes. */ shuffle8_neon(uint8_t * const dest, const uint8_t* const src, const size_t vectorizable_elements, const size_t total_elements ) { size_t i, j, k, l; static const size_t bytesoftype = 8; uint8x8x2_t r0[4]; uint16x4x2_t r1[4]; uint32x2x2_t r2[4]; for( i = 0, k = 0; i<vectorizable_elements*bytesoftype; i += 64, k++) { /* Load and interleave groups of 8 bytes (64 bytes) to the structure r0 */ for( j = 0; j < 4; j++) { r0[j] = vzip_u8(vld1_u8(src + i + (2 * j) * 8), vld1_u8(src + i + (2 * j + 1) * 8) ); } /* Interleave 16 bytes */ for( j = 0; j < 2; j++) { for( l = 0; l < 2; l++) { r1[j*2+l] = vzip_u16(vreinterpret_u16_u8(r0[j * 2].val[l]), vreinterpret_u16_u8(r0[j * 2 + 1].val[l]) ); } } /* Interleave 32 bytes */ for( j = 0; j < 2; j++) { for( l = 0; l < 2; l++) { r2[j*2+l] = vzip_u32(vreinterpret_u32_u16(r1[j].val[l]), vreinterpret_u32_u16(r1[j + 2].val[l]) ); } } /* Store the results in the destination vector */ for( j = 0; j < 4; j++) { for( l = 0; l < 2; l++) { vst1_u8(dest + k*8 + (j*2+l)*total_elements, vreinterpret_u8_u32(r2[j] .val[l])); } } } }
/* Routine optimized for unshuffling a buffer for a type size of 16 bytes. */ unshuffle16_neon(uint8_t * const dest, const uint8_t* const src, const size_t vectorizable_elements, const size_t total_elements ) { size_t i, j, k, l, m; static const size_t bytesoftype = 16; uint8x8x2_t r0[8]; uint16x4x2_t r1[8]; uint32x2x2_t r2[8]; for( i = 0, k = 0; i<vectorizable_elements*bytesoftype; i += 128, k++) { /* Load and interleave groups of 16 bytes (128 bytes) to the structure r0*/ for( j = 0; j < 8; j++) { r0[j] = vzip_u8(vld1_u8(src + (2 * j) * total_elements + k * 8), vld1_u8(src + (2 * j + 1) * total_elements + k * 8) ); } /* Interleave 16 bytes */ for( j = 0; j < 4; j++) { for( l = 0; l < 2; l++) { r1[2*j+l] = vzip_u16(vreinterpret_u16_u8(r0[2 * j].val[l]), vreinterpret_u16_u8(r0[2 * j + 1].val[l]) ); } } /* Interleave 32 bytes */ for( j = 0; j < 2; j++) { for( l = 0; l < 2; l++) { for( m = 0; m < 2; m++) { r2[j*2+l+4*m] = vzip_u32(vreinterpret_u32_u16(r1[j + 4 * m].val[l]), vreinterpret_u32_u16(r1[j + 2 + 4 * m].val[l]) ); } } } /* Store the results in the destination vector */ for( j = 0; j < 4; j++) { for( l = 0; l < 2; l++) { for( m = 0; m < 2; m++) { vst1_u8(dest + i + (4*j+m+2*l)*8, vreinterpret_u8_u32(r2[j + 4 * m] .val[l])); } } } } }