コード例 #1
0
/* Invokes the vzip_u32 intrinsic on two uint32x2_t operands and discards
   the result.  The operands are never assigned a value in this snippet.
   NOTE(review): this looks like a compile-only intrinsic test where only
   the call itself matters -- confirm against the surrounding test harness.  */
void test_vzipu32 (void)
{
    uint32x2_t lhs_lanes;
    uint32x2_t rhs_lanes;
    uint32x2x2_t zipped_pair = vzip_u32 (lhs_lanes, rhs_lanes);
}
コード例 #2
0
ファイル: aarch64-neon-perm.c プロジェクト: Bigcheese/clang
// Returns the result of applying the vzip_u32 intrinsic to a and b.
//
// The comments inside the body are pattern directives: a label naming this
// function, followed by two alternations that each accept either an `ins`
// lane move or a `zip1`/`zip2` instruction on .2s vectors.
// NOTE(review): presumably these are matched by LLVM's FileCheck tool against
// the generated AArch64 assembly -- the RUN line is not visible here; confirm.
// Keep the directive lines and the statement order exactly as they are.
uint32x2x2_t test_vzip_u32(uint32x2_t a, uint32x2_t b) {
  // CHECK-LABEL: test_vzip_u32
  return vzip_u32(a, b);
  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}}
  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}}
}
コード例 #3
0
ファイル: shuffle-neon.c プロジェクト: BillTheBest/c-blosc2
/* Routine optimized for shuffling a buffer for a type size of 8 bytes.
 *
 * Every pass of the outer loop reads 64 consecutive bytes of src (eight
 * 8-byte elements) and emits eight 8-byte vectors.  The vector written to
 * dest + b * total_elements + 8 * pass holds byte number b of each of those
 * eight elements, in element order.
 *
 * dest                   output buffer; consecutive byte planes are
 *                        total_elements bytes apart.
 * src                    input buffer, consumed sequentially.
 * vectorizable_elements  number of elements to process.  The loop always
 *                        handles eight elements per pass.
 *                        NOTE(review): callers are presumably expected to
 *                        pass a multiple of 8; otherwise the last pass
 *                        touches memory past that count -- confirm.
 * total_elements         distance in bytes between byte planes in dest.
 *
 * The function produces no return value; it is declared void explicitly
 * because an omitted return type (implicit int) is not valid since C99.
 */
void
shuffle8_neon(uint8_t* const dest, const uint8_t* const src,
              const size_t vectorizable_elements,
              const size_t total_elements)
{
  size_t i, j, k, l;
  static const size_t bytesoftype = 8;
  uint8x8x2_t r0[4];
  uint16x4x2_t r1[4];
  uint32x2x2_t r2[4];

  for (i = 0, k = 0; i < vectorizable_elements * bytesoftype; i += 64, k++) {
    /* Load 64 bytes and zip neighbouring elements byte by byte:
       r0[j] interleaves element 2j with element 2j+1. */
    for (j = 0; j < 4; j++) {
      r0[j] = vzip_u8(vld1_u8(src + i + (2 * j) * 8),
                      vld1_u8(src + i + (2 * j + 1) * 8));
    }
    /* Zip 16-bit lanes so each lane pair gathers one byte position from
       four elements. */
    for (j = 0; j < 2; j++) {
      for (l = 0; l < 2; l++) {
        r1[j * 2 + l] = vzip_u16(vreinterpret_u16_u8(r0[j * 2].val[l]),
                                 vreinterpret_u16_u8(r0[j * 2 + 1].val[l]));
      }
    }
    /* Zip 32-bit lanes: r2[j].val[l] now holds byte position 2j+l of all
       eight elements. */
    for (j = 0; j < 2; j++) {
      for (l = 0; l < 2; l++) {
        r2[j * 2 + l] = vzip_u32(vreinterpret_u32_u16(r1[j].val[l]),
                                 vreinterpret_u32_u16(r1[j + 2].val[l]));
      }
    }
    /* Store each byte plane at its row of the destination buffer. */
    for (j = 0; j < 4; j++) {
      for (l = 0; l < 2; l++) {
        vst1_u8(dest + k * 8 + (j * 2 + l) * total_elements,
                vreinterpret_u8_u32(r2[j].val[l]));
      }
    }
  }
}
コード例 #4
0
ファイル: shuffle-neon.c プロジェクト: BillTheBest/c-blosc2
/* Routine optimized for unshuffling a buffer for a type size of 16 bytes.
 *
 * Every pass of the outer loop reads sixteen 8-byte vectors from src, one
 * per byte plane (plane p is read at src + p * total_elements + 8 * pass),
 * and writes 128 consecutive bytes to dest + 128 * pass: eight 16-byte
 * elements with their bytes restored to element order.
 *
 * dest                   output buffer, written sequentially.
 * src                    input buffer; consecutive byte planes are
 *                        total_elements bytes apart.
 * vectorizable_elements  number of elements to process.  The loop always
 *                        handles eight elements per pass.
 *                        NOTE(review): callers are presumably expected to
 *                        pass a multiple of 8; otherwise the last pass
 *                        touches memory past that count -- confirm.
 * total_elements         distance in bytes between byte planes in src.
 *
 * The function produces no return value; it is declared void explicitly
 * because an omitted return type (implicit int) is not valid since C99.
 */
void
unshuffle16_neon(uint8_t* const dest, const uint8_t* const src,
                 const size_t vectorizable_elements,
                 const size_t total_elements)
{
  size_t i, j, k, l, m;
  static const size_t bytesoftype = 16;
  uint8x8x2_t r0[8];
  uint16x4x2_t r1[8];
  uint32x2x2_t r2[8];

  for (i = 0, k = 0; i < vectorizable_elements * bytesoftype; i += 128, k++) {
    /* Load eight bytes from each of the 16 byte planes and zip adjacent
       planes: r0[j] interleaves plane 2j with plane 2j+1. */
    for (j = 0; j < 8; j++) {
      r0[j] = vzip_u8(vld1_u8(src + (2 * j) * total_elements + k * 8),
                      vld1_u8(src + (2 * j + 1) * total_elements + k * 8));
    }
    /* Zip 16-bit lanes: each 32-bit group now carries four consecutive
       bytes of a single element. */
    for (j = 0; j < 4; j++) {
      for (l = 0; l < 2; l++) {
        r1[2 * j + l] = vzip_u16(vreinterpret_u16_u8(r0[2 * j].val[l]),
                                 vreinterpret_u16_u8(r0[2 * j + 1].val[l]));
      }
    }
    /* Zip 32-bit lanes: r2[x + 4*m].val[l] holds half m (bytes 8m..8m+7)
       of element 2x + l. */
    for (j = 0; j < 2; j++) {
      for (l = 0; l < 2; l++) {
        for (m = 0; m < 2; m++) {
          r2[j * 2 + l + 4 * m] =
              vzip_u32(vreinterpret_u32_u16(r1[j + 4 * m].val[l]),
                       vreinterpret_u32_u16(r1[j + 2 + 4 * m].val[l]));
        }
      }
    }
    /* Store both halves of every element back to back in dest. */
    for (j = 0; j < 4; j++) {
      for (l = 0; l < 2; l++) {
        for (m = 0; m < 2; m++) {
          vst1_u8(dest + i + (4 * j + m + 2 * l) * 8,
                  vreinterpret_u8_u32(r2[j + 4 * m].val[l]));
        }
      }
    }
  }
}