/*
 * Finish an AVX2 Fletcher-4 run and publish the result through zcp.
 *
 * The contents of ymm0..ymm3 are written out to four 4-lane stack arrays,
 * the lanes are folded together with fixed integer coefficients into four
 * 64-bit words, and those words are stored with ZIO_SET_CHECKSUM().
 *
 * NOTE(review): this presumably pairs with an init/compute routine (not
 * visible here) that leaves the running sums in ymm0..ymm3 and has opened
 * the FPU section that kfpu_end() closes -- confirm against the callers.
 *
 * zcp: destination checksum, passed straight to ZIO_SET_CHECKSUM().
 */
static void
fletcher_4_avx2_fini(zio_cksum_t *zcp)
{
	uint64_t __attribute__((aligned(32))) lanes_a[4];
	uint64_t __attribute__((aligned(32))) lanes_b[4];
	uint64_t __attribute__((aligned(32))) lanes_c[4];
	uint64_t __attribute__((aligned(32))) lanes_d[4];
	uint64_t sum_a, sum_b, sum_c, sum_d;

	/*
	 * Spill the vector registers to memory first; everything below
	 * works only on these copies.
	 */
	asm volatile("vmovdqu %%ymm0, %0":"=m" (lanes_a));
	asm volatile("vmovdqu %%ymm1, %0":"=m" (lanes_b));
	asm volatile("vmovdqu %%ymm2, %0":"=m" (lanes_c));
	asm volatile("vmovdqu %%ymm3, %0":"=m" (lanes_d));
	asm volatile("vzeroupper");

	kfpu_end();

	/*
	 * All arithmetic is on uint64_t and therefore wraps modulo 2^64,
	 * so the subtractions below are well defined even when an
	 * intermediate value would be "negative".
	 */

	/* First word: plain sum of the four lanes. */
	sum_a = lanes_a[0] + lanes_a[1] + lanes_a[2] + lanes_a[3];

	/* Second word: 4 * sum(b) minus the lane-weighted part of a. */
	sum_b = 4 * (lanes_b[0] + lanes_b[1] + lanes_b[2] + lanes_b[3]);
	sum_b -= lanes_a[1] + 2 * lanes_a[2] + 3 * lanes_a[3];

	/* Third word: 16 * sum(c), corrected by weighted b and a lanes. */
	sum_c = 16 * (lanes_c[0] + lanes_c[1] + lanes_c[2] + lanes_c[3]);
	sum_c -= 6 * lanes_b[0] + 10 * lanes_b[1] +
	    14 * lanes_b[2] + 18 * lanes_b[3];
	sum_c += lanes_a[2] + 3 * lanes_a[3];

	/* Fourth word: 64 * sum(d), corrected by weighted c, b and a lanes. */
	sum_d = 64 * (lanes_d[0] + lanes_d[1] + lanes_d[2] + lanes_d[3]);
	sum_d -= 48 * lanes_c[0] + 64 * lanes_c[1] +
	    80 * lanes_c[2] + 96 * lanes_c[3];
	sum_d += 4 * lanes_b[0] + 10 * lanes_b[1] +
	    20 * lanes_b[2] + 34 * lanes_b[3];
	sum_d -= lanes_a[3];

	ZIO_SET_CHECKSUM(zcp, sum_a, sum_b, sum_c, sum_d);
}
/*
 * AArch64 NEON Fletcher-4 pass over a buffer, "byteswap" flavour.
 *
 * Walks buf from its start to buf + size, advancing two uint64_t
 * (16 bytes) per iteration and expanding NEON_MAIN_LOOP(NEON_DO_REVERSE)
 * once per step.  The loop is bracketed by NEON_INIT_LOOP() /
 * NEON_FINI_LOOP(), and the whole sequence by kfpu_begin() / kfpu_end().
 *
 * ctx:  checksum context.  Not referenced directly in this function;
 *       presumably read and written by the NEON_* macros -- confirm
 *       against their definitions.
 * buf:  input data, read through a const uint64_t pointer.
 * size: length of buf in bytes.
 *
 * NOTE(review): the loop condition is ip < ipend with a 16-byte stride,
 * so a size that is not a multiple of 16 would let the last iteration
 * run past ipend.  Presumably callers guarantee this -- TODO confirm.
 */
static void
fletcher_4_aarch64_neon_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
{
	const uint64_t *ip = buf;
	/* One-past-the-end pointer, computed with byte arithmetic. */
	const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size);

	/*
	 * 16-byte vector locals.  None of them is named again in this
	 * function; presumably the NEON_* macros refer to them (and to ip
	 * and ctx) by these exact identifiers, so they must not be renamed
	 * without checking the macro definitions.
	 */
#if defined(_KERNEL)
	/* Kernel build: each variable is tied to a fixed register v0..v7. */
	register unsigned char ZERO asm("v0") __attribute__((vector_size(16)));
	register unsigned char ACC0 asm("v1") __attribute__((vector_size(16)));
	register unsigned char ACC1 asm("v2") __attribute__((vector_size(16)));
	register unsigned char ACC2 asm("v3") __attribute__((vector_size(16)));
	register unsigned char ACC3 asm("v4") __attribute__((vector_size(16)));
	register unsigned char TMP1 asm("v5") __attribute__((vector_size(16)));
	register unsigned char TMP2 asm("v6") __attribute__((vector_size(16)));
	register unsigned char SRC asm("v7") __attribute__((vector_size(16)));
#else
	/* Non-kernel build: ordinary locals, register choice left to the compiler. */
	unsigned char ZERO __attribute__((vector_size(16)));
	unsigned char ACC0 __attribute__((vector_size(16)));
	unsigned char ACC1 __attribute__((vector_size(16)));
	unsigned char ACC2 __attribute__((vector_size(16)));
	unsigned char ACC3 __attribute__((vector_size(16)));
	unsigned char TMP1 __attribute__((vector_size(16)));
	unsigned char TMP2 __attribute__((vector_size(16)));
	unsigned char SRC __attribute__((vector_size(16)));
#endif

	kfpu_begin();
	NEON_INIT_LOOP();

	/*
	 * NEON_DO_REVERSE selects the variant used by this byteswap entry
	 * point; presumably it byte-reverses the input words before they
	 * are accumulated -- macro not visible here, confirm.
	 */
	for (; ip < ipend; ip += 2) {
		NEON_MAIN_LOOP(NEON_DO_REVERSE);
	}

	NEON_FINI_LOOP();
	kfpu_end();
}