Пример #1
0
/*
 * Finish an AVX2 Fletcher-4 run: spill ymm0..ymm3 to memory, leave the
 * FPU section via kfpu_end(), and combine the sixteen 64-bit lanes into
 * the four checksum words stored through ZIO_SET_CHECKSUM().
 *
 * zcp - checksum object receiving the four resulting words.
 *
 * NOTE(review): presumably ymm0..ymm3 hold the per-lane running sums left
 * behind by the matching AVX2 compute routine, and the fixed coefficients
 * below undo the lane interleaving; neither is visible from this function.
 */
static void
fletcher_4_avx2_fini(zio_cksum_t *zcp)
{
    uint64_t __attribute__((aligned(32))) acc0[4];
    uint64_t __attribute__((aligned(32))) acc1[4];
    uint64_t __attribute__((aligned(32))) acc2[4];
    uint64_t __attribute__((aligned(32))) acc3[4];
    uint64_t first, second, third, fourth;

    /* Copy the vector registers out before the FPU section is closed. */
    asm volatile("vmovdqu %%ymm0, %0":"=m" (acc0));
    asm volatile("vmovdqu %%ymm1, %0":"=m" (acc1));
    asm volatile("vmovdqu %%ymm2, %0":"=m" (acc2));
    asm volatile("vmovdqu %%ymm3, %0":"=m" (acc3));
    asm volatile("vzeroupper");

    kfpu_end();

    /*
     * Everything below is uint64_t arithmetic, so each subtraction simply
     * wraps modulo 2^64; the grouping of terms does not affect the result.
     */
    first = acc0[0] + acc0[1] + acc0[2] + acc0[3];

    second = 4 * (acc1[0] + acc1[1] + acc1[2] + acc1[3]);
    second -= acc0[1] + 2 * acc0[2] + 3 * acc0[3];

    third = 16 * (acc2[0] + acc2[1] + acc2[2] + acc2[3]);
    third -= 6 * acc1[0] + 10 * acc1[1] + 14 * acc1[2] + 18 * acc1[3];
    third += acc0[2] + 3 * acc0[3];

    fourth = 64 * (acc3[0] + acc3[1] + acc3[2] + acc3[3]);
    fourth -= 48 * acc2[0] + 64 * acc2[1] + 80 * acc2[2] + 96 * acc2[3];
    fourth += 4 * acc1[0] + 10 * acc1[1] + 20 * acc1[2] + 34 * acc1[3];
    fourth -= acc0[3];

    ZIO_SET_CHECKSUM(zcp, first, second, third, fourth);
}
Пример #2
0
/*
 * Run the NEON Fletcher-4 loop over a buffer using the NEON_DO_REVERSE
 * variant of the main-loop macro.
 *
 * ctx  - checksum context; not referenced directly in this body.
 *        NOTE(review): presumably read/written by NEON_INIT_LOOP() and
 *        NEON_FINI_LOOP(), whose definitions are outside this view.
 * buf  - start of the input data.
 * size - length of the input in bytes. The loop advances two uint64_t
 *        (16 bytes) per iteration and stops once ip >= ipend, so size is
 *        assumed to be a multiple of 16 - TODO confirm against callers.
 */
static void
fletcher_4_aarch64_neon_byteswap(fletcher_4_ctx_t *ctx,
    const void *buf, uint64_t size)
{
	const uint64_t *ip = buf;
	/* One-past-the-end pointer, computed from the byte length. */
	const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size);
/*
 * 16-byte vector locals. Their names are not used directly in this body;
 * NOTE(review): presumably the NEON_* macros reference them (and ip) by
 * name, so they must not be renamed without checking those macros.
 */
#if defined(_KERNEL)
/* Kernel build: each local is bound to a fixed vector register v0..v7. */
register unsigned char ZERO asm("v0") __attribute__((vector_size(16)));
register unsigned char ACC0 asm("v1") __attribute__((vector_size(16)));
register unsigned char ACC1 asm("v2") __attribute__((vector_size(16)));
register unsigned char ACC2 asm("v3") __attribute__((vector_size(16)));
register unsigned char ACC3 asm("v4") __attribute__((vector_size(16)));
register unsigned char TMP1 asm("v5") __attribute__((vector_size(16)));
register unsigned char TMP2 asm("v6") __attribute__((vector_size(16)));
register unsigned char SRC asm("v7") __attribute__((vector_size(16)));
#else
/* Non-kernel build: same locals, register choice left to the compiler. */
unsigned char ZERO __attribute__((vector_size(16)));
unsigned char ACC0 __attribute__((vector_size(16)));
unsigned char ACC1 __attribute__((vector_size(16)));
unsigned char ACC2 __attribute__((vector_size(16)));
unsigned char ACC3 __attribute__((vector_size(16)));
unsigned char TMP1 __attribute__((vector_size(16)));
unsigned char TMP2 __attribute__((vector_size(16)));
unsigned char SRC __attribute__((vector_size(16)));
#endif

	/* All vector work happens between kfpu_begin() and kfpu_end(). */
	kfpu_begin();

	NEON_INIT_LOOP();

	/* Process the input in steps of two 64-bit words. */
	for (; ip < ipend; ip += 2) {
		NEON_MAIN_LOOP(NEON_DO_REVERSE);
	}

	NEON_FINI_LOOP();

	kfpu_end();
}