void test_crc32cd (void)
{
  uint32_t out_uint32_t;
  uint32_t arg0_uint32_t;
  uint64_t arg1_uint64_t;

  out_uint32_t = __crc32cd (arg0_uint32_t, arg1_uint64_t);
}
// ARM-LABEL: test_crc32cd
// AArch32: call i32 @llvm.arm.crc32cw
// AArch32: call i32 @llvm.arm.crc32cw
// AArch64: call i32 @llvm.aarch64.crc32cx
uint32_t test_crc32cd(uint32_t a, uint64_t b) {
  return __crc32cd(a, b);
}
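For reference, the following is a minimal sketch (not taken from the test sources above) of how the ACLE CRC32C intrinsics exercised by these tests compose into a whole-buffer CRC-32C (Castagnoli) routine. The helper name crc32c_simple is hypothetical; the sketch assumes an AArch64 toolchain with the CRC extension enabled (e.g. -march=armv8-a+crc) and uses the common convention of pre- and post-inverting the running value.

/* Minimal sketch: whole-buffer CRC-32C from the ACLE intrinsics.
 * Assumes __ARM_FEATURE_CRC32 is available (e.g. -march=armv8-a+crc).
 * crc32c_simple is a hypothetical helper name for illustration. */
#include <arm_acle.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static uint32_t crc32c_simple(const unsigned char *p, size_t len)
{
    uint32_t crc = ~0u;                  /* initial value 0xFFFFFFFF */

    while (len >= sizeof(uint64_t)) {    /* 8 bytes per __crc32cd */
        uint64_t v;
        memcpy(&v, p, sizeof v);         /* avoid unaligned dereference */
        crc = __crc32cd(crc, v);
        p += sizeof v;
        len -= sizeof v;
    }
    while (len--)                        /* byte-at-a-time tail */
        crc = __crc32cb(crc, *p++);

    return ~crc;                         /* final XOR with 0xFFFFFFFF */
}

Callers that keep a running CRC across calls (as the functions below do) would drop the two inversions and take the running value as a parameter instead.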
uint32_t ceph_crc32c_aarch64(uint32_t crc, unsigned char const *buffer, unsigned len)
{
    int64_t length = len;
    uint32_t crc0, crc1, crc2;

    if (buffer) {
#ifdef HAVE_ARMV8_CRYPTO
        if (ceph_arch_aarch64_pmull) {
#ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
            /* Calculate reflected crc with PMULL Instruction */
            const poly64_t k1 = 0xe417f38a, k2 = 0x8f158014;
            uint64_t t0, t1;

            /* crc done "by 3" for fixed input block size of 1024 bytes */
            while ((length -= 1024) >= 0) {
                /* Prefetch data for following block to avoid cache miss */
                PREF1KL2(1024*3);
                /* Do first 8 bytes here for better pipelining */
                crc0 = __crc32cd(crc, *(const uint64_t *)buffer);
                crc1 = 0;
                crc2 = 0;
                buffer += sizeof(uint64_t);

                /* Process block inline
                   Process crc0 last to avoid dependency with above */
                CRC32C7X3X8(0);
                CRC32C7X3X8(1);
                CRC32C7X3X8(2);
                CRC32C7X3X8(3);
                CRC32C7X3X8(4);
                CRC32C7X3X8(5);

                buffer += 42*3*sizeof(uint64_t);
                /* Prefetch data for following block to avoid cache miss */
                PREF1KL1(1024);

                /* Merge crc0 and crc1 into crc2
                   crc1 multiply by K2
                   crc0 multiply by K1 */
                t1 = (uint64_t)vmull_p64(crc1, k2);
                t0 = (uint64_t)vmull_p64(crc0, k1);
                crc = __crc32cd(crc2, *(const uint64_t *)buffer);
                crc1 = __crc32cd(0, t1);
                crc ^= crc1;
                crc0 = __crc32cd(0, t0);
                crc ^= crc0;

                buffer += sizeof(uint64_t);
            }
#else /* !HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
            __asm__("mov    x16,      #0xf38a         \n\t"
                    "movk   x16,      #0xe417, lsl 16 \n\t"
                    "mov    v1.2d[0], x16             \n\t"
                    "mov    x16,      #0x8014         \n\t"
                    "movk   x16,      #0x8f15, lsl 16 \n\t"
                    "mov    v0.2d[0], x16             \n\t"
                    :::"x16");

            while ((length -= 1024) >= 0) {
                PREF1KL2(1024*3);
                __asm__("crc32cx %w[c0], %w[c], %x[v]\n\t"
                        :[c0]"=r"(crc0):[c]"r"(crc), [v]"r"(*(const uint64_t *)buffer):);
                crc1 = 0;
                crc2 = 0;
                buffer += sizeof(uint64_t);

                CRC32C7X3X8(0);
                CRC32C7X3X8(1);
                CRC32C7X3X8(2);
                CRC32C7X3X8(3);
                CRC32C7X3X8(4);
                CRC32C7X3X8(5);

                buffer += 42*3*sizeof(uint64_t);
                PREF1KL1(1024);
                __asm__("mov     v2.2d[0], %x[c1]         \n\t"
                        "pmull   v2.1q,    v2.1d,  v0.1d  \n\t"
                        "mov     v3.2d[0], %x[c0]         \n\t"
                        "pmull   v3.1q,    v3.1d,  v1.1d  \n\t"
                        "crc32cx %w[c],    %w[c2], %x[v]  \n\t"
                        "mov     %x[c1],   v2.2d[0]       \n\t"
                        "crc32cx %w[c1],   wzr,    %x[c1] \n\t"
                        "eor     %w[c],    %w[c],  %w[c1] \n\t"
                        "mov     %x[c0],   v3.2d[0]       \n\t"
                        "crc32cx %w[c0],   wzr,    %x[c0] \n\t"
                        "eor     %w[c],    %w[c],  %w[c0] \n\t"
                        :[c1]"+r"(crc1), [c0]"+r"(crc0), [c2]"+r"(crc2), [c]"+r"(crc)
                        :[v]"r"(*((const uint64_t *)buffer)));
                buffer += sizeof(uint64_t);
            }
#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */

            if (!(length += 1024))
                return crc;
        }
#endif /* HAVE_ARMV8_CRYPTO */
        while ((length -= sizeof(uint64_t)) >= 0) {
            CRC32CX(crc, *(uint64_t *)buffer);
            buffer += sizeof(uint64_t);
        }

        /* The following is more efficient than the straight loop */
        if (length & sizeof(uint32_t)) {
            CRC32CW(crc, *(uint32_t *)buffer);
            buffer += sizeof(uint32_t);
        }
        if (length & sizeof(uint16_t)) {
            CRC32CH(crc, *(uint16_t *)buffer);
            buffer += sizeof(uint16_t);
        }
        if (length & sizeof(uint8_t))
            CRC32CB(crc, *buffer);
    } else {
#ifdef HAVE_ARMV8_CRYPTO
        if (ceph_arch_aarch64_pmull) {
uint32_t test_crc32cd (uint32_t arg0, uint64_t arg1)
{
  return __crc32cd (arg0, arg1);
}
/*
 * Function to calculate reflected crc with PMULL Instruction
 * crc done "by 3" for fixed input block size of 1024 bytes
 */
uint32_t crc32c_arm64(unsigned char const *data, unsigned long length)
{
    signed long len = length;
    uint32_t crc = ~0;
    uint32_t crc0, crc1, crc2;

    /* Load two consts: K1 and K2 */
    const poly64_t k1 = 0xe417f38a, k2 = 0x8f158014;
    uint64_t t0, t1;

    while ((len -= 1024) >= 0) {
        /* Do first 8 bytes here for better pipelining */
        crc0 = __crc32cd(crc, *(const uint64_t *)data);
        crc1 = 0;
        crc2 = 0;
        data += sizeof(uint64_t);

        /* Process block inline
           Process crc0 last to avoid dependency with above */
        CRC32C7X3X8(0);
        CRC32C7X3X8(1);
        CRC32C7X3X8(2);
        CRC32C7X3X8(3);
        CRC32C7X3X8(4);
        CRC32C7X3X8(5);

        data += 42*3*sizeof(uint64_t);

        /* Merge crc0 and crc1 into crc2
           crc1 multiply by K2
           crc0 multiply by K1 */
        t1 = (uint64_t)vmull_p64(crc1, k2);
        t0 = (uint64_t)vmull_p64(crc0, k1);
        crc = __crc32cd(crc2, *(const uint64_t *)data);
        crc1 = __crc32cd(0, t1);
        crc ^= crc1;
        crc0 = __crc32cd(0, t0);
        crc ^= crc0;

        data += sizeof(uint64_t);
    }

    if (!(len += 1024))
        return crc;

    while ((len -= sizeof(uint64_t)) >= 0) {
        crc = __crc32cd(crc, *(const uint64_t *)data);
        data += sizeof(uint64_t);
    }

    /* The following is more efficient than the straight loop */
    if (len & sizeof(uint32_t)) {
        crc = __crc32cw(crc, *(const uint32_t *)data);
        data += sizeof(uint32_t);
    }
    if (len & sizeof(uint16_t)) {
        crc = __crc32ch(crc, *(const uint16_t *)data);
        data += sizeof(uint16_t);
    }
    if (len & sizeof(uint8_t)) {
        crc = __crc32cb(crc, *(const uint8_t *)data);
    }

    return crc;
}
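A possible way to exercise the routine above, shown only as a sketch: it assumes crc32c_arm64() and its CRC32C7X3X8 helper macro (not shown here) are compiled in the same translation unit with the CRC and crypto extensions enabled (e.g. gcc -O2 -march=armv8-a+crc+crypto), and the buffer contents are an arbitrary test pattern.

/* Hypothetical driver for crc32c_arm64() above.  CRC32C7X3X8 and the
 * function definition itself must come from the surrounding file. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

uint32_t crc32c_arm64(unsigned char const *data, unsigned long length);

int main(void)
{
    static unsigned char buf[4096];

    memset(buf, 0xA5, sizeof buf);                 /* arbitrary test pattern */
    uint32_t crc = crc32c_arm64(buf, sizeof buf);
    printf("crc32c = 0x%08x\n", (unsigned)crc);    /* note: crc32c_arm64 does no final inversion */
    return 0;
}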