Example #1
0
/*
 * Compute CRC-32C of `buffer` (`len` bytes), seeded with `crc`, using the
 * ARMv8 CRC32 instructions.
 *
 * When the PMULL (carry-less multiply) extension is available
 * (ceph_arch_aarch64_pmull), each 1024-byte block is processed as three
 * interleaved CRC streams (crc0/crc1/crc2) that are merged at the end of
 * the block with a polynomial multiply -- the "by 3" scheme noted below.
 * The sub-1024-byte remainder (and the no-PMULL path) is handled with
 * plain 8/4/2/1-byte CRC32C steps.
 *
 * NOTE(review): this fragment is truncated -- the `buffer == NULL` branch
 * at the bottom is cut off mid-body, so its behavior is not documented here.
 */
uint32_t ceph_crc32c_aarch64(uint32_t crc, unsigned char const *buffer, unsigned len)
{
	int64_t length = len;	/* signed: the countdown loops below rely on going negative */
	uint32_t crc0, crc1, crc2;

	if (buffer) {
#ifdef HAVE_ARMV8_CRYPTO
	        if (ceph_arch_aarch64_pmull) {
#ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
		/* Calculate reflected crc with PMULL Instruction */
		/* Stream-merge folding constants.  Presumably x^N mod P for the
		   reflected CRC-32C polynomial -- TODO(review): confirm values. */
		const poly64_t k1 = 0xe417f38a, k2 = 0x8f158014;
		uint64_t t0, t1;

		/* crc done "by 3" for fixed input block size of 1024 bytes */
		while ((length -= 1024) >= 0) {
			/* Prefetch data for following block to avoid cache miss */
			PREF1KL2(1024*3);
			/* Do first 8 bytes here for better pipelining */
			crc0 = __crc32cd(crc, *(const uint64_t *)buffer);
			crc1 = 0;
			crc2 = 0;
			buffer += sizeof(uint64_t);

			/* Process block inline
			Process crc0 last to avoid dependency with above
			NOTE(review): CRC32C7X3X8 is defined elsewhere; it appears
			to consume 7 u64 words on each of the 3 streams per call
			(6 calls * 7*3*8 = 1008 bytes) -- confirm against macro. */
			CRC32C7X3X8(0);
			CRC32C7X3X8(1);
			CRC32C7X3X8(2);
			CRC32C7X3X8(3);
			CRC32C7X3X8(4);
			CRC32C7X3X8(5);

			/* 8 (above) + 42*3*8 (macros) + 8 (merge below) = 1024 bytes */
			buffer += 42*3*sizeof(uint64_t);
			/* Prefetch data for following block to avoid cache miss */
			PREF1KL1(1024);

			/* Merge crc0 and crc1 into crc2
			   crc1 multiply by K2
			   crc0 multiply by K1 */

			t1 = (uint64_t)vmull_p64(crc1, k2);
			t0 = (uint64_t)vmull_p64(crc0, k1);
			/* __crc32cd(0, t) reduces each 64-bit product back to a
			   32-bit CRC before folding it into the running crc. */
			crc = __crc32cd(crc2, *(const uint64_t *)buffer);
			crc1 = __crc32cd(0, t1);
			crc ^= crc1;
			crc0 = __crc32cd(0, t0);
			crc ^= crc0;

			buffer += sizeof(uint64_t);
		}
#else /* !HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
		/* Same algorithm as above, hand-written because the compiler
		   lacks the CRC/PMULL intrinsics: v1.d[0] = k1, v0.d[0] = k2. */
		__asm__("mov    x16,            #0xf38a         \n\t"
			"movk   x16,            #0xe417, lsl 16 \n\t"
			"mov    v1.2d[0],       x16             \n\t"
			"mov    x16,            #0x8014         \n\t"
			"movk   x16,            #0x8f15, lsl 16 \n\t"
			"mov    v0.2d[0],       x16             \n\t"
			:::"x16");

		while ((length -= 1024) >= 0) {
			PREF1KL2(1024*3);
			__asm__("crc32cx %w[c0], %w[c], %x[v]\n\t"
				:[c0]"=r"(crc0):[c]"r"(crc), [v]"r"(*(const uint64_t *)buffer):);
			crc1 = 0;
			crc2 = 0;
			buffer += sizeof(uint64_t);

			CRC32C7X3X8(0);
			CRC32C7X3X8(1);
			CRC32C7X3X8(2);
			CRC32C7X3X8(3);
			CRC32C7X3X8(4);
			CRC32C7X3X8(5);

			buffer += 42*3*sizeof(uint64_t);
			PREF1KL1(1024);
			/* Asm equivalent of the intrinsic merge above:
			   pmull by k2/k1, reduce with crc32cx against wzr, xor in. */
			__asm__("mov            v2.2d[0],       %x[c1]          \n\t"
				"pmull          v2.1q,          v2.1d,  v0.1d   \n\t"
				"mov            v3.2d[0],       %x[c0]          \n\t"
				"pmull          v3.1q,          v3.1d,  v1.1d   \n\t"
				"crc32cx        %w[c],          %w[c2], %x[v]   \n\t"
				"mov            %x[c1],         v2.2d[0]        \n\t"
				"crc32cx        %w[c1],         wzr,    %x[c1]  \n\t"
				"eor            %w[c],          %w[c],  %w[c1]  \n\t"
				"mov            %x[c0],         v3.2d[0]        \n\t"
				"crc32cx        %w[c0],         wzr,    %x[c0]  \n\t"
				"eor            %w[c],          %w[c],  %w[c0]  \n\t"
				:[c1]"+r"(crc1), [c0]"+r"(crc0), [c2]"+r"(crc2), [c]"+r"(crc)
				:[v]"r"(*((const uint64_t *)buffer)));
			buffer += sizeof(uint64_t);
		}
#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */

		/* Undo the last subtraction: length is now the 0..1023-byte
		   remainder.  Done if the input was a multiple of 1024. */
		if(!(length += 1024))
			return crc;
	        }
#endif /* HAVE_ARMV8_CRYPTO */
		while ((length -= sizeof(uint64_t)) >= 0) {
			CRC32CX(crc, *(uint64_t *)buffer);
			buffer += sizeof(uint64_t);
		}

		/* The following is more efficient than the straight loop:
		   length is negative here (in [-8,-1]), but its low 3 bits
		   still equal the remaining byte count, so bit-testing works. */
		if (length & sizeof(uint32_t)) {
			CRC32CW(crc, *(uint32_t *)buffer);
			buffer += sizeof(uint32_t);
		}
		if (length & sizeof(uint16_t)) {
			CRC32CH(crc, *(uint16_t *)buffer);
			buffer += sizeof(uint16_t);
		}
		if (length & sizeof(uint8_t))
			CRC32CB(crc, *buffer);
	} else {
		/* buffer == NULL path.
		   NOTE(review): SOURCE fragment ends inside this branch. */
#ifdef HAVE_ARMV8_CRYPTO
	        if (ceph_arch_aarch64_pmull) {
Example #2
0
/*
 * Function to calculate reflected crc with PMULL Instruction
 * crc done "by 3" for fixed input block size of 1024 bytes
 *
 * crc32c_arm64: CRC-32C of `data` (`length` bytes).  The running CRC is
 * seeded with all-ones (~0); note the result is NOT post-inverted before
 * return -- the caller apparently finalizes it (TODO(review): confirm).
 *
 * Each 1024-byte block is processed as three interleaved CRC streams
 * (crc0/crc1/crc2) merged with a carry-less multiply; the sub-1024-byte
 * tail is handled with plain 8/4/2/1-byte CRC32C steps.
 */
uint32_t crc32c_arm64(unsigned char const *data, unsigned long length)
{
	signed long len = length;	/* signed: countdown loops below go negative */
	uint32_t crc = ~0;
	uint32_t crc0, crc1, crc2;

	/* Load two consts: K1 and K2.  Presumably x^N mod P folding constants
	   for the reflected CRC-32C polynomial -- TODO(review): confirm. */
	const poly64_t k1 = 0xe417f38a, k2 = 0x8f158014;
	uint64_t t0, t1;

	while ((len -= 1024) >= 0) {
		/* Do first 8 bytes here for better pipelining */
		crc0 = __crc32cd(crc, *(const uint64_t *)data);
		crc1 = 0;
		crc2 = 0;
		data += sizeof(uint64_t);

		/* Process block inline
		   Process crc0 last to avoid dependency with above
		   NOTE(review): CRC32C7X3X8 is defined elsewhere; it appears
		   to consume 7 u64 words on each of the 3 streams per call
		   (6 calls * 7*3*8 = 1008 bytes) -- confirm against macro. */
		CRC32C7X3X8(0);
		CRC32C7X3X8(1);
		CRC32C7X3X8(2);
		CRC32C7X3X8(3);
		CRC32C7X3X8(4);
		CRC32C7X3X8(5);

		/* 8 (above) + 42*3*8 (macros) + 8 (merge below) = 1024 bytes */
		data += 42*3*sizeof(uint64_t);

		/* Merge crc0 and crc1 into crc2
		   crc1 multiply by K2
		   crc0 multiply by K1 */

		t1 = (uint64_t)vmull_p64(crc1, k2);
		t0 = (uint64_t)vmull_p64(crc0, k1);
		/* __crc32cd(0, t) reduces each 64-bit product back to a
		   32-bit CRC before folding it into the running crc. */
		crc = __crc32cd(crc2, *(const uint64_t *)data);
		crc1 = __crc32cd(0, t1);
		crc ^= crc1;
		crc0 = __crc32cd(0, t0);
		crc ^= crc0;

		data += sizeof(uint64_t);
	}

	/* Undo the last subtraction: len is now the 0..1023-byte remainder.
	   Done if the input was an exact multiple of 1024. */
	if (!(len += 1024))
		return crc;

	while ((len -= sizeof(uint64_t)) >= 0) {
                crc = __crc32cd(crc, *(const uint64_t *)data);
                data += sizeof(uint64_t);
        }

        /* The following is more efficient than the straight loop:
           len is negative here (in [-8,-1]), but its low 3 bits still
           equal the remaining byte count, so bit-testing works. */
        if (len & sizeof(uint32_t)) {
                crc = __crc32cw(crc, *(const uint32_t *)data);
                data += sizeof(uint32_t);
        }
        if (len & sizeof(uint16_t)) {
                crc = __crc32ch(crc, *(const uint16_t *)data);
                data += sizeof(uint16_t);
        }
        if (len & sizeof(uint8_t)) {
                crc = __crc32cb(crc, *(const uint8_t *)data);
        }

	return crc;
}