unsigned int CRC32C(unsigned int length, const unsigned char* value) { unsigned int hash_value = 0; if (length == 1) return _mm_crc32_u8(hash_value, *value); if (length == 2) return _mm_crc32_u16(hash_value, *(unsigned short*) value); while (length >= 4) { hash_value = _mm_crc32_u32(hash_value, *(unsigned int*) value); value += 4; length -= 4; } if (length >= 2) { hash_value = _mm_crc32_u16(hash_value, *(unsigned short*) value); value += 2; length -= 2; } if (length) { hash_value = _mm_crc32_u8(hash_value, *value); } return hash_value; }
// Fold n bytes at s into the running CRC-32C value c using the SSE4.2
// CRC32 instruction: lead-in bytes until word alignment, then one 32-bit
// word per instruction, then the trailing bytes one at a time.
void CRC32C_Update_SSE42(const byte *s, size_t n, word32& c)
{
    // Consume single bytes until s sits on a word32 boundary.
    while (n > 0 && !IsAligned<word32>(s)) {
        c = _mm_crc32_u8(c, *s);
        ++s;
        --n;
    }

    // Bulk of the data, one 32-bit word per instruction. Note the strict
    // '>' comparison: an exact 4-byte tail is left to the byte loop, which
    // yields the same CRC.
    while (n > 4) {
        c = _mm_crc32_u32(c, *(const word32 *)(void*)s);
        s += 4;
        n -= 4;
    }

    // Remaining tail bytes.
    while (n > 0) {
        c = _mm_crc32_u8(c, *s);
        ++s;
        --n;
    }
}
// Incrementally folds n bytes into the member state m_crc. Three paths:
// an x86 SSE4.2 path, an ARMv8 CRC32 path (compiled only when the SSE4
// path is not), and a table-driven fallback using m_tab with the
// CRC32_INDEX / CRC32_SHIFTED macros (defined elsewhere in this project).
void CRC32C::Update(const byte *s, size_t n)
{
#if CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE
    if (HasSSE4())
    {
        // Byte-at-a-time until s is word-aligned.
        for(; !IsAligned<word32>(s) && n > 0; s++, n--)
            m_crc = _mm_crc32_u8(m_crc, *s);
        // One 32-bit word per instruction. Strictly '>' here: when exactly
        // 4 bytes remain they are consumed by the byte loop below instead.
        for(; n > 4; s+=4, n-=4)
            m_crc = _mm_crc32_u32(m_crc, *(const word32 *)(void*)s);
        // Trailing bytes.
        for(; n > 0; s++, n--)
            m_crc = _mm_crc32_u8(m_crc, *s);
        return;
    }
#elif (CRYPTOPP_BOOL_ARM_CRC32_INTRINSICS_AVAILABLE)
    if (HasCRC32())
    {
        // Same structure as the SSE4.2 path, using the ARMv8 CRC32C
        // intrinsics (__crc32cb = byte, __crc32cw = word).
        for(; !IsAligned<word32>(s) && n > 0; s++, n--)
            m_crc = __crc32cb(m_crc, *s);
        for(; n > 4; s+=4, n-=4)
            m_crc = __crc32cw(m_crc, *(const word32 *)(void*)s);
        for(; n > 0; s++, n--)
            m_crc = __crc32cb(m_crc, *s);
        return;
    }
#endif
    // Table-driven fallback. Work in a local to avoid repeated member
    // loads/stores inside the loops.
    word32 crc = m_crc;
    // Byte-at-a-time until s is word-aligned.
    for(; !IsAligned<word32>(s) && n > 0; n--)
        crc = m_tab[CRC32_INDEX(crc) ^ *s++] ^ CRC32_SHIFTED(crc);
    // Word-at-a-time: XOR in a whole word, then run four table steps.
    while (n >= 4)
    {
        crc ^= *(const word32 *)(void*)s;
        crc = m_tab[CRC32_INDEX(crc)] ^ CRC32_SHIFTED(crc);
        crc = m_tab[CRC32_INDEX(crc)] ^ CRC32_SHIFTED(crc);
        crc = m_tab[CRC32_INDEX(crc)] ^ CRC32_SHIFTED(crc);
        crc = m_tab[CRC32_INDEX(crc)] ^ CRC32_SHIFTED(crc);
        n -= 4;
        s += 4;
    }
    // Trailing bytes.
    while (n--)
        crc = m_tab[CRC32_INDEX(crc) ^ *s++] ^ CRC32_SHIFTED(crc);
    m_crc = crc;
}
/**
 * Calculates the CRC-32C of the specified input buffer with SSE4.2
 * instructions, continuing from a previous CRC value.
 *
 * The incoming CRC is inverted before and after the computation
 * (standard CRC-32C pre/post-conditioning with 0xFFFFFFFF, i.e. UINT_MAX,
 * not INT_MAX as a previous comment claimed).
 *
 * @param crc running CRC value (returned unchanged when len == 0)
 * @param buf input bytes
 * @param len number of bytes in buf
 * @return the updated 32-bit CRC-32C value
 */
uint32_t calculateCRC32C(uint32_t crc, const char *buf, size_t len)
{
    // Empty input: return the running CRC unchanged (NOT 0).
    if (len == 0) {
        return crc;
    }

    // Pre-condition the CRC by inverting all bits.
    crc ^= 0xFFFFFFFF;

    // Consume single bytes until buf reaches the word boundary
    // (ALIGN_MASK is defined elsewhere in this project).
    for (; (len > 0) && ((size_t)buf & ALIGN_MASK); len--, buf++) {
        crc = _mm_crc32_u8(crc, *buf);
    }

    // Main computation, widest unit first. CALC_CRC (defined elsewhere)
    // presumably loops while len holds at least sizeof(type) bytes,
    // advancing buf and decrementing len -- TODO confirm against its
    // definition.
#ifdef __x86_64__
    CALC_CRC(_mm_crc32_u64, crc, uint64_t, buf, len);
#endif
    CALC_CRC(_mm_crc32_u32, crc, uint32_t, buf, len);
    CALC_CRC(_mm_crc32_u16, crc, uint16_t, buf, len);
    CALC_CRC(_mm_crc32_u8, crc, uint8_t, buf, len);

    // Post-condition: invert again before returning.
    return (crc ^= 0xFFFFFFFF);
}
/*
  Fold `len` bytes of `buf` into `hash`. Uses the SSE4.2 CRC32C
  instruction when available at compile time, otherwise a classic DJB
  hash. Note the two variants produce different values for the same
  input; the choice is fixed per build.
*/
ZIX_API uint32_t zix_digest_add(uint32_t hash, const void* const buf, const size_t len)
{
  const uint8_t* ptr = (const uint8_t*)buf;

#ifdef __SSE4_2__
  // SSE 4.2 CRC32: whole 32-bit words first...
  size_t words = len / sizeof(uint32_t);
  while (words--) {
    hash = _mm_crc32_u32(hash, *(const uint32_t*)ptr);
    ptr += sizeof(uint32_t);
  }

  // ...then the 2-byte and 1-byte tail, selected by the low bits of len.
  if (len & sizeof(uint16_t)) {
    hash = _mm_crc32_u16(hash, *(const uint16_t*)ptr);
    ptr += sizeof(uint16_t);
  }

  if (len & sizeof(uint8_t)) {
    hash = _mm_crc32_u8(hash, *(const uint8_t*)ptr);
  }
#else
  // Classic DJB hash: h = h * 33 + byte.
  for (size_t i = 0; i < len; ++i) {
    hash = (hash << 5) + hash + ptr[i];
  }
#endif

  return hash;
}
/*
 * Fold `len` bytes at `data` into `crc` with the SSE4.2 CRC32
 * instruction.
 *
 * NB: We do unaligned accesses here. The Intel architecture allows that,
 * and performance testing didn't show any performance gain from aligning
 * the begin address.
 */
pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
{
	const unsigned char *p = static_cast<const unsigned char *>(data);
	size_t remaining = len;

#ifdef __x86_64__
	/* Eight bytes per instruction where the 64-bit form exists. */
	for (; remaining >= 8; remaining -= 8, p += 8)
		crc = (uint32) _mm_crc32_u64(crc, *((const uint64 *) p));

	/* At most one remaining full four-byte chunk. */
	if (remaining >= 4)
	{
		crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
		p += 4;
		remaining -= 4;
	}
#else
	/*
	 * 32-bit build: process four bytes at a time (the eight byte
	 * instruction is not available on 32-bit x86).
	 */
	for (; remaining >= 4; remaining -= 4, p += 4)
		crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
#endif							/* __x86_64__ */

	/* Trailing bytes, one at a time. */
	for (; remaining > 0; --remaining, ++p)
		crc = _mm_crc32_u8(crc, *p);

	return crc;
}
int CrcHash(const void* data, int bytes, int hash) { int words = bytes / 4; bytes = bytes % 4; const int* p = reinterpret_cast<const int*>(data); while (words--) { hash = _mm_crc32_u32(hash, *p); ++p; } const char* s = reinterpret_cast<const char*>(p); while (bytes--) { hash = _mm_crc32_u8(hash, *s); ++s; } return hash; }
// Compiler codegen test: verifies that _mm_crc32_u8 lowers to the
// llvm.x86.sse42.crc32.32.8 intrinsic. The CHECK lines below are
// FileCheck directives, not ordinary comments -- do not edit them.
unsigned int test_mm_crc32_u8(unsigned int CRC, unsigned char V) {
  // CHECK-LABEL: test_mm_crc32_u8
  // CHECK: call i32 @llvm.x86.sse42.crc32.32.8(i32 %{{.*}}, i8 %{{.*}})
  return _mm_crc32_u8(CRC, V);
}
// Thin out-of-line wrapper over the SSE4.2 CRC32C byte intrinsic. The
// per-function target attribute lets this definition compile without
// enabling SSE4.2 for the whole translation unit.
unsigned int __attribute__((__target__("sse4.2")))
mm_crc32_u8_wrap(unsigned int c, unsigned char d)
{
    return _mm_crc32_u8(c, d);
}
// Folds every byte in [p, end) into crc, one SSE4.2 CRC32C instruction
// per byte. The accumulator is carried in a size_t but only its low 32
// bits participate in the computation.
SIMD_INLINE void Crc32c(size_t & crc, const uint8_t * p, const uint8_t * end)
{
    for (; p < end; ++p)
        crc = _mm_crc32_u8((uint32_t)crc, *p);
}
// Folds l bytes at t into the inverted running CRC-32C state `ctx` using
// SSE4.2: 1/2/4-byte prefixes to reach 8-byte alignment, 8 bytes per
// instruction in the bulk loop, then a 4/2/1-byte tail. No-op when the
// required build flags are absent.
void libmaus::digest::CRC32C_sse42::update(uint8_t const * t, size_t l)
{
#if defined(LIBMAUS_HAVE_SMMINTRIN_H) && defined(LIBMAUS_USE_ASSEMBLY) && defined(LIBMAUS_HAVE_x86_64) && defined(LIBMAUS_HAVE_i386)
	// State is stored inverted between calls; undo while updating.
	ctx = ~ctx;

	// Align t to an 8-byte boundary by consuming 1/2/4-byte prefixes.
	// Each step must inspect the CURRENT address: consuming a prefix
	// changes the low address bits. (The previous code tested the
	// original offset in all three steps, so e.g. a buffer at offset 1
	// entered the 64-bit loop only 2-byte aligned -- harmless for the
	// CRC result on x86, where unaligned loads are legal, but it
	// defeated the intended alignment.)
	if ( reinterpret_cast<size_t>(t) & 7 )
	{
		// one byte if the address is odd
		if ( (reinterpret_cast<size_t>(t) & 1) && l )
		{
			ctx = _mm_crc32_u8(ctx, *t);
			t += 1;
			l -= 1;
		}
		// two bytes if the address is 2 mod 4
		if ( (reinterpret_cast<size_t>(t) & 2) && (l >= 2) )
		{
			ctx = _mm_crc32_u16(ctx, *reinterpret_cast<uint16_t const *>(t));
			t += 2;
			l -= 2;
		}
		// four bytes if the address is 4 mod 8
		if ( (reinterpret_cast<size_t>(t) & 4) && (l >= 4) )
		{
			ctx = _mm_crc32_u32(ctx, *reinterpret_cast<uint32_t const *>(t));
			t += 4;
			l -= 4;
		}
	}

	// Bulk: eight bytes per instruction.
	uint64_t const * t64 = reinterpret_cast<uint64_t const *>(t);
	uint64_t const * const t64e = t64 + (l >> 3);
	while ( t64 != t64e )
		ctx = _mm_crc32_u64(ctx, *(t64++));
	l &= 7;
	t = reinterpret_cast<uint8_t const *>(t64);

	// Tail, widest unit first.
	if ( l >= 4 )
	{
		ctx = _mm_crc32_u32(ctx, *reinterpret_cast<uint32_t const *>(t));
		t += 4;
		l -= 4;
	}
	if ( l >= 2 )
	{
		ctx = _mm_crc32_u16(ctx, *reinterpret_cast<uint16_t const *>(t));
		t += 2;
		l -= 2;
	}
	if ( l )
	{
		ctx = _mm_crc32_u8(ctx, *t);
	}

	// Re-invert for storage between calls.
	ctx = ~ctx;
#endif
}
/*
 * Computes CRC-32C over `len` bytes at `buf`, continuing from `crc`,
 * with the SSE4.2 CRC32 instruction. LONG and SHORT are block sizes
 * defined elsewhere; crc32c_shift() combines partial CRCs using the
 * precomputed crc32c_long/crc32c_2long/crc32c_short/crc32c_2short
 * tables (also defined elsewhere).
 */
uint32_t
sse42_crc32c(uint32_t crc, const unsigned char *buf, unsigned len)
{
#ifdef __amd64__
	const size_t align = 8;
#else
	const size_t align = 4;
#endif
	const unsigned char *next, *end;
#ifdef __amd64__
	uint64_t crc0, crc1, crc2;
#else
	uint32_t crc0, crc1, crc2;
#endif

	next = buf;
	crc0 = crc;

	/* Compute the crc to bring the data pointer to an aligned boundary. */
	while (len && ((uintptr_t)next & (align - 1)) != 0) {
		crc0 = _mm_crc32_u8(crc0, *next);
		next++;
		len--;
	}

#if LONG > SHORT
	/*
	 * Compute the crc on sets of LONG*3 bytes, executing three independent
	 * crc instructions, each on LONG bytes -- this is optimized for the
	 * Nehalem, Westmere, Sandy Bridge, and Ivy Bridge architectures, which
	 * have a throughput of one crc per cycle, but a latency of three
	 * cycles.
	 */
	crc = 0;
	while (len >= LONG * 3) {
		crc1 = 0;
		crc2 = 0;
		end = next + LONG;
		do {
#ifdef __amd64__
			crc0 = _mm_crc32_u64(crc0, *(const uint64_t *)next);
			crc1 = _mm_crc32_u64(crc1, *(const uint64_t *)(next + LONG));
			crc2 = _mm_crc32_u64(crc2, *(const uint64_t *)(next + (LONG * 2)));
#else
			crc0 = _mm_crc32_u32(crc0, *(const uint32_t *)next);
			crc1 = _mm_crc32_u32(crc1, *(const uint32_t *)(next + LONG));
			crc2 = _mm_crc32_u32(crc2, *(const uint32_t *)(next + (LONG * 2)));
#endif
			next += align;
		} while (next < end);
		/*-
		 * Update the crc. Try to do it in parallel with the inner
		 * loop. 'crc' is used to accumulate crc0 and crc1
		 * produced by the inner loop so that the next iteration
		 * of the loop doesn't depend on anything except crc2.
		 *
		 * The full expression for the update is:
		 * crc = S*S*S*crc + S*S*crc0 + S*crc1
		 * where the terms are polynomials modulo the CRC polynomial.
		 * We regroup this subtly as:
		 * crc = S*S * (S*crc + crc0) + S*crc1.
		 * This has an extra dependency which reduces possible
		 * parallelism for the expression, but it turns out to be
		 * best to intentionally delay evaluation of this expression
		 * so that it competes less with the inner loop.
		 *
		 * We also intentionally reduce parallelism by feeding back
		 * crc2 to the inner loop as crc0 instead of accumulating
		 * it in crc. This synchronizes the loop with crc update.
		 * CPU and/or compiler schedulers produced bad order without
		 * this.
		 *
		 * Shifts take about 12 cycles each, so 3 here with 2
		 * parallelizable take about 24 cycles and the crc update
		 * takes slightly longer. 8 dependent crc32 instructions
		 * can run in 24 cycles, so the 3-way blocking is worse
		 * than useless for sizes less than 8 * <word size> = 64
		 * on amd64. In practice, SHORT = 32 confirms these
		 * timing calculations by giving a small improvement
		 * starting at size 96. Then the inner loop takes about
		 * 12 cycles and the crc update about 24, but these are
		 * partly in parallel so the total time is less than the
		 * 36 cycles that 12 dependent crc32 instructions would
		 * take.
		 *
		 * To have a chance of completely hiding the overhead for
		 * the crc update, the inner loop must take considerably
		 * longer than 24 cycles. LONG = 64 makes the inner loop
		 * take about 24 cycles, so is not quite large enough.
		 * LONG = 128 works OK. Unhideable overheads are about
		 * 12 cycles per inner loop. All assuming timing like
		 * Haswell.
		 */
		crc = crc32c_shift(crc32c_long, crc) ^ crc0;
		crc1 = crc32c_shift(crc32c_long, crc1);
		crc = crc32c_shift(crc32c_2long, crc) ^ crc1;
		crc0 = crc2;
		next += LONG * 2;
		len -= LONG * 3;
	}
	crc0 ^= crc;
#endif /* LONG > SHORT */

	/*
	 * Do the same thing, but now on SHORT*3 blocks for the remaining data
	 * less than a LONG*3 block
	 */
	crc = 0;
	while (len >= SHORT * 3) {
		crc1 = 0;
		crc2 = 0;
		end = next + SHORT;
		do {
#ifdef __amd64__
			crc0 = _mm_crc32_u64(crc0, *(const uint64_t *)next);
			crc1 = _mm_crc32_u64(crc1, *(const uint64_t *)(next + SHORT));
			crc2 = _mm_crc32_u64(crc2, *(const uint64_t *)(next + (SHORT * 2)));
#else
			crc0 = _mm_crc32_u32(crc0, *(const uint32_t *)next);
			crc1 = _mm_crc32_u32(crc1, *(const uint32_t *)(next + SHORT));
			crc2 = _mm_crc32_u32(crc2, *(const uint32_t *)(next + (SHORT * 2)));
#endif
			next += align;
		} while (next < end);
		crc = crc32c_shift(crc32c_short, crc) ^ crc0;
		crc1 = crc32c_shift(crc32c_short, crc1);
		crc = crc32c_shift(crc32c_2short, crc) ^ crc1;
		crc0 = crc2;
		next += SHORT * 2;
		len -= SHORT * 3;
	}
	crc0 ^= crc;

	/* Compute the crc on the remaining bytes at native word size. */
	end = next + (len - (len & (align - 1)));
	while (next < end) {
#ifdef __amd64__
		crc0 = _mm_crc32_u64(crc0, *(const uint64_t *)next);
#else
		crc0 = _mm_crc32_u32(crc0, *(const uint32_t *)next);
#endif
		next += align;
	}
	len &= (align - 1);

	/* Compute the crc for any trailing bytes. */
	while (len) {
		crc0 = _mm_crc32_u8(crc0, *next);
		next++;
		len--;
	}
	return ((uint32_t)crc0);
}