ZIX_API uint32_t
zix_digest_add(uint32_t hash, const void* const buf, const size_t len)
{
  const uint8_t* str = (const uint8_t*)buf;

#ifdef __SSE4_2__
  // SSE 4.2 CRC32
  for (size_t i = 0; i < (len / sizeof(uint32_t)); ++i) {
    hash = _mm_crc32_u32(hash, *(const uint32_t*)str);
    str += sizeof(uint32_t);
  }
  if (len & sizeof(uint16_t)) {
    hash = _mm_crc32_u16(hash, *(const uint16_t*)str);
    str += sizeof(uint16_t);
  }
  if (len & sizeof(uint8_t)) {
    hash = _mm_crc32_u8(hash, *(const uint8_t*)str);
  }
#else
  // Classic DJB hash
  for (size_t i = 0; i < len; ++i) {
    hash = (hash << 5) + hash + str[i];
  }
#endif

  return hash;
}
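A minimal call-site sketch for the accumulator style above: seed the hash once, then fold in each field of a record with repeated calls. The Record layout and the all-ones seed are illustrative assumptions, not part of the zix API, and the declaration of zix_digest_add is assumed to be in scope.

#include <stdint.h>
#include <string.h>

// Hypothetical record type, used only to illustrate incremental hashing.
struct Record {
  uint32_t id;
  double   value;
  char     name[16];  // assumed NUL-terminated
};

uint32_t hash_record(const Record* r)
{
  uint32_t h = 0xFFFFFFFFu;                            // assumed seed value
  h = zix_digest_add(h, &r->id, sizeof(r->id));        // fold in each field
  h = zix_digest_add(h, &r->value, sizeof(r->value));
  h = zix_digest_add(h, r->name, strlen(r->name));
  return h;
}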
unsigned int CRC32C(unsigned int length, const unsigned char* value)
{
    unsigned int hash_value = 0;
    if (length == 1)
        return _mm_crc32_u8(hash_value, *value);
    if (length == 2)
        return _mm_crc32_u16(hash_value, *(unsigned short*) value);

    while (length >= 4) {
        hash_value = _mm_crc32_u32(hash_value, *(unsigned int*) value);
        value += 4;
        length -= 4;
    }
    if (length >= 2) {
        hash_value = _mm_crc32_u16(hash_value, *(unsigned short*) value);
        value += 2;
        length -= 2;
    }
    if (length) {
        hash_value = _mm_crc32_u8(hash_value, *value);
    }
    return hash_value;
}
pg_crc32c
pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
{
    const unsigned char *p = static_cast<const unsigned char *>(data);
    const unsigned char *pend = p + len;

    /*
     * Process eight bytes of data at a time.
     *
     * NB: We do unaligned accesses here. The Intel architecture allows that,
     * and performance testing didn't show any performance gain from aligning
     * the begin address.
     */
#ifdef __x86_64__
    while (p + 8 <= pend)
    {
        crc = (uint32) _mm_crc32_u64(crc, *((const uint64 *) p));
        p += 8;
    }

    /* Process remaining full four bytes if any */
    if (p + 4 <= pend)
    {
        crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
        p += 4;
    }
#else
    /*
     * Process four bytes at a time. (The eight byte instruction is not
     * available on the 32-bit x86 architecture).
     */
    while (p + 4 <= pend)
    {
        crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
        p += 4;
    }
#endif /* __x86_64__ */

    /* Process any remaining bytes one at a time. */
    while (p < pend)
    {
        crc = _mm_crc32_u8(crc, *p);
        p++;
    }

    return crc;
}
UINT operator()(const T& k) const
{
    UINT *pData = (UINT*)&k;
    UINT crc = 0;
    for (UINT i = 0; i < sizeof(T) / sizeof(UINT); ++i) {
        crc = _mm_crc32_u32(crc, pData[i]);
    }
    return crc;
}
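A short usage sketch for a functor like the one above: wrap the CRC loop in a hasher struct and plug it into an unordered container. The CrcHasher and Key names are illustrative assumptions; note that the loop above hashes only whole 32-bit words, so any trailing sizeof(T) % sizeof(UINT) bytes (and uninitialized padding) are not covered.

#include <cstdint>
#include <cstddef>
#include <unordered_set>
#include <nmmintrin.h>  // _mm_crc32_u32 (compile with -msse4.2)

// Hypothetical key type; trivially copyable, no padding between members.
struct Key {
  uint32_t a;
  uint32_t b;
};

bool operator==(const Key& x, const Key& y) { return x.a == y.a && x.b == y.b; }

struct CrcHasher {
  size_t operator()(const Key& k) const {
    const uint32_t* p = reinterpret_cast<const uint32_t*>(&k);
    uint32_t crc = 0;
    for (size_t i = 0; i < sizeof(Key) / sizeof(uint32_t); ++i)
      crc = _mm_crc32_u32(crc, p[i]);   // fold in one 32-bit word per step
    return crc;
  }
};

// Container using the CRC32-based hash functor.
std::unordered_set<Key, CrcHasher> keys;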
void CRC32C_Update_SSE42(const byte *s, size_t n, word32& c)
{
    for(; !IsAligned<word32>(s) && n > 0; s++, n--)
        c = _mm_crc32_u8(c, *s);

    for(; n > 4; s+=4, n-=4)
        c = _mm_crc32_u32(c, *(const word32 *)(void*)s);

    for(; n > 0; s++, n--)
        c = _mm_crc32_u8(c, *s);
}
SIMD_INLINE void Crc32c(size_t & crc, const size_t * p, const size_t * end)
{
    while(p < end)
    {
#ifdef SIMD_X64_ENABLE
        crc = _mm_crc32_u64(crc, *p++);
#else
        crc = _mm_crc32_u32(crc, *p++);
#endif
    }
}
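A call-site sketch for the word-at-a-time helper above. The loop consumes whole machine words and does no tail handling, so the caller is assumed to pass a buffer whose length is a multiple of sizeof(size_t); the function and buffer names below are illustrative.

#include <cstddef>
#include <cstdint>

// Assumed precondition: 'size' is a multiple of sizeof(size_t) and 'data'
// is suitably aligned for word loads.
uint32_t HashWords(const uint8_t* data, size_t size)
{
    size_t crc = 0;
    const size_t* p   = reinterpret_cast<const size_t*>(data);
    const size_t* end = reinterpret_cast<const size_t*>(data + size);
    Crc32c(crc, p, end);                  // helper defined above
    return static_cast<uint32_t>(crc);
}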
void CRC32C::Update(const byte *s, size_t n)
{
#if CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE
    if (HasSSE4())
    {
        for(; !IsAligned<word32>(s) && n > 0; s++, n--)
            m_crc = _mm_crc32_u8(m_crc, *s);

        for(; n > 4; s+=4, n-=4)
            m_crc = _mm_crc32_u32(m_crc, *(const word32 *)(void*)s);

        for(; n > 0; s++, n--)
            m_crc = _mm_crc32_u8(m_crc, *s);

        return;
    }
#elif (CRYPTOPP_BOOL_ARM_CRC32_INTRINSICS_AVAILABLE)
    if (HasCRC32())
    {
        for(; !IsAligned<word32>(s) && n > 0; s++, n--)
            m_crc = __crc32cb(m_crc, *s);

        for(; n > 4; s+=4, n-=4)
            m_crc = __crc32cw(m_crc, *(const word32 *)(void*)s);

        for(; n > 0; s++, n--)
            m_crc = __crc32cb(m_crc, *s);

        return;
    }
#endif

    word32 crc = m_crc;

    for(; !IsAligned<word32>(s) && n > 0; n--)
        crc = m_tab[CRC32_INDEX(crc) ^ *s++] ^ CRC32_SHIFTED(crc);

    while (n >= 4)
    {
        crc ^= *(const word32 *)(void*)s;
        crc = m_tab[CRC32_INDEX(crc)] ^ CRC32_SHIFTED(crc);
        crc = m_tab[CRC32_INDEX(crc)] ^ CRC32_SHIFTED(crc);
        crc = m_tab[CRC32_INDEX(crc)] ^ CRC32_SHIFTED(crc);
        crc = m_tab[CRC32_INDEX(crc)] ^ CRC32_SHIFTED(crc);
        n -= 4;
        s += 4;
    }

    while (n--)
        crc = m_tab[CRC32_INDEX(crc) ^ *s++] ^ CRC32_SHIFTED(crc);

    m_crc = crc;
}
void increment_sse42(float arr[4])
{
    ALIGN_16 double darr[4];
    __m128d val1 = _mm_set_pd(arr[0], arr[1]);
    __m128d val2 = _mm_set_pd(arr[2], arr[3]);
    __m128d one  = _mm_set_pd(1.0, 1.0);

    __m128d result = _mm_add_pd(val1, one);
    _mm_store_pd(darr, result);

    result = _mm_add_pd(val2, one);
    _mm_store_pd(&darr[2], result);

    _mm_crc32_u32(42, 99); /* A no-op, only here to use an SSE4.2 instruction. */

    /* _mm_set_pd stores its arguments high-to-low, so swap the pairs back. */
    arr[0] = (float)darr[1];
    arr[1] = (float)darr[0];
    arr[2] = (float)darr[3];
    arr[3] = (float)darr[2];
}
int CrcHash(const void* data, int bytes, int hash)
{
  int words = bytes / 4;
  bytes = bytes % 4;

  const int* p = reinterpret_cast<const int*>(data);
  while (words--) {
    hash = _mm_crc32_u32(hash, *p);
    ++p;
  }

  const char* s = reinterpret_cast<const char*>(p);
  while (bytes--) {
    hash = _mm_crc32_u8(hash, *s);
    ++s;
  }
  return hash;
}
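A brief call-site sketch for the helper above, chaining two buffers through the seed parameter. The field names and the zero initial seed are assumptions made for illustration only.

#include <string>

// Hash a composite key by threading the result of one call into the next.
int HashPair(const std::string& a, const std::string& b)
{
  int h = 0;                                             // assumed initial seed
  h = CrcHash(a.data(), static_cast<int>(a.size()), h);  // first field
  h = CrcHash(b.data(), static_cast<int>(b.size()), h);  // chained second field
  return h;
}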
unsigned int test_mm_crc32_u32(unsigned int CRC, unsigned int V) {
  // CHECK-LABEL: test_mm_crc32_u32
  // CHECK: call i32 @llvm.x86.sse42.crc32.32.32(i32 %{{.*}}, i32 %{{.*}})
  return _mm_crc32_u32(CRC, V);
}
int is_false_positive(struct tuple4 addr, idx_type tcb_index, TCP_THREAD_LOCAL_P tcp_thread_local_p)
{
    struct tcp_stream *tcb_p = &(tcp_thread_local_p->tcb_array[tcb_index]);

    if (!((addr.source == tcb_p->addr.source &&
           addr.dest == tcb_p->addr.dest &&
           addr.saddr == tcb_p->addr.saddr &&
           addr.daddr == tcb_p->addr.daddr) ||
          (addr.dest == tcb_p->addr.source &&
           addr.source == tcb_p->addr.dest &&
           addr.daddr == tcb_p->addr.saddr &&
           addr.saddr == tcb_p->addr.daddr))) {

        // Yes, it is false positive
#if defined(DEBUG)
        tcp_test[tcp_thread_local_p->self_cpu_id].false_positive++;
#endif

#if 0
        int sign2 = calc_signature(tcb_p->addr.saddr, tcb_p->addr.daddr,
                                   tcb_p->addr.source, tcb_p->addr.dest);
        printf("||the Founded one in the table: Sip: %d.%d.%d.%d, Sport:%d, Dip : %d.%d.%d.%d, Dport:%d , sign = %x\n",
               tcb_p->addr.saddr & 0x000000FF,
               (tcb_p->addr.saddr & 0x0000FF00) >> 8,
               (tcb_p->addr.saddr & 0x00FF0000) >> 16,
               (tcb_p->addr.saddr & 0xFF000000) >> 24,
               tcb_p->addr.source,
               tcb_p->addr.daddr & 0x000000FF,
               (tcb_p->addr.daddr & 0x0000FF00) >> 8,
               (tcb_p->addr.daddr & 0x00FF0000) >> 16,
               (tcb_p->addr.daddr & 0xFF000000) >> 24,
               tcb_p->addr.dest,
               sign2);

        int crc1 = 0;
        crc1 = _mm_crc32_u32(crc1, tcb_p->addr.saddr);
        crc1 = _mm_crc32_u32(crc1, tcb_p->addr.daddr);
        crc1 = _mm_crc32_u32(crc1, tcb_p->addr.source ^ tcb_p->addr.dest);
        printf("(%x", crc1);

        crc1 = 0;
        crc1 = _mm_crc32_u32(crc1, tcb_p->addr.daddr);
        crc1 = _mm_crc32_u32(crc1, tcb_p->addr.saddr);
        crc1 = _mm_crc32_u32(crc1, tcb_p->addr.source ^ tcb_p->addr.dest);
        printf("-- %x)\n", crc1);

        sign2 = calc_signature(addr.saddr, addr.daddr, addr.source, addr.dest);
        printf("Current one: Sip: %d.%d.%d.%d, Sport:%d, Dip : %d.%d.%d.%d, Dport:%d , sign = %x||\n",
               addr.saddr & 0x000000FF,
               (addr.saddr & 0x0000FF00) >> 8,
               (addr.saddr & 0x00FF0000) >> 16,
               (addr.saddr & 0xFF000000) >> 24,
               addr.source,
               addr.daddr & 0x000000FF,
               (addr.daddr & 0x0000FF00) >> 8,
               (addr.daddr & 0x00FF0000) >> 16,
               (addr.daddr & 0xFF000000) >> 24,
               addr.dest,
               sign2);

        crc1 = 0;
        crc1 = _mm_crc32_u32(crc1, addr.saddr);
        crc1 = _mm_crc32_u32(crc1, addr.daddr);
        crc1 = _mm_crc32_u32(crc1, addr.source ^ addr.dest);
        printf("(%x", crc1);

        crc1 = 0;
        crc1 = _mm_crc32_u32(crc1, addr.daddr);
        crc1 = _mm_crc32_u32(crc1, addr.saddr);
        crc1 = _mm_crc32_u32(crc1, addr.source ^ addr.dest);
        printf("-- %x)\n", crc1);
#endif

        return 1;
    } else {
void libmaus::digest::CRC32C_sse42::update(uint8_t const * t, size_t l)
{
#if defined(LIBMAUS_HAVE_SMMINTRIN_H) && defined(LIBMAUS_USE_ASSEMBLY) && defined(LIBMAUS_HAVE_x86_64) && defined(LIBMAUS_HAVE_i386)
    ctx = ~ctx;

    size_t const offset = reinterpret_cast<size_t>(t);

    // check for 3 LSB
    if ( offset & 7 )
    {
        // check for LSB
        if ( (offset & 1) && l )
        {
            ctx = _mm_crc32_u8(ctx, *t);
            t += 1;
            l -= 1;
        }
        // check for 2nd LSB
        if ( (offset & 2) && (l>=2) )
        {
            ctx = _mm_crc32_u16(ctx, *reinterpret_cast<uint16_t const *>(t));
            t += 2;
            l -= 2;
        }
        // check for 3rd LSB
        if ( (offset & 4) && l >= 4 )
        {
            ctx = _mm_crc32_u32(ctx, *reinterpret_cast<uint32_t const *>(t));
            t += 4;
            l -= 4;
        }
    }

    uint64_t const * t64 = reinterpret_cast<uint64_t const *>(t);
    uint64_t const * const t64e = t64 + (l>>3);
    while ( t64 != t64e )
        ctx = _mm_crc32_u64(ctx, *(t64++));

    l &= 7;
    t = reinterpret_cast<uint8_t const *>(t64);

    if ( l >= 4 )
    {
        ctx = _mm_crc32_u32(ctx, *reinterpret_cast<uint32_t const *>(t));
        t += 4;
        l -= 4;
    }
    if ( l >= 2 )
    {
        ctx = _mm_crc32_u16(ctx, *reinterpret_cast<uint16_t const *>(t));
        t += 2;
        l -= 2;
    }
    if ( l )
    {
        ctx = _mm_crc32_u8(ctx, *t);
    }

    ctx = ~ctx;
#endif
}
uint32_t
sse42_crc32c(uint32_t crc, const unsigned char *buf, unsigned len)
{
#ifdef __amd64__
    const size_t align = 8;
#else
    const size_t align = 4;
#endif
    const unsigned char *next, *end;
#ifdef __amd64__
    uint64_t crc0, crc1, crc2;
#else
    uint32_t crc0, crc1, crc2;
#endif

    next = buf;
    crc0 = crc;

    /* Compute the crc to bring the data pointer to an aligned boundary. */
    while (len && ((uintptr_t)next & (align - 1)) != 0) {
        crc0 = _mm_crc32_u8(crc0, *next);
        next++;
        len--;
    }

#if LONG > SHORT
    /*
     * Compute the crc on sets of LONG*3 bytes, executing three independent
     * crc instructions, each on LONG bytes -- this is optimized for the
     * Nehalem, Westmere, Sandy Bridge, and Ivy Bridge architectures, which
     * have a throughput of one crc per cycle, but a latency of three
     * cycles.
     */
    crc = 0;
    while (len >= LONG * 3) {
        crc1 = 0;
        crc2 = 0;
        end = next + LONG;
        do {
#ifdef __amd64__
            crc0 = _mm_crc32_u64(crc0, *(const uint64_t *)next);
            crc1 = _mm_crc32_u64(crc1, *(const uint64_t *)(next + LONG));
            crc2 = _mm_crc32_u64(crc2, *(const uint64_t *)(next + (LONG * 2)));
#else
            crc0 = _mm_crc32_u32(crc0, *(const uint32_t *)next);
            crc1 = _mm_crc32_u32(crc1, *(const uint32_t *)(next + LONG));
            crc2 = _mm_crc32_u32(crc2, *(const uint32_t *)(next + (LONG * 2)));
#endif
            next += align;
        } while (next < end);

        /*-
         * Update the crc.  Try to do it in parallel with the inner
         * loop.  'crc' is used to accumulate crc0 and crc1
         * produced by the inner loop so that the next iteration
         * of the loop doesn't depend on anything except crc2.
         *
         * The full expression for the update is:
         *     crc = S*S*S*crc + S*S*crc0 + S*crc1
         * where the terms are polynomials modulo the CRC polynomial.
         * We regroup this subtly as:
         *     crc = S*S * (S*crc + crc0) + S*crc1.
         * This has an extra dependency which reduces possible
         * parallelism for the expression, but it turns out to be
         * best to intentionally delay evaluation of this expression
         * so that it competes less with the inner loop.
         *
         * We also intentionally reduce parallelism by feeding back
         * crc2 to the inner loop as crc0 instead of accumulating
         * it in crc.  This synchronizes the loop with crc update.
         * CPU and/or compiler schedulers produced bad order without
         * this.
         *
         * Shifts take about 12 cycles each, so 3 here with 2
         * parallelizable take about 24 cycles and the crc update
         * takes slightly longer.  8 dependent crc32 instructions
         * can run in 24 cycles, so the 3-way blocking is worse
         * than useless for sizes less than 8 * <word size> = 64
         * on amd64.  In practice, SHORT = 32 confirms these
         * timing calculations by giving a small improvement
         * starting at size 96.  Then the inner loop takes about
         * 12 cycles and the crc update about 24, but these are
         * partly in parallel so the total time is less than the
         * 36 cycles that 12 dependent crc32 instructions would
         * take.
         *
         * To have a chance of completely hiding the overhead for
         * the crc update, the inner loop must take considerably
         * longer than 24 cycles.  LONG = 64 makes the inner loop
         * take about 24 cycles, so is not quite large enough.
         * LONG = 128 works OK.  Unhideable overheads are about
         * 12 cycles per inner loop.  All assuming timing like
         * Haswell.
         */
        crc = crc32c_shift(crc32c_long, crc) ^ crc0;
        crc1 = crc32c_shift(crc32c_long, crc1);
        crc = crc32c_shift(crc32c_2long, crc) ^ crc1;
        crc0 = crc2;
        next += LONG * 2;
        len -= LONG * 3;
    }
    crc0 ^= crc;
#endif /* LONG > SHORT */

    /*
     * Do the same thing, but now on SHORT*3 blocks for the remaining data
     * less than a LONG*3 block.
     */
    crc = 0;
    while (len >= SHORT * 3) {
        crc1 = 0;
        crc2 = 0;
        end = next + SHORT;
        do {
#ifdef __amd64__
            crc0 = _mm_crc32_u64(crc0, *(const uint64_t *)next);
            crc1 = _mm_crc32_u64(crc1, *(const uint64_t *)(next + SHORT));
            crc2 = _mm_crc32_u64(crc2, *(const uint64_t *)(next + (SHORT * 2)));
#else
            crc0 = _mm_crc32_u32(crc0, *(const uint32_t *)next);
            crc1 = _mm_crc32_u32(crc1, *(const uint32_t *)(next + SHORT));
            crc2 = _mm_crc32_u32(crc2, *(const uint32_t *)(next + (SHORT * 2)));
#endif
            next += align;
        } while (next < end);

        crc = crc32c_shift(crc32c_short, crc) ^ crc0;
        crc1 = crc32c_shift(crc32c_short, crc1);
        crc = crc32c_shift(crc32c_2short, crc) ^ crc1;
        crc0 = crc2;
        next += SHORT * 2;
        len -= SHORT * 3;
    }
    crc0 ^= crc;

    /* Compute the crc on the remaining bytes at native word size. */
    end = next + (len - (len & (align - 1)));
    while (next < end) {
#ifdef __amd64__
        crc0 = _mm_crc32_u64(crc0, *(const uint64_t *)next);
#else
        crc0 = _mm_crc32_u32(crc0, *(const uint32_t *)next);
#endif
        next += align;
    }
    len &= (align - 1);

    /* Compute the crc for any trailing bytes. */
    while (len) {
        crc0 = _mm_crc32_u8(crc0, *next);
        next++;
        len--;
    }

    return ((uint32_t)crc0);
}
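A call-site sketch for the routine above. sse42_crc32c() itself neither seeds nor inverts, so the wrapper below applies the usual CRC-32C (Castagnoli) convention of an all-ones initial value with final inversion; whether the surrounding project wraps it exactly this way is an assumption made for illustration.

#include <stdint.h>
#include <stddef.h>

/* Assumed wrapper: seed with ~0 and invert the result. */
uint32_t crc32c(const void *data, size_t len)
{
    uint32_t crc = sse42_crc32c(0xFFFFFFFFu,
                                (const unsigned char *)data,
                                (unsigned)len);
    return ~crc;
}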