Example #1
unsigned int CRC32C(unsigned int length, const unsigned char* value)
{
	unsigned int hash_value = 0;

	// Fast paths for 1- and 2-byte inputs
	if (length == 1)
		return _mm_crc32_u8(hash_value, *value);

	if (length == 2)
		return _mm_crc32_u16(hash_value, *(unsigned short*) value);

	// Consume the bulk of the input four bytes at a time
	while (length >= 4)
	{
		hash_value = _mm_crc32_u32(hash_value, *(unsigned int*) value);
		value += 4;
		length -= 4;
	}

	// Fold in a remaining 2-byte chunk, then a final byte, if any
	if (length >= 2)
	{
		hash_value = _mm_crc32_u16(hash_value, *(unsigned short*) value);
		value += 2;
		length -= 2;
	}

	if (length)
	{
		hash_value = _mm_crc32_u8(hash_value, *value);
	}

	return hash_value;
}
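A minimal usage sketch (not part of the original example): the routine above seeds the CRC with 0 and applies no final inversion, so its output differs from the conventional CRC-32C, which starts from ~0 and inverts the result. The function itself needs <nmmintrin.h> and SSE 4.2 support (e.g. compile with -msse4.2).

#include <stdio.h>
#include <string.h>

unsigned int CRC32C(unsigned int length, const unsigned char* value);

int main(void)
{
	const char *msg = "123456789";

	// Print the raw hardware-accumulated value for the classic test string
	printf("%08x\n", CRC32C((unsigned int) strlen(msg), (const unsigned char *) msg));
	return 0;
}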
Example #2
void CRC32C_Update_SSE42(const byte *s, size_t n, word32& c)
{
    for(; !IsAligned<word32>(s) && n > 0; s++, n--)
        c = _mm_crc32_u8(c, *s);

    for(; n > 4; s+=4, n-=4)
        c = _mm_crc32_u32(c, *(const word32 *)(void*)s);

    for(; n > 0; s++, n--)
        c = _mm_crc32_u8(c, *s);
}
Example #3
void CRC32C::Update(const byte *s, size_t n)
{
#if CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE
	if (HasSSE4())
	{
		for(; !IsAligned<word32>(s) && n > 0; s++, n--)
			m_crc = _mm_crc32_u8(m_crc, *s);

		for(; n > 4; s+=4, n-=4)
			m_crc = _mm_crc32_u32(m_crc, *(const word32 *)(void*)s);

		for(; n > 0; s++, n--)
			m_crc = _mm_crc32_u8(m_crc, *s);

		return;
	}
#elif (CRYPTOPP_BOOL_ARM_CRC32_INTRINSICS_AVAILABLE)
	if (HasCRC32())
	{
		for(; !IsAligned<word32>(s) && n > 0; s++, n--)
			m_crc = __crc32cb(m_crc, *s);

		for(; n > 4; s+=4, n-=4)
			m_crc = __crc32cw(m_crc, *(const word32 *)(void*)s);

		for(; n > 0; s++, n--)
			m_crc = __crc32cb(m_crc, *s);

		return;
	}
#endif

	// Table-driven fallback, one byte per table lookup
	word32 crc = m_crc;

	for(; !IsAligned<word32>(s) && n > 0; n--)
		crc = m_tab[CRC32_INDEX(crc) ^ *s++] ^ CRC32_SHIFTED(crc);

	// Process four bytes per iteration: fold in a word, then four table steps
	while (n >= 4)
	{
		crc ^= *(const word32 *)(void*)s;
		crc = m_tab[CRC32_INDEX(crc)] ^ CRC32_SHIFTED(crc);
		crc = m_tab[CRC32_INDEX(crc)] ^ CRC32_SHIFTED(crc);
		crc = m_tab[CRC32_INDEX(crc)] ^ CRC32_SHIFTED(crc);
		crc = m_tab[CRC32_INDEX(crc)] ^ CRC32_SHIFTED(crc);
		n -= 4;
		s += 4;
	}

	while (n--)
		crc = m_tab[CRC32_INDEX(crc) ^ *s++] ^ CRC32_SHIFTED(crc);

	m_crc = crc;
}
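A hedged usage sketch (assuming a recent Crypto++ where CRC32C is declared in crc.h and byte lives in the CryptoPP namespace): the Update() shown above is normally driven through the standard HashTransformation interface.

#include <cryptopp/crc.h>
#include <cstdio>
#include <vector>

int main()
{
	const CryptoPP::byte data[] = { 'h', 'e', 'l', 'l', 'o' };

	CryptoPP::CRC32C crc;
	crc.Update(data, sizeof(data));

	// Retrieve the 4-byte digest through the generic hash interface
	std::vector<CryptoPP::byte> digest(crc.DigestSize());
	crc.Final(digest.data());

	for (CryptoPP::byte b : digest)
		std::printf("%02x", b);
	std::printf("\n");
	return 0;
}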
Example #4
/**
 * Calculates the CRC-32C of the given buffer, continuing from the supplied
 * CRC value
 *
 * @return A 32-bit unsigned integer representing the updated CRC
 */
uint32_t calculateCRC32C(uint32_t crc, const char *buf, size_t len) {
    // Nothing to process; return the CRC unchanged
    if (len == 0) {
        return crc;
    }


    // Invert the incoming CRC (XOR with 0xFFFFFFFF), per the CRC-32C convention
    crc ^= 0xFFFFFFFF;


    // Align the input to the word boundary
    for (; (len > 0) && ((size_t)buf & ALIGN_MASK); len--, buf++) {
        crc = _mm_crc32_u8(crc, *buf);
    }


    // Run the CRC over the widest chunks available, then the shorter tails
#ifdef __x86_64__
    CALC_CRC(_mm_crc32_u64, crc, uint64_t, buf, len);
#endif
    CALC_CRC(_mm_crc32_u32, crc, uint32_t, buf, len);
    CALC_CRC(_mm_crc32_u16, crc, uint16_t, buf, len);
    CALC_CRC(_mm_crc32_u8, crc, uint8_t, buf, len);


    // Invert again to produce the final CRC-32C value
    return crc ^ 0xFFFFFFFF;
}
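The snippet above relies on an ALIGN_MASK constant and a CALC_CRC macro that are not shown. A hedged sketch of definitions matching the call sites (hypothetical; the project's actual macros may differ): CALC_CRC consumes the buffer in chunks of the given type for as long as enough bytes remain, and ALIGN_MASK masks off the low address bits used by the byte-wise alignment loop.

#define ALIGN_MASK (sizeof(void *) - 1)

#define CALC_CRC(op, crc, type, buf, len)                                   \
    do {                                                                    \
        for (; (len) >= sizeof(type); (len) -= sizeof(type),                \
                                      (buf) += sizeof(type)) {              \
            (crc) = op((crc), *(const type *)(const void *)(buf));          \
        }                                                                   \
    } while (0)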
Example #5
ZIX_API uint32_t
zix_digest_add(uint32_t hash, const void* const buf, const size_t len)
{
	const uint8_t* str = (const uint8_t*)buf;
#ifdef __SSE4_2__
	// SSE 4.2 CRC32
	for (size_t i = 0; i < (len / sizeof(uint32_t)); ++i) {
		hash = _mm_crc32_u32(hash, *(const uint32_t*)str);
		str += sizeof(uint32_t);
	}
	if (len & sizeof(uint16_t)) {
		hash = _mm_crc32_u16(hash, *(const uint16_t*)str);
		str += sizeof(uint16_t);
	}
	if (len & sizeof(uint8_t)) {
		hash = _mm_crc32_u8(hash, *(const uint8_t*)str);
	}
#else
	// Classic DJB hash
	for (size_t i = 0; i < len; ++i) {
		hash = (hash << 5) + hash + str[i];
	}
#endif
	return hash;
}
Example #6
pg_crc32c
pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
{
	const unsigned char *p = static_cast<const unsigned char *>(data);
	const unsigned char *pend = p + len;

	/*
	 * Process eight bytes of data at a time.
	 *
	 * NB: We do unaligned accesses here. The Intel architecture allows that,
	 * and performance testing didn't show any performance gain from aligning
	 * the begin address.
	 */
#ifdef __x86_64__
	while (p + 8 <= pend)
	{
		crc = (uint32) _mm_crc32_u64(crc, *((const uint64 *) p));
		p += 8;
	}

	/* Process remaining full four bytes if any */
	if (p + 4 <= pend)
	{
		crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
		p += 4;
	}
#else
	/*
	 * Process four bytes at a time. (The eight byte instruction is not
	 * available on the 32-bit x86 architecture).
	 */
	while (p + 4 <= pend)
	{
		crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
		p += 4;
	}
#endif /* __x86_64__ */

	/* Process any remaining bytes one at a time. */
	while (p < pend)
	{
		crc = _mm_crc32_u8(crc, *p);
		p++;
	}

	return crc;
}
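A hedged usage sketch (assuming the PostgreSQL source tree headers): callers do not normally invoke pg_comp_crc32c_sse42 directly; they go through the macros in src/include/port/pg_crc32c.h, which supply the standard CRC-32C pre- and post-inversion and dispatch to the SSE 4.2 routine when the CPU supports it.

#include "postgres.h"
#include "port/pg_crc32c.h"

static pg_crc32c
checksum_buffer(const void *buf, size_t len)
{
	pg_crc32c	crc;

	INIT_CRC32C(crc);			/* seed with 0xFFFFFFFF */
	COMP_CRC32C(crc, buf, len);	/* dispatches to the fastest available implementation */
	FIN_CRC32C(crc);			/* final xor with 0xFFFFFFFF */
	return crc;
}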
Example #7
File: main.cpp Project: nongli/Test
int CrcHash(const void* data, int bytes, int hash) {
  // Process as many full 32-bit words as possible
  int words = bytes / 4;
  bytes = bytes % 4;

  const int* p = reinterpret_cast<const int*>(data);
  while (words--) {
    hash = _mm_crc32_u32(hash, *p);
    ++p;
  }

  // Fold in the remaining 0-3 tail bytes
  const char* s = reinterpret_cast<const char*>(p);
  while (bytes--) {
    hash = _mm_crc32_u8(hash, *s);
    ++s;
  }

  return hash;
} 
Example #8
unsigned int test_mm_crc32_u8(unsigned int CRC, unsigned char V) {
  // CHECK-LABEL: test_mm_crc32_u8
  // CHECK: call i32 @llvm.x86.sse42.crc32.32.8(i32 %{{.*}}, i8 %{{.*}})
  return _mm_crc32_u8(CRC, V);
}
Example #9
unsigned int __attribute__((__target__("sse4.2"))) mm_crc32_u8_wrap(unsigned int c, unsigned char d) {
  return _mm_crc32_u8(c, d);
}
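A minimal sketch (not from the original source) of why such a wrapper is useful: with the target attribute, only this one function is compiled with SSE 4.2 code generation, so the rest of the translation unit can stay baseline and the caller can guard the call with a runtime CPU check (GCC/Clang __builtin_cpu_supports). The bit-by-bit fallback below uses the reflected CRC-32C polynomial 0x82F63B78, matching the semantics of _mm_crc32_u8.

unsigned int __attribute__((__target__("sse4.2"))) mm_crc32_u8_wrap(unsigned int c, unsigned char d);

unsigned int crc32c_byte(unsigned int c, unsigned char d) {
  if (__builtin_cpu_supports("sse4.2"))
    return mm_crc32_u8_wrap(c, d);

  // Software fallback: one byte, bit at a time, reflected CRC-32C
  c ^= d;
  for (int i = 0; i < 8; ++i)
    c = (c >> 1) ^ (0x82F63B78u & (0u - (c & 1u)));
  return c;
}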
Example #10
SIMD_INLINE void Crc32c(size_t & crc, const uint8_t * p, const uint8_t * end)
{
    while(p < end)
        crc = _mm_crc32_u8((uint32_t)crc, *p++);
}
Example #11
void libmaus::digest::CRC32C_sse42::update(uint8_t const * t, size_t l) 
{
	#if defined(LIBMAUS_HAVE_SMMINTRIN_H) && defined(LIBMAUS_USE_ASSEMBLY) && defined(LIBMAUS_HAVE_x86_64) && defined(LIBMAUS_HAVE_i386)
	ctx = ~ctx;
		
	size_t const offset = reinterpret_cast<size_t>(t);
		
	// check for 3 LSB
	if ( offset & 7 )
	{
		// check for LSB
		if ( (offset & 1) && l )
		{
			ctx = _mm_crc32_u8(ctx, *t);
			t += 1;
			l -= 1;
		}
		// check for 2nd LSB
		if ( (offset & 2) && (l>=2) )
		{
			ctx = _mm_crc32_u16(ctx, *reinterpret_cast<uint16_t const *>(t));
			t += 2;
			l -= 2;			
		}	
		// check for 3rd LSB
		if ( (offset & 4) && l >= 4 )
		{
			ctx = _mm_crc32_u32(ctx, *reinterpret_cast<uint32_t const *>(t));
			t += 4;
			l -= 4;			
		}
	}
	
	uint64_t const * t64 = reinterpret_cast<uint64_t const *>(t);
	uint64_t const * const t64e = t64 + (l>>3);
	
	while ( t64 != t64e )
		ctx = _mm_crc32_u64(ctx, *(t64++));
		
	l &= 7;
	t = reinterpret_cast<uint8_t const *>(t64);
	
	if ( l >= 4 )
	{
		ctx = _mm_crc32_u32(ctx, *reinterpret_cast<uint32_t const *>(t));
		t += 4;
		l -= 4;	
	}
	if ( l >= 2 )
	{
		ctx = _mm_crc32_u16(ctx, *reinterpret_cast<uint16_t const *>(t));
		t += 2;
		l -= 2;	
	}
	if ( l )
	{
		ctx = _mm_crc32_u8(ctx, *t);
	}
	
	ctx = ~ctx;
	#endif
}
Example #12
uint32_t
sse42_crc32c(uint32_t crc, const unsigned char *buf, unsigned len)
{
#ifdef __amd64__
	const size_t align = 8;
#else
	const size_t align = 4;
#endif
	const unsigned char *next, *end;
#ifdef __amd64__
	uint64_t crc0, crc1, crc2;
#else
	uint32_t crc0, crc1, crc2;
#endif

	next = buf;
	crc0 = crc;

	/* Compute the crc to bring the data pointer to an aligned boundary. */
	while (len && ((uintptr_t)next & (align - 1)) != 0) {
		crc0 = _mm_crc32_u8(crc0, *next);
		next++;
		len--;
	}

#if LONG > SHORT
	/*
	 * Compute the crc on sets of LONG*3 bytes, executing three independent
	 * crc instructions, each on LONG bytes -- this is optimized for the
	 * Nehalem, Westmere, Sandy Bridge, and Ivy Bridge architectures, which
	 * have a throughput of one crc per cycle, but a latency of three
	 * cycles.
	 */
	crc = 0;
	while (len >= LONG * 3) {
		crc1 = 0;
		crc2 = 0;
		end = next + LONG;
		do {
#ifdef __amd64__
			crc0 = _mm_crc32_u64(crc0, *(const uint64_t *)next);
			crc1 = _mm_crc32_u64(crc1,
			    *(const uint64_t *)(next + LONG));
			crc2 = _mm_crc32_u64(crc2,
			    *(const uint64_t *)(next + (LONG * 2)));
#else
			crc0 = _mm_crc32_u32(crc0, *(const uint32_t *)next);
			crc1 = _mm_crc32_u32(crc1,
			    *(const uint32_t *)(next + LONG));
			crc2 = _mm_crc32_u32(crc2,
			    *(const uint32_t *)(next + (LONG * 2)));
#endif
			next += align;
		} while (next < end);
		/*-
		 * Update the crc.  Try to do it in parallel with the inner
		 * loop.  'crc' is used to accumulate crc0 and crc1
		 * produced by the inner loop so that the next iteration
		 * of the loop doesn't depend on anything except crc2.
		 *
		 * The full expression for the update is:
		 *     crc = S*S*S*crc + S*S*crc0 + S*crc1
		 * where the terms are polynomials modulo the CRC polynomial.
		 * We regroup this subtly as:
		 *     crc = S*S * (S*crc + crc0) + S*crc1.
		 * This has an extra dependency which reduces possible
		 * parallelism for the expression, but it turns out to be
		 * best to intentionally delay evaluation of this expression
		 * so that it competes less with the inner loop.
		 *
		 * We also intentionally reduce parallelism by feeding back
		 * crc2 to the inner loop as crc0 instead of accumulating
		 * it in crc.  This synchronizes the loop with crc update.
		 * CPU and/or compiler schedulers produced bad order without
		 * this.
		 *
		 * Shifts take about 12 cycles each, so 3 here with 2
		 * parallelizable take about 24 cycles and the crc update
		 * takes slightly longer.  8 dependent crc32 instructions
		 * can run in 24 cycles, so the 3-way blocking is worse
		 * than useless for sizes less than 8 * <word size> = 64
		 * on amd64.  In practice, SHORT = 32 confirms these
		 * timing calculations by giving a small improvement
		 * starting at size 96.  Then the inner loop takes about
		 * 12 cycles and the crc update about 24, but these are
		 * partly in parallel so the total time is less than the
		 * 36 cycles that 12 dependent crc32 instructions would
		 * take.
		 *
		 * To have a chance of completely hiding the overhead for
		 * the crc update, the inner loop must take considerably
		 * longer than 24 cycles.  LONG = 64 makes the inner loop
		 * take about 24 cycles, so is not quite large enough.
		 * LONG = 128 works OK.  Unhideable overheads are about
		 * 12 cycles per inner loop.  All assuming timing like
		 * Haswell.
		 */
		crc = crc32c_shift(crc32c_long, crc) ^ crc0;
		crc1 = crc32c_shift(crc32c_long, crc1);
		crc = crc32c_shift(crc32c_2long, crc) ^ crc1;
		crc0 = crc2;
		next += LONG * 2;
		len -= LONG * 3;
	}
	crc0 ^= crc;
#endif /* LONG > SHORT */

	/*
	 * Do the same thing, but now on SHORT*3 blocks for the remaining data
	 * less than a LONG*3 block
	 */
	crc = 0;
	while (len >= SHORT * 3) {
		crc1 = 0;
		crc2 = 0;
		end = next + SHORT;
		do {
#ifdef __amd64__
			crc0 = _mm_crc32_u64(crc0, *(const uint64_t *)next);
			crc1 = _mm_crc32_u64(crc1,
			    *(const uint64_t *)(next + SHORT));
			crc2 = _mm_crc32_u64(crc2,
			    *(const uint64_t *)(next + (SHORT * 2)));
#else
			crc0 = _mm_crc32_u32(crc0, *(const uint32_t *)next);
			crc1 = _mm_crc32_u32(crc1,
			    *(const uint32_t *)(next + SHORT));
			crc2 = _mm_crc32_u32(crc2,
			    *(const uint32_t *)(next + (SHORT * 2)));
#endif
			next += align;
		} while (next < end);
		crc = crc32c_shift(crc32c_short, crc) ^ crc0;
		crc1 = crc32c_shift(crc32c_short, crc1);
		crc = crc32c_shift(crc32c_2short, crc) ^ crc1;
		crc0 = crc2;
		next += SHORT * 2;
		len -= SHORT * 3;
	}
	crc0 ^= crc;

	/* Compute the crc on the remaining bytes at native word size. */
	end = next + (len - (len & (align - 1)));
	while (next < end) {
#ifdef __amd64__
		crc0 = _mm_crc32_u64(crc0, *(const uint64_t *)next);
#else
		crc0 = _mm_crc32_u32(crc0, *(const uint32_t *)next);
#endif
		next += align;
	}
	len &= (align - 1);

	/* Compute the crc for any trailing bytes. */
	while (len) {
		crc0 = _mm_crc32_u8(crc0, *next);
		next++;
		len--;
	}

	return ((uint32_t)crc0);
}
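The crc32c_shift() helper and its crc32c_long/crc32c_2long/crc32c_short/crc32c_2short tables are not shown above. A hedged sketch of how they are typically built, following Mark Adler's public-domain crc32c.c from which this style of three-way implementation is commonly derived (names chosen to match the call sites; the LONG = 128 and SHORT = 32 block sizes come from the comments above): the tables encode, as a GF(2) linear operator, the effect of running a CRC-32C over a block of zero bytes, which is what lets the three independently computed stream CRCs be stitched back together.

#include <stddef.h>
#include <stdint.h>

#define CRC32C_POLY 0x82f63b78u		/* reflected CRC-32C polynomial */

/* Apply a 32x32 GF(2) matrix (one column per input bit) to a 32-bit vector. */
static uint32_t
gf2_matrix_times(const uint32_t *mat, uint32_t vec)
{
	uint32_t sum = 0;

	while (vec) {
		if (vec & 1)
			sum ^= *mat;
		vec >>= 1;
		mat++;
	}
	return (sum);
}

/* square = mat * mat */
static void
gf2_matrix_square(uint32_t *square, const uint32_t *mat)
{
	for (int n = 0; n < 32; n++)
		square[n] = gf2_matrix_times(mat, mat[n]);
}

/*
 * Build the operator that advances a CRC-32C over len zero bytes, by
 * repeated squaring of the one-zero-bit operator.  This construction
 * assumes len is a power of two, which holds for the block sizes used here.
 */
static void
crc32c_zeros_op(uint32_t *even, size_t len)
{
	uint32_t odd[32];
	uint32_t row = 1;

	odd[0] = CRC32C_POLY;			/* operator for one zero bit */
	for (int n = 1; n < 32; n++) {
		odd[n] = row;
		row <<= 1;
	}
	gf2_matrix_square(even, odd);		/* two zero bits */
	gf2_matrix_square(odd, even);		/* four zero bits */
	do {
		gf2_matrix_square(even, odd);	/* 1, 4, 16, ... zero bytes */
		len >>= 1;
		if (len == 0)
			return;
		gf2_matrix_square(odd, even);	/* 2, 8, 32, ... zero bytes */
		len >>= 1;
	} while (len);
	for (int n = 0; n < 32; n++)		/* answer ended up in odd */
		even[n] = odd[n];
}

/* Expand the operator into four byte-indexed tables for fast application. */
static void
crc32c_zeros(uint32_t zeros[][256], size_t len)
{
	uint32_t op[32];

	crc32c_zeros_op(op, len);
	for (uint32_t n = 0; n < 256; n++) {
		zeros[0][n] = gf2_matrix_times(op, n);
		zeros[1][n] = gf2_matrix_times(op, n << 8);
		zeros[2][n] = gf2_matrix_times(op, n << 16);
		zeros[3][n] = gf2_matrix_times(op, n << 24);
	}
}

/* Advance a CRC over a block of zeros using the precomputed tables. */
static uint32_t
crc32c_shift(uint32_t zeros[][256], uint32_t crc)
{
	return (zeros[0][crc & 0xff] ^ zeros[1][(crc >> 8) & 0xff] ^
	    zeros[2][(crc >> 16) & 0xff] ^ zeros[3][crc >> 24]);
}

/* Hypothetical one-time table setup for LONG = 128 and SHORT = 32. */
static uint32_t crc32c_long[4][256], crc32c_2long[4][256];
static uint32_t crc32c_short[4][256], crc32c_2short[4][256];

static void
crc32c_init_tables(void)
{
	crc32c_zeros(crc32c_long, 128);
	crc32c_zeros(crc32c_2long, 256);
	crc32c_zeros(crc32c_short, 32);
	crc32c_zeros(crc32c_2short, 64);
}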