예제 #1
0
파일: digest.c 프로젝트: AndreeeCZ/muse
/**
   Fold `len` bytes starting at `buf` into the running digest `hash`.

   When built with SSE 4.2 the hardware CRC32 instructions are chained over
   the data; otherwise the classic DJB (times 33) hash is used.  The two
   variants produce different values for the same input.

   @param hash Digest accumulated so far (the seed on the first call).
   @param buf Data to hash.
   @param len Size of `buf` in bytes.
   @return The updated digest.
*/
ZIX_API uint32_t
zix_digest_add(uint32_t hash, const void* const buf, const size_t len)
{
	const uint8_t* cursor = (const uint8_t*)buf;
#ifdef __SSE4_2__
	// SSE 4.2 CRC32: whole 32-bit words first
	for (size_t words = len / sizeof(uint32_t); words > 0; --words) {
		hash = _mm_crc32_u32(hash, *(const uint32_t*)cursor);
		cursor += sizeof(uint32_t);
	}

	// Bit 1 of len set: a 16-bit piece is left over
	if ((len & sizeof(uint16_t)) != 0) {
		hash = _mm_crc32_u16(hash, *(const uint16_t*)cursor);
		cursor += sizeof(uint16_t);
	}

	// Bit 0 of len set: one final byte
	if ((len & sizeof(uint8_t)) != 0) {
		hash = _mm_crc32_u8(hash, *cursor);
	}
#else
	// Classic DJB hash: hash = hash * 33 + byte
	for (size_t left = len; left > 0; --left) {
		hash = (hash * 33u) + *cursor++;
	}
#endif
	return hash;
}
예제 #2
0
unsigned int CRC32C(unsigned int length, const unsigned char* value)
{
	unsigned int hash_value = 0;

	if (length == 1)
		return _mm_crc32_u8(hash_value, *value);

	if (length == 2)
		return _mm_crc32_u16(hash_value, *(unsigned short*) value);

	while (length >= 4)
	{
		hash_value = _mm_crc32_u32(hash_value, *(unsigned int*) value);
		value += 4;
		length -= 4;
	}

	if (length >= 2)
	{
		hash_value = _mm_crc32_u16(hash_value, *(unsigned short*) value);
		value += 2;
		length -= 2;
	}

	if (length)
	{
		hash_value = _mm_crc32_u8(hash_value, *value);
	}

	return hash_value;
}
예제 #3
0
/*
 * Fold `len` bytes at `data` into the running value `crc` with the SSE4.2
 * crc32 instructions and return the updated value.
 */
pg_crc32c
pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
{
	const unsigned char *cursor = static_cast<const unsigned char *>(data);
	size_t		remaining = len;

	/*
	 * The wide loads below are unaligned.  The Intel architecture permits
	 * that, and per the original authors' measurements aligning the start
	 * address gave no performance gain.
	 */
#ifdef __x86_64__
	/* Eight bytes per instruction while at least eight remain. */
	for (; remaining >= 8; remaining -= 8, cursor += 8)
		crc = (uint32) _mm_crc32_u64(crc, *((const uint64 *) cursor));

	/* At most one full four-byte word can be left after that. */
	if (remaining >= 4)
	{
		crc = _mm_crc32_u32(crc, *((const unsigned int *) cursor));
		cursor += 4;
		remaining -= 4;
	}
#else
	/*
	 * The eight-byte instruction does not exist on 32-bit x86, so go four
	 * bytes at a time.
	 */
	for (; remaining >= 4; remaining -= 4, cursor += 4)
		crc = _mm_crc32_u32(crc, *((const unsigned int *) cursor));
#endif /* __x86_64__ */

	/* Whatever is left (fewer than four bytes) goes one byte at a time. */
	for (; remaining > 0; --remaining, ++cursor)
		crc = _mm_crc32_u8(crc, *cursor);

	return crc;
}
예제 #4
0
파일: containers.hpp 프로젝트: gqmelo/mesa
 // Hash functor: chains _mm_crc32_u32 over the raw storage of `k`, one
 // UINT-sized word at a time, starting from 0. Only whole words are read;
 // if sizeof(T) is not a multiple of sizeof(UINT) the trailing bytes do
 // not contribute to the result.
 UINT operator()(const T& k) const
 {
     const UINT* words = reinterpret_cast<const UINT*>(&k);
     UINT hash = 0;
     for (UINT idx = 0; idx != sizeof(T) / sizeof(UINT); ++idx)
     {
         hash = _mm_crc32_u32(hash, words[idx]);
     }
     return hash;
 }
예제 #5
0
void CRC32C_Update_SSE42(const byte *s, size_t n, word32& c)
{
    for(; !IsAligned<word32>(s) && n > 0; s++, n--)
        c = _mm_crc32_u8(c, *s);

    for(; n > 4; s+=4, n-=4)
        c = _mm_crc32_u32(c, *(const word32 *)(void*)s);

    for(; n > 0; s++, n--)
        c = _mm_crc32_u8(c, *s);
}
예제 #6
0
        // Folds every size_t word in [p, end) into the running value `crc`
        // (updated in place), using the 64-bit crc32 instruction when
        // SIMD_X64_ENABLE is defined and the 32-bit one otherwise.
        SIMD_INLINE void Crc32c(size_t & crc, const size_t * p, const size_t * end)
        {
            for (; p < end; ++p)
            {
#ifdef SIMD_X64_ENABLE
                crc = _mm_crc32_u64(crc, *p);
#else
                crc = _mm_crc32_u32(crc, *p);
#endif
            }
        }
예제 #7
0
// Feeds the n bytes at s into the running checksum m_crc.
//
// Three implementations are selected between at build/run time:
//   * SSE4.2 crc32 instructions, when compiled in and HasSSE4() is true;
//   * ARM CRC32 instructions, when compiled in and HasCRC32() is true;
//   * otherwise a lookup-table implementation using m_tab.
// All three share the same shape: single bytes until s is word32-aligned,
// then 32-bit words, then the remaining bytes.
void CRC32C::Update(const byte *s, size_t n)
{
#if CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE
	if (HasSSE4())
	{
		for(; !IsAligned<word32>(s) && n > 0; s++, n--)
			m_crc = _mm_crc32_u8(m_crc, *s);

		// n >= 4, matching the table-driven loop below: a remainder of
		// exactly four bytes is consumed as one word, not four bytes.
		for(; n >= 4; s+=4, n-=4)
			m_crc = _mm_crc32_u32(m_crc, *(const word32 *)(void*)s);

		for(; n > 0; s++, n--)
			m_crc = _mm_crc32_u8(m_crc, *s);

		return;
	}
#elif (CRYPTOPP_BOOL_ARM_CRC32_INTRINSICS_AVAILABLE)
	if (HasCRC32())
	{
		for(; !IsAligned<word32>(s) && n > 0; s++, n--)
			m_crc = __crc32cb(m_crc, *s);

		// Same bound as the other two implementations.
		for(; n >= 4; s+=4, n-=4)
			m_crc = __crc32cw(m_crc, *(const word32 *)(void*)s);

		for(; n > 0; s++, n--)
			m_crc = __crc32cb(m_crc, *s);

		return;
	}
#endif

	// Table-driven fallback; work on a local copy and store it back once.
	word32 crc = m_crc;

	for(; !IsAligned<word32>(s) && n > 0; n--)
		crc = m_tab[CRC32_INDEX(crc) ^ *s++] ^ CRC32_SHIFTED(crc);

	while (n >= 4)
	{
		// Mix a whole word in, then run four table steps, one per byte.
		crc ^= *(const word32 *)(void*)s;
		crc = m_tab[CRC32_INDEX(crc)] ^ CRC32_SHIFTED(crc);
		crc = m_tab[CRC32_INDEX(crc)] ^ CRC32_SHIFTED(crc);
		crc = m_tab[CRC32_INDEX(crc)] ^ CRC32_SHIFTED(crc);
		crc = m_tab[CRC32_INDEX(crc)] ^ CRC32_SHIFTED(crc);
		n -= 4;
		s += 4;
	}

	while (n--)
		crc = m_tab[CRC32_INDEX(crc) ^ *s++] ^ CRC32_SHIFTED(crc);

	m_crc = crc;
}
예제 #8
0
void increment_sse42(float arr[4]) {
    ALIGN_16 double darr[4];
    __m128d val1 = _mm_set_pd(arr[0], arr[1]);
    __m128d val2 = _mm_set_pd(arr[2], arr[3]);
    __m128d one = _mm_set_pd(1.0, 1.0);
    __m128d result = _mm_add_pd(val1, one);
    _mm_store_pd(darr, result);
    result = _mm_add_pd(val2, one);
    _mm_store_pd(&darr[2], result);
    _mm_crc32_u32(42, 99); /* A no-op, only here to use an SSE4.2 instruction. */
    arr[0] = (float)darr[1];
    arr[1] = (float)darr[0];
    arr[2] = (float)darr[3];
    arr[3] = (float)darr[2];
}
예제 #9
0
파일: main.cpp 프로젝트: nongli/Test
int CrcHash(const void* data, int bytes, int hash) {
  int words = bytes / 4;
  bytes = bytes % 4;

  const int* p = reinterpret_cast<const int*>(data);
  while (words--) {
    hash = _mm_crc32_u32(hash, *p);
    ++p;
  }

  const char* s = reinterpret_cast<const char*>(p);
  while (bytes--) {
    hash = _mm_crc32_u8(hash, *s);
    ++s;
  }

  return hash;
} 
예제 #10
0
// Thin wrapper that returns _mm_crc32_u32(CRC, V) unchanged.
// NOTE(review): the two comments inside the body look like LLVM FileCheck
// directives that pin the IR call emitted for the intrinsic; the RUN lines
// that would confirm this are not visible here. Keep them and the body as-is.
unsigned int test_mm_crc32_u32(unsigned int CRC, unsigned int V) {
  // CHECK-LABEL: test_mm_crc32_u32
  // CHECK: call i32 @llvm.x86.sse42.crc32.32.32(i32 %{{.*}}, i32 %{{.*}})
  return _mm_crc32_u32(CRC, V);
}
예제 #11
0
int is_false_positive(struct tuple4 addr, idx_type tcb_index, TCP_THREAD_LOCAL_P tcp_thread_local_p)
{
	struct tcp_stream *tcb_p = &(tcp_thread_local_p->tcb_array[tcb_index]);
	if (!((addr.source == tcb_p->addr.source &&
		addr.dest == tcb_p->addr.dest &&
		addr.saddr == tcb_p->addr.saddr &&
		addr.daddr == tcb_p->addr.daddr ) ||
		(addr.dest == tcb_p->addr.source &&
		addr.source == tcb_p->addr.dest &&
		addr.daddr == tcb_p->addr.saddr &&
		addr.saddr == tcb_p->addr.daddr ))) {

		// Yes, it is false positive
#if defined(DEBUG)
		tcp_test[tcp_thread_local_p->self_cpu_id].false_positive ++;
#endif

#if 0		
		int sign2 = calc_signature(
				tcb_p->addr.saddr,
				tcb_p->addr.daddr,
				tcb_p->addr.source,
				tcb_p->addr.dest);
		printf("||the Founded one in the table: Sip: %d.%d.%d.%d, Sport:%d, Dip : %d.%d.%d.%d, Dport:%d , sign = %x\n", 
				tcb_p->addr.saddr & 0x000000FF,
				(tcb_p->addr.saddr & 0x0000FF00)>>8,
				(tcb_p->addr.saddr & 0x00FF0000)>>16,
				(tcb_p->addr.saddr & 0xFF000000)>>24,
				tcb_p->addr.source,
				tcb_p->addr.daddr & 0x000000FF,
				(tcb_p->addr.daddr & 0x0000FF00)>>8,
				(tcb_p->addr.daddr & 0x00FF0000)>>16,
				(tcb_p->addr.daddr & 0xFF000000)>>24,
				tcb_p->addr.dest,
				sign2
		      );
		int crc1 = 0;
		crc1 = _mm_crc32_u32(crc1, tcb_p->addr.saddr);
		crc1 = _mm_crc32_u32(crc1, tcb_p->addr.daddr);
		crc1 = _mm_crc32_u32(crc1, tcb_p->addr.source ^ tcb_p->addr.dest);
		printf("(%x", crc1);
		crc1 = 0;
		crc1 = _mm_crc32_u32(crc1, tcb_p->addr.daddr);
		crc1 = _mm_crc32_u32(crc1, tcb_p->addr.saddr);
		crc1 = _mm_crc32_u32(crc1, tcb_p->addr.source ^ tcb_p->addr.dest);
		printf("--  %x)\n", crc1);
		sign2 = calc_signature(
				addr.saddr,
				addr.daddr,
				addr.source,
				addr.dest);
		printf("Current one: Sip: %d.%d.%d.%d, Sport:%d, Dip : %d.%d.%d.%d, Dport:%d , sign = %x||\n", 
				addr.saddr & 0x000000FF,
				(addr.saddr & 0x0000FF00)>>8,
				(addr.saddr & 0x00FF0000)>>16,
				(addr.saddr & 0xFF000000)>>24,
				addr.source,
				addr.daddr & 0x000000FF,
				(addr.daddr & 0x0000FF00)>>8,
				(addr.daddr & 0x00FF0000)>>16,
				(addr.daddr & 0xFF000000)>>24,
				addr.dest,
				sign2
		      );
		crc1 = 0;
		crc1 = _mm_crc32_u32(crc1, addr.saddr);
		crc1 = _mm_crc32_u32(crc1, addr.daddr);
		crc1 = _mm_crc32_u32(crc1, addr.source ^ addr.dest);
		printf("(%x", crc1);
		crc1 = 0;
		crc1 = _mm_crc32_u32(crc1, addr.daddr);
		crc1 = _mm_crc32_u32(crc1, addr.saddr);
		crc1 = _mm_crc32_u32(crc1, addr.source ^ addr.dest);
		printf("--  %x)\n", crc1);
#endif

		return 1;
	} else {
예제 #12
0
/**
 * Feed l bytes starting at t into the running checksum ctx using the
 * SSE4.2 crc32 instructions.
 *
 * ctx is bit-inverted on entry and again on exit, so the member holds the
 * inverted form of the raw accumulator between calls.
 *
 * The input is consumed as: an alignment prologue (at most one byte, one
 * 16-bit and one 32-bit piece), then 64-bit words, then a tail of at most
 * one 32-bit, one 16-bit and one byte piece.
 *
 * When the configuration macros tested below are not all defined the body
 * is compiled out and the call does nothing.
 *
 * @param t input bytes
 * @param l number of bytes at t
 **/
void libmaus::digest::CRC32C_sse42::update(uint8_t const * t, size_t l)
{
	#if defined(LIBMAUS_HAVE_SMMINTRIN_H) && defined(LIBMAUS_USE_ASSEMBLY) && defined(LIBMAUS_HAVE_x86_64) && defined(LIBMAUS_HAVE_i386)
	ctx = ~ctx;

	// Alignment prologue. Every step tests the CURRENT address of t: the
	// previous step may have carried into the next address bit (e.g. an
	// address ending in binary 001 becomes 010 after one byte), so testing
	// the address t had on entry would leave t misaligned for the 64-bit
	// loop below.

	// LSB set: consume one byte to reach 2-byte alignment
	if ( (reinterpret_cast<size_t>(t) & 1) && l )
	{
		ctx = _mm_crc32_u8(ctx, *t);
		t += 1;
		l -= 1;
	}
	// 2nd LSB set: consume two bytes to reach 4-byte alignment
	if ( (reinterpret_cast<size_t>(t) & 2) && (l >= 2) )
	{
		ctx = _mm_crc32_u16(ctx, *reinterpret_cast<uint16_t const *>(t));
		t += 2;
		l -= 2;
	}
	// 3rd LSB set: consume four bytes to reach 8-byte alignment
	if ( (reinterpret_cast<size_t>(t) & 4) && (l >= 4) )
	{
		ctx = _mm_crc32_u32(ctx, *reinterpret_cast<uint32_t const *>(t));
		t += 4;
		l -= 4;
	}

	// Main loop: l>>3 whole 64-bit words
	uint64_t const * t64 = reinterpret_cast<uint64_t const *>(t);
	uint64_t const * const t64e = t64 + (l>>3);

	while ( t64 != t64e )
		ctx = _mm_crc32_u64(ctx, *(t64++));

	// Tail: fewer than eight bytes are left
	l &= 7;
	t = reinterpret_cast<uint8_t const *>(t64);

	if ( l >= 4 )
	{
		ctx = _mm_crc32_u32(ctx, *reinterpret_cast<uint32_t const *>(t));
		t += 4;
		l -= 4;
	}
	if ( l >= 2 )
	{
		ctx = _mm_crc32_u16(ctx, *reinterpret_cast<uint16_t const *>(t));
		t += 2;
		l -= 2;
	}
	if ( l )
	{
		ctx = _mm_crc32_u8(ctx, *t);
	}

	ctx = ~ctx;
	#endif
}
예제 #13
0
/*
 * Update the running value `crc` with `len` bytes at `buf` using the SSE4.2
 * crc32 instructions, and return the new value.
 *
 * The input is consumed in stages: single bytes until the read pointer is
 * aligned to the native word size (8 bytes under __amd64__, otherwise 4),
 * then LONG*3-byte blocks (only when LONG > SHORT), then SHORT*3-byte
 * blocks, then whole native words, then any trailing bytes.
 *
 * LONG, SHORT, crc32c_shift() and the crc32c_long / crc32c_2long /
 * crc32c_short / crc32c_2short operands are defined outside this block.
 *
 * NOTE(review): no initial or final inversion of the value happens here;
 * presumably the caller is responsible for that -- confirm.
 */
uint32_t
sse42_crc32c(uint32_t crc, const unsigned char *buf, unsigned len)
{
#ifdef __amd64__
	const size_t align = 8;
#else
	const size_t align = 4;
#endif
	const unsigned char *next, *end;
#ifdef __amd64__
	uint64_t crc0, crc1, crc2;
#else
	uint32_t crc0, crc1, crc2;
#endif

	next = buf;
	crc0 = crc;

	/* Compute the crc to bring the data pointer to an aligned boundary. */
	while (len && ((uintptr_t)next & (align - 1)) != 0) {
		crc0 = _mm_crc32_u8(crc0, *next);
		next++;
		len--;
	}

#if LONG > SHORT
	/*
	 * Compute the crc on sets of LONG*3 bytes, executing three independent
	 * crc instructions, each on LONG bytes -- this is optimized for the
	 * Nehalem, Westmere, Sandy Bridge, and Ivy Bridge architectures, which
	 * have a throughput of one crc per cycle, but a latency of three
	 * cycles.
	 */
	crc = 0;
	while (len >= LONG * 3) {
		crc1 = 0;
		crc2 = 0;
		end = next + LONG;
		/*
		 * Three streams advance in lockstep: crc0 over the first LONG
		 * bytes of the block, crc1 over the second, crc2 over the third.
		 */
		do {
#ifdef __amd64__
			crc0 = _mm_crc32_u64(crc0, *(const uint64_t *)next);
			crc1 = _mm_crc32_u64(crc1,
			    *(const uint64_t *)(next + LONG));
			crc2 = _mm_crc32_u64(crc2,
			    *(const uint64_t *)(next + (LONG * 2)));
#else
			crc0 = _mm_crc32_u32(crc0, *(const uint32_t *)next);
			crc1 = _mm_crc32_u32(crc1,
			    *(const uint32_t *)(next + LONG));
			crc2 = _mm_crc32_u32(crc2,
			    *(const uint32_t *)(next + (LONG * 2)));
#endif
			next += align;
		} while (next < end);
		/*-
		 * Update the crc.  Try to do it in parallel with the inner
		 * loop.  'crc' is used to accumulate crc0 and crc1
		 * produced by the inner loop so that the next iteration
		 * of the loop doesn't depend on anything except crc2.
		 *
		 * The full expression for the update is:
		 *     crc = S*S*S*crc + S*S*crc0 + S*crc1
		 * where the terms are polynomials modulo the CRC polynomial.
		 * We regroup this subtly as:
		 *     crc = S*S * (S*crc + crc0) + S*crc1.
		 * This has an extra dependency which reduces possible
		 * parallelism for the expression, but it turns out to be
		 * best to intentionally delay evaluation of this expression
		 * so that it competes less with the inner loop.
		 *
		 * We also intentionally reduce parallelism by feeding back
		 * crc2 to the inner loop as crc0 instead of accumulating
		 * it in crc.  This synchronizes the loop with crc update.
		 * CPU and/or compiler schedulers produced bad order without
		 * this.
		 *
		 * Shifts take about 12 cycles each, so 3 here with 2
		 * parallelizable take about 24 cycles and the crc update
		 * takes slightly longer.  8 dependent crc32 instructions
		 * can run in 24 cycles, so the 3-way blocking is worse
		 * than useless for sizes less than 8 * <word size> = 64
		 * on amd64.  In practice, SHORT = 32 confirms these
		 * timing calculations by giving a small improvement
		 * starting at size 96.  Then the inner loop takes about
		 * 12 cycles and the crc update about 24, but these are
		 * partly in parallel so the total time is less than the
		 * 36 cycles that 12 dependent crc32 instructions would
		 * take.
		 *
		 * To have a chance of completely hiding the overhead for
		 * the crc update, the inner loop must take considerably
		 * longer than 24 cycles.  LONG = 64 makes the inner loop
		 * take about 24 cycles, so is not quite large enough.
		 * LONG = 128 works OK.  Unhideable overheads are about
		 * 12 cycles per inner loop.  All assuming timing like
		 * Haswell.
		 */
		crc = crc32c_shift(crc32c_long, crc) ^ crc0;
		crc1 = crc32c_shift(crc32c_long, crc1);
		crc = crc32c_shift(crc32c_2long, crc) ^ crc1;
		crc0 = crc2;
		/* The inner loop advanced next by LONG; skip the other two thirds. */
		next += LONG * 2;
		len -= LONG * 3;
	}
	/* Merge the accumulated part back into the live stream. */
	crc0 ^= crc;
#endif /* LONG > SHORT */

	/*
	 * Do the same thing, but now on SHORT*3 blocks for the remaining data
	 * less than a LONG*3 block
	 */
	crc = 0;
	while (len >= SHORT * 3) {
		crc1 = 0;
		crc2 = 0;
		end = next + SHORT;
		do {
#ifdef __amd64__
			crc0 = _mm_crc32_u64(crc0, *(const uint64_t *)next);
			crc1 = _mm_crc32_u64(crc1,
			    *(const uint64_t *)(next + SHORT));
			crc2 = _mm_crc32_u64(crc2,
			    *(const uint64_t *)(next + (SHORT * 2)));
#else
			crc0 = _mm_crc32_u32(crc0, *(const uint32_t *)next);
			crc1 = _mm_crc32_u32(crc1,
			    *(const uint32_t *)(next + SHORT));
			crc2 = _mm_crc32_u32(crc2,
			    *(const uint32_t *)(next + (SHORT * 2)));
#endif
			next += align;
		} while (next < end);
		crc = crc32c_shift(crc32c_short, crc) ^ crc0;
		crc1 = crc32c_shift(crc32c_short, crc1);
		crc = crc32c_shift(crc32c_2short, crc) ^ crc1;
		crc0 = crc2;
		next += SHORT * 2;
		len -= SHORT * 3;
	}
	crc0 ^= crc;

	/* Compute the crc on the remaining bytes at native word size. */
	end = next + (len - (len & (align - 1)));
	while (next < end) {
#ifdef __amd64__
		crc0 = _mm_crc32_u64(crc0, *(const uint64_t *)next);
#else
		crc0 = _mm_crc32_u32(crc0, *(const uint32_t *)next);
#endif
		next += align;
	}
	len &= (align - 1);

	/* Compute the crc for any trailing bytes. */
	while (len) {
		crc0 = _mm_crc32_u8(crc0, *next);
		next++;
		len--;
	}

	/* crc0 is 64 bits wide under __amd64__; return its low 32 bits. */
	return ((uint32_t)crc0);
}