C++ (Cpp) _mm_crc32_u64 예제들

예제 #1

0

파일 보기

파일: tdb_htrie.c 프로젝트: postfix/tempesta

/*
 * Compute a 64-bit hash of the @len bytes at @data.
 *
 * The input is consumed as machine words: even-indexed words feed one CRC32C
 * accumulator (low 32 bits of the result) and odd-indexed words feed a second
 * one (high 32 bits), so the two crc32 chains do not depend on each other.
 * The 0..7 bytes left after the last whole word are added as byte * offset.
 *
 * @data is read through an unsigned long pointer without any alignment
 * fix-up, and the (crc1 << 32) merge assumes a 64-bit unsigned long.
 */
unsigned long
tdb_hash_calc(const char *data, size_t len)
{
#define MUL	sizeof(long)
	size_t i;
	unsigned long crc0 = 0, crc1 = 0, h;
	const unsigned long *d = (const unsigned long *)data;
	size_t n = (len / MUL) & ~1UL;	/* whole words, rounded down to even */

	/* size_t index: an int would overflow and mis-compare on huge @len. */
	for (i = 0; i < n; i += 2) {
		/* See linux/arch/x86/crypto/crc32c-intel.c for CRC32C. */
		crc0 = _mm_crc32_u64(crc0, d[i]);
		crc1 = _mm_crc32_u64(crc1, d[i + 1]);
	}

	/* An odd word count leaves one more whole word; it goes to crc0. */
	if (n * MUL + MUL <= len) {
		crc0 = _mm_crc32_u64(crc0, d[n]);
		n++;
	}

	h = (crc1 << 32) | crc0;

	/*
	 * Generate relatively small and dense hash tail values - they are good
	 * for short strings in htrie which uses less significant bits at root,
	 * however collisions are very probable.
	 *
	 * At most MUL - 1 bytes remain here, each weighted by its byte offset.
	 */
	for (n *= MUL; n < len; ++n)
		h += data[n] * n;

	return h;
#undef MUL
}

예제 #2

0

파일 보기

파일: memkind_arena.c 프로젝트: ArturKoziej/memkind

/*
 * Look up the arena assigned to the calling thread for the given kind.
 *
 * The thread's slot index into kind->arena_map is kept in thread-specific
 * data under kind->arena_key. On first use it is allocated and set to the
 * CRC32 of pthread_self() modulo kind->arena_map_len.
 *
 * Returns 0 on success, MEMKIND_ERROR_MALLOC if the slot cannot be
 * allocated, MEMKIND_ERROR_PTHREAD if it cannot be registered, or
 * MEMKIND_ERROR_MALLCTL if the mapped arena is UINT_MAX. The size argument
 * is not used.
 */
int memkind_thread_get_arena(struct memkind *kind, unsigned int *arena, size_t size)
{
    unsigned int *slot = pthread_getspecific(kind->arena_key);

    if (slot == NULL) {
        /* First call on this thread: create and publish its slot index. */
        slot = jemk_malloc(sizeof(unsigned int));
        if (slot == NULL) {
            return MEMKIND_ERROR_MALLOC;
        }
        *slot = _mm_crc32_u64(0, (uint64_t)pthread_self()) %
                kind->arena_map_len;
        if (pthread_setspecific(kind->arena_key, slot)) {
            return MEMKIND_ERROR_PTHREAD;
        }
    }

    *arena = kind->arena_map[*slot];
    return (*arena == UINT_MAX) ? MEMKIND_ERROR_MALLCTL : 0;
}

예제 #3

0

파일 보기

파일: SimdSse42Crc32.cpp 프로젝트: fengbingchun/CUDA_Test

        // Folds every word in [p, end) into crc with the hardware CRC32
        // instruction: 64-bit steps when SIMD_X64_ENABLE is defined,
        // 32-bit steps otherwise. crc is updated in place.
        SIMD_INLINE void Crc32c(size_t & crc, const size_t * p, const size_t * end)
        {
            for (; p < end; ++p)
            {
#ifdef SIMD_X64_ENABLE
                crc = _mm_crc32_u64(crc, *p);
#else
                crc = _mm_crc32_u32(crc, *p);
#endif
            }
        }

예제 #4

0

파일 보기

파일: hash_map_string_3.cpp 프로젝트: Aahart911/ClickHouse

	/// Hash of a string: 0 for an empty one, hashLessThan16() for fewer than
	/// 16 bytes, otherwise two interleaved CRC32 chains over 16-byte blocks
	/// plus the (possibly overlapping) last 16 bytes, merged by hashLen16().
	size_t operator() (StringRef x) const
	{
		if (x.size == 0)
			return 0;

		if (x.size < 16)
			return hashLessThan16(x.data, x.size);

		const char * cur = x.data;
		const char * const last = cur + x.size;
		const char * const blocks_end = cur + x.size / 16 * 16;
		size_t crc_a = -1ULL;
		size_t crc_b = -1ULL;

		/// size >= 16 here, so at least one whole block exists.
		do
		{
			const UInt64 * block = reinterpret_cast<const UInt64 *>(cur);
			crc_a = _mm_crc32_u64(crc_a, block[0]);
			crc_b = _mm_crc32_u64(crc_b, block[1]);
			cur += 16;
		} while (cur < blocks_end);

		/// The final 16 bytes are always mixed in, covering any tail shorter than a block.
		crc_a = _mm_crc32_u64(crc_a, *reinterpret_cast<const UInt64 *>(last - 8));
		crc_b = _mm_crc32_u64(crc_b, *reinterpret_cast<const UInt64 *>(last - 16));

		return hashLen16(crc_a, crc_b);
	}

예제 #5

0

파일 보기

파일: memkind_arena.c 프로젝트: ArturKoziej/memkind

int memkind_thread_get_arena(struct memkind *kind, unsigned int *arena, size_t size)
{
    static __thread unsigned int MEMKIND_TLS_MODEL arena_tls = UINT_MAX;
    int err = 0;

    if (arena_tls == UINT_MAX) {
        arena_tls = _mm_crc32_u64(0, (uint64_t)pthread_self());
    }
    if (kind->arena_map != NULL) {
        *arena = kind->arena_map[arena_tls % kind->arena_map_len];
    }
    else {
        err = MEMKIND_ERROR_RUNTIME;
    }
    return err;
}

예제 #6

0

파일 보기

파일: pg_crc32c_sse42.cpp 프로젝트: EccentricLoggers/peloton

/*
 * Fold len bytes at data into crc using the SSE 4.2 CRC32 instructions and
 * return the updated value.
 */
pg_crc32c
pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
{
	const unsigned char *cur = static_cast<const unsigned char *>(data);
	size_t		left = len;

	/*
	 * The wide loads below are unaligned on purpose: the Intel architecture
	 * permits it, and aligning the start address showed no measurable gain.
	 */
#ifdef __x86_64__
	/* Eight bytes per step while at least eight remain. */
	for (; left >= 8; left -= 8, cur += 8)
		crc = (uint32) _mm_crc32_u64(crc, *((const uint64 *) cur));

	/* At most one four-byte step can follow. */
	if (left >= 4)
	{
		crc = _mm_crc32_u32(crc, *((const unsigned int *) cur));
		cur += 4;
		left -= 4;
	}
#else
	/*
	 * Four bytes per step; the eight-byte instruction does not exist on
	 * 32-bit x86.
	 */
	for (; left >= 4; left -= 4, cur += 4)
		crc = _mm_crc32_u32(crc, *((const unsigned int *) cur));
#endif /* __x86_64__ */

	/* Whatever is left goes through one byte at a time. */
	for (; left > 0; left--, cur++)
		crc = _mm_crc32_u8(crc, *cur);

	return crc;
}

예제 #7

0

파일 보기

파일: level0.c 프로젝트: henrik-muehe/level0

// Load `count` (0..8) bytes starting at `p` into a 64-bit word, placing them
// in the most significant bytes and zero-filling the rest. For count bytes
// this equals a little-endian 64-bit load shifted left by (64 - 8*count)
// bits, but it never reads past p[count-1] and performs no 64-bit shift by
// 64 when count is 0.
static uint64_t loadTopAligned(const char* p, size_t count) {
    uint64_t word = 0;
    for (size_t i = 0; i < count; ++i)
        word |= (uint64_t)(unsigned char)p[i] << (8 * (8 - count + i));
    return word;
}

// Hash the first `l` bytes of `from` with chained CRC32 steps, starting from
// a zero CRC: every full 8-byte word is folded in order, then the final 1..8
// bytes are folded as one top-aligned word. Results for 1..24 bytes match the
// previous three-case implementation; an empty input hashes to 0 and inputs
// longer than 24 bytes simply continue the chain instead of hitting an
// undefined shift. Only the `l` bytes given are read.
int computeHash(const char* from,size_t l) {
    uint64_t crc = 0;
    while (l > 8) {
        crc = _mm_crc32_u64(crc, loadTopAligned(from, 8));
        from += 8;
        l -= 8;
    }
    return (int)_mm_crc32_u64(crc, loadTopAligned(from, l));
}

예제 #8

0

파일 보기

파일: metrohash128crc.cpp 프로젝트: deepankarsharma/MetroHash

// Hashes the len bytes at key, mixed with seed, and writes a 16-byte result
// to out.
//
// read_u64/read_u32/read_u16/read_u8 and rotate_right are defined elsewhere;
// presumably they are fixed-width loads from ptr and a 64-bit rotate
// (TODO confirm against their definitions).
void metrohash128crc_1(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out)
{
    // Multiplicative constants used by every mixing step below.
    static const uint64_t k0 = 0xC83A91E1;
    static const uint64_t k1 = 0x8648DBDB;
    static const uint64_t k2 = 0x7BDEC03B;
    static const uint64_t k3 = 0x2F5870A5;

    const uint8_t * ptr = reinterpret_cast<const uint8_t*>(key);
    const uint8_t * const end = ptr + len;
    
    // v[0] and v[1] are always initialised from seed and len; v[2] and v[3]
    // are only initialised and used on the len >= 32 path.
    uint64_t v[4];
    
    v[0] = ((static_cast<uint64_t>(seed) - k0) * k3) + len;
    v[1] = ((static_cast<uint64_t>(seed) + k1) * k2) + len;
    
    if (len >= 32)
    {        
        v[2] = ((static_cast<uint64_t>(seed) + k0) * k2) + len;
        v[3] = ((static_cast<uint64_t>(seed) - k1) * k3) + len;

        // Consume 32 bytes per iteration, one 64-bit word per lane, folding
        // each word in with the CRC32 instruction. Runs at least once
        // because len >= 32.
        do
        {
            v[0] ^= _mm_crc32_u64(v[0], read_u64(ptr)); ptr += 8;
            v[1] ^= _mm_crc32_u64(v[1], read_u64(ptr)); ptr += 8;
            v[2] ^= _mm_crc32_u64(v[2], read_u64(ptr)); ptr += 8;
            v[3] ^= _mm_crc32_u64(v[3], read_u64(ptr)); ptr += 8;
        }
        while (ptr <= (end - 32));

        // Cross-mix the four lanes; only v[0] and v[1] are used afterwards.
        v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 34) * k1;
        v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 37) * k0;
        v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 34) * k1;
        v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 37) * k0;
    }
    
    // Remaining input is consumed in shrinking steps: 16, 8, 4, 2, then 1
    // byte, each taken at most once.
    if ((end - ptr) >= 16)
    {
        v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0],34) * k3;
        v[1] += read_u64(ptr) * k2; ptr += 8; v[1] = rotate_right(v[1],34) * k3;
        v[0] ^= rotate_right((v[0] * k2) + v[1], 30) * k1;
        v[1] ^= rotate_right((v[1] * k3) + v[0], 30) * k0;
    }
    
    if ((end - ptr) >= 8)
    {
        v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0],36) * k3;
        v[0] ^= rotate_right((v[0] * k2) + v[1], 23) * k1;
    }
    
    // The sub-word tails go through the 64-bit CRC32 step with the narrower
    // value passed as the data operand.
    if ((end - ptr) >= 4)
    {
        v[1] ^= _mm_crc32_u64(v[0], read_u32(ptr)); ptr += 4;
        v[1] ^= rotate_right((v[1] * k3) + v[0], 19) * k0;
    }
    
    if ((end - ptr) >= 2)
    {
        v[0] ^= _mm_crc32_u64(v[1], read_u16(ptr)); ptr += 2;
        v[0] ^= rotate_right((v[0] * k2) + v[1], 13) * k1;
    }
    
    if ((end - ptr) >= 1)
    {
        v[1] ^= _mm_crc32_u64(v[0], read_u8 (ptr));
        v[1] ^= rotate_right((v[1] * k3) + v[0], 17) * k0;
    }
    
    // Final mixing rounds over v[0] and v[1].
    v[0] += rotate_right((v[0] * k0) + v[1], 11);
    v[1] += rotate_right((v[1] * k1) + v[0], 26);
    v[0] += rotate_right((v[0] * k0) + v[1], 11);
    v[1] += rotate_right((v[1] * k1) + v[0], 26);

    // Only v[0] and v[1] (16 bytes) form the output.
    memcpy(out, v, 16);
}

예제 #9

0

파일 보기

파일: metrohash128crc.cpp 프로젝트: deepankarsharma/MetroHash

// Hashes the len bytes at key, mixed with seed, and writes a 16-byte result
// to out. The statement sequence is the same as in metrohash128crc_1; only
// the k0..k3 constants and the rotation amounts differ.
//
// read_u64/read_u32/read_u16/read_u8 and rotate_right are defined elsewhere;
// presumably they are fixed-width loads from ptr and a 64-bit rotate
// (TODO confirm against their definitions).
void metrohash128crc_2(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out)
{
    // Multiplicative constants used by every mixing step below.
    static const uint64_t k0 = 0xEE783E2F;
    static const uint64_t k1 = 0xAD07C493;
    static const uint64_t k2 = 0x797A90BB;
    static const uint64_t k3 = 0x2E4B2E1B;

    const uint8_t * ptr = reinterpret_cast<const uint8_t*>(key);
    const uint8_t * const end = ptr + len;
    
    // v[0] and v[1] are always initialised from seed and len; v[2] and v[3]
    // are only initialised and used on the len >= 32 path.
    uint64_t v[4];
    
    v[0] = ((static_cast<uint64_t>(seed) - k0) * k3) + len;
    v[1] = ((static_cast<uint64_t>(seed) + k1) * k2) + len;
    
    if (len >= 32)
    {        
        v[2] = ((static_cast<uint64_t>(seed) + k0) * k2) + len;
        v[3] = ((static_cast<uint64_t>(seed) - k1) * k3) + len;

        // Consume 32 bytes per iteration, one 64-bit word per lane, folding
        // each word in with the CRC32 instruction. Runs at least once
        // because len >= 32.
        do
        {
            v[0] ^= _mm_crc32_u64(v[0], read_u64(ptr)); ptr += 8;
            v[1] ^= _mm_crc32_u64(v[1], read_u64(ptr)); ptr += 8;
            v[2] ^= _mm_crc32_u64(v[2], read_u64(ptr)); ptr += 8;
            v[3] ^= _mm_crc32_u64(v[3], read_u64(ptr)); ptr += 8;
        }
        while (ptr <= (end - 32));

        // Cross-mix the four lanes; only v[0] and v[1] are used afterwards.
        v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 12) * k1;
        v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 19) * k0;
        v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 12) * k1;
        v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 19) * k0;
    }
    
    // Remaining input is consumed in shrinking steps: 16, 8, 4, 2, then 1
    // byte, each taken at most once.
    if ((end - ptr) >= 16)
    {
        v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0],41) * k3;
        v[1] += read_u64(ptr) * k2; ptr += 8; v[1] = rotate_right(v[1],41) * k3;
        v[0] ^= rotate_right((v[0] * k2) + v[1], 10) * k1;
        v[1] ^= rotate_right((v[1] * k3) + v[0], 10) * k0;
    }
    
    if ((end - ptr) >= 8)
    {
        v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0],34) * k3;
        v[0] ^= rotate_right((v[0] * k2) + v[1], 22) * k1;
    }
    
    // The sub-word tails go through the 64-bit CRC32 step with the narrower
    // value passed as the data operand.
    if ((end - ptr) >= 4)
    {
        v[1] ^= _mm_crc32_u64(v[0], read_u32(ptr)); ptr += 4;
        v[1] ^= rotate_right((v[1] * k3) + v[0], 14) * k0;
    }
    
    if ((end - ptr) >= 2)
    {
        v[0] ^= _mm_crc32_u64(v[1], read_u16(ptr)); ptr += 2;
        v[0] ^= rotate_right((v[0] * k2) + v[1], 15) * k1;
    }
    
    if ((end - ptr) >= 1)
    {
        v[1] ^= _mm_crc32_u64(v[0], read_u8 (ptr));
        v[1] ^= rotate_right((v[1] * k3) + v[0],  18) * k0;
    }
    
    // Final mixing rounds over v[0] and v[1].
    v[0] += rotate_right((v[0] * k0) + v[1], 15);
    v[1] += rotate_right((v[1] * k1) + v[0], 27);
    v[0] += rotate_right((v[0] * k0) + v[1], 15);
    v[1] += rotate_right((v[1] * k1) + v[0], 27);

    // Only v[0] and v[1] (16 bytes) form the output.
    memcpy(out, v, 16);
}

예제 #10

0

파일 보기

파일: sse42-builtins.c 프로젝트: CSI-LLVM/clang

// Codegen test wrapper: forwards both arguments to the _mm_crc32_u64
// intrinsic and returns its result. The two directive comments inside the
// body are patterns matched against the emitted IR (a call to
// llvm.x86.sse42.crc32.64.64), so they must stay exactly as written.
unsigned long long test_mm_crc32_u64(unsigned long long CRC, unsigned long long V) {
  // CHECK-LABEL: test_mm_crc32_u64
  // CHECK: call i64 @llvm.x86.sse42.crc32.64.64(i64 %{{.*}}, i64 %{{.*}})
  return _mm_crc32_u64(CRC, V);
}

예제 #11

0

파일 보기

파일: metrohash64crc.cpp 프로젝트: chunhui-shi/smhasher

// Hashes the len bytes at key, mixed with seed, and writes an 8-byte result
// to out.
//
// read_u64/read_u32/read_u16/read_u8 and rotate_right are defined elsewhere;
// presumably they are fixed-width loads from ptr and a 64-bit rotate
// (TODO confirm against their definitions).
void metrohash64crc_1(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out)
{
    // Multiplicative constants used by every mixing step below.
    static const uint64_t k0 = 0xC83A91E1;
    static const uint64_t k1 = 0x8648DBDB;
    static const uint64_t k2 = 0x7BDEC03B;
    static const uint64_t k3 = 0x2F5870A5;

    const uint8_t * ptr = reinterpret_cast<const uint8_t*>(key);
    const uint8_t * const end = ptr + len;
    
    // Running state, initialised from seed and len.
    uint64_t hash = ((static_cast<uint64_t>(seed) + k2) * k0) + len;
    
    if (len >= 32)
    {
        // Four lanes, all starting from the same state.
        uint64_t v[4];
        v[0] = hash;
        v[1] = hash;
        v[2] = hash;
        v[3] = hash;
        
        // Consume 32 bytes per iteration, one 64-bit word per lane, folding
        // each word in with the CRC32 instruction. Runs at least once
        // because len >= 32.
        do
        {
            v[0] ^= _mm_crc32_u64(v[0], read_u64(ptr)); ptr += 8;
            v[1] ^= _mm_crc32_u64(v[1], read_u64(ptr)); ptr += 8;
            v[2] ^= _mm_crc32_u64(v[2], read_u64(ptr)); ptr += 8;
            v[3] ^= _mm_crc32_u64(v[3], read_u64(ptr)); ptr += 8;
        }
        while (ptr <= (end - 32));

        // Cross-mix the lanes, then fold v[0] ^ v[1] back into the state.
        v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 33) * k1;
        v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 33) * k0;
        v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 33) * k1;
        v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 33) * k0;
        hash += v[0] ^ v[1];
    }
    
    // Remaining input is consumed in shrinking steps: 16, 8, 4, 2, then 1
    // byte, each taken at most once.
    if ((end - ptr) >= 16)
    {
        uint64_t v0 = hash + (read_u64(ptr) * k0); ptr += 8; v0 = rotate_right(v0,33) * k1;
        uint64_t v1 = hash + (read_u64(ptr) * k1); ptr += 8; v1 = rotate_right(v1,33) * k2;
        v0 ^= rotate_right(v0 * k0, 35) + v1;
        v1 ^= rotate_right(v1 * k3, 35) + v0;
        hash += v1;
    }
    
    if ((end - ptr) >= 8)
    {
        hash += read_u64(ptr) * k3; ptr += 8;
        hash ^= rotate_right(hash, 33) * k1;
        
    }
    
    // The sub-word tails go through the 64-bit CRC32 step with the narrower
    // value passed as the data operand.
    if ((end - ptr) >= 4)
    {
        hash ^= _mm_crc32_u64(hash, read_u32(ptr)); ptr += 4;
        hash ^= rotate_right(hash, 15) * k1;
    }
    
    if ((end - ptr) >= 2)
    {
        hash ^= _mm_crc32_u64(hash, read_u16(ptr)); ptr += 2;
        hash ^= rotate_right(hash, 13) * k1;
    }
    
    if ((end - ptr) >= 1)
    {
        hash ^= _mm_crc32_u64(hash, read_u8(ptr));
        hash ^= rotate_right(hash, 25) * k1;
    }
    
    // Final mixing of the state before it is emitted.
    hash ^= rotate_right(hash, 33);
    hash *= k0;
    hash ^= rotate_right(hash, 33);

    // The 8-byte state is the output.
    memcpy(out, &hash, 8);
}

예제 #12

0

파일 보기

파일: memkind_gbtlb.c 프로젝트: ArturKoziej/memkind

/*
 * Map a pointer to a table slot: the CRC32 of the pointer's integer value
 * (starting from a zero CRC), reduced modulo table_len.
 */
static int ptr_hash(void *ptr, int table_len)
{
    size_t address = (size_t)ptr;
    unsigned long long digest = _mm_crc32_u64(0, address);

    return digest % table_len;
}

예제 #13

0

파일 보기

파일: CRC32C_sse42_update.cpp 프로젝트: allenday/libmaus

// Feed the l bytes at t into the running checksum held in ctx.
//
// ctx is bit-inverted on entry and again on exit; in between, the data is
// folded in with the SSE4.2 CRC32 instructions: a short prologue of 1-, 2-
// and 4-byte steps to reach an 8-byte aligned address, then 8 bytes at a
// time, then 4-, 2- and 1-byte steps for the tail.
//
// When the preprocessor condition below is not met the function body is
// empty and ctx is left unchanged.
// NOTE(review): the condition requires both LIBMAUS_HAVE_x86_64 and
// LIBMAUS_HAVE_i386 to be defined; confirm that this is intended.
void libmaus::digest::CRC32C_sse42::update(uint8_t const * t, size_t l) 
{
	#if defined(LIBMAUS_HAVE_SMMINTRIN_H) && defined(LIBMAUS_USE_ASSEMBLY) && defined(LIBMAUS_HAVE_x86_64) && defined(LIBMAUS_HAVE_i386)
	ctx = ~ctx;

	// Alignment prologue. Each test looks at the CURRENT address of t, not
	// the address on entry: consuming a byte changes which low bits are set
	// (e.g. an address ending in binary 001 becomes 010), so testing the
	// original address would leave the 64-bit loop misaligned.

	// clear bit 0 of the address
	if ( (reinterpret_cast<size_t>(t) & 1) && l )
	{
		ctx = _mm_crc32_u8(ctx, *t);
		t += 1;
		l -= 1;
	}
	// clear bit 1 of the address
	if ( (reinterpret_cast<size_t>(t) & 2) && (l>=2) )
	{
		ctx = _mm_crc32_u16(ctx, *reinterpret_cast<uint16_t const *>(t));
		t += 2;
		l -= 2;
	}
	// clear bit 2 of the address
	if ( (reinterpret_cast<size_t>(t) & 4) && (l>=4) )
	{
		ctx = _mm_crc32_u32(ctx, *reinterpret_cast<uint32_t const *>(t));
		t += 4;
		l -= 4;
	}

	// bulk: whole 64-bit words
	uint64_t const * t64 = reinterpret_cast<uint64_t const *>(t);
	uint64_t const * const t64e = t64 + (l>>3);

	while ( t64 != t64e )
		ctx = _mm_crc32_u64(ctx, *(t64++));

	// tail: fewer than 8 bytes remain
	l &= 7;
	t = reinterpret_cast<uint8_t const *>(t64);

	if ( l >= 4 )
	{
		ctx = _mm_crc32_u32(ctx, *reinterpret_cast<uint32_t const *>(t));
		t += 4;
		l -= 4;
	}
	if ( l >= 2 )
	{
		ctx = _mm_crc32_u16(ctx, *reinterpret_cast<uint16_t const *>(t));
		t += 2;
		l -= 2;
	}
	if ( l )
	{
		ctx = _mm_crc32_u8(ctx, *t);
	}

	ctx = ~ctx;
	#endif
}

예제 #14

0

파일 보기

파일: crc32_sse42.c 프로젝트: mulichao/freebsd

/*
 * Update the checksum crc with the len bytes at buf using the SSE4.2 crc32
 * instructions, and return the new value.
 *
 * Phases, in order: single bytes until the data pointer is aligned to the
 * native word size; (only if LONG > SHORT) blocks of LONG*3 bytes processed
 * as three interleaved streams; blocks of SHORT*3 bytes processed the same
 * way; remaining whole words; remaining single bytes.
 *
 * LONG, SHORT, crc32c_shift() and the crc32c_long/crc32c_2long/
 * crc32c_short/crc32c_2short arguments passed to it are defined outside
 * this function.
 */
uint32_t
sse42_crc32c(uint32_t crc, const unsigned char *buf, unsigned len)
{
	/* Native word size: 8 bytes on amd64, 4 bytes otherwise. */
#ifdef __amd64__
	const size_t align = 8;
#else
	const size_t align = 4;
#endif
	const unsigned char *next, *end;
	/* Accumulators are as wide as the crc32 instruction variant in use. */
#ifdef __amd64__
	uint64_t crc0, crc1, crc2;
#else
	uint32_t crc0, crc1, crc2;
#endif

	next = buf;
	crc0 = crc;

	/* Compute the crc to bring the data pointer to an aligned boundary. */
	while (len && ((uintptr_t)next & (align - 1)) != 0) {
		crc0 = _mm_crc32_u8(crc0, *next);
		next++;
		len--;
	}

#if LONG > SHORT
	/*
	 * Compute the crc on sets of LONG*3 bytes, executing three independent
	 * crc instructions, each on LONG bytes -- this is optimized for the
	 * Nehalem, Westmere, Sandy Bridge, and Ivy Bridge architectures, which
	 * have a throughput of one crc per cycle, but a latency of three
	 * cycles.
	 */
	crc = 0;
	while (len >= LONG * 3) {
		crc1 = 0;
		crc2 = 0;
		end = next + LONG;
		do {
#ifdef __amd64__
			crc0 = _mm_crc32_u64(crc0, *(const uint64_t *)next);
			crc1 = _mm_crc32_u64(crc1,
			    *(const uint64_t *)(next + LONG));
			crc2 = _mm_crc32_u64(crc2,
			    *(const uint64_t *)(next + (LONG * 2)));
#else
			crc0 = _mm_crc32_u32(crc0, *(const uint32_t *)next);
			crc1 = _mm_crc32_u32(crc1,
			    *(const uint32_t *)(next + LONG));
			crc2 = _mm_crc32_u32(crc2,
			    *(const uint32_t *)(next + (LONG * 2)));
#endif
			next += align;
		} while (next < end);
		/*-
		 * Update the crc.  Try to do it in parallel with the inner
		 * loop.  'crc' is used to accumulate crc0 and crc1
		 * produced by the inner loop so that the next iteration
		 * of the loop doesn't depend on anything except crc2.
		 *
		 * The full expression for the update is:
		 *     crc = S*S*S*crc + S*S*crc0 + S*crc1
		 * where the terms are polynomials modulo the CRC polynomial.
		 * We regroup this subtly as:
		 *     crc = S*S * (S*crc + crc0) + S*crc1.
		 * This has an extra dependency which reduces possible
		 * parallelism for the expression, but it turns out to be
		 * best to intentionally delay evaluation of this expression
		 * so that it competes less with the inner loop.
		 *
		 * We also intentionally reduce parallelism by feeding back
		 * crc2 to the inner loop as crc0 instead of accumulating
		 * it in crc.  This synchronizes the loop with crc update.
		 * CPU and/or compiler schedulers produced bad order without
		 * this.
		 *
		 * Shifts take about 12 cycles each, so 3 here with 2
		 * parallelizable take about 24 cycles and the crc update
		 * takes slightly longer.  8 dependent crc32 instructions
		 * can run in 24 cycles, so the 3-way blocking is worse
		 * than useless for sizes less than 8 * <word size> = 64
		 * on amd64.  In practice, SHORT = 32 confirms these
		 * timing calculations by giving a small improvement
		 * starting at size 96.  Then the inner loop takes about
		 * 12 cycles and the crc update about 24, but these are
		 * partly in parallel so the total time is less than the
		 * 36 cycles that 12 dependent crc32 instructions would
		 * take.
		 *
		 * To have a chance of completely hiding the overhead for
		 * the crc update, the inner loop must take considerably
		 * longer than 24 cycles.  LONG = 64 makes the inner loop
		 * take about 24 cycles, so is not quite large enough.
		 * LONG = 128 works OK.  Unhideable overheads are about
		 * 12 cycles per inner loop.  All assuming timing like
		 * Haswell.
		 */
		crc = crc32c_shift(crc32c_long, crc) ^ crc0;
		crc1 = crc32c_shift(crc32c_long, crc1);
		crc = crc32c_shift(crc32c_2long, crc) ^ crc1;
		crc0 = crc2;
		/* The inner loop advanced next by LONG; skip the other two streams. */
		next += LONG * 2;
		len -= LONG * 3;
	}
	/* Merge the accumulated value back into the running crc0. */
	crc0 ^= crc;
#endif /* LONG > SHORT */

	/*
	 * Do the same thing, but now on SHORT*3 blocks for the remaining data
	 * less than a LONG*3 block
	 */
	crc = 0;
	while (len >= SHORT * 3) {
		crc1 = 0;
		crc2 = 0;
		end = next + SHORT;
		do {
#ifdef __amd64__
			crc0 = _mm_crc32_u64(crc0, *(const uint64_t *)next);
			crc1 = _mm_crc32_u64(crc1,
			    *(const uint64_t *)(next + SHORT));
			crc2 = _mm_crc32_u64(crc2,
			    *(const uint64_t *)(next + (SHORT * 2)));
#else
			crc0 = _mm_crc32_u32(crc0, *(const uint32_t *)next);
			crc1 = _mm_crc32_u32(crc1,
			    *(const uint32_t *)(next + SHORT));
			crc2 = _mm_crc32_u32(crc2,
			    *(const uint32_t *)(next + (SHORT * 2)));
#endif
			next += align;
		} while (next < end);
		crc = crc32c_shift(crc32c_short, crc) ^ crc0;
		crc1 = crc32c_shift(crc32c_short, crc1);
		crc = crc32c_shift(crc32c_2short, crc) ^ crc1;
		crc0 = crc2;
		/* The inner loop advanced next by SHORT; skip the other two streams. */
		next += SHORT * 2;
		len -= SHORT * 3;
	}
	/* Merge the accumulated value back into the running crc0. */
	crc0 ^= crc;

	/* Compute the crc on the remaining bytes at native word size. */
	end = next + (len - (len & (align - 1)));
	while (next < end) {
#ifdef __amd64__
		crc0 = _mm_crc32_u64(crc0, *(const uint64_t *)next);
#else
		crc0 = _mm_crc32_u32(crc0, *(const uint32_t *)next);
#endif
		next += align;
	}
	len &= (align - 1);

	/* Compute the crc for any trailing bytes. */
	while (len) {
		crc0 = _mm_crc32_u8(crc0, *next);
		next++;
		len--;
	}

	/* On amd64 crc0 is 64 bits wide; the result is its low 32 bits. */
	return ((uint32_t)crc0);
}