/*
 * RIV decrypt-and-verify (two-pass scheme, 2*BLOCKLEN tag).
 *
 * Recomputes the domain-1 PRF over (header, ciphertext), combines it with
 * the received tag to recover the synthetic IV, decrypts, then recomputes
 * the domain-0 PRF over the plaintext and checks it against the recovered IV.
 *
 * Returns 0 on success, -1 on authentication failure.  On failure the
 * (garbage) plaintext has still been written; callers must not use it.
 */
int decrypt_final(riv_context_t* ctx,
                  const unsigned char* ciphertext,
                  const unsigned long long ciphertext_length,
                  const unsigned char* header,
                  const unsigned long long header_length,
                  const unsigned char tag[TAGLEN],
                  unsigned char* plaintext)
{
    ALIGN(16) uint8_t iv[TAGLEN];
    ALIGN(16) uint8_t iv_prime[TAGLEN];

    /* iv = cdms(PRF_1(header, ciphertext)) XOR tag */
    clhash(&(ctx->prf_context), header, header_length, DOMAIN_1,
           ciphertext, ciphertext_length, iv);
    cdms(iv, iv, ctx->expanded_key);
    xor_bytes(iv, iv, tag, TAGLEN);

    /* Derive the nonce for the SCT decryption pass from iv. */
    cdms(iv_prime, iv, ctx->expanded_key);
    sct_mode(ctx, iv_prime, (const __m128i*)ciphertext, ciphertext_length,
             (__m128i*)plaintext);

    /* Recompute iv' = cdms(PRF_0(header, plaintext)); must equal iv. */
    clhash(&(ctx->prf_context), header, header_length, DOMAIN_0,
           plaintext, ciphertext_length, iv_prime);
    cdms(iv_prime, iv_prime, ctx->expanded_key);

    /* BUG FIX: the original used _mm_testc_si128(a, b), which only checks
     * that every bit set in b is also set in a (a subset test, CF of PTEST),
     * NOT 128-bit equality -- e.g. an all-ones `iv` would "match" any
     * `iv_prime`.  Use a true byte-wise equality test instead.  Valid tags
     * (iv == iv_prime) still pass, so the interface/contract is unchanged.
     * NOTE(review): this comparison is not constant-time (neither was the
     * original); consider a constant-time compare for the tag check. */
    const int eq_lo = (_mm_movemask_epi8(
        _mm_cmpeq_epi8(load(iv), load(iv_prime))) == 0xFFFF);
    const int eq_hi = (_mm_movemask_epi8(
        _mm_cmpeq_epi8(load(iv + BLOCKLEN), load(iv_prime + BLOCKLEN))) == 0xFFFF);
    return (eq_lo - 1) | (eq_hi - 1);
}
/* Testsuite driver: exercises _mm_testz_si128 / _mm_testc_si128 over all
 * ordered pairs of four 128-bit patterns and cross-checks the results
 * against the scalar reference implementations make_ptestz / make_ptestc,
 * plus two direct re-evaluation spot checks.  Aborts on any mismatch. */
static void
TEST (void)
{
  union { __m128i x; unsigned int i[4]; } val[4];
  static const unsigned int patterns[4][4] = {
    { 0x11111111, 0x00000000, 0x00000000, 0x11111111 },
    { 0x00000000, 0x11111111, 0x11111111, 0x00000000 },
    { 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
    { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }
  };
  int res[32];
  int outer, inner, slot;

  for (outer = 0; outer < 4; outer++)
    for (inner = 0; inner < 4; inner++)
      val[outer].i[inner] = patterns[outer][inner];

  /* Record intrinsic results for every (second-arg, first-arg) pair. */
  slot = 0;
  for (outer = 0; outer < 4; outer++)
    for (inner = 0; inner < 4; inner++)
      {
        res[slot++] = _mm_testz_si128 (val[inner].x, val[outer].x);
        res[slot++] = _mm_testc_si128 (val[inner].x, val[outer].x);
      }

  /* Compare each recorded result with the scalar reference. */
  slot = 0;
  for (outer = 0; outer < 4; outer++)
    for (inner = 0; inner < 4; inner++)
      {
        if (res[slot++] != make_ptestz (val[inner].x, val[outer].x))
          abort ();
        if (res[slot++] != make_ptestc (val[inner].x, val[outer].x))
          abort ();
      }

  /* Spot-check that re-evaluating the intrinsics reproduces res[2]/res[3]. */
  if (res[2] != _mm_testz_si128 (val[1].x, val[0].x))
    abort ();
  if (res[3] != _mm_testc_si128 (val[1].x, val[0].x))
    abort ();
}
// SSE4.1 substring search for needles of length 5..20 (name suggests max 20:
// a 4-byte prefix plus up to 16 suffix bytes -- TODO confirm caller contract).
// Returns the index of the first occurrence of `needle` in s[0..n), or
// std::string::npos if absent.
//
// NOTE(review): each outer step loads 16 bytes at s+i and up to 16 bytes at
// s+i+bitpos+4, so reads may extend past s+n; presumably the caller
// guarantees readable padding after the buffer -- verify.
size_t sse4_strstr_unrolled_max20(const char* s, size_t n, const char* needle, size_t needle_size) {
    const __m128i zeros = _mm_setzero_si128();
    const __m128i prefix = sse::load(needle);          // first 4 bytes used by mpsadbw
    const __m128i suffix = sse::load(needle + 4);      // remaining needle bytes
    const __m128i suff_mask = sse::mask_lower_bytes(needle_size - 4); // selects valid suffix bytes

    for (size_t i = 0; i < n; i += 8) {
        const __m128i data = sse::load(s + i);

        // mpsadbw with imm 0: eight 16-bit sums of |data[k+j] - prefix[j]|,
        // j=0..3, for offsets k=0..7.  A zero sum means the 4-byte prefix
        // matches exactly at that offset.
        const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0);
        const __m128i cmp = _mm_cmpeq_epi16(result, zeros);

        // One bit per 16-bit lane: keep the low bit of each lane's 2-bit
        // movemask contribution.
        unsigned mask = _mm_movemask_epi8(cmp) & 0x5555;

        while (mask != 0) {
            const auto bitpos = bits::get_first_bit_set(mask)/2;

            // Verify the suffix: compare 16 bytes after the candidate prefix
            // and require equality on all bytes selected by suff_mask
            // (testc: CF=1 iff (~cmp & suff_mask) == 0).
            // (shadows the outer `cmp` -- intentional in the original)
            const __m128i str = sse::load(s + i + bitpos + 4);
            const __m128i cmp = _mm_cmpeq_epi8(str, suffix);
            if (_mm_testc_si128(cmp, suff_mask)) {
                return i + bitpos;
            }

            mask = bits::clear_leftmost_set(mask);
        }
    }

    return std::string::npos;
}
/// Compare two memory regions of `size` bytes for equality using SSE4.1.
/// Main loop: 16 bytes at a time; _mm_testc_si128(zero, v) sets CF iff
/// (~0 & v) == 0, i.e. iff v == 0 -- so XOR-ing the two blocks and testing
/// against zero16 is a 16-byte equality check.
/// Tail: a fallthrough switch on the remaining 0..15 bytes; wider unaligned
/// UInt32/UInt64 loads are used where the count allows (assumes an
/// architecture where unaligned loads are fine, e.g. x86 -- and that
/// UInt16/UInt32/UInt64 are the project's fixed-width typedefs).
inline bool memequal_sse41(const char * p1, const char * p2, size_t size)
{
//    const char * p1_end = p1 + size;
    const char * p1_end_16 = p1 + size / 16 * 16;

    __m128i zero16 = _mm_setzero_si128();

    while (p1 < p1_end_16)
    {
        if (!_mm_testc_si128(
            zero16,
            _mm_xor_si128(
                _mm_loadu_si128(reinterpret_cast<const __m128i *>(p1)),
                _mm_loadu_si128(reinterpret_cast<const __m128i *>(p2)))))
            return false;

        p1 += 16;
        p2 += 16;
    }

/*    while (p1 < p1_end)
    {
        if (*p1 != *p2)
            return false;

        ++p1;
        ++p2;
    }*/

    // Deliberate fallthrough throughout: e.g. size%16 == 15 checks byte 14,
    // then 13, then 12, then falls into case 12 which compares bytes 8..11
    // as one UInt32 and jumps (goto l8) to the 8-byte compare of bytes 0..7.
    switch (size % 16)
    {
        case 15: if (p1[14] != p2[14]) return false;
        case 14: if (p1[13] != p2[13]) return false;
        case 13: if (p1[12] != p2[12]) return false;
        case 12: if (reinterpret_cast<const UInt32 *>(p1)[2] == reinterpret_cast<const UInt32 *>(p2)[2]) goto l8; else return false;
        case 11: if (p1[10] != p2[10]) return false;
        case 10: if (p1[9] != p2[9]) return false;
        case 9:  if (p1[8] != p2[8]) return false;
        l8:
        case 8:  return reinterpret_cast<const UInt64 *>(p1)[0] == reinterpret_cast<const UInt64 *>(p2)[0];
        case 7:  if (p1[6] != p2[6]) return false;
        case 6:  if (p1[5] != p2[5]) return false;
        case 5:  if (p1[4] != p2[4]) return false;
        case 4:  return reinterpret_cast<const UInt32 *>(p1)[0] == reinterpret_cast<const UInt32 *>(p2)[0];
        case 3:  if (p1[2] != p2[2]) return false;
        case 2:  return reinterpret_cast<const UInt16 *>(p1)[0] == reinterpret_cast<const UInt16 *>(p2)[0];
        case 1:  if (p1[0] != p2[0]) return false;
        case 0:  break;
    }

    return true;
}
/*
 * RIV decrypt-and-verify (single-block tag variant).
 *
 * Uses the received tag directly as the synthetic IV for decryption, then
 * recomputes the domain-0 PRF over the recovered plaintext, encrypts it,
 * and checks the result against the tag.
 *
 * Returns 0 on success, -1 on authentication failure.
 */
int decrypt_final(riv_context_t* ctx,
                  const unsigned char* ciphertext,
                  const unsigned long long ciphertext_length,
                  const unsigned char* header,
                  const unsigned long long header_length,
                  const unsigned char tag[TAGLEN],
                  unsigned char* plaintext)
{
    const __m128i iv = loadu(tag);
    decrypt(ctx, iv, plaintext, ciphertext_length, ciphertext);

    ALIGN(16) uint8_t iv_prime[BLOCKLEN];
    clhash(&(ctx->prf_context), header, header_length, DOMAIN_0,
           plaintext, ciphertext_length, iv_prime);
    /* NOTE(review): `expanced_enc_key` looks like a typo for
     * "expanded_enc_key", but it must match the riv_context_t field name --
     * left unchanged; verify against the struct definition. */
    const __m128i iv_prime_ = aes_encrypt(load(iv_prime), ctx->expanced_enc_key);

    /* BUG FIX: the original returned _mm_testc_si128(iv, iv_prime_) - 1.
     * testc is a subset test (CF=1 iff (~iv & iv_prime_) == 0), not 128-bit
     * equality: a tag with a bitwise superset pattern would have verified.
     * Use byte-wise equality; valid tags (iv == iv_prime_) still pass. */
    return (_mm_movemask_epi8(_mm_cmpeq_epi8(iv, iv_prime_)) == 0xFFFF) - 1;
}
void merge() { #if defined(SSE_MERGE) || defined(SSE_MERGE_UNROLL) __m128i isTrue = _mm_set1_epi16(0xFFFF); #endif for (int i = 0; i < NUM_PAGES; ++i) { //merge in everything thats different between the ref and the latest committed page (that we haven't touched) #ifdef PREFETCH for (int pages = 1; pages <= PREFETCH_PAGES; pages++) { for (int bpp = 0; bpp < PREFETCH_BYTES_PER_PAGE; bpp++) { __builtin_prefetch( &LATEST[i+pages][bpp], 0/*read*/, 3/*high temporal locality*/ ); __builtin_prefetch( &REF[i+pages][bpp], 0/*read*/, 3/*high temporal locality*/ ); // don't prefetch LOCAL since we generally don't need it //__builtin_prefetch( &LOCAL[i+pages][bpp], 1/*write*/, 3/*high temporal locality*/ ); } } #endif #ifdef BYTE_MERGE const char* latest = LATEST[i]; const char* ref = REF[i]; char* local = LOCAL[i]; for (int j = 0; j < PAGE_SIZE; ++j) { if ( unlikely(latest[j]!=ref[j] && local[j]==ref[j]) ){ local[j] = latest[j]; } } #endif #ifdef WORD_MERGE const uint64_t* latest = (const uint64_t*) LATEST[i]; const uint64_t* ref = (const uint64_t*) REF[i]; uint64_t* local = (uint64_t*) LOCAL[i]; for (int j = 0; j < (PAGE_SIZE/sizeof(uint64_t)); ++j) { // check for diff at word granularity first if ( unlikely(latest[j]!=ref[j]) ) { if ( local[j] == ref[j] ) { local[j] = latest[j]; } else { // have to do byte-wise comparison const char* latestChar = (const char*) latest[j]; const char* refChar = (const char*) ref[j]; char* localChar = (char*) local[j]; for ( int k = 0; k < sizeof(uint64_t); k++ ) { if ( latestChar[k] != refChar[k] && localChar[k] == refChar[k] ) { localChar[k] = latestChar[k]; } } } } } #endif #ifdef SSE_MERGE const char* latestP = LATEST[i]; const char* refP = REF[i]; char* localP = LOCAL[i]; for (int j = 0; j < PAGE_SIZE; j += sizeof(__m128i)) { __m128i latest = _mm_load_si128( (__m128i*) (latestP+j) ); __m128i ref = _mm_load_si128( (__m128i*) (refP+j) ); __m128i latEqRef = _mm_cmpeq_epi8(latest, ref); // if latest == ref, latref is all ones if ( 
unlikely(!_mm_testc_si128(latEqRef, isTrue)) ) { // some bytes differ __m128i local = _mm_load_si128( (__m128i*) (localP+j) ); __m128i localEqRef = _mm_cmpeq_epi8(local, ref); if ( _mm_testc_si128(localEqRef, isTrue) ) { // local == ref _mm_stream_si128( (__m128i*) (localP+j), latest ); } else { // (~latref) & localref, bytes where lat!=ref && local==ref __m128i latestMask = _mm_andnot_si128( latEqRef, localEqRef ); // new = (latestMask & latest) | (~latestMask & local); __m128i latestBytes = _mm_and_si128(latestMask, latest); __m128i localBytes = _mm_andnot_si128(latestMask, local); latestBytes = _mm_or_si128(latestBytes, localBytes); _mm_stream_si128( (__m128i*) (localP+j), latestBytes ); } } } #endif #ifdef SSE_MERGE_NOBRANCH for (int j = 0; j < PAGE_SIZE; j += sizeof(__m128i)) { __m128i latest = _mm_load_si128( (__m128i*) &LATEST[i][j] ); __m128i ref = _mm_load_si128( (__m128i*) &REF[i][j] ); __m128i local = _mm_load_si128( (__m128i*) &LOCAL[i][j] ); __m128i latref = _mm_cmpeq_epi8(latest, ref); // if latest == ref, latref is all ones __m128i tmp = _mm_cmpeq_epi8(local, ref); latref = _mm_andnot_si128( latref, tmp ); // (~latref) & localref // update = (latref & latest) | (~latref & local); tmp = _mm_and_si128(latref, latest); __m128i localBytes = _mm_andnot_si128(latref, local); tmp = _mm_or_si128(tmp, localBytes); _mm_stream_si128( (__m128i*) &LOCAL[i][j], tmp ); } #endif #ifdef SSE_MERGE_UNROLL // manually unroll this loop since gcc won't do it; ugh const char* latestP = LATEST[i]; const char* refP = REF[i]; char* localP = LOCAL[i]; for (int j = 0; j < PAGE_SIZE; j += sizeof(__m128i)) { __m128i latest = _mm_load_si128( (__m128i*) (latestP+j) ); __m128i ref = _mm_load_si128( (__m128i*) (refP+j) ); __m128i latEqRef = _mm_cmpeq_epi8(latest, ref); // if latest == ref, latref is all ones if ( unlikely(!_mm_testc_si128(latEqRef, isTrue)) ) { // some bytes differ __m128i local = _mm_load_si128( (__m128i*) (localP+j) ); __m128i localEqRef = _mm_cmpeq_epi8(local, 
ref); if ( _mm_testc_si128(localEqRef, isTrue) ) { // local == ref _mm_stream_si128( (__m128i*) (localP+j), latest ); } else { // (~latref) & localref, bytes where lat!=ref && local==ref __m128i latestMask = _mm_andnot_si128( latEqRef, localEqRef ); // new = (latestMask & latest) | (~latestMask & local); __m128i latestBytes = _mm_and_si128(latestMask, latest); __m128i localBytes = _mm_andnot_si128(latestMask, local); latestBytes = _mm_or_si128(latestBytes, localBytes); _mm_stream_si128( (__m128i*) (localP+j), latestBytes ); } } j += sizeof(__m128i); latest = _mm_load_si128( (__m128i*) (latestP+j) ); ref = _mm_load_si128( (__m128i*) (refP+j) ); latEqRef = _mm_cmpeq_epi8(latest, ref); // if latest == ref, latref is all ones if ( unlikely(!_mm_testc_si128(latEqRef, isTrue)) ) { // some bytes differ __m128i local = _mm_load_si128( (__m128i*) (localP+j) ); __m128i localEqRef = _mm_cmpeq_epi8(local, ref); if ( _mm_testc_si128(localEqRef, isTrue) ) { // local == ref _mm_stream_si128( (__m128i*) (localP+j), latest ); } else { // (~latref) & localref, bytes where lat!=ref && local==ref __m128i latestMask = _mm_andnot_si128( latEqRef, localEqRef ); // new = (latestMask & latest) | (~latestMask & local); __m128i latestBytes = _mm_and_si128(latestMask, latest); __m128i localBytes = _mm_andnot_si128(latestMask, local); latestBytes = _mm_or_si128(latestBytes, localBytes); _mm_stream_si128( (__m128i*) (localP+j), latestBytes ); } } j += sizeof(__m128i); latest = _mm_load_si128( (__m128i*) (latestP+j) ); ref = _mm_load_si128( (__m128i*) (refP+j) ); latEqRef = _mm_cmpeq_epi8(latest, ref); // if latest == ref, latref is all ones if ( unlikely(!_mm_testc_si128(latEqRef, isTrue)) ) { // some bytes differ __m128i local = _mm_load_si128( (__m128i*) (localP+j) ); __m128i localEqRef = _mm_cmpeq_epi8(local, ref); if ( _mm_testc_si128(localEqRef, isTrue) ) { // local == ref _mm_stream_si128( (__m128i*) (localP+j), latest ); } else { // (~latref) & localref, bytes where lat!=ref && 
local==ref __m128i latestMask = _mm_andnot_si128( latEqRef, localEqRef ); // new = (latestMask & latest) | (~latestMask & local); __m128i latestBytes = _mm_and_si128(latestMask, latest); __m128i localBytes = _mm_andnot_si128(latestMask, local); latestBytes = _mm_or_si128(latestBytes, localBytes); _mm_stream_si128( (__m128i*) (localP+j), latestBytes ); } } } #endif } }
/// Compare two memory regions of `size` bytes for equality, 64 bytes per
/// main-loop iteration.  _mm_testc_si128(zero16, v) sets CF iff v == 0,
/// so XOR + testc-against-zero is a 16-byte equality check (see
/// memequal_sse41).  After the 64-byte loop, a fallthrough switch handles
/// the remaining 16-byte chunks (order of chunk checks is irrelevant for
/// equality), then the final 0..15 bytes are handled with the same
/// fallthrough/goto tail as memequal_sse41.  Assumes unaligned loads are
/// acceptable and UInt16/UInt32/UInt64 are the project's fixed-width types.
inline bool memequal_sse41_wide(const char * p1, const char * p2, size_t size)
{
    __m128i zero16 = _mm_setzero_si128();
//    const char * p1_end = p1 + size;

    while (size >= 64)
    {
        if (_mm_testc_si128(
                zero16,
                _mm_xor_si128(
                    _mm_loadu_si128(&reinterpret_cast<const __m128i *>(p1)[0]),
                    _mm_loadu_si128(&reinterpret_cast<const __m128i *>(p2)[0])))
            && _mm_testc_si128(
                zero16,
                _mm_xor_si128(
                    _mm_loadu_si128(&reinterpret_cast<const __m128i *>(p1)[1]),
                    _mm_loadu_si128(&reinterpret_cast<const __m128i *>(p2)[1])))
            && _mm_testc_si128(
                zero16,
                _mm_xor_si128(
                    _mm_loadu_si128(&reinterpret_cast<const __m128i *>(p1)[2]),
                    _mm_loadu_si128(&reinterpret_cast<const __m128i *>(p2)[2])))
            && _mm_testc_si128(
                zero16,
                _mm_xor_si128(
                    _mm_loadu_si128(&reinterpret_cast<const __m128i *>(p1)[3]),
                    _mm_loadu_si128(&reinterpret_cast<const __m128i *>(p2)[3]))))
        {
            p1 += 64;
            p2 += 64;
            size -= 64;
        }
        else
            return false;
    }

    // After the loop size < 64, so size % 64 == size; fallthrough checks
    // chunk [2], then [1], then [0] as needed.
    switch ((size % 64) / 16)
    {
        case 3:
            if (!_mm_testc_si128(
                zero16,
                _mm_xor_si128(
                    _mm_loadu_si128(&reinterpret_cast<const __m128i *>(p1)[2]),
                    _mm_loadu_si128(&reinterpret_cast<const __m128i *>(p2)[2]))))
                return false;
        case 2:
            if (!_mm_testc_si128(
                zero16,
                _mm_xor_si128(
                    _mm_loadu_si128(&reinterpret_cast<const __m128i *>(p1)[1]),
                    _mm_loadu_si128(&reinterpret_cast<const __m128i *>(p2)[1]))))
                return false;
        case 1:
            if (!_mm_testc_si128(
                zero16,
                _mm_xor_si128(
                    _mm_loadu_si128(&reinterpret_cast<const __m128i *>(p1)[0]),
                    _mm_loadu_si128(&reinterpret_cast<const __m128i *>(p2)[0]))))
                return false;
    }

    p1 += (size % 64) / 16 * 16;
    p2 += (size % 64) / 16 * 16;

/*
    if (size >= 32)
    {
        if (_mm_testc_si128(
                zero16,
                _mm_xor_si128(
                    _mm_loadu_si128(&reinterpret_cast<const __m128i *>(p1)[0]),
                    _mm_loadu_si128(&reinterpret_cast<const __m128i *>(p2)[0])))
            & _mm_testc_si128(
                zero16,
                _mm_xor_si128(
                    _mm_loadu_si128(&reinterpret_cast<const __m128i *>(p1)[1]),
                    _mm_loadu_si128(&reinterpret_cast<const __m128i *>(p2)[1]))))
        {
            p1 += 32;
            p2 += 32;
            size -= 32;
        }
        else
            return false;
    }

    if (size >= 16)
    {
        if (_mm_testc_si128(
                zero16,
                _mm_xor_si128(
                    _mm_loadu_si128(&reinterpret_cast<const __m128i *>(p1)[0]),
                    _mm_loadu_si128(&reinterpret_cast<const __m128i *>(p2)[0]))))
        {
            p1 += 16;
            p2 += 16;
            size -= 16;
        }
        else
            return false;
    }*/

    // Final 0..15 bytes: deliberate fallthrough; case 12 compares bytes
    // 8..11 as a UInt32 then jumps to the 8-byte compare of bytes 0..7.
    switch (size % 16)
    {
        case 15: if (p1[14] != p2[14]) return false;
        case 14: if (p1[13] != p2[13]) return false;
        case 13: if (p1[12] != p2[12]) return false;
        case 12: if (reinterpret_cast<const UInt32 *>(p1)[2] == reinterpret_cast<const UInt32 *>(p2)[2]) goto l8; else return false;
        case 11: if (p1[10] != p2[10]) return false;
        case 10: if (p1[9] != p2[9]) return false;
        case 9:  if (p1[8] != p2[8]) return false;
        l8:
        case 8:  return reinterpret_cast<const UInt64 *>(p1)[0] == reinterpret_cast<const UInt64 *>(p2)[0];
        case 7:  if (p1[6] != p2[6]) return false;
        case 6:  if (p1[5] != p2[5]) return false;
        case 5:  if (p1[4] != p2[4]) return false;
        case 4:  return reinterpret_cast<const UInt32 *>(p1)[0] == reinterpret_cast<const UInt32 *>(p2)[0];
        case 3:  if (p1[2] != p2[2]) return false;
        case 2:  return reinterpret_cast<const UInt16 *>(p1)[0] == reinterpret_cast<const UInt16 *>(p2)[0];
        case 1:  if (p1[0] != p2[0]) return false;
        case 0:  break;
    }

    return true;
}
// True iff every bit of x is set.  PTEST's carry flag (testc) is 1 exactly
// when (~x & ones) == 0, i.e. when x is all ones.
KFR_SINTRIN bool bittestall(const i64sse& x)
{
    const auto ones = allonesvector(x);
    return _mm_testc_si128(*x, *ones) != 0;
}
// True iff every bit of x is set.  The f64 vector is reinterpreted as bytes
// first; testc's carry flag is 1 exactly when (~bits & ones) == 0.
KFR_SINTRIN bool bittestall(const f64sse& x)
{
    const auto as_bytes = bitcast<u8>(x);
    const auto ones = allonesvector(as_bytes);
    return _mm_testc_si128(*as_bytes, *ones) != 0;
}
/*
 * Return true iff the 16 bytes at p equal the 16 bytes at q.
 *
 * BUG FIX: the original returned _mm_testc_si128(x, y) != 0.  testc sets
 * CF iff (~x & y) == 0, i.e. it only checks that y's set bits are a SUBSET
 * of x's -- so e.g. p = sixteen 0xFF bytes compared "equal" to any q.
 * Use byte-wise compare + movemask (all 16 comparison lanes must be 0xFF),
 * which is a true equality test and needs only SSE2.
 */
bool is_same16(const uint8_t *p, const uint8_t *q)
{
    const __m128i x = _mm_loadu_si128((const __m128i*)p);
    const __m128i y = _mm_loadu_si128((const __m128i*)q);
    return _mm_movemask_epi8(_mm_cmpeq_epi8(x, y)) == 0xFFFF;
}
// Compiler CodeGen test: verifies that _mm_testc_si128 lowers to the
// llvm.x86.sse41.ptestc intrinsic and, at the assembly level, to a PTEST
// instruction.  The CHECK lines are FileCheck patterns and must not change.
int test_mm_testc_si128(__m128i x, __m128i y) {
  // CHECK-LABEL: test_mm_testc_si128
  // CHECK: call i32 @llvm.x86.sse41.ptestc
  // CHECK-ASM: ptest %xmm{{.*}}, %xmm{{.*}}
  return _mm_testc_si128(x, y);
}
// Compiler CodeGen test (IR-only variant): verifies that _mm_testc_si128
// lowers to the llvm.x86.sse41.ptestc intrinsic with <2 x i64> operands.
// The CHECK lines are FileCheck patterns and must not change.
int test_mm_testc_si128(__m128i x, __m128i y) {
  // CHECK-LABEL: test_mm_testc_si128
  // CHECK: call i32 @llvm.x86.sse41.ptestc(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
  return _mm_testc_si128(x, y);
}