示例#1
0
文件: riv.c 项目: medsec/riv
/*
 * Decrypts `ciphertext` into `plaintext` and verifies the supplied tag.
 *
 * Steps, as performed below:
 *   1. Hash header and ciphertext (DOMAIN_1) with clhash, run the result
 *      through cdms and XOR it with `tag` to obtain `iv`.
 *   2. Derive the counter start value with a second cdms call and run
 *      sct_mode over the ciphertext, writing into `plaintext`.
 *   3. Hash header and the produced plaintext (DOMAIN_0), run cdms, and
 *      compare the result (`iv_prime`) with `iv`.
 *
 * Returns 0 when both 16-byte halves of `iv` and `iv_prime` are bit-for-bit
 * equal, and -1 otherwise.
 *
 * NOTE: `plaintext` is handed to sct_mode as the output buffer before the
 * comparison runs, so callers must not use its contents when this function
 * returns a non-zero value.
 */
int decrypt_final(riv_context_t* ctx,
                  const unsigned char* ciphertext,
                  const unsigned long long ciphertext_length,
                  const unsigned char* header,
                  const unsigned long long header_length,
                  const unsigned char tag[TAGLEN],
                  unsigned char* plaintext)
{
    ALIGN(16) uint8_t iv[TAGLEN];
    ALIGN(16) uint8_t iv_prime[TAGLEN];

    clhash(&(ctx->prf_context),
        header, header_length, DOMAIN_1, ciphertext, ciphertext_length, iv);

    cdms(iv, iv, ctx->expanded_key);
    xor_bytes(iv, iv, tag, TAGLEN);

    cdms(iv_prime, iv, ctx->expanded_key);
    sct_mode(ctx, iv_prime, (const __m128i*)ciphertext,
        ciphertext_length, (__m128i*)plaintext);

    clhash(&(ctx->prf_context),
        header, header_length, DOMAIN_0, plaintext, ciphertext_length, iv_prime);

    cdms(iv_prime, iv_prime, ctx->expanded_key);

    /*
     * _mm_testc_si128(a, b) only reports whether every bit set in b is also
     * set in a, which is a subset test, not an equality test. Accumulate the
     * XOR of both halves instead and require the result to be all zero.
     * There is no data-dependent branch between the two halves.
     */
    const __m128i diff_lo = _mm_xor_si128(load(iv), load(iv_prime));
    const __m128i diff_hi = _mm_xor_si128(load((iv + BLOCKLEN)),
                                          load((iv_prime + BLOCKLEN)));
    const __m128i diff = _mm_or_si128(diff_lo, diff_hi);

    /* testz(diff, diff) is 1 iff diff == 0, so this yields 0 or -1. */
    return _mm_testz_si128(diff, diff) - 1;
}
示例#2
0
/* Runs _mm_testz_si128 and _mm_testc_si128 on every ordered pair drawn from
   four fixed 128-bit patterns and aborts as soon as a result differs from
   the value returned by make_ptestz / make_ptestc for the same operands.  */
static void
TEST (void)
{
  /* 32-bit lane values of the four operands.  */
  static const unsigned int lanes[4][4] =
    {
      { 0x11111111, 0x00000000, 0x00000000, 0x11111111 },
      { 0x00000000, 0x11111111, 0x11111111, 0x00000000 },
      { 0, 0, 0, 0 },
      { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }
    };
  union
    {
      __m128i x;
      unsigned int i[4];
    } val[4];
  int res[32];
  int a, b, k, n;

  for (k = 0; k < 4; k++)
    for (n = 0; n < 4; n++)
      val[k].i[n] = lanes[k][n];

  /* First pass: record the intrinsic results, two slots per operand pair.  */
  n = 0;
  for (b = 0; b < 4; b++)
    for (a = 0; a < 4; a++)
      {
	res[n] = _mm_testz_si128 (val[a].x, val[b].x);
	res[n + 1] = _mm_testc_si128 (val[a].x, val[b].x);
	n += 2;
      }

  /* Second pass: compare each recorded value with the reference helper.  */
  for (b = 0; b < 4; b++)
    for (a = 0; a < 4; a++)
      {
	n = 2 * (4 * b + a);
	if (res[n] != make_ptestz (val[a].x, val[b].x))
	  abort ();
	if (res[n + 1] != make_ptestc (val[a].x, val[b].x))
	  abort ();
      }

  /* Slots 2 and 3 hold the results for the pair (val[1], val[0]); re-run
     the intrinsics on that pair and require the same answers.  */
  if (res[2] != _mm_testz_si128 (val[1].x, val[0].x)
      || res[3] != _mm_testc_si128 (val[1].x, val[0].x))
    abort ();
}
// Substring search using MPSADBW: the first four needle bytes are matched
// against eight consecutive start positions at once, and each candidate is
// then confirmed by comparing the bytes that follow under a length mask.
//
// Returns the offset of the first confirmed candidate or std::string::npos.
//
// NOTE(review): the code subtracts 4 from needle_size and loads vectors at
// needle, needle + 4 and s + i (+ offset + 4), so it presumably expects
// needle_size >= 4 and readable padding after both buffers; the "max20" in
// the name suggests needles of at most 20 bytes. Confirm against the caller.
size_t sse4_strstr_unrolled_max20(const char* s, size_t n, const char* needle, size_t needle_size) {

    const __m128i needle_head = sse::load(needle);
    const __m128i needle_tail = sse::load(needle + 4);
    const __m128i tail_mask   = sse::mask_lower_bytes(needle_size - 4);
    const __m128i all_zero    = _mm_setzero_si128();

    for (size_t base = 0; base < n; base += 8) {

        // Eight 16-bit sums of absolute differences, one per start offset
        // 0..7; a sum of zero means the four leading bytes match exactly.
        const __m128i window = sse::load(s + base);
        const __m128i sad    = _mm_mpsadbw_epu8(window, needle_head, 0);
        const __m128i exact  = _mm_cmpeq_epi16(sad, all_zero);

        // Keep one bit per 16-bit lane, then walk the set bits in order.
        for (unsigned candidates = _mm_movemask_epi8(exact) & 0x5555;
             candidates != 0;
             candidates = bits::clear_leftmost_set(candidates)) {

            const auto offset = bits::get_first_bit_set(candidates)/2;

            const __m128i following = sse::load(s + base + offset + 4);
            const __m128i tail_eq   = _mm_cmpeq_epi8(following, needle_tail);

            // True when every byte selected by tail_mask compared equal.
            if (_mm_testc_si128(tail_eq, tail_mask)) {
                return base + offset;
            }
        }
    }

    return std::string::npos;
}
示例#4
0
// Returns true when the `size` bytes at p1 and the `size` bytes at p2 are
// identical. Whole 16-byte blocks are compared with SSE4.1 PTEST; the
// remaining size % 16 bytes are compared with a mix of byte, 16-, 32- and
// 64-bit loads that never touch memory beyond `size`.
inline bool memequal_sse41(const char * p1, const char * p2, size_t size)
{
	const __m128i all_zero = _mm_setzero_si128();

	for (size_t blocks_left = size / 16; blocks_left != 0; --blocks_left)
	{
		const __m128i lhs = _mm_loadu_si128(reinterpret_cast<const __m128i *>(p1));
		const __m128i rhs = _mm_loadu_si128(reinterpret_cast<const __m128i *>(p2));

		// testc(0, d) is 1 only when d has no bit set, i.e. lhs == rhs.
		if (!_mm_testc_si128(all_zero, _mm_xor_si128(lhs, rhs)))
			return false;

		p1 += 16;
		p2 += 16;
	}

	// Tail: each case checks the highest remaining byte(s) and falls
	// through until a wide load covers everything that is left.
	switch (size % 16)
	{
		case 15:
			if (p1[14] != p2[14])
				return false;
			// fallthrough
		case 14:
			if (p1[13] != p2[13])
				return false;
			// fallthrough
		case 13:
			if (p1[12] != p2[12])
				return false;
			// fallthrough
		case 12:
			// Bytes 8..11 as one 32-bit word, then bytes 0..7 as one 64-bit word.
			return reinterpret_cast<const UInt32 *>(p1)[2] == reinterpret_cast<const UInt32 *>(p2)[2]
				&& reinterpret_cast<const UInt64 *>(p1)[0] == reinterpret_cast<const UInt64 *>(p2)[0];
		case 11:
			if (p1[10] != p2[10])
				return false;
			// fallthrough
		case 10:
			if (p1[9] != p2[9])
				return false;
			// fallthrough
		case 9:
			if (p1[8] != p2[8])
				return false;
			// fallthrough
		case 8:
			return reinterpret_cast<const UInt64 *>(p1)[0] == reinterpret_cast<const UInt64 *>(p2)[0];
		case 7:
			if (p1[6] != p2[6])
				return false;
			// fallthrough
		case 6:
			if (p1[5] != p2[5])
				return false;
			// fallthrough
		case 5:
			if (p1[4] != p2[4])
				return false;
			// fallthrough
		case 4:
			return reinterpret_cast<const UInt32 *>(p1)[0] == reinterpret_cast<const UInt32 *>(p2)[0];
		case 3:
			if (p1[2] != p2[2])
				return false;
			// fallthrough
		case 2:
			return reinterpret_cast<const UInt16 *>(p1)[0] == reinterpret_cast<const UInt16 *>(p2)[0];
		case 1:
			return p1[0] == p2[0];
		case 0:
			break;
	}

	return true;
}
示例#5
0
文件: siv.c 项目: medsec/riv
/*
 * Decrypts `ciphertext` into `plaintext` using the tag as IV, then verifies
 * the tag by recomputing it from the header and the produced plaintext.
 *
 * The recomputed value is clhash(header, plaintext) under DOMAIN_0, passed
 * through aes_encrypt with the context's encryption key schedule.
 *
 * Returns 0 when the recomputed value is bit-for-bit equal to `tag`, and -1
 * otherwise.
 *
 * NOTE: `plaintext` is written by decrypt() before the comparison runs, so
 * callers must not use its contents when this function returns non-zero.
 */
int decrypt_final(riv_context_t* ctx,
                  const unsigned char* ciphertext,
                  const unsigned long long ciphertext_length,
                  const unsigned char* header,
                  const unsigned long long header_length,
                  const unsigned char tag[TAGLEN],
                  unsigned char* plaintext)
{
    const __m128i iv = loadu(tag);
    decrypt(ctx, iv, plaintext, ciphertext_length, ciphertext);

    ALIGN(16)
    uint8_t iv_prime[BLOCKLEN];

    clhash(&(ctx->prf_context),
        header, header_length, DOMAIN_0, plaintext, ciphertext_length, iv_prime);
    const __m128i iv_prime_ = aes_encrypt(load(iv_prime), ctx->expanced_enc_key);

    /*
     * _mm_testc_si128(iv, iv_prime_) is a subset test: it succeeds whenever
     * every bit set in iv_prime_ is also set in iv. Because iv comes straight
     * from the caller-supplied tag, an all-ones tag would pass for any
     * ciphertext. Compare for exact equality instead: the XOR of the two
     * values must be all zero.
     */
    const __m128i diff = _mm_xor_si128(iv, iv_prime_);

    /* testz(diff, diff) is 1 iff diff == 0, so this yields 0 or -1. */
    return _mm_testz_si128(diff, diff) - 1;
}
示例#6
0
#if defined(SSE_MERGE) || defined(SSE_MERGE_UNROLL)
/*
 * Merges one 16-byte chunk. For every byte where latest differs from ref and
 * local still equals ref, the byte from latest is stored into local; all
 * other bytes of local are kept. Nothing is stored when latest == ref for
 * the whole chunk.
 *
 * All three pointers are accessed with aligned 16-byte loads/stores, so they
 * must be 16-byte aligned. isTrue must be the all-ones vector.
 */
static inline void merge_sse_chunk(const char* latestP, const char* refP,
                                   char* localP, __m128i isTrue) {
  __m128i latest = _mm_load_si128( (const __m128i*) latestP );
  __m128i ref = _mm_load_si128( (const __m128i*) refP );
  __m128i latEqRef = _mm_cmpeq_epi8(latest, ref); // 0xFF in every byte where latest == ref

  // testc(v, allOnes) is 1 only when v is all ones, i.e. no byte differs
  if ( unlikely(!_mm_testc_si128(latEqRef, isTrue)) ) {
    // some bytes differ
    __m128i local = _mm_load_si128( (const __m128i*) localP );
    __m128i localEqRef = _mm_cmpeq_epi8(local, ref);
    if ( _mm_testc_si128(localEqRef, isTrue) ) {
      // local == ref for the whole chunk: take latest wholesale
      _mm_stream_si128( (__m128i*) localP, latest );
    } else {
      // (~latEqRef) & localEqRef: bytes where latest != ref && local == ref
      __m128i latestMask = _mm_andnot_si128( latEqRef, localEqRef );
      // new = (latestMask & latest) | (~latestMask & local)
      __m128i latestBytes = _mm_and_si128(latestMask, latest);
      __m128i localBytes = _mm_andnot_si128(latestMask, local);
      _mm_stream_si128( (__m128i*) localP, _mm_or_si128(latestBytes, localBytes) );
    }
  }
}
#endif

/*
 * For every page i in [0, NUM_PAGES), copies into LOCAL[i] each byte that
 * changed between REF[i] and LATEST[i] and that LOCAL[i] has not modified
 * itself (local byte still equals the ref byte).
 *
 * Exactly one of BYTE_MERGE, WORD_MERGE, SSE_MERGE, SSE_MERGE_NOBRANCH or
 * SSE_MERGE_UNROLL is expected to be defined to select the implementation;
 * several of the sections declare the same local names and cannot be
 * combined. PREFETCH optionally prefetches the following pages.
 */
void merge() {
#if defined(SSE_MERGE) || defined(SSE_MERGE_UNROLL)
  __m128i isTrue = _mm_set1_epi16(-1); // all bits set
#endif

  for (int i = 0; i < NUM_PAGES; ++i) {
    //merge in everything thats different between the ref and the latest committed page (that we haven't touched)
    
#ifdef PREFETCH
    // Stop at the last page: indexing LATEST/REF past NUM_PAGES is out of
    // bounds, and skipping a prefetch hint never changes the result.
    for (int pages = 1; pages <= PREFETCH_PAGES && i + pages < NUM_PAGES; pages++) {
      for (int bpp = 0; bpp < PREFETCH_BYTES_PER_PAGE; bpp++) {
        __builtin_prefetch( &LATEST[i+pages][bpp], 0/*read*/, 3/*high temporal locality*/ );
        __builtin_prefetch( &REF[i+pages][bpp], 0/*read*/, 3/*high temporal locality*/ );
        // don't prefetch LOCAL since we generally don't need it
        //__builtin_prefetch( &LOCAL[i+pages][bpp], 1/*write*/, 3/*high temporal locality*/ );
      }
    }
#endif

#ifdef BYTE_MERGE
    const char* latest = LATEST[i];
    const char* ref = REF[i];
    char* local = LOCAL[i];
    for (int j = 0; j < PAGE_SIZE; ++j) {
      if ( unlikely(latest[j]!=ref[j] && local[j]==ref[j]) ){
        local[j] = latest[j];
      }
    }
#endif
#ifdef WORD_MERGE
    const uint64_t* latest = (const uint64_t*) LATEST[i];
    const uint64_t* ref = (const uint64_t*) REF[i];
    uint64_t* local = (uint64_t*) LOCAL[i];

    for (size_t j = 0; j < (PAGE_SIZE/sizeof(uint64_t)); ++j) {

      // check for diff at word granularity first
      if ( unlikely(latest[j]!=ref[j]) ) {
        if ( local[j] == ref[j] ) {
          local[j] = latest[j];

        } else {
          // have to do byte-wise comparison; take the ADDRESS of each word
          // (casting the word's value to a pointer would dereference page
          // data as if it were an address)
          const char* latestChar = (const char*) &latest[j];
          const char* refChar = (const char*) &ref[j];
          char* localChar = (char*) &local[j];
          for ( size_t k = 0; k < sizeof(uint64_t); k++ ) {
            if ( latestChar[k] != refChar[k] && localChar[k] == refChar[k] ) {
              localChar[k] = latestChar[k];
            }
          }
        }
      }

    }
#endif
#ifdef SSE_MERGE 
    const char* latestP = LATEST[i];
    const char* refP = REF[i];
    char* localP = LOCAL[i];

    for (int j = 0; j < PAGE_SIZE; j += sizeof(__m128i)) {
      merge_sse_chunk(latestP+j, refP+j, localP+j, isTrue);
    }
#endif
#ifdef SSE_MERGE_NOBRANCH
    for (int j = 0; j < PAGE_SIZE; j += sizeof(__m128i)) {
      __m128i latest = _mm_load_si128( (__m128i*) &LATEST[i][j] );
      __m128i ref = _mm_load_si128( (__m128i*) &REF[i][j] );
      __m128i local = _mm_load_si128( (__m128i*) &LOCAL[i][j] );
      __m128i latref = _mm_cmpeq_epi8(latest, ref); // if latest == ref, latref is all ones
      __m128i tmp = _mm_cmpeq_epi8(local, ref);
      latref = _mm_andnot_si128( latref, tmp ); // (~latref) & localref
      // update = (latref & latest) | (~latref & local);
      tmp = _mm_and_si128(latref, latest);
      __m128i localBytes = _mm_andnot_si128(latref, local);
      tmp = _mm_or_si128(tmp, localBytes);
      _mm_stream_si128( (__m128i*) &LOCAL[i][j], tmp );
    }
#endif
#ifdef SSE_MERGE_UNROLL
    // manually unroll this loop since gcc won't do it; ugh
    const char* latestP = LATEST[i];
    const char* refP = REF[i];
    char* localP = LOCAL[i];

    for (int j = 0; j < PAGE_SIZE; j += sizeof(__m128i)) {
      merge_sse_chunk(latestP+j, refP+j, localP+j, isTrue);

      // Three chunks per iteration; re-check the bound after each manual
      // advance so a page whose chunk count is not a multiple of three is
      // never read or written past its end.
      j += sizeof(__m128i);
      if (j >= PAGE_SIZE) break;
      merge_sse_chunk(latestP+j, refP+j, localP+j, isTrue);

      j += sizeof(__m128i);
      if (j >= PAGE_SIZE) break;
      merge_sse_chunk(latestP+j, refP+j, localP+j, isTrue);
    }
#endif


  }
}
示例#7
0
// Returns true when the `size` bytes at p1 and the `size` bytes at p2 are
// identical. Works in three stages: 64 bytes per iteration (four 16-byte
// blocks), then the remaining whole 16-byte blocks, then the last
// size % 16 bytes with byte, 16-, 32- and 64-bit loads that stay within
// `size`.
inline bool memequal_sse41_wide(const char * p1, const char * p2, size_t size)
{
	const __m128i all_zero = _mm_setzero_si128();

	while (size >= 64)
	{
		const __m128i * lhs = reinterpret_cast<const __m128i *>(p1);
		const __m128i * rhs = reinterpret_cast<const __m128i *>(p2);

		// Blocks are checked in ascending order, stopping at the first mismatch.
		for (size_t block = 0; block < 4; ++block)
		{
			const __m128i diff = _mm_xor_si128(_mm_loadu_si128(&lhs[block]), _mm_loadu_si128(&rhs[block]));

			// testc(0, d) is 1 only when d has no bit set.
			if (!_mm_testc_si128(all_zero, diff))
				return false;
		}

		p1 += 64;
		p2 += 64;
		size -= 64;
	}

	// Up to three whole 16-byte blocks are left; check them from the
	// highest index down to block 0.
	const size_t tail_blocks = (size % 64) / 16;

	for (size_t block = tail_blocks; block-- > 0;)
	{
		const __m128i diff = _mm_xor_si128(
			_mm_loadu_si128(&reinterpret_cast<const __m128i *>(p1)[block]),
			_mm_loadu_si128(&reinterpret_cast<const __m128i *>(p2)[block]));

		if (!_mm_testc_si128(all_zero, diff))
			return false;
	}

	p1 += tail_blocks * 16;
	p2 += tail_blocks * 16;

	// Tail: each case checks the highest remaining byte(s) and falls
	// through until a wide load covers everything that is left.
	switch (size % 16)
	{
		case 15:
			if (p1[14] != p2[14])
				return false;
			// fallthrough
		case 14:
			if (p1[13] != p2[13])
				return false;
			// fallthrough
		case 13:
			if (p1[12] != p2[12])
				return false;
			// fallthrough
		case 12:
			// Bytes 8..11 as one 32-bit word, then bytes 0..7 as one 64-bit word.
			return reinterpret_cast<const UInt32 *>(p1)[2] == reinterpret_cast<const UInt32 *>(p2)[2]
				&& reinterpret_cast<const UInt64 *>(p1)[0] == reinterpret_cast<const UInt64 *>(p2)[0];
		case 11:
			if (p1[10] != p2[10])
				return false;
			// fallthrough
		case 10:
			if (p1[9] != p2[9])
				return false;
			// fallthrough
		case 9:
			if (p1[8] != p2[8])
				return false;
			// fallthrough
		case 8:
			return reinterpret_cast<const UInt64 *>(p1)[0] == reinterpret_cast<const UInt64 *>(p2)[0];
		case 7:
			if (p1[6] != p2[6])
				return false;
			// fallthrough
		case 6:
			if (p1[5] != p2[5])
				return false;
			// fallthrough
		case 5:
			if (p1[4] != p2[4])
				return false;
			// fallthrough
		case 4:
			return reinterpret_cast<const UInt32 *>(p1)[0] == reinterpret_cast<const UInt32 *>(p2)[0];
		case 3:
			if (p1[2] != p2[2])
				return false;
			// fallthrough
		case 2:
			return reinterpret_cast<const UInt16 *>(p1)[0] == reinterpret_cast<const UInt16 *>(p2)[0];
		case 1:
			return p1[0] == p2[0];
		case 0:
			break;
	}

	return true;
}
示例#8
0
文件: logical.hpp 项目: dlevin256/kfr
// True when every bit of x is set: PTEST's carry result is 1 only if all bits
// set in the second operand (here the all-ones vector) are also set in x.
KFR_SINTRIN bool bittestall(const i64sse& x)
{
    const int carry = _mm_testc_si128(*x, *allonesvector(x));
    return carry != 0;
}
示例#9
0
文件: logical.hpp 项目: dlevin256/kfr
// True when every bit of x is set. The vector is bit-cast to u8 lanes first
// so it can be fed to the integer PTEST intrinsic; the carry result is 1 only
// if all bits set in the all-ones vector are also set in the first operand.
KFR_SINTRIN bool bittestall(const f64sse& x)
{
    const int carry = _mm_testc_si128(*bitcast<u8>(x), *allonesvector(bitcast<u8>(x)));
    return carry != 0;
}
示例#10
0
文件: same.cpp 项目: herumi/misc
// Returns true when the 16 bytes at p and the 16 bytes at q are identical.
// Both pointers may be unaligned; exactly 16 bytes are read from each.
bool is_same16(const uint8_t *p, const uint8_t *q)
{
	const __m128i x = _mm_loadu_si128((const __m128i*)p);
	const __m128i y = _mm_loadu_si128((const __m128i*)q);
	// _mm_testc_si128(x, y) only tells whether every bit set in y is also set
	// in x (so an all-ones x would "match" anything). Compare byte-wise and
	// require all 16 lanes to be equal instead.
	return _mm_movemask_epi8(_mm_cmpeq_epi8(x, y)) == 0xFFFF;
}
示例#11
0
// Codegen test for _mm_testc_si128: the directives in the body expect the
// call to lower to the llvm.x86.sse41.ptestc intrinsic in IR and to a ptest
// instruction on two xmm registers in the assembly run.
int test_mm_testc_si128(__m128i x, __m128i y) {
  // CHECK-LABEL: test_mm_testc_si128
  // CHECK: call i32 @llvm.x86.sse41.ptestc
  // CHECK-ASM: ptest %xmm{{.*}}, %xmm{{.*}}
  return _mm_testc_si128(x, y);
}
示例#12
0
// Codegen test for _mm_testc_si128: the directives in the body expect the
// call to lower to the llvm.x86.sse41.ptestc intrinsic taking two <2 x i64>
// operands.
int test_mm_testc_si128(__m128i x, __m128i y) {
  // CHECK-LABEL: test_mm_testc_si128
  // CHECK: call i32 @llvm.x86.sse41.ptestc(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
  return _mm_testc_si128(x, y);
}