Example #1
static bool elhash_hash(
	elhash_return_value_t* ret,
	node const* full_nodes,
	elhash_light_t const light,
	uint64_t full_size,
	elhash_h256_t const header_hash,
	uint64_t const nonce
)
{
	if (full_size % MIX_WORDS != 0) {
		return false;
	}

	// pack hash and nonce together into first 40 bytes of s_mix
	assert(sizeof(node) * 8 == 512);
	node s_mix[MIX_NODES + 1];
	memcpy(s_mix[0].bytes, &header_hash, 32);
	fix_endian64(s_mix[0].double_words[4], nonce);

	// compute sha3-512 hash and replicate across mix
	SHA3_512(s_mix->bytes, s_mix->bytes, 40);
	fix_endian_arr32(s_mix[0].words, 16);

	node* const mix = s_mix + 1;
	for (uint32_t w = 0; w != MIX_WORDS; ++w) {
		mix->words[w] = s_mix[0].words[w % NODE_WORDS];
	}

	unsigned const page_size = sizeof(uint32_t) * MIX_WORDS;
	unsigned const num_full_pages = (unsigned) (full_size / page_size);

	for (unsigned i = 0; i != ELHASH_ACCESSES; ++i) {
		uint32_t const index = fnv_hash(s_mix->words[0] ^ i, mix->words[i % MIX_WORDS]) % num_full_pages;

		for (unsigned n = 0; n != MIX_NODES; ++n) {
			node const* dag_node;
			node tmp_node; // declared outside the branch so dag_node remains valid below
			if (full_nodes) {
				dag_node = &full_nodes[MIX_NODES * index + n];
			} else {
				elhash_calculate_dag_item(&tmp_node, index * MIX_NODES + n, light);
				dag_node = &tmp_node;
			}

#if defined(_M_X64) && ENABLE_SSE
			{
				__m128i fnv_prime = _mm_set1_epi32(FNV_PRIME);
				__m128i xmm0 = _mm_mullo_epi32(fnv_prime, mix[n].xmm[0]);
				__m128i xmm1 = _mm_mullo_epi32(fnv_prime, mix[n].xmm[1]);
				__m128i xmm2 = _mm_mullo_epi32(fnv_prime, mix[n].xmm[2]);
				__m128i xmm3 = _mm_mullo_epi32(fnv_prime, mix[n].xmm[3]);
				mix[n].xmm[0] = _mm_xor_si128(xmm0, dag_node->xmm[0]);
				mix[n].xmm[1] = _mm_xor_si128(xmm1, dag_node->xmm[1]);
				mix[n].xmm[2] = _mm_xor_si128(xmm2, dag_node->xmm[2]);
				mix[n].xmm[3] = _mm_xor_si128(xmm3, dag_node->xmm[3]);
			}
#else
			{
				for (unsigned w = 0; w != NODE_WORDS; ++w) {
					mix[n].words[w] = fnv_hash(mix[n].words[w], dag_node->words[w]);
				}
			}
#endif
		}

	}

	// compress mix
	for (uint32_t w = 0; w != MIX_WORDS; w += 4) {
		uint32_t reduction = mix->words[w + 0];
		reduction = reduction * FNV_PRIME ^ mix->words[w + 1];
		reduction = reduction * FNV_PRIME ^ mix->words[w + 2];
		reduction = reduction * FNV_PRIME ^ mix->words[w + 3];
		mix->words[w / 4] = reduction;
	}

	fix_endian_arr32(mix->words, MIX_WORDS / 4);
	memcpy(&ret->mix_hash, mix->bytes, 32);
	// final Keccak hash
	SHA3_256(&ret->result, s_mix->bytes, 64 + 32); // Keccak-256(s + compressed_mix)
	return true;
}
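For reference, the mix loop and the compression loop in Example #1 rely on the same FNV-style mixing step. A minimal sketch of that helper, consistent with the "reduction * FNV_PRIME ^ x" pattern above (the project's own header defines the real fnv_hash and FNV_PRIME), could look like this:

#include <stdint.h>

#define FNV_PRIME 0x01000193U

static uint32_t fnv_hash(uint32_t const x, uint32_t const y)
{
	return x * FNV_PRIME ^ y;
}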
Example #2
static inline void xor_salsa8_sse2(__m128i B[4], const __m128i Bx[4])
{
	__m128i X0, X1, X2, X3;
	__m128i T;
	int i;

	X0 = B[0] = _mm_xor_si128(B[0], Bx[0]);
	X1 = B[1] = _mm_xor_si128(B[1], Bx[1]);
	X2 = B[2] = _mm_xor_si128(B[2], Bx[2]);
	X3 = B[3] = _mm_xor_si128(B[3], Bx[3]);

	for (i = 0; i < 8; i += 2) {
		/* Operate on "columns". */
		T = _mm_add_epi32(X0, X3);
		X1 = _mm_xor_si128(X1, _mm_slli_epi32(T, 7));
		X1 = _mm_xor_si128(X1, _mm_srli_epi32(T, 25));
		T = _mm_add_epi32(X1, X0);
		X2 = _mm_xor_si128(X2, _mm_slli_epi32(T, 9));
		X2 = _mm_xor_si128(X2, _mm_srli_epi32(T, 23));
		T = _mm_add_epi32(X2, X1);
		X3 = _mm_xor_si128(X3, _mm_slli_epi32(T, 13));
		X3 = _mm_xor_si128(X3, _mm_srli_epi32(T, 19));
		T = _mm_add_epi32(X3, X2);
		X0 = _mm_xor_si128(X0, _mm_slli_epi32(T, 18));
		X0 = _mm_xor_si128(X0, _mm_srli_epi32(T, 14));

		/* Rearrange data. */
		X1 = _mm_shuffle_epi32(X1, 0x93);
		X2 = _mm_shuffle_epi32(X2, 0x4E);
		X3 = _mm_shuffle_epi32(X3, 0x39);

		/* Operate on "rows". */
		T = _mm_add_epi32(X0, X1);
		X3 = _mm_xor_si128(X3, _mm_slli_epi32(T, 7));
		X3 = _mm_xor_si128(X3, _mm_srli_epi32(T, 25));
		T = _mm_add_epi32(X3, X0);
		X2 = _mm_xor_si128(X2, _mm_slli_epi32(T, 9));
		X2 = _mm_xor_si128(X2, _mm_srli_epi32(T, 23));
		T = _mm_add_epi32(X2, X3);
		X1 = _mm_xor_si128(X1, _mm_slli_epi32(T, 13));
		X1 = _mm_xor_si128(X1, _mm_srli_epi32(T, 19));
		T = _mm_add_epi32(X1, X2);
		X0 = _mm_xor_si128(X0, _mm_slli_epi32(T, 18));
		X0 = _mm_xor_si128(X0, _mm_srli_epi32(T, 14));

		/* Rearrange data. */
		X1 = _mm_shuffle_epi32(X1, 0x39);
		X2 = _mm_shuffle_epi32(X2, 0x4E);
		X3 = _mm_shuffle_epi32(X3, 0x93);
	}

	B[0] = _mm_add_epi32(B[0], X0);
	B[1] = _mm_add_epi32(B[1], X1);
	B[2] = _mm_add_epi32(B[2], X2);
	B[3] = _mm_add_epi32(B[3], X3);
}
Example #3
// Simple quantization
static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
                             int n, const VP8Matrix* const mtx) {
  const __m128i max_coeff_2047 = _mm_set1_epi16(2047);
  const __m128i zero = _mm_set1_epi16(0);
  __m128i sign0, sign8;
  __m128i coeff0, coeff8;
  __m128i out0, out8;
  __m128i packed_out;

  // Load all inputs.
  // TODO(cduvivier): Make variable declarations and allocations aligned so that
  //                  we can use _mm_load_si128 instead of _mm_loadu_si128.
  __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
  __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
  const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[0]);
  const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[8]);
  const __m128i iq0 = _mm_loadu_si128((__m128i*)&mtx->iq_[0]);
  const __m128i iq8 = _mm_loadu_si128((__m128i*)&mtx->iq_[8]);
  const __m128i bias0 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]);
  const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]);
  const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]);
  const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]);
  const __m128i zthresh0 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[0]);
  const __m128i zthresh8 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[8]);

  // sign(in) = in >> 15  (0x0000 if positive, 0xffff if negative)
  sign0 = _mm_srai_epi16(in0, 15);
  sign8 = _mm_srai_epi16(in8, 15);

  // coeff = abs(in) = (in ^ sign) - sign
  coeff0 = _mm_xor_si128(in0, sign0);
  coeff8 = _mm_xor_si128(in8, sign8);
  coeff0 = _mm_sub_epi16(coeff0, sign0);
  coeff8 = _mm_sub_epi16(coeff8, sign8);

  // coeff = abs(in) + sharpen
  coeff0 = _mm_add_epi16(coeff0, sharpen0);
  coeff8 = _mm_add_epi16(coeff8, sharpen8);

  // if (coeff > 2047) coeff = 2047
  coeff0 = _mm_min_epi16(coeff0, max_coeff_2047);
  coeff8 = _mm_min_epi16(coeff8, max_coeff_2047);

  // out = (coeff * iQ + B) >> QFIX;
  {
    // doing calculations with 32b precision (QFIX=17)
    // out = (coeff * iQ)
    __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
    __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
    __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
    __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
    __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
    __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
    // expand bias from 16b to 32b
    __m128i bias_00 = _mm_unpacklo_epi16(bias0, zero);
    __m128i bias_04 = _mm_unpackhi_epi16(bias0, zero);
    __m128i bias_08 = _mm_unpacklo_epi16(bias8, zero);
    __m128i bias_12 = _mm_unpackhi_epi16(bias8, zero);
    // out = (coeff * iQ + B)
    out_00 = _mm_add_epi32(out_00, bias_00);
    out_04 = _mm_add_epi32(out_04, bias_04);
    out_08 = _mm_add_epi32(out_08, bias_08);
    out_12 = _mm_add_epi32(out_12, bias_12);
    // out = (coeff * iQ + B) >> QFIX;
    out_00 = _mm_srai_epi32(out_00, QFIX);
    out_04 = _mm_srai_epi32(out_04, QFIX);
    out_08 = _mm_srai_epi32(out_08, QFIX);
    out_12 = _mm_srai_epi32(out_12, QFIX);
    // pack result as 16b
    out0 = _mm_packs_epi32(out_00, out_04);
    out8 = _mm_packs_epi32(out_08, out_12);
  }

  // get sign back (if (sign[j]) out_n = -out_n)
  out0 = _mm_xor_si128(out0, sign0);
  out8 = _mm_xor_si128(out8, sign8);
  out0 = _mm_sub_epi16(out0, sign0);
  out8 = _mm_sub_epi16(out8, sign8);

  // in = out * Q
  in0 = _mm_mullo_epi16(out0, q0);
  in8 = _mm_mullo_epi16(out8, q8);

  // if (coeff <= mtx->zthresh_) {in=0; out=0;}
  {
    __m128i cmp0 = _mm_cmpgt_epi16(coeff0, zthresh0);
    __m128i cmp8 = _mm_cmpgt_epi16(coeff8, zthresh8);
    in0 = _mm_and_si128(in0, cmp0);
    in8 = _mm_and_si128(in8, cmp8);
    _mm_storeu_si128((__m128i*)&in[0], in0);
    _mm_storeu_si128((__m128i*)&in[8], in8);
    out0 = _mm_and_si128(out0, cmp0);
    out8 = _mm_and_si128(out8, cmp8);
  }

  // zigzag the output before storing it.
  //
  // The zigzag pattern can almost be reproduced with a small sequence of
  // shuffles. After it, we only need to swap the 7th (ending up in third
  // position instead of twelfth) and 8th values.
  {
    __m128i outZ0, outZ8;
    outZ0 = _mm_shufflehi_epi16(out0,  _MM_SHUFFLE(2, 1, 3, 0));
    outZ0 = _mm_shuffle_epi32  (outZ0, _MM_SHUFFLE(3, 1, 2, 0));
    outZ0 = _mm_shufflehi_epi16(outZ0, _MM_SHUFFLE(3, 1, 0, 2));
    outZ8 = _mm_shufflelo_epi16(out8,  _MM_SHUFFLE(3, 0, 2, 1));
    outZ8 = _mm_shuffle_epi32  (outZ8, _MM_SHUFFLE(3, 1, 2, 0));
    outZ8 = _mm_shufflelo_epi16(outZ8, _MM_SHUFFLE(1, 3, 2, 0));
    _mm_storeu_si128((__m128i*)&out[0], outZ0);
    _mm_storeu_si128((__m128i*)&out[8], outZ8);
    packed_out = _mm_packs_epi16(outZ0, outZ8);
  }
  {
    const int16_t outZ_12 = out[12];
    const int16_t outZ_3 = out[3];
    out[3] = outZ_12;
    out[12] = outZ_3;
  }

  // detect if all 'out' values are zeroes or not
  {
    int32_t tmp[4];
    _mm_storeu_si128((__m128i*)tmp, packed_out);
    if (n) {
      tmp[0] &= ~0xff;
    }
    return (tmp[3] || tmp[2] || tmp[1] || tmp[0]);
  }
}
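Note on the quantizer above: the sign0/sign8 sequence is the standard branchless two's-complement absolute value. A scalar sketch of what the _mm_srai_epi16 / _mm_xor_si128 / _mm_sub_epi16 steps compute per 16-bit lane:

#include <stdint.h>

static int16_t abs16_branchless(int16_t in) {
  const int16_t sign = in >> 15;         // 0x0000 if positive, 0xffff if negative
  return (int16_t)((in ^ sign) - sign);  // flips the bits and adds 1 only when negative
}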
Example #4
__m128i operator^(sse_vector a, sse_vector b) {
    return _mm_xor_si128(a.v, b.v);
}
int bit_vec_filter_m128_sse11(uint8_t *read_vec, uint8_t *ref_vec, int length,
		int max_error) {
	const __m128i zero_mask = _mm_set1_epi8(0x00);
	const __m128i one_mask = _mm_set1_epi8(0xff);

	int total_byte = (length - 1) / BYTE_BASE_NUM11 + 1;

	int total_difference = 0;

	//Start iteration
	int i, j;
	//read data
	__m128i prev_read_XMM = _mm_set1_epi8(0x0);
	__m128i curr_read_XMM = *((__m128i *) (read_vec));
	//ref data
	__m128i prev_ref_XMM = _mm_set1_epi8(0x0);
	__m128i curr_ref_XMM = *((__m128i *) (ref_vec));

	__m128i read_XMM;
	__m128i ref_XMM;
	__m128i temp_diff_XMM;
	__m128i diff_XMM;
	__m128i mask;
	for (i = 0; i < total_byte; i += SSE_BYTE_NUM) {

		curr_read_XMM = *((__m128i *) (read_vec + i));
		curr_ref_XMM = *((__m128i *) (ref_vec + i));

		diff_XMM = _mm_xor_si128(curr_read_XMM, curr_ref_XMM);
		diff_XMM = xor11complement_sse(diff_XMM);
		

		if (i + SSE_BYTE_NUM >= total_byte) {
			if (length % SSE_BASE_NUM11) {
				mask = _mm_load_si128(
						(__m128i *) (MASK_SSE_END11
								+ (length % SSE_BASE_NUM11) * SSE_BYTE_NUM));
				diff_XMM = _mm_and_si128(mask, diff_XMM);
			}
		}

		for (j = 1; j <= max_error; j++) {
			//Right shift read
			read_XMM = shift_right_sse11(prev_read_XMM, curr_read_XMM, j);
			
			temp_diff_XMM = _mm_xor_si128(read_XMM, curr_ref_XMM);
			temp_diff_XMM = xor11complement_sse(temp_diff_XMM);
			
			if (i == 0) {
				mask = _mm_load_si128(
						(__m128i *) (MASK_SSE_BEG11 + (j - 1) * SSE_BYTE_NUM));

				temp_diff_XMM = _mm_and_si128(mask, temp_diff_XMM);
			}
			if (i + SSE_BYTE_NUM >= total_byte) {
				if (length % SSE_BASE_NUM11) {
					mask = _mm_load_si128(
							(__m128i *) (MASK_SSE_END11
									+ (length % SSE_BASE_NUM11) * SSE_BYTE_NUM));
					temp_diff_XMM = _mm_and_si128(mask, temp_diff_XMM);
				}
			}

			diff_XMM = _mm_and_si128(diff_XMM, temp_diff_XMM);

			//Right shift ref
			ref_XMM = shift_right_sse11(prev_ref_XMM, curr_ref_XMM, j);
			
			temp_diff_XMM = _mm_xor_si128(curr_read_XMM, ref_XMM);
			temp_diff_XMM = xor11complement_sse(temp_diff_XMM);
			
			if (i == 0) {
				mask = _mm_load_si128(
						(__m128i *) (MASK_SSE_BEG11 + (j - 1) * SSE_BYTE_NUM));

				temp_diff_XMM = _mm_and_si128(mask, temp_diff_XMM);
			}
			if (i + SSE_BYTE_NUM >= total_byte) {
				if (length % SSE_BASE_NUM11) {
					mask = _mm_load_si128(
							(__m128i *) (MASK_SSE_END11
									+ (length % SSE_BASE_NUM11) * SSE_BYTE_NUM));
					temp_diff_XMM = _mm_and_si128(mask, temp_diff_XMM);
				}
			}

			diff_XMM = _mm_and_si128(diff_XMM, temp_diff_XMM);
		}

		total_difference += popcount11_m128i_sse(diff_XMM);

		prev_read_XMM = curr_read_XMM;
		prev_ref_XMM = curr_ref_XMM;

		if (total_difference > max_error)
			return 0;
	}

	return 1;
}
Example #6
static void my_YCbCr_to_RGB(stbi_uc *out, stbi_uc const *y, stbi_uc const *pcb, stbi_uc const *pcr, int count, int step)
{
   int i=0;

   if (step == 4) {
      // this is a fairly straightforward implementation and not super-optimized.
      __m128i signflip = _mm_set1_epi8(-0x80);
      __m128i cr_const0 = _mm_set1_epi16((short) ( 1.40200f*4096.0f));
      __m128i cr_const1 = _mm_set1_epi16((short) (-0.71414f*4096.0f));
      __m128i cb_const0 = _mm_set1_epi16((short) (-0.34414f*4096.0f));
      __m128i cb_const1 = _mm_set1_epi16((short) ( 1.77200f*4096.0f));
      __m128i y_bias = _mm_set1_epi16(8);
      __m128i xw = _mm_set1_epi16(255);

      for (; i+7 < count; i += 8) {
         // load
         __m128i zero = _mm_setzero_si128();
         __m128i y_bytes = _mm_loadl_epi64((__m128i *) (y+i));
         __m128i cr_bytes = _mm_loadl_epi64((__m128i *) (pcr+i));
         __m128i cb_bytes = _mm_loadl_epi64((__m128i *) (pcb+i));
         __m128i cr_bias = _mm_xor_si128(cr_bytes, signflip); // -128
         __m128i cb_bias = _mm_xor_si128(cb_bytes, signflip); // -128

         // unpack to short (and left-shift cr, cb by 8)
         __m128i yw  = _mm_unpacklo_epi8(y_bytes, zero);
         __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_bias);
         __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_bias);

         // color transform
         __m128i yws = _mm_slli_epi16(yw, 4);
         __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw);
         __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw);
         __m128i ywb = _mm_add_epi16(yws, y_bias);
         __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1);
         __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1);
         __m128i rws = _mm_add_epi16(cr0, ywb);
         __m128i gwt = _mm_add_epi16(cb0, ywb);
         __m128i bws = _mm_add_epi16(ywb, cb1);
         __m128i gws = _mm_add_epi16(gwt, cr1);

         // descale
         __m128i rw = _mm_srai_epi16(rws, 4);
         __m128i bw = _mm_srai_epi16(bws, 4);
         __m128i gw = _mm_srai_epi16(gws, 4);

         // back to byte, set up for transpose
         __m128i brb = _mm_packus_epi16(rw, bw);
         __m128i gxb = _mm_packus_epi16(gw, xw);

         // transpose to interleave channels
         __m128i t0 = _mm_unpacklo_epi8(brb, gxb);
         __m128i t1 = _mm_unpackhi_epi8(brb, gxb);
         __m128i o0 = _mm_unpacklo_epi16(t0, t1);
         __m128i o1 = _mm_unpackhi_epi16(t0, t1);

         // store
         _mm_storeu_si128((__m128i *) (out + 0), o0);
         _mm_storeu_si128((__m128i *) (out + 16), o1);
         out += 32;
      }
   }

   for (; i < count; ++i) {
      int y_fixed = (y[i] << 16) + 32768; // rounding
      int r,g,b;
      int cr = pcr[i] - 128;
      int cb = pcb[i] - 128;
      r = y_fixed + cr*float2fixed(1.40200f);
      g = y_fixed - cr*float2fixed(0.71414f) - cb*float2fixed(0.34414f);
      b = y_fixed                            + cb*float2fixed(1.77200f);
      r >>= 16;
      g >>= 16;
      b >>= 16;
      if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; }
      if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; }
      if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; }
      out[0] = (stbi_uc)r;
      out[1] = (stbi_uc)g;
      out[2] = (stbi_uc)b;
      out[3] = 255;
      out += step;
   }
}
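The scalar tail loop assumes a float2fixed helper that is not shown in this excerpt. A hypothetical stand-in consistent with its Q16 arithmetic (y_fixed is built with << 16 and the results are descaled with >> 16) would be:

// hypothetical definition matching the 16.16 fixed-point scale used above
#define float2fixed(x)  ((int) ((x) * 65536.0f + 0.5f))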
Example #7
/**
 * Performs an absorb operation for a single block (BLOCK_LEN_BLAKE2_SAFE_INT64
 * words of type uint64_t), using Blake2b's G function as the internal permutation
 *
 * @param state The current state of the sponge
 * @param in    The block to be absorbed (BLOCK_LEN_BLAKE2_SAFE_INT64 words)
 */
inline void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) {
    //XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with the current state
#if defined __AVX2__

    __m256i state_v[4], in_v[2];

    state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) );
    in_v   [0] = _mm256_loadu_si256( (__m256i*)(&in[0]) );
    state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) );
    in_v   [1] = _mm256_loadu_si256( (__m256i*)(&in[4]) );
    state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) );
    state_v[3] = _mm256_load_si256( (__m256i*)(&state[12]) );

    state_v[0] = _mm256_xor_si256( state_v[0], in_v[0] );
    state_v[1] = _mm256_xor_si256( state_v[1], in_v[1] );

    LYRA_12_ROUNDS_AVX2( state_v[0], state_v[1], state_v[2], state_v[3] );

    _mm256_store_si256( (__m256i*)&state[0], state_v[0] );
    _mm256_store_si256( (__m256i*)&state[4], state_v[1] );
    _mm256_store_si256( (__m256i*)&state[8], state_v[2] );
    _mm256_store_si256( (__m256i*)&state[12], state_v[3] );

#elif defined __AVX__

    __m128i state_v[4], in_v[4];

    state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) );
    state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) );
    state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) );
    state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) );

    in_v[0]    = _mm_load_si128( (__m128i*)(&in[0]) );
    in_v[1]    = _mm_load_si128( (__m128i*)(&in[2]) );
    in_v[2]    = _mm_load_si128( (__m128i*)(&in[4]) );
    in_v[3]    = _mm_load_si128( (__m128i*)(&in[6]) );

    _mm_store_si128( (__m128i*)(&state[0]),
                       _mm_xor_si128( state_v[0], in_v[0] ) );
    _mm_store_si128( (__m128i*)(&state[2]),
                       _mm_xor_si128( state_v[1], in_v[1] ) );
    _mm_store_si128( (__m128i*)(&state[4]),
                       _mm_xor_si128( state_v[2], in_v[2] ) );
    _mm_store_si128( (__m128i*)(&state[6]),
                        _mm_xor_si128( state_v[3], in_v[3] ) );

    //Applies the transformation f to the sponge's state
    blake2bLyra(state);

#else

    state[0] ^= in[0];
    state[1] ^= in[1];
    state[2] ^= in[2];
    state[3] ^= in[3];
    state[4] ^= in[4];
    state[5] ^= in[5];
    state[6] ^= in[6];
    state[7] ^= in[7];

    //Applies the transformation f to the sponge's state
    blake2bLyra(state);
#endif

}
Example #8
static __m128i aes192_keyexpand_2(__m128i key, __m128i key2)
{
    key = _mm_shuffle_epi32(key, 0xff);
    key2 = _mm_xor_si128(key2, _mm_slli_si128(key2, 4));
    return _mm_xor_si128(key, key2);
}
Example #9
/**
 * salsa20_8(B):
 * Apply the salsa20/8 core to the provided block.
 */
static void salsa20_8(__m128i B[4]) {
    __m128i X0, X1, X2, X3;
    __m128i T;
    size_t i;

    X0 = B[0];
    X1 = B[1];
    X2 = B[2];
    X3 = B[3];

    for (i = 0; i < 8; i += 2) {
        /* Operate on "columns". */
        T = _mm_add_epi32(X0, X3);
        X1 = _mm_xor_si128(X1, _mm_slli_epi32(T, 7));
        X1 = _mm_xor_si128(X1, _mm_srli_epi32(T, 25));
        T = _mm_add_epi32(X1, X0);
        X2 = _mm_xor_si128(X2, _mm_slli_epi32(T, 9));
        X2 = _mm_xor_si128(X2, _mm_srli_epi32(T, 23));
        T = _mm_add_epi32(X2, X1);
        X3 = _mm_xor_si128(X3, _mm_slli_epi32(T, 13));
        X3 = _mm_xor_si128(X3, _mm_srli_epi32(T, 19));
        T = _mm_add_epi32(X3, X2);
        X0 = _mm_xor_si128(X0, _mm_slli_epi32(T, 18));
        X0 = _mm_xor_si128(X0, _mm_srli_epi32(T, 14));

        /* Rearrange data. */
        X1 = _mm_shuffle_epi32(X1, 0x93);
        X2 = _mm_shuffle_epi32(X2, 0x4E);
        X3 = _mm_shuffle_epi32(X3, 0x39);

        /* Operate on "rows". */
        T = _mm_add_epi32(X0, X1);
        X3 = _mm_xor_si128(X3, _mm_slli_epi32(T, 7));
        X3 = _mm_xor_si128(X3, _mm_srli_epi32(T, 25));
        T = _mm_add_epi32(X3, X0);
        X2 = _mm_xor_si128(X2, _mm_slli_epi32(T, 9));
        X2 = _mm_xor_si128(X2, _mm_srli_epi32(T, 23));
        T = _mm_add_epi32(X2, X3);
        X1 = _mm_xor_si128(X1, _mm_slli_epi32(T, 13));
        X1 = _mm_xor_si128(X1, _mm_srli_epi32(T, 19));
        T = _mm_add_epi32(X1, X2);
        X0 = _mm_xor_si128(X0, _mm_slli_epi32(T, 18));
        X0 = _mm_xor_si128(X0, _mm_srli_epi32(T, 14));

        /* Rearrange data. */
        X1 = _mm_shuffle_epi32(X1, 0x39);
        X2 = _mm_shuffle_epi32(X2, 0x4E);
        X3 = _mm_shuffle_epi32(X3, 0x93);
    }

    B[0] = _mm_add_epi32(B[0], X0);
    B[1] = _mm_add_epi32(B[1], X1);
    B[2] = _mm_add_epi32(B[2], X2);
    B[3] = _mm_add_epi32(B[3], X3);
}
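A hypothetical caller for salsa20_8, treating the state as a raw 64-byte buffer. This only sketches the calling convention; scrypt's SSE2 code normally keeps its blocks in a pre-shuffled word order, so the layout inside the buffer is whatever the surrounding code expects.

#include <emmintrin.h>
#include <stdint.h>
#include <string.h>

static void salsa20_8_bytes(uint8_t block[64]) {
    __m128i B[4];
    memcpy(B, block, sizeof(B));   /* load the 64-byte block into four __m128i */
    salsa20_8(B);
    memcpy(block, B, sizeof(B));   /* store the mixed block back */
}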
Example #10
/**
 * Performs an absorb operation for a single block (BLOCK_LEN_INT64 words
 * of type uint64_t), using Blake2b's G function as the internal permutation
 *
 * @param state The current state of the sponge
 * @param in    The block to be absorbed (BLOCK_LEN_INT64 words)
 */
inline void absorbBlock(uint64_t *state, const uint64_t *in) {
#if defined __AVX2__

    __m256i state_v[4], in_v[3];

    // only state is guaranteed aligned 256
    state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) );
    in_v   [0] = _mm256_loadu_si256( (__m256i*)(&in[0]) );
    state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) );
    in_v   [1] = _mm256_loadu_si256( (__m256i*)(&in[4]) );
    state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) );
    in_v   [2] = _mm256_loadu_si256( (__m256i*)(&in[8]) ); 
    state_v[3] = _mm256_load_si256( (__m256i*)(&state[12]) );

    state_v[0] = _mm256_xor_si256( state_v[0], in_v[0] );
    state_v[1] = _mm256_xor_si256( state_v[1], in_v[1] );
    state_v[2] = _mm256_xor_si256( state_v[2], in_v[2] );

    LYRA_12_ROUNDS_AVX2( state_v[0], state_v[1], state_v[2], state_v[3] );

    _mm256_store_si256( (__m256i*)&state[0], state_v[0] );
    _mm256_store_si256( (__m256i*)&state[4], state_v[1] );
    _mm256_store_si256( (__m256i*)&state[8], state_v[2] );
    _mm256_store_si256( (__m256i*)&state[12], state_v[3] );

#elif defined __AVX__

    __m128i state_v[6], in_v[6];

    state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) );
    state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) );
    state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) );
    state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) );
    state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) );
    state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) );

    in_v[0]    = _mm_load_si128( (__m128i*)(&in[0]) );
    in_v[1]    = _mm_load_si128( (__m128i*)(&in[2]) );
    in_v[2]    = _mm_load_si128( (__m128i*)(&in[4]) );
    in_v[3]    = _mm_load_si128( (__m128i*)(&in[6]) );
    in_v[4]    = _mm_load_si128( (__m128i*)(&in[8]) );
    in_v[5]    = _mm_load_si128( (__m128i*)(&in[10]) );

// do blake2bLyra without init
// LYRA_ROUND_AVX2( state_v )
    _mm_store_si128( (__m128i*)(&state[0]),
                       _mm_xor_si128( state_v[0], in_v[0] ) );
    _mm_store_si128( (__m128i*)(&state[2]),
                       _mm_xor_si128( state_v[1], in_v[1] ) );
    _mm_store_si128( (__m128i*)(&state[4]),
                       _mm_xor_si128( state_v[2], in_v[2] ) );
    _mm_store_si128( (__m128i*)(&state[6]),
                       _mm_xor_si128( state_v[3], in_v[3] ) );
    _mm_store_si128( (__m128i*)(&state[8]),
                       _mm_xor_si128( state_v[4], in_v[4] ) );
    _mm_store_si128( (__m128i*)(&state[10]),
                       _mm_xor_si128( state_v[5], in_v[5] ) );

    //Applies the transformation f to the sponge's state
    blake2bLyra(state);

#else
    //XORs the first BLOCK_LEN_INT64 words of "in" with the current state
    state[0] ^= in[0];
    state[1] ^= in[1];
    state[2] ^= in[2];
    state[3] ^= in[3];
    state[4] ^= in[4];
    state[5] ^= in[5];
    state[6] ^= in[6];
    state[7] ^= in[7];
    state[8] ^= in[8];
    state[9] ^= in[9];
    state[10] ^= in[10];
    state[11] ^= in[11];

    //Applies the transformation f to the sponge's state
    blake2bLyra(state);

#endif
}
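Note that the AVX2 path loads state with aligned instructions but in with unaligned ones (see the comment above). A sketch of a state declaration compatible with those aligned accesses, assuming C11:

#include <stdint.h>

/* 16 words = 128 bytes; 32-byte alignment satisfies both the SSE/AVX
   (16-byte) and AVX2 (32-byte) aligned loads and stores used above */
_Alignas(32) static uint64_t sponge_state[16];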
Example #11
static int HafCpu_Histogram16Bins_DATA_U8
	(
		vx_uint32   * dstHist,
		vx_uint8      distOffset, 
		vx_uint8      distWindow,
		vx_uint32     srcWidth,
		vx_uint32     srcHeight,
		vx_uint8    * pSrcImage,
		vx_uint32     srcImageStrideInBytes
	)
{
	// offset: to convert the range from 0..255 to -128..127, because SSE does not have compare instructions for unsigned bytes
	// thresh: source threshold in -128..127 range
	__m128i offset = _mm_set1_epi8((char)0x80);
	__m128i T0 = _mm_set1_epi8((char)(((distOffset ? distOffset : distWindow) - 1) ^ 0x80));
	__m128i dT = _mm_set1_epi8((char)distWindow);
	__m128i onemask = _mm_set1_epi8((char)1);
	// process one pixel row at a time that counts "pixel < srcThreshold"
	vx_uint32 count[16] = { 0 };
	vx_uint8 * srcRow = pSrcImage;
	vx_uint32 width = (srcWidth + 15) >> 4;
	for (unsigned int y = 0; y < srcHeight; y++) {
		__m128i * src = (__m128i *)srcRow;
		__m128i count0 = _mm_set1_epi8((char)0);
		__m128i count1 = _mm_set1_epi8((char)0);
		__m128i count2 = _mm_set1_epi8((char)0);
		__m128i count3 = _mm_set1_epi8((char)0);
		for (unsigned int x = 0; x < width; x++) {
			__m128i pixels = _mm_load_si128(src++);
			pixels = _mm_xor_si128(pixels, offset);
			__m128i cmpout, Tnext = T0;
			// 0..3
			cmpout = _mm_cmpgt_epi8(pixels, Tnext);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			count0 = _mm_add_epi32(count0, cmpout);
			Tnext = _mm_add_epi8(Tnext, dT);
			cmpout = _mm_cmpgt_epi8(pixels, Tnext);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			cmpout = _mm_slli_epi64(cmpout, 16);
			count0 = _mm_add_epi32(count0, cmpout);
			Tnext = _mm_add_epi8(Tnext, dT);
			cmpout = _mm_cmpgt_epi8(pixels, Tnext);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			cmpout = _mm_slli_epi64(cmpout, 32);
			count0 = _mm_add_epi32(count0, cmpout);
			Tnext = _mm_add_epi8(Tnext, dT);
			cmpout = _mm_cmpgt_epi8(pixels, Tnext);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			cmpout = _mm_slli_epi64(cmpout, 48);
			count0 = _mm_add_epi32(count0, cmpout);
			// 4..7
			Tnext = _mm_add_epi8(Tnext, dT);
			cmpout = _mm_cmpgt_epi8(pixels, Tnext);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			count1 = _mm_add_epi32(count1, cmpout);
			Tnext = _mm_add_epi8(Tnext, dT);
			cmpout = _mm_cmpgt_epi8(pixels, Tnext);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			cmpout = _mm_slli_epi64(cmpout, 16);
			count1 = _mm_add_epi32(count1, cmpout);
			Tnext = _mm_add_epi8(Tnext, dT);
			cmpout = _mm_cmpgt_epi8(pixels, Tnext);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			cmpout = _mm_slli_epi64(cmpout, 32);
			count1 = _mm_add_epi32(count1, cmpout);
			Tnext = _mm_add_epi8(Tnext, dT);
			cmpout = _mm_cmpgt_epi8(pixels, Tnext);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			cmpout = _mm_slli_epi64(cmpout, 48);
			count1 = _mm_add_epi32(count1, cmpout);
			// 8..11
			Tnext = _mm_add_epi8(Tnext, dT);
			cmpout = _mm_cmpgt_epi8(pixels, Tnext);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			count2 = _mm_add_epi32(count2, cmpout);
			Tnext = _mm_add_epi8(Tnext, dT);
			cmpout = _mm_cmpgt_epi8(pixels, Tnext);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			cmpout = _mm_slli_epi64(cmpout, 16);
			count2 = _mm_add_epi32(count2, cmpout);
			Tnext = _mm_add_epi8(Tnext, dT);
			cmpout = _mm_cmpgt_epi8(pixels, Tnext);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			cmpout = _mm_slli_epi64(cmpout, 32);
			count2 = _mm_add_epi32(count2, cmpout);
			Tnext = _mm_add_epi8(Tnext, dT);
			cmpout = _mm_cmpgt_epi8(pixels, Tnext);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			cmpout = _mm_slli_epi64(cmpout, 48);
			count2 = _mm_add_epi32(count2, cmpout);
			// 12..15
			Tnext = _mm_add_epi8(Tnext, dT);
			cmpout = _mm_cmpgt_epi8(pixels, Tnext);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			count3 = _mm_add_epi32(count3, cmpout);
			Tnext = _mm_add_epi8(Tnext, dT);
			cmpout = _mm_cmpgt_epi8(pixels, Tnext);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			cmpout = _mm_slli_epi64(cmpout, 16);
			count3 = _mm_add_epi32(count3, cmpout);
			Tnext = _mm_add_epi8(Tnext, dT);
			cmpout = _mm_cmpgt_epi8(pixels, Tnext);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			cmpout = _mm_slli_epi64(cmpout, 32);
			count3 = _mm_add_epi32(count3, cmpout);
			Tnext = _mm_add_epi8(Tnext, dT);
			cmpout = _mm_cmpgt_epi8(pixels, Tnext);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			cmpout = _mm_slli_epi64(cmpout, 48);
			count3 = _mm_add_epi32(count3, cmpout);
		}
		srcRow += srcImageStrideInBytes;
		// move counts from count0..3 into count[]
		for (int i = 0; i < 4; i++) {
			count[ 0 + i] += M128I(count0).m128i_u16[i] + M128I(count0).m128i_u16[4 + i];
			count[ 4 + i] += M128I(count1).m128i_u16[i] + M128I(count1).m128i_u16[4 + i];
			count[ 8 + i] += M128I(count2).m128i_u16[i] + M128I(count2).m128i_u16[4 + i];
			count[12 + i] += M128I(count3).m128i_u16[i] + M128I(count3).m128i_u16[4 + i];
		}
	}
	// extract histogram from count
	if (distOffset == 0) {
		vx_uint32 last = (distWindow >= 16) ? srcWidth * srcHeight : count[15];
		for (int i = 14; i >= 0; i--) {
			count[i] = last - count[i];
			last -= count[i];
		}
		dstHist[0] = last;
		for (int i = 1; i < 16; i++)
			dstHist[i] = count[i - 1];
	}
	else {
		vx_uint32 last = srcWidth * srcHeight;
		for (int i = 15; i >= 0; i--) {
			count[i] = last - count[i];
			last -= count[i];
			dstHist[i] = count[i];
		}
	}
	return AGO_SUCCESS;
}
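The 0x80 offset applied to both pixels and thresholds above exists because SSE2 only has signed byte compares. A scalar illustration of the same order-preserving remap:

#include <stdint.h>

static int pixel_gt_thresh(uint8_t pixel, uint8_t thresh)
{
	// XOR with 0x80 maps 0..255 onto -128..127 while preserving order,
	// so a signed compare gives the same answer as the unsigned one
	return (int8_t)(pixel ^ 0x80) > (int8_t)(thresh ^ 0x80);
}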
Example #12
long garbleCircuit(GarbledCircuit *garbledCircuit, InputLabels inputLabels,
		OutputMap outputMap) {

	GarblingContext garblingContext;
	GarbledGate *garbledGate;
	GarbledTable *garbledTable;
	DKCipherContext dkCipherContext;
	const block *sched = ((block *) (dkCipherContext.K.rd_key));
	block val;

	block *A, *B, *plainText, *cipherText;
	block tweak;
	long i, j, rnds = 10;
	block blocks[4];
	block keys[4];
	long lsb0, lsb1;
	int input0, input1, output;
	srand_sse(time(NULL));

	startTime = RDTSC;

	createInputLabels(inputLabels, garbledCircuit->n);
	garbledCircuit->id = getFreshId();

	for (i = 0; i < 2 * garbledCircuit->n; i += 2) {
		garbledCircuit->wires[i / 2].label0 = inputLabels[i];
		garbledCircuit->wires[i / 2].label1 = inputLabels[i + 1];
	}
	garbledTable = garbledCircuit->garbledTable;
	garblingContext.gateIndex = 0;
	garblingContext.wireIndex = garbledCircuit->n + 1;
	block key = randomBlock();
	block rkey = randomBlock();
	AES_KEY KR;
	AES_set_encrypt_key((unsigned char *) &rkey, 128, &KR);
	const __m128i *sched2 = ((__m128i *) (KR.rd_key));
	garblingContext.R =
			xorBlocks(garbledCircuit->wires[0].label0, garbledCircuit->wires[0].label1);
	garbledCircuit->globalKey = key;
	DKCipherInit(&key, &(garblingContext.dkCipherContext));
	int tableIndex = 0;

	for (i = 0; i < garbledCircuit->q; i++) {
		garbledGate = &(garbledCircuit->garbledGates[i]);
		input0 = garbledGate->input0;
		input1 = garbledGate->input1;
		output = garbledGate->output;

#ifdef FREE_XOR
		if (garbledGate->type == XORGATE) {
			garbledCircuit->wires[output].label0 =
					xorBlocks(garbledCircuit->wires[input0].label0, garbledCircuit->wires[input1].label0);
			garbledCircuit->wires[output].label1 =
					xorBlocks(garbledCircuit->wires[input0].label1, garbledCircuit->wires[input1].label0);
			continue;
		}
#endif
		tweak = makeBlock(i, (long)0);
		lsb0 = getLSB(garbledCircuit->wires[input0].label0);
		lsb1 = getLSB(garbledCircuit->wires[input1].label0);

		block val = _mm_xor_si128(tweak, sched[0]);
		for (j = 1; j < rnds; j++)
			val = _mm_aesenc_si128(val, sched2[j]);
		garbledCircuit->wires[garbledGate->output].label0 =
				_mm_aesenclast_si128(val, sched[j]);

		garbledCircuit->wires[garbledGate->output].label1 =
				xorBlocks(garblingContext.R,
						garbledCircuit->wires[garbledGate->output].label0);
		block A0, A1, B0, B1;
		A0 = DOUBLE(garbledCircuit->wires[input0].label0);
		A1 = DOUBLE(garbledCircuit->wires[input0].label1);
		B0 = DOUBLE(DOUBLE(garbledCircuit->wires[input1].label0));
		B1 = DOUBLE(DOUBLE(garbledCircuit->wires[input1].label1));

		keys[0] = xorBlocks(xorBlocks(A0, B0) , tweak);
		keys[1] = xorBlocks(xorBlocks(A0,B1), tweak);
		keys[2] = xorBlocks(xorBlocks(A1, B0), tweak);
		keys[3] = xorBlocks(xorBlocks(A1, B1), tweak);

		block *temp[2];
		temp[0] = &garbledCircuit->wires[garbledGate->output].label0;
		temp[1] = &garbledCircuit->wires[garbledGate->output].label1;
		int bp = 0;
		blocks[0] =
				xorBlocks(keys[0], *(temp[(garbledGate->type & (1<<bp))>>bp]));
		bp++;
		blocks[1] =
				xorBlocks(keys[1], *(temp[(garbledGate->type & (1<<bp))>>bp]));
		bp++;
		blocks[2] =
				xorBlocks(keys[2], *(temp[(garbledGate->type & (1<<bp))>>bp]));
		bp++;
		blocks[3] =
				xorBlocks(keys[3], *(temp[(garbledGate->type & (1<<bp))>>bp]));

		write:

		AES_ecb_encrypt_blks_4(keys,  &(garblingContext.dkCipherContext.K));

		garbledTable[tableIndex].table[2 * lsb0 + lsb1] =
				xorBlocks(blocks[0], keys[0]);
		garbledTable[tableIndex].table[2 * lsb0 + 1 - lsb1] =
				xorBlocks(blocks[1], keys[1]);
		garbledTable[tableIndex].table[2 * (1 - lsb0) + lsb1] =
				xorBlocks(blocks[2], keys[2]);
		garbledTable[tableIndex].table[2 * (1 - lsb0) + (1 - lsb1)] =
				xorBlocks(blocks[3], keys[3]);

		tableIndex++;

	}
	for (i = 0; i < garbledCircuit->m; i++) {
		outputMap[2 * i] =
				garbledCircuit->wires[garbledCircuit->outputs[i]].label0;
		outputMap[2 * i + 1] =
				garbledCircuit->wires[garbledCircuit->outputs[i]].label1;
	}
	endTime = RDTSC;
	return (endTime - startTime);
}
Example #13
long garbleCircuit(GarbledCircuit *garbledCircuit, InputLabels inputLabels, OutputMap outputMap) {

	GarblingContext garblingContext;
	GarbledGate *garbledGate;
	GarbledTable *garbledTable;
	DKCipherContext dkCipherContext;
	const __m128i *sched = ((__m128i *)(dkCipherContext.K.rd_key));
	block val;

	block *A, *B, *plainText,*cipherText;
	block tweak;
	long a, b, i, j,rnds = 10;
	block blocks[4];
	block keys[4];
	long lsb0,lsb1;
	block keyToEncrypt;
	int input0, input1,output;
	srand_sse( time(NULL));

	startTime = RDTSC;

	createInputLabels(inputLabels, garbledCircuit->n);

	garbledCircuit->id = getFreshId();

	for(i=0;i<2*garbledCircuit->n;i+=2) {
		garbledCircuit->wires[i/2].id = i+1;
		garbledCircuit->wires[i/2].label0 = inputLabels[i];
		garbledCircuit->wires[i/2].label1 = inputLabels[i+1];
	}
	garbledTable = garbledCircuit->garbledTable;
	garblingContext.gateIndex = 0;
	garblingContext.wireIndex = garbledCircuit->n + 1;
	block key = randomBlock();
	block rkey = randomBlock();
	AES_KEY KR;
	AES_set_encrypt_key((unsigned char *) &rkey, 128, &KR);
	const __m128i *sched2 = ((__m128i *)(KR.rd_key));
	garblingContext.R = xorBlocks(garbledCircuit->wires[0].label0, garbledCircuit->wires[0].label1);
	garbledCircuit->globalKey = key;
	DKCipherInit(&key, &(garblingContext.dkCipherContext));
	int tableIndex = 0;

	for(i=0; i< garbledCircuit->q;i++) {
		garbledGate = &(garbledCircuit->garbledGates[i]);
		input0 = garbledGate->input0; input1 = garbledGate->input1;
		output = garbledGate->output;

#ifdef FREE_XOR
		if (garbledGate->type == XORGATE) {
			garbledCircuit->wires[output].label0 = xorBlocks(garbledCircuit->wires[input0].label0, garbledCircuit->wires[input1].label0);
			garbledCircuit->wires[output].label1 = xorBlocks(garbledCircuit->wires[input0].label1, garbledCircuit->wires[input1].label0);
			continue;
		}
#endif
		tweak = makeBlock(i, (long)0);
		lsb0 = getLSB(garbledCircuit->wires[input0].label0);
		lsb1 = getLSB(garbledCircuit->wires[input1].label0);
		char templ[20];
		char templ2[20];
		block val = _mm_xor_si128 (tweak, sched[0]);
		for (j=1; j<rnds; j++) val = _mm_aesenc_si128 (val,sched2[j]);
		*((block*)templ) = _mm_aesenclast_si128 (val, sched[j]);
		val = _mm_aesenclast_si128 (val, sched[j]);
		*((block *)templ2) = xorBlocks(*((block *)templ), garblingContext.R);

		TRUNCATE(templ);
		TRUNCATE(templ2);

		block *label0 = (block *)templ;
		block *label1 = (block *)templ2;
		garbledCircuit->wires[garbledGate->output].label0 = *((block*)templ);
		garbledCircuit->wires[garbledGate->output].label1 = *((block*)templ2);
		block A0, A1, B0, B1;
		A0 = DOUBLE(garbledCircuit->wires[input0].label0);
		A1 = DOUBLE(garbledCircuit->wires[input0].label1);
		B0 = DOUBLE(DOUBLE(garbledCircuit->wires[input1].label0));
		B1 = DOUBLE(DOUBLE(garbledCircuit->wires[input1].label1));

		keys[0] = xorBlocks(A0, B0);
		keys[0] = xorBlocks(keys[0], tweak);
		keys[1] = xorBlocks(A0,B1);
		keys[1] = xorBlocks(keys[1], tweak);
		keys[2] = xorBlocks(A1, B0);
		keys[2] = xorBlocks(keys[2], tweak);
		keys[3] = xorBlocks(A1, B1);
		keys[3] = xorBlocks(keys[3], tweak);

		if (garbledGate->type == ANDGATE) {

			blocks[0] = xorBlocks(keys[0], *label0);
			blocks[1] = xorBlocks(keys[1], *label0);
			blocks[2] = xorBlocks(keys[2], *label0);
			blocks[3] = xorBlocks(keys[3], *label1);
			goto write;
		}

		if (garbledGate->type == ORGATE) {

			blocks[0] = xorBlocks(keys[0], *label0);
			blocks[1] = xorBlocks(keys[1], *label1);
			blocks[2] = xorBlocks(keys[2], *label1);
			blocks[3] = xorBlocks(keys[3], *label1);
			goto write;

		}

		if (garbledGate->type == XORGATE) {

			blocks[0] = xorBlocks(keys[0], *label0);
			blocks[1] = xorBlocks(keys[1], *label1);
			blocks[2] = xorBlocks(keys[2], *label1);
			blocks[3] = xorBlocks(keys[3], *label0);
			goto write;

		}

		if (garbledGate->type == NOTGATE) {

			blocks[0] = xorBlocks(keys[0], *label1);
			blocks[1] = xorBlocks(keys[1], *label0);
			blocks[2] = xorBlocks(keys[2], *label1);
			blocks[3] = xorBlocks(keys[3], *label0);
			goto write;

		}
		write:
		AES_ecb_encrypt_blks(keys, 4, &(garblingContext.dkCipherContext.K));

		char toWrite[4][16];
		char **dest[4];

		*((block *) toWrite[0]) = xorBlocks(blocks[0], keys[0]);
		*((block *) toWrite[1]) = xorBlocks(blocks[1], keys[1]);
		*((block *) toWrite[2]) = xorBlocks(blocks[2], keys[2]);
		*((block *) toWrite[3]) = xorBlocks(blocks[3], keys[3]);

		short *cpsrc; short *cpdst;
		cpsrc = (short *)toWrite[0];
		cpdst = (short *)&garbledTable[tableIndex].table[2*lsb0 + lsb1];
		cpdst[0]=cpsrc[0];
		cpdst[1]=cpsrc[1];
		cpdst[2]=cpsrc[2];
		cpdst[3]=cpsrc[3];
		cpdst[4]=cpsrc[4];

		cpsrc = (short *)toWrite[1];
		cpdst = (short *)&garbledTable[tableIndex].table[2*(lsb0) + (1-lsb1)];
		cpdst[0]=cpsrc[0];
		cpdst[1]=cpsrc[1];
		cpdst[2]=cpsrc[2];
		cpdst[3]=cpsrc[3];
		cpdst[4]=cpsrc[4];

		cpsrc = (short *)toWrite[2];
		cpdst = (short *)&garbledTable[tableIndex].table[2*(1-lsb0) + (lsb1)];
		cpdst[0]=cpsrc[0];
		cpdst[1]=cpsrc[1];
		cpdst[2]=cpsrc[2];
		cpdst[3]=cpsrc[3];
		cpdst[4]=cpsrc[4];

		cpsrc = (short *)toWrite[3];
		cpdst = (short *)&garbledTable[tableIndex].table[2*(1-lsb0) + (1-lsb1)];
		cpdst[0]=cpsrc[0];
		cpdst[1]=cpsrc[1];
		cpdst[2]=cpsrc[2];
		cpdst[3]=cpsrc[3];
		cpdst[4]=cpsrc[4];

		tableIndex++;
	}
	for(i=0;i<garbledCircuit->m;i++) {
		outputMap[2*i] = garbledCircuit->wires[garbledCircuit->outputs[i]].label0;
		outputMap[2*i+1] = garbledCircuit->wires[garbledCircuit->outputs[i]].label1;
	}

	endTime = RDTSC;
	return (endTime - startTime);
}
Example #14
void InvMixColumns_sse(BYTE state[][4])
{
        __m128i stateSse = _mm_set_epi8(
                                gf_mul[state[0][3]][3],
                                gf_mul[state[0][2]][3],
                                gf_mul[state[0][1]][3],
                                gf_mul[state[0][0]][3],
                                gf_mul[state[0][3]][4],
                                gf_mul[state[0][2]][4],
                                gf_mul[state[0][1]][4],
                                gf_mul[state[0][0]][4],
                                gf_mul[state[0][3]][2],
                                gf_mul[state[0][2]][2],
                                gf_mul[state[0][1]][2],
                                gf_mul[state[0][0]][2],
                                gf_mul[state[0][3]][5],
                                gf_mul[state[0][2]][5],
                                gf_mul[state[0][1]][5],
                                gf_mul[state[0][0]][5]);
        __m128i step2 = _mm_set_epi8(
                                gf_mul[state[1][3]][4],
                                gf_mul[state[1][2]][4],
                                gf_mul[state[1][1]][4],
                                gf_mul[state[1][0]][4],
                                gf_mul[state[1][3]][2],
                                gf_mul[state[1][2]][2],
                                gf_mul[state[1][1]][2],
                                gf_mul[state[1][0]][2],
                                gf_mul[state[1][3]][5],
                                gf_mul[state[1][2]][5],
                                gf_mul[state[1][1]][5],
                                gf_mul[state[1][0]][5],
                                gf_mul[state[1][3]][3],
                                gf_mul[state[1][2]][3],
                                gf_mul[state[1][1]][3],
                                gf_mul[state[1][0]][3]
                                );
        __m128i step3 = _mm_set_epi8(
                                gf_mul[state[2][3]][2],
                                gf_mul[state[2][2]][2],
                                gf_mul[state[2][1]][2],
                                gf_mul[state[2][0]][2],
                                gf_mul[state[2][3]][5],
                                gf_mul[state[2][2]][5],
                                gf_mul[state[2][1]][5],
                                gf_mul[state[2][0]][5],
                                gf_mul[state[2][3]][3],
                                gf_mul[state[2][2]][3],
                                gf_mul[state[2][1]][3],
                                gf_mul[state[2][0]][3],
                                gf_mul[state[2][3]][4],
                                gf_mul[state[2][2]][4],
                                gf_mul[state[2][1]][4],
                                gf_mul[state[2][0]][4]
                                );
	__m128i step4 = _mm_set_epi8(
                                gf_mul[state[3][3]][5],
                                gf_mul[state[3][2]][5],
                                gf_mul[state[3][1]][5],
                                gf_mul[state[3][0]][5],
                                gf_mul[state[3][3]][3],
                                gf_mul[state[3][2]][3],
                                gf_mul[state[3][1]][3],
                                gf_mul[state[3][0]][3],
                                gf_mul[state[3][3]][4],
                                gf_mul[state[3][2]][4],
                                gf_mul[state[3][1]][4],
                                gf_mul[state[3][0]][4],
                                gf_mul[state[3][3]][2],
                                gf_mul[state[3][2]][2],
                                gf_mul[state[3][1]][2],
                                gf_mul[state[3][0]][2]
                                );
	stateSse = _mm_xor_si128(stateSse, step2);
	stateSse = _mm_xor_si128(stateSse, step3);
	stateSse = _mm_xor_si128(stateSse, step4);
	_mm_storeu_si128((__m128i *)state, stateSse);

}
Example #15
/**
 * Performs a reduced duplex operation for a single row, from the highest to
 * the lowest index, using the reduced-round Blake2b's G function as the
 * internal permutation
 *
 * @param state		The current state of the sponge
 * @param rowIn		Row to feed the sponge
 * @param rowOut	Row to receive the sponge's output
 */
inline void reducedDuplexRow1( uint64_t *state, uint64_t *rowIn,
                               uint64_t *rowOut, uint64_t nCols )
{
    uint64_t* ptrWordIn = rowIn;	//In Lyra2: pointer to prev
    uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
    int i;

#if defined __AVX2__
         __m256i state_v[4], in_v[3];
         state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) );
         state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) );
         state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) );
         state_v[3] = _mm256_load_si256( (__m256i*)(&state[12]) );
#endif

    for ( i = 0; i < nCols; i++ )
    {
#if defined __AVX2__

         in_v   [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[0]) );
         in_v   [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[4]) );
         in_v   [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[8]) );
 
         state_v[0] = _mm256_xor_si256( state_v[0], in_v[0] );
         state_v[1] = _mm256_xor_si256( state_v[1], in_v[1] );
         state_v[2] = _mm256_xor_si256( state_v[2], in_v[2] );

         LYRA_ROUND_AVX2( state_v[0], state_v[1], state_v[2], state_v[3] );

         _mm256_storeu_si256( (__m256i*)(&ptrWordOut[0]),
                              _mm256_xor_si256( state_v[0], in_v[0] ) );
         _mm256_storeu_si256( (__m256i*)(&ptrWordOut[4]),
                              _mm256_xor_si256( state_v[1], in_v[1] ) );
         _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]),
                              _mm256_xor_si256( state_v[2], in_v[2] ) );

#elif defined __AVX__

        __m128i state_v[6], in_v[6];

         state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) );
         state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) );
         state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) );
         state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) );
         state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) );
         state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) );

         in_v[0]    = _mm_load_si128( (__m128i*)(&ptrWordIn[0]) );
         in_v[1]    = _mm_load_si128( (__m128i*)(&ptrWordIn[2]) );
         in_v[2]    = _mm_load_si128( (__m128i*)(&ptrWordIn[4]) );
         in_v[3]    = _mm_load_si128( (__m128i*)(&ptrWordIn[6]) );
         in_v[4]    = _mm_load_si128( (__m128i*)(&ptrWordIn[8]) );
         in_v[5]    = _mm_load_si128( (__m128i*)(&ptrWordIn[10]) );

         _mm_store_si128( (__m128i*)(&state[0]),
                           _mm_xor_si128( state_v[0], in_v[0] ) );
         _mm_store_si128( (__m128i*)(&state[2]),
                           _mm_xor_si128( state_v[1], in_v[1] ) );
         _mm_store_si128( (__m128i*)(&state[4]),
                           _mm_xor_si128( state_v[2], in_v[2] ) );
         _mm_store_si128( (__m128i*)(&state[6]),
                           _mm_xor_si128( state_v[3], in_v[3] ) );
         _mm_store_si128( (__m128i*)(&state[8]),
                           _mm_xor_si128( state_v[4], in_v[4] ) );
         _mm_store_si128( (__m128i*)(&state[10]),
                           _mm_xor_si128( state_v[5], in_v[5] ) );

        //Applies the reduced-round transformation f to the sponge's state
        reducedBlake2bLyra(state);

#else

        //Absorbing "M[prev][col]"
        state[0]  ^= (ptrWordIn[0]);
        state[1]  ^= (ptrWordIn[1]);
        state[2]  ^= (ptrWordIn[2]);
        state[3]  ^= (ptrWordIn[3]);
        state[4]  ^= (ptrWordIn[4]);
        state[5]  ^= (ptrWordIn[5]);
        state[6]  ^= (ptrWordIn[6]);
        state[7]  ^= (ptrWordIn[7]);
        state[8]  ^= (ptrWordIn[8]);
        state[9]  ^= (ptrWordIn[9]);
        state[10] ^= (ptrWordIn[10]);
        state[11] ^= (ptrWordIn[11]);

        //Applies the reduced-round transformation f to the sponge's state
        reducedBlake2bLyra(state);
#endif

      #if defined __AVX2__
/*         state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) );
         state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) );
         state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) );

         _mm256_storeu_si256( (__m256i*)(&ptrWordOut[0]),
                              _mm256_xor_si256( state_v[0], in_v[0] ) );
         _mm256_storeu_si256( (__m256i*)(&ptrWordOut[4]),
                              _mm256_xor_si256( state_v[1], in_v[1] ) );
         _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]),
                              _mm256_xor_si256( state_v[2], in_v[2] ) );
*/
      #elif defined __AVX__

         state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) );
         state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) );
         state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) );
         state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) );
         state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) );
         state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) );

         _mm_storeu_si128( (__m128i*)(&ptrWordOut[0]),
                           _mm_xor_si128( state_v[0], in_v[0] ) );
         _mm_storeu_si128( (__m128i*)(&ptrWordOut[2]),
                           _mm_xor_si128( state_v[1], in_v[1] ) );
         _mm_storeu_si128( (__m128i*)(&ptrWordOut[4]),
                           _mm_xor_si128( state_v[2], in_v[2] ) );
         _mm_storeu_si128( (__m128i*)(&ptrWordOut[6]),
                           _mm_xor_si128( state_v[3], in_v[3] ) );
         _mm_storeu_si128( (__m128i*)(&ptrWordOut[8]),
                           _mm_xor_si128( state_v[4], in_v[4] ) );
         _mm_storeu_si128( (__m128i*)(&ptrWordOut[10]),
                            _mm_xor_si128( state_v[5], in_v[5] ) );

      #else

    //M[row][C-1-col] = M[prev][col] XOR rand
    ptrWordOut[0] = ptrWordIn[0]  ^ state[0];
    ptrWordOut[1] = ptrWordIn[1]  ^ state[1];
    ptrWordOut[2] = ptrWordIn[2]  ^ state[2];
    ptrWordOut[3] = ptrWordIn[3]  ^ state[3];
    ptrWordOut[4] = ptrWordIn[4]  ^ state[4];
    ptrWordOut[5] = ptrWordIn[5]  ^ state[5];
    ptrWordOut[6] = ptrWordIn[6]  ^ state[6];
    ptrWordOut[7] = ptrWordIn[7]  ^ state[7];
    ptrWordOut[8] = ptrWordIn[8]  ^ state[8];
    ptrWordOut[9] = ptrWordIn[9]  ^ state[9];
    ptrWordOut[10] = ptrWordIn[10] ^ state[10];
    ptrWordOut[11] = ptrWordIn[11] ^ state[11];
#endif

       //Input: next column (i.e., next block in sequence)
       ptrWordIn += BLOCK_LEN_INT64;
       //Output: goes to previous column
       ptrWordOut -= BLOCK_LEN_INT64;
    }

#if defined __AVX2__
    _mm256_store_si256( (__m256i*)&state[0], state_v[0] );
    _mm256_store_si256( (__m256i*)&state[4], state_v[1] );
    _mm256_store_si256( (__m256i*)&state[8], state_v[2] );
    _mm256_store_si256( (__m256i*)&state[12], state_v[3] );
#endif

}
Example #16
static __m128i aes128_keyexpand(__m128i key)
{
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    return _mm_xor_si128(key, _mm_slli_si128(key, 4));
}
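The helper above computes the running XOR of the previous round key's 32-bit words. In a full AES-128 key schedule it is typically paired with AESKEYGENASSIST; a sketch of that pairing (assumes AES-NI support and that rcon is a compile-time constant):

#include <wmmintrin.h>

/* hypothetical round-key step: prefix-XOR of the previous key, XORed with the
   broadcast of SubWord(RotWord(w3)) ^ rcon produced by AESKEYGENASSIST */
#define AES128_ROUND_KEY(prev, rcon) \
    _mm_xor_si128(aes128_keyexpand(prev), \
                  _mm_shuffle_epi32(_mm_aeskeygenassist_si128((prev), (rcon)), 0xff))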
Example #17
/**
 * Performs a duplexing operation over "M[rowInOut][col] [+] M[rowIn][col]" (i.e.,
 * the wordwise addition of two columns, ignoring carries between words). The
 * output of this operation, "rand", is then used to make
 * "M[rowOut][(N_COLS-1)-col] = M[rowIn][col] XOR rand" and
 * "M[rowInOut][col] =  M[rowInOut][col] XOR rotW(rand)", where rotW is a 64-bit
 * rotation to the left and N_COLS is a system parameter.
 *
 * @param state          The current state of the sponge
 * @param rowIn          Row used only as input
 * @param rowInOut       Row used as input and to receive output after rotation
 * @param rowOut         Row receiving the output
 *
 */
inline void reducedDuplexRowSetup( uint64_t *state, uint64_t *rowIn,
                                   uint64_t *rowInOut, uint64_t *rowOut,
                                   uint64_t nCols )
{
    uint64_t* ptrWordIn = rowIn;	//In Lyra2: pointer to prev
    uint64_t* ptrWordInOut = rowInOut;	//In Lyra2: pointer to row*
    uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
    int i;

#if defined __AVX2__
    __m256i state_v[4], in_v[3], inout_v[3];
    #define t_state in_v

    state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) );
    state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) );
    state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) );
    state_v[3] = _mm256_load_si256( (__m256i*)(&state[12]) );

    for ( i = 0; i < nCols; i++ )
    {
       in_v   [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[0]) );
       inout_v[0] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[0]) );
       in_v   [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[4]) );
       inout_v[1] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[4]) );
       in_v   [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[8]) );
       inout_v[2] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[8]) );

       state_v[0] = _mm256_xor_si256( state_v[0],  _mm256_add_epi64( in_v[0],
                                                                inout_v[0] ) );
       state_v[1] = _mm256_xor_si256( state_v[1],  _mm256_add_epi64( in_v[1],
                                                                inout_v[1] ) );
       state_v[2] = _mm256_xor_si256( state_v[2],  _mm256_add_epi64( in_v[2],
                                                                inout_v[2] ) );

       LYRA_ROUND_AVX2( state_v[0], state_v[1], state_v[2], state_v[3] );

       _mm256_storeu_si256( (__m256i*)(&ptrWordOut[0]),
                              _mm256_xor_si256( state_v[0], in_v[0] ) );
       _mm256_storeu_si256( (__m256i*)(&ptrWordOut[4]),
                              _mm256_xor_si256( state_v[1], in_v[1] ) );
       _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]),
                              _mm256_xor_si256( state_v[2], in_v[2] ) );

       //M[row*][col] = M[row*][col] XOR rotW(rand)
      t_state[0] = _mm256_permute4x64_epi64( state_v[0], 0x93 );
      t_state[1] = _mm256_permute4x64_epi64( state_v[1], 0x93 );
      t_state[2] = _mm256_permute4x64_epi64( state_v[2], 0x93 );

      inout_v[0] = _mm256_xor_si256( inout_v[0],
                         _mm256_blend_epi32( t_state[0], t_state[2], 0x03 ) );
      inout_v[1] = _mm256_xor_si256( inout_v[1],
                         _mm256_blend_epi32( t_state[1], t_state[0], 0x03 ) );
      inout_v[2] = _mm256_xor_si256( inout_v[2],
                         _mm256_blend_epi32( t_state[2], t_state[1], 0x03 ) );

      _mm256_storeu_si256( (__m256i*)&ptrWordInOut[0], inout_v[0] );
      _mm256_storeu_si256( (__m256i*)&ptrWordInOut[4], inout_v[1] );
      _mm256_storeu_si256( (__m256i*)&ptrWordInOut[8], inout_v[2] );

       //Inputs: next column (i.e., next block in sequence)
       ptrWordInOut += BLOCK_LEN_INT64;
       ptrWordIn += BLOCK_LEN_INT64;
       //Output: goes to previous column
       ptrWordOut -= BLOCK_LEN_INT64;
    }

    _mm256_store_si256( (__m256i*)&state[0], state_v[0] );
    _mm256_store_si256( (__m256i*)&state[4], state_v[1] );
    _mm256_store_si256( (__m256i*)&state[8], state_v[2] );
    _mm256_store_si256( (__m256i*)&state[12], state_v[3] );

    #undef t_state 

#elif defined __AVX__

        __m128i state_v[6], in_v[6], inout_v[6];

    for ( i = 0; i < nCols; i++ )
    {

        state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) );
        state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) );
        state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) );
        state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) );
        state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) );
        state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) );

        inout_v[0]    = _mm_load_si128( (__m128i*)(&ptrWordInOut[0]) );
        inout_v[1]    = _mm_load_si128( (__m128i*)(&ptrWordInOut[2]) );
        inout_v[2]    = _mm_load_si128( (__m128i*)(&ptrWordInOut[4]) );
        inout_v[3]    = _mm_load_si128( (__m128i*)(&ptrWordInOut[6]) );
        inout_v[4]    = _mm_load_si128( (__m128i*)(&ptrWordInOut[8]) );
        inout_v[5]    = _mm_load_si128( (__m128i*)(&ptrWordInOut[10]) );

        in_v[0]    = _mm_load_si128( (__m128i*)(&ptrWordIn[0]) );
        in_v[1]    = _mm_load_si128( (__m128i*)(&ptrWordIn[2]) );
        in_v[2]    = _mm_load_si128( (__m128i*)(&ptrWordIn[4]) );
        in_v[3]    = _mm_load_si128( (__m128i*)(&ptrWordIn[6]) );
        in_v[4]    = _mm_load_si128( (__m128i*)(&ptrWordIn[8]) );
        in_v[5]    = _mm_load_si128( (__m128i*)(&ptrWordIn[10]) );

        _mm_store_si128( (__m128i*)(&state[0]),
                          _mm_xor_si128( state_v[0],
                                         _mm_add_epi64( in_v[0],
                                                        inout_v[0] ) ) );
        _mm_store_si128( (__m128i*)(&state[2]),
                          _mm_xor_si128( state_v[1],
                                         _mm_add_epi64( in_v[1],
                                                        inout_v[1] ) ) );
        _mm_store_si128( (__m128i*)(&state[4]),
                          _mm_xor_si128( state_v[2],
                                         _mm_add_epi64( in_v[2],
                                                        inout_v[2] ) ) );
        _mm_store_si128( (__m128i*)(&state[6]),
                          _mm_xor_si128( state_v[3],
                                         _mm_add_epi64( in_v[3],
                                                        inout_v[3] ) ) );
        _mm_store_si128( (__m128i*)(&state[8]),
                          _mm_xor_si128( state_v[4],
                                         _mm_add_epi64( in_v[4],
                                                        inout_v[4] ) ) );
        _mm_store_si128( (__m128i*)(&state[10]),
                          _mm_xor_si128( state_v[5],
                                         _mm_add_epi64( in_v[5],
                                                        inout_v[5] ) ) );

    //Applies the reduced-round transformation f to the sponge's state
    reducedBlake2bLyra(state);
#else
    for ( i = 0; i < nCols; i++ )
    {

    //Absorbing "M[prev] [+] M[row*]"
    state[0]  ^= (ptrWordIn[0]  + ptrWordInOut[0]);
    state[1]  ^= (ptrWordIn[1]  + ptrWordInOut[1]);
    state[2]  ^= (ptrWordIn[2]  + ptrWordInOut[2]);
    state[3]  ^= (ptrWordIn[3]  + ptrWordInOut[3]);
    state[4]  ^= (ptrWordIn[4]  + ptrWordInOut[4]);
    state[5]  ^= (ptrWordIn[5]  + ptrWordInOut[5]);
    state[6]  ^= (ptrWordIn[6]  + ptrWordInOut[6]);
    state[7]  ^= (ptrWordIn[7]  + ptrWordInOut[7]);
    state[8]  ^= (ptrWordIn[8]  + ptrWordInOut[8]);
    state[9]  ^= (ptrWordIn[9]  + ptrWordInOut[9]);
    state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]);
    state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]);
    //Applies the reduced-round transformation f to the sponge's state
    reducedBlake2bLyra(state);

    //M[row][col] = M[prev][col] XOR rand
#endif


      #if defined __AVX2__
         // nothing to do here: already handled in the AVX2 loop above

      #elif defined __AVX__

         state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) );
         state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) );
         state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) );
         state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) );
         state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) );
         state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) );

         _mm_store_si128( (__m128i*)(&ptrWordOut[0]),
                           _mm_xor_si128( state_v[0], in_v[0] ) );
         _mm_store_si128( (__m128i*)(&ptrWordOut[2]),
                           _mm_xor_si128( state_v[1], in_v[1] ) );
         _mm_store_si128( (__m128i*)(&ptrWordOut[4]),
                           _mm_xor_si128( state_v[2], in_v[2] ) );
         _mm_store_si128( (__m128i*)(&ptrWordOut[6]),
                           _mm_xor_si128( state_v[3], in_v[3] ) );
         _mm_store_si128( (__m128i*)(&ptrWordOut[8]),
                           _mm_xor_si128( state_v[4], in_v[4] ) );
         _mm_store_si128( (__m128i*)(&ptrWordOut[10]),
                           _mm_xor_si128( state_v[5], in_v[5] ) );

      #else

    ptrWordOut[0] = ptrWordIn[0]  ^ state[0];
    ptrWordOut[1] = ptrWordIn[1]  ^ state[1];
    ptrWordOut[2] = ptrWordIn[2]  ^ state[2];
    ptrWordOut[3] = ptrWordIn[3]  ^ state[3];
    ptrWordOut[4] = ptrWordIn[4]  ^ state[4];
    ptrWordOut[5] = ptrWordIn[5]  ^ state[5];
    ptrWordOut[6] = ptrWordIn[6]  ^ state[6];
    ptrWordOut[7] = ptrWordIn[7]  ^ state[7];
    ptrWordOut[8] = ptrWordIn[8]  ^ state[8];
    ptrWordOut[9] = ptrWordIn[9]  ^ state[9];
    ptrWordOut[10] = ptrWordIn[10] ^ state[10];
    ptrWordOut[11] = ptrWordIn[11] ^ state[11];
#endif

    //M[row*][col] = M[row*][col] XOR rotW(rand)
// Need to fix this before taking state load/store out of loop
#ifdef __AVX2__
   // nothing to do here: the AVX2 loop above already applied the rotW update

#else

    ptrWordInOut[0]  ^= state[11];
    ptrWordInOut[1]  ^= state[0];
    ptrWordInOut[2]  ^= state[1];
    ptrWordInOut[3]  ^= state[2];
    ptrWordInOut[4]  ^= state[3];
    ptrWordInOut[5]  ^= state[4];
    ptrWordInOut[6]  ^= state[5];
    ptrWordInOut[7]  ^= state[6];
    ptrWordInOut[8]  ^= state[7];
    ptrWordInOut[9]  ^= state[8];
    ptrWordInOut[10] ^= state[9];
    ptrWordInOut[11] ^= state[10];

       //Inputs: next column (i.e., next block in sequence)
       ptrWordInOut += BLOCK_LEN_INT64;
       ptrWordIn += BLOCK_LEN_INT64;
       //Output: goes to previous column
       ptrWordOut -= BLOCK_LEN_INT64;
    }

#endif

}

/**
 * Performs a duplexing operation over "M[rowInOut][col] [+] M[rowIn][col]" (i.e.,
 * the wordwise addition of two columns, ignoring carries between words). The
 * output of this operation, "rand", is then used to make
 * "M[rowOut][col] = M[rowOut][col] XOR rand" and
 * "M[rowInOut][col] =  M[rowInOut][col] XOR rotW(rand)", where rotW is a 64-bit
 * rotation to the left.
 *
 * @param state          The current state of the sponge
 * @param rowIn          Row used only as input
 * @param rowInOut       Row used as input and to receive output after rotation
 * @param rowOut         Row receiving the output
 *
 */
inline void reducedDuplexRow( uint64_t *state, uint64_t *rowIn,
                              uint64_t *rowInOut, uint64_t *rowOut,
                              uint64_t nCols )
{
    uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
    uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
    uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row
    int i;


#if defined __AVX2__

    for ( i = 0; i < nCols; i++)
    {

       //Absorbing "M[prev] [+] M[row*]"

       __m256i state_v[4], in_v[3], inout_v[3];
       #define out_v in_v    // reuse register in next code block
       #define t_state in_v
       state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) );
       in_v   [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[0]) );
       inout_v[0] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[0]) );
       state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) );
       in_v   [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[4]) );
       inout_v[1] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[4]) );
       state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) );
       in_v   [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[8]) );
       inout_v[2] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[8]) );
       state_v[3] = _mm256_load_si256( (__m256i*)(&state[12]) );

       state_v[0] = _mm256_xor_si256( state_v[0], _mm256_add_epi64( in_v[0],
                                                               inout_v[0] ) );
       state_v[1] = _mm256_xor_si256( state_v[1], _mm256_add_epi64( in_v[1],
                                                               inout_v[1] ) );
       state_v[2] = _mm256_xor_si256( state_v[2], _mm256_add_epi64( in_v[2],
                                                               inout_v[2] ) );

       out_v[0] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[0]) );
       out_v[1] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[4]) );
       out_v[2] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[8]) );

       LYRA_ROUND_AVX2( state_v[0], state_v[1], state_v[2], state_v[3] );

       _mm256_store_si256( (__m256i*)&state[0], state_v[0] );
       _mm256_store_si256( (__m256i*)&state[4], state_v[1] );
       _mm256_store_si256( (__m256i*)&state[8], state_v[2] );
       _mm256_store_si256( (__m256i*)&state[12], state_v[3] );

       _mm256_storeu_si256( (__m256i*)(&ptrWordOut[0]),
                            _mm256_xor_si256( state_v[0], out_v[0] ) );
       _mm256_storeu_si256( (__m256i*)(&ptrWordOut[4]),
                            _mm256_xor_si256( state_v[1], out_v[1] ) );
       _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]),
                            _mm256_xor_si256( state_v[2], out_v[2] ) );

/*
       t_state[0] = _mm256_permute4x64_epi64( state_v[0], 0x93 );
       t_state[1] = _mm256_permute4x64_epi64( state_v[1], 0x93 );
       t_state[2] = _mm256_permute4x64_epi64( state_v[2], 0x93 );

       inout_v[0] = _mm256_xor_si256( inout_v[0],
                        _mm256_blend_epi32( t_state[0], t_state[2], 0x03 ) );
       inout_v[1] = _mm256_xor_si256( inout_v[1], 
                        _mm256_blend_epi32( t_state[1], t_state[0], 0x03 ) );
       inout_v[2] = _mm256_xor_si256( inout_v[2], 
                        _mm256_blend_epi32( t_state[2], t_state[1], 0x03 ) );

       _mm256_storeu_si256( (__m256i*)(&ptrWordInOut[0]), inout_v[0] );
       _mm256_storeu_si256( (__m256i*)(&ptrWordInOut[4]), inout_v[1] );
       _mm256_storeu_si256( (__m256i*)(&ptrWordInOut[8]), inout_v[2] );

       _mm256_store_si256( (__m256i*)&state[0], state_v[0] );
       _mm256_store_si256( (__m256i*)&state[4], state_v[1] );
       _mm256_store_si256( (__m256i*)&state[8], state_v[2] );
       _mm256_store_si256( (__m256i*)&state[12], state_v[3] );
*/
       #undef out_v
       #undef t_state 

    //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
    ptrWordInOut[0] ^= state[11];
    ptrWordInOut[1] ^= state[0];
    ptrWordInOut[2] ^= state[1];
    ptrWordInOut[3] ^= state[2];
    ptrWordInOut[4] ^= state[3];
    ptrWordInOut[5] ^= state[4];
    ptrWordInOut[6] ^= state[5];
    ptrWordInOut[7] ^= state[6];
    ptrWordInOut[8] ^= state[7];
    ptrWordInOut[9] ^= state[8];
    ptrWordInOut[10] ^= state[9];
    ptrWordInOut[11] ^= state[10];

       //Goes to next block
       ptrWordOut += BLOCK_LEN_INT64;
       ptrWordInOut += BLOCK_LEN_INT64;
       ptrWordIn += BLOCK_LEN_INT64;
    }

   #elif defined __AVX__

    for ( i = 0; i < nCols; i++)
    {

       __m128i state_v[6], in_v[6], inout_v[6];
       #define out_v in_v    // reuse register in next code block

       state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) );
       state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) );
       state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) );
       state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) );
       state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) );
       state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) );

       inout_v[0]    = _mm_load_si128( (__m128i*)(&ptrWordInOut[0]) );
       inout_v[1]    = _mm_load_si128( (__m128i*)(&ptrWordInOut[2]) );
       inout_v[2]    = _mm_load_si128( (__m128i*)(&ptrWordInOut[4]) );
       inout_v[3]    = _mm_load_si128( (__m128i*)(&ptrWordInOut[6]) );
       inout_v[4]    = _mm_load_si128( (__m128i*)(&ptrWordInOut[8]) );
       inout_v[5]    = _mm_load_si128( (__m128i*)(&ptrWordInOut[10]) );

       in_v[0]    = _mm_load_si128( (__m128i*)(&ptrWordIn[0]) );
       in_v[1]    = _mm_load_si128( (__m128i*)(&ptrWordIn[2]) );
       in_v[2]    = _mm_load_si128( (__m128i*)(&ptrWordIn[4]) );
       in_v[3]    = _mm_load_si128( (__m128i*)(&ptrWordIn[6]) );
       in_v[4]    = _mm_load_si128( (__m128i*)(&ptrWordIn[8]) );
       in_v[5]    = _mm_load_si128( (__m128i*)(&ptrWordIn[10]) );

       _mm_store_si128( (__m128i*)(&state[0]),
                         _mm_xor_si128( state_v[0],
                                        _mm_add_epi64( in_v[0],
                                                       inout_v[0] ) ) );
       _mm_store_si128( (__m128i*)(&state[2]),
                         _mm_xor_si128( state_v[1],
                                        _mm_add_epi64( in_v[1],
                                                       inout_v[1] ) ) );
       _mm_store_si128( (__m128i*)(&state[4]),
                         _mm_xor_si128( state_v[2],
                                        _mm_add_epi64( in_v[2],
                                                       inout_v[2] ) ) );
       _mm_store_si128( (__m128i*)(&state[6]),
                         _mm_xor_si128( state_v[3],
                                        _mm_add_epi64( in_v[3],
                                                       inout_v[3] ) ) );
       _mm_store_si128( (__m128i*)(&state[8]),
                         _mm_xor_si128( state_v[4],
                                        _mm_add_epi64( in_v[4],
                                                       inout_v[4] ) ) );
       _mm_store_si128( (__m128i*)(&state[10]),
                         _mm_xor_si128( state_v[5],
                                        _mm_add_epi64( in_v[5],
                                                       inout_v[5] ) ) );

    //Applies the reduced-round transformation f to the sponge's state
    reducedBlake2bLyra(state);

   #else

    for ( i = 0; i < nCols; i++)
    {

    state[0]  ^= (ptrWordIn[0]  + ptrWordInOut[0]);
    state[1]  ^= (ptrWordIn[1]  + ptrWordInOut[1]);
    state[2]  ^= (ptrWordIn[2]  + ptrWordInOut[2]);
    state[3]  ^= (ptrWordIn[3]  + ptrWordInOut[3]);
    state[4]  ^= (ptrWordIn[4]  + ptrWordInOut[4]);
    state[5]  ^= (ptrWordIn[5]  + ptrWordInOut[5]);
    state[6]  ^= (ptrWordIn[6]  + ptrWordInOut[6]);
    state[7]  ^= (ptrWordIn[7]  + ptrWordInOut[7]);
    state[8]  ^= (ptrWordIn[8]  + ptrWordInOut[8]);
    state[9]  ^= (ptrWordIn[9]  + ptrWordInOut[9]);
    state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]);
    state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]);

    //Applies the reduced-round transformation f to the sponge's state
    reducedBlake2bLyra(state);
#endif

    //M[rowOut][col] = M[rowOut][col] XOR rand

    #if defined __AVX2__
/*
       state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) );
       out_v  [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[0]) );
       state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) );
       out_v  [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[4]) );
       state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) );
       out_v  [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[8]) );

       _mm256_storeu_si256( (__m256i*)(&ptrWordOut[0]),
                            _mm256_xor_si256( state_v[0], out_v[0] ) );
       _mm256_storeu_si256( (__m256i*)(&ptrWordOut[4]),
                            _mm256_xor_si256( state_v[1], out_v[1] ) );
       _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]),
                            _mm256_xor_si256( state_v[2], out_v[2] ) );
*/
    #elif defined __AVX__

       state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) );
       state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) );
       state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) );
       state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) );
       state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) );
       state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) );

       out_v[0]    = _mm_load_si128( (__m128i*)(&ptrWordOut[0]) );
       out_v[1]    = _mm_load_si128( (__m128i*)(&ptrWordOut[2]) );
       out_v[2]    = _mm_load_si128( (__m128i*)(&ptrWordOut[4]) );
       out_v[3]    = _mm_load_si128( (__m128i*)(&ptrWordOut[6]) );
       out_v[4]    = _mm_load_si128( (__m128i*)(&ptrWordOut[8]) );
       out_v[5]    = _mm_load_si128( (__m128i*)(&ptrWordOut[10]) );

       _mm_store_si128( (__m128i*)(&ptrWordOut[0]),
                         _mm_xor_si128( state_v[0], out_v[0] ) );
       _mm_store_si128( (__m128i*)(&ptrWordOut[2]),
                         _mm_xor_si128( state_v[1], out_v[1] ) );
       _mm_store_si128( (__m128i*)(&ptrWordOut[4]),
                         _mm_xor_si128( state_v[2], out_v[2] ) );
       _mm_store_si128( (__m128i*)(&ptrWordOut[6]),
                         _mm_xor_si128( state_v[3], out_v[3] ) );
       _mm_store_si128( (__m128i*)(&ptrWordOut[8]),
                         _mm_xor_si128( state_v[4], out_v[4] ) );
       _mm_store_si128( (__m128i*)(&ptrWordOut[10]),
                         _mm_xor_si128( state_v[5], out_v[5] ) );

    //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
    ptrWordInOut[0] ^= state[11];
    ptrWordInOut[1] ^= state[0];
    ptrWordInOut[2] ^= state[1];
    ptrWordInOut[3] ^= state[2];
    ptrWordInOut[4] ^= state[3];
    ptrWordInOut[5] ^= state[4];
    ptrWordInOut[6] ^= state[5];
    ptrWordInOut[7] ^= state[6];
    ptrWordInOut[8] ^= state[7];
    ptrWordInOut[9] ^= state[8];
    ptrWordInOut[10] ^= state[9];
    ptrWordInOut[11] ^= state[10];

       //Goes to next block
       ptrWordOut += BLOCK_LEN_INT64;
       ptrWordInOut += BLOCK_LEN_INT64;
       ptrWordIn += BLOCK_LEN_INT64;
    }

#else
    ptrWordOut[0] ^= state[0];
    ptrWordOut[1] ^= state[1];
    ptrWordOut[2] ^= state[2];
    ptrWordOut[3] ^= state[3];
    ptrWordOut[4] ^= state[4];
    ptrWordOut[5] ^= state[5];
    ptrWordOut[6] ^= state[6];
    ptrWordOut[7] ^= state[7];
    ptrWordOut[8] ^= state[8];
    ptrWordOut[9] ^= state[9];
    ptrWordOut[10] ^= state[10];
    ptrWordOut[11] ^= state[11];

    //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
    ptrWordInOut[0] ^= state[11];
    ptrWordInOut[1] ^= state[0];
    ptrWordInOut[2] ^= state[1];
    ptrWordInOut[3] ^= state[2];
    ptrWordInOut[4] ^= state[3];
    ptrWordInOut[5] ^= state[4];
    ptrWordInOut[6] ^= state[5];
    ptrWordInOut[7] ^= state[6];
    ptrWordInOut[8] ^= state[7];
    ptrWordInOut[9] ^= state[8];
    ptrWordInOut[10] ^= state[9];
    ptrWordInOut[11] ^= state[10];

       //Goes to next block
       ptrWordOut += BLOCK_LEN_INT64;
       ptrWordInOut += BLOCK_LEN_INT64;
       ptrWordIn += BLOCK_LEN_INT64;
    }
#endif
}
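All of the branches above compute the same per-column update; the following is a minimal scalar sketch of one column, written against the generic (#else) path so the rotW index pattern is explicit. The helper name is illustrative only; reducedBlake2bLyra and the 12-word block size (BLOCK_LEN_INT64) are assumed from the surrounding code.

/* Illustrative sketch only -- not part of the original source. */
static inline void reducedDuplexRowColumnSketch( uint64_t *state,
                                                 const uint64_t *in, /* M[rowIn][col]    */
                                                 uint64_t *inout,    /* M[rowInOut][col] */
                                                 uint64_t *out )     /* M[rowOut][col]   */
{
    int i;

    /* Absorb the wordwise sum of the two input columns */
    for ( i = 0; i < 12; i++ )
        state[i] ^= in[i] + inout[i];

    /* Apply the reduced-round transformation f */
    reducedBlake2bLyra( state );

    /* rand = state[0..11]:  M[rowOut][col] ^= rand */
    for ( i = 0; i < 12; i++ )
        out[i] ^= state[i];

    /* M[rowInOut][col] ^= rotW(rand): word i receives state[(i + 11) % 12],
       i.e. inout[0] ^= state[11], inout[1] ^= state[0], and so on,
       exactly the pattern used in the branches above. */
    for ( i = 0; i < 12; i++ )
        inout[i] ^= state[(i + 11) % 12];
}
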
int bit_vec_filter_no_flipping_m128_sse1(uint8_t *read_vec0, uint8_t *read_vec1,
				uint8_t *ref_vec0, uint8_t *ref_vec1, __m128i mask, int max_error) {

	int total_difference = 0;

	//Start iteration
	int j;
	//read data
	__m128i read_XMM0 = *((__m128i *) (read_vec0));
	__m128i read_XMM1 = *((__m128i *) (read_vec1));
	//ref data
	__m128i ref_XMM0 = *((__m128i *) (ref_vec0));
	__m128i ref_XMM1 = *((__m128i *) (ref_vec1));

	__m128i shift_XMM;
	__m128i diff_XMM;
	__m128i temp_diff_XMM;
	__m128i temp_shift_XMM;
	__m128i temp_mask;

	diff_XMM = _mm_xor_si128(read_XMM0, ref_XMM0);
	temp_diff_XMM = _mm_xor_si128(read_XMM1, ref_XMM1);
	diff_XMM = _mm_or_si128(diff_XMM, temp_diff_XMM);

	//printf("diff_XMM: \n");
	//print128_bit_twice(diff_XMM);

	for (j = 1; j <= max_error; j++) {
		temp_mask = _mm_load_si128( (__m128i *) (MASK_SSE_BEG1 + (j - 1) *
								SSE_BYTE_NUM));
		temp_mask = _mm_and_si128(temp_mask, mask);
		
		//Right shift read
		shift_XMM = shift_right_sse1(read_XMM0, j);
		temp_diff_XMM = _mm_xor_si128(shift_XMM, ref_XMM0);
		shift_XMM = shift_right_sse1(read_XMM1, j);
		temp_shift_XMM = _mm_xor_si128(shift_XMM, ref_XMM1);
		temp_diff_XMM = _mm_or_si128(temp_shift_XMM, temp_diff_XMM);
		temp_diff_XMM = _mm_and_si128(temp_diff_XMM, temp_mask);
//		printf("Before flip: \t");
//		print128_bit(temp_diff_XMM);
//		flip_false_zero(temp_diff_XMM); //No flipping
//		printf("After flip: \t");
//		print128_bit(temp_diff_XMM);
		diff_XMM = _mm_and_si128(diff_XMM, temp_diff_XMM);

		//printf("read shift %d diff_XMM: \n", j);
		//print128_bit_twice(diff_XMM);

		//Right shift ref
		shift_XMM = shift_right_sse1(ref_XMM0, j);
		temp_diff_XMM = _mm_xor_si128(shift_XMM, read_XMM0);
		shift_XMM = shift_right_sse1(ref_XMM1, j);
		temp_shift_XMM = _mm_xor_si128(shift_XMM, read_XMM1);
		temp_diff_XMM = _mm_or_si128(temp_shift_XMM, temp_diff_XMM);
		temp_diff_XMM = _mm_and_si128(temp_diff_XMM, temp_mask);
//		printf("Before flip: \t");
//		print128_bit(temp_diff_XMM);
//		flip_false_zero(temp_diff_XMM); //No flipping
//		printf("After flip: \t");
//		print128_bit(temp_diff_XMM);
		diff_XMM = _mm_and_si128(diff_XMM, temp_diff_XMM);
		
		//printf("ref shift %d diff_XMM: \n", j);
		//print128_bit_twice(diff_XMM);
	}

	total_difference = popcount1_m128i_sse(diff_XMM);

	if (total_difference > max_error)
		return 0;
	else
		return 1;
}
Example No. 19
HashReturn Update(hashState *state, const BitSequence *data,
                  DataLength databitlen)
{
  int r;
  __m128i x0;
  __m128i x1;
  __m128i x2;
  __m128i x3;
  __m128i x4;
  __m128i x5;
  __m128i x6;
  __m128i x7;
  __m128i y0;
  __m128i y1;
  __m128i y2;
  __m128i y3;

  while (databitlen >= 8 && state->pos != 0) {
    ((unsigned char *) state->x)[state->pos / 8] ^= *data;
    data += 1;
    databitlen -= 8;
    state->pos += 8;
    if (state->pos == 8 * CUBEHASH_BLOCKBYTES) {
      transform(state,CUBEHASH_ROUNDS);
      state->pos = 0;
    }
  }

  x0 = state->x[0];
  x1 = state->x[1];
  x2 = state->x[2];
  x3 = state->x[3];
  x4 = state->x[4];
  x5 = state->x[5];
  x6 = state->x[6];
  x7 = state->x[7];
    
  while (databitlen >= 8 * CUBEHASH_BLOCKBYTES) {
    x0 = _mm_xor_si128(x0,_mm_set_epi32(0,0,0,(crypto_uint32) *(crypto_uint16 *) data));
    data += CUBEHASH_BLOCKBYTES;
    databitlen -= 8 * CUBEHASH_BLOCKBYTES;
    
    for (r = 0;r < CUBEHASH_ROUNDS;++r) {
      x4 = _mm_add_epi32(x0,x4);
      x5 = _mm_add_epi32(x1,x5);
      x6 = _mm_add_epi32(x2,x6);
      x7 = _mm_add_epi32(x3,x7);
      y0 = x2;
      y1 = x3;
      y2 = x0;
      y3 = x1;
      x0 = _mm_xor_si128(_mm_slli_epi32(y0,7),_mm_srli_epi32(y0,25));
      x1 = _mm_xor_si128(_mm_slli_epi32(y1,7),_mm_srli_epi32(y1,25));
      x2 = _mm_xor_si128(_mm_slli_epi32(y2,7),_mm_srli_epi32(y2,25));
      x3 = _mm_xor_si128(_mm_slli_epi32(y3,7),_mm_srli_epi32(y3,25));
      x0 = _mm_xor_si128(x0,x4);
      x1 = _mm_xor_si128(x1,x5);
      x2 = _mm_xor_si128(x2,x6);
      x3 = _mm_xor_si128(x3,x7);
      x4 = _mm_shuffle_epi32(x4,0x4e);
      x5 = _mm_shuffle_epi32(x5,0x4e);
      x6 = _mm_shuffle_epi32(x6,0x4e);
      x7 = _mm_shuffle_epi32(x7,0x4e);
      x4 = _mm_add_epi32(x0,x4);
      x5 = _mm_add_epi32(x1,x5);
      x6 = _mm_add_epi32(x2,x6);
      x7 = _mm_add_epi32(x3,x7);
      y0 = x1;
      y1 = x0;
      y2 = x3;
      y3 = x2;
      x0 = _mm_xor_si128(_mm_slli_epi32(y0,11),_mm_srli_epi32(y0,21));
      x1 = _mm_xor_si128(_mm_slli_epi32(y1,11),_mm_srli_epi32(y1,21));
      x2 = _mm_xor_si128(_mm_slli_epi32(y2,11),_mm_srli_epi32(y2,21));
      x3 = _mm_xor_si128(_mm_slli_epi32(y3,11),_mm_srli_epi32(y3,21));
      x0 = _mm_xor_si128(x0,x4);
      x1 = _mm_xor_si128(x1,x5);
      x2 = _mm_xor_si128(x2,x6);
      x3 = _mm_xor_si128(x3,x7);
      x4 = _mm_shuffle_epi32(x4,0xb1);
      x5 = _mm_shuffle_epi32(x5,0xb1);
      x6 = _mm_shuffle_epi32(x6,0xb1);
      x7 = _mm_shuffle_epi32(x7,0xb1);
    }
  }
  
  state->x[0] = x0;
  state->x[1] = x1;
  state->x[2] = x2;
  state->x[3] = x3;
  state->x[4] = x4;
  state->x[5] = x5;
  state->x[6] = x6;
  state->x[7] = x7;

  while (databitlen >= 8) {
    ((unsigned char *) state->x)[state->pos / 8] ^= *data;
    data += 1;
    databitlen -= 8;
    state->pos += 8;
    if (state->pos == 8 * CUBEHASH_BLOCKBYTES) {
      transform(state,CUBEHASH_ROUNDS);
      state->pos = 0;
    }
  }
  if (databitlen > 0) {
    ((unsigned char *) state->x)[state->pos / 8] ^= *data;
    state->pos += databitlen;
  }
  return SUCCESS;
}
Example No. 20
void xor(byte* x, byte* y, byte* result, int32_t len)
{
#if defined (PORTABLE_64_BIT)
    if (len >= 16)
    {
        for (int32_t i = (len / 16) - 1; i >= 0; i--, x += 16, y += 16, result += 16)
        {
            __m128i xmm_x = _mm_loadu_si128((__m128i*)x);
            __m128i xmm_y = _mm_loadu_si128((__m128i*)y);
            __m128i xmm_res = _mm_xor_si128(xmm_x, xmm_y);
            _mm_storeu_si128((__m128i*)result, xmm_res);
        }
    }

    if ((len & 8) != 0)
    {
        *((uint64_t*)result) = *((uint64_t*)x) ^ *((uint64_t*)y);
        x += 8; y += 8; result += 8;
    }

    if ((len & 4) != 0)
    {
        *((uint32_t*)result) = *((uint32_t*)x) ^ *((uint32_t*)y);
        x += 4; y += 4; result += 4;
    }

    if ((len & 2) != 0)
    {
        *((uint16_t*)result) = *((uint16_t*)x) ^ *((uint16_t*)y);
        x += 2; y += 2; result += 2;
    }

    if ((len & 1) != 0)
    {
        *((byte*)result) = (byte)(*((byte*)x) ^ *((byte*)y));
    }
#elif defined (PORTABLE_32_BIT)
    if (len >= 16)
    {
        for (int32_t i = (len / 16) - 1; i >= 0; i--, x += 16, y += 16, result += 16)
        {
            __m128i xmm_x = _mm_loadu_si128((__m128i*)x);
            __m128i xmm_y = _mm_loadu_si128((__m128i*)y);
            __m128i xmm_res = _mm_xor_si128(xmm_x, xmm_y);
            _mm_storeu_si128((__m128i*)result, xmm_res);
        }
    }

    if ((len & 8) != 0)
    {
        *((uint32_t*)result) = *((uint32_t*)x) ^ *((uint32_t*)y);
        x += 4; y += 4; result += 4;
        *((uint32_t*)result) = *((uint32_t*)x) ^ *((uint32_t*)y);
        x += 4; y += 4; result += 4;
    }

    if ((len & 4) != 0)
    {
        *((uint32_t*)result) = *((uint32_t*)x) ^ *((uint32_t*)y);
        x += 4; y += 4; result += 4;
    }

    if ((len & 2) != 0)
    {
        *((uint16_t*)result) = *((uint16_t*)x) ^ *((uint16_t*)y);
        x += 2; y += 2; result += 2;
    }

    if ((len & 1) != 0)
    {
        *((byte*)result) = (byte)(*((byte*)x) ^ *((byte*)y));
    }
#endif
}
Example No. 21
static void transform(hashState *state,int r)
{
  __m128i x0;
  __m128i x1;
  __m128i x2;
  __m128i x3;
  __m128i x4;
  __m128i x5;
  __m128i x6;
  __m128i x7;
  __m128i y0;
  __m128i y1;
  __m128i y2;
  __m128i y3;

  x0 = state->x[0];
  x1 = state->x[1];
  x2 = state->x[2];
  x3 = state->x[3];
  x4 = state->x[4];
  x5 = state->x[5];
  x6 = state->x[6];
  x7 = state->x[7];

  for (;r > 0;--r) {
    x4 = _mm_add_epi32(x0,x4);
    x5 = _mm_add_epi32(x1,x5);
    x6 = _mm_add_epi32(x2,x6);
    x7 = _mm_add_epi32(x3,x7);
    y0 = x2;
    y1 = x3;
    y2 = x0;
    y3 = x1;
    x0 = _mm_xor_si128(_mm_slli_epi32(y0,7),_mm_srli_epi32(y0,25));
    x1 = _mm_xor_si128(_mm_slli_epi32(y1,7),_mm_srli_epi32(y1,25));
    x2 = _mm_xor_si128(_mm_slli_epi32(y2,7),_mm_srli_epi32(y2,25));
    x3 = _mm_xor_si128(_mm_slli_epi32(y3,7),_mm_srli_epi32(y3,25));
    x0 = _mm_xor_si128(x0,x4);
    x1 = _mm_xor_si128(x1,x5);
    x2 = _mm_xor_si128(x2,x6);
    x3 = _mm_xor_si128(x3,x7);
    x4 = _mm_shuffle_epi32(x4,0x4e);
    x5 = _mm_shuffle_epi32(x5,0x4e);
    x6 = _mm_shuffle_epi32(x6,0x4e);
    x7 = _mm_shuffle_epi32(x7,0x4e);
    x4 = _mm_add_epi32(x0,x4);
    x5 = _mm_add_epi32(x1,x5);
    x6 = _mm_add_epi32(x2,x6);
    x7 = _mm_add_epi32(x3,x7);
    y0 = x1;
    y1 = x0;
    y2 = x3;
    y3 = x2;
    x0 = _mm_xor_si128(_mm_slli_epi32(y0,11),_mm_srli_epi32(y0,21));
    x1 = _mm_xor_si128(_mm_slli_epi32(y1,11),_mm_srli_epi32(y1,21));
    x2 = _mm_xor_si128(_mm_slli_epi32(y2,11),_mm_srli_epi32(y2,21));
    x3 = _mm_xor_si128(_mm_slli_epi32(y3,11),_mm_srli_epi32(y3,21));
    x0 = _mm_xor_si128(x0,x4);
    x1 = _mm_xor_si128(x1,x5);
    x2 = _mm_xor_si128(x2,x6);
    x3 = _mm_xor_si128(x3,x7);
    x4 = _mm_shuffle_epi32(x4,0xb1);
    x5 = _mm_shuffle_epi32(x5,0xb1);
    x6 = _mm_shuffle_epi32(x6,0xb1);
    x7 = _mm_shuffle_epi32(x7,0xb1);
  }

  state->x[0] = x0;
  state->x[1] = x1;
  state->x[2] = x2;
  state->x[3] = x3;
  state->x[4] = x4;
  state->x[5] = x5;
  state->x[6] = x6;
  state->x[7] = x7;
}
Example No. 22
void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bool nonmax_suppression)
{
    Mat img = _img.getMat();
    const int K = patternSize/2, N = patternSize + K + 1;
#if CV_SSE2
    const int quarterPatternSize = patternSize/4;
    (void)quarterPatternSize;
#endif
    int i, j, k, pixel[25];
    makeOffsets(pixel, (int)img.step, patternSize);

    keypoints.clear();

    threshold = std::min(std::max(threshold, 0), 255);

#if CV_SSE2
    __m128i delta = _mm_set1_epi8(-128), t = _mm_set1_epi8((char)threshold), K16 = _mm_set1_epi8((char)K);
    (void)K16;
    (void)delta;
    (void)t;
#endif
    uchar threshold_tab[512];
    for( i = -255; i <= 255; i++ )
        threshold_tab[i+255] = (uchar)(i < -threshold ? 1 : i > threshold ? 2 : 0);

    AutoBuffer<uchar> _buf((img.cols+16)*3*(sizeof(int) + sizeof(uchar)) + 128);
    uchar* buf[3];
    buf[0] = _buf; buf[1] = buf[0] + img.cols; buf[2] = buf[1] + img.cols;
    int* cpbuf[3];
    cpbuf[0] = (int*)alignPtr(buf[2] + img.cols, sizeof(int)) + 1;
    cpbuf[1] = cpbuf[0] + img.cols + 1;
    cpbuf[2] = cpbuf[1] + img.cols + 1;
    memset(buf[0], 0, img.cols*3);

    for(i = 3; i < img.rows-2; i++)
    {
        const uchar* ptr = img.ptr<uchar>(i) + 3;
        uchar* curr = buf[(i - 3)%3];
        int* cornerpos = cpbuf[(i - 3)%3];
        memset(curr, 0, img.cols);
        int ncorners = 0;

        if( i < img.rows - 3 )
        {
            j = 3;
    #if CV_SSE2
            if( patternSize == 16 )
            {
                for(; j < img.cols - 16 - 3; j += 16, ptr += 16)
                {
                    __m128i m0, m1;
                    __m128i v0 = _mm_loadu_si128((const __m128i*)ptr);
                    __m128i v1 = _mm_xor_si128(_mm_subs_epu8(v0, t), delta);
                    v0 = _mm_xor_si128(_mm_adds_epu8(v0, t), delta);

                    __m128i x0 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[0])), delta);
                    __m128i x1 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[quarterPatternSize])), delta);
                    __m128i x2 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[2*quarterPatternSize])), delta);
                    __m128i x3 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[3*quarterPatternSize])), delta);
                    m0 = _mm_and_si128(_mm_cmpgt_epi8(x0, v0), _mm_cmpgt_epi8(x1, v0));
                    m1 = _mm_and_si128(_mm_cmpgt_epi8(v1, x0), _mm_cmpgt_epi8(v1, x1));
                    m0 = _mm_or_si128(m0, _mm_and_si128(_mm_cmpgt_epi8(x1, v0), _mm_cmpgt_epi8(x2, v0)));
                    m1 = _mm_or_si128(m1, _mm_and_si128(_mm_cmpgt_epi8(v1, x1), _mm_cmpgt_epi8(v1, x2)));
                    m0 = _mm_or_si128(m0, _mm_and_si128(_mm_cmpgt_epi8(x2, v0), _mm_cmpgt_epi8(x3, v0)));
                    m1 = _mm_or_si128(m1, _mm_and_si128(_mm_cmpgt_epi8(v1, x2), _mm_cmpgt_epi8(v1, x3)));
                    m0 = _mm_or_si128(m0, _mm_and_si128(_mm_cmpgt_epi8(x3, v0), _mm_cmpgt_epi8(x0, v0)));
                    m1 = _mm_or_si128(m1, _mm_and_si128(_mm_cmpgt_epi8(v1, x3), _mm_cmpgt_epi8(v1, x0)));
                    m0 = _mm_or_si128(m0, m1);
                    int mask = _mm_movemask_epi8(m0);
                    if( mask == 0 )
                        continue;
                    if( (mask & 255) == 0 )
                    {
                        j -= 8;
                        ptr -= 8;
                        continue;
                    }

                    __m128i c0 = _mm_setzero_si128(), c1 = c0, max0 = c0, max1 = c0;
                    for( k = 0; k < N; k++ )
                    {
                        __m128i x = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(ptr + pixel[k])), delta);
                        m0 = _mm_cmpgt_epi8(x, v0);
                        m1 = _mm_cmpgt_epi8(v1, x);

                        c0 = _mm_and_si128(_mm_sub_epi8(c0, m0), m0);
                        c1 = _mm_and_si128(_mm_sub_epi8(c1, m1), m1);

                        max0 = _mm_max_epu8(max0, c0);
                        max1 = _mm_max_epu8(max1, c1);
                    }

                    max0 = _mm_max_epu8(max0, max1);
                    int m = _mm_movemask_epi8(_mm_cmpgt_epi8(max0, K16));

                    for( k = 0; m > 0 && k < 16; k++, m >>= 1 )
                        if(m & 1)
                        {
                            cornerpos[ncorners++] = j+k;
                            if(nonmax_suppression)
                                curr[j+k] = (uchar)cornerScore<patternSize>(ptr+k, pixel, threshold);
                        }
                }
            }
    #endif
            for( ; j < img.cols - 3; j++, ptr++ )
            {
                int v = ptr[0];
                const uchar* tab = &threshold_tab[0] - v + 255;
                int d = tab[ptr[pixel[0]]] | tab[ptr[pixel[8]]];

                if( d == 0 )
                    continue;

                d &= tab[ptr[pixel[2]]] | tab[ptr[pixel[10]]];
                d &= tab[ptr[pixel[4]]] | tab[ptr[pixel[12]]];
                d &= tab[ptr[pixel[6]]] | tab[ptr[pixel[14]]];

                if( d == 0 )
                    continue;

                d &= tab[ptr[pixel[1]]] | tab[ptr[pixel[9]]];
                d &= tab[ptr[pixel[3]]] | tab[ptr[pixel[11]]];
                d &= tab[ptr[pixel[5]]] | tab[ptr[pixel[13]]];
                d &= tab[ptr[pixel[7]]] | tab[ptr[pixel[15]]];

                if( d & 1 )
                {
                    int vt = v - threshold, count = 0;

                    for( k = 0; k < N; k++ )
                    {
                        int x = ptr[pixel[k]];
                        if(x < vt)
                        {
                            if( ++count > K )
                            {
                                cornerpos[ncorners++] = j;
                                if(nonmax_suppression)
                                    curr[j] = (uchar)cornerScore<patternSize>(ptr, pixel, threshold);
                                break;
                            }
                        }
                        else
                            count = 0;
                    }
                }

                if( d & 2 )
                {
                    int vt = v + threshold, count = 0;

                    for( k = 0; k < N; k++ )
                    {
                        int x = ptr[pixel[k]];
                        if(x > vt)
                        {
                            if( ++count > K )
                            {
                                cornerpos[ncorners++] = j;
                                if(nonmax_suppression)
                                    curr[j] = (uchar)cornerScore<patternSize>(ptr, pixel, threshold);
                                break;
                            }
                        }
                        else
                            count = 0;
                    }
                }
            }
        }

        cornerpos[-1] = ncorners;

        if( i == 3 )
            continue;

        const uchar* prev = buf[(i - 4 + 3)%3];
        const uchar* pprev = buf[(i - 5 + 3)%3];
        cornerpos = cpbuf[(i - 4 + 3)%3];
        ncorners = cornerpos[-1];

        for( k = 0; k < ncorners; k++ )
        {
            j = cornerpos[k];
            int score = prev[j];
            if( !nonmax_suppression ||
               (score > prev[j+1] && score > prev[j-1] &&
                score > pprev[j-1] && score > pprev[j] && score > pprev[j+1] &&
                score > curr[j-1] && score > curr[j] && score > curr[j+1]) )
            {
                keypoints.push_back(KeyPoint((float)j, (float)(i-1), 7.f, -1, (float)score));
            }
        }
    }
}
Example No. 23
// Hadamard transform
// Returns the difference between the weighted sum of the absolute value of
// transformed coefficients.
static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
                          const uint16_t* const w) {
  int32_t sum[4];
  __m128i tmp_0, tmp_1, tmp_2, tmp_3;
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi16(1);
  const __m128i three = _mm_set1_epi16(3);

  // Load, combine and transpose inputs.
  {
    const __m128i inA_0 = _mm_loadl_epi64((__m128i*)&inA[BPS * 0]);
    const __m128i inA_1 = _mm_loadl_epi64((__m128i*)&inA[BPS * 1]);
    const __m128i inA_2 = _mm_loadl_epi64((__m128i*)&inA[BPS * 2]);
    const __m128i inA_3 = _mm_loadl_epi64((__m128i*)&inA[BPS * 3]);
    const __m128i inB_0 = _mm_loadl_epi64((__m128i*)&inB[BPS * 0]);
    const __m128i inB_1 = _mm_loadl_epi64((__m128i*)&inB[BPS * 1]);
    const __m128i inB_2 = _mm_loadl_epi64((__m128i*)&inB[BPS * 2]);
    const __m128i inB_3 = _mm_loadl_epi64((__m128i*)&inB[BPS * 3]);

    // Combine inA and inB (we'll do two transforms in parallel).
    const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0);
    const __m128i inAB_1 = _mm_unpacklo_epi8(inA_1, inB_1);
    const __m128i inAB_2 = _mm_unpacklo_epi8(inA_2, inB_2);
    const __m128i inAB_3 = _mm_unpacklo_epi8(inA_3, inB_3);
    // a00 b00 a01 b01 a02 b02 a03 b03   0 0 0 0 0 0 0 0
    // a10 b10 a11 b11 a12 b12 a13 b13   0 0 0 0 0 0 0 0
    // a20 b20 a21 b21 a22 b22 a23 b23   0 0 0 0 0 0 0 0
    // a30 b30 a31 b31 a32 b32 a33 b33   0 0 0 0 0 0 0 0

    // Transpose the two 4x4, discarding the filling zeroes.
    const __m128i transpose0_0 = _mm_unpacklo_epi8(inAB_0, inAB_2);
    const __m128i transpose0_1 = _mm_unpacklo_epi8(inAB_1, inAB_3);
    // a00 a20  b00 b20  a01 a21  b01 b21  a02 a22  b02 b22  a03 a23  b03 b23
    // a10 a30  b10 b30  a11 a31  b11 b31  a12 a32  b12 b32  a13 a33  b13 b33
    const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);
    const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);
    // a00 a10 a20 a30  b00 b10 b20 b30  a01 a11 a21 a31  b01 b11 b21 b31
    // a02 a12 a22 a32  b02 b12 b22 b32  a03 a13 a23 a33  b03 b13 b23 b33

    // Convert to 16b.
    tmp_0 = _mm_unpacklo_epi8(transpose1_0, zero);
    tmp_1 = _mm_unpackhi_epi8(transpose1_0, zero);
    tmp_2 = _mm_unpacklo_epi8(transpose1_1, zero);
    tmp_3 = _mm_unpackhi_epi8(transpose1_1, zero);
    // a00 a10 a20 a30   b00 b10 b20 b30
    // a01 a11 a21 a31   b01 b11 b21 b31
    // a02 a12 a22 a32   b02 b12 b22 b32
    // a03 a13 a23 a33   b03 b13 b23 b33
  }

  // Horizontal pass and subsequent transpose.
  {
    // Calculate a and b (two 4x4 at once).
    const __m128i a0 = _mm_slli_epi16(_mm_add_epi16(tmp_0, tmp_2), 2);
    const __m128i a1 = _mm_slli_epi16(_mm_add_epi16(tmp_1, tmp_3), 2);
    const __m128i a2 = _mm_slli_epi16(_mm_sub_epi16(tmp_1, tmp_3), 2);
    const __m128i a3 = _mm_slli_epi16(_mm_sub_epi16(tmp_0, tmp_2), 2);
    // b0_extra = (a0 != 0);
    const __m128i b0_extra = _mm_andnot_si128(_mm_cmpeq_epi16 (a0, zero), one);
    const __m128i b0_base = _mm_add_epi16(a0, a1);
    const __m128i b1 = _mm_add_epi16(a3, a2);
    const __m128i b2 = _mm_sub_epi16(a3, a2);
    const __m128i b3 = _mm_sub_epi16(a0, a1);
    const __m128i b0 = _mm_add_epi16(b0_base, b0_extra);
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33

    // Transpose the two 4x4.
    const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1);
    const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3);
    const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1);
    const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3);
    // a00 a10 a01 a11   a02 a12 a03 a13
    // a20 a30 a21 a31   a22 a32 a23 a33
    // b00 b10 b01 b11   b02 b12 b03 b13
    // b20 b30 b21 b31   b22 b32 b23 b33
    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
    // a00 a10 a20 a30 a01 a11 a21 a31
    // b00 b10 b20 b30 b01 b11 b21 b31
    // a02 a12 a22 a32 a03 a13 a23 a33
    // b02 b12 b22 b32 b03 b13 b23 b33
    tmp_0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
    tmp_1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
    tmp_2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
    tmp_3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
    // a00 a10 a20 a30   b00 b10 b20 b30
    // a01 a11 a21 a31   b01 b11 b21 b31
    // a02 a12 a22 a32   b02 b12 b22 b32
    // a03 a13 a23 a33   b03 b13 b23 b33
  }

  // Vertical pass and difference of weighted sums.
  {
    // Load all inputs.
    // TODO(cduvivier): Make variable declarations and allocations aligned so
    //                  we can use _mm_load_si128 instead of _mm_loadu_si128.
    const __m128i w_0 = _mm_loadu_si128((__m128i*)&w[0]);
    const __m128i w_8 = _mm_loadu_si128((__m128i*)&w[8]);

    // Calculate a and b (two 4x4 at once).
    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
    const __m128i b0 = _mm_add_epi16(a0, a1);
    const __m128i b1 = _mm_add_epi16(a3, a2);
    const __m128i b2 = _mm_sub_epi16(a3, a2);
    const __m128i b3 = _mm_sub_epi16(a0, a1);

    // Separate the transforms of inA and inB.
    __m128i A_b0 = _mm_unpacklo_epi64(b0, b1);
    __m128i A_b2 = _mm_unpacklo_epi64(b2, b3);
    __m128i B_b0 = _mm_unpackhi_epi64(b0, b1);
    __m128i B_b2 = _mm_unpackhi_epi64(b2, b3);

    {
      // sign(b) = b >> 15  (0x0000 if positive, 0xffff if negative)
      const __m128i sign_A_b0 = _mm_srai_epi16(A_b0, 15);
      const __m128i sign_A_b2 = _mm_srai_epi16(A_b2, 15);
      const __m128i sign_B_b0 = _mm_srai_epi16(B_b0, 15);
      const __m128i sign_B_b2 = _mm_srai_epi16(B_b2, 15);

      // b = abs(b) = (b ^ sign) - sign
      A_b0 = _mm_xor_si128(A_b0, sign_A_b0);
      A_b2 = _mm_xor_si128(A_b2, sign_A_b2);
      B_b0 = _mm_xor_si128(B_b0, sign_B_b0);
      B_b2 = _mm_xor_si128(B_b2, sign_B_b2);
      A_b0 = _mm_sub_epi16(A_b0, sign_A_b0);
      A_b2 = _mm_sub_epi16(A_b2, sign_A_b2);
      B_b0 = _mm_sub_epi16(B_b0, sign_B_b0);
      B_b2 = _mm_sub_epi16(B_b2, sign_B_b2);
    }

    // b = abs(b) + 3
    A_b0 = _mm_add_epi16(A_b0, three);
    A_b2 = _mm_add_epi16(A_b2, three);
    B_b0 = _mm_add_epi16(B_b0, three);
    B_b2 = _mm_add_epi16(B_b2, three);

    // abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3
    // b = (abs(b) + 3) >> 3
    A_b0 = _mm_srai_epi16(A_b0, 3);
    A_b2 = _mm_srai_epi16(A_b2, 3);
    B_b0 = _mm_srai_epi16(B_b0, 3);
    B_b2 = _mm_srai_epi16(B_b2, 3);

    // weighted sums
    A_b0 = _mm_madd_epi16(A_b0, w_0);
    A_b2 = _mm_madd_epi16(A_b2, w_8);
    B_b0 = _mm_madd_epi16(B_b0, w_0);
    B_b2 = _mm_madd_epi16(B_b2, w_8);
    A_b0 = _mm_add_epi32(A_b0, A_b2);
    B_b0 = _mm_add_epi32(B_b0, B_b2);

    // difference of weighted sums
    A_b0 = _mm_sub_epi32(A_b0, B_b0);
    _mm_storeu_si128((__m128i*)&sum[0], A_b0);
  }
  return sum[0] + sum[1] + sum[2] + sum[3];
}
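
For reference, the following is an illustrative scalar transcription of the vector code above (not taken from the source): each 4x4 block is transformed with the same butterfly, the absolute coefficients are scaled and weighted, and the SSE2 routine's result corresponds to the difference of the two per-block sums, i.e. roughly TTransformScalarSketch(inA, w) - TTransformScalarSketch(inB, w). BPS (the row stride) is assumed from the surrounding code, and abs() comes from <stdlib.h>.

// Illustrative sketch only -- weighted sum of one transformed 4x4 block.
static int TTransformScalarSketch(const uint8_t* in, const uint16_t* const w) {
  int tmp[16];
  int sum = 0;
  int i;
  // Horizontal pass: butterfly scaled by 4, plus the "(a0 != 0)" b0_extra term.
  for (i = 0; i < 4; ++i) {
    const uint8_t* const row = in + i * BPS;
    const int a0 = (row[0] + row[2]) << 2;
    const int a1 = (row[1] + row[3]) << 2;
    const int a2 = (row[1] - row[3]) << 2;
    const int a3 = (row[0] - row[2]) << 2;
    tmp[0 + i * 4] = a0 + a1 + (a0 != 0);
    tmp[1 + i * 4] = a3 + a2;
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
  // Vertical pass, then (abs(b) + 3) >> 3 and the per-coefficient weights.
  for (i = 0; i < 4; ++i) {
    const int a0 = tmp[0 + i] + tmp[8 + i];
    const int a1 = tmp[4 + i] + tmp[12 + i];
    const int a2 = tmp[4 + i] - tmp[12 + i];
    const int a3 = tmp[0 + i] - tmp[8 + i];
    const int b0 = a0 + a1;
    const int b1 = a3 + a2;
    const int b2 = a3 - a2;
    const int b3 = a0 - a1;
    sum += w[i +  0] * ((abs(b0) + 3) >> 3);
    sum += w[i +  4] * ((abs(b1) + 3) >> 3);
    sum += w[i +  8] * ((abs(b2) + 3) >> 3);
    sum += w[i + 12] * ((abs(b3) + 3) >> 3);
  }
  return sum;
}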