Esempio n. 1
0
/**
 * Processes two doubles at a time
 */
int
_mandelbrot_2( double const * const c_re_arg, 
	           double const * const c_im_arg, 
	           int                  max_iter 
	         )
{
	__m128d z_re = _mm_load_pd(c_re_arg);
	__m128d z_im = _mm_load_pd(c_im_arg);
	__m128d y_re;
	__m128d y_im;
	__m128d c_re = z_re;
	__m128d c_im = z_im;

	__m128i count = _mm_set1_epi64x(0);

	__m128d md;
	__m128d mt;
	__m128i mi = _mm_set1_epi16(0xffff);;

	__m128d two = _mm_set1_pd(2.0);
	__m128i one = _mm_set1_epi64x(1);

	for (int i = 0; i<max_iter; i+=1)
	{
		// y = z .* z;
		y_re = _mm_mul_pd(z_re, z_re);
		y_im = _mm_mul_pd(z_im, z_im);

		// y = z * z;
		y_re = _mm_sub_pd(y_re, y_im);
		y_im = _mm_mul_pd(z_re, z_im);
		y_im = _mm_add_pd(y_im, y_im);

		// z = z * z + c
		z_re = _mm_add_pd(y_re, c_re);
		z_im = _mm_add_pd(y_im, c_im);

		// if condition
		// md = _mm_add_pd(z_re, z_im);
		// md = _mm_cmplt_pd(md, four);
		md = _mm_cmplt_pd(z_re, two);
		mt = _mm_cmplt_pd(z_im, two);
		md = _mm_and_pd(md, mt);
		mi = _mm_and_si128(mi, (__m128i) md);
		// PRINT_M128I(mi);
		if ( !_mm_movemask_pd(md) ) { break; }

		// count iterations
		count = _mm_add_epi64( count, _mm_and_si128( mi, one) );
	}

	int val;
	count = _mm_add_epi64( _mm_srli_si128(count, 8), count );
	val   = _mm_cvtsi128_si64( count );

	return val;
}
Esempio n. 2
0
void left_shift_w2w_sse2(const void *src, void *dst, unsigned shift, unsigned left, unsigned right)
{
	const uint16_t *src_p = static_cast<const uint16_t *>(src);
	uint16_t *dst_p = static_cast<uint16_t *>(dst);

	unsigned vec_left = ceil_n(left, 8);
	unsigned vec_right = floor_n(right, 8);

	__m128i count = _mm_set1_epi64x(shift);

	if (left != vec_left) {
		__m128i x = _mm_load_si128((const __m128i *)(src_p + vec_left - 8));
		x = _mm_sll_epi16(x, count);
		mm_store_left_epi16(dst_p + vec_left - 8, x, vec_left - left);
	}

	for (unsigned j = vec_left; j < vec_right; j += 8) {
		__m128i x = _mm_load_si128((const __m128i *)(src_p + j));
		x = _mm_sll_epi16(x, count);
		_mm_store_si128((__m128i *)(dst_p + j), x);
	}

	if (right != vec_right) {
		__m128i x = _mm_load_si128((const __m128i *)(src_p + vec_right));
		x = _mm_sll_epi16(x, count);
		mm_store_right_epi16(dst_p + vec_right, x, right - vec_right);
	}
}
Esempio n. 3
0
void left_shift_b2b_sse2(const void *src, void *dst, unsigned shift, unsigned left, unsigned right)
{
	const uint8_t *src_p = static_cast<const uint8_t *>(src);
	uint8_t *dst_p = static_cast<uint8_t *>(dst);

	unsigned vec_left = ceil_n(left, 16);
	unsigned vec_right = floor_n(right, 16);

	__m128i count = _mm_set1_epi64x(shift);

	if (left != vec_left) {
		__m128i x = _mm_load_si128((const __m128i *)(src_p + vec_left - 16));
		x = mm_sll_epi8(x, count);
		mm_store_idxhi_epi8((__m128i *)(dst_p + vec_left - 16), x, left % 16);
	}

	for (unsigned j = vec_left; j < vec_right; j += 16) {
		__m128i x = _mm_load_si128((const __m128i *)(src_p + j));
		x = mm_sll_epi8(x, count);
		_mm_store_si128((__m128i *)(dst_p + j), x);
	}

	if (right != vec_right) {
		__m128i x = _mm_load_si128((const __m128i *)(src_p + vec_right));
		x = mm_sll_epi8(x, count);
		mm_store_idxlo_epi8((__m128i *)(dst_p + vec_right), x, right % 16);
	}
}
Esempio n. 4
0
void GarbledCct3::gen_next_gen_inp_com(const Bytes &row, size_t kx)
{
	static Bytes tmp;

	__m128i out_key[2];
	tmp = m_prng.rand(Env::k());
	tmp.set_ith_bit(0, 0);
	tmp.resize(16, 0);
	out_key[0] = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));
	out_key[1] = _mm_xor_si128(out_key[0], m_R);

	Bytes msg(m_gen_inp_decom[0].size());
	for (size_t jx = 0; jx < Env::circuit().gen_inp_cnt(); jx++)
	{
		if (row.get_ith_bit(jx))
		{
			byte bit = m_gen_inp_mask.get_ith_bit(jx);
			msg ^= m_gen_inp_decom[2*jx+bit];
			//msg ^= m_gen_inp_decom[2*jx];
		}
	}

	__m128i in_key[2], aes_plaintext, aes_ciphertext;

	aes_plaintext = _mm_set1_epi64x((uint64_t)kx);

	tmp.assign(msg.begin(), msg.begin()+Env::key_size_in_bytes());
	tmp.resize(16, 0);
	in_key[0] = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));
	in_key[1] = _mm_xor_si128(in_key[0], m_R);

	KDF128((uint8_t*)&aes_plaintext, (uint8_t*)&aes_ciphertext, (uint8_t*)&in_key[0]);
	aes_ciphertext = _mm_and_si128(aes_ciphertext, m_clear_mask);
	out_key[0] = _mm_xor_si128(out_key[0], aes_ciphertext);

	KDF128((uint8_t*)&aes_plaintext, (uint8_t*)&aes_ciphertext, (uint8_t*)&in_key[1]);
	aes_ciphertext = _mm_and_si128(aes_ciphertext, m_clear_mask);
	out_key[1] = _mm_xor_si128(out_key[1], aes_ciphertext);

	const byte bit = msg.get_ith_bit(0);

	tmp.resize(16);
	_mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), out_key[  bit]);
	m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes());

	_mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), out_key[1-bit]);
	m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes());
//	tmp.resize(16);
//	_mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), out_key[0]);
//	std::cout << "GEN " << m_gen_inp_hash_ix << " : (" << tmp.to_hex();
//
//	tmp.resize(16);
//	_mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), out_key[1]);
//	std::cout << ", " << tmp.to_hex() << ")" << std::endl;

	m_gen_inp_hash_ix++;
}
Esempio n. 5
0
void left_shift_b2w_sse2(const void *src, void *dst, unsigned shift, unsigned left, unsigned right)
{
	const uint8_t *src_p = static_cast<const uint8_t *>(src);
	uint16_t *dst_p = static_cast<uint16_t *>(dst);

	unsigned vec_left = ceil_n(left, 16);
	unsigned vec_right = floor_n(right, 16);

	__m128i count = _mm_set1_epi64x(shift);

	if (left != vec_left) {
		__m128i x = _mm_load_si128((const __m128i *)(src_p + vec_left - 16));
		__m128i lo = _mm_unpacklo_epi8(x, _mm_setzero_si128());
		__m128i hi = _mm_unpackhi_epi8(x, _mm_setzero_si128());
		lo = _mm_sll_epi16(lo, count);
		hi = _mm_sll_epi16(hi, count);

		if (vec_left - left > 8) {
			mm_store_left_epi16(dst_p + vec_left - 16, lo, (vec_left - left) % 8);
			_mm_store_si128((__m128i *)(dst_p + vec_left - 8), hi);
		} else {
			mm_store_left_epi16(dst_p + vec_left - 8, hi, vec_left - left);
		}
	}

	for (unsigned j = vec_left; j < vec_right; j += 16) {
		__m128i x = _mm_load_si128((const __m128i *)(src_p + j));
		__m128i lo = _mm_unpacklo_epi8(x, _mm_setzero_si128());
		__m128i hi = _mm_unpackhi_epi8(x, _mm_setzero_si128());
		lo = _mm_sll_epi16(lo, count);
		hi = _mm_sll_epi16(hi, count);

		_mm_store_si128((__m128i *)(dst_p + j + 0), lo);
		_mm_store_si128((__m128i *)(dst_p + j + 8), hi);
	}

	if (right != vec_right) {
		__m128i x = _mm_load_si128((const __m128i *)(src_p + vec_right));
		__m128i lo = _mm_unpacklo_epi8(x, _mm_setzero_si128());
		__m128i hi = _mm_unpackhi_epi8(x, _mm_setzero_si128());
		lo = _mm_sll_epi16(lo, count);
		hi = _mm_sll_epi16(hi, count);

		if (right - vec_right > 8) {
			_mm_store_si128((__m128i *)(dst_p + vec_right), lo);
			mm_store_right_epi16(dst_p + vec_right + 8, hi, (right - vec_right) % 8);
		} else {
			mm_store_right_epi16(dst_p + vec_right, lo, right - vec_right);
		}
	}
}
Esempio n. 6
0
void GarbledCct3::evl_next_gen_inp_com(const Bytes &row, size_t kx)
{
	Bytes out(m_gen_inp_decom[0].size());

	for (size_t jx = 0; jx < Env::circuit().gen_inp_cnt(); jx++)
	{
		if (row.get_ith_bit(jx)) { out ^= m_gen_inp_decom[jx]; }
	}

	byte bit = out.get_ith_bit(0);

	static Bytes tmp;

	Bytes::iterator it = m_i_bufr_ix + bit*Env::key_size_in_bytes();

	__m128i aes_key, aes_plaintext, aes_ciphertext, out_key;

	tmp.assign(out.begin(), out.begin()+Env::key_size_in_bytes());
	tmp.resize(16, 0);
	aes_key = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));

	aes_plaintext = _mm_set1_epi64x((uint64_t)kx);

	KDF128((uint8_t*)&aes_plaintext, (uint8_t*)&aes_ciphertext, (uint8_t*)&aes_key);
	aes_ciphertext = _mm_and_si128(aes_ciphertext, m_clear_mask);

	tmp.assign(it, it+Env::key_size_in_bytes());
	tmp.resize(16, 0);
	out_key = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));
	out_key = _mm_xor_si128(out_key, aes_ciphertext);

	bit = _mm_extract_epi8(out_key, 0) & 0x01;
	m_gen_inp_hash.set_ith_bit(kx, bit);

	m_i_bufr_ix += 2*Env::key_size_in_bytes();

//	tmp.resize(16);
//	_mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), out_key);
//	std::cout << "EVL " << m_gen_inp_hash_ix << " : " << tmp.to_hex() << std::endl;

	m_gen_inp_hash_ix++;
}
Esempio n. 7
0
void left_shift_w2b_sse2(const void *src, void *dst, unsigned shift, unsigned left, unsigned right)
{
	const uint16_t *src_p = static_cast<const uint16_t *>(src);
	uint8_t *dst_p = static_cast<uint8_t *>(dst);

	unsigned vec_left = ceil_n(left, 16);
	unsigned vec_right = floor_n(right, 16);

	__m128i count = _mm_set1_epi64x(shift);

	if (left != vec_left) {
		__m128i lo = _mm_load_si128((const __m128i *)(src_p + vec_left - 16));
		__m128i hi = _mm_load_si128((const __m128i *)(src_p + vec_left - 8));
		lo = _mm_sll_epi16(lo, count);
		hi = _mm_sll_epi16(hi, count);
		lo = _mm_packus_epi16(lo, hi);
		mm_store_left_epi8(dst_p + vec_left - 16, lo, vec_left - left);
	}

	for (unsigned j = vec_left; j < vec_right; j += 16) {
		__m128i lo = _mm_load_si128((const __m128i *)(src_p + j + 0));
		__m128i hi = _mm_load_si128((const __m128i *)(src_p + j + 8));
		lo = _mm_sll_epi16(lo, count);
		hi = _mm_sll_epi16(hi, count);
		lo = _mm_packus_epi16(lo, hi);
		_mm_store_si128((__m128i *)(dst_p + j), lo);
	}

	if (right != vec_right) {
		__m128i lo = _mm_load_si128((const __m128i *)(src_p + vec_right + 0));
		__m128i hi = _mm_load_si128((const __m128i *)(src_p + vec_right + 8));
		lo = _mm_sll_epi16(lo, count);
		hi = _mm_sll_epi16(hi, count);
		lo = _mm_packus_epi16(lo, hi);
		mm_store_right_epi8(dst_p + vec_right, lo, right - vec_right);
	}
}
/*****************************************************************************
 * This function utilises 3 properties of the cost function lookup tables,   *
 * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in       *
 * vp9_encoder.c.                                                            *
 * For the joint cost:                                                       *
 *   - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3]           *
 * For the component costs:                                                  *
 *   - For all i: mvsadcost[0][i] == mvsadcost[1][i]                         *
 *         (Equal costs for both components)                                 *
 *   - For all i: mvsadcost[0][i] == mvsadcost[0][-i]                        *
 *         (Cost function is even)                                           *
 * If these do not hold, then this function cannot be used without           *
 * modification, in which case you can revert to using the C implementation, *
 * which does not rely on these properties.                                  *
 *****************************************************************************/
int vp9_diamond_search_sad_avx(const MACROBLOCK *x,
                               const search_site_config *cfg,
                               MV *ref_mv, MV *best_mv, int search_param,
                               int sad_per_bit, int *num00,
                               const vp9_variance_fn_ptr_t *fn_ptr,
                               const MV *center_mv) {
  const int_mv maxmv = pack_int_mv(x->mv_row_max, x->mv_col_max);
  const __m128i v_max_mv_w = _mm_set1_epi32(maxmv.as_int);
  const int_mv minmv = pack_int_mv(x->mv_row_min, x->mv_col_min);
  const __m128i v_min_mv_w = _mm_set1_epi32(minmv.as_int);

  const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit);

  const __m128i v_joint_cost_0_d = _mm_set1_epi32(x->nmvjointsadcost[0]);
  const __m128i v_joint_cost_1_d = _mm_set1_epi32(x->nmvjointsadcost[1]);

  // search_param determines the length of the initial step and hence the number
  // of iterations.
  // 0 = initial step (MAX_FIRST_STEP) pel
  // 1 = (MAX_FIRST_STEP/2) pel,
  // 2 = (MAX_FIRST_STEP/4) pel...
  const       MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param];
  const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param];
  const int tot_steps = cfg->total_steps - search_param;

  const int_mv fcenter_mv = pack_int_mv(center_mv->row >> 3,
                                        center_mv->col >> 3);
  const __m128i vfcmv = _mm_set1_epi32(fcenter_mv.as_int);

  const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row);
  const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col);

  int_mv bmv = pack_int_mv(ref_row, ref_col);
  int_mv new_bmv = bmv;
  __m128i v_bmv_w = _mm_set1_epi32(bmv.as_int);

  const int what_stride = x->plane[0].src.stride;
  const int in_what_stride = x->e_mbd.plane[0].pre[0].stride;
  const uint8_t *const what = x->plane[0].src.buf;
  const uint8_t *const in_what = x->e_mbd.plane[0].pre[0].buf +
                                 ref_row * in_what_stride + ref_col;

  // Work out the start point for the search
  const uint8_t *best_address = in_what;
  const uint8_t *new_best_address = best_address;
#if ARCH_X86_64
  __m128i v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
#else
  __m128i v_ba_d = _mm_set1_epi32((intptr_t)best_address);
#endif

  unsigned int best_sad;

  int i;
  int j;
  int step;

  // Check the prerequisite cost function properties that are easy to check
  // in an assert. See the function-level documentation for details on all
  // prerequisites.
  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]);
  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]);

  // Check the starting position
  best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride);
  best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit);

  *num00 = 0;

  for (i = 0, step = 0; step < tot_steps; step++) {
    for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) {
      __m128i v_sad_d;
      __m128i v_cost_d;
      __m128i v_outside_d;
      __m128i v_inside_d;
      __m128i v_diff_mv_w;
#if ARCH_X86_64
      __m128i v_blocka[2];
#else
      __m128i v_blocka[1];
#endif

      // Compute the candidate motion vectors
      const __m128i v_ss_mv_w = _mm_loadu_si128((const __m128i*)&ss_mv[i]);
      const __m128i v_these_mv_w = _mm_add_epi16(v_bmv_w, v_ss_mv_w);
      // Clamp them to the search bounds
      __m128i v_these_mv_clamp_w = v_these_mv_w;
      v_these_mv_clamp_w = _mm_min_epi16(v_these_mv_clamp_w, v_max_mv_w);
      v_these_mv_clamp_w = _mm_max_epi16(v_these_mv_clamp_w, v_min_mv_w);
      // The ones that did not change are inside the search area
      v_inside_d = _mm_cmpeq_epi32(v_these_mv_clamp_w, v_these_mv_w);

      // If none of them are inside, then move on
      if (__likely__(_mm_test_all_zeros(v_inside_d, v_inside_d))) {
        continue;
      }

      // The inverse mask indicates which of the MVs are outside
      v_outside_d = _mm_xor_si128(v_inside_d, _mm_set1_epi8(0xff));
      // Shift right to keep the sign bit clear, we will use this later
      // to set the cost to the maximum value.
      v_outside_d = _mm_srli_epi32(v_outside_d, 1);

      // Compute the difference MV
      v_diff_mv_w = _mm_sub_epi16(v_these_mv_clamp_w, vfcmv);
      // We utilise the fact that the cost function is even, and use the
      // absolute difference. This allows us to use unsigned indexes later
      // and reduces cache pressure somewhat as only a half of the table
      // is ever referenced.
      v_diff_mv_w = _mm_abs_epi16(v_diff_mv_w);

      // Compute the SIMD pointer offsets.
      {
#if ARCH_X86_64  //  sizeof(intptr_t) == 8
        // Load the offsets
        __m128i v_bo10_q = _mm_loadu_si128((const __m128i*)&ss_os[i+0]);
        __m128i v_bo32_q = _mm_loadu_si128((const __m128i*)&ss_os[i+2]);
        // Set the ones falling outside to zero
        v_bo10_q = _mm_and_si128(v_bo10_q,
                                 _mm_cvtepi32_epi64(v_inside_d));
        v_bo32_q = _mm_and_si128(v_bo32_q,
                                 _mm_unpackhi_epi32(v_inside_d, v_inside_d));
        // Compute the candidate addresses
        v_blocka[0] = _mm_add_epi64(v_ba_q, v_bo10_q);
        v_blocka[1] = _mm_add_epi64(v_ba_q, v_bo32_q);
#else  // ARCH_X86 //  sizeof(intptr_t) == 4
        __m128i v_bo_d = _mm_loadu_si128((const __m128i*)&ss_os[i]);
        v_bo_d = _mm_and_si128(v_bo_d, v_inside_d);
        v_blocka[0] = _mm_add_epi32(v_ba_d, v_bo_d);
#endif
      }

      fn_ptr->sdx4df(what, what_stride,
                     (const uint8_t **)&v_blocka[0], in_what_stride,
                     (uint32_t*)&v_sad_d);

      // Look up the component cost of the residual motion vector
      {
        const int32_t row0 = _mm_extract_epi16(v_diff_mv_w, 0);
        const int32_t col0 = _mm_extract_epi16(v_diff_mv_w, 1);
        const int32_t row1 = _mm_extract_epi16(v_diff_mv_w, 2);
        const int32_t col1 = _mm_extract_epi16(v_diff_mv_w, 3);
        const int32_t row2 = _mm_extract_epi16(v_diff_mv_w, 4);
        const int32_t col2 = _mm_extract_epi16(v_diff_mv_w, 5);
        const int32_t row3 = _mm_extract_epi16(v_diff_mv_w, 6);
        const int32_t col3 = _mm_extract_epi16(v_diff_mv_w, 7);

        // Note: This is a use case for vpgather in AVX2
        const uint32_t cost0 = x->nmvsadcost[0][row0] + x->nmvsadcost[0][col0];
        const uint32_t cost1 = x->nmvsadcost[0][row1] + x->nmvsadcost[0][col1];
        const uint32_t cost2 = x->nmvsadcost[0][row2] + x->nmvsadcost[0][col2];
        const uint32_t cost3 = x->nmvsadcost[0][row3] + x->nmvsadcost[0][col3];

        __m128i v_cost_10_d, v_cost_32_d;

        v_cost_10_d = _mm_cvtsi32_si128(cost0);
        v_cost_10_d = _mm_insert_epi32(v_cost_10_d, cost1, 1);

        v_cost_32_d = _mm_cvtsi32_si128(cost2);
        v_cost_32_d = _mm_insert_epi32(v_cost_32_d, cost3, 1);

        v_cost_d = _mm_unpacklo_epi64(v_cost_10_d, v_cost_32_d);
      }

      // Now add in the joint cost
      {
        const __m128i v_sel_d = _mm_cmpeq_epi32(v_diff_mv_w,
                                                _mm_setzero_si128());
        const __m128i v_joint_cost_d = _mm_blendv_epi8(v_joint_cost_1_d,
                                                       v_joint_cost_0_d,
                                                       v_sel_d);
        v_cost_d = _mm_add_epi32(v_cost_d, v_joint_cost_d);
      }

      // Multiply by sad_per_bit
      v_cost_d = _mm_mullo_epi32(v_cost_d, v_spb_d);
      // ROUND_POWER_OF_TWO(v_cost_d, 8)
      v_cost_d = _mm_add_epi32(v_cost_d, _mm_set1_epi32(0x80));
      v_cost_d = _mm_srai_epi32(v_cost_d, 8);
      // Add the cost to the sad
      v_sad_d = _mm_add_epi32(v_sad_d, v_cost_d);

      // Make the motion vectors outside the search area have max cost
      // by or'ing in the comparison mask, this way the minimum search won't
      // pick them.
      v_sad_d = _mm_or_si128(v_sad_d, v_outside_d);

      // Find the minimum value and index horizontally in v_sad_d
      {
        // Try speculatively on 16 bits, so we can use the minpos intrinsic
        const __m128i v_sad_w = _mm_packus_epi32(v_sad_d, v_sad_d);
        const __m128i v_minp_w = _mm_minpos_epu16(v_sad_w);

        uint32_t local_best_sad = _mm_extract_epi16(v_minp_w, 0);
        uint32_t local_best_idx = _mm_extract_epi16(v_minp_w, 1);

        // If the local best value is not saturated, just use it, otherwise
        // find the horizontal minimum again the hard way on 32 bits.
        // This is executed rarely.
        if (__unlikely__(local_best_sad == 0xffff)) {
          __m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d;

          v_loval_d = v_sad_d;
          v_loidx_d = _mm_set_epi32(3, 2, 1, 0);
          v_hival_d = _mm_srli_si128(v_loval_d, 8);
          v_hiidx_d = _mm_srli_si128(v_loidx_d, 8);

          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);

          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);
          v_hival_d = _mm_srli_si128(v_loval_d, 4);
          v_hiidx_d = _mm_srli_si128(v_loidx_d, 4);

          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);

          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);

          local_best_sad = _mm_extract_epi32(v_loval_d, 0);
          local_best_idx = _mm_extract_epi32(v_loidx_d, 0);
        }

        // Update the global minimum if the local minimum is smaller
        if (__likely__(local_best_sad < best_sad)) {
          new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx];
          new_best_address = ((const uint8_t **)v_blocka)[local_best_idx];

          best_sad = local_best_sad;
        }
      }
    }

    bmv = new_bmv;
    best_address = new_best_address;

    v_bmv_w = _mm_set1_epi32(bmv.as_int);
#if ARCH_X86_64
    v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
#else
    v_ba_d = _mm_set1_epi32((intptr_t)best_address);
#endif

    if (__unlikely__(best_address == in_what)) {
      (*num00)++;
    }
  }

  *best_mv = bmv.as_mv;
  return best_sad;
}
Esempio n. 9
0
static inline int blake512_compress( state * state, const u8 * datablock ) 
{

  __m128i row1l;
  __m128i row2l;
  __m128i row3l;
  __m128i row4l;
  u64 row1hl, row1hh;
  u64 row2hl, row2hh;
  u64 row3hl, row3hh;
  u64 row4hl, row4hh;

  const __m128i r16 = _mm_setr_epi8(2,3,4,5,6,7,0,1,10,11,12,13,14,15,8,9);
  const __m128i u8to64 = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);

  union
  {
    __m128i u128[8];
    u64     u64[16];
  } m;

  __m128i t0, t1, t2, t3, t4, t5, t6, t7;
  u64     u0, u1, u2, u3;
  __m128i b0;
  u64 b1l, b1h;

  m.u128[0] = _mm_loadu_si128((__m128i*)(datablock +   0));
  m.u128[1] = _mm_loadu_si128((__m128i*)(datablock +  16));
  m.u128[2] = _mm_loadu_si128((__m128i*)(datablock +  32));
  m.u128[3] = _mm_loadu_si128((__m128i*)(datablock +  48));
  m.u128[4] = _mm_loadu_si128((__m128i*)(datablock +  64));
  m.u128[5] = _mm_loadu_si128((__m128i*)(datablock +  80));
  m.u128[6] = _mm_loadu_si128((__m128i*)(datablock +  96));
  m.u128[7] = _mm_loadu_si128((__m128i*)(datablock + 112));

  m.u128[0] = BSWAP64(m.u128[0]);
  m.u128[1] = BSWAP64(m.u128[1]);
  m.u128[2] = BSWAP64(m.u128[2]);
  m.u128[3] = BSWAP64(m.u128[3]);
  m.u128[4] = BSWAP64(m.u128[4]);
  m.u128[5] = BSWAP64(m.u128[5]);
  m.u128[6] = BSWAP64(m.u128[6]);
  m.u128[7] = BSWAP64(m.u128[7]);

  row1l = _mm_load_si128((__m128i*)&state->h[0]);
  row1hl = state->h[2];
  row1hh = state->h[3];

  row2l = _mm_load_si128((__m128i*)&state->h[4]);
  row2hl = state->h[6];
  row2hh = state->h[7];

  row3l = _mm_set_epi64x(0x13198A2E03707344ULL, 0x243F6A8885A308D3ULL);
  row3hl = 0xA4093822299F31D0ULL;
  row3hh = 0x082EFA98EC4E6C89ULL;

  row4l = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0x452821E638D01377ULL);
  row4hl = 0xC0AC29B7C97C50DDULL;
  row4hh = 0x3F84D5B5B5470917ULL;

  if(!state->nullt)
  {
  	row4l = _mm_xor_si128(row4l, _mm_set1_epi64x(state->t[0]));
    row4hl ^= state->t[1];
    row4hh ^= state->t[1];
  }

  ROUND( 0);
  ROUND( 1);
  ROUND( 2);
  ROUND( 3);
  ROUND( 4);
  ROUND( 5);
  ROUND( 6);
  ROUND( 7);
  ROUND( 8);
  ROUND( 9);
  ROUND(10);
  ROUND(11);
  ROUND(12);
  ROUND(13);
  ROUND(14);
  ROUND(15);

  row1l = _mm_xor_si128(row3l,row1l);
  row1hl ^= row3hl;
  row1hh ^= row3hh;

  _mm_store_si128((__m128i*)&state->h[0], _mm_xor_si128(row1l, _mm_load_si128((__m128i*)&state->h[0])));
  state->h[2] ^= row1hl;
  state->h[3] ^= row1hh;

  row2l = _mm_xor_si128(row4l,row2l);
  row2hl ^= row4hl;
  row2hh ^= row4hh;

  _mm_store_si128((__m128i*)&state->h[4], _mm_xor_si128(row2l, _mm_load_si128((__m128i*)&state->h[4])));
  state->h[6] ^= row2hl;
  state->h[7] ^= row2hh;
  
  return 0;
}
Esempio n. 10
0
/* vms_expma:
 *   Compute the component-wise exponential minus <a>:
 *       r[i] <-- e^x[i] - a
 *
 *   The following comments apply to the SSE2 version of this code:
 *
 *   Computation is done four doubles as a time by doing computation in paralell
 *   on two vectors of two doubles using SSE2 intrisics.  If size is not a
 *   multiple of 4, the remaining elements are computed using the stdlib exp().
 *
 *   The computation is done by first doing a range reduction of the argument of
 *   the type e^x = 2^k * e^f choosing k and f so that f is in [-0.5, 0.5].
 *   Then 2^k can be computed exactly using bit operations to build the double
 *   result and e^f can be efficiently computed with enough precision using a
 *   polynomial approximation.
 *
 *   The polynomial approximation is done with 11th order polynomial computed by
 *   Remez algorithm with the Solya suite, instead of the more classical Pade
 *   polynomial form cause it is better suited to parallel execution. In order
 *   to achieve the same precision, a Pade form seems to require three less
 *   multiplications but need a very costly division, so it will be less
 *   efficient.
 *
 *   The maximum error is less than 1lsb and special cases are correctly
 *   handled:
 *     +inf or +oor  -->   return +inf
 *     -inf or -oor  -->   return  0.0
 *     qNaN or sNaN  -->   return qNaN
 *
 *   This code is copyright 2004-2012 Thomas Lavergne and licenced under the
 *   BSD licence like the remaining of Wapiti.
 */
void xvm_expma(double r[], const double x[], double a, uint64_t N) {
#if defined(__SSE2__) && !defined(XVM_ANSI)
  #define xvm_vconst(v) (_mm_castsi128_pd(_mm_set1_epi64x((v))))
	assert(r != NULL && ((uintptr_t)r % 16) == 0);
	assert(x != NULL && ((uintptr_t)x % 16) == 0);
	const __m128i vl  = _mm_set1_epi64x(0x3ff0000000000000ULL);
	const __m128d ehi = xvm_vconst(0x4086232bdd7abcd2ULL);
	const __m128d elo = xvm_vconst(0xc086232bdd7abcd2ULL);
	const __m128d l2e = xvm_vconst(0x3ff71547652b82feULL);
	const __m128d hal = xvm_vconst(0x3fe0000000000000ULL);
	const __m128d nan = xvm_vconst(0xfff8000000000000ULL);
	const __m128d inf = xvm_vconst(0x7ff0000000000000ULL);
	const __m128d c1  = xvm_vconst(0x3fe62e4000000000ULL);
	const __m128d c2  = xvm_vconst(0x3eb7f7d1cf79abcaULL);
	const __m128d p0  = xvm_vconst(0x3feffffffffffffeULL);
	const __m128d p1  = xvm_vconst(0x3ff000000000000bULL);
	const __m128d p2  = xvm_vconst(0x3fe0000000000256ULL);
	const __m128d p3  = xvm_vconst(0x3fc5555555553a2aULL);
	const __m128d p4  = xvm_vconst(0x3fa55555554e57d3ULL);
	const __m128d p5  = xvm_vconst(0x3f81111111362f4fULL);
	const __m128d p6  = xvm_vconst(0x3f56c16c25f3bae1ULL);
	const __m128d p7  = xvm_vconst(0x3f2a019fc9310c33ULL);
	const __m128d p8  = xvm_vconst(0x3efa01825f3cb28bULL);
	const __m128d p9  = xvm_vconst(0x3ec71e2bd880fdd8ULL);
	const __m128d p10 = xvm_vconst(0x3e9299068168ac8fULL);
	const __m128d p11 = xvm_vconst(0x3e5ac52350b60b19ULL);
	const __m128d va  = _mm_set1_pd(a);
	for (uint64_t n = 0; n < N; n += 4) {
		__m128d mn1, mn2, mi1, mi2;
		__m128d t1,  t2,  d1,  d2;
		__m128d v1,  v2,  w1,  w2;
		__m128i k1,  k2;
		__m128d f1,  f2;
		// Load the next four values
		__m128d x1 = _mm_load_pd(x + n    );
		__m128d x2 = _mm_load_pd(x + n + 2);
		// Check for out of ranges, infinites and NaN
		mn1 = _mm_cmpneq_pd(x1, x1);	mn2 = _mm_cmpneq_pd(x2, x2);
		mi1 = _mm_cmpgt_pd(x1, ehi);	mi2 = _mm_cmpgt_pd(x2, ehi);
		x1  = _mm_max_pd(x1, elo);	x2  = _mm_max_pd(x2, elo);
		// Range reduction: we search k and f such that e^x = 2^k * e^f
		// with f in [-0.5, 0.5]
		t1  = _mm_mul_pd(x1, l2e);	t2  = _mm_mul_pd(x2, l2e);
		t1  = _mm_add_pd(t1, hal);	t2  = _mm_add_pd(t2, hal);
		k1  = _mm_cvttpd_epi32(t1);	k2  = _mm_cvttpd_epi32(t2);
		d1  = _mm_cvtepi32_pd(k1);	d2  = _mm_cvtepi32_pd(k2);
		t1  = _mm_mul_pd(d1, c1);	t2  = _mm_mul_pd(d2, c1);
		f1  = _mm_sub_pd(x1, t1);	f2  = _mm_sub_pd(x2, t2);
		t1  = _mm_mul_pd(d1, c2);	t2  = _mm_mul_pd(d2, c2);
		f1  = _mm_sub_pd(f1, t1);	f2  = _mm_sub_pd(f2, t2);
		// Evaluation of e^f using a 11th order polynom in Horner form
		v1  = _mm_mul_pd(f1, p11);	v2  = _mm_mul_pd(f2, p11);
		v1  = _mm_add_pd(v1, p10);	v2  = _mm_add_pd(v2, p10);
		v1  = _mm_mul_pd(v1, f1);	v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p9);	v2  = _mm_add_pd(v2, p9);
		v1  = _mm_mul_pd(v1, f1);	v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p8);	v2  = _mm_add_pd(v2, p8);
		v1  = _mm_mul_pd(v1, f1);	v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p7);	v2  = _mm_add_pd(v2, p7);
		v1  = _mm_mul_pd(v1, f1);	v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p6);	v2  = _mm_add_pd(v2, p6);
		v1  = _mm_mul_pd(v1, f1);	v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p5);	v2  = _mm_add_pd(v2, p5);
		v1  = _mm_mul_pd(v1, f1);	v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p4);	v2  = _mm_add_pd(v2, p4);
		v1  = _mm_mul_pd(v1, f1);	v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p3);	v2  = _mm_add_pd(v2, p3);
		v1  = _mm_mul_pd(v1, f1);	v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p2);	v2  = _mm_add_pd(v2, p2);
		v1  = _mm_mul_pd(v1, f1);	v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p1);	v2  = _mm_add_pd(v2, p1);
		v1  = _mm_mul_pd(v1, f1);	v2  = _mm_mul_pd(v2, f2);
		v1  = _mm_add_pd(v1, p0);	v2  = _mm_add_pd(v2, p0);
		// Evaluation of 2^k using bitops to achieve exact computation
		k1  = _mm_slli_epi32(k1, 20);	k2  = _mm_slli_epi32(k2, 20);
		k1  = _mm_shuffle_epi32(k1, 0x72);
		k2  = _mm_shuffle_epi32(k2, 0x72);
		k1  = _mm_add_epi32(k1, vl);	k2  = _mm_add_epi32(k2, vl);
		w1  = _mm_castsi128_pd(k1);	w2  = _mm_castsi128_pd(k2);
		// Return to full range to substract <a>
	        v1  = _mm_mul_pd(v1, w1);	v2  = _mm_mul_pd(v2, w2);
		v1  = _mm_sub_pd(v1, va);	v2  = _mm_sub_pd(v2, va);
		// Finally apply infinite and NaN where needed
		v1  = _mm_or_pd(_mm_and_pd(mi1, inf), _mm_andnot_pd(mi1, v1));
		v2  = _mm_or_pd(_mm_and_pd(mi2, inf), _mm_andnot_pd(mi2, v2));
		v1  = _mm_or_pd(_mm_and_pd(mn1, nan), _mm_andnot_pd(mn1, v1));
		v2  = _mm_or_pd(_mm_and_pd(mn2, nan), _mm_andnot_pd(mn2, v2));
		// Store the results
		_mm_store_pd(r + n,     v1);
		_mm_store_pd(r + n + 2, v2);
	}
#else
	for (uint64_t n = 0; n < N; n++)
		r[n] = exp(x[n]) - a;
#endif
}
void GarbledCct::evl_next_gate(const Gate &current_gate)
{
	__m128i current_key, a;
	Bytes::const_iterator it;
	static Bytes tmp;

	if (current_gate.m_tag == Circuit::GEN_INP)
	{
		uint8_t bit = m_gen_inp_mask.get_ith_bit(m_gen_inp_ix);
		Bytes::iterator it = m_i_bufr_ix + bit*Env::key_size_in_bytes();

		tmp = m_M[m_gen_inp_ix].to_bytes().hash(Env::k());
		tmp.resize(16, 0);
		current_key = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));

		tmp.assign(it, it+Env::key_size_in_bytes());
		tmp.resize(16, 0);
		a = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));

		m_i_bufr_ix += Env::key_size_in_bytes()*2;

		current_key = _mm_xor_si128(current_key, a);

		m_gen_inp_ix++;
	}
	else if (current_gate.m_tag == Circuit::EVL_INP)
	{
		uint8_t bit = m_evl_inp.get_ith_bit(m_evl_inp_ix);
		Bytes::iterator it = m_i_bufr_ix + bit*Env::key_size_in_bytes();

		tmp = (*m_ot_keys)[m_evl_inp_ix];
		tmp.resize(16, 0);
		current_key = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));

		tmp.assign(it, it+Env::key_size_in_bytes());
		tmp.resize(16, 0);
		a = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));

		m_i_bufr_ix += Env::key_size_in_bytes()*2;

		current_key = _mm_xor_si128(current_key, a);

		m_evl_inp_ix++;
	}
	else
	{
        const vector<uint64_t> &inputs = current_gate.m_input_idx;

#ifdef FREE_XOR
		if (is_xor(current_gate))
		{
			current_key = inputs.size() == 2?
				_mm_xor_si128(m_w[inputs[0]], m_w[inputs[1]]) : _mm_load_si128(m_w+inputs[0]);
		}
		else
#endif
        if (inputs.size() == 2) // 2-arity gates
		{
        	__m128i aes_key[2], aes_plaintext, aes_ciphertext;

			aes_plaintext = _mm_set1_epi64x(m_gate_ix);

			aes_key[0] = _mm_load_si128(m_w+inputs[0]);
			aes_key[1] = _mm_load_si128(m_w+inputs[1]);

			const uint8_t perm_x = _mm_extract_epi8(aes_key[0], 0) & 0x01;
			const uint8_t perm_y = _mm_extract_epi8(aes_key[1], 0) & 0x01;

			KDF256((uint8_t*)&aes_plaintext, (uint8_t*)&aes_ciphertext, (uint8_t*)aes_key);
			aes_ciphertext = _mm_and_si128(aes_ciphertext, m_clear_mask);
			uint8_t garbled_ix = (perm_y<<1) | (perm_x<<0);

#ifdef GRR
			if (garbled_ix == 0)
			{
				current_key = _mm_load_si128(&aes_ciphertext);
			}
			else
			{
				it = m_i_bufr_ix+(garbled_ix-1)*Env::key_size_in_bytes();
				tmp.assign(it, it+Env::key_size_in_bytes());
				tmp.resize(16, 0);
				a = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));
				current_key = _mm_xor_si128(aes_ciphertext, a);
			}
			m_i_bufr_ix += 3*Env::key_size_in_bytes();
#else
			it = m_i_bufr_ix + garbled_ix*Env::key_size_in_bytes();
			tmp.assign(it, it+Env::key_size_in_bytes());
			tmp.resize(16, 0);
			current_key = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));
			current_key = _mm_xor_si128(current_key, aes_ciphertext);

			m_i_bufr_ix += 4*Env::key_size_in_bytes();
#endif
		}
		else // 1-arity gates
		{
        	__m128i aes_key, aes_plaintext, aes_ciphertext;

			aes_plaintext = _mm_set1_epi64x(m_gate_ix);
			aes_key = _mm_load_si128(m_w+inputs[0]);
			KDF128((uint8_t*)&aes_plaintext, (uint8_t*)&aes_ciphertext, (uint8_t*)&aes_key);
			aes_ciphertext = _mm_and_si128(aes_ciphertext, m_clear_mask);

			const uint8_t perm_x = _mm_extract_epi8(aes_key, 0) & 0x01;

#ifdef GRR
			if (perm_x == 0)
			{
				current_key = _mm_load_si128(&aes_ciphertext);
			}
			else
			{
				tmp.assign(m_i_bufr_ix, m_i_bufr_ix+Env::key_size_in_bytes());
				tmp.resize(16, 0);
				a = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));
				current_key = _mm_xor_si128(aes_ciphertext, a);
			}
			m_i_bufr_ix += Env::key_size_in_bytes();
#else
			it = m_i_bufr_ix + garbled_ix*Env::key_size_in_bytes();
			tmp.assign(it, it+Env::key_size_in_bytes());
			tmp.resize(16, 0);
			current_key = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));
			current_key = _mm_xor_si128(current_key, aes_ciphertext);

			m_i_bufr_ix += 2*Env::key_size_in_bytes();
#endif
		}

		if (current_gate.m_tag == Circuit::EVL_OUT)
		{
			uint8_t out_bit = _mm_extract_epi8(current_key, 0) & 0x01;
			out_bit ^= *m_i_bufr_ix;
			m_evl_out.set_ith_bit(m_evl_out_ix, out_bit);
			m_i_bufr_ix++;

			m_evl_out_ix++;
		}
		else if (current_gate.m_tag == Circuit::GEN_OUT)
		{
			// TODO: Ki08 implementation
			uint8_t out_bit = _mm_extract_epi8(current_key, 0) & 0x01;
			out_bit ^= *m_i_bufr_ix;
			m_gen_out.set_ith_bit(m_gen_out_ix, out_bit);
			m_i_bufr_ix++;

//			m_C[2*m_gen_out_ix+0] = Bytes(m_i_bufr_ix, m_i_bufr_ix+Env::key_size_in_bytes());
//			m_i_bufr_ix += Env::key_size_in_bytes();
//
//			m_C[2*m_gen_out_ix+1] = Bytes(m_i_bufr_ix, m_i_bufr_ix+Env::key_size_in_bytes());
//			m_i_bufr_ix += Env::key_size_in_bytes();

			m_gen_out_ix++;
		}
	}

	_mm_store_si128(m_w+current_gate.m_idx, current_key);

	update_hash(m_i_bufr);
	m_gate_ix++;
}
void GarbledCct::gen_next_gate(const Gate &current_gate)
{
	__m128i current_zero_key;

	if (current_gate.m_tag == Circuit::GEN_INP)
	{
		__m128i a[2];

		// zero_key = m_prng.rand(Env::k());
		static Bytes tmp;

		tmp = m_prng.rand(Env::k());
		tmp.resize(16, 0);
		current_zero_key = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));

		// a[0] = m_M[2*m_gen_inp_ix+0].to_bytes().hash(Env::k());
		tmp = m_M[2*m_gen_inp_ix+0].to_bytes().hash(Env::k());
		tmp.resize(16, 0);
		a[0] = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));

		// a[1] = m_M[2*m_gen_inp_ix+1].to_bytes().hash(Env::k());
		tmp = m_M[2*m_gen_inp_ix+1].to_bytes().hash(Env::k());
		tmp.resize(16, 0);
		a[1] = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));

		// a[0] ^= zero_key; a[1] ^= zero_key ^ R;
		a[0] = _mm_xor_si128(a[0], current_zero_key);
		a[1] = _mm_xor_si128(a[1], _mm_xor_si128(current_zero_key, m_R));

		uint8_t bit = m_gen_inp_mask.get_ith_bit(m_gen_inp_ix);

		// m_o_bufr += a[bit];
		_mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), a[bit]);
		m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes());

		// m_o_bufr += a[1-bit];
		_mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), a[1-bit]);
		m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes());

		m_gen_inp_ix++;
	}
	else if (current_gate.m_tag == Circuit::EVL_INP)
	{
		__m128i a[2];

		// zero_key = m_prng.rand(Env::k());
		static Bytes tmp;

		tmp = m_prng.rand(Env::k());
		tmp.resize(16, 0);
		current_zero_key = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));

		// a[0] = (*m_ot_keys)[2*m_evl_inp_ix+0];
		tmp = (*m_ot_keys)[2*m_evl_inp_ix+0];
		tmp.resize(16, 0);
		a[0] = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));

		// a[1] = (*m_ot_keys)[2*m_evl_inp_ix+1];
		tmp = (*m_ot_keys)[2*m_evl_inp_ix+1];
		tmp.resize(16, 0);
		a[1] = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));

		// a[0] ^= zero_key; a[1] ^= zero_key ^ R;
		a[0] = _mm_xor_si128(a[0], current_zero_key);
		a[1] = _mm_xor_si128(a[1], _mm_xor_si128(current_zero_key, m_R));

		// m_o_bufr += a[0];
		_mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), a[0]);
		m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes());

		// m_o_bufr += a[1];
		_mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), a[1]);
		m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes());

		m_evl_inp_ix++;
	}
	else
	{
		const vector<uint64_t> &inputs = current_gate.m_input_idx;
		assert(inputs.size() == 1 || inputs.size() == 2);

#ifdef FREE_XOR
		if (is_xor(current_gate))
		{
			current_zero_key = inputs.size() == 2?
				_mm_xor_si128(m_w[inputs[0]], m_w[inputs[1]]) : _mm_load_si128(m_w+inputs[0]);
		}
		else
#endif
		if (inputs.size() == 2) // 2-arity gates
		{
			uint8_t bit;
			__m128i aes_key[2], aes_plaintext, aes_ciphertext;
			__m128i X[2], Y[2], Z[2];
			static Bytes tmp(16, 0);

			aes_plaintext = _mm_set1_epi64x(m_gate_ix);

			X[0] = _mm_load_si128(m_w+inputs[0]);
			Y[0] = _mm_load_si128(m_w+inputs[1]);

			X[1] = _mm_xor_si128(X[0], m_R); // X[1] = X[0] ^ R
			Y[1] = _mm_xor_si128(Y[0], m_R); // Y[1] = Y[0] ^ R

			const uint8_t perm_x = _mm_extract_epi8(X[0], 0) & 0x01; // permutation bit for X
			const uint8_t perm_y = _mm_extract_epi8(Y[0], 0) & 0x01; // permutation bit for Y
			const uint8_t de_garbled_ix = (perm_y<<1)|perm_x;

			// encrypt the 0-th entry : (X[x], Y[y])
			aes_key[0] = _mm_load_si128(X+perm_x);
			aes_key[1] = _mm_load_si128(Y+perm_y);

			KDF256((uint8_t*)&aes_plaintext, (uint8_t*)&aes_ciphertext, (uint8_t*)aes_key);
			aes_ciphertext = _mm_and_si128(aes_ciphertext, m_clear_mask); // clear extra bits so that only k bits left
			bit = current_gate.m_table[de_garbled_ix];

#ifdef GRR
			// GRR technique: using zero entry's key as one of the output keys
			_mm_store_si128(Z+bit, aes_ciphertext);
			Z[1-bit] = _mm_xor_si128(Z[bit], m_R);
			current_zero_key = _mm_load_si128(Z);
#else
			tmp = m_prng.rand(Env::k());
			tmp.resize(16, 0);
			Z[0] = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));
			Z[1] = _mm_xor_si128(Z[0], m_R);

			aes_ciphertext = _mm_xor_si128(aes_ciphertext, Z[bit]);
			_mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), aes_ciphertext);
			m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes());
#endif

			// encrypt the 1st entry : (X[1-x], Y[y])
			aes_key[0] = _mm_xor_si128(aes_key[0], m_R);

			KDF256((uint8_t*)&aes_plaintext, (uint8_t*)&aes_ciphertext, (uint8_t*)aes_key);
			aes_ciphertext = _mm_and_si128(aes_ciphertext, m_clear_mask);
			bit = current_gate.m_table[0x01^de_garbled_ix];
			aes_ciphertext = _mm_xor_si128(aes_ciphertext, Z[bit]);
			_mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), aes_ciphertext);
			m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes());

			// encrypt the 2nd entry : (X[x], Y[1-y])
			aes_key[0] = _mm_xor_si128(aes_key[0], m_R);
			aes_key[1] = _mm_xor_si128(aes_key[1], m_R);

			KDF256((uint8_t*)&aes_plaintext, (uint8_t*)&aes_ciphertext, (uint8_t*)aes_key);
			aes_ciphertext = _mm_and_si128(aes_ciphertext, m_clear_mask);
			bit = current_gate.m_table[0x02^de_garbled_ix];
			aes_ciphertext = _mm_xor_si128(aes_ciphertext, Z[bit]);
			_mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), aes_ciphertext);
			m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes());

			// encrypt the 3rd entry : (X[1-x], Y[1-y])
			aes_key[0] = _mm_xor_si128(aes_key[0], m_R);

			KDF256((uint8_t*)&aes_plaintext, (uint8_t*)&aes_ciphertext, (uint8_t*)aes_key);
			aes_ciphertext = _mm_and_si128(aes_ciphertext, m_clear_mask);
			bit = current_gate.m_table[0x03^de_garbled_ix];
			aes_ciphertext = _mm_xor_si128(aes_ciphertext, Z[bit]);
			_mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), aes_ciphertext);
			m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes());
		}
		else // 1-arity gates
		{
			uint8_t bit;
			__m128i aes_key, aes_plaintext, aes_ciphertext;
			__m128i X[2], Z[2];
			static Bytes tmp;

			tmp.assign(16, 0);

			aes_plaintext = _mm_set1_epi64x(m_gate_ix);

			X[0] = _mm_load_si128(m_w+inputs[0]);
			X[1] = _mm_xor_si128(X[0], m_R);

			const uint8_t perm_x = _mm_extract_epi8(X[0], 0) & 0x01;

			// 0-th entry : X[x]
			aes_key = _mm_load_si128(X+perm_x);
			KDF128((uint8_t*)&aes_plaintext, (uint8_t*)&aes_ciphertext, (uint8_t*)&aes_key);
			aes_ciphertext = _mm_and_si128(aes_ciphertext, m_clear_mask);
			bit = current_gate.m_table[perm_x];

#ifdef GRR
			_mm_store_si128(Z+bit, aes_ciphertext);
			Z[1-bit] = _mm_xor_si128(Z[bit], m_R);
			current_zero_key = _mm_load_si128(Z);
#else
			tmp = m_prng.rand(Env::k());
			tmp.resize(16, 0);
			Z[0] = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));
			Z[1] = _mm_xor_si128(Z[0], m_R);

			aes_ciphertext = _mm_xor_si128(aes_ciphertext, Z[bit]);
			_mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), aes_ciphertext);
			m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes());
#endif

			// 1-st entry : X[1-x]
			aes_key = _mm_xor_si128(aes_key, m_R);

			KDF128((uint8_t*)&aes_plaintext, (uint8_t*)&aes_ciphertext, (uint8_t*)&aes_key);
			aes_ciphertext = _mm_and_si128(aes_ciphertext, m_clear_mask);
			bit = current_gate.m_table[0x01^perm_x];
			aes_ciphertext = _mm_xor_si128(aes_ciphertext, Z[bit]);
			_mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), aes_ciphertext);
			m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes());
		}

		if (current_gate.m_tag == Circuit::EVL_OUT)
		{
			m_o_bufr.push_back(_mm_extract_epi8(current_zero_key, 0) & 0x01); // permutation bit
		}
		else if (current_gate.m_tag == Circuit::GEN_OUT)
		{
			m_o_bufr.push_back(_mm_extract_epi8(current_zero_key, 0) & 0x01); // permutation bit

//			// TODO: C[ix_0] = w[ix0] || randomness, C[ix_1] = w[ix1] || randomness
//			m_o_bufr += (key_pair[0] + m_prng.rand(Env::k())).hash(Env::k());
//			m_o_bufr += (key_pair[1] + m_prng.rand(Env::k())).hash(Env::k());
		}
	}

	_mm_store_si128(m_w+current_gate.m_idx, current_zero_key);

	m_gate_ix++;
}
Esempio n. 13
0
void TestRootBoard::generateCaptures() {
    QTextStream xout(stderr);
    cpu_set_t mask;
    CPU_ZERO( &mask );
    CPU_SET( 1, &mask );
    if ( sched_setaffinity( 0, sizeof(mask), &mask ) == -1 )
        qDebug() << "Could not set CPU Affinity" << endl;
    static const unsigned testCases = 200;
    static const int iter = 10000;
    typedef QVector<uint64_t> Sample;
    QVector<Sample> times(testCases, Sample(iter));
    QVector<Sample> movetimes(testCases, Sample(iter));
    QVector<Sample> captimes(testCases, Sample(iter));
    QVector<Sample> b02flood(testCases, Sample(iter));
    QVector<Sample> b02point(testCases, Sample(iter));
    QVector<Sample> b02double(testCases, Sample(iter));
    Move moveList[256];
    uint64_t sum=0;
    uint64_t movesum=0;
    uint64_t nmoves=0;
    uint64_t ncap =0;
    uint64_t a, d, tsc;
    Key blah;
    Colors color[testCases];
    double cpufreq = 3900.0;
    for (unsigned int i = testCases; i;) {
        --i;
        b->setup(testPositions[i]);
        color[i] = b->color;
        if (i) {
            b->boards[i] = b->boards[0]; }
        movetimes[i].reserve(iter*2);
        times[i].reserve(iter*2);
        captimes[i].reserve(iter*2); }
    unsigned op = 1;
    const unsigned int iter2 = 10000000;
    __v2di res = _mm_set1_epi64x(0);
    uint64_t time=0;
#ifdef NDEBUG
    for (unsigned int i = 0; i < iter2; ++i) {
        Board& bb = b->boards[i & 0xf].wb;
        tsc = readtsc();
        res = bb.build02Attack(op);
        op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build02Attack(op);
        op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build02Attack(op);
        op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build02Attack(op);
        op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build02Attack(op);
        op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build02Attack(op);
        op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build02Attack(op);
        op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build02Attack(op);
        op = _mm_cvtsi128_si64(res) & 0x3f;
        time += readtsc() - tsc;
//        op = fold(res) & 0x3f;
    }
    std::cout << "build02(pos): " << time/iter2 << " clocks" << std::endl;

    time=0;
    for (unsigned int i = 0; i < iter2; ++i) {
        Board& bb = b->boards[i & 0xf].wb;
        tsc = readtsc();
        res = bb.build13Attack(op);
        op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build13Attack(op);
        op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build13Attack(op);
        op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build13Attack(op);
        op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build13Attack(op);
        op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build13Attack(op);
        op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build13Attack(op);
        op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build13Attack(op);
        op = _mm_cvtsi128_si64(res) & 0x3f;
        time += readtsc() - tsc; }
    std::cout << "build13(pos): " << time/iter2 << " clocks" << std::endl;

//     time=0;
//     for (unsigned int i = 0; i < iter2; ++i) {
//         BoardBase& bb = b->boards[i & 0xf].wb;
//         tsc = readtsc();
//         res = bb.build02Attack(res);
//         time += readtsc() - tsc;
//     }
//     std::cout << "build02(vector): " << time/iter2 << " clocks" << std::endl;

    time=0;
    for (unsigned int i = 0; i < iter2; ++i) {
        Board& bb = b->boards[i & 0xf].wb;
        tsc = readtsc();
        res = b->boards[0].wb.build13Attack(res);
        res = b->boards[1].wb.build13Attack(res);
        res = b->boards[2].wb.build13Attack(res);
        res = b->boards[3].wb.build13Attack(res);
        res = b->boards[4].wb.build13Attack(res);
        res = b->boards[5].wb.build13Attack(res);
        res = b->boards[6].wb.build13Attack(res);
        res = b->boards[7].wb.build13Attack(res);
        time += readtsc() - tsc; }
    std::cout << "build13(vector): " << time/iter2 << " clocks" << std::endl;

    for (int j = 0; j < iter; ++j) {
        nmoves = 0;
        ncap=0;
        for (unsigned int i = 0; i < testCases; ++i) {
//                      b->setup(testPositions[i]);
            uint64_t  overhead;
            /*
             asm volatile("cpuid\n rdtsc" : "=a" (a), "=d" (d) :: "%rbx", "%rcx");
             tsc = (a + (d << 32));
             asm volatile("cpuid\n rdtsc" : "=a" (a), "=d" (d) :: "%rbx", "%rcx");
             overhead = (a + (d << 32)) - tsc;
             */
            overhead = 20;
            if (color[i] == White)
                b->boards[i].wb.buildAttacks();
            else
                b->boards[i].bb.buildAttacks();

            tsc = readtsc();
            Move* good = moveList+192;
            Move* bad = good;
            if (color[i] == White)
                b->boards[i].wb.generateCaptureMoves<AllMoves>(good, bad);
            else
                b->boards[i].bb.generateCaptureMoves<AllMoves>(good, bad);
            ncap += bad - good;
            captimes[i][j] = readtsc() - tsc - overhead;

            tsc = readtsc();
            if (color[i] == White)
                b->boards[i].wb.generateNonCap(good, bad);
            else
                b->boards[i].bb.generateNonCap(good, bad);
            nmoves += bad - good;
            times[i][j] = readtsc() - tsc - overhead;
            for (Move* k=good; k<bad; ++k) {
//                              std::cout << k->string() << std::endl;
                tsc = readtsc();
                if (color[i] == White) {
                    __v8hi est = b->boards[i].b->eval.estimate(wb, *k);
                    ColoredBoard<Black> bb(b->boards[i].wb, *k, est);
                    blah += bb.getZobrist(); }
                else {
                    __v8hi est = b->boards[i].b->eval.estimate(bb, *k);
                    ColoredBoard<White> bb(b->boards[i].bb, *k, est);
                    blah += bb.getZobrist(); }
                movetimes[i][j] += readtsc() - tsc - overhead; }
//                      std::string empty;
//                      std::cin >> empty;
        } }
    for (QVector<Sample>::Iterator i = times.begin(); i != times.end(); ++i) {
        qSort(*i);
        sum += (*i)[iter / 2]; }
    uint64_t capsum=0;
    for (QVector<Sample>::Iterator i = captimes.begin(); i != captimes.end(); ++i) {
        qSort(*i);
        capsum += (*i)[iter / 2]; }
    for (QVector<Sample>::Iterator i = movetimes.begin(); i != movetimes.end(); ++i) {
        qSort(*i);
        movesum += (*i)[iter / 2]; }

    xout << endl << nmoves << " Moves, " << sum/nmoves << " Clocks, " << cpufreq* nmoves/sum << " generated Mmoves/s, " << cpufreq* nmoves/movesum << " executed Mmoves/s" << endl;
    xout << ncap << " Captures, " << capsum/ncap << " Clocks, " << cpufreq* ncap/capsum << " generated Mmoves/s, " /*<< cpufreq*ncap/movesum << " executed Mmoves/s" */<< endl;
    xout << blah + fold(res) + op64 << endl;
#endif

}
Esempio n. 14
0
static void crypt_all (int count)
#endif
{
#if FMT_MAIN_VERSION > 10
    int count = *pcount;
#endif
    int index = 0;

#ifdef _OPENMP
#pragma omp parallel for
    for (index = 0; index < count; index += 2)
#endif
    {
        int i;

        __m128i a, b, c, d, e, f, g, h;
        __m128i w[80], tmp1, tmp2;


        for (i = 0; i < 14; i += 2) {
            GATHER (tmp1, saved_key, i);
            GATHER (tmp2, saved_key, i + 1);
            SWAP_ENDIAN (tmp1);
            SWAP_ENDIAN (tmp2);
            w[i] = tmp1;
            w[i + 1] = tmp2;
        }
        GATHER (tmp1, saved_key, 14);
        SWAP_ENDIAN (tmp1);
        w[14] = tmp1;
        GATHER (w[15], saved_key, 15);
        for (i = 16; i < 80; i++) R(i);

        a = _mm_set1_epi64x (0x6a09e667f3bcc908ULL);
        b = _mm_set1_epi64x (0xbb67ae8584caa73bULL);
        c = _mm_set1_epi64x (0x3c6ef372fe94f82bULL);
        d = _mm_set1_epi64x (0xa54ff53a5f1d36f1ULL);
        e = _mm_set1_epi64x (0x510e527fade682d1ULL);
        f = _mm_set1_epi64x (0x9b05688c2b3e6c1fULL);
        g = _mm_set1_epi64x (0x1f83d9abfb41bd6bULL);
        h = _mm_set1_epi64x (0x5be0cd19137e2179ULL);

        SHA512_STEP(a, b, c, d, e, f, g, h,  0, 0x428a2f98d728ae22ULL);
        SHA512_STEP(h, a, b, c, d, e, f, g,  1, 0x7137449123ef65cdULL);
        SHA512_STEP(g, h, a, b, c, d, e, f,  2, 0xb5c0fbcfec4d3b2fULL);
        SHA512_STEP(f, g, h, a, b, c, d, e,  3, 0xe9b5dba58189dbbcULL);
        SHA512_STEP(e, f, g, h, a, b, c, d,  4, 0x3956c25bf348b538ULL);
        SHA512_STEP(d, e, f, g, h, a, b, c,  5, 0x59f111f1b605d019ULL);
        SHA512_STEP(c, d, e, f, g, h, a, b,  6, 0x923f82a4af194f9bULL);
        SHA512_STEP(b, c, d, e, f, g, h, a,  7, 0xab1c5ed5da6d8118ULL);
        SHA512_STEP(a, b, c, d, e, f, g, h,  8, 0xd807aa98a3030242ULL);
        SHA512_STEP(h, a, b, c, d, e, f, g,  9, 0x12835b0145706fbeULL);
        SHA512_STEP(g, h, a, b, c, d, e, f, 10, 0x243185be4ee4b28cULL);
        SHA512_STEP(f, g, h, a, b, c, d, e, 11, 0x550c7dc3d5ffb4e2ULL);
        SHA512_STEP(e, f, g, h, a, b, c, d, 12, 0x72be5d74f27b896fULL);
        SHA512_STEP(d, e, f, g, h, a, b, c, 13, 0x80deb1fe3b1696b1ULL);
        SHA512_STEP(c, d, e, f, g, h, a, b, 14, 0x9bdc06a725c71235ULL);
        SHA512_STEP(b, c, d, e, f, g, h, a, 15, 0xc19bf174cf692694ULL);

        SHA512_STEP(a, b, c, d, e, f, g, h, 16, 0xe49b69c19ef14ad2ULL);
        SHA512_STEP(h, a, b, c, d, e, f, g, 17, 0xefbe4786384f25e3ULL);
        SHA512_STEP(g, h, a, b, c, d, e, f, 18, 0x0fc19dc68b8cd5b5ULL);
        SHA512_STEP(f, g, h, a, b, c, d, e, 19, 0x240ca1cc77ac9c65ULL);
        SHA512_STEP(e, f, g, h, a, b, c, d, 20, 0x2de92c6f592b0275ULL);
        SHA512_STEP(d, e, f, g, h, a, b, c, 21, 0x4a7484aa6ea6e483ULL);
        SHA512_STEP(c, d, e, f, g, h, a, b, 22, 0x5cb0a9dcbd41fbd4ULL);
        SHA512_STEP(b, c, d, e, f, g, h, a, 23, 0x76f988da831153b5ULL);
        SHA512_STEP(a, b, c, d, e, f, g, h, 24, 0x983e5152ee66dfabULL);
        SHA512_STEP(h, a, b, c, d, e, f, g, 25, 0xa831c66d2db43210ULL);
        SHA512_STEP(g, h, a, b, c, d, e, f, 26, 0xb00327c898fb213fULL);
        SHA512_STEP(f, g, h, a, b, c, d, e, 27, 0xbf597fc7beef0ee4ULL);
        SHA512_STEP(e, f, g, h, a, b, c, d, 28, 0xc6e00bf33da88fc2ULL);
        SHA512_STEP(d, e, f, g, h, a, b, c, 29, 0xd5a79147930aa725ULL);
        SHA512_STEP(c, d, e, f, g, h, a, b, 30, 0x06ca6351e003826fULL);
        SHA512_STEP(b, c, d, e, f, g, h, a, 31, 0x142929670a0e6e70ULL);

        SHA512_STEP(a, b, c, d, e, f, g, h, 32, 0x27b70a8546d22ffcULL);
        SHA512_STEP(h, a, b, c, d, e, f, g, 33, 0x2e1b21385c26c926ULL);
        SHA512_STEP(g, h, a, b, c, d, e, f, 34, 0x4d2c6dfc5ac42aedULL);
        SHA512_STEP(f, g, h, a, b, c, d, e, 35, 0x53380d139d95b3dfULL);
        SHA512_STEP(e, f, g, h, a, b, c, d, 36, 0x650a73548baf63deULL);
        SHA512_STEP(d, e, f, g, h, a, b, c, 37, 0x766a0abb3c77b2a8ULL);
        SHA512_STEP(c, d, e, f, g, h, a, b, 38, 0x81c2c92e47edaee6ULL);
        SHA512_STEP(b, c, d, e, f, g, h, a, 39, 0x92722c851482353bULL);
        SHA512_STEP(a, b, c, d, e, f, g, h, 40, 0xa2bfe8a14cf10364ULL);
        SHA512_STEP(h, a, b, c, d, e, f, g, 41, 0xa81a664bbc423001ULL);
        SHA512_STEP(g, h, a, b, c, d, e, f, 42, 0xc24b8b70d0f89791ULL);
        SHA512_STEP(f, g, h, a, b, c, d, e, 43, 0xc76c51a30654be30ULL);
        SHA512_STEP(e, f, g, h, a, b, c, d, 44, 0xd192e819d6ef5218ULL);
        SHA512_STEP(d, e, f, g, h, a, b, c, 45, 0xd69906245565a910ULL);
        SHA512_STEP(c, d, e, f, g, h, a, b, 46, 0xf40e35855771202aULL);
        SHA512_STEP(b, c, d, e, f, g, h, a, 47, 0x106aa07032bbd1b8ULL);

        SHA512_STEP(a, b, c, d, e, f, g, h, 48, 0x19a4c116b8d2d0c8ULL);
        SHA512_STEP(h, a, b, c, d, e, f, g, 49, 0x1e376c085141ab53ULL);
        SHA512_STEP(g, h, a, b, c, d, e, f, 50, 0x2748774cdf8eeb99ULL);
        SHA512_STEP(f, g, h, a, b, c, d, e, 51, 0x34b0bcb5e19b48a8ULL);
        SHA512_STEP(e, f, g, h, a, b, c, d, 52, 0x391c0cb3c5c95a63ULL);
        SHA512_STEP(d, e, f, g, h, a, b, c, 53, 0x4ed8aa4ae3418acbULL);
        SHA512_STEP(c, d, e, f, g, h, a, b, 54, 0x5b9cca4f7763e373ULL);
        SHA512_STEP(b, c, d, e, f, g, h, a, 55, 0x682e6ff3d6b2b8a3ULL);
        SHA512_STEP(a, b, c, d, e, f, g, h, 56, 0x748f82ee5defb2fcULL);
        SHA512_STEP(h, a, b, c, d, e, f, g, 57, 0x78a5636f43172f60ULL);
        SHA512_STEP(g, h, a, b, c, d, e, f, 58, 0x84c87814a1f0ab72ULL);
        SHA512_STEP(f, g, h, a, b, c, d, e, 59, 0x8cc702081a6439ecULL);
        SHA512_STEP(e, f, g, h, a, b, c, d, 60, 0x90befffa23631e28ULL);
        SHA512_STEP(d, e, f, g, h, a, b, c, 61, 0xa4506cebde82bde9ULL);
        SHA512_STEP(c, d, e, f, g, h, a, b, 62, 0xbef9a3f7b2c67915ULL);
        SHA512_STEP(b, c, d, e, f, g, h, a, 63, 0xc67178f2e372532bULL);

        SHA512_STEP(a, b, c, d, e, f, g, h, 64, 0xca273eceea26619cULL);
        SHA512_STEP(h, a, b, c, d, e, f, g, 65, 0xd186b8c721c0c207ULL);
        SHA512_STEP(g, h, a, b, c, d, e, f, 66, 0xeada7dd6cde0eb1eULL);
        SHA512_STEP(f, g, h, a, b, c, d, e, 67, 0xf57d4f7fee6ed178ULL);
        SHA512_STEP(e, f, g, h, a, b, c, d, 68, 0x06f067aa72176fbaULL);
        SHA512_STEP(d, e, f, g, h, a, b, c, 69, 0x0a637dc5a2c898a6ULL);
        SHA512_STEP(c, d, e, f, g, h, a, b, 70, 0x113f9804bef90daeULL);
        SHA512_STEP(b, c, d, e, f, g, h, a, 71, 0x1b710b35131c471bULL);
        SHA512_STEP(a, b, c, d, e, f, g, h, 72, 0x28db77f523047d84ULL);
        SHA512_STEP(h, a, b, c, d, e, f, g, 73, 0x32caab7b40c72493ULL);
        SHA512_STEP(g, h, a, b, c, d, e, f, 74, 0x3c9ebe0a15c9bebcULL);
        SHA512_STEP(f, g, h, a, b, c, d, e, 75, 0x431d67c49c100d4cULL);
        SHA512_STEP(e, f, g, h, a, b, c, d, 76, 0x4cc5d4becb3e42b6ULL);
        SHA512_STEP(d, e, f, g, h, a, b, c, 77, 0x597f299cfc657e2aULL);
        SHA512_STEP(c, d, e, f, g, h, a, b, 78, 0x5fcb6fab3ad6faecULL);
        SHA512_STEP(b, c, d, e, f, g, h, a, 79, 0x6c44198c4a475817ULL);

        _mm_store_si128 ((__m128i *) &crypt_key[0][index], a);
        _mm_store_si128 ((__m128i *) &crypt_key[1][index], b);
        _mm_store_si128 ((__m128i *) &crypt_key[2][index], c);
        _mm_store_si128 ((__m128i *) &crypt_key[3][index], d);
        _mm_store_si128 ((__m128i *) &crypt_key[4][index], e);
        _mm_store_si128 ((__m128i *) &crypt_key[5][index], f);
        _mm_store_si128 ((__m128i *) &crypt_key[6][index], g);
        _mm_store_si128 ((__m128i *) &crypt_key[7][index], h);
    }

#if FMT_MAIN_VERSION > 10
    return count;
#endif
}
#ifdef PARASAIL_ROWCOL
    parasail_result_t *result = parasail_result_new_rowcol3(s1Len, s2Len);
#else
    parasail_result_t *result = parasail_result_new();
#endif
#endif
    int32_t i = 0;
    int32_t j = 0;
    int32_t end_query = 0;
    int32_t end_ref = 0;
    int64_t score = NEG_INF;
    int64_t matches = NEG_INF;
    int64_t similar = NEG_INF;
    int64_t length = NEG_INF;
    
    __m128i vNegInf = _mm_set1_epi64x(NEG_INF);
    __m128i vOpen = _mm_set1_epi64x(open);
    __m128i vGap  = _mm_set1_epi64x(gap);
    __m128i vZero = _mm_set1_epi64x(0);
    __m128i vOne = _mm_set1_epi64x(1);
    __m128i vN = _mm_set1_epi64x(N);
    __m128i vGapN = _mm_set1_epi64x(gap*N);
    __m128i vNegOne = _mm_set1_epi64x(-1);
    __m128i vI = _mm_set_epi64x(0,1);
    __m128i vJreset = _mm_set_epi64x(0,-1);
    __m128i vMaxScore = vNegInf;
    __m128i vMaxMatch = vNegInf;
    __m128i vMaxSimilar = vNegInf;
    __m128i vMaxLength = vNegInf;
    __m128i vILimit = _mm_set1_epi64x(s1Len);
    __m128i vILimit1 = _mm_sub_epi64(vILimit, vOne);
 constants()
     : shuf128(_mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8,
                            4, 0)),
       const2020(_mm_set1_epi64x(0x0000000200000000)),
       constFFFF(_mm_set1_epi32(0x0F)) {
 }