/**
 * Processes two doubles at a time.
 */
int _mandelbrot_2( double const * const c_re_arg,
                   double const * const c_im_arg,
                   int max_iter )
{
    __m128d z_re = _mm_load_pd(c_re_arg);
    __m128d z_im = _mm_load_pd(c_im_arg);
    __m128d y_re;
    __m128d y_im;
    __m128d c_re = z_re;
    __m128d c_im = z_im;

    __m128i count = _mm_set1_epi64x(0);

    __m128d md;
    __m128d mt;
    __m128i mi  = _mm_set1_epi16(0xffff);
    __m128d two = _mm_set1_pd(2.0);
    __m128i one = _mm_set1_epi64x(1);

    for (int i = 0; i < max_iter; i += 1) {
        // y = z .* z;
        y_re = _mm_mul_pd(z_re, z_re);
        y_im = _mm_mul_pd(z_im, z_im);

        // y = z * z;
        y_re = _mm_sub_pd(y_re, y_im);
        y_im = _mm_mul_pd(z_re, z_im);
        y_im = _mm_add_pd(y_im, y_im);

        // z = z * z + c
        z_re = _mm_add_pd(y_re, c_re);
        z_im = _mm_add_pd(y_im, c_im);

        // if condition
        // md = _mm_add_pd(z_re, z_im);
        // md = _mm_cmplt_pd(md, four);
        md = _mm_cmplt_pd(z_re, two);
        mt = _mm_cmplt_pd(z_im, two);
        md = _mm_and_pd(md, mt);
        mi = _mm_and_si128(mi, _mm_castpd_si128(md));
        // PRINT_M128I(mi);
        if ( !_mm_movemask_pd(md) ) { break; }

        // count iterations
        count = _mm_add_epi64( count, _mm_and_si128(mi, one) );
    }

    int val;
    count = _mm_add_epi64( _mm_srli_si128(count, 8), count );
    val   = _mm_cvtsi128_si64( count );

    return val;
}
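/*
 * Minimal usage sketch (an assumption, not part of the original source):
 * both input arrays must be 16-byte aligned for _mm_load_pd, and the
 * return value is the sum of the iteration counts of the two lanes.
 */
#include <emmintrin.h>
#include <stdio.h>

int main(void)
{
    __attribute__((aligned(16))) double re[2] = { -0.5, 0.3 };
    __attribute__((aligned(16))) double im[2] = {  0.0, 0.6 };

    int total = _mandelbrot_2(re, im, 256);  // iterations of both points, summed
    printf("summed iteration count: %d\n", total);
    return 0;
}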
void left_shift_w2w_sse2(const void *src, void *dst, unsigned shift, unsigned left, unsigned right)
{
    const uint16_t *src_p = static_cast<const uint16_t *>(src);
    uint16_t *dst_p = static_cast<uint16_t *>(dst);

    unsigned vec_left = ceil_n(left, 8);
    unsigned vec_right = floor_n(right, 8);

    __m128i count = _mm_set1_epi64x(shift);

    if (left != vec_left) {
        __m128i x = _mm_load_si128((const __m128i *)(src_p + vec_left - 8));
        x = _mm_sll_epi16(x, count);
        mm_store_left_epi16(dst_p + vec_left - 8, x, vec_left - left);
    }

    for (unsigned j = vec_left; j < vec_right; j += 8) {
        __m128i x = _mm_load_si128((const __m128i *)(src_p + j));
        x = _mm_sll_epi16(x, count);
        _mm_store_si128((__m128i *)(dst_p + j), x);
    }

    if (right != vec_right) {
        __m128i x = _mm_load_si128((const __m128i *)(src_p + vec_right));
        x = _mm_sll_epi16(x, count);
        mm_store_right_epi16(dst_p + vec_right, x, right - vec_right);
    }
}
void left_shift_b2b_sse2(const void *src, void *dst, unsigned shift, unsigned left, unsigned right)
{
    const uint8_t *src_p = static_cast<const uint8_t *>(src);
    uint8_t *dst_p = static_cast<uint8_t *>(dst);

    unsigned vec_left = ceil_n(left, 16);
    unsigned vec_right = floor_n(right, 16);

    __m128i count = _mm_set1_epi64x(shift);

    if (left != vec_left) {
        __m128i x = _mm_load_si128((const __m128i *)(src_p + vec_left - 16));
        x = mm_sll_epi8(x, count);
        mm_store_idxhi_epi8((__m128i *)(dst_p + vec_left - 16), x, left % 16);
    }

    for (unsigned j = vec_left; j < vec_right; j += 16) {
        __m128i x = _mm_load_si128((const __m128i *)(src_p + j));
        x = mm_sll_epi8(x, count);
        _mm_store_si128((__m128i *)(dst_p + j), x);
    }

    if (right != vec_right) {
        __m128i x = _mm_load_si128((const __m128i *)(src_p + vec_right));
        x = mm_sll_epi8(x, count);
        mm_store_idxlo_epi8((__m128i *)(dst_p + vec_right), x, right % 16);
    }
}
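/*
 * The shift routines in this listing lean on round-to-multiple helpers that
 * are not shown. A plausible definition consistent with how ceil_n and
 * floor_n are called above (an assumption about the library internals):
 */
static inline unsigned floor_n(unsigned x, unsigned n)
{
    return x - x % n;               // round down to a multiple of n
}

static inline unsigned ceil_n(unsigned x, unsigned n)
{
    return floor_n(x + n - 1, n);   // round up to a multiple of n
}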
void GarbledCct3::gen_next_gen_inp_com(const Bytes &row, size_t kx)
{
    static Bytes tmp;

    __m128i out_key[2];

    tmp = m_prng.rand(Env::k());
    tmp.set_ith_bit(0, 0);
    tmp.resize(16, 0);
    out_key[0] = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));
    out_key[1] = _mm_xor_si128(out_key[0], m_R);

    Bytes msg(m_gen_inp_decom[0].size());
    for (size_t jx = 0; jx < Env::circuit().gen_inp_cnt(); jx++) {
        if (row.get_ith_bit(jx)) {
            byte bit = m_gen_inp_mask.get_ith_bit(jx);
            msg ^= m_gen_inp_decom[2*jx+bit];
            //msg ^= m_gen_inp_decom[2*jx];
        }
    }

    __m128i in_key[2], aes_plaintext, aes_ciphertext;

    aes_plaintext = _mm_set1_epi64x((uint64_t)kx);

    tmp.assign(msg.begin(), msg.begin()+Env::key_size_in_bytes());
    tmp.resize(16, 0);
    in_key[0] = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));
    in_key[1] = _mm_xor_si128(in_key[0], m_R);

    KDF128((uint8_t*)&aes_plaintext, (uint8_t*)&aes_ciphertext, (uint8_t*)&in_key[0]);
    aes_ciphertext = _mm_and_si128(aes_ciphertext, m_clear_mask);
    out_key[0] = _mm_xor_si128(out_key[0], aes_ciphertext);

    KDF128((uint8_t*)&aes_plaintext, (uint8_t*)&aes_ciphertext, (uint8_t*)&in_key[1]);
    aes_ciphertext = _mm_and_si128(aes_ciphertext, m_clear_mask);
    out_key[1] = _mm_xor_si128(out_key[1], aes_ciphertext);

    const byte bit = msg.get_ith_bit(0);

    tmp.resize(16);

    _mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), out_key[bit]);
    m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes());

    _mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), out_key[1-bit]);
    m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes());

    //	tmp.resize(16);
    //	_mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), out_key[0]);
    //	std::cout << "GEN " << m_gen_inp_hash_ix << " : (" << tmp.to_hex();
    //
    //	tmp.resize(16);
    //	_mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), out_key[1]);
    //	std::cout << ", " << tmp.to_hex() << ")" << std::endl;

    m_gen_inp_hash_ix++;
}
void left_shift_b2w_sse2(const void *src, void *dst, unsigned shift, unsigned left, unsigned right)
{
    const uint8_t *src_p = static_cast<const uint8_t *>(src);
    uint16_t *dst_p = static_cast<uint16_t *>(dst);

    unsigned vec_left = ceil_n(left, 16);
    unsigned vec_right = floor_n(right, 16);

    __m128i count = _mm_set1_epi64x(shift);

    if (left != vec_left) {
        __m128i x = _mm_load_si128((const __m128i *)(src_p + vec_left - 16));
        __m128i lo = _mm_unpacklo_epi8(x, _mm_setzero_si128());
        __m128i hi = _mm_unpackhi_epi8(x, _mm_setzero_si128());

        lo = _mm_sll_epi16(lo, count);
        hi = _mm_sll_epi16(hi, count);

        if (vec_left - left > 8) {
            mm_store_left_epi16(dst_p + vec_left - 16, lo, (vec_left - left) % 8);
            _mm_store_si128((__m128i *)(dst_p + vec_left - 8), hi);
        } else {
            mm_store_left_epi16(dst_p + vec_left - 8, hi, vec_left - left);
        }
    }

    for (unsigned j = vec_left; j < vec_right; j += 16) {
        __m128i x = _mm_load_si128((const __m128i *)(src_p + j));
        __m128i lo = _mm_unpacklo_epi8(x, _mm_setzero_si128());
        __m128i hi = _mm_unpackhi_epi8(x, _mm_setzero_si128());

        lo = _mm_sll_epi16(lo, count);
        hi = _mm_sll_epi16(hi, count);

        _mm_store_si128((__m128i *)(dst_p + j + 0), lo);
        _mm_store_si128((__m128i *)(dst_p + j + 8), hi);
    }

    if (right != vec_right) {
        __m128i x = _mm_load_si128((const __m128i *)(src_p + vec_right));
        __m128i lo = _mm_unpacklo_epi8(x, _mm_setzero_si128());
        __m128i hi = _mm_unpackhi_epi8(x, _mm_setzero_si128());

        lo = _mm_sll_epi16(lo, count);
        hi = _mm_sll_epi16(hi, count);

        if (right - vec_right > 8) {
            _mm_store_si128((__m128i *)(dst_p + vec_right), lo);
            mm_store_right_epi16(dst_p + vec_right + 8, hi, (right - vec_right) % 8);
        } else {
            mm_store_right_epi16(dst_p + vec_right, lo, right - vec_right);
        }
    }
}
void GarbledCct3::evl_next_gen_inp_com(const Bytes &row, size_t kx)
{
    Bytes out(m_gen_inp_decom[0].size());

    for (size_t jx = 0; jx < Env::circuit().gen_inp_cnt(); jx++) {
        if (row.get_ith_bit(jx)) {
            out ^= m_gen_inp_decom[jx];
        }
    }

    byte bit = out.get_ith_bit(0);

    static Bytes tmp;

    Bytes::iterator it = m_i_bufr_ix + bit*Env::key_size_in_bytes();

    __m128i aes_key, aes_plaintext, aes_ciphertext, out_key;

    tmp.assign(out.begin(), out.begin()+Env::key_size_in_bytes());
    tmp.resize(16, 0);
    aes_key = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));

    aes_plaintext = _mm_set1_epi64x((uint64_t)kx);

    KDF128((uint8_t*)&aes_plaintext, (uint8_t*)&aes_ciphertext, (uint8_t*)&aes_key);
    aes_ciphertext = _mm_and_si128(aes_ciphertext, m_clear_mask);

    tmp.assign(it, it+Env::key_size_in_bytes());
    tmp.resize(16, 0);
    out_key = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));
    out_key = _mm_xor_si128(out_key, aes_ciphertext);

    bit = _mm_extract_epi8(out_key, 0) & 0x01;
    m_gen_inp_hash.set_ith_bit(kx, bit);

    m_i_bufr_ix += 2*Env::key_size_in_bytes();

    //	tmp.resize(16);
    //	_mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), out_key);
    //	std::cout << "EVL " << m_gen_inp_hash_ix << " : " << tmp.to_hex() << std::endl;

    m_gen_inp_hash_ix++;
}
void left_shift_w2b_sse2(const void *src, void *dst, unsigned shift, unsigned left, unsigned right)
{
    const uint16_t *src_p = static_cast<const uint16_t *>(src);
    uint8_t *dst_p = static_cast<uint8_t *>(dst);

    unsigned vec_left = ceil_n(left, 16);
    unsigned vec_right = floor_n(right, 16);

    __m128i count = _mm_set1_epi64x(shift);

    if (left != vec_left) {
        __m128i lo = _mm_load_si128((const __m128i *)(src_p + vec_left - 16));
        __m128i hi = _mm_load_si128((const __m128i *)(src_p + vec_left - 8));

        lo = _mm_sll_epi16(lo, count);
        hi = _mm_sll_epi16(hi, count);
        lo = _mm_packus_epi16(lo, hi);

        mm_store_left_epi8(dst_p + vec_left - 16, lo, vec_left - left);
    }

    for (unsigned j = vec_left; j < vec_right; j += 16) {
        __m128i lo = _mm_load_si128((const __m128i *)(src_p + j + 0));
        __m128i hi = _mm_load_si128((const __m128i *)(src_p + j + 8));

        lo = _mm_sll_epi16(lo, count);
        hi = _mm_sll_epi16(hi, count);
        lo = _mm_packus_epi16(lo, hi);

        _mm_store_si128((__m128i *)(dst_p + j), lo);
    }

    if (right != vec_right) {
        __m128i lo = _mm_load_si128((const __m128i *)(src_p + vec_right + 0));
        __m128i hi = _mm_load_si128((const __m128i *)(src_p + vec_right + 8));

        lo = _mm_sll_epi16(lo, count);
        hi = _mm_sll_epi16(hi, count);
        lo = _mm_packus_epi16(lo, hi);

        mm_store_right_epi8(dst_p + vec_right, lo, right - vec_right);
    }
}
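/*
 * The masked edge stores (mm_store_left_epi16, mm_store_right_epi16, and
 * their epi8/idx variants) are likewise not shown. A plausible scalar
 * sketch of their contract, inferred from the call sites above (an
 * assumption, not the library's actual code): "left" stores keep the last
 * n lanes so the row's left edge is untouched, "right" stores keep the
 * first n lanes.
 */
static inline void mm_store_left_epi16(uint16_t *dst, __m128i x, unsigned n)
{
    uint16_t tmp[8];
    _mm_storeu_si128((__m128i *)tmp, x);
    for (unsigned i = 8 - n; i < 8; ++i)  // last n lanes only
        dst[i] = tmp[i];
}

static inline void mm_store_right_epi16(uint16_t *dst, __m128i x, unsigned n)
{
    uint16_t tmp[8];
    _mm_storeu_si128((__m128i *)tmp, x);
    for (unsigned i = 0; i < n; ++i)      // first n lanes only
        dst[i] = tmp[i];
}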
/*****************************************************************************
 * This function utilises 3 properties of the cost function lookup tables,
 * constructed using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in
 * vp9_encoder.c.
 * For the joint cost:
 *   - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3]
 * For the component costs:
 *   - For all i: mvsadcost[0][i] == mvsadcost[1][i]
 *     (Equal costs for both components)
 *   - For all i: mvsadcost[0][i] == mvsadcost[0][-i]
 *     (Cost function is even)
 * If these do not hold, then this function cannot be used without
 * modification, in which case you can revert to using the C implementation,
 * which does not rely on these properties.
 *****************************************************************************/
int vp9_diamond_search_sad_avx(const MACROBLOCK *x,
                               const search_site_config *cfg,
                               MV *ref_mv, MV *best_mv, int search_param,
                               int sad_per_bit, int *num00,
                               const vp9_variance_fn_ptr_t *fn_ptr,
                               const MV *center_mv) {
  const int_mv maxmv = pack_int_mv(x->mv_row_max, x->mv_col_max);
  const __m128i v_max_mv_w = _mm_set1_epi32(maxmv.as_int);
  const int_mv minmv = pack_int_mv(x->mv_row_min, x->mv_col_min);
  const __m128i v_min_mv_w = _mm_set1_epi32(minmv.as_int);

  const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit);

  const __m128i v_joint_cost_0_d = _mm_set1_epi32(x->nmvjointsadcost[0]);
  const __m128i v_joint_cost_1_d = _mm_set1_epi32(x->nmvjointsadcost[1]);

  // search_param determines the length of the initial step and hence the
  // number of iterations.
  // 0 = initial step (MAX_FIRST_STEP) pel
  // 1 = (MAX_FIRST_STEP/2) pel,
  // 2 = (MAX_FIRST_STEP/4) pel...
  const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param];
  const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param];
  const int tot_steps = cfg->total_steps - search_param;

  const int_mv fcenter_mv = pack_int_mv(center_mv->row >> 3,
                                        center_mv->col >> 3);
  const __m128i vfcmv = _mm_set1_epi32(fcenter_mv.as_int);

  const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row);
  const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col);

  int_mv bmv = pack_int_mv(ref_row, ref_col);
  int_mv new_bmv = bmv;
  __m128i v_bmv_w = _mm_set1_epi32(bmv.as_int);

  const int what_stride = x->plane[0].src.stride;
  const int in_what_stride = x->e_mbd.plane[0].pre[0].stride;
  const uint8_t *const what = x->plane[0].src.buf;
  const uint8_t *const in_what = x->e_mbd.plane[0].pre[0].buf +
                                 ref_row * in_what_stride + ref_col;

  // Work out the start point for the search
  const uint8_t *best_address = in_what;
  const uint8_t *new_best_address = best_address;
#if ARCH_X86_64
  __m128i v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
#else
  __m128i v_ba_d = _mm_set1_epi32((intptr_t)best_address);
#endif

  unsigned int best_sad;
  int i;
  int j;
  int step;

  // Check the prerequisite cost function properties that are easy to check
  // in an assert. See the function-level documentation for details on all
  // prerequisites.
  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]);
  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]);

  // Check the starting position
  best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride);
  best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit);

  *num00 = 0;

  for (i = 0, step = 0; step < tot_steps; step++) {
    for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) {
      __m128i v_sad_d;
      __m128i v_cost_d;
      __m128i v_outside_d;
      __m128i v_inside_d;
      __m128i v_diff_mv_w;
#if ARCH_X86_64
      __m128i v_blocka[2];
#else
      __m128i v_blocka[1];
#endif

      // Compute the candidate motion vectors
      const __m128i v_ss_mv_w = _mm_loadu_si128((const __m128i*)&ss_mv[i]);
      const __m128i v_these_mv_w = _mm_add_epi16(v_bmv_w, v_ss_mv_w);
      // Clamp them to the search bounds
      __m128i v_these_mv_clamp_w = v_these_mv_w;
      v_these_mv_clamp_w = _mm_min_epi16(v_these_mv_clamp_w, v_max_mv_w);
      v_these_mv_clamp_w = _mm_max_epi16(v_these_mv_clamp_w, v_min_mv_w);

      // The ones that did not change are inside the search area
      v_inside_d = _mm_cmpeq_epi32(v_these_mv_clamp_w, v_these_mv_w);

      // If none of them are inside, then move on
      if (__likely__(_mm_test_all_zeros(v_inside_d, v_inside_d))) {
        continue;
      }

      // The inverse mask indicates which of the MVs are outside
      v_outside_d = _mm_xor_si128(v_inside_d, _mm_set1_epi8(0xff));
      // Shift right to keep the sign bit clear, we will use this later
      // to set the cost to the maximum value.
      v_outside_d = _mm_srli_epi32(v_outside_d, 1);

      // Compute the difference MV
      v_diff_mv_w = _mm_sub_epi16(v_these_mv_clamp_w, vfcmv);
      // We utilise the fact that the cost function is even, and use the
      // absolute difference. This allows us to use unsigned indexes later
      // and reduces cache pressure somewhat as only a half of the table
      // is ever referenced.
      v_diff_mv_w = _mm_abs_epi16(v_diff_mv_w);

      // Compute the SIMD pointer offsets.
      {
#if ARCH_X86_64  // sizeof(intptr_t) == 8
        // Load the offsets
        __m128i v_bo10_q = _mm_loadu_si128((const __m128i*)&ss_os[i+0]);
        __m128i v_bo32_q = _mm_loadu_si128((const __m128i*)&ss_os[i+2]);
        // Set the ones falling outside to zero
        v_bo10_q = _mm_and_si128(v_bo10_q, _mm_cvtepi32_epi64(v_inside_d));
        v_bo32_q = _mm_and_si128(v_bo32_q,
                                 _mm_unpackhi_epi32(v_inside_d, v_inside_d));
        // Compute the candidate addresses
        v_blocka[0] = _mm_add_epi64(v_ba_q, v_bo10_q);
        v_blocka[1] = _mm_add_epi64(v_ba_q, v_bo32_q);
#else  // ARCH_X86, sizeof(intptr_t) == 4
        __m128i v_bo_d = _mm_loadu_si128((const __m128i*)&ss_os[i]);
        v_bo_d = _mm_and_si128(v_bo_d, v_inside_d);
        v_blocka[0] = _mm_add_epi32(v_ba_d, v_bo_d);
#endif
      }

      fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0],
                     in_what_stride, (uint32_t*)&v_sad_d);

      // Look up the component cost of the residual motion vector
      {
        const int32_t row0 = _mm_extract_epi16(v_diff_mv_w, 0);
        const int32_t col0 = _mm_extract_epi16(v_diff_mv_w, 1);
        const int32_t row1 = _mm_extract_epi16(v_diff_mv_w, 2);
        const int32_t col1 = _mm_extract_epi16(v_diff_mv_w, 3);
        const int32_t row2 = _mm_extract_epi16(v_diff_mv_w, 4);
        const int32_t col2 = _mm_extract_epi16(v_diff_mv_w, 5);
        const int32_t row3 = _mm_extract_epi16(v_diff_mv_w, 6);
        const int32_t col3 = _mm_extract_epi16(v_diff_mv_w, 7);

        // Note: This is a use case for vpgather in AVX2
        const uint32_t cost0 = x->nmvsadcost[0][row0] + x->nmvsadcost[0][col0];
        const uint32_t cost1 = x->nmvsadcost[0][row1] + x->nmvsadcost[0][col1];
        const uint32_t cost2 = x->nmvsadcost[0][row2] + x->nmvsadcost[0][col2];
        const uint32_t cost3 = x->nmvsadcost[0][row3] + x->nmvsadcost[0][col3];

        __m128i v_cost_10_d, v_cost_32_d;

        v_cost_10_d = _mm_cvtsi32_si128(cost0);
        v_cost_10_d = _mm_insert_epi32(v_cost_10_d, cost1, 1);

        v_cost_32_d = _mm_cvtsi32_si128(cost2);
        v_cost_32_d = _mm_insert_epi32(v_cost_32_d, cost3, 1);

        v_cost_d = _mm_unpacklo_epi64(v_cost_10_d, v_cost_32_d);
      }

      // Now add in the joint cost
      {
        const __m128i v_sel_d = _mm_cmpeq_epi32(v_diff_mv_w,
                                                _mm_setzero_si128());
        const __m128i v_joint_cost_d = _mm_blendv_epi8(v_joint_cost_1_d,
                                                       v_joint_cost_0_d,
                                                       v_sel_d);
        v_cost_d = _mm_add_epi32(v_cost_d, v_joint_cost_d);
      }

      // Multiply by sad_per_bit
      v_cost_d = _mm_mullo_epi32(v_cost_d, v_spb_d);

      // ROUND_POWER_OF_TWO(v_cost_d, 8)
      v_cost_d = _mm_add_epi32(v_cost_d, _mm_set1_epi32(0x80));
      v_cost_d = _mm_srai_epi32(v_cost_d, 8);

      // Add the cost to the sad
      v_sad_d = _mm_add_epi32(v_sad_d, v_cost_d);

      // Make the motion vectors outside the search area have max cost
      // by or'ing in the comparison mask, this way the minimum search won't
      // pick them.
      v_sad_d = _mm_or_si128(v_sad_d, v_outside_d);

      // Find the minimum value and index horizontally in v_sad_d
      {
        // Try speculatively on 16 bits, so we can use the minpos intrinsic
        const __m128i v_sad_w = _mm_packus_epi32(v_sad_d, v_sad_d);
        const __m128i v_minp_w = _mm_minpos_epu16(v_sad_w);

        uint32_t local_best_sad = _mm_extract_epi16(v_minp_w, 0);
        uint32_t local_best_idx = _mm_extract_epi16(v_minp_w, 1);

        // If the local best value is not saturated, just use it, otherwise
        // find the horizontal minimum again the hard way on 32 bits.
        // This is executed rarely.
        if (__unlikely__(local_best_sad == 0xffff)) {
          __m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d;

          v_loval_d = v_sad_d;
          v_loidx_d = _mm_set_epi32(3, 2, 1, 0);
          v_hival_d = _mm_srli_si128(v_loval_d, 8);
          v_hiidx_d = _mm_srli_si128(v_loidx_d, 8);

          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);

          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);
          v_hival_d = _mm_srli_si128(v_loval_d, 4);
          v_hiidx_d = _mm_srli_si128(v_loidx_d, 4);

          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);

          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);

          local_best_sad = _mm_extract_epi32(v_loval_d, 0);
          local_best_idx = _mm_extract_epi32(v_loidx_d, 0);
        }

        // Update the global minimum if the local minimum is smaller
        if (__likely__(local_best_sad < best_sad)) {
          new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx];
          new_best_address = ((const uint8_t **)v_blocka)[local_best_idx];

          best_sad = local_best_sad;
        }
      }
    }

    bmv = new_bmv;
    best_address = new_best_address;

    v_bmv_w = _mm_set1_epi32(bmv.as_int);
#if ARCH_X86_64
    v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
#else
    v_ba_d = _mm_set1_epi32((intptr_t)best_address);
#endif

    if (__unlikely__(best_address == in_what)) {
      (*num00)++;
    }
  }

  *best_mv = bmv.as_mv;
  return best_sad;
}
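/*
 * Standalone illustration (not from libvpx) of the speculative 16-bit
 * minimum used above: _mm_packus_epi32 saturates 32-bit SADs to 0xffff,
 * _mm_minpos_epu16 returns {min, index} in one instruction, and only a
 * saturated minimum forces the slower 32-bit fallback.
 */
#include <smmintrin.h>
#include <stdio.h>

int main(void) {
  const __m128i v_sad_d = _mm_set_epi32(70000, 90000, 300, 512);
  const __m128i v_sad_w = _mm_packus_epi32(v_sad_d, v_sad_d);
  const __m128i v_minp_w = _mm_minpos_epu16(v_sad_w);
  const int best = _mm_extract_epi16(v_minp_w, 0);
  const int idx = _mm_extract_epi16(v_minp_w, 1);
  printf("min=%d idx=%d\n", best, idx);  // prints: min=300 idx=1
  return 0;
}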
static inline int blake512_compress( state * state, const u8 * datablock )
{
    __m128i row1l;
    __m128i row2l;
    __m128i row3l;
    __m128i row4l;
    u64 row1hl, row1hh;
    u64 row2hl, row2hh;
    u64 row3hl, row3hh;
    u64 row4hl, row4hh;

    const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
    const __m128i u8to64 = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);

    union {
        __m128i u128[8];
        u64 u64[16];
    } m;

    __m128i t0, t1, t2, t3, t4, t5, t6, t7;
    u64 u0, u1, u2, u3;
    __m128i b0;
    u64 b1l, b1h;

    m.u128[0] = _mm_loadu_si128((__m128i*)(datablock +   0));
    m.u128[1] = _mm_loadu_si128((__m128i*)(datablock +  16));
    m.u128[2] = _mm_loadu_si128((__m128i*)(datablock +  32));
    m.u128[3] = _mm_loadu_si128((__m128i*)(datablock +  48));
    m.u128[4] = _mm_loadu_si128((__m128i*)(datablock +  64));
    m.u128[5] = _mm_loadu_si128((__m128i*)(datablock +  80));
    m.u128[6] = _mm_loadu_si128((__m128i*)(datablock +  96));
    m.u128[7] = _mm_loadu_si128((__m128i*)(datablock + 112));

    m.u128[0] = BSWAP64(m.u128[0]);
    m.u128[1] = BSWAP64(m.u128[1]);
    m.u128[2] = BSWAP64(m.u128[2]);
    m.u128[3] = BSWAP64(m.u128[3]);
    m.u128[4] = BSWAP64(m.u128[4]);
    m.u128[5] = BSWAP64(m.u128[5]);
    m.u128[6] = BSWAP64(m.u128[6]);
    m.u128[7] = BSWAP64(m.u128[7]);

    row1l = _mm_load_si128((__m128i*)&state->h[0]);
    row1hl = state->h[2];
    row1hh = state->h[3];
    row2l = _mm_load_si128((__m128i*)&state->h[4]);
    row2hl = state->h[6];
    row2hh = state->h[7];
    row3l = _mm_set_epi64x(0x13198A2E03707344ULL, 0x243F6A8885A308D3ULL);
    row3hl = 0xA4093822299F31D0ULL;
    row3hh = 0x082EFA98EC4E6C89ULL;
    row4l = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0x452821E638D01377ULL);
    row4hl = 0xC0AC29B7C97C50DDULL;
    row4hh = 0x3F84D5B5B5470917ULL;

    if (!state->nullt) {
        row4l = _mm_xor_si128(row4l, _mm_set1_epi64x(state->t[0]));
        row4hl ^= state->t[1];
        row4hh ^= state->t[1];
    }

    ROUND( 0);
    ROUND( 1);
    ROUND( 2);
    ROUND( 3);
    ROUND( 4);
    ROUND( 5);
    ROUND( 6);
    ROUND( 7);
    ROUND( 8);
    ROUND( 9);
    ROUND(10);
    ROUND(11);
    ROUND(12);
    ROUND(13);
    ROUND(14);
    ROUND(15);

    row1l = _mm_xor_si128(row3l, row1l);
    row1hl ^= row3hl;
    row1hh ^= row3hh;
    _mm_store_si128((__m128i*)&state->h[0],
                    _mm_xor_si128(row1l, _mm_load_si128((__m128i*)&state->h[0])));
    state->h[2] ^= row1hl;
    state->h[3] ^= row1hh;

    row2l = _mm_xor_si128(row4l, row2l);
    row2hl ^= row4hl;
    row2hh ^= row4hh;
    _mm_store_si128((__m128i*)&state->h[4],
                    _mm_xor_si128(row2l, _mm_load_si128((__m128i*)&state->h[4])));
    state->h[6] ^= row2hl;
    state->h[7] ^= row2hh;

    return 0;
}
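/*
 * BSWAP64 is a macro assumed by the function above. A plausible SSSE3
 * definition, consistent with the u8to64 shuffle constant declared inside
 * the function (it reverses the byte order of each 64-bit lane):
 */
#include <tmmintrin.h>
#define BSWAP64(x) _mm_shuffle_epi8((x), u8to64)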
/* xvm_expma:
 *   Compute the component-wise exponential minus <a>:
 *       r[i] <-- e^x[i] - a
 *
 * The following comments apply to the SSE2 version of this code:
 *
 * Computation is done four doubles at a time by doing the computation in
 * parallel on two vectors of two doubles using SSE2 intrinsics. If the size
 * is not a multiple of 4, the remaining elements are computed using the
 * stdlib exp().
 *
 * The computation is done by first doing a range reduction of the argument
 * of the type e^x = 2^k * e^f, choosing k and f so that f is in [-0.5, 0.5].
 * Then 2^k can be computed exactly using bit operations to build the double
 * result, and e^f can be efficiently computed with enough precision using a
 * polynomial approximation.
 *
 * The polynomial approximation is done with an 11th-order polynomial
 * computed by the Remez algorithm with the Sollya suite, instead of the
 * more classical Pade polynomial form, because it is better suited to
 * parallel execution. In order to achieve the same precision, a Pade form
 * seems to require three fewer multiplications but needs a very costly
 * division, so it would be less efficient.
 *
 * The maximum error is less than 1 lsb and special cases are correctly
 * handled:
 *   +inf or +oor  -->  return +inf
 *   -inf or -oor  -->  return  0.0
 *   qNaN or sNaN  -->  return qNaN
 *
 * This code is copyright 2004-2012 Thomas Lavergne and licenced under the
 * BSD licence like the rest of Wapiti.
 */
void xvm_expma(double r[], const double x[], double a, uint64_t N) {
#if defined(__SSE2__) && !defined(XVM_ANSI)
  #define xvm_vconst(v) (_mm_castsi128_pd(_mm_set1_epi64x((v))))
    assert(r != NULL && ((uintptr_t)r % 16) == 0);
    assert(x != NULL && ((uintptr_t)x % 16) == 0);
    const __m128i vl  = _mm_set1_epi64x(0x3ff0000000000000ULL);
    const __m128d ehi = xvm_vconst(0x4086232bdd7abcd2ULL);
    const __m128d elo = xvm_vconst(0xc086232bdd7abcd2ULL);
    const __m128d l2e = xvm_vconst(0x3ff71547652b82feULL);
    const __m128d hal = xvm_vconst(0x3fe0000000000000ULL);
    const __m128d nan = xvm_vconst(0xfff8000000000000ULL);
    const __m128d inf = xvm_vconst(0x7ff0000000000000ULL);
    const __m128d c1  = xvm_vconst(0x3fe62e4000000000ULL);
    const __m128d c2  = xvm_vconst(0x3eb7f7d1cf79abcaULL);
    const __m128d p0  = xvm_vconst(0x3feffffffffffffeULL);
    const __m128d p1  = xvm_vconst(0x3ff000000000000bULL);
    const __m128d p2  = xvm_vconst(0x3fe0000000000256ULL);
    const __m128d p3  = xvm_vconst(0x3fc5555555553a2aULL);
    const __m128d p4  = xvm_vconst(0x3fa55555554e57d3ULL);
    const __m128d p5  = xvm_vconst(0x3f81111111362f4fULL);
    const __m128d p6  = xvm_vconst(0x3f56c16c25f3bae1ULL);
    const __m128d p7  = xvm_vconst(0x3f2a019fc9310c33ULL);
    const __m128d p8  = xvm_vconst(0x3efa01825f3cb28bULL);
    const __m128d p9  = xvm_vconst(0x3ec71e2bd880fdd8ULL);
    const __m128d p10 = xvm_vconst(0x3e9299068168ac8fULL);
    const __m128d p11 = xvm_vconst(0x3e5ac52350b60b19ULL);
    const __m128d va  = _mm_set1_pd(a);
    for (uint64_t n = 0; n < N; n += 4) {
        __m128d mn1, mn2, mi1, mi2;
        __m128d t1, t2, d1, d2;
        __m128d v1, v2, w1, w2;
        __m128i k1, k2;
        __m128d f1, f2;
        // Load the next four values
        __m128d x1 = _mm_load_pd(x + n    );
        __m128d x2 = _mm_load_pd(x + n + 2);
        // Check for out-of-range values, infinities and NaN
        mn1 = _mm_cmpneq_pd(x1, x1);    mn2 = _mm_cmpneq_pd(x2, x2);
        mi1 = _mm_cmpgt_pd(x1, ehi);    mi2 = _mm_cmpgt_pd(x2, ehi);
        x1  = _mm_max_pd(x1, elo);      x2  = _mm_max_pd(x2, elo);
        // Range reduction: we search for k and f such that e^x = 2^k * e^f
        // with f in [-0.5, 0.5]
        t1 = _mm_mul_pd(x1, l2e);       t2 = _mm_mul_pd(x2, l2e);
        t1 = _mm_add_pd(t1, hal);       t2 = _mm_add_pd(t2, hal);
        k1 = _mm_cvttpd_epi32(t1);      k2 = _mm_cvttpd_epi32(t2);
        d1 = _mm_cvtepi32_pd(k1);       d2 = _mm_cvtepi32_pd(k2);
        t1 = _mm_mul_pd(d1, c1);        t2 = _mm_mul_pd(d2, c1);
        f1 = _mm_sub_pd(x1, t1);        f2 = _mm_sub_pd(x2, t2);
        t1 = _mm_mul_pd(d1, c2);        t2 = _mm_mul_pd(d2, c2);
        f1 = _mm_sub_pd(f1, t1);        f2 = _mm_sub_pd(f2, t2);
        // Evaluation of e^f using an 11th-order polynomial in Horner form
        v1 = _mm_mul_pd(f1, p11);       v2 = _mm_mul_pd(f2, p11);
        v1 = _mm_add_pd(v1, p10);       v2 = _mm_add_pd(v2, p10);
        v1 = _mm_mul_pd(v1, f1);        v2 = _mm_mul_pd(v2, f2);
        v1 = _mm_add_pd(v1, p9);        v2 = _mm_add_pd(v2, p9);
        v1 = _mm_mul_pd(v1, f1);        v2 = _mm_mul_pd(v2, f2);
        v1 = _mm_add_pd(v1, p8);        v2 = _mm_add_pd(v2, p8);
        v1 = _mm_mul_pd(v1, f1);        v2 = _mm_mul_pd(v2, f2);
        v1 = _mm_add_pd(v1, p7);        v2 = _mm_add_pd(v2, p7);
        v1 = _mm_mul_pd(v1, f1);        v2 = _mm_mul_pd(v2, f2);
        v1 = _mm_add_pd(v1, p6);        v2 = _mm_add_pd(v2, p6);
        v1 = _mm_mul_pd(v1, f1);        v2 = _mm_mul_pd(v2, f2);
        v1 = _mm_add_pd(v1, p5);        v2 = _mm_add_pd(v2, p5);
        v1 = _mm_mul_pd(v1, f1);        v2 = _mm_mul_pd(v2, f2);
        v1 = _mm_add_pd(v1, p4);        v2 = _mm_add_pd(v2, p4);
        v1 = _mm_mul_pd(v1, f1);        v2 = _mm_mul_pd(v2, f2);
        v1 = _mm_add_pd(v1, p3);        v2 = _mm_add_pd(v2, p3);
        v1 = _mm_mul_pd(v1, f1);        v2 = _mm_mul_pd(v2, f2);
        v1 = _mm_add_pd(v1, p2);        v2 = _mm_add_pd(v2, p2);
        v1 = _mm_mul_pd(v1, f1);        v2 = _mm_mul_pd(v2, f2);
        v1 = _mm_add_pd(v1, p1);        v2 = _mm_add_pd(v2, p1);
        v1 = _mm_mul_pd(v1, f1);        v2 = _mm_mul_pd(v2, f2);
        v1 = _mm_add_pd(v1, p0);        v2 = _mm_add_pd(v2, p0);
        // Evaluation of 2^k using bitops to achieve exact computation
        k1 = _mm_slli_epi32(k1, 20);    k2 = _mm_slli_epi32(k2, 20);
        k1 = _mm_shuffle_epi32(k1, 0x72);
        k2 = _mm_shuffle_epi32(k2, 0x72);
        k1 = _mm_add_epi32(k1, vl);     k2 = _mm_add_epi32(k2, vl);
        w1 = _mm_castsi128_pd(k1);      w2 = _mm_castsi128_pd(k2);
        // Return to full range to subtract <a>
        v1 = _mm_mul_pd(v1, w1);        v2 = _mm_mul_pd(v2, w2);
        v1 = _mm_sub_pd(v1, va);        v2 = _mm_sub_pd(v2, va);
        // Finally, apply infinity and NaN where needed
        v1 = _mm_or_pd(_mm_and_pd(mi1, inf), _mm_andnot_pd(mi1, v1));
        v2 = _mm_or_pd(_mm_and_pd(mi2, inf), _mm_andnot_pd(mi2, v2));
        v1 = _mm_or_pd(_mm_and_pd(mn1, nan), _mm_andnot_pd(mn1, v1));
        v2 = _mm_or_pd(_mm_and_pd(mn2, nan), _mm_andnot_pd(mn2, v2));
        // Store the results
        _mm_store_pd(r + n,     v1);
        _mm_store_pd(r + n + 2, v2);
    }
#else
    for (uint64_t n = 0; n < N; n++)
        r[n] = exp(x[n]) - a;
#endif
}
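/*
 * Scalar sketch of the same range reduction (illustration only, not part of
 * Wapiti): split x into k and f with e^x = 2^k * e^f, |f| <= ln(2)/2, then
 * rebuild 2^k by writing k + 1023 into the exponent field of an IEEE-754
 * double. M_LN2 is assumed from <math.h>; the stdlib exp() stands in for
 * the polynomial approximation of e^f.
 */
#include <math.h>
#include <stdint.h>
#include <string.h>

static double expma_scalar(double x, double a)
{
    int k = (int)floor(x / M_LN2 + 0.5);          // nearest integer to x * log2(e)
    double f = x - k * M_LN2;                     // reduced argument
    uint64_t bits = (uint64_t)(k + 1023) << 52;   // 2^k built from raw exponent bits
    double two_k;
    memcpy(&two_k, &bits, sizeof two_k);
    return exp(f) * two_k - a;
}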
void GarbledCct::evl_next_gate(const Gate &current_gate)
{
    __m128i current_key, a;
    Bytes::const_iterator it;
    static Bytes tmp;

    if (current_gate.m_tag == Circuit::GEN_INP) {
        uint8_t bit = m_gen_inp_mask.get_ith_bit(m_gen_inp_ix);
        Bytes::iterator it = m_i_bufr_ix + bit*Env::key_size_in_bytes();

        tmp = m_M[m_gen_inp_ix].to_bytes().hash(Env::k());
        tmp.resize(16, 0);
        current_key = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));

        tmp.assign(it, it+Env::key_size_in_bytes());
        tmp.resize(16, 0);
        a = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));

        m_i_bufr_ix += Env::key_size_in_bytes()*2;

        current_key = _mm_xor_si128(current_key, a);
        m_gen_inp_ix++;
    } else if (current_gate.m_tag == Circuit::EVL_INP) {
        uint8_t bit = m_evl_inp.get_ith_bit(m_evl_inp_ix);
        Bytes::iterator it = m_i_bufr_ix + bit*Env::key_size_in_bytes();

        tmp = (*m_ot_keys)[m_evl_inp_ix];
        tmp.resize(16, 0);
        current_key = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));

        tmp.assign(it, it+Env::key_size_in_bytes());
        tmp.resize(16, 0);
        a = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));

        m_i_bufr_ix += Env::key_size_in_bytes()*2;

        current_key = _mm_xor_si128(current_key, a);
        m_evl_inp_ix++;
    } else {
        const vector<uint64_t> &inputs = current_gate.m_input_idx;

#ifdef FREE_XOR
        if (is_xor(current_gate)) {
            current_key = inputs.size() == 2 ?
                _mm_xor_si128(m_w[inputs[0]], m_w[inputs[1]]) :
                _mm_load_si128(m_w+inputs[0]);
        } else
#endif
        if (inputs.size() == 2) { // 2-arity gates
            __m128i aes_key[2], aes_plaintext, aes_ciphertext;

            aes_plaintext = _mm_set1_epi64x(m_gate_ix);

            aes_key[0] = _mm_load_si128(m_w+inputs[0]);
            aes_key[1] = _mm_load_si128(m_w+inputs[1]);

            const uint8_t perm_x = _mm_extract_epi8(aes_key[0], 0) & 0x01;
            const uint8_t perm_y = _mm_extract_epi8(aes_key[1], 0) & 0x01;

            KDF256((uint8_t*)&aes_plaintext, (uint8_t*)&aes_ciphertext, (uint8_t*)aes_key);
            aes_ciphertext = _mm_and_si128(aes_ciphertext, m_clear_mask);
            uint8_t garbled_ix = (perm_y<<1) | (perm_x<<0);

#ifdef GRR
            if (garbled_ix == 0) {
                current_key = _mm_load_si128(&aes_ciphertext);
            } else {
                it = m_i_bufr_ix + (garbled_ix-1)*Env::key_size_in_bytes();
                tmp.assign(it, it+Env::key_size_in_bytes());
                tmp.resize(16, 0);
                a = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));
                current_key = _mm_xor_si128(aes_ciphertext, a);
            }
            m_i_bufr_ix += 3*Env::key_size_in_bytes();
#else
            it = m_i_bufr_ix + garbled_ix*Env::key_size_in_bytes();
            tmp.assign(it, it+Env::key_size_in_bytes());
            tmp.resize(16, 0);
            current_key = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));
            current_key = _mm_xor_si128(current_key, aes_ciphertext);
            m_i_bufr_ix += 4*Env::key_size_in_bytes();
#endif
        } else { // 1-arity gates
            __m128i aes_key, aes_plaintext, aes_ciphertext;

            aes_plaintext = _mm_set1_epi64x(m_gate_ix);
            aes_key = _mm_load_si128(m_w+inputs[0]);

            KDF128((uint8_t*)&aes_plaintext, (uint8_t*)&aes_ciphertext, (uint8_t*)&aes_key);
            aes_ciphertext = _mm_and_si128(aes_ciphertext, m_clear_mask);

            const uint8_t perm_x = _mm_extract_epi8(aes_key, 0) & 0x01;

#ifdef GRR
            if (perm_x == 0) {
                current_key = _mm_load_si128(&aes_ciphertext);
            } else {
                tmp.assign(m_i_bufr_ix, m_i_bufr_ix+Env::key_size_in_bytes());
                tmp.resize(16, 0);
                a = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));
                current_key = _mm_xor_si128(aes_ciphertext, a);
            }
            m_i_bufr_ix += Env::key_size_in_bytes();
#else
            it = m_i_bufr_ix + perm_x*Env::key_size_in_bytes(); // perm_x selects the row here (garbled_ix is not in scope)
            tmp.assign(it, it+Env::key_size_in_bytes());
            tmp.resize(16, 0);
            current_key = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));
            current_key = _mm_xor_si128(current_key, aes_ciphertext);
            m_i_bufr_ix += 2*Env::key_size_in_bytes();
#endif
        }

        if (current_gate.m_tag == Circuit::EVL_OUT) {
            uint8_t out_bit = _mm_extract_epi8(current_key, 0) & 0x01;
            out_bit ^= *m_i_bufr_ix;
            m_evl_out.set_ith_bit(m_evl_out_ix, out_bit);
            m_i_bufr_ix++;

            m_evl_out_ix++;
        } else if (current_gate.m_tag == Circuit::GEN_OUT) {
            // TODO: Ki08 implementation
            uint8_t out_bit = _mm_extract_epi8(current_key, 0) & 0x01;
            out_bit ^= *m_i_bufr_ix;
            m_gen_out.set_ith_bit(m_gen_out_ix, out_bit);
            m_i_bufr_ix++;

            //	m_C[2*m_gen_out_ix+0] = Bytes(m_i_bufr_ix, m_i_bufr_ix+Env::key_size_in_bytes());
            //	m_i_bufr_ix += Env::key_size_in_bytes();
            //
            //	m_C[2*m_gen_out_ix+1] = Bytes(m_i_bufr_ix, m_i_bufr_ix+Env::key_size_in_bytes());
            //	m_i_bufr_ix += Env::key_size_in_bytes();

            m_gen_out_ix++;
        }
    }

    _mm_store_si128(m_w+current_gate.m_idx, current_key);

    update_hash(m_i_bufr);
    m_gate_ix++;
}
void GarbledCct::gen_next_gate(const Gate &current_gate)
{
    __m128i current_zero_key;

    if (current_gate.m_tag == Circuit::GEN_INP) {
        __m128i a[2];

        // zero_key = m_prng.rand(Env::k());
        static Bytes tmp;

        tmp = m_prng.rand(Env::k());
        tmp.resize(16, 0);
        current_zero_key = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));

        // a[0] = m_M[2*m_gen_inp_ix+0].to_bytes().hash(Env::k());
        tmp = m_M[2*m_gen_inp_ix+0].to_bytes().hash(Env::k());
        tmp.resize(16, 0);
        a[0] = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));

        // a[1] = m_M[2*m_gen_inp_ix+1].to_bytes().hash(Env::k());
        tmp = m_M[2*m_gen_inp_ix+1].to_bytes().hash(Env::k());
        tmp.resize(16, 0);
        a[1] = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));

        // a[0] ^= zero_key; a[1] ^= zero_key ^ R;
        a[0] = _mm_xor_si128(a[0], current_zero_key);
        a[1] = _mm_xor_si128(a[1], _mm_xor_si128(current_zero_key, m_R));

        uint8_t bit = m_gen_inp_mask.get_ith_bit(m_gen_inp_ix);

        // m_o_bufr += a[bit];
        _mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), a[bit]);
        m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes());

        // m_o_bufr += a[1-bit];
        _mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), a[1-bit]);
        m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes());

        m_gen_inp_ix++;
    } else if (current_gate.m_tag == Circuit::EVL_INP) {
        __m128i a[2];

        // zero_key = m_prng.rand(Env::k());
        static Bytes tmp;

        tmp = m_prng.rand(Env::k());
        tmp.resize(16, 0);
        current_zero_key = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));

        // a[0] = (*m_ot_keys)[2*m_evl_inp_ix+0];
        tmp = (*m_ot_keys)[2*m_evl_inp_ix+0];
        tmp.resize(16, 0);
        a[0] = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));

        // a[1] = (*m_ot_keys)[2*m_evl_inp_ix+1];
        tmp = (*m_ot_keys)[2*m_evl_inp_ix+1];
        tmp.resize(16, 0);
        a[1] = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));

        // a[0] ^= zero_key; a[1] ^= zero_key ^ R;
        a[0] = _mm_xor_si128(a[0], current_zero_key);
        a[1] = _mm_xor_si128(a[1], _mm_xor_si128(current_zero_key, m_R));

        // m_o_bufr += a[0];
        _mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), a[0]);
        m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes());

        // m_o_bufr += a[1];
        _mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), a[1]);
        m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes());

        m_evl_inp_ix++;
    } else {
        const vector<uint64_t> &inputs = current_gate.m_input_idx;
        assert(inputs.size() == 1 || inputs.size() == 2);

#ifdef FREE_XOR
        if (is_xor(current_gate)) {
            current_zero_key = inputs.size() == 2 ?
                _mm_xor_si128(m_w[inputs[0]], m_w[inputs[1]]) :
                _mm_load_si128(m_w+inputs[0]);
        } else
#endif
        if (inputs.size() == 2) { // 2-arity gates
            uint8_t bit;
            __m128i aes_key[2], aes_plaintext, aes_ciphertext;
            __m128i X[2], Y[2], Z[2];
            static Bytes tmp(16, 0);

            aes_plaintext = _mm_set1_epi64x(m_gate_ix);

            X[0] = _mm_load_si128(m_w+inputs[0]);
            Y[0] = _mm_load_si128(m_w+inputs[1]);

            X[1] = _mm_xor_si128(X[0], m_R); // X[1] = X[0] ^ R
            Y[1] = _mm_xor_si128(Y[0], m_R); // Y[1] = Y[0] ^ R

            const uint8_t perm_x = _mm_extract_epi8(X[0], 0) & 0x01; // permutation bit for X
            const uint8_t perm_y = _mm_extract_epi8(Y[0], 0) & 0x01; // permutation bit for Y
            const uint8_t de_garbled_ix = (perm_y<<1) | perm_x;

            // encrypt the 0-th entry : (X[x], Y[y])
            aes_key[0] = _mm_load_si128(X+perm_x);
            aes_key[1] = _mm_load_si128(Y+perm_y);

            KDF256((uint8_t*)&aes_plaintext, (uint8_t*)&aes_ciphertext, (uint8_t*)aes_key);
            aes_ciphertext = _mm_and_si128(aes_ciphertext, m_clear_mask); // clear extra bits so that only k bits are left
            bit = current_gate.m_table[de_garbled_ix];

#ifdef GRR
            // GRR technique: using the zero entry's key as one of the output keys
            _mm_store_si128(Z+bit, aes_ciphertext);
            Z[1-bit] = _mm_xor_si128(Z[bit], m_R);
            current_zero_key = _mm_load_si128(Z);
#else
            tmp = m_prng.rand(Env::k());
            tmp.resize(16, 0);
            Z[0] = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));
            Z[1] = _mm_xor_si128(Z[0], m_R);

            aes_ciphertext = _mm_xor_si128(aes_ciphertext, Z[bit]);
            _mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), aes_ciphertext);
            m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes());
#endif

            // encrypt the 1st entry : (X[1-x], Y[y])
            aes_key[0] = _mm_xor_si128(aes_key[0], m_R);

            KDF256((uint8_t*)&aes_plaintext, (uint8_t*)&aes_ciphertext, (uint8_t*)aes_key);
            aes_ciphertext = _mm_and_si128(aes_ciphertext, m_clear_mask);
            bit = current_gate.m_table[0x01^de_garbled_ix];
            aes_ciphertext = _mm_xor_si128(aes_ciphertext, Z[bit]);
            _mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), aes_ciphertext);
            m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes());

            // encrypt the 2nd entry : (X[x], Y[1-y])
            aes_key[0] = _mm_xor_si128(aes_key[0], m_R);
            aes_key[1] = _mm_xor_si128(aes_key[1], m_R);

            KDF256((uint8_t*)&aes_plaintext, (uint8_t*)&aes_ciphertext, (uint8_t*)aes_key);
            aes_ciphertext = _mm_and_si128(aes_ciphertext, m_clear_mask);
            bit = current_gate.m_table[0x02^de_garbled_ix];
            aes_ciphertext = _mm_xor_si128(aes_ciphertext, Z[bit]);
            _mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), aes_ciphertext);
            m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes());

            // encrypt the 3rd entry : (X[1-x], Y[1-y])
            aes_key[0] = _mm_xor_si128(aes_key[0], m_R);

            KDF256((uint8_t*)&aes_plaintext, (uint8_t*)&aes_ciphertext, (uint8_t*)aes_key);
            aes_ciphertext = _mm_and_si128(aes_ciphertext, m_clear_mask);
            bit = current_gate.m_table[0x03^de_garbled_ix];
            aes_ciphertext = _mm_xor_si128(aes_ciphertext, Z[bit]);
            _mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), aes_ciphertext);
            m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes());
        } else { // 1-arity gates
            uint8_t bit;
            __m128i aes_key, aes_plaintext, aes_ciphertext;
            __m128i X[2], Z[2];
            static Bytes tmp;

            tmp.assign(16, 0);

            aes_plaintext = _mm_set1_epi64x(m_gate_ix);

            X[0] = _mm_load_si128(m_w+inputs[0]);
            X[1] = _mm_xor_si128(X[0], m_R);

            const uint8_t perm_x = _mm_extract_epi8(X[0], 0) & 0x01;

            // 0-th entry : X[x]
            aes_key = _mm_load_si128(X+perm_x);
            KDF128((uint8_t*)&aes_plaintext, (uint8_t*)&aes_ciphertext, (uint8_t*)&aes_key);
            aes_ciphertext = _mm_and_si128(aes_ciphertext, m_clear_mask);
            bit = current_gate.m_table[perm_x];

#ifdef GRR
            _mm_store_si128(Z+bit, aes_ciphertext);
            Z[1-bit] = _mm_xor_si128(Z[bit], m_R);
            current_zero_key = _mm_load_si128(Z);
#else
            tmp = m_prng.rand(Env::k());
            tmp.resize(16, 0);
            Z[0] = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));
            Z[1] = _mm_xor_si128(Z[0], m_R);

            aes_ciphertext = _mm_xor_si128(aes_ciphertext, Z[bit]);
            _mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), aes_ciphertext);
            m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes());
#endif

            // 1st entry : X[1-x]
            aes_key = _mm_xor_si128(aes_key, m_R);
            KDF128((uint8_t*)&aes_plaintext, (uint8_t*)&aes_ciphertext, (uint8_t*)&aes_key);
            aes_ciphertext = _mm_and_si128(aes_ciphertext, m_clear_mask);
            bit = current_gate.m_table[0x01^perm_x];
            aes_ciphertext = _mm_xor_si128(aes_ciphertext, Z[bit]);
            _mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), aes_ciphertext);
            m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes());
        }

        if (current_gate.m_tag == Circuit::EVL_OUT) {
            m_o_bufr.push_back(_mm_extract_epi8(current_zero_key, 0) & 0x01); // permutation bit
        } else if (current_gate.m_tag == Circuit::GEN_OUT) {
            m_o_bufr.push_back(_mm_extract_epi8(current_zero_key, 0) & 0x01); // permutation bit

            //	// TODO: C[ix_0] = w[ix0] || randomness, C[ix_1] = w[ix1] || randomness
            //	m_o_bufr += (key_pair[0] + m_prng.rand(Env::k())).hash(Env::k());
            //	m_o_bufr += (key_pair[1] + m_prng.rand(Env::k())).hash(Env::k());
        }
    }

    _mm_store_si128(m_w+current_gate.m_idx, current_zero_key);

    m_gate_ix++;
}
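/*
 * Illustration (not from the original source) of the table-walk order used
 * in the 2-arity branch above: the four entries are garbled as rows
 * de_garbled_ix, 1^de_garbled_ix, 2^de_garbled_ix, 3^de_garbled_ix, which
 * enumerates the whole truth table starting from the row selected by the
 * two permutation bits.
 */
#include <stdio.h>

int main(void)
{
    for (unsigned ix = 0; ix < 4; ++ix) {
        printf("perm bits %u -> rows %u %u %u %u\n",
               ix, ix, 1u ^ ix, 2u ^ ix, 3u ^ ix);
    }
    return 0;
}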
void TestRootBoard::generateCaptures()
{
    QTextStream xout(stderr);

    cpu_set_t mask;
    CPU_ZERO(&mask);
    CPU_SET(1, &mask);
    if (sched_setaffinity(0, sizeof(mask), &mask) == -1)
        qDebug() << "Could not set CPU Affinity" << endl;

    static const unsigned testCases = 200;
    static const int iter = 10000;
    typedef QVector<uint64_t> Sample;
    QVector<Sample> times(testCases, Sample(iter));
    QVector<Sample> movetimes(testCases, Sample(iter));
    QVector<Sample> captimes(testCases, Sample(iter));
    QVector<Sample> b02flood(testCases, Sample(iter));
    QVector<Sample> b02point(testCases, Sample(iter));
    QVector<Sample> b02double(testCases, Sample(iter));
    Move moveList[256];
    uint64_t sum = 0;
    uint64_t movesum = 0;
    uint64_t nmoves = 0;
    uint64_t ncap = 0;
    uint64_t a, d, tsc;
    Key blah;
    Colors color[testCases];
    double cpufreq = 3900.0;

    for (unsigned int i = testCases; i;) {
        --i;
        b->setup(testPositions[i]);
        color[i] = b->color;
        if (i) {
            b->boards[i] = b->boards[0];
        }
        movetimes[i].reserve(iter*2);
        times[i].reserve(iter*2);
        captimes[i].reserve(iter*2);
    }

    unsigned op = 1;
    const unsigned int iter2 = 10000000;
    __v2di res = _mm_set1_epi64x(0);
    uint64_t time = 0;

#ifdef NDEBUG
    for (unsigned int i = 0; i < iter2; ++i) {
        Board& bb = b->boards[i & 0xf].wb;
        tsc = readtsc();
        res = bb.build02Attack(op); op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build02Attack(op); op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build02Attack(op); op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build02Attack(op); op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build02Attack(op); op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build02Attack(op); op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build02Attack(op); op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build02Attack(op); op = _mm_cvtsi128_si64(res) & 0x3f;
        time += readtsc() - tsc;
        // op = fold(res) & 0x3f;
    }
    std::cout << "build02(pos): " << time/iter2 << " clocks" << std::endl;

    time = 0;
    for (unsigned int i = 0; i < iter2; ++i) {
        Board& bb = b->boards[i & 0xf].wb;
        tsc = readtsc();
        res = bb.build13Attack(op); op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build13Attack(op); op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build13Attack(op); op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build13Attack(op); op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build13Attack(op); op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build13Attack(op); op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build13Attack(op); op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build13Attack(op); op = _mm_cvtsi128_si64(res) & 0x3f;
        time += readtsc() - tsc;
    }
    std::cout << "build13(pos): " << time/iter2 << " clocks" << std::endl;

    //	time=0;
    //	for (unsigned int i = 0; i < iter2; ++i) {
    //		BoardBase& bb = b->boards[i & 0xf].wb;
    //		tsc = readtsc();
    //		res = bb.build02Attack(res);
    //		time += readtsc() - tsc;
    //	}
    //	std::cout << "build02(vector): " << time/iter2 << " clocks" << std::endl;

    time = 0;
    for (unsigned int i = 0; i < iter2; ++i) {
        Board& bb = b->boards[i & 0xf].wb;
        tsc = readtsc();
        res = b->boards[0].wb.build13Attack(res);
        res = b->boards[1].wb.build13Attack(res);
        res = b->boards[2].wb.build13Attack(res);
        res = b->boards[3].wb.build13Attack(res);
        res = b->boards[4].wb.build13Attack(res);
        res = b->boards[5].wb.build13Attack(res);
        res = b->boards[6].wb.build13Attack(res);
        res = b->boards[7].wb.build13Attack(res);
        time += readtsc() - tsc;
    }
    std::cout << "build13(vector): " << time/iter2 << " clocks" << std::endl;

    for (int j = 0; j < iter; ++j) {
        nmoves = 0;
        ncap = 0;
        for (unsigned int i = 0; i < testCases; ++i) {
            // b->setup(testPositions[i]);
            uint64_t overhead;
            /*
            asm volatile("cpuid\n rdtsc" : "=a" (a), "=d" (d) :: "%rbx", "%rcx");
            tsc = (a + (d << 32));
            asm volatile("cpuid\n rdtsc" : "=a" (a), "=d" (d) :: "%rbx", "%rcx");
            overhead = (a + (d << 32)) - tsc;
            */
            overhead = 20;

            if (color[i] == White)
                b->boards[i].wb.buildAttacks();
            else
                b->boards[i].bb.buildAttacks();

            tsc = readtsc();
            Move* good = moveList + 192;
            Move* bad = good;
            if (color[i] == White)
                b->boards[i].wb.generateCaptureMoves<AllMoves>(good, bad);
            else
                b->boards[i].bb.generateCaptureMoves<AllMoves>(good, bad);
            ncap += bad - good;
            captimes[i][j] = readtsc() - tsc - overhead;

            tsc = readtsc();
            if (color[i] == White)
                b->boards[i].wb.generateNonCap(good, bad);
            else
                b->boards[i].bb.generateNonCap(good, bad);
            nmoves += bad - good;
            times[i][j] = readtsc() - tsc - overhead;

            for (Move* k = good; k < bad; ++k) {
                // std::cout << k->string() << std::endl;
                tsc = readtsc();
                if (color[i] == White) {
                    __v8hi est = b->boards[i].b->eval.estimate(wb, *k);
                    ColoredBoard<Black> bb(b->boards[i].wb, *k, est);
                    blah += bb.getZobrist();
                } else {
                    __v8hi est = b->boards[i].b->eval.estimate(bb, *k);
                    ColoredBoard<White> bb(b->boards[i].bb, *k, est);
                    blah += bb.getZobrist();
                }
                movetimes[i][j] += readtsc() - tsc - overhead;
            }
            // std::string empty;
            // std::cin >> empty;
        }
    }

    for (QVector<Sample>::Iterator i = times.begin(); i != times.end(); ++i) {
        qSort(*i);
        sum += (*i)[iter / 2];
    }
    uint64_t capsum = 0;
    for (QVector<Sample>::Iterator i = captimes.begin(); i != captimes.end(); ++i) {
        qSort(*i);
        capsum += (*i)[iter / 2];
    }
    for (QVector<Sample>::Iterator i = movetimes.begin(); i != movetimes.end(); ++i) {
        qSort(*i);
        movesum += (*i)[iter / 2];
    }

    xout << endl << nmoves << " Moves, " << sum/nmoves << " Clocks, "
         << cpufreq*nmoves/sum << " generated Mmoves/s, "
         << cpufreq*nmoves/movesum << " executed Mmoves/s" << endl;
    xout << ncap << " Captures, " << capsum/ncap << " Clocks, "
         << cpufreq*ncap/capsum << " generated Mmoves/s, "
         /*<< cpufreq*ncap/movesum << " executed Mmoves/s"*/ << endl;
    xout << blah + fold(res) + op64 << endl;
#endif
}
#if FMT_MAIN_VERSION > 10
static int crypt_all (int *pcount, struct db_salt *salt)
#else
static void crypt_all (int count)
#endif
{
#if FMT_MAIN_VERSION > 10
	int count = *pcount;
#endif
	int index = 0;

#ifdef _OPENMP
#pragma omp parallel for
	for (index = 0; index < count; index += 2)
#endif
	{
		int i;
		__m128i a, b, c, d, e, f, g, h;
		__m128i w[80], tmp1, tmp2;

		for (i = 0; i < 14; i += 2) {
			GATHER (tmp1, saved_key, i);
			GATHER (tmp2, saved_key, i + 1);
			SWAP_ENDIAN (tmp1);
			SWAP_ENDIAN (tmp2);
			w[i] = tmp1;
			w[i + 1] = tmp2;
		}
		GATHER (tmp1, saved_key, 14);
		SWAP_ENDIAN (tmp1);
		w[14] = tmp1;
		GATHER (w[15], saved_key, 15);
		for (i = 16; i < 80; i++)
			R(i);

		a = _mm_set1_epi64x (0x6a09e667f3bcc908ULL);
		b = _mm_set1_epi64x (0xbb67ae8584caa73bULL);
		c = _mm_set1_epi64x (0x3c6ef372fe94f82bULL);
		d = _mm_set1_epi64x (0xa54ff53a5f1d36f1ULL);
		e = _mm_set1_epi64x (0x510e527fade682d1ULL);
		f = _mm_set1_epi64x (0x9b05688c2b3e6c1fULL);
		g = _mm_set1_epi64x (0x1f83d9abfb41bd6bULL);
		h = _mm_set1_epi64x (0x5be0cd19137e2179ULL);

		SHA512_STEP(a, b, c, d, e, f, g, h,  0, 0x428a2f98d728ae22ULL);
		SHA512_STEP(h, a, b, c, d, e, f, g,  1, 0x7137449123ef65cdULL);
		SHA512_STEP(g, h, a, b, c, d, e, f,  2, 0xb5c0fbcfec4d3b2fULL);
		SHA512_STEP(f, g, h, a, b, c, d, e,  3, 0xe9b5dba58189dbbcULL);
		SHA512_STEP(e, f, g, h, a, b, c, d,  4, 0x3956c25bf348b538ULL);
		SHA512_STEP(d, e, f, g, h, a, b, c,  5, 0x59f111f1b605d019ULL);
		SHA512_STEP(c, d, e, f, g, h, a, b,  6, 0x923f82a4af194f9bULL);
		SHA512_STEP(b, c, d, e, f, g, h, a,  7, 0xab1c5ed5da6d8118ULL);
		SHA512_STEP(a, b, c, d, e, f, g, h,  8, 0xd807aa98a3030242ULL);
		SHA512_STEP(h, a, b, c, d, e, f, g,  9, 0x12835b0145706fbeULL);
		SHA512_STEP(g, h, a, b, c, d, e, f, 10, 0x243185be4ee4b28cULL);
		SHA512_STEP(f, g, h, a, b, c, d, e, 11, 0x550c7dc3d5ffb4e2ULL);
		SHA512_STEP(e, f, g, h, a, b, c, d, 12, 0x72be5d74f27b896fULL);
		SHA512_STEP(d, e, f, g, h, a, b, c, 13, 0x80deb1fe3b1696b1ULL);
		SHA512_STEP(c, d, e, f, g, h, a, b, 14, 0x9bdc06a725c71235ULL);
		SHA512_STEP(b, c, d, e, f, g, h, a, 15, 0xc19bf174cf692694ULL);
		SHA512_STEP(a, b, c, d, e, f, g, h, 16, 0xe49b69c19ef14ad2ULL);
		SHA512_STEP(h, a, b, c, d, e, f, g, 17, 0xefbe4786384f25e3ULL);
		SHA512_STEP(g, h, a, b, c, d, e, f, 18, 0x0fc19dc68b8cd5b5ULL);
		SHA512_STEP(f, g, h, a, b, c, d, e, 19, 0x240ca1cc77ac9c65ULL);
		SHA512_STEP(e, f, g, h, a, b, c, d, 20, 0x2de92c6f592b0275ULL);
		SHA512_STEP(d, e, f, g, h, a, b, c, 21, 0x4a7484aa6ea6e483ULL);
		SHA512_STEP(c, d, e, f, g, h, a, b, 22, 0x5cb0a9dcbd41fbd4ULL);
		SHA512_STEP(b, c, d, e, f, g, h, a, 23, 0x76f988da831153b5ULL);
		SHA512_STEP(a, b, c, d, e, f, g, h, 24, 0x983e5152ee66dfabULL);
		SHA512_STEP(h, a, b, c, d, e, f, g, 25, 0xa831c66d2db43210ULL);
		SHA512_STEP(g, h, a, b, c, d, e, f, 26, 0xb00327c898fb213fULL);
		SHA512_STEP(f, g, h, a, b, c, d, e, 27, 0xbf597fc7beef0ee4ULL);
		SHA512_STEP(e, f, g, h, a, b, c, d, 28, 0xc6e00bf33da88fc2ULL);
		SHA512_STEP(d, e, f, g, h, a, b, c, 29, 0xd5a79147930aa725ULL);
		SHA512_STEP(c, d, e, f, g, h, a, b, 30, 0x06ca6351e003826fULL);
		SHA512_STEP(b, c, d, e, f, g, h, a, 31, 0x142929670a0e6e70ULL);
		SHA512_STEP(a, b, c, d, e, f, g, h, 32, 0x27b70a8546d22ffcULL);
		SHA512_STEP(h, a, b, c, d, e, f, g, 33, 0x2e1b21385c26c926ULL);
		SHA512_STEP(g, h, a, b, c, d, e, f, 34, 0x4d2c6dfc5ac42aedULL);
		SHA512_STEP(f, g, h, a, b, c, d, e, 35, 0x53380d139d95b3dfULL);
		SHA512_STEP(e, f, g, h, a, b, c, d, 36, 0x650a73548baf63deULL);
		SHA512_STEP(d, e, f, g, h, a, b, c, 37, 0x766a0abb3c77b2a8ULL);
		SHA512_STEP(c, d, e, f, g, h, a, b, 38, 0x81c2c92e47edaee6ULL);
		SHA512_STEP(b, c, d, e, f, g, h, a, 39, 0x92722c851482353bULL);
		SHA512_STEP(a, b, c, d, e, f, g, h, 40, 0xa2bfe8a14cf10364ULL);
		SHA512_STEP(h, a, b, c, d, e, f, g, 41, 0xa81a664bbc423001ULL);
		SHA512_STEP(g, h, a, b, c, d, e, f, 42, 0xc24b8b70d0f89791ULL);
		SHA512_STEP(f, g, h, a, b, c, d, e, 43, 0xc76c51a30654be30ULL);
		SHA512_STEP(e, f, g, h, a, b, c, d, 44, 0xd192e819d6ef5218ULL);
		SHA512_STEP(d, e, f, g, h, a, b, c, 45, 0xd69906245565a910ULL);
		SHA512_STEP(c, d, e, f, g, h, a, b, 46, 0xf40e35855771202aULL);
		SHA512_STEP(b, c, d, e, f, g, h, a, 47, 0x106aa07032bbd1b8ULL);
		SHA512_STEP(a, b, c, d, e, f, g, h, 48, 0x19a4c116b8d2d0c8ULL);
		SHA512_STEP(h, a, b, c, d, e, f, g, 49, 0x1e376c085141ab53ULL);
		SHA512_STEP(g, h, a, b, c, d, e, f, 50, 0x2748774cdf8eeb99ULL);
		SHA512_STEP(f, g, h, a, b, c, d, e, 51, 0x34b0bcb5e19b48a8ULL);
		SHA512_STEP(e, f, g, h, a, b, c, d, 52, 0x391c0cb3c5c95a63ULL);
		SHA512_STEP(d, e, f, g, h, a, b, c, 53, 0x4ed8aa4ae3418acbULL);
		SHA512_STEP(c, d, e, f, g, h, a, b, 54, 0x5b9cca4f7763e373ULL);
		SHA512_STEP(b, c, d, e, f, g, h, a, 55, 0x682e6ff3d6b2b8a3ULL);
		SHA512_STEP(a, b, c, d, e, f, g, h, 56, 0x748f82ee5defb2fcULL);
		SHA512_STEP(h, a, b, c, d, e, f, g, 57, 0x78a5636f43172f60ULL);
		SHA512_STEP(g, h, a, b, c, d, e, f, 58, 0x84c87814a1f0ab72ULL);
		SHA512_STEP(f, g, h, a, b, c, d, e, 59, 0x8cc702081a6439ecULL);
		SHA512_STEP(e, f, g, h, a, b, c, d, 60, 0x90befffa23631e28ULL);
		SHA512_STEP(d, e, f, g, h, a, b, c, 61, 0xa4506cebde82bde9ULL);
		SHA512_STEP(c, d, e, f, g, h, a, b, 62, 0xbef9a3f7b2c67915ULL);
		SHA512_STEP(b, c, d, e, f, g, h, a, 63, 0xc67178f2e372532bULL);
		SHA512_STEP(a, b, c, d, e, f, g, h, 64, 0xca273eceea26619cULL);
		SHA512_STEP(h, a, b, c, d, e, f, g, 65, 0xd186b8c721c0c207ULL);
		SHA512_STEP(g, h, a, b, c, d, e, f, 66, 0xeada7dd6cde0eb1eULL);
		SHA512_STEP(f, g, h, a, b, c, d, e, 67, 0xf57d4f7fee6ed178ULL);
		SHA512_STEP(e, f, g, h, a, b, c, d, 68, 0x06f067aa72176fbaULL);
		SHA512_STEP(d, e, f, g, h, a, b, c, 69, 0x0a637dc5a2c898a6ULL);
		SHA512_STEP(c, d, e, f, g, h, a, b, 70, 0x113f9804bef90daeULL);
		SHA512_STEP(b, c, d, e, f, g, h, a, 71, 0x1b710b35131c471bULL);
		SHA512_STEP(a, b, c, d, e, f, g, h, 72, 0x28db77f523047d84ULL);
		SHA512_STEP(h, a, b, c, d, e, f, g, 73, 0x32caab7b40c72493ULL);
		SHA512_STEP(g, h, a, b, c, d, e, f, 74, 0x3c9ebe0a15c9bebcULL);
		SHA512_STEP(f, g, h, a, b, c, d, e, 75, 0x431d67c49c100d4cULL);
		SHA512_STEP(e, f, g, h, a, b, c, d, 76, 0x4cc5d4becb3e42b6ULL);
		SHA512_STEP(d, e, f, g, h, a, b, c, 77, 0x597f299cfc657e2aULL);
		SHA512_STEP(c, d, e, f, g, h, a, b, 78, 0x5fcb6fab3ad6faecULL);
		SHA512_STEP(b, c, d, e, f, g, h, a, 79, 0x6c44198c4a475817ULL);

		_mm_store_si128 ((__m128i *) &crypt_key[0][index], a);
		_mm_store_si128 ((__m128i *) &crypt_key[1][index], b);
		_mm_store_si128 ((__m128i *) &crypt_key[2][index], c);
		_mm_store_si128 ((__m128i *) &crypt_key[3][index], d);
		_mm_store_si128 ((__m128i *) &crypt_key[4][index], e);
		_mm_store_si128 ((__m128i *) &crypt_key[5][index], f);
		_mm_store_si128 ((__m128i *) &crypt_key[6][index], g);
		_mm_store_si128 ((__m128i *) &crypt_key[7][index], h);
	}

#if FMT_MAIN_VERSION > 10
	return count;
#endif
}
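/*
 * The R(i) message-schedule macro is assumed above. A plausible SSE2
 * definition computing two interleaved SHA-512 schedules at once,
 * w[i] = w[i-16] + s0(w[i-15]) + w[i-7] + s1(w[i-2]), with the standard
 * sigma functions built from 64-bit shifts and rotates:
 */
#define ROT64(x, n) _mm_xor_si128(_mm_srli_epi64(x, n), _mm_slli_epi64(x, 64 - (n)))
#define S0(x) _mm_xor_si128(_mm_xor_si128(ROT64(x, 1), ROT64(x, 8)), _mm_srli_epi64(x, 7))
#define S1(x) _mm_xor_si128(_mm_xor_si128(ROT64(x, 19), ROT64(x, 61)), _mm_srli_epi64(x, 6))
#define R(i) \
	w[i] = _mm_add_epi64(_mm_add_epi64(w[(i) - 16], S0(w[(i) - 15])), \
	                     _mm_add_epi64(w[(i) - 7],  S1(w[(i) - 2])))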
#ifdef PARASAIL_ROWCOL
    parasail_result_t *result = parasail_result_new_rowcol3(s1Len, s2Len);
#else
    parasail_result_t *result = parasail_result_new();
#endif
#endif
    int32_t i = 0;
    int32_t j = 0;
    int32_t end_query = 0;
    int32_t end_ref = 0;
    int64_t score = NEG_INF;
    int64_t matches = NEG_INF;
    int64_t similar = NEG_INF;
    int64_t length = NEG_INF;
    __m128i vNegInf = _mm_set1_epi64x(NEG_INF);
    __m128i vOpen = _mm_set1_epi64x(open);
    __m128i vGap  = _mm_set1_epi64x(gap);
    __m128i vZero = _mm_set1_epi64x(0);
    __m128i vOne = _mm_set1_epi64x(1);
    __m128i vN = _mm_set1_epi64x(N);
    __m128i vGapN = _mm_set1_epi64x(gap*N);
    __m128i vNegOne = _mm_set1_epi64x(-1);
    __m128i vI = _mm_set_epi64x(0, 1);
    __m128i vJreset = _mm_set_epi64x(0, -1);
    __m128i vMaxScore = vNegInf;
    __m128i vMaxMatch = vNegInf;
    __m128i vMaxSimilar = vNegInf;
    __m128i vMaxLength = vNegInf;
    __m128i vILimit = _mm_set1_epi64x(s1Len);
    __m128i vILimit1 = _mm_sub_epi64(vILimit, vOne);
constants()
    : shuf128(_mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2,
                           13,  9, 5, 1, 12,  8, 4, 0)),
      const2020(_mm_set1_epi64x(0x0000000200000000)),
      constFFFF(_mm_set1_epi32(0x0F))
{
}