static bool elhash_hash(
    elhash_return_value_t* ret,
    node const* full_nodes,
    elhash_light_t const light,
    uint64_t full_size,
    elhash_h256_t const header_hash,
    uint64_t const nonce
)
{
    if (full_size % MIX_WORDS != 0) {
        return false;
    }

    // pack hash and nonce together into first 40 bytes of s_mix
    assert(sizeof(node) * 8 == 512);
    node s_mix[MIX_NODES + 1];
    memcpy(s_mix[0].bytes, &header_hash, 32);
    fix_endian64(s_mix[0].double_words[4], nonce);

    // compute sha3-512 hash and replicate across mix
    SHA3_512(s_mix->bytes, s_mix->bytes, 40);
    fix_endian_arr32(s_mix[0].words, 16);

    node* const mix = s_mix + 1;
    for (uint32_t w = 0; w != MIX_WORDS; ++w) {
        mix->words[w] = s_mix[0].words[w % NODE_WORDS];
    }

    unsigned const page_size = sizeof(uint32_t) * MIX_WORDS;
    unsigned const num_full_pages = (unsigned) (full_size / page_size);

    for (unsigned i = 0; i != ELHASH_ACCESSES; ++i) {
        uint32_t const index = fnv_hash(s_mix->words[0] ^ i, mix->words[i % MIX_WORDS]) % num_full_pages;

        for (unsigned n = 0; n != MIX_NODES; ++n) {
            node const* dag_node;
            // tmp_node must be declared in this scope: dag_node may still point
            // at it after the else-branch ends (declaring it inside that branch,
            // as before, left dag_node dangling).
            node tmp_node;
            if (full_nodes) {
                dag_node = &full_nodes[MIX_NODES * index + n];
            } else {
                elhash_calculate_dag_item(&tmp_node, index * MIX_NODES + n, light);
                dag_node = &tmp_node;
            }

#if defined(_M_X64) && ENABLE_SSE
            {
                __m128i fnv_prime = _mm_set1_epi32(FNV_PRIME);
                __m128i xmm0 = _mm_mullo_epi32(fnv_prime, mix[n].xmm[0]);
                __m128i xmm1 = _mm_mullo_epi32(fnv_prime, mix[n].xmm[1]);
                __m128i xmm2 = _mm_mullo_epi32(fnv_prime, mix[n].xmm[2]);
                __m128i xmm3 = _mm_mullo_epi32(fnv_prime, mix[n].xmm[3]);
                mix[n].xmm[0] = _mm_xor_si128(xmm0, dag_node->xmm[0]);
                mix[n].xmm[1] = _mm_xor_si128(xmm1, dag_node->xmm[1]);
                mix[n].xmm[2] = _mm_xor_si128(xmm2, dag_node->xmm[2]);
                mix[n].xmm[3] = _mm_xor_si128(xmm3, dag_node->xmm[3]);
            }
#else
            {
                for (unsigned w = 0; w != NODE_WORDS; ++w) {
                    mix[n].words[w] = fnv_hash(mix[n].words[w], dag_node->words[w]);
                }
            }
#endif
        }
    }

    // compress mix
    for (uint32_t w = 0; w != MIX_WORDS; w += 4) {
        uint32_t reduction = mix->words[w + 0];
        reduction = reduction * FNV_PRIME ^ mix->words[w + 1];
        reduction = reduction * FNV_PRIME ^ mix->words[w + 2];
        reduction = reduction * FNV_PRIME ^ mix->words[w + 3];
        mix->words[w / 4] = reduction;
    }

    fix_endian_arr32(mix->words, MIX_WORDS / 4);
    memcpy(&ret->mix_hash, mix->bytes, 32);
    // final Keccak hash
    SHA3_256(&ret->result, s_mix->bytes, 64 + 32); // Keccak-256(s + compressed_mix)
    return true;
}
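/* The mixing loop above relies on fnv_hash, which is not defined in this
 * excerpt. A minimal sketch, assuming the usual 32-bit FNV prime used by
 * Ethash-style DAG algorithms (an assumption, not taken from this file; it
 * would need to appear before the function above to compile): */
#define FNV_PRIME 0x01000193U

static inline uint32_t fnv_hash(uint32_t const x, uint32_t const y)
{
    // (x * FNV_PRIME) ^ y -- the same combine step the SSE branch vectorizes.
    return x * FNV_PRIME ^ y;
}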
static inline void xor_salsa8_sse2(__m128i B[4], const __m128i Bx[4])
{
    __m128i X0, X1, X2, X3;
    __m128i T;
    int i;

    X0 = B[0] = _mm_xor_si128(B[0], Bx[0]);
    X1 = B[1] = _mm_xor_si128(B[1], Bx[1]);
    X2 = B[2] = _mm_xor_si128(B[2], Bx[2]);
    X3 = B[3] = _mm_xor_si128(B[3], Bx[3]);

    for (i = 0; i < 8; i += 2) {
        /* Operate on "columns". */
        T = _mm_add_epi32(X0, X3);
        X1 = _mm_xor_si128(X1, _mm_slli_epi32(T, 7));
        X1 = _mm_xor_si128(X1, _mm_srli_epi32(T, 25));
        T = _mm_add_epi32(X1, X0);
        X2 = _mm_xor_si128(X2, _mm_slli_epi32(T, 9));
        X2 = _mm_xor_si128(X2, _mm_srli_epi32(T, 23));
        T = _mm_add_epi32(X2, X1);
        X3 = _mm_xor_si128(X3, _mm_slli_epi32(T, 13));
        X3 = _mm_xor_si128(X3, _mm_srli_epi32(T, 19));
        T = _mm_add_epi32(X3, X2);
        X0 = _mm_xor_si128(X0, _mm_slli_epi32(T, 18));
        X0 = _mm_xor_si128(X0, _mm_srli_epi32(T, 14));

        /* Rearrange data. */
        X1 = _mm_shuffle_epi32(X1, 0x93);
        X2 = _mm_shuffle_epi32(X2, 0x4E);
        X3 = _mm_shuffle_epi32(X3, 0x39);

        /* Operate on "rows". */
        T = _mm_add_epi32(X0, X1);
        X3 = _mm_xor_si128(X3, _mm_slli_epi32(T, 7));
        X3 = _mm_xor_si128(X3, _mm_srli_epi32(T, 25));
        T = _mm_add_epi32(X3, X0);
        X2 = _mm_xor_si128(X2, _mm_slli_epi32(T, 9));
        X2 = _mm_xor_si128(X2, _mm_srli_epi32(T, 23));
        T = _mm_add_epi32(X2, X3);
        X1 = _mm_xor_si128(X1, _mm_slli_epi32(T, 13));
        X1 = _mm_xor_si128(X1, _mm_srli_epi32(T, 19));
        T = _mm_add_epi32(X1, X2);
        X0 = _mm_xor_si128(X0, _mm_slli_epi32(T, 18));
        X0 = _mm_xor_si128(X0, _mm_srli_epi32(T, 14));

        /* Rearrange data. */
        X1 = _mm_shuffle_epi32(X1, 0x39);
        X2 = _mm_shuffle_epi32(X2, 0x4E);
        X3 = _mm_shuffle_epi32(X3, 0x93);
    }

    B[0] = _mm_add_epi32(B[0], X0);
    B[1] = _mm_add_epi32(B[1], X1);
    B[2] = _mm_add_epi32(B[2], X2);
    B[3] = _mm_add_epi32(B[3], X3);
}
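/* Each _mm_slli_epi32/_mm_srli_epi32 pair above spells out a 32-bit rotate,
 * since SSE2 has no vector rotate instruction. The scalar identity being
 * used (XOR behaves like OR here because the shifted-out bits are zero): */
#include <stdint.h>

static inline uint32_t rotl32(uint32_t x, int n)
{
    return (x << n) | (x >> (32 - n)); /* e.g. rotl32(t, 7) == (t << 7) ^ (t >> 25) */
}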
// Simple quantization
static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
                             int n, const VP8Matrix* const mtx) {
  const __m128i max_coeff_2047 = _mm_set1_epi16(2047);
  const __m128i zero = _mm_set1_epi16(0);
  __m128i sign0, sign8;
  __m128i coeff0, coeff8;
  __m128i out0, out8;
  __m128i packed_out;

  // Load all inputs.
  // TODO(cduvivier): Make variable declarations and allocations aligned so that
  //                  we can use _mm_load_si128 instead of _mm_loadu_si128.
  __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
  __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
  const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[0]);
  const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[8]);
  const __m128i iq0 = _mm_loadu_si128((__m128i*)&mtx->iq_[0]);
  const __m128i iq8 = _mm_loadu_si128((__m128i*)&mtx->iq_[8]);
  const __m128i bias0 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]);
  const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]);
  const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]);
  const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]);
  const __m128i zthresh0 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[0]);
  const __m128i zthresh8 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[8]);

  // sign(in) = in >> 15  (0x0000 if positive, 0xffff if negative)
  sign0 = _mm_srai_epi16(in0, 15);
  sign8 = _mm_srai_epi16(in8, 15);

  // coeff = abs(in) = (in ^ sign) - sign
  coeff0 = _mm_xor_si128(in0, sign0);
  coeff8 = _mm_xor_si128(in8, sign8);
  coeff0 = _mm_sub_epi16(coeff0, sign0);
  coeff8 = _mm_sub_epi16(coeff8, sign8);

  // coeff = abs(in) + sharpen
  coeff0 = _mm_add_epi16(coeff0, sharpen0);
  coeff8 = _mm_add_epi16(coeff8, sharpen8);

  // if (coeff > 2047) coeff = 2047
  coeff0 = _mm_min_epi16(coeff0, max_coeff_2047);
  coeff8 = _mm_min_epi16(coeff8, max_coeff_2047);

  // out = (coeff * iQ + B) >> QFIX;
  {
    // doing calculations with 32b precision (QFIX=17)
    // out = (coeff * iQ)
    __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
    __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
    __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
    __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
    __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
    __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);

    // expand bias from 16b to 32b
    __m128i bias_00 = _mm_unpacklo_epi16(bias0, zero);
    __m128i bias_04 = _mm_unpackhi_epi16(bias0, zero);
    __m128i bias_08 = _mm_unpacklo_epi16(bias8, zero);
    __m128i bias_12 = _mm_unpackhi_epi16(bias8, zero);

    // out = (coeff * iQ + B)
    out_00 = _mm_add_epi32(out_00, bias_00);
    out_04 = _mm_add_epi32(out_04, bias_04);
    out_08 = _mm_add_epi32(out_08, bias_08);
    out_12 = _mm_add_epi32(out_12, bias_12);

    // out = (coeff * iQ + B) >> QFIX;
    out_00 = _mm_srai_epi32(out_00, QFIX);
    out_04 = _mm_srai_epi32(out_04, QFIX);
    out_08 = _mm_srai_epi32(out_08, QFIX);
    out_12 = _mm_srai_epi32(out_12, QFIX);

    // pack result as 16b
    out0 = _mm_packs_epi32(out_00, out_04);
    out8 = _mm_packs_epi32(out_08, out_12);
  }

  // get sign back (if (sign[j]) out_n = -out_n)
  out0 = _mm_xor_si128(out0, sign0);
  out8 = _mm_xor_si128(out8, sign8);
  out0 = _mm_sub_epi16(out0, sign0);
  out8 = _mm_sub_epi16(out8, sign8);

  // in = out * Q
  in0 = _mm_mullo_epi16(out0, q0);
  in8 = _mm_mullo_epi16(out8, q8);

  // if (coeff <= mtx->zthresh_) {in=0; out=0;}
  {
    __m128i cmp0 = _mm_cmpgt_epi16(coeff0, zthresh0);
    __m128i cmp8 = _mm_cmpgt_epi16(coeff8, zthresh8);
    in0 = _mm_and_si128(in0, cmp0);
    in8 = _mm_and_si128(in8, cmp8);
    _mm_storeu_si128((__m128i*)&in[0], in0);
    _mm_storeu_si128((__m128i*)&in[8], in8);
    out0 = _mm_and_si128(out0, cmp0);
    out8 = _mm_and_si128(out8, cmp8);
  }

  // zigzag the output before storing it.
  //
  // The zigzag pattern can almost be reproduced with a small sequence of
  // shuffles. After it, we only need to swap the 7th (ending up in third
  // position instead of twelfth) and 8th values.
  {
    __m128i outZ0, outZ8;
    outZ0 = _mm_shufflehi_epi16(out0, _MM_SHUFFLE(2, 1, 3, 0));
    outZ0 = _mm_shuffle_epi32(outZ0, _MM_SHUFFLE(3, 1, 2, 0));
    outZ0 = _mm_shufflehi_epi16(outZ0, _MM_SHUFFLE(3, 1, 0, 2));
    outZ8 = _mm_shufflelo_epi16(out8, _MM_SHUFFLE(3, 0, 2, 1));
    outZ8 = _mm_shuffle_epi32(outZ8, _MM_SHUFFLE(3, 1, 2, 0));
    outZ8 = _mm_shufflelo_epi16(outZ8, _MM_SHUFFLE(1, 3, 2, 0));
    _mm_storeu_si128((__m128i*)&out[0], outZ0);
    _mm_storeu_si128((__m128i*)&out[8], outZ8);
    packed_out = _mm_packs_epi16(outZ0, outZ8);
  }
  {
    const int16_t outZ_12 = out[12];
    const int16_t outZ_3 = out[3];
    out[3] = outZ_12;
    out[12] = outZ_3;
  }

  // detect if all 'out' values are zeroes or not
  {
    int32_t tmp[4];
    _mm_storeu_si128((__m128i*)tmp, packed_out);
    if (n) {
      tmp[0] &= ~0xff;
    }
    return (tmp[3] || tmp[2] || tmp[1] || tmp[0]);
  }
}
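/* For reference, the shuffle sequence plus the final 3<->12 swap reproduces
 * the standard VP8 4x4 zigzag order (kZigzag below is the well-known constant,
 * not an identifier from this file): out[n] ends up holding the quantized
 * coefficient at natural position kZigzag[n]. */
static const int kZigzag[16] = {
    0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};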
__m128i operator^(sse_vector a, sse_vector b) { return _mm_xor_si128(a.v, b.v); }
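/* This C++ overload only compiles cleanly if sse_vector converts to and from
 * __m128i. A hypothetical sketch of such a wrapper (the real definition is not
 * shown in this excerpt; only the .v member is known from the line above): */
struct sse_vector {
    __m128i v;
    sse_vector(__m128i x) : v(x) {}       // implicit: lets raw __m128i results convert back
    operator __m128i() const { return v; }
};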
int bit_vec_filter_m128_sse11(uint8_t *read_vec, uint8_t *ref_vec, int length,
        int max_error)
{
    const __m128i zero_mask = _mm_set1_epi8(0x00);
    const __m128i one_mask = _mm_set1_epi8(0xff);

    int total_byte = (length - 1) / BYTE_BASE_NUM11 + 1;
    int total_difference = 0;

    //Start iteration
    int i, j;
    //read data
    __m128i prev_read_XMM = _mm_set1_epi8(0x0);
    __m128i curr_read_XMM = *((__m128i *) (read_vec));
    //ref data
    __m128i prev_ref_XMM = _mm_set1_epi8(0x0);
    __m128i curr_ref_XMM = *((__m128i *) (ref_vec));
    __m128i read_XMM;
    __m128i ref_XMM;
    __m128i temp_diff_XMM;
    __m128i diff_XMM;
    __m128i mask;

    for (i = 0; i < total_byte; i += SSE_BYTE_NUM) {
        curr_read_XMM = *((__m128i *) (read_vec + i));
        curr_ref_XMM = *((__m128i *) (ref_vec + i));

        diff_XMM = _mm_xor_si128(curr_read_XMM, curr_ref_XMM);
        diff_XMM = xor11complement_sse(diff_XMM);

        if (i + SSE_BYTE_NUM >= total_byte) {
            if (length % SSE_BASE_NUM11) {
                mask = _mm_load_si128((__m128i *) (MASK_SSE_END11
                        + (length % SSE_BASE_NUM11) * SSE_BYTE_NUM));
                diff_XMM = _mm_and_si128(mask, diff_XMM);
            }
        }

        for (j = 1; j <= max_error; j++) {
            //Right shift read
            read_XMM = shift_right_sse11(prev_read_XMM, curr_read_XMM, j);
            temp_diff_XMM = _mm_xor_si128(read_XMM, curr_ref_XMM);
            temp_diff_XMM = xor11complement_sse(temp_diff_XMM);
            if (i == 0) {
                mask = _mm_load_si128((__m128i *) (MASK_SSE_BEG11
                        + (j - 1) * SSE_BYTE_NUM));
                temp_diff_XMM = _mm_and_si128(mask, temp_diff_XMM);
            }
            if (i + SSE_BYTE_NUM >= total_byte) {
                if (length % SSE_BASE_NUM11) {
                    mask = _mm_load_si128((__m128i *) (MASK_SSE_END11
                            + (length % SSE_BASE_NUM11) * SSE_BYTE_NUM));
                    temp_diff_XMM = _mm_and_si128(mask, temp_diff_XMM);
                }
            }
            diff_XMM = _mm_and_si128(diff_XMM, temp_diff_XMM);

            //Right shift ref
            ref_XMM = shift_right_sse11(prev_ref_XMM, curr_ref_XMM, j);
            temp_diff_XMM = _mm_xor_si128(curr_read_XMM, ref_XMM);
            temp_diff_XMM = xor11complement_sse(temp_diff_XMM);
            if (i == 0) {
                mask = _mm_load_si128((__m128i *) (MASK_SSE_BEG11
                        + (j - 1) * SSE_BYTE_NUM));
                temp_diff_XMM = _mm_and_si128(mask, temp_diff_XMM);
            }
            if (i + SSE_BYTE_NUM >= total_byte) {
                if (length % SSE_BASE_NUM11) {
                    mask = _mm_load_si128((__m128i *) (MASK_SSE_END11
                            + (length % SSE_BASE_NUM11) * SSE_BYTE_NUM));
                    temp_diff_XMM = _mm_and_si128(mask, temp_diff_XMM);
                }
            }
            diff_XMM = _mm_and_si128(diff_XMM, temp_diff_XMM);
        }

        total_difference += popcount11_m128i_sse(diff_XMM);

        prev_read_XMM = curr_read_XMM;
        prev_ref_XMM = curr_ref_XMM;

        if (total_difference > max_error)
            return 0;
    }
    return 1;
}
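/* popcount11_m128i_sse is not defined in this excerpt. For illustration, a
 * plain 128-bit population count (SSE4.1/POPCNT); the real helper presumably
 * also collapses the 2-bit base encoding implied by the *11 naming, so treat
 * this as a sketch of the counting step only: */
#include <nmmintrin.h>

static inline int popcount_m128i(__m128i v)
{
    // Sum the set bits of the two 64-bit halves.
    return (int)(_mm_popcnt_u64((uint64_t)_mm_cvtsi128_si64(v)) +
                 _mm_popcnt_u64((uint64_t)_mm_extract_epi64(v, 1)));
}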
static void my_YCbCr_to_RGB(stbi_uc *out, stbi_uc const *y, stbi_uc const *pcb,
        stbi_uc const *pcr, int count, int step)
{
    int i = 0;

    if (step == 4) {
        // this is a fairly straightforward implementation and not super-optimized.
        __m128i signflip  = _mm_set1_epi8(-0x80);
        __m128i cr_const0 = _mm_set1_epi16((short) ( 1.40200f*4096.0f));
        __m128i cr_const1 = _mm_set1_epi16((short) (-0.71414f*4096.0f));
        __m128i cb_const0 = _mm_set1_epi16((short) (-0.34414f*4096.0f));
        __m128i cb_const1 = _mm_set1_epi16((short) ( 1.77200f*4096.0f));
        __m128i y_bias = _mm_set1_epi16(8);
        __m128i xw = _mm_set1_epi16(255);

        for (; i+7 < count; i += 8) {
            // load
            __m128i zero = _mm_setzero_si128();
            __m128i y_bytes  = _mm_loadl_epi64((__m128i *) (y+i));
            __m128i cr_bytes = _mm_loadl_epi64((__m128i *) (pcr+i));
            __m128i cb_bytes = _mm_loadl_epi64((__m128i *) (pcb+i));
            __m128i cr_bias = _mm_xor_si128(cr_bytes, signflip); // -128
            __m128i cb_bias = _mm_xor_si128(cb_bytes, signflip); // -128

            // unpack to short (and left-shift cr, cb by 8)
            __m128i yw  = _mm_unpacklo_epi8(y_bytes, zero);
            __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_bias);
            __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_bias);

            // color transform
            __m128i yws = _mm_slli_epi16(yw, 4);
            __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw);
            __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw);
            __m128i ywb = _mm_add_epi16(yws, y_bias);
            __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1);
            __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1);
            __m128i rws = _mm_add_epi16(cr0, ywb);
            __m128i gwt = _mm_add_epi16(cb0, ywb);
            __m128i bws = _mm_add_epi16(ywb, cb1);
            __m128i gws = _mm_add_epi16(gwt, cr1);

            // descale
            __m128i rw = _mm_srai_epi16(rws, 4);
            __m128i bw = _mm_srai_epi16(bws, 4);
            __m128i gw = _mm_srai_epi16(gws, 4);

            // back to byte, set up for transpose
            __m128i brb = _mm_packus_epi16(rw, bw);
            __m128i gxb = _mm_packus_epi16(gw, xw);

            // transpose to interleave channels
            __m128i t0 = _mm_unpacklo_epi8(brb, gxb);
            __m128i t1 = _mm_unpackhi_epi8(brb, gxb);
            __m128i o0 = _mm_unpacklo_epi16(t0, t1);
            __m128i o1 = _mm_unpackhi_epi16(t0, t1);

            // store
            _mm_storeu_si128((__m128i *) (out +  0), o0);
            _mm_storeu_si128((__m128i *) (out + 16), o1);
            out += 32;
        }
    }

    for (; i < count; ++i) {
        int y_fixed = (y[i] << 16) + 32768; // rounding
        int r, g, b;
        int cr = pcr[i] - 128;
        int cb = pcb[i] - 128;
        r = y_fixed + cr * float2fixed(1.40200f);
        g = y_fixed - cr * float2fixed(0.71414f) - cb * float2fixed(0.34414f);
        b = y_fixed + cb * float2fixed(1.77200f);
        r >>= 16;
        g >>= 16;
        b >>= 16;
        if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; }
        if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; }
        if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; }
        out[0] = (stbi_uc) r;
        out[1] = (stbi_uc) g;
        out[2] = (stbi_uc) b;
        out[3] = 255;
        out += step;
    }
}
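/* float2fixed is not defined in this excerpt. Given the `(y[i] << 16)` bias
 * and the `>> 16` descale in the scalar tail above, a 16.16 fixed-point
 * definition is the consistent assumption (shown for reference; it would need
 * to be defined before the function above): */
#define float2fixed(x)  ((int)((x) * 65536.0f + 0.5f))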
/**
 * Performs an absorb operation for a single block (BLOCK_LEN_BLAKE2_SAFE_INT64
 * words of type uint64_t), using Blake2b's G function as the internal permutation
 *
 * @param state The current state of the sponge
 * @param in    The block to be absorbed (BLOCK_LEN_BLAKE2_SAFE_INT64 words)
 */
inline void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in)
{
    //XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with the current state
#if defined __AVX2__
    __m256i state_v[4], in_v[2];

    state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) );
    in_v   [0] = _mm256_loadu_si256( (__m256i*)(&in[0]) );
    state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) );
    in_v   [1] = _mm256_loadu_si256( (__m256i*)(&in[4]) );
    state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) );
    state_v[3] = _mm256_load_si256( (__m256i*)(&state[12]) );

    state_v[0] = _mm256_xor_si256( state_v[0], in_v[0] );
    state_v[1] = _mm256_xor_si256( state_v[1], in_v[1] );

    LYRA_12_ROUNDS_AVX2( state_v[0], state_v[1], state_v[2], state_v[3] );

    _mm256_store_si256( (__m256i*)&state[0],  state_v[0] );
    _mm256_store_si256( (__m256i*)&state[4],  state_v[1] );
    _mm256_store_si256( (__m256i*)&state[8],  state_v[2] );
    _mm256_store_si256( (__m256i*)&state[12], state_v[3] );

#elif defined __AVX__
    __m128i state_v[4], in_v[4];

    state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) );
    state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) );
    state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) );
    state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) );

    in_v[0] = _mm_load_si128( (__m128i*)(&in[0]) );
    in_v[1] = _mm_load_si128( (__m128i*)(&in[2]) );
    in_v[2] = _mm_load_si128( (__m128i*)(&in[4]) );
    in_v[3] = _mm_load_si128( (__m128i*)(&in[6]) );

    _mm_store_si128( (__m128i*)(&state[0]), _mm_xor_si128( state_v[0], in_v[0] ) );
    _mm_store_si128( (__m128i*)(&state[2]), _mm_xor_si128( state_v[1], in_v[1] ) );
    _mm_store_si128( (__m128i*)(&state[4]), _mm_xor_si128( state_v[2], in_v[2] ) );
    _mm_store_si128( (__m128i*)(&state[6]), _mm_xor_si128( state_v[3], in_v[3] ) );

    //Applies the transformation f to the sponge's state
    blake2bLyra(state);

#else
    state[0] ^= in[0];
    state[1] ^= in[1];
    state[2] ^= in[2];
    state[3] ^= in[3];
    state[4] ^= in[4];
    state[5] ^= in[5];
    state[6] ^= in[6];
    state[7] ^= in[7];

    //Applies the transformation f to the sponge's state
    blake2bLyra(state);
#endif
}
static __m128i aes192_keyexpand_2(__m128i key, __m128i key2)
{
    key = _mm_shuffle_epi32(key, 0xff);
    key2 = _mm_xor_si128(key2, _mm_slli_si128(key2, 4));
    return _mm_xor_si128(key, key2);
}
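/* Hedged sketch of how a helper like this is typically driven (an assumed
 * pairing, not code from this file): the first half of each AES-192 expansion
 * step feeds the _mm_aeskeygenassist_si128 result, broadcast with 0x55, into
 * the schedule (aes128_keyexpand is defined later in this file), and
 * aes192_keyexpand_2 then derives the trailing 64 bits. */
#define KEYEXP192(K1, K2, I) \
    _mm_xor_si128(aes128_keyexpand(K1), \
        _mm_shuffle_epi32(_mm_aeskeygenassist_si128(K2, I), 0x55))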
/**
 * salsa20_8(B):
 * Apply the salsa20/8 core to the provided block.
 */
static void salsa20_8(__m128i B[4])
{
    __m128i X0, X1, X2, X3;
    __m128i T;
    size_t i;

    X0 = B[0];
    X1 = B[1];
    X2 = B[2];
    X3 = B[3];

    for (i = 0; i < 8; i += 2) {
        /* Operate on "columns". */
        T = _mm_add_epi32(X0, X3);
        X1 = _mm_xor_si128(X1, _mm_slli_epi32(T, 7));
        X1 = _mm_xor_si128(X1, _mm_srli_epi32(T, 25));
        T = _mm_add_epi32(X1, X0);
        X2 = _mm_xor_si128(X2, _mm_slli_epi32(T, 9));
        X2 = _mm_xor_si128(X2, _mm_srli_epi32(T, 23));
        T = _mm_add_epi32(X2, X1);
        X3 = _mm_xor_si128(X3, _mm_slli_epi32(T, 13));
        X3 = _mm_xor_si128(X3, _mm_srli_epi32(T, 19));
        T = _mm_add_epi32(X3, X2);
        X0 = _mm_xor_si128(X0, _mm_slli_epi32(T, 18));
        X0 = _mm_xor_si128(X0, _mm_srli_epi32(T, 14));

        /* Rearrange data. */
        X1 = _mm_shuffle_epi32(X1, 0x93);
        X2 = _mm_shuffle_epi32(X2, 0x4E);
        X3 = _mm_shuffle_epi32(X3, 0x39);

        /* Operate on "rows". */
        T = _mm_add_epi32(X0, X1);
        X3 = _mm_xor_si128(X3, _mm_slli_epi32(T, 7));
        X3 = _mm_xor_si128(X3, _mm_srli_epi32(T, 25));
        T = _mm_add_epi32(X3, X0);
        X2 = _mm_xor_si128(X2, _mm_slli_epi32(T, 9));
        X2 = _mm_xor_si128(X2, _mm_srli_epi32(T, 23));
        T = _mm_add_epi32(X2, X3);
        X1 = _mm_xor_si128(X1, _mm_slli_epi32(T, 13));
        X1 = _mm_xor_si128(X1, _mm_srli_epi32(T, 19));
        T = _mm_add_epi32(X1, X2);
        X0 = _mm_xor_si128(X0, _mm_slli_epi32(T, 18));
        X0 = _mm_xor_si128(X0, _mm_srli_epi32(T, 14));

        /* Rearrange data. */
        X1 = _mm_shuffle_epi32(X1, 0x39);
        X2 = _mm_shuffle_epi32(X2, 0x4E);
        X3 = _mm_shuffle_epi32(X3, 0x93);
    }

    B[0] = _mm_add_epi32(B[0], X0);
    B[1] = _mm_add_epi32(B[1], X1);
    B[2] = _mm_add_epi32(B[2], X2);
    B[3] = _mm_add_epi32(B[3], X3);
}
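/* A hedged sketch of where salsa20_8 sits in scrypt: one BlockMix pass for
 * r = 1, with each 64-byte half-block held as __m128i[4] in whatever word
 * order the surrounding code uses consistently (layout assumed, not taken
 * from this file): */
static void blockmix_salsa8_r1(__m128i B[8])
{
    __m128i X[4];
    int k;
    for (k = 0; k < 4; k++) X[k] = B[4 + k];                      /* X = B_1 */
    for (k = 0; k < 4; k++) X[k] = _mm_xor_si128(X[k], B[k]);     /* X ^= B_0 */
    salsa20_8(X);                                                 /* new B_0 */
    for (k = 0; k < 4; k++) B[k] = X[k];
    for (k = 0; k < 4; k++) X[k] = _mm_xor_si128(X[k], B[4 + k]); /* X ^= B_1 */
    salsa20_8(X);                                                 /* new B_1 */
    for (k = 0; k < 4; k++) B[4 + k] = X[k];
}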
/**
 * Performs an absorb operation for a single block (BLOCK_LEN_INT64 words
 * of type uint64_t), using Blake2b's G function as the internal permutation
 *
 * @param state The current state of the sponge
 * @param in    The block to be absorbed (BLOCK_LEN_INT64 words)
 */
inline void absorbBlock(uint64_t *state, const uint64_t *in)
{
#if defined __AVX2__
    __m256i state_v[4], in_v[3];

    // only state is guaranteed aligned 256
    state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) );
    in_v   [0] = _mm256_loadu_si256( (__m256i*)(&in[0]) );
    state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) );
    in_v   [1] = _mm256_loadu_si256( (__m256i*)(&in[4]) );
    state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) );
    in_v   [2] = _mm256_loadu_si256( (__m256i*)(&in[8]) );
    state_v[3] = _mm256_load_si256( (__m256i*)(&state[12]) );

    state_v[0] = _mm256_xor_si256( state_v[0], in_v[0] );
    state_v[1] = _mm256_xor_si256( state_v[1], in_v[1] );
    state_v[2] = _mm256_xor_si256( state_v[2], in_v[2] );

    LYRA_12_ROUNDS_AVX2( state_v[0], state_v[1], state_v[2], state_v[3] );

    _mm256_store_si256( (__m256i*)&state[0],  state_v[0] );
    _mm256_store_si256( (__m256i*)&state[4],  state_v[1] );
    _mm256_store_si256( (__m256i*)&state[8],  state_v[2] );
    _mm256_store_si256( (__m256i*)&state[12], state_v[3] );

#elif defined __AVX__
    __m128i state_v[6], in_v[6];

    state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) );
    state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) );
    state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) );
    state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) );
    state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) );
    state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) );

    in_v[0] = _mm_load_si128( (__m128i*)(&in[0]) );
    in_v[1] = _mm_load_si128( (__m128i*)(&in[2]) );
    in_v[2] = _mm_load_si128( (__m128i*)(&in[4]) );
    in_v[3] = _mm_load_si128( (__m128i*)(&in[6]) );
    in_v[4] = _mm_load_si128( (__m128i*)(&in[8]) );
    in_v[5] = _mm_load_si128( (__m128i*)(&in[10]) );

    // do blake2bLyra without init
    // LYRA_ROUND_AVX2( state_v )
    _mm_store_si128( (__m128i*)(&state[0]),  _mm_xor_si128( state_v[0], in_v[0] ) );
    _mm_store_si128( (__m128i*)(&state[2]),  _mm_xor_si128( state_v[1], in_v[1] ) );
    _mm_store_si128( (__m128i*)(&state[4]),  _mm_xor_si128( state_v[2], in_v[2] ) );
    _mm_store_si128( (__m128i*)(&state[6]),  _mm_xor_si128( state_v[3], in_v[3] ) );
    _mm_store_si128( (__m128i*)(&state[8]),  _mm_xor_si128( state_v[4], in_v[4] ) );
    _mm_store_si128( (__m128i*)(&state[10]), _mm_xor_si128( state_v[5], in_v[5] ) );

    //Applies the transformation f to the sponge's state
    blake2bLyra(state);

#else
    //XORs the first BLOCK_LEN_INT64 words of "in" with the current state
    state[0]  ^= in[0];
    state[1]  ^= in[1];
    state[2]  ^= in[2];
    state[3]  ^= in[3];
    state[4]  ^= in[4];
    state[5]  ^= in[5];
    state[6]  ^= in[6];
    state[7]  ^= in[7];
    state[8]  ^= in[8];
    state[9]  ^= in[9];
    state[10] ^= in[10];
    state[11] ^= in[11];

    //Applies the transformation f to the sponge's state
    blake2bLyra(state);
#endif
}
static int HafCpu_Histogram16Bins_DATA_U8
(
    vx_uint32 * dstHist,
    vx_uint8    distOffset,
    vx_uint8    distWindow,
    vx_uint32   srcWidth,
    vx_uint32   srcHeight,
    vx_uint8  * pSrcImage,
    vx_uint32   srcImageStrideInBytes
)
{
    // offset: to convert the range from 0..255 to -128..127, because SSE does not have compare instructions for unsigned bytes
    // thresh: source threshold in -128..127 range
    __m128i offset = _mm_set1_epi8((char)0x80);
    __m128i T0 = _mm_set1_epi8((char)(((distOffset ? distOffset : distWindow) - 1) ^ 0x80));
    __m128i dT = _mm_set1_epi8((char)distWindow);
    __m128i onemask = _mm_set1_epi8((char)1);
    // process one pixel row at a time that counts "pixel < srcThreshold"
    vx_uint32 count[16] = { 0 };
    vx_uint8 * srcRow = pSrcImage;
    vx_uint32 width = (srcWidth + 15) >> 4;
    for (unsigned int y = 0; y < srcHeight; y++) {
        __m128i * src = (__m128i *)srcRow;
        __m128i count0 = _mm_set1_epi8((char)0);
        __m128i count1 = _mm_set1_epi8((char)0);
        __m128i count2 = _mm_set1_epi8((char)0);
        __m128i count3 = _mm_set1_epi8((char)0);
        for (unsigned int x = 0; x < width; x++) {
            __m128i pixels = _mm_load_si128(src++);
            pixels = _mm_xor_si128(pixels, offset);
            __m128i cmpout, Tnext = T0;
            // 0..3
            cmpout = _mm_cmpgt_epi8(pixels, Tnext);
            cmpout = _mm_and_si128(cmpout, onemask);
            cmpout = _mm_sad_epu8(cmpout, onemask);
            count0 = _mm_add_epi32(count0, cmpout);
            Tnext = _mm_add_epi8(Tnext, dT);
            cmpout = _mm_cmpgt_epi8(pixels, Tnext);
            cmpout = _mm_and_si128(cmpout, onemask);
            cmpout = _mm_sad_epu8(cmpout, onemask);
            cmpout = _mm_slli_epi64(cmpout, 16);
            count0 = _mm_add_epi32(count0, cmpout);
            Tnext = _mm_add_epi8(Tnext, dT);
            cmpout = _mm_cmpgt_epi8(pixels, Tnext);
            cmpout = _mm_and_si128(cmpout, onemask);
            cmpout = _mm_sad_epu8(cmpout, onemask);
            cmpout = _mm_slli_epi64(cmpout, 32);
            count0 = _mm_add_epi32(count0, cmpout);
            Tnext = _mm_add_epi8(Tnext, dT);
            cmpout = _mm_cmpgt_epi8(pixels, Tnext);
            cmpout = _mm_and_si128(cmpout, onemask);
            cmpout = _mm_sad_epu8(cmpout, onemask);
            cmpout = _mm_slli_epi64(cmpout, 48);
            count0 = _mm_add_epi32(count0, cmpout);
            // 4..7
            Tnext = _mm_add_epi8(Tnext, dT);
            cmpout = _mm_cmpgt_epi8(pixels, Tnext);
            cmpout = _mm_and_si128(cmpout, onemask);
            cmpout = _mm_sad_epu8(cmpout, onemask);
            count1 = _mm_add_epi32(count1, cmpout);
            Tnext = _mm_add_epi8(Tnext, dT);
            cmpout = _mm_cmpgt_epi8(pixels, Tnext);
            cmpout = _mm_and_si128(cmpout, onemask);
            cmpout = _mm_sad_epu8(cmpout, onemask);
            cmpout = _mm_slli_epi64(cmpout, 16);
            count1 = _mm_add_epi32(count1, cmpout);
            Tnext = _mm_add_epi8(Tnext, dT);
            cmpout = _mm_cmpgt_epi8(pixels, Tnext);
            cmpout = _mm_and_si128(cmpout, onemask);
            cmpout = _mm_sad_epu8(cmpout, onemask);
            cmpout = _mm_slli_epi64(cmpout, 32);
            count1 = _mm_add_epi32(count1, cmpout);
            Tnext = _mm_add_epi8(Tnext, dT);
            cmpout = _mm_cmpgt_epi8(pixels, Tnext);
            cmpout = _mm_and_si128(cmpout, onemask);
            cmpout = _mm_sad_epu8(cmpout, onemask);
            cmpout = _mm_slli_epi64(cmpout, 48);
            count1 = _mm_add_epi32(count1, cmpout);
            // 8..11
            Tnext = _mm_add_epi8(Tnext, dT);
            cmpout = _mm_cmpgt_epi8(pixels, Tnext);
            cmpout = _mm_and_si128(cmpout, onemask);
            cmpout = _mm_sad_epu8(cmpout, onemask);
            count2 = _mm_add_epi32(count2, cmpout);
            Tnext = _mm_add_epi8(Tnext, dT);
            cmpout = _mm_cmpgt_epi8(pixels, Tnext);
            cmpout = _mm_and_si128(cmpout, onemask);
            cmpout = _mm_sad_epu8(cmpout, onemask);
            cmpout = _mm_slli_epi64(cmpout, 16);
            count2 = _mm_add_epi32(count2, cmpout);
            Tnext = _mm_add_epi8(Tnext, dT);
            cmpout = _mm_cmpgt_epi8(pixels, Tnext);
            cmpout = _mm_and_si128(cmpout, onemask);
            cmpout = _mm_sad_epu8(cmpout, onemask);
            cmpout = _mm_slli_epi64(cmpout, 32);
            count2 = _mm_add_epi32(count2, cmpout);
            Tnext = _mm_add_epi8(Tnext, dT);
            cmpout = _mm_cmpgt_epi8(pixels, Tnext);
            cmpout = _mm_and_si128(cmpout, onemask);
            cmpout = _mm_sad_epu8(cmpout, onemask);
            cmpout = _mm_slli_epi64(cmpout, 48);
            count2 = _mm_add_epi32(count2, cmpout);
            // 12..15
            Tnext = _mm_add_epi8(Tnext, dT);
            cmpout = _mm_cmpgt_epi8(pixels, Tnext);
            cmpout = _mm_and_si128(cmpout, onemask);
            cmpout = _mm_sad_epu8(cmpout, onemask);
            count3 = _mm_add_epi32(count3, cmpout);
            Tnext = _mm_add_epi8(Tnext, dT);
            cmpout = _mm_cmpgt_epi8(pixels, Tnext);
            cmpout = _mm_and_si128(cmpout, onemask);
            cmpout = _mm_sad_epu8(cmpout, onemask);
            cmpout = _mm_slli_epi64(cmpout, 16);
            count3 = _mm_add_epi32(count3, cmpout);
            Tnext = _mm_add_epi8(Tnext, dT);
            cmpout = _mm_cmpgt_epi8(pixels, Tnext);
            cmpout = _mm_and_si128(cmpout, onemask);
            cmpout = _mm_sad_epu8(cmpout, onemask);
            cmpout = _mm_slli_epi64(cmpout, 32);
            count3 = _mm_add_epi32(count3, cmpout);
            Tnext = _mm_add_epi8(Tnext, dT);
            cmpout = _mm_cmpgt_epi8(pixels, Tnext);
            cmpout = _mm_and_si128(cmpout, onemask);
            cmpout = _mm_sad_epu8(cmpout, onemask);
            cmpout = _mm_slli_epi64(cmpout, 48);
            count3 = _mm_add_epi32(count3, cmpout);
        }
        srcRow += srcImageStrideInBytes;
        // move counts from count0..3 into count[]
        for (int i = 0; i < 4; i++) {
            count[ 0 + i] += M128I(count0).m128i_u16[i] + M128I(count0).m128i_u16[4 + i];
            count[ 4 + i] += M128I(count1).m128i_u16[i] + M128I(count1).m128i_u16[4 + i];
            count[ 8 + i] += M128I(count2).m128i_u16[i] + M128I(count2).m128i_u16[4 + i];
            count[12 + i] += M128I(count3).m128i_u16[i] + M128I(count3).m128i_u16[4 + i];
        }
    }
    // extract histogram from count
    if (distOffset == 0) {
        vx_uint32 last = (distWindow >= 16) ? srcWidth * srcHeight : count[15];
        for (int i = 14; i >= 0; i--) {
            count[i] = last - count[i];
            last -= count[i];
        }
        dstHist[0] = last;
        for (int i = 1; i < 16; i++)
            dstHist[i] = count[i - 1];
    }
    else {
        vx_uint32 last = srcWidth * srcHeight;
        for (int i = 15; i >= 0; i--) {
            count[i] = last - count[i];
            last -= count[i];
            dstHist[i] = count[i];
        }
    }
    return AGO_SUCCESS;
}
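/* The compare/mask/SAD pattern above is worth spelling out: after the
 * compare, each byte is 1 where pixel > T and 0 otherwise, and
 * _mm_sad_epu8(x, ones) sums |x - 1| per 8-byte half, i.e. it counts the
 * bytes that FAILED the compare. A minimal standalone sketch of the trick: */
static inline int count_le_bytes(__m128i v, __m128i t, __m128i ones)
{
    __m128i gt = _mm_cmpgt_epi8(v, t);    /* 0xff where v > t (signed) */
    __m128i b  = _mm_and_si128(gt, ones); /* 1 where v > t, else 0 */
    __m128i s  = _mm_sad_epu8(b, ones);   /* two 64-bit sums counting v <= t */
    return _mm_cvtsi128_si32(s) + _mm_extract_epi16(s, 4);
}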
long garbleCircuit(GarbledCircuit *garbledCircuit, InputLabels inputLabels,
        OutputMap outputMap)
{
    GarblingContext garblingContext;
    GarbledGate *garbledGate;
    GarbledTable *garbledTable;
    block tweak;
    long i, j, rnds = 10;
    block blocks[4];
    block keys[4];
    long lsb0, lsb1;
    int input0, input1, output;

    srand_sse(time(NULL));
    startTime = RDTSC;
    createInputLabels(inputLabels, garbledCircuit->n);
    garbledCircuit->id = getFreshId();

    for (i = 0; i < 2 * garbledCircuit->n; i += 2) {
        garbledCircuit->wires[i / 2].label0 = inputLabels[i];
        garbledCircuit->wires[i / 2].label1 = inputLabels[i + 1];
    }
    garbledTable = garbledCircuit->garbledTable;
    garblingContext.gateIndex = 0;
    garblingContext.wireIndex = garbledCircuit->n + 1;

    block key = randomBlock();
    block rkey = randomBlock();
    AES_KEY KR;
    AES_set_encrypt_key((unsigned char *) &rkey, 128, &KR);
    const __m128i *sched2 = ((__m128i *) (KR.rd_key));

    garblingContext.R = xorBlocks(garbledCircuit->wires[0].label0,
            garbledCircuit->wires[0].label1);
    garbledCircuit->globalKey = key;
    DKCipherInit(&key, &(garblingContext.dkCipherContext));
    // Take the key schedule only after DKCipherInit; the original read it
    // through a local, never-initialized DKCipherContext.
    const block *sched = ((block *) (garblingContext.dkCipherContext.K.rd_key));
    int tableIndex = 0;

    for (i = 0; i < garbledCircuit->q; i++) {
        garbledGate = &(garbledCircuit->garbledGates[i]);
        input0 = garbledGate->input0;
        input1 = garbledGate->input1;
        output = garbledGate->output;

#ifdef FREE_XOR
        if (garbledGate->type == XORGATE) {
            // Free-XOR: label1 == label0 XOR R on every wire, so both output
            // labels can be derived from input1's label0 alone.
            garbledCircuit->wires[output].label0 =
                    xorBlocks(garbledCircuit->wires[input0].label0,
                              garbledCircuit->wires[input1].label0);
            garbledCircuit->wires[output].label1 =
                    xorBlocks(garbledCircuit->wires[input0].label1,
                              garbledCircuit->wires[input1].label0);
            continue;
        }
#endif
        tweak = makeBlock(i, (long) 0);
        lsb0 = getLSB(garbledCircuit->wires[input0].label0);
        lsb1 = getLSB(garbledCircuit->wires[input1].label0);

        block val = _mm_xor_si128(tweak, sched[0]);
        for (j = 1; j < rnds; j++)
            val = _mm_aesenc_si128(val, sched2[j]);
        garbledCircuit->wires[garbledGate->output].label0 =
                _mm_aesenclast_si128(val, sched[j]);
        garbledCircuit->wires[garbledGate->output].label1 =
                xorBlocks(garblingContext.R,
                          garbledCircuit->wires[garbledGate->output].label0);

        block A0, A1, B0, B1;
        A0 = DOUBLE(garbledCircuit->wires[input0].label0);
        A1 = DOUBLE(garbledCircuit->wires[input0].label1);
        B0 = DOUBLE(DOUBLE(garbledCircuit->wires[input1].label0));
        B1 = DOUBLE(DOUBLE(garbledCircuit->wires[input1].label1));

        keys[0] = xorBlocks(xorBlocks(A0, B0), tweak);
        keys[1] = xorBlocks(xorBlocks(A0, B1), tweak);
        keys[2] = xorBlocks(xorBlocks(A1, B0), tweak);
        keys[3] = xorBlocks(xorBlocks(A1, B1), tweak);

        block *temp[2];
        temp[0] = &garbledCircuit->wires[garbledGate->output].label0;
        temp[1] = &garbledCircuit->wires[garbledGate->output].label1;

        // Select each table row's plaintext label from the gate's truth table
        // (bit bp of garbledGate->type).
        int bp = 0;
        blocks[0] = xorBlocks(keys[0], *(temp[(garbledGate->type & (1 << bp)) >> bp]));
        bp++;
        blocks[1] = xorBlocks(keys[1], *(temp[(garbledGate->type & (1 << bp)) >> bp]));
        bp++;
        blocks[2] = xorBlocks(keys[2], *(temp[(garbledGate->type & (1 << bp)) >> bp]));
        bp++;
        blocks[3] = xorBlocks(keys[3], *(temp[(garbledGate->type & (1 << bp)) >> bp]));

        AES_ecb_encrypt_blks_4(keys, &(garblingContext.dkCipherContext.K));

        garbledTable[tableIndex].table[2 * lsb0 + lsb1] = xorBlocks(blocks[0], keys[0]);
        garbledTable[tableIndex].table[2 * lsb0 + 1 - lsb1] = xorBlocks(blocks[1], keys[1]);
        garbledTable[tableIndex].table[2 * (1 - lsb0) + lsb1] = xorBlocks(blocks[2], keys[2]);
        garbledTable[tableIndex].table[2 * (1 - lsb0) + (1 - lsb1)] = xorBlocks(blocks[3], keys[3]);
        tableIndex++;
    }

    for (i = 0; i < garbledCircuit->m; i++) {
        outputMap[2 * i] = garbledCircuit->wires[garbledCircuit->outputs[i]].label0;
        outputMap[2 * i + 1] = garbledCircuit->wires[garbledCircuit->outputs[i]].label1;
    }
    endTime = RDTSC;
    return (endTime - startTime);
}
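/* Why the FREE_XOR branch above needs no garbled-table rows: every wire keeps
 * the invariant label1 == label0 XOR R, so for an XOR gate
 *     out0 = A0 XOR B0
 *     out1 = A1 XOR B0 = (A0 XOR R) XOR B0 = out0 XOR R,
 * and the evaluator can simply XOR whatever labels it holds. In terms of the
 * helpers above:
 *     block out0 = xorBlocks(A0, B0);
 *     block out1 = xorBlocks(out0, garblingContext.R);  // == xorBlocks(A1, B0)
 */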
long garbleCircuit(GarbledCircuit *garbledCircuit, InputLabels inputLabels,
        OutputMap outputMap)
{
    GarblingContext garblingContext;
    GarbledGate *garbledGate;
    GarbledTable *garbledTable;
    block tweak;
    long i, j, rnds = 10;
    block blocks[4];
    block keys[4];
    long lsb0, lsb1;
    int input0, input1, output;

    srand_sse(time(NULL));
    startTime = RDTSC;
    createInputLabels(inputLabels, garbledCircuit->n);
    garbledCircuit->id = getFreshId();

    for (i = 0; i < 2 * garbledCircuit->n; i += 2) {
        garbledCircuit->wires[i / 2].id = i + 1;
        garbledCircuit->wires[i / 2].label0 = inputLabels[i];
        garbledCircuit->wires[i / 2].label1 = inputLabels[i + 1];
    }
    garbledTable = garbledCircuit->garbledTable;
    garblingContext.gateIndex = 0;
    garblingContext.wireIndex = garbledCircuit->n + 1;

    block key = randomBlock();
    block rkey = randomBlock();
    AES_KEY KR;
    AES_set_encrypt_key((unsigned char *) &rkey, 128, &KR);
    const __m128i *sched2 = ((__m128i *) (KR.rd_key));

    garblingContext.R = xorBlocks(garbledCircuit->wires[0].label0,
            garbledCircuit->wires[0].label1);
    garbledCircuit->globalKey = key;
    DKCipherInit(&key, &(garblingContext.dkCipherContext));
    // Take the key schedule only after DKCipherInit; the original read it
    // through a local, never-initialized DKCipherContext.
    const __m128i *sched = ((__m128i *) (garblingContext.dkCipherContext.K.rd_key));
    int tableIndex = 0;

    for (i = 0; i < garbledCircuit->q; i++) {
        garbledGate = &(garbledCircuit->garbledGates[i]);
        input0 = garbledGate->input0;
        input1 = garbledGate->input1;
        output = garbledGate->output;

#ifdef FREE_XOR
        if (garbledGate->type == XORGATE) {
            garbledCircuit->wires[output].label0 =
                    xorBlocks(garbledCircuit->wires[input0].label0,
                              garbledCircuit->wires[input1].label0);
            garbledCircuit->wires[output].label1 =
                    xorBlocks(garbledCircuit->wires[input0].label1,
                              garbledCircuit->wires[input1].label0);
            continue;
        }
#endif
        tweak = makeBlock(i, (long) 0);
        lsb0 = getLSB(garbledCircuit->wires[input0].label0);
        lsb1 = getLSB(garbledCircuit->wires[input1].label0);

        char templ[20];
        char templ2[20];
        block val = _mm_xor_si128(tweak, sched[0]);
        for (j = 1; j < rnds; j++)
            val = _mm_aesenc_si128(val, sched2[j]);
        *((block *) templ) = _mm_aesenclast_si128(val, sched[j]);
        *((block *) templ2) = xorBlocks(*((block *) templ), garblingContext.R);
        TRUNCATE(templ);
        TRUNCATE(templ2);
        block *label0 = (block *) templ;
        block *label1 = (block *) templ2;
        garbledCircuit->wires[garbledGate->output].label0 = *((block *) templ);
        garbledCircuit->wires[garbledGate->output].label1 = *((block *) templ2);

        block A0, A1, B0, B1;
        A0 = DOUBLE(garbledCircuit->wires[input0].label0);
        A1 = DOUBLE(garbledCircuit->wires[input0].label1);
        B0 = DOUBLE(DOUBLE(garbledCircuit->wires[input1].label0));
        B1 = DOUBLE(DOUBLE(garbledCircuit->wires[input1].label1));

        keys[0] = xorBlocks(A0, B0);
        keys[0] = xorBlocks(keys[0], tweak);
        keys[1] = xorBlocks(A0, B1);
        keys[1] = xorBlocks(keys[1], tweak);
        keys[2] = xorBlocks(A1, B0);
        keys[2] = xorBlocks(keys[2], tweak);
        keys[3] = xorBlocks(A1, B1);
        keys[3] = xorBlocks(keys[3], tweak);

        if (garbledGate->type == ANDGATE) {
            blocks[0] = xorBlocks(keys[0], *label0);
            blocks[1] = xorBlocks(keys[1], *label0);
            blocks[2] = xorBlocks(keys[2], *label0);
            blocks[3] = xorBlocks(keys[3], *label1);
            goto write;
        }
        if (garbledGate->type == ORGATE) {
            blocks[0] = xorBlocks(keys[0], *label0);
            blocks[1] = xorBlocks(keys[1], *label1);
            blocks[2] = xorBlocks(keys[2], *label1);
            blocks[3] = xorBlocks(keys[3], *label1);
            goto write;
        }
        if (garbledGate->type == XORGATE) {
            blocks[0] = xorBlocks(keys[0], *label0);
            blocks[1] = xorBlocks(keys[1], *label1);
            blocks[2] = xorBlocks(keys[2], *label1);
            blocks[3] = xorBlocks(keys[3], *label0);
            goto write;
        }
        if (garbledGate->type == NOTGATE) {
            blocks[0] = xorBlocks(keys[0], *label1);
            blocks[1] = xorBlocks(keys[1], *label0);
            blocks[2] = xorBlocks(keys[2], *label1);
            blocks[3] = xorBlocks(keys[3], *label0);
            goto write;
        }

write:
        AES_ecb_encrypt_blks(keys, 4, &(garblingContext.dkCipherContext.K));

        char toWrite[4][16];
        *((block *) toWrite[0]) = xorBlocks(blocks[0], keys[0]);
        *((block *) toWrite[1]) = xorBlocks(blocks[1], keys[1]);
        *((block *) toWrite[2]) = xorBlocks(blocks[2], keys[2]);
        *((block *) toWrite[3]) = xorBlocks(blocks[3], keys[3]);

        // Copy only the first 5 shorts (80 bits) of each truncated entry.
        short *cpsrc;
        short *cpdst;
        cpsrc = (short *) toWrite[0];
        cpdst = (short *) &garbledTable[tableIndex].table[2 * lsb0 + lsb1];
        cpdst[0] = cpsrc[0]; cpdst[1] = cpsrc[1]; cpdst[2] = cpsrc[2];
        cpdst[3] = cpsrc[3]; cpdst[4] = cpsrc[4];
        cpsrc = (short *) toWrite[1];
        cpdst = (short *) &garbledTable[tableIndex].table[2 * (lsb0) + (1 - lsb1)];
        cpdst[0] = cpsrc[0]; cpdst[1] = cpsrc[1]; cpdst[2] = cpsrc[2];
        cpdst[3] = cpsrc[3]; cpdst[4] = cpsrc[4];
        cpsrc = (short *) toWrite[2];
        cpdst = (short *) &garbledTable[tableIndex].table[2 * (1 - lsb0) + (lsb1)];
        cpdst[0] = cpsrc[0]; cpdst[1] = cpsrc[1]; cpdst[2] = cpsrc[2];
        cpdst[3] = cpsrc[3]; cpdst[4] = cpsrc[4];
        cpsrc = (short *) toWrite[3];
        cpdst = (short *) &garbledTable[tableIndex].table[2 * (1 - lsb0) + (1 - lsb1)];
        cpdst[0] = cpsrc[0]; cpdst[1] = cpsrc[1]; cpdst[2] = cpsrc[2];
        cpdst[3] = cpsrc[3]; cpdst[4] = cpsrc[4];
        tableIndex++;
    }

    for (i = 0; i < garbledCircuit->m; i++) {
        outputMap[2 * i] = garbledCircuit->wires[garbledCircuit->outputs[i]].label0;
        outputMap[2 * i + 1] = garbledCircuit->wires[garbledCircuit->outputs[i]].label1;
    }
    endTime = RDTSC;
    return (endTime - startTime);
}
void InvMixColumns_sse(BYTE state[][4])
{
    // Each _mm_set_epi8 gathers one GF(2^8) multiple of a state row from the
    // gf_mul lookup table; XORing the four results applies the InvMixColumns matrix.
    __m128i stateSse = _mm_set_epi8(
        gf_mul[state[0][3]][3], gf_mul[state[0][2]][3], gf_mul[state[0][1]][3], gf_mul[state[0][0]][3],
        gf_mul[state[0][3]][4], gf_mul[state[0][2]][4], gf_mul[state[0][1]][4], gf_mul[state[0][0]][4],
        gf_mul[state[0][3]][2], gf_mul[state[0][2]][2], gf_mul[state[0][1]][2], gf_mul[state[0][0]][2],
        gf_mul[state[0][3]][5], gf_mul[state[0][2]][5], gf_mul[state[0][1]][5], gf_mul[state[0][0]][5]);
    __m128i step2 = _mm_set_epi8(
        gf_mul[state[1][3]][4], gf_mul[state[1][2]][4], gf_mul[state[1][1]][4], gf_mul[state[1][0]][4],
        gf_mul[state[1][3]][2], gf_mul[state[1][2]][2], gf_mul[state[1][1]][2], gf_mul[state[1][0]][2],
        gf_mul[state[1][3]][5], gf_mul[state[1][2]][5], gf_mul[state[1][1]][5], gf_mul[state[1][0]][5],
        gf_mul[state[1][3]][3], gf_mul[state[1][2]][3], gf_mul[state[1][1]][3], gf_mul[state[1][0]][3]);
    __m128i step3 = _mm_set_epi8(
        gf_mul[state[2][3]][2], gf_mul[state[2][2]][2], gf_mul[state[2][1]][2], gf_mul[state[2][0]][2],
        gf_mul[state[2][3]][5], gf_mul[state[2][2]][5], gf_mul[state[2][1]][5], gf_mul[state[2][0]][5],
        gf_mul[state[2][3]][3], gf_mul[state[2][2]][3], gf_mul[state[2][1]][3], gf_mul[state[2][0]][3],
        gf_mul[state[2][3]][4], gf_mul[state[2][2]][4], gf_mul[state[2][1]][4], gf_mul[state[2][0]][4]);
    __m128i step4 = _mm_set_epi8(
        gf_mul[state[3][3]][5], gf_mul[state[3][2]][5], gf_mul[state[3][1]][5], gf_mul[state[3][0]][5],
        gf_mul[state[3][3]][3], gf_mul[state[3][2]][3], gf_mul[state[3][1]][3], gf_mul[state[3][0]][3],
        gf_mul[state[3][3]][4], gf_mul[state[3][2]][4], gf_mul[state[3][1]][4], gf_mul[state[3][0]][4],
        gf_mul[state[3][3]][2], gf_mul[state[3][2]][2], gf_mul[state[3][1]][2], gf_mul[state[3][0]][2]);

    stateSse = _mm_xor_si128(stateSse, step2);
    stateSse = _mm_xor_si128(stateSse, step3);
    stateSse = _mm_xor_si128(stateSse, step4);
    // The destination needs a cast: state decays to BYTE (*)[4], not __m128i*.
    _mm_storeu_si128((__m128i *) state, stateSse);
}
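/* Scalar reference for one InvMixColumns column, using the gf_mul column
 * layout the SSE code above implies ([2] = x9, [3] = x11, [4] = x13,
 * [5] = x14); the function name is illustrative, not from this file: */
static void InvMixColumn_scalar(BYTE s0, BYTE s1, BYTE s2, BYTE s3, BYTE r[4])
{
    r[0] = gf_mul[s0][5] ^ gf_mul[s1][3] ^ gf_mul[s2][4] ^ gf_mul[s3][2]; /* 14 11 13  9 */
    r[1] = gf_mul[s0][2] ^ gf_mul[s1][5] ^ gf_mul[s2][3] ^ gf_mul[s3][4]; /*  9 14 11 13 */
    r[2] = gf_mul[s0][4] ^ gf_mul[s1][2] ^ gf_mul[s2][5] ^ gf_mul[s3][3]; /* 13  9 14 11 */
    r[3] = gf_mul[s0][3] ^ gf_mul[s1][4] ^ gf_mul[s2][2] ^ gf_mul[s3][5]; /* 11 13  9 14 */
}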
/**
 * Performs a reduced duplex operation for a single row, from the highest to
 * the lowest index, using the reduced-round Blake2b's G function as the
 * internal permutation
 *
 * @param state  The current state of the sponge
 * @param rowIn  Row to feed the sponge
 * @param rowOut Row to receive the sponge's output
 */
inline void reducedDuplexRow1( uint64_t *state, uint64_t *rowIn,
                               uint64_t *rowOut, uint64_t nCols )
{
    uint64_t* ptrWordIn = rowIn;                                //In Lyra2: pointer to prev
    uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64;  //In Lyra2: pointer to row
    int i;

#if defined __AVX2__
    __m256i state_v[4], in_v[3];

    state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) );
    state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) );
    state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) );
    state_v[3] = _mm256_load_si256( (__m256i*)(&state[12]) );
#endif

    for ( i = 0; i < nCols; i++ ) {
#if defined __AVX2__
        in_v[0] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[0]) );
        in_v[1] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[4]) );
        in_v[2] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[8]) );

        state_v[0] = _mm256_xor_si256( state_v[0], in_v[0] );
        state_v[1] = _mm256_xor_si256( state_v[1], in_v[1] );
        state_v[2] = _mm256_xor_si256( state_v[2], in_v[2] );

        LYRA_ROUND_AVX2( state_v[0], state_v[1], state_v[2], state_v[3] );

        _mm256_storeu_si256( (__m256i*)(&ptrWordOut[0]), _mm256_xor_si256( state_v[0], in_v[0] ) );
        _mm256_storeu_si256( (__m256i*)(&ptrWordOut[4]), _mm256_xor_si256( state_v[1], in_v[1] ) );
        _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]), _mm256_xor_si256( state_v[2], in_v[2] ) );

#elif defined __AVX__
        __m128i state_v[6], in_v[6];

        state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) );
        state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) );
        state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) );
        state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) );
        state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) );
        state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) );

        in_v[0] = _mm_load_si128( (__m128i*)(&ptrWordIn[0]) );
        in_v[1] = _mm_load_si128( (__m128i*)(&ptrWordIn[2]) );
        in_v[2] = _mm_load_si128( (__m128i*)(&ptrWordIn[4]) );
        in_v[3] = _mm_load_si128( (__m128i*)(&ptrWordIn[6]) );
        in_v[4] = _mm_load_si128( (__m128i*)(&ptrWordIn[8]) );
        in_v[5] = _mm_load_si128( (__m128i*)(&ptrWordIn[10]) );

        _mm_store_si128( (__m128i*)(&state[0]),  _mm_xor_si128( state_v[0], in_v[0] ) );
        _mm_store_si128( (__m128i*)(&state[2]),  _mm_xor_si128( state_v[1], in_v[1] ) );
        _mm_store_si128( (__m128i*)(&state[4]),  _mm_xor_si128( state_v[2], in_v[2] ) );
        _mm_store_si128( (__m128i*)(&state[6]),  _mm_xor_si128( state_v[3], in_v[3] ) );
        _mm_store_si128( (__m128i*)(&state[8]),  _mm_xor_si128( state_v[4], in_v[4] ) );
        _mm_store_si128( (__m128i*)(&state[10]), _mm_xor_si128( state_v[5], in_v[5] ) );

        //Applies the reduced-round transformation f to the sponge's state
        reducedBlake2bLyra(state);
#else
        //Absorbing "M[prev][col]"
        state[0]  ^= (ptrWordIn[0]);
        state[1]  ^= (ptrWordIn[1]);
        state[2]  ^= (ptrWordIn[2]);
        state[3]  ^= (ptrWordIn[3]);
        state[4]  ^= (ptrWordIn[4]);
        state[5]  ^= (ptrWordIn[5]);
        state[6]  ^= (ptrWordIn[6]);
        state[7]  ^= (ptrWordIn[7]);
        state[8]  ^= (ptrWordIn[8]);
        state[9]  ^= (ptrWordIn[9]);
        state[10] ^= (ptrWordIn[10]);
        state[11] ^= (ptrWordIn[11]);

        //Applies the reduced-round transformation f to the sponge's state
        reducedBlake2bLyra(state);
#endif

#if defined __AVX2__
        /*
        state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) );
        state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) );
        state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) );

        _mm256_storeu_si256( (__m256i*)(&ptrWordOut[0]), _mm256_xor_si256( state_v[0], in_v[0] ) );
        _mm256_storeu_si256( (__m256i*)(&ptrWordOut[4]), _mm256_xor_si256( state_v[1], in_v[1] ) );
        _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]), _mm256_xor_si256( state_v[2], in_v[2] ) );
        */
#elif defined __AVX__
        state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) );
        state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) );
        state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) );
        state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) );
        state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) );
        state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) );

        _mm_storeu_si128( (__m128i*)(&ptrWordOut[0]),  _mm_xor_si128( state_v[0], in_v[0] ) );
        _mm_storeu_si128( (__m128i*)(&ptrWordOut[2]),  _mm_xor_si128( state_v[1], in_v[1] ) );
        _mm_storeu_si128( (__m128i*)(&ptrWordOut[4]),  _mm_xor_si128( state_v[2], in_v[2] ) );
        _mm_storeu_si128( (__m128i*)(&ptrWordOut[6]),  _mm_xor_si128( state_v[3], in_v[3] ) );
        _mm_storeu_si128( (__m128i*)(&ptrWordOut[8]),  _mm_xor_si128( state_v[4], in_v[4] ) );
        _mm_storeu_si128( (__m128i*)(&ptrWordOut[10]), _mm_xor_si128( state_v[5], in_v[5] ) );
#else
        //M[row][C-1-col] = M[prev][col] XOR rand
        ptrWordOut[0]  = ptrWordIn[0]  ^ state[0];
        ptrWordOut[1]  = ptrWordIn[1]  ^ state[1];
        ptrWordOut[2]  = ptrWordIn[2]  ^ state[2];
        ptrWordOut[3]  = ptrWordIn[3]  ^ state[3];
        ptrWordOut[4]  = ptrWordIn[4]  ^ state[4];
        ptrWordOut[5]  = ptrWordIn[5]  ^ state[5];
        ptrWordOut[6]  = ptrWordIn[6]  ^ state[6];
        ptrWordOut[7]  = ptrWordIn[7]  ^ state[7];
        ptrWordOut[8]  = ptrWordIn[8]  ^ state[8];
        ptrWordOut[9]  = ptrWordIn[9]  ^ state[9];
        ptrWordOut[10] = ptrWordIn[10] ^ state[10];
        ptrWordOut[11] = ptrWordIn[11] ^ state[11];
#endif

        //Input: next column (i.e., next block in sequence)
        ptrWordIn += BLOCK_LEN_INT64;
        //Output: goes to previous column
        ptrWordOut -= BLOCK_LEN_INT64;
    }

#if defined __AVX2__
    _mm256_store_si256( (__m256i*)&state[0],  state_v[0] );
    _mm256_store_si256( (__m256i*)&state[4],  state_v[1] );
    _mm256_store_si256( (__m256i*)&state[8],  state_v[2] );
    _mm256_store_si256( (__m256i*)&state[12], state_v[3] );
#endif
}
static __m128i aes128_keyexpand(__m128i key)
{
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    return _mm_xor_si128(key, _mm_slli_si128(key, 4));
}
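/* A hedged sketch of the AES-128 key schedule this helper belongs to; the
 * KEYEXP128 macro and the loop shape are the conventional pairing with
 * _mm_aeskeygenassist_si128, not code from this file: */
#include <wmmintrin.h>

#define KEYEXP128(K, I) \
    _mm_xor_si128(aes128_keyexpand(K), \
        _mm_shuffle_epi32(_mm_aeskeygenassist_si128(K, I), 0xff))

/* rk[0..10]: the 11 round keys; rcon values 0x01..0x36 follow the standard. */
static void aes128_key_schedule(const unsigned char key[16], __m128i rk[11])
{
    rk[0]  = _mm_loadu_si128((const __m128i *)key);
    rk[1]  = KEYEXP128(rk[0], 0x01);
    rk[2]  = KEYEXP128(rk[1], 0x02);
    rk[3]  = KEYEXP128(rk[2], 0x04);
    rk[4]  = KEYEXP128(rk[3], 0x08);
    rk[5]  = KEYEXP128(rk[4], 0x10);
    rk[6]  = KEYEXP128(rk[5], 0x20);
    rk[7]  = KEYEXP128(rk[6], 0x40);
    rk[8]  = KEYEXP128(rk[7], 0x80);
    rk[9]  = KEYEXP128(rk[8], 0x1b);
    rk[10] = KEYEXP128(rk[9], 0x36);
}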
/**
 * Performs a duplexing operation over "M[rowInOut][col] [+] M[rowIn][col]" (i.e.,
 * the wordwise addition of two columns, ignoring carries between words). The
 * output of this operation, "rand", is then used to make
 * "M[rowOut][(N_COLS-1)-col] = M[rowIn][col] XOR rand" and
 * "M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)", where rotW is a 64-bit
 * rotation to the left and N_COLS is a system parameter.
 *
 * @param state    The current state of the sponge
 * @param rowIn    Row used only as input
 * @param rowInOut Row used as input and to receive output after rotation
 * @param rowOut   Row receiving the output
 */
inline void reducedDuplexRowSetup( uint64_t *state, uint64_t *rowIn,
                                   uint64_t *rowInOut, uint64_t *rowOut,
                                   uint64_t nCols )
{
    uint64_t* ptrWordIn = rowIn;                                //In Lyra2: pointer to prev
    uint64_t* ptrWordInOut = rowInOut;                          //In Lyra2: pointer to row*
    uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64;  //In Lyra2: pointer to row
    int i;

#if defined __AVX2__
    __m256i state_v[4], in_v[3], inout_v[3];
    #define t_state in_v

    state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) );
    state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) );
    state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) );
    state_v[3] = _mm256_load_si256( (__m256i*)(&state[12]) );

    for ( i = 0; i < nCols; i++ ) {
        in_v   [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[0]) );
        inout_v[0] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[0]) );
        in_v   [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[4]) );
        inout_v[1] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[4]) );
        in_v   [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[8]) );
        inout_v[2] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[8]) );

        state_v[0] = _mm256_xor_si256( state_v[0], _mm256_add_epi64( in_v[0], inout_v[0] ) );
        state_v[1] = _mm256_xor_si256( state_v[1], _mm256_add_epi64( in_v[1], inout_v[1] ) );
        state_v[2] = _mm256_xor_si256( state_v[2], _mm256_add_epi64( in_v[2], inout_v[2] ) );

        LYRA_ROUND_AVX2( state_v[0], state_v[1], state_v[2], state_v[3] );

        _mm256_storeu_si256( (__m256i*)(&ptrWordOut[0]), _mm256_xor_si256( state_v[0], in_v[0] ) );
        _mm256_storeu_si256( (__m256i*)(&ptrWordOut[4]), _mm256_xor_si256( state_v[1], in_v[1] ) );
        _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]), _mm256_xor_si256( state_v[2], in_v[2] ) );

        //M[row*][col] = M[row*][col] XOR rotW(rand)
        t_state[0] = _mm256_permute4x64_epi64( state_v[0], 0x93 );
        t_state[1] = _mm256_permute4x64_epi64( state_v[1], 0x93 );
        t_state[2] = _mm256_permute4x64_epi64( state_v[2], 0x93 );

        inout_v[0] = _mm256_xor_si256( inout_v[0], _mm256_blend_epi32( t_state[0], t_state[2], 0x03 ) );
        inout_v[1] = _mm256_xor_si256( inout_v[1], _mm256_blend_epi32( t_state[1], t_state[0], 0x03 ) );
        inout_v[2] = _mm256_xor_si256( inout_v[2], _mm256_blend_epi32( t_state[2], t_state[1], 0x03 ) );

        _mm256_storeu_si256( (__m256i*)&ptrWordInOut[0], inout_v[0] );
        _mm256_storeu_si256( (__m256i*)&ptrWordInOut[4], inout_v[1] );
        _mm256_storeu_si256( (__m256i*)&ptrWordInOut[8], inout_v[2] );

        //Inputs: next column (i.e., next block in sequence)
        ptrWordInOut += BLOCK_LEN_INT64;
        ptrWordIn += BLOCK_LEN_INT64;
        //Output: goes to previous column
        ptrWordOut -= BLOCK_LEN_INT64;
    }

    _mm256_store_si256( (__m256i*)&state[0],  state_v[0] );
    _mm256_store_si256( (__m256i*)&state[4],  state_v[1] );
    _mm256_store_si256( (__m256i*)&state[8],  state_v[2] );
    _mm256_store_si256( (__m256i*)&state[12], state_v[3] );
    #undef t_state

#elif defined __AVX__
    __m128i state_v[6], in_v[6], inout_v[6];

    for ( i = 0; i < nCols; i++ ) {
        state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) );
        state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) );
        state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) );
        state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) );
        state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) );
        state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) );

        inout_v[0] = _mm_load_si128( (__m128i*)(&ptrWordInOut[0]) );
        inout_v[1] = _mm_load_si128( (__m128i*)(&ptrWordInOut[2]) );
        inout_v[2] = _mm_load_si128( (__m128i*)(&ptrWordInOut[4]) );
        inout_v[3] = _mm_load_si128( (__m128i*)(&ptrWordInOut[6]) );
        inout_v[4] = _mm_load_si128( (__m128i*)(&ptrWordInOut[8]) );
        inout_v[5] = _mm_load_si128( (__m128i*)(&ptrWordInOut[10]) );

        in_v[0] = _mm_load_si128( (__m128i*)(&ptrWordIn[0]) );
        in_v[1] = _mm_load_si128( (__m128i*)(&ptrWordIn[2]) );
        in_v[2] = _mm_load_si128( (__m128i*)(&ptrWordIn[4]) );
        in_v[3] = _mm_load_si128( (__m128i*)(&ptrWordIn[6]) );
        in_v[4] = _mm_load_si128( (__m128i*)(&ptrWordIn[8]) );
        in_v[5] = _mm_load_si128( (__m128i*)(&ptrWordIn[10]) );

        _mm_store_si128( (__m128i*)(&state[0]),  _mm_xor_si128( state_v[0], _mm_add_epi64( in_v[0], inout_v[0] ) ) );
        _mm_store_si128( (__m128i*)(&state[2]),  _mm_xor_si128( state_v[1], _mm_add_epi64( in_v[1], inout_v[1] ) ) );
        _mm_store_si128( (__m128i*)(&state[4]),  _mm_xor_si128( state_v[2], _mm_add_epi64( in_v[2], inout_v[2] ) ) );
        _mm_store_si128( (__m128i*)(&state[6]),  _mm_xor_si128( state_v[3], _mm_add_epi64( in_v[3], inout_v[3] ) ) );
        _mm_store_si128( (__m128i*)(&state[8]),  _mm_xor_si128( state_v[4], _mm_add_epi64( in_v[4], inout_v[4] ) ) );
        _mm_store_si128( (__m128i*)(&state[10]), _mm_xor_si128( state_v[5], _mm_add_epi64( in_v[5], inout_v[5] ) ) );

        //Applies the reduced-round transformation f to the sponge's state
        reducedBlake2bLyra(state);
#else
    for ( i = 0; i < nCols; i++ ) {
        //Absorbing "M[prev] [+] M[row*]"
        state[0]  ^= (ptrWordIn[0]  + ptrWordInOut[0]);
        state[1]  ^= (ptrWordIn[1]  + ptrWordInOut[1]);
        state[2]  ^= (ptrWordIn[2]  + ptrWordInOut[2]);
        state[3]  ^= (ptrWordIn[3]  + ptrWordInOut[3]);
        state[4]  ^= (ptrWordIn[4]  + ptrWordInOut[4]);
        state[5]  ^= (ptrWordIn[5]  + ptrWordInOut[5]);
        state[6]  ^= (ptrWordIn[6]  + ptrWordInOut[6]);
        state[7]  ^= (ptrWordIn[7]  + ptrWordInOut[7]);
        state[8]  ^= (ptrWordIn[8]  + ptrWordInOut[8]);
        state[9]  ^= (ptrWordIn[9]  + ptrWordInOut[9]);
        state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]);
        state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]);

        //Applies the reduced-round transformation f to the sponge's state
        reducedBlake2bLyra(state);

        //M[row][col] = M[prev][col] XOR rand
#endif

#if defined __AVX2__
#elif defined __AVX__
        state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) );
        state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) );
        state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) );
        state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) );
        state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) );
        state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) );

        _mm_store_si128( (__m128i*)(&ptrWordOut[0]),  _mm_xor_si128( state_v[0], in_v[0] ) );
        _mm_store_si128( (__m128i*)(&ptrWordOut[2]),  _mm_xor_si128( state_v[1], in_v[1] ) );
        _mm_store_si128( (__m128i*)(&ptrWordOut[4]),  _mm_xor_si128( state_v[2], in_v[2] ) );
        _mm_store_si128( (__m128i*)(&ptrWordOut[6]),  _mm_xor_si128( state_v[3], in_v[3] ) );
        _mm_store_si128( (__m128i*)(&ptrWordOut[8]),  _mm_xor_si128( state_v[4], in_v[4] ) );
        _mm_store_si128( (__m128i*)(&ptrWordOut[10]), _mm_xor_si128( state_v[5], in_v[5] ) );
#else
        ptrWordOut[0]  = ptrWordIn[0]  ^ state[0];
        ptrWordOut[1]  = ptrWordIn[1]  ^ state[1];
        ptrWordOut[2]  = ptrWordIn[2]  ^ state[2];
        ptrWordOut[3]  = ptrWordIn[3]  ^ state[3];
        ptrWordOut[4]  = ptrWordIn[4]  ^ state[4];
        ptrWordOut[5]  = ptrWordIn[5]  ^ state[5];
        ptrWordOut[6]  = ptrWordIn[6]  ^ state[6];
        ptrWordOut[7]  = ptrWordIn[7]  ^ state[7];
        ptrWordOut[8]  = ptrWordIn[8]  ^ state[8];
        ptrWordOut[9]  = ptrWordIn[9]  ^ state[9];
        ptrWordOut[10] = ptrWordIn[10] ^ state[10];
        ptrWordOut[11] = ptrWordIn[11] ^ state[11];
#endif

        //M[row*][col] = M[row*][col] XOR rotW(rand)
        // Need to fix this before taking state load/store out of loop
#ifdef __AVX2__
#else
        ptrWordInOut[0]  ^= state[11];
        ptrWordInOut[1]  ^= state[0];
        ptrWordInOut[2]  ^= state[1];
        ptrWordInOut[3]  ^= state[2];
        ptrWordInOut[4]  ^= state[3];
        ptrWordInOut[5]  ^= state[4];
        ptrWordInOut[6]  ^= state[5];
        ptrWordInOut[7]  ^= state[6];
        ptrWordInOut[8]  ^= state[7];
        ptrWordInOut[9]  ^= state[8];
        ptrWordInOut[10] ^= state[9];
        ptrWordInOut[11] ^= state[10];

        //Inputs: next column (i.e., next block in sequence)
        ptrWordInOut += BLOCK_LEN_INT64;
        ptrWordIn += BLOCK_LEN_INT64;
        //Output: goes to previous column
        ptrWordOut -= BLOCK_LEN_INT64;
    }
#endif
}

/**
 * Performs a duplexing operation over "M[rowInOut][col] [+] M[rowIn][col]" (i.e.,
 * the wordwise addition of two columns, ignoring carries between words). The
 * output of this operation, "rand", is then used to make
 * "M[rowOut][col] = M[rowOut][col] XOR rand" and
 * "M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)", where rotW is a 64-bit
 * rotation to the left.
 *
 * @param state    The current state of the sponge
 * @param rowIn    Row used only as input
 * @param rowInOut Row used as input and to receive output after rotation
 * @param rowOut   Row receiving the output
 */
inline void reducedDuplexRow( uint64_t *state, uint64_t *rowIn,
                              uint64_t *rowInOut, uint64_t *rowOut,
                              uint64_t nCols )
{
    uint64_t* ptrWordInOut = rowInOut;  //In Lyra2: pointer to row*
    uint64_t* ptrWordIn = rowIn;        //In Lyra2: pointer to prev
    uint64_t* ptrWordOut = rowOut;      //In Lyra2: pointer to row
    int i;

#if defined __AVX2__
    for ( i = 0; i < nCols; i++) {
        //Absorbing "M[prev] [+] M[row*]"
        __m256i state_v[4], in_v[3], inout_v[3];
        #define out_v in_v    // reuse register in next code block
        #define t_state in_v

        state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) );
        in_v   [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[0]) );
        inout_v[0] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[0]) );
        state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) );
        in_v   [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[4]) );
        inout_v[1] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[4]) );
        state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) );
        in_v   [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[8]) );
        inout_v[2] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[8]) );
        state_v[3] = _mm256_load_si256( (__m256i*)(&state[12]) );

        state_v[0] = _mm256_xor_si256( state_v[0], _mm256_add_epi64( in_v[0], inout_v[0] ) );
        state_v[1] = _mm256_xor_si256( state_v[1], _mm256_add_epi64( in_v[1], inout_v[1] ) );
        state_v[2] = _mm256_xor_si256( state_v[2], _mm256_add_epi64( in_v[2], inout_v[2] ) );

        out_v[0] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[0]) );
        out_v[1] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[4]) );
        out_v[2] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[8]) );

        LYRA_ROUND_AVX2( state_v[0], state_v[1], state_v[2], state_v[3] );

        _mm256_store_si256( (__m256i*)&state[0],  state_v[0] );
        _mm256_store_si256( (__m256i*)&state[4],  state_v[1] );
        _mm256_store_si256( (__m256i*)&state[8],  state_v[2] );
        _mm256_store_si256( (__m256i*)&state[12], state_v[3] );

        _mm256_storeu_si256( (__m256i*)(&ptrWordOut[0]), _mm256_xor_si256( state_v[0], out_v[0] ) );
        _mm256_storeu_si256( (__m256i*)(&ptrWordOut[4]), _mm256_xor_si256( state_v[1], out_v[1] ) );
        _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]), _mm256_xor_si256( state_v[2], out_v[2] ) );

        /*
        t_state[0] = _mm256_permute4x64_epi64( state_v[0], 0x93 );
        t_state[1] = _mm256_permute4x64_epi64( state_v[1], 0x93 );
        t_state[2] = _mm256_permute4x64_epi64( state_v[2], 0x93 );

        inout_v[0] = _mm256_xor_si256( inout_v[0], _mm256_blend_epi32( t_state[0], t_state[2], 0x03 ) );
        inout_v[1] = _mm256_xor_si256( inout_v[1], _mm256_blend_epi32( t_state[1], t_state[0], 0x03 ) );
        inout_v[2] = _mm256_xor_si256( inout_v[2], _mm256_blend_epi32( t_state[2], t_state[1], 0x03 ) );

        _mm256_storeu_si256( (__m256i*)(&ptrWordInOut[0]), inout_v[0] );
        _mm256_storeu_si256( (__m256i*)(&ptrWordInOut[4]), inout_v[1] );
        _mm256_storeu_si256( (__m256i*)(&ptrWordInOut[8]), inout_v[2] );

        _mm256_store_si256( (__m256i*)&state[0],  state_v[0] );
        _mm256_store_si256( (__m256i*)&state[4],  state_v[1] );
        _mm256_store_si256( (__m256i*)&state[8],  state_v[2] );
        _mm256_store_si256( (__m256i*)&state[12], state_v[3] );
        */
        #undef out_v
        #undef t_state

        //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
        ptrWordInOut[0]  ^= state[11];
        ptrWordInOut[1]  ^= state[0];
        ptrWordInOut[2]  ^= state[1];
        ptrWordInOut[3]  ^= state[2];
        ptrWordInOut[4]  ^= state[3];
        ptrWordInOut[5]  ^= state[4];
        ptrWordInOut[6]  ^= state[5];
        ptrWordInOut[7]  ^= state[6];
        ptrWordInOut[8]  ^= state[7];
        ptrWordInOut[9]  ^= state[8];
        ptrWordInOut[10] ^= state[9];
        ptrWordInOut[11] ^= state[10];

        //Goes to next block
        ptrWordOut += BLOCK_LEN_INT64;
        ptrWordInOut += BLOCK_LEN_INT64;
        ptrWordIn += BLOCK_LEN_INT64;
    }

#elif defined __AVX__
    for ( i = 0; i < nCols; i++) {
        __m128i state_v[6], in_v[6], inout_v[6];
        #define out_v in_v    // reuse register in next code block

        state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) );
        state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) );
        state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) );
        state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) );
        state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) );
        state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) );

        inout_v[0] = _mm_load_si128( (__m128i*)(&ptrWordInOut[0]) );
        inout_v[1] = _mm_load_si128( (__m128i*)(&ptrWordInOut[2]) );
        inout_v[2] = _mm_load_si128( (__m128i*)(&ptrWordInOut[4]) );
        inout_v[3] = _mm_load_si128( (__m128i*)(&ptrWordInOut[6]) );
        inout_v[4] = _mm_load_si128( (__m128i*)(&ptrWordInOut[8]) );
        inout_v[5] = _mm_load_si128( (__m128i*)(&ptrWordInOut[10]) );

        in_v[0] = _mm_load_si128( (__m128i*)(&ptrWordIn[0]) );
        in_v[1] = _mm_load_si128( (__m128i*)(&ptrWordIn[2]) );
        in_v[2] = _mm_load_si128( (__m128i*)(&ptrWordIn[4]) );
        in_v[3] = _mm_load_si128( (__m128i*)(&ptrWordIn[6]) );
        in_v[4] = _mm_load_si128( (__m128i*)(&ptrWordIn[8]) );
        in_v[5] = _mm_load_si128( (__m128i*)(&ptrWordIn[10]) );

        _mm_store_si128( (__m128i*)(&state[0]),  _mm_xor_si128( state_v[0], _mm_add_epi64( in_v[0], inout_v[0] ) ) );
        _mm_store_si128( (__m128i*)(&state[2]),  _mm_xor_si128( state_v[1], _mm_add_epi64( in_v[1], inout_v[1] ) ) );
        _mm_store_si128( (__m128i*)(&state[4]),  _mm_xor_si128( state_v[2], _mm_add_epi64( in_v[2], inout_v[2] ) ) );
        _mm_store_si128( (__m128i*)(&state[6]),  _mm_xor_si128( state_v[3], _mm_add_epi64( in_v[3], inout_v[3] ) ) );
        _mm_store_si128( (__m128i*)(&state[8]),  _mm_xor_si128( state_v[4], _mm_add_epi64( in_v[4], inout_v[4] ) ) );
        _mm_store_si128( (__m128i*)(&state[10]), _mm_xor_si128( state_v[5], _mm_add_epi64( in_v[5], inout_v[5] ) ) );

        //Applies the reduced-round transformation f to the sponge's state
        reducedBlake2bLyra(state);
#else
    for ( i = 0; i < nCols; i++) {
        state[0]  ^= (ptrWordIn[0]  + ptrWordInOut[0]);
        state[1]  ^= (ptrWordIn[1]  + ptrWordInOut[1]);
        state[2]  ^= (ptrWordIn[2]  + ptrWordInOut[2]);
        state[3]  ^= (ptrWordIn[3]  + ptrWordInOut[3]);
        state[4]  ^= (ptrWordIn[4]  + ptrWordInOut[4]);
        state[5]  ^= (ptrWordIn[5]  + ptrWordInOut[5]);
        state[6]  ^= (ptrWordIn[6]  + ptrWordInOut[6]);
        state[7]  ^= (ptrWordIn[7]  + ptrWordInOut[7]);
        state[8]  ^= (ptrWordIn[8]  + ptrWordInOut[8]);
        state[9]  ^= (ptrWordIn[9]  + ptrWordInOut[9]);
        state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]);
        state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]);

        //Applies the reduced-round transformation f to the sponge's state
        reducedBlake2bLyra(state);
#endif

        //M[rowOut][col] = M[rowOut][col] XOR rand
#if defined __AVX2__
        /*
        state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) );
        out_v  [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[0]) );
        state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) );
        out_v  [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[4]) );
        state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) );
        out_v  [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[8]) );

        _mm256_storeu_si256( (__m256i*)(&ptrWordOut[0]), _mm256_xor_si256( state_v[0], out_v[0] ) );
        _mm256_storeu_si256( (__m256i*)(&ptrWordOut[4]), _mm256_xor_si256( state_v[1], out_v[1] ) );
        _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]), _mm256_xor_si256( state_v[2], out_v[2] ) );
        */
#elif defined __AVX__
        state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) );
        state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) );
        state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) );
        state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) );
        state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) );
        state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) );

        out_v[0] = _mm_load_si128( (__m128i*)(&ptrWordOut[0]) );
        out_v[1] = _mm_load_si128( (__m128i*)(&ptrWordOut[2]) );
        out_v[2] = _mm_load_si128( (__m128i*)(&ptrWordOut[4]) );
        out_v[3] = _mm_load_si128( (__m128i*)(&ptrWordOut[6]) );
        out_v[4] = _mm_load_si128( (__m128i*)(&ptrWordOut[8]) );
        out_v[5] = _mm_load_si128( (__m128i*)(&ptrWordOut[10]) );

        _mm_store_si128( (__m128i*)(&ptrWordOut[0]),  _mm_xor_si128( state_v[0], out_v[0] ) );
        _mm_store_si128( (__m128i*)(&ptrWordOut[2]),  _mm_xor_si128( state_v[1], out_v[1] ) );
        _mm_store_si128( (__m128i*)(&ptrWordOut[4]),  _mm_xor_si128( state_v[2], out_v[2] ) );
        _mm_store_si128( (__m128i*)(&ptrWordOut[6]),  _mm_xor_si128( state_v[3], out_v[3] ) );
        _mm_store_si128( (__m128i*)(&ptrWordOut[8]),  _mm_xor_si128( state_v[4], out_v[4] ) );
        _mm_store_si128( (__m128i*)(&ptrWordOut[10]), _mm_xor_si128( state_v[5], out_v[5] ) );

        //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
        ptrWordInOut[0]  ^= state[11];
        ptrWordInOut[1]  ^= state[0];
        ptrWordInOut[2]  ^= state[1];
        ptrWordInOut[3]  ^= state[2];
        ptrWordInOut[4]  ^= state[3];
        ptrWordInOut[5]  ^= state[4];
        ptrWordInOut[6]  ^= state[5];
        ptrWordInOut[7]  ^= state[6];
        ptrWordInOut[8]  ^= state[7];
        ptrWordInOut[9]  ^= state[8];
        ptrWordInOut[10] ^= state[9];
        ptrWordInOut[11] ^= state[10];

        //Goes to next block
        ptrWordOut += BLOCK_LEN_INT64;
        ptrWordInOut += BLOCK_LEN_INT64;
        ptrWordIn += BLOCK_LEN_INT64;
    }
#else
        ptrWordOut[0]  ^= state[0];
        ptrWordOut[1]  ^= state[1];
        ptrWordOut[2]  ^= state[2];
        ptrWordOut[3]  ^= state[3];
        ptrWordOut[4]  ^= state[4];
        ptrWordOut[5]  ^= state[5];
        ptrWordOut[6]  ^= state[6];
        ptrWordOut[7]  ^= state[7];
        ptrWordOut[8]  ^= state[8];
        ptrWordOut[9]  ^= state[9];
        ptrWordOut[10] ^= state[10];
        ptrWordOut[11] ^= state[11];

        //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
        ptrWordInOut[0] ^= state[11];
        ptrWordInOut[1] ^= state[0];
ptrWordInOut[2] ^= state[1]; ptrWordInOut[3] ^= state[2]; ptrWordInOut[4] ^= state[3]; ptrWordInOut[5] ^= state[4]; ptrWordInOut[6] ^= state[5]; ptrWordInOut[7] ^= state[6]; ptrWordInOut[8] ^= state[7]; ptrWordInOut[9] ^= state[8]; ptrWordInOut[10] ^= state[9]; ptrWordInOut[11] ^= state[10]; //Goes to next block ptrWordOut += BLOCK_LEN_INT64; ptrWordInOut += BLOCK_LEN_INT64; ptrWordIn += BLOCK_LEN_INT64; } #endif }
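// A minimal scalar reference for the rotW step used above, assuming
// BLOCK_LEN_INT64 == 12 (the Lyra2 block width implied by the unrolled code):
// rotW rotates the 12-word "rand" block by one 64-bit word, so output word i
// picks up state[(i + 11) % 12] -- exactly the XOR pattern unrolled above.
// Illustrative sketch; xorRotW_ref is not part of the original sources.
#include <stdint.h>
static inline void xorRotW_ref( uint64_t *inout, const uint64_t *state )
{
   int i;
   for ( i = 0; i < 12; i++ )
      inout[i] ^= state[(i + 11) % 12];
}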
int bit_vec_filter_no_flipping_m128_sse1(uint8_t *read_vec0, uint8_t *read_vec1,
		uint8_t *ref_vec0, uint8_t *ref_vec1, __m128i mask, int max_error) {
	int total_difference = 0;

	//Start iteration
	int j;

	//read data (direct dereference: assumes 16-byte-aligned input vectors)
	__m128i read_XMM0 = *((__m128i *) (read_vec0));
	__m128i read_XMM1 = *((__m128i *) (read_vec1));

	//ref data
	__m128i ref_XMM0 = *((__m128i *) (ref_vec0));
	__m128i ref_XMM1 = *((__m128i *) (ref_vec1));

	__m128i shift_XMM;
	__m128i diff_XMM;
	__m128i temp_diff_XMM;
	__m128i temp_shift_XMM;
	__m128i temp_mask;

	diff_XMM = _mm_xor_si128(read_XMM0, ref_XMM0);
	temp_diff_XMM = _mm_xor_si128(read_XMM1, ref_XMM1);
	diff_XMM = _mm_or_si128(diff_XMM, temp_diff_XMM);

	for (j = 1; j <= max_error; j++) {
		temp_mask = _mm_load_si128(
				(__m128i *) (MASK_SSE_BEG1 + (j - 1) * SSE_BYTE_NUM));
		temp_mask = _mm_and_si128(temp_mask, mask);

		//Right shift read
		shift_XMM = shift_right_sse1(read_XMM0, j);
		temp_diff_XMM = _mm_xor_si128(shift_XMM, ref_XMM0);
		shift_XMM = shift_right_sse1(read_XMM1, j);
		temp_shift_XMM = _mm_xor_si128(shift_XMM, ref_XMM1);
		temp_diff_XMM = _mm_or_si128(temp_shift_XMM, temp_diff_XMM);
		temp_diff_XMM = _mm_and_si128(temp_diff_XMM, temp_mask);
		// flip_false_zero(temp_diff_XMM); //No flipping in this variant
		diff_XMM = _mm_and_si128(diff_XMM, temp_diff_XMM);

		//Right shift ref
		shift_XMM = shift_right_sse1(ref_XMM0, j);
		temp_diff_XMM = _mm_xor_si128(shift_XMM, read_XMM0);
		shift_XMM = shift_right_sse1(ref_XMM1, j);
		temp_shift_XMM = _mm_xor_si128(shift_XMM, read_XMM1);
		temp_diff_XMM = _mm_or_si128(temp_shift_XMM, temp_diff_XMM);
		temp_diff_XMM = _mm_and_si128(temp_diff_XMM, temp_mask);
		// flip_false_zero(temp_diff_XMM); //No flipping in this variant
		diff_XMM = _mm_and_si128(diff_XMM, temp_diff_XMM);
	}

	total_difference = popcount1_m128i_sse(diff_XMM);

	if (total_difference > max_error)
		return 0;
	else
		return 1;
}
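// shift_right_sse1() is used above but defined elsewhere. A plausible SSE2
// sketch of what it has to compute -- a full 128-bit logical right shift by a
// small bit count, carrying bits across the 64-bit lane boundary -- follows.
// This is an assumption about its behavior, not the original implementation.
#include <emmintrin.h>
static inline __m128i shift_right_128_sketch(__m128i v, int n)  /* 0 < n < 64 */
{
	__m128i lo = _mm_srli_epi64(v, n);   // shift both 64-bit lanes right
	__m128i hi = _mm_srli_si128(v, 8);   // move the high lane into the low lane
	hi = _mm_slli_epi64(hi, 64 - n);     // recover the bits that cross the boundary
	return _mm_or_si128(lo, hi);
}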
HashReturn Update(hashState *state, const BitSequence *data,
                  DataLength databitlen)
{
  int r;
  __m128i x0;
  __m128i x1;
  __m128i x2;
  __m128i x3;
  __m128i x4;
  __m128i x5;
  __m128i x6;
  __m128i x7;
  __m128i y0;
  __m128i y1;
  __m128i y2;
  __m128i y3;

  /* databitlen counts bits; drain any partially filled block first */
  while (databitlen >= 8 && state->pos != 0) {
    ((unsigned char *) state->x)[state->pos / 8] ^= *data;
    data += 1;
    databitlen -= 8;
    state->pos += 8;
    if (state->pos == 8 * CUBEHASH_BLOCKBYTES) {
      transform(state,CUBEHASH_ROUNDS);
      state->pos = 0;
    }
  }

  x0 = state->x[0];
  x1 = state->x[1];
  x2 = state->x[2];
  x3 = state->x[3];
  x4 = state->x[4];
  x5 = state->x[5];
  x6 = state->x[6];
  x7 = state->x[7];

  /* bulk loop, keeping the state in registers across blocks; the 16-bit
     message load implies CUBEHASH_BLOCKBYTES == 2 in this configuration */
  while (databitlen >= 8 * CUBEHASH_BLOCKBYTES) {
    x0 = _mm_xor_si128(x0,_mm_set_epi32(0,0,0,
           (crypto_uint32) *(crypto_uint16 *) data));
    data += CUBEHASH_BLOCKBYTES;
    databitlen -= 8 * CUBEHASH_BLOCKBYTES;
    for (r = 0;r < CUBEHASH_ROUNDS;++r) {
      x4 = _mm_add_epi32(x0,x4);
      x5 = _mm_add_epi32(x1,x5);
      x6 = _mm_add_epi32(x2,x6);
      x7 = _mm_add_epi32(x3,x7);
      y0 = x2;
      y1 = x3;
      y2 = x0;
      y3 = x1;
      x0 = _mm_xor_si128(_mm_slli_epi32(y0,7),_mm_srli_epi32(y0,25));
      x1 = _mm_xor_si128(_mm_slli_epi32(y1,7),_mm_srli_epi32(y1,25));
      x2 = _mm_xor_si128(_mm_slli_epi32(y2,7),_mm_srli_epi32(y2,25));
      x3 = _mm_xor_si128(_mm_slli_epi32(y3,7),_mm_srli_epi32(y3,25));
      x0 = _mm_xor_si128(x0,x4);
      x1 = _mm_xor_si128(x1,x5);
      x2 = _mm_xor_si128(x2,x6);
      x3 = _mm_xor_si128(x3,x7);
      x4 = _mm_shuffle_epi32(x4,0x4e);
      x5 = _mm_shuffle_epi32(x5,0x4e);
      x6 = _mm_shuffle_epi32(x6,0x4e);
      x7 = _mm_shuffle_epi32(x7,0x4e);
      x4 = _mm_add_epi32(x0,x4);
      x5 = _mm_add_epi32(x1,x5);
      x6 = _mm_add_epi32(x2,x6);
      x7 = _mm_add_epi32(x3,x7);
      y0 = x1;
      y1 = x0;
      y2 = x3;
      y3 = x2;
      x0 = _mm_xor_si128(_mm_slli_epi32(y0,11),_mm_srli_epi32(y0,21));
      x1 = _mm_xor_si128(_mm_slli_epi32(y1,11),_mm_srli_epi32(y1,21));
      x2 = _mm_xor_si128(_mm_slli_epi32(y2,11),_mm_srli_epi32(y2,21));
      x3 = _mm_xor_si128(_mm_slli_epi32(y3,11),_mm_srli_epi32(y3,21));
      x0 = _mm_xor_si128(x0,x4);
      x1 = _mm_xor_si128(x1,x5);
      x2 = _mm_xor_si128(x2,x6);
      x3 = _mm_xor_si128(x3,x7);
      x4 = _mm_shuffle_epi32(x4,0xb1);
      x5 = _mm_shuffle_epi32(x5,0xb1);
      x6 = _mm_shuffle_epi32(x6,0xb1);
      x7 = _mm_shuffle_epi32(x7,0xb1);
    }
  }

  state->x[0] = x0;
  state->x[1] = x1;
  state->x[2] = x2;
  state->x[3] = x3;
  state->x[4] = x4;
  state->x[5] = x5;
  state->x[6] = x6;
  state->x[7] = x7;

  while (databitlen >= 8) {
    ((unsigned char *) state->x)[state->pos / 8] ^= *data;
    data += 1;
    databitlen -= 8;
    state->pos += 8;
    if (state->pos == 8 * CUBEHASH_BLOCKBYTES) {
      transform(state,CUBEHASH_ROUNDS);
      state->pos = 0;
    }
  }

  if (databitlen > 0) {
    ((unsigned char *) state->x)[state->pos / 8] ^= *data;
    state->pos += databitlen;
  }

  return SUCCESS;
}
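// A minimal usage sketch for Update() above, assuming the standard NIST SHA-3
// competition API this code follows (Init() and Final() are defined elsewhere
// in the CubeHash sources; their usual signatures and a 512-bit digest are
// assumed here, not confirmed by this file).
#include <stdio.h>
#include <string.h>
int cubehash_demo(void)
{
  hashState st;
  BitSequence digest[64];
  const char *msg = "abc";
  int i;
  if (Init(&st,512) != SUCCESS) return -1;
  /* DataLength counts bits, not bytes, hence the * 8 */
  if (Update(&st,(const BitSequence *) msg,
             (DataLength) strlen(msg) * 8) != SUCCESS) return -1;
  if (Final(&st,digest) != SUCCESS) return -1;
  for (i = 0;i < 64;++i) printf("%02x",digest[i]);
  printf("\n");
  return 0;
}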
/* Note: "xor" is a reserved alternative operator token in C++ (<iso646.h>),
   so this function name compiles as C only. Both build configurations use the
   same SSE path for 16-byte chunks and differ only in the 8-byte tail. */
void xor(byte* x, byte* y, byte* result, int32_t len) {
#if defined (PORTABLE_64_BIT)
	if (len >= 16) {
		for (int32_t i = (len / 16) - 1; i >= 0; i--, x += 16, y += 16, result += 16) {
			__m128i xmm_x = _mm_loadu_si128((__m128i*)x);
			__m128i xmm_y = _mm_loadu_si128((__m128i*)y);
			__m128i xmm_res = _mm_xor_si128(xmm_x, xmm_y);
			_mm_storeu_si128((__m128i*)result, xmm_res);
		}
	}
	/* after the SIMD loop, exactly (len & 15) bytes remain */
	if ((len & 8) != 0) {
		*((uint64_t*)result) = *((uint64_t*)x) ^ *((uint64_t*)y);
		x += 8; y += 8; result += 8;
	}
	if ((len & 4) != 0) {
		*((uint32_t*)result) = *((uint32_t*)x) ^ *((uint32_t*)y);
		x += 4; y += 4; result += 4;
	}
	if ((len & 2) != 0) {
		*((uint16_t*)result) = *((uint16_t*)x) ^ *((uint16_t*)y);
		x += 2; y += 2; result += 2;
	}
	if ((len & 1) != 0) {
		*((byte*)result) = (byte)(*((byte*)x) ^ *((byte*)y));
	}
#elif defined (PORTABLE_32_BIT)
	if (len >= 16) {
		for (int32_t i = (len / 16) - 1; i >= 0; i--, x += 16, y += 16, result += 16) {
			__m128i xmm_x = _mm_loadu_si128((__m128i*)x);
			__m128i xmm_y = _mm_loadu_si128((__m128i*)y);
			__m128i xmm_res = _mm_xor_si128(xmm_x, xmm_y);
			_mm_storeu_si128((__m128i*)result, xmm_res);
		}
	}
	/* 32-bit build: an 8-byte tail is handled as two 4-byte XORs */
	if ((len & 8) != 0) {
		*((uint32_t*)result) = *((uint32_t*)x) ^ *((uint32_t*)y);
		x += 4; y += 4; result += 4;
		*((uint32_t*)result) = *((uint32_t*)x) ^ *((uint32_t*)y);
		x += 4; y += 4; result += 4;
	}
	if ((len & 4) != 0) {
		*((uint32_t*)result) = *((uint32_t*)x) ^ *((uint32_t*)y);
		x += 4; y += 4; result += 4;
	}
	if ((len & 2) != 0) {
		*((uint16_t*)result) = *((uint16_t*)x) ^ *((uint16_t*)y);
		x += 2; y += 2; result += 2;
	}
	if ((len & 1) != 0) {
		*((byte*)result) = (byte)(*((byte*)x) ^ *((byte*)y));
	}
#endif
}
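// A quick self-check sketch for the tail handling above: the 16-byte SIMD
// loop consumes (len / 16) * 16 bytes, leaving exactly (len & 15) bytes, so
// testing len & 8, & 4, & 2 and & 1 covers every possible remainder. A length
// of 31 (= 16 + 8 + 4 + 2 + 1) exercises every tail branch. Illustrative
// only; xor_demo is not part of the original sources.
#include <stdio.h>
int xor_demo(void)
{
	byte a[31], b[31], out[31];
	int32_t i;
	for (i = 0; i < 31; i++) { a[i] = (byte) i; b[i] = (byte) (0xA5 ^ i); }
	xor(a, b, out, 31);
	for (i = 0; i < 31; i++)
		if (out[i] != (byte) (a[i] ^ b[i])) { printf("mismatch at %d\n", (int) i); return -1; }
	printf("ok\n");
	return 0;
}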
static void transform(hashState *state,int r)
{
  __m128i x0;
  __m128i x1;
  __m128i x2;
  __m128i x3;
  __m128i x4;
  __m128i x5;
  __m128i x6;
  __m128i x7;
  __m128i y0;
  __m128i y1;
  __m128i y2;
  __m128i y3;

  x0 = state->x[0];
  x1 = state->x[1];
  x2 = state->x[2];
  x3 = state->x[3];
  x4 = state->x[4];
  x5 = state->x[5];
  x6 = state->x[6];
  x7 = state->x[7];

  for (;r > 0;--r) {
    x4 = _mm_add_epi32(x0,x4);
    x5 = _mm_add_epi32(x1,x5);
    x6 = _mm_add_epi32(x2,x6);
    x7 = _mm_add_epi32(x3,x7);
    y0 = x2;
    y1 = x3;
    y2 = x0;
    y3 = x1;
    x0 = _mm_xor_si128(_mm_slli_epi32(y0,7),_mm_srli_epi32(y0,25));
    x1 = _mm_xor_si128(_mm_slli_epi32(y1,7),_mm_srli_epi32(y1,25));
    x2 = _mm_xor_si128(_mm_slli_epi32(y2,7),_mm_srli_epi32(y2,25));
    x3 = _mm_xor_si128(_mm_slli_epi32(y3,7),_mm_srli_epi32(y3,25));
    x0 = _mm_xor_si128(x0,x4);
    x1 = _mm_xor_si128(x1,x5);
    x2 = _mm_xor_si128(x2,x6);
    x3 = _mm_xor_si128(x3,x7);
    x4 = _mm_shuffle_epi32(x4,0x4e);
    x5 = _mm_shuffle_epi32(x5,0x4e);
    x6 = _mm_shuffle_epi32(x6,0x4e);
    x7 = _mm_shuffle_epi32(x7,0x4e);
    x4 = _mm_add_epi32(x0,x4);
    x5 = _mm_add_epi32(x1,x5);
    x6 = _mm_add_epi32(x2,x6);
    x7 = _mm_add_epi32(x3,x7);
    y0 = x1;
    y1 = x0;
    y2 = x3;
    y3 = x2;
    x0 = _mm_xor_si128(_mm_slli_epi32(y0,11),_mm_srli_epi32(y0,21));
    x1 = _mm_xor_si128(_mm_slli_epi32(y1,11),_mm_srli_epi32(y1,21));
    x2 = _mm_xor_si128(_mm_slli_epi32(y2,11),_mm_srli_epi32(y2,21));
    x3 = _mm_xor_si128(_mm_slli_epi32(y3,11),_mm_srli_epi32(y3,21));
    x0 = _mm_xor_si128(x0,x4);
    x1 = _mm_xor_si128(x1,x5);
    x2 = _mm_xor_si128(x2,x6);
    x3 = _mm_xor_si128(x3,x7);
    x4 = _mm_shuffle_epi32(x4,0xb1);
    x5 = _mm_shuffle_epi32(x5,0xb1);
    x6 = _mm_shuffle_epi32(x6,0xb1);
    x7 = _mm_shuffle_epi32(x7,0xb1);
  }

  state->x[0] = x0;
  state->x[1] = x1;
  state->x[2] = x2;
  state->x[3] = x3;
  state->x[4] = x4;
  state->x[5] = x5;
  state->x[6] = x6;
  state->x[7] = x7;
}
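// For reference: each _mm_slli_epi32/_mm_srli_epi32 pair in transform() is a
// 32-bit left rotation, and the two shuffle immediates have a simple meaning:
// 0x4e (= _MM_SHUFFLE(1,0,3,2)) swaps the 64-bit halves of a register, while
// 0xb1 (= _MM_SHUFFLE(2,3,0,1)) swaps adjacent 32-bit words. A scalar sketch
// of the rotation, for cross-checking only:
#include <stdint.h>
static inline uint32_t rotl32_ref(uint32_t x, int n)
{
  return (x << n) | (x >> (32 - n));  /* e.g. x0 = rotl32_ref(y0,7) ^ x4 */
}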
template<int patternSize>
void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bool nonmax_suppression)
{
    Mat img = _img.getMat();
    const int K = patternSize/2, N = patternSize + K + 1;
#if CV_SSE2
    const int quarterPatternSize = patternSize/4;
    (void)quarterPatternSize;
#endif
    int i, j, k, pixel[25];
    makeOffsets(pixel, (int)img.step, patternSize);

    keypoints.clear();

    threshold = std::min(std::max(threshold, 0), 255);

#if CV_SSE2
    __m128i delta = _mm_set1_epi8(-128), t = _mm_set1_epi8((char)threshold), K16 = _mm_set1_epi8((char)K);
    (void)K16;
    (void)delta;
    (void)t;
#endif
    uchar threshold_tab[512];
    for( i = -255; i <= 255; i++ )
        threshold_tab[i+255] = (uchar)(i < -threshold ? 1 : i > threshold ? 2 : 0);

    AutoBuffer<uchar> _buf((img.cols+16)*3*(sizeof(int) + sizeof(uchar)) + 128);
    uchar* buf[3];
    buf[0] = _buf; buf[1] = buf[0] + img.cols; buf[2] = buf[1] + img.cols;
    int* cpbuf[3];
    cpbuf[0] = (int*)alignPtr(buf[2] + img.cols, sizeof(int)) + 1;
    cpbuf[1] = cpbuf[0] + img.cols + 1;
    cpbuf[2] = cpbuf[1] + img.cols + 1;
    memset(buf[0], 0, img.cols*3);

    for(i = 3; i < img.rows-2; i++)
    {
        const uchar* ptr = img.ptr<uchar>(i) + 3;
        uchar* curr = buf[(i - 3)%3];
        int* cornerpos = cpbuf[(i - 3)%3];
        memset(curr, 0, img.cols);
        int ncorners = 0;

        if( i < img.rows - 3 )
        {
            j = 3;
#if CV_SSE2
            if( patternSize == 16 )
            {
                for(; j < img.cols - 16 - 3; j += 16, ptr += 16)
                {
                    __m128i m0, m1;
                    __m128i v0 = _mm_loadu_si128((const __m128i*)ptr);
                    __m128i v1 = _mm_xor_si128(_mm_subs_epu8(v0, t), delta);
                    v0 = _mm_xor_si128(_mm_adds_epu8(v0, t), delta);

                    __m128i x0 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[0])), delta);
                    __m128i x1 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[quarterPatternSize])), delta);
                    __m128i x2 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[2*quarterPatternSize])), delta);
                    __m128i x3 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[3*quarterPatternSize])), delta);
                    m0 = _mm_and_si128(_mm_cmpgt_epi8(x0, v0), _mm_cmpgt_epi8(x1, v0));
                    m1 = _mm_and_si128(_mm_cmpgt_epi8(v1, x0), _mm_cmpgt_epi8(v1, x1));
                    m0 = _mm_or_si128(m0, _mm_and_si128(_mm_cmpgt_epi8(x1, v0), _mm_cmpgt_epi8(x2, v0)));
                    m1 = _mm_or_si128(m1, _mm_and_si128(_mm_cmpgt_epi8(v1, x1), _mm_cmpgt_epi8(v1, x2)));
                    m0 = _mm_or_si128(m0, _mm_and_si128(_mm_cmpgt_epi8(x2, v0), _mm_cmpgt_epi8(x3, v0)));
                    m1 = _mm_or_si128(m1, _mm_and_si128(_mm_cmpgt_epi8(v1, x2), _mm_cmpgt_epi8(v1, x3)));
                    m0 = _mm_or_si128(m0, _mm_and_si128(_mm_cmpgt_epi8(x3, v0), _mm_cmpgt_epi8(x0, v0)));
                    m1 = _mm_or_si128(m1, _mm_and_si128(_mm_cmpgt_epi8(v1, x3), _mm_cmpgt_epi8(v1, x0)));
                    m0 = _mm_or_si128(m0, m1);
                    int mask = _mm_movemask_epi8(m0);
                    if( mask == 0 )
                        continue;
                    if( (mask & 255) == 0 )
                    {
                        j -= 8;
                        ptr -= 8;
                        continue;
                    }

                    __m128i c0 = _mm_setzero_si128(), c1 = c0, max0 = c0, max1 = c0;
                    for( k = 0; k < N; k++ )
                    {
                        __m128i x = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(ptr + pixel[k])), delta);
                        m0 = _mm_cmpgt_epi8(x, v0);
                        m1 = _mm_cmpgt_epi8(v1, x);

                        c0 = _mm_and_si128(_mm_sub_epi8(c0, m0), m0);
                        c1 = _mm_and_si128(_mm_sub_epi8(c1, m1), m1);

                        max0 = _mm_max_epu8(max0, c0);
                        max1 = _mm_max_epu8(max1, c1);
                    }

                    max0 = _mm_max_epu8(max0, max1);
                    int m = _mm_movemask_epi8(_mm_cmpgt_epi8(max0, K16));

                    for( k = 0; m > 0 && k < 16; k++, m >>= 1 )
                        if(m & 1)
                        {
                            cornerpos[ncorners++] = j+k;
                            if(nonmax_suppression)
                                curr[j+k] = (uchar)cornerScore<patternSize>(ptr+k, pixel, threshold);
                        }
                }
            }
#endif
            for( ; j < img.cols - 3; j++, ptr++ )
            {
                int v = ptr[0];
                const uchar* tab = &threshold_tab[0] - v + 255;
                int d = tab[ptr[pixel[0]]] | tab[ptr[pixel[8]]];

                if( d == 0 )
                    continue;

                d &= tab[ptr[pixel[2]]] | tab[ptr[pixel[10]]];
                d &= tab[ptr[pixel[4]]] | tab[ptr[pixel[12]]];
                d &= tab[ptr[pixel[6]]] | tab[ptr[pixel[14]]];

                if( d == 0 )
                    continue;

                d &= tab[ptr[pixel[1]]] | tab[ptr[pixel[9]]];
                d &= tab[ptr[pixel[3]]] | tab[ptr[pixel[11]]];
                d &= tab[ptr[pixel[5]]] | tab[ptr[pixel[13]]];
                d &= tab[ptr[pixel[7]]] | tab[ptr[pixel[15]]];

                if( d & 1 )
                {
                    int vt = v - threshold, count = 0;

                    for( k = 0; k < N; k++ )
                    {
                        int x = ptr[pixel[k]];
                        if(x < vt)
                        {
                            if( ++count > K )
                            {
                                cornerpos[ncorners++] = j;
                                if(nonmax_suppression)
                                    curr[j] = (uchar)cornerScore<patternSize>(ptr, pixel, threshold);
                                break;
                            }
                        }
                        else
                            count = 0;
                    }
                }

                if( d & 2 )
                {
                    int vt = v + threshold, count = 0;

                    for( k = 0; k < N; k++ )
                    {
                        int x = ptr[pixel[k]];
                        if(x > vt)
                        {
                            if( ++count > K )
                            {
                                cornerpos[ncorners++] = j;
                                if(nonmax_suppression)
                                    curr[j] = (uchar)cornerScore<patternSize>(ptr, pixel, threshold);
                                break;
                            }
                        }
                        else
                            count = 0;
                    }
                }
            }
        }

        cornerpos[-1] = ncorners;

        if( i == 3 )
            continue;

        const uchar* prev = buf[(i - 4 + 3)%3];
        const uchar* pprev = buf[(i - 5 + 3)%3];
        cornerpos = cpbuf[(i - 4 + 3)%3];
        ncorners = cornerpos[-1];

        for( k = 0; k < ncorners; k++ )
        {
            j = cornerpos[k];
            int score = prev[j];
            if( !nonmax_suppression ||
               (score > prev[j+1] && score > prev[j-1] &&
                score > pprev[j-1] && score > pprev[j] && score > pprev[j+1] &&
                score > curr[j-1] && score > curr[j] && score > curr[j+1]) )
            {
                keypoints.push_back(KeyPoint((float)j, (float)(i-1), 7.f, -1, (float)score));
            }
        }
    }
}
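// A scalar sketch of what threshold_tab encodes in FAST_t above: for a circle
// pixel x and center value v, the entry at index x - v + 255 is 1 ("much
// darker"), 2 ("much brighter") or 0. OR-ing the entries of two diametrically
// opposite pixels and AND-ing across the four pairs rejects most non-corners
// before the full 16-pixel arc test runs. Illustrative helper, not OpenCV API.
static inline int fast_classify(int x, int v, int threshold)
{
    int d = x - v;
    return d < -threshold ? 1 : d > threshold ? 2 : 0;
}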
// Hadamard transform
// Returns the difference between the weighted sums of the absolute values of
// the transformed coefficients of inA and inB.
static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
                          const uint16_t* const w) {
  int32_t sum[4];
  __m128i tmp_0, tmp_1, tmp_2, tmp_3;
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi16(1);
  const __m128i three = _mm_set1_epi16(3);

  // Load, combine and transpose inputs.
  {
    const __m128i inA_0 = _mm_loadl_epi64((__m128i*)&inA[BPS * 0]);
    const __m128i inA_1 = _mm_loadl_epi64((__m128i*)&inA[BPS * 1]);
    const __m128i inA_2 = _mm_loadl_epi64((__m128i*)&inA[BPS * 2]);
    const __m128i inA_3 = _mm_loadl_epi64((__m128i*)&inA[BPS * 3]);
    const __m128i inB_0 = _mm_loadl_epi64((__m128i*)&inB[BPS * 0]);
    const __m128i inB_1 = _mm_loadl_epi64((__m128i*)&inB[BPS * 1]);
    const __m128i inB_2 = _mm_loadl_epi64((__m128i*)&inB[BPS * 2]);
    const __m128i inB_3 = _mm_loadl_epi64((__m128i*)&inB[BPS * 3]);

    // Combine inA and inB (we'll do two transforms in parallel).
    const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0);
    const __m128i inAB_1 = _mm_unpacklo_epi8(inA_1, inB_1);
    const __m128i inAB_2 = _mm_unpacklo_epi8(inA_2, inB_2);
    const __m128i inAB_3 = _mm_unpacklo_epi8(inA_3, inB_3);
    // a00 b00 a01 b01 a02 b02 a03 b03   0 0 0 0 0 0 0 0
    // a10 b10 a11 b11 a12 b12 a13 b13   0 0 0 0 0 0 0 0
    // a20 b20 a21 b21 a22 b22 a23 b23   0 0 0 0 0 0 0 0
    // a30 b30 a31 b31 a32 b32 a33 b33   0 0 0 0 0 0 0 0

    // Transpose the two 4x4, discarding the filling zeroes.
    const __m128i transpose0_0 = _mm_unpacklo_epi8(inAB_0, inAB_2);
    const __m128i transpose0_1 = _mm_unpacklo_epi8(inAB_1, inAB_3);
    // a00 a20  b00 b20  a01 a21  b01 b21  a02 a22  b02 b22  a03 a23  b03 b23
    // a10 a30  b10 b30  a11 a31  b11 b31  a12 a32  b12 b32  a13 a33  b13 b33
    const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);
    const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);
    // a00 a10 a20 a30  b00 b10 b20 b30  a01 a11 a21 a31  b01 b11 b21 b31
    // a02 a12 a22 a32  b02 b12 b22 b32  a03 a13 a23 a33  b03 b13 b23 b33

    // Convert to 16b.
    tmp_0 = _mm_unpacklo_epi8(transpose1_0, zero);
    tmp_1 = _mm_unpackhi_epi8(transpose1_0, zero);
    tmp_2 = _mm_unpacklo_epi8(transpose1_1, zero);
    tmp_3 = _mm_unpackhi_epi8(transpose1_1, zero);
    // a00 a10 a20 a30   b00 b10 b20 b30
    // a01 a11 a21 a31   b01 b11 b21 b31
    // a02 a12 a22 a32   b02 b12 b22 b32
    // a03 a13 a23 a33   b03 b13 b23 b33
  }

  // Horizontal pass and subsequent transpose.
  {
    // Calculate a and b (two 4x4 at once).
    const __m128i a0 = _mm_slli_epi16(_mm_add_epi16(tmp_0, tmp_2), 2);
    const __m128i a1 = _mm_slli_epi16(_mm_add_epi16(tmp_1, tmp_3), 2);
    const __m128i a2 = _mm_slli_epi16(_mm_sub_epi16(tmp_1, tmp_3), 2);
    const __m128i a3 = _mm_slli_epi16(_mm_sub_epi16(tmp_0, tmp_2), 2);
    // b0_extra = (a0 != 0);
    const __m128i b0_extra = _mm_andnot_si128(_mm_cmpeq_epi16(a0, zero), one);
    const __m128i b0_base = _mm_add_epi16(a0, a1);
    const __m128i b1 = _mm_add_epi16(a3, a2);
    const __m128i b2 = _mm_sub_epi16(a3, a2);
    const __m128i b3 = _mm_sub_epi16(a0, a1);
    const __m128i b0 = _mm_add_epi16(b0_base, b0_extra);
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33

    // Transpose the two 4x4.
    const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1);
    const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3);
    const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1);
    const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3);
    // a00 a10 a01 a11   a02 a12 a03 a13
    // a20 a30 a21 a31   a22 a32 a23 a33
    // b00 b10 b01 b11   b02 b12 b03 b13
    // b20 b30 b21 b31   b22 b32 b23 b33
    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
    // a00 a10 a20 a30   a01 a11 a21 a31
    // b00 b10 b20 b30   b01 b11 b21 b31
    // a02 a12 a22 a32   a03 a13 a23 a33
    // b02 b12 b22 b32   b03 b13 b23 b33
    tmp_0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
    tmp_1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
    tmp_2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
    tmp_3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
    // a00 a10 a20 a30   b00 b10 b20 b30
    // a01 a11 a21 a31   b01 b11 b21 b31
    // a02 a12 a22 a32   b02 b12 b22 b32
    // a03 a13 a23 a33   b03 b13 b23 b33
  }

  // Vertical pass and difference of weighted sums.
  {
    // Load all inputs.
    // TODO(cduvivier): Make variable declarations and allocations aligned so
    //                  we can use _mm_load_si128 instead of _mm_loadu_si128.
    const __m128i w_0 = _mm_loadu_si128((__m128i*)&w[0]);
    const __m128i w_8 = _mm_loadu_si128((__m128i*)&w[8]);

    // Calculate a and b (two 4x4 at once).
    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
    const __m128i b0 = _mm_add_epi16(a0, a1);
    const __m128i b1 = _mm_add_epi16(a3, a2);
    const __m128i b2 = _mm_sub_epi16(a3, a2);
    const __m128i b3 = _mm_sub_epi16(a0, a1);

    // Separate the transforms of inA and inB.
    __m128i A_b0 = _mm_unpacklo_epi64(b0, b1);
    __m128i A_b2 = _mm_unpacklo_epi64(b2, b3);
    __m128i B_b0 = _mm_unpackhi_epi64(b0, b1);
    __m128i B_b2 = _mm_unpackhi_epi64(b2, b3);

    {
      // sign(b) = b >> 15  (0x0000 if positive, 0xffff if negative)
      const __m128i sign_A_b0 = _mm_srai_epi16(A_b0, 15);
      const __m128i sign_A_b2 = _mm_srai_epi16(A_b2, 15);
      const __m128i sign_B_b0 = _mm_srai_epi16(B_b0, 15);
      const __m128i sign_B_b2 = _mm_srai_epi16(B_b2, 15);

      // b = abs(b) = (b ^ sign) - sign
      A_b0 = _mm_xor_si128(A_b0, sign_A_b0);
      A_b2 = _mm_xor_si128(A_b2, sign_A_b2);
      B_b0 = _mm_xor_si128(B_b0, sign_B_b0);
      B_b2 = _mm_xor_si128(B_b2, sign_B_b2);
      A_b0 = _mm_sub_epi16(A_b0, sign_A_b0);
      A_b2 = _mm_sub_epi16(A_b2, sign_A_b2);
      B_b0 = _mm_sub_epi16(B_b0, sign_B_b0);
      B_b2 = _mm_sub_epi16(B_b2, sign_B_b2);
    }

    // b = abs(b) + 3
    A_b0 = _mm_add_epi16(A_b0, three);
    A_b2 = _mm_add_epi16(A_b2, three);
    B_b0 = _mm_add_epi16(B_b0, three);
    B_b2 = _mm_add_epi16(B_b2, three);

    // abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3
    // b = (abs(b) + 3) >> 3
    A_b0 = _mm_srai_epi16(A_b0, 3);
    A_b2 = _mm_srai_epi16(A_b2, 3);
    B_b0 = _mm_srai_epi16(B_b0, 3);
    B_b2 = _mm_srai_epi16(B_b2, 3);

    // weighted sums
    A_b0 = _mm_madd_epi16(A_b0, w_0);
    A_b2 = _mm_madd_epi16(A_b2, w_8);
    B_b0 = _mm_madd_epi16(B_b0, w_0);
    B_b2 = _mm_madd_epi16(B_b2, w_8);
    A_b0 = _mm_add_epi32(A_b0, A_b2);
    B_b0 = _mm_add_epi32(B_b0, B_b2);

    // difference of weighted sums
    A_b0 = _mm_sub_epi32(A_b0, B_b0);
    _mm_storeu_si128((__m128i*)&sum[0], A_b0);
  }
  return sum[0] + sum[1] + sum[2] + sum[3];
}
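// The branchless absolute value used in the vertical pass above, in scalar
// form: with s = b >> 15 (arithmetic shift: 0x0000 for b >= 0, 0xffff for
// b < 0), (b ^ s) - s equals |b| for 16-bit lanes. Sketch for reference only;
// it assumes the compiler implements >> on signed values as an arithmetic
// shift, as _mm_srai_epi16 does.
#include <stdint.h>
static inline int16_t abs16_ref(int16_t b) {
  const int16_t s = (int16_t)(b >> 15);
  return (int16_t)((b ^ s) - s);
}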