/* Resolve the hardware packet-type field of four rx descriptors into the
 * mbufs' packet_type via the driver's ptype lookup table. */
static inline void
desc_to_ptype_v(__m128i descs[4], struct rte_mbuf **rx_pkts, uint32_t *ptype_tbl)
{
	/* Gather the high 64-bit qwords of descriptor pairs (0,1) and (2,3). */
	__m128i hi_01 = _mm_unpackhi_epi64(descs[0], descs[1]);
	__m128i hi_23 = _mm_unpackhi_epi64(descs[2], descs[3]);

	/* The ptype index starts at bit 30 of each descriptor qword. */
	hi_01 = _mm_srli_epi64(hi_01, 30);
	hi_23 = _mm_srli_epi64(hi_23, 30);

	/* Byte 0 of each 64-bit lane now holds the table index. */
	rx_pkts[0]->packet_type = ptype_tbl[_mm_extract_epi8(hi_01, 0)];
	rx_pkts[1]->packet_type = ptype_tbl[_mm_extract_epi8(hi_01, 8)];
	rx_pkts[2]->packet_type = ptype_tbl[_mm_extract_epi8(hi_23, 0)];
	rx_pkts[3]->packet_type = ptype_tbl[_mm_extract_epi8(hi_23, 8)];
}
/* Horizontal maximum of the 16 signed bytes of a vector.
 * Log2 pairwise reduction: each step halves the window that still matters,
 * so lane 0 ends up holding the maximum of all 16 lanes. */
static inline int8_t _mm_hmax_epi8_rpl(__m128i a) {
    a = _mm_max_epi8(a, _mm_srli_si128(a, 1));
    a = _mm_max_epi8(a, _mm_srli_si128(a, 2));
    a = _mm_max_epi8(a, _mm_srli_si128(a, 4));
    a = _mm_max_epi8(a, _mm_srli_si128(a, 8));
    return _mm_extract_epi8(a, 0);
}
// Evaluator side of the generator-input consistency check: recombines the
// decommitments selected by `row`, decrypts the matching commitment pair
// from the input buffer, and records one bit of the generator-input hash.
// Advances m_i_bufr_ix past the consumed pair and bumps m_gen_inp_hash_ix.
void GarbledCct3::evl_next_gen_inp_com(const Bytes &row, size_t kx)
{
	// XOR together the decommitments whose bit is set in `row`.
	Bytes out(m_gen_inp_decom[0].size());
	for (size_t jx = 0; jx < Env::circuit().gen_inp_cnt(); jx++)
	{
		if (row.get_ith_bit(jx)) { out ^= m_gen_inp_decom[jx]; }
	}

	// The low bit of the combined decommitment selects which half of the
	// transmitted commitment pair to open.
	byte bit = out.get_ith_bit(0);

	static Bytes tmp;

	Bytes::iterator it = m_i_bufr_ix + bit*Env::key_size_in_bytes();

	__m128i aes_key, aes_plaintext, aes_ciphertext, out_key;

	// Key for the KDF is the first key_size bytes of the combined value,
	// zero-padded to a full 128-bit block.
	tmp.assign(out.begin(), out.begin()+Env::key_size_in_bytes());
	tmp.resize(16, 0);
	aes_key = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));

	// Encrypt the gate index kx as the plaintext.
	aes_plaintext = _mm_set1_epi64x((uint64_t)kx);
	KDF128((uint8_t*)&aes_plaintext, (uint8_t*)&aes_ciphertext, (uint8_t*)&aes_key);
	aes_ciphertext = _mm_and_si128(aes_ciphertext, m_clear_mask); // keep only k bits

	// Open the selected commitment half and unmask it with the KDF output.
	tmp.assign(it, it+Env::key_size_in_bytes());
	tmp.resize(16, 0);
	out_key = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));
	out_key = _mm_xor_si128(out_key, aes_ciphertext);

	// The recovered key's permutation (low) bit is one hash bit.
	bit = _mm_extract_epi8(out_key, 0) & 0x01;
	m_gen_inp_hash.set_ith_bit(kx, bit);

	// Both halves of the pair are consumed from the input buffer.
	m_i_bufr_ix += 2*Env::key_size_in_bytes();

	// tmp.resize(16);
	// _mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), out_key);
	// std::cout << "EVL " << m_gen_inp_hash_ix << " : " << tmp.to_hex() << std::endl;

	m_gen_inp_hash_ix++;
}
/* Scatter the 16 signed bytes of vH (bias-corrected) into the score table.
 * Element k of the vector belongs to segment row k*seglen+t, column d. */
static inline void arr_store(
        int *array,
        vec128i vH,
        int32_t t,
        int32_t seglen,
        int32_t d,
        int32_t dlen,
        int32_t bias)
{
    int8_t lane[16];
    /* One store, then sixteen scalar writes in ascending lane order. */
    _mm_storeu_si128((__m128i*)lane, vH);
    for (int k = 0; k < 16; ++k) {
        array[1LL*(k*seglen+t)*dlen + d] = lane[k] - bias;
    }
}
/* Write one anti-diagonal of the alignment table.  Vector lane 15 maps to
 * cell (i, j), lane 14 to (i+1, j-1), ..., lane 0 to (i+15, j-15); cells
 * outside the s1Len x s2Len table are skipped. */
static inline void arr_store_si128(
        int8_t *array,
        vec128i vWH,
        int32_t i,
        int32_t s1Len,
        int32_t j,
        int32_t s2Len)
{
    int8_t lane[16];
    _mm_storeu_si128((__m128i*)lane, vWH);
    for (int k = 0; k < 16; ++k) {
        const int32_t r = i + k;
        const int32_t c = j - k;
        if (0 <= r && r < s1Len && 0 <= c && c < s2Len) {
            array[1LL*r*s2Len + c] = lane[15-k];
        }
    }
}
/* SSE-accelerated turbo rate de-matching (8-bit LLRs): accumulates each input
 * soft bit into output[deinter[...]] using a precomputed de-interleaver LUT.
 * Returns 0 on success, SRSLTE_ERROR_INVALID_INPUTS on bad rv_idx/cb_idx.
 *
 * NOTE(review): _mm_extract_epi8/_mm_extract_epi16 are used with a loop
 * variable index; this only compiles because the 8-iteration loops are fully
 * unrolled at -O, making the index a compile-time constant.
 * NOTE(review): `inputCnt < in_len - 16` mixes int with uint32_t; if
 * in_len < 16 the subtraction wraps — presumably callers guarantee
 * in_len >= 16 on this path (in_len > out_len >= 3*K+12). Confirm. */
int srslte_rm_turbo_rx_lut_sse_8bit(int8_t *input, int8_t *output, uint16_t *deinter, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx)
{
  if (rv_idx < 4 && cb_idx < SRSLTE_NOF_TC_CB_SIZES) {
    /* Circular-buffer length for this code-block size (3 streams + tail). */
    uint32_t out_len = 3*srslte_cbsegm_cbsize(cb_idx)+12;
    const __m128i* xPtr = (const __m128i*) input;
    const __m128i* lutPtr = (const __m128i*) deinter;
    __m128i xVal, lutVal1, lutVal2;

    /* Simplify load if we do not need to wrap (ie high rates) */
    if (in_len <= out_len) {
      /* 16 input bytes per iteration; each needs two 8x16-bit LUT vectors. */
      for (int i=0;i<in_len/16;i++) {
        xVal = _mm_loadu_si128(xPtr); xPtr ++;
        lutVal1 = _mm_loadu_si128(lutPtr); lutPtr++;
        lutVal2 = _mm_loadu_si128(lutPtr); lutPtr ++;
        /* Accumulate bytes 0..7 through the first LUT vector. */
        for (int j=0;j<8;j++) {
          int8_t x = (int8_t) _mm_extract_epi8(xVal, j);
          uint16_t l = (uint16_t) _mm_extract_epi16(lutVal1, j);
          output[l] += x;
        }
        /* Accumulate bytes 8..15 through the second LUT vector. */
        for (int j=0;j<8;j++) {
          int8_t x = (int8_t) _mm_extract_epi8(xVal, j+8);
          uint16_t l = (uint16_t) _mm_extract_epi16(lutVal2, j);
          output[l] += x;
        }
      }
      /* Scalar tail for the last in_len%16 bytes. */
      for (int i=16*(in_len/16);i<in_len;i++) {
        output[deinter[i%out_len]] += input[i];
      }
    } else {
      /* Input is longer than the circular buffer: process 16 bytes at a time
       * and wrap the data/LUT pointers each time a buffer period completes. */
      int intCnt = 16;      /* bytes consumed within the current period */
      int inputCnt = 0;     /* total input bytes consumed */
      int nwrapps = 0;      /* completed circular-buffer periods */
      while(inputCnt < in_len - 16) {
        xVal = _mm_loadu_si128(xPtr); xPtr ++;
        lutVal1 = _mm_loadu_si128(lutPtr); lutPtr++;
        lutVal2 = _mm_loadu_si128(lutPtr); lutPtr ++;
        for (int j=0;j<8;j++) {
          int8_t x = (int8_t) _mm_extract_epi8(xVal, j);
          uint16_t l = (uint16_t) _mm_extract_epi16(lutVal1, j);
          output[l] += x;
        }
        for (int j=0;j<8;j++) {
          int8_t x = (int8_t) _mm_extract_epi8(xVal, j+8);
          uint16_t l = (uint16_t) _mm_extract_epi16(lutVal2, j);
          output[l] += x;
        }
        intCnt += 16;
        inputCnt += 16;
        if (intCnt >= out_len && inputCnt < in_len - 16) {
          /* Copy last elements */
          /* out_len%16 is either 12 or 4 here (out_len = 3*K+12), so finish
           * the remaining non-vector bytes of this period one by one. */
          if ((out_len%16) == 12) {
            for (int j=(nwrapps+1)*out_len-12;j<(nwrapps+1)*out_len;j++) {
              output[deinter[j%out_len]] += input[j];
              inputCnt++;
            }
          } else {
            for (int j=(nwrapps+1)*out_len-4;j<(nwrapps+1)*out_len;j++) {
              output[deinter[j%out_len]] += input[j];
              inputCnt++;
            }
          }
          /* And wrap pointers */
          nwrapps++;
          intCnt = 16;
          xPtr = (const __m128i*) &input[nwrapps*out_len];
          lutPtr = (const __m128i*) deinter;
        }
      }
      /* Scalar tail for whatever the vector loop did not consume. */
      for (int i=inputCnt;i<in_len;i++) {
        output[deinter[i%out_len]] += input[i];
      }
    }
    return 0;
  } else {
    printf("Invalid inputs rv_idx=%d, cb_idx=%d\n", rv_idx, cb_idx);
    return SRSLTE_ERROR_INVALID_INPUTS;
  }
}
// useful for debugging: renders the 16 bytes of x as lowercase hex,
// e.g. "[ 0 1a ff ... ]" (byte 0 first).
inline std::string str8(__m128i x)
{
    unsigned char bytes[16];
    _mm_storeu_si128(reinterpret_cast<__m128i*>(bytes), x);

    std::stringstream out;
    out << "[" << std::hex;
    for (int i = 0; i < 16; ++i) {
        out << " " << static_cast<int>(bytes[i]);
    }
    out << " ]";
    return out.str();
}
// Codegen test: _mm_extract_epi8 with immediate 1 must lower to an
// extractelement of lane 1 followed by zero-extension to i32.
int test_mm_extract_epi8(__m128i x) {
  // CHECK-LABEL: test_mm_extract_epi8
  // CHECK: extractelement <16 x i8> %{{.*}}, i32 1
  // CHECK: zext i8 %{{.*}} to i32
  return _mm_extract_epi8(x, 1);
}
/* Scatter the 16 signed bytes of vH into the score table: element k of the
 * vector goes to row k*seglen+t, column d. */
static inline void arr_store_si128(
        int *array,
        __m128i vH,
        int32_t t,
        int32_t seglen,
        int32_t d,
        int32_t dlen)
{
    int8_t lane[16];
    /* Spill the vector once, then write lanes in ascending order. */
    _mm_storeu_si128((__m128i*)lane, vH);
    for (int k = 0; k < 16; ++k) {
        array[(k*seglen+t)*dlen + d] = lane[k];
    }
}
void GarbledCct::evl_next_gate(const Gate ¤t_gate) { __m128i current_key, a; Bytes::const_iterator it; static Bytes tmp; if (current_gate.m_tag == Circuit::GEN_INP) { uint8_t bit = m_gen_inp_mask.get_ith_bit(m_gen_inp_ix); Bytes::iterator it = m_i_bufr_ix + bit*Env::key_size_in_bytes(); tmp = m_M[m_gen_inp_ix].to_bytes().hash(Env::k()); tmp.resize(16, 0); current_key = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0])); tmp.assign(it, it+Env::key_size_in_bytes()); tmp.resize(16, 0); a = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0])); m_i_bufr_ix += Env::key_size_in_bytes()*2; current_key = _mm_xor_si128(current_key, a); m_gen_inp_ix++; } else if (current_gate.m_tag == Circuit::EVL_INP) { uint8_t bit = m_evl_inp.get_ith_bit(m_evl_inp_ix); Bytes::iterator it = m_i_bufr_ix + bit*Env::key_size_in_bytes(); tmp = (*m_ot_keys)[m_evl_inp_ix]; tmp.resize(16, 0); current_key = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0])); tmp.assign(it, it+Env::key_size_in_bytes()); tmp.resize(16, 0); a = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0])); m_i_bufr_ix += Env::key_size_in_bytes()*2; current_key = _mm_xor_si128(current_key, a); m_evl_inp_ix++; } else { const vector<uint64_t> &inputs = current_gate.m_input_idx; #ifdef FREE_XOR if (is_xor(current_gate)) { current_key = inputs.size() == 2? 
_mm_xor_si128(m_w[inputs[0]], m_w[inputs[1]]) : _mm_load_si128(m_w+inputs[0]); } else #endif if (inputs.size() == 2) // 2-arity gates { __m128i aes_key[2], aes_plaintext, aes_ciphertext; aes_plaintext = _mm_set1_epi64x(m_gate_ix); aes_key[0] = _mm_load_si128(m_w+inputs[0]); aes_key[1] = _mm_load_si128(m_w+inputs[1]); const uint8_t perm_x = _mm_extract_epi8(aes_key[0], 0) & 0x01; const uint8_t perm_y = _mm_extract_epi8(aes_key[1], 0) & 0x01; KDF256((uint8_t*)&aes_plaintext, (uint8_t*)&aes_ciphertext, (uint8_t*)aes_key); aes_ciphertext = _mm_and_si128(aes_ciphertext, m_clear_mask); uint8_t garbled_ix = (perm_y<<1) | (perm_x<<0); #ifdef GRR if (garbled_ix == 0) { current_key = _mm_load_si128(&aes_ciphertext); } else { it = m_i_bufr_ix+(garbled_ix-1)*Env::key_size_in_bytes(); tmp.assign(it, it+Env::key_size_in_bytes()); tmp.resize(16, 0); a = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0])); current_key = _mm_xor_si128(aes_ciphertext, a); } m_i_bufr_ix += 3*Env::key_size_in_bytes(); #else it = m_i_bufr_ix + garbled_ix*Env::key_size_in_bytes(); tmp.assign(it, it+Env::key_size_in_bytes()); tmp.resize(16, 0); current_key = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0])); current_key = _mm_xor_si128(current_key, aes_ciphertext); m_i_bufr_ix += 4*Env::key_size_in_bytes(); #endif } else // 1-arity gates { __m128i aes_key, aes_plaintext, aes_ciphertext; aes_plaintext = _mm_set1_epi64x(m_gate_ix); aes_key = _mm_load_si128(m_w+inputs[0]); KDF128((uint8_t*)&aes_plaintext, (uint8_t*)&aes_ciphertext, (uint8_t*)&aes_key); aes_ciphertext = _mm_and_si128(aes_ciphertext, m_clear_mask); const uint8_t perm_x = _mm_extract_epi8(aes_key, 0) & 0x01; #ifdef GRR if (perm_x == 0) { current_key = _mm_load_si128(&aes_ciphertext); } else { tmp.assign(m_i_bufr_ix, m_i_bufr_ix+Env::key_size_in_bytes()); tmp.resize(16, 0); a = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0])); current_key = _mm_xor_si128(aes_ciphertext, a); } m_i_bufr_ix += Env::key_size_in_bytes(); #else it = 
m_i_bufr_ix + garbled_ix*Env::key_size_in_bytes(); tmp.assign(it, it+Env::key_size_in_bytes()); tmp.resize(16, 0); current_key = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0])); current_key = _mm_xor_si128(current_key, aes_ciphertext); m_i_bufr_ix += 2*Env::key_size_in_bytes(); #endif } if (current_gate.m_tag == Circuit::EVL_OUT) { uint8_t out_bit = _mm_extract_epi8(current_key, 0) & 0x01; out_bit ^= *m_i_bufr_ix; m_evl_out.set_ith_bit(m_evl_out_ix, out_bit); m_i_bufr_ix++; m_evl_out_ix++; } else if (current_gate.m_tag == Circuit::GEN_OUT) { // TODO: Ki08 implementation uint8_t out_bit = _mm_extract_epi8(current_key, 0) & 0x01; out_bit ^= *m_i_bufr_ix; m_gen_out.set_ith_bit(m_gen_out_ix, out_bit); m_i_bufr_ix++; // m_C[2*m_gen_out_ix+0] = Bytes(m_i_bufr_ix, m_i_bufr_ix+Env::key_size_in_bytes()); // m_i_bufr_ix += Env::key_size_in_bytes(); // // m_C[2*m_gen_out_ix+1] = Bytes(m_i_bufr_ix, m_i_bufr_ix+Env::key_size_in_bytes()); // m_i_bufr_ix += Env::key_size_in_bytes(); m_gen_out_ix++; } } _mm_store_si128(m_w+current_gate.m_idx, current_key); update_hash(m_i_bufr); m_gate_ix++; }
/*
** batch doubling from [MSK15]
**
** Computes dat[i] = 2^i * dat[0] (i = 1..PIPE) in one pass: the left shifts
** produce the raw doubled values and the carry terms are looked up in the
** precomputed TX/TY tables, indexed by the two top bytes of the (byte-
** swapped) input block.
** NOTE(review): `block`, `le()`, `TX`, `TY` and `PIPE` come from the
** enclosing file and are not visible here; TX/TY are presumed to hold the
** reduction constants for 1..8 doublings -- confirm against their
** definitions.
*/
__inline__ static void mul2_PIPE(__m128i *dat)
{
	unsigned a, b;
	block tmp = le(dat[0]); /* byte-swap so the carry bytes land at lanes 15/7 */
	block carry[PIPE];
	block up4, up8;
	/* Shuffle masks distributing the table qwords into per-shift carries
	 * (255 lanes are zeroed by _mm_shuffle_epi8). */
	const block sh1 = _mm_set_epi8(255, 255, 255, 255, 255, 255, 15, 14, 255, 255, 255, 255, 255, 255, 7, 6);
	const block sh2 = _mm_set_epi8(255, 255, 255, 255, 255, 255, 13, 12, 255, 255, 255, 255, 255, 255, 5, 4);
	const block sh3 = _mm_set_epi8(255, 255, 255, 255, 255, 255, 11, 10, 255, 255, 255, 255, 255, 255, 3, 2);
	const block sh4 = _mm_set_epi8(255, 255, 255, 255, 255, 255, 9, 8, 255, 255, 255, 255, 255, 255, 1, 0);
	const block *Txp = (const block*)TX;
	const block *Typ = (const block*)TY;
	/* Top byte of each 64-bit half selects the carry-table entries. */
	a = _mm_extract_epi8(tmp, 15);
	b = _mm_extract_epi8(tmp, 7);
	up4 = _mm_unpacklo_epi64(Txp[a], Typ[b]); /* carries for shifts 1..4 */
	up8 = _mm_unpackhi_epi64(Txp[a], Typ[b]); /* carries for shifts 5..8 */
	carry[0] = _mm_shuffle_epi8(up4, sh1);
	carry[1] = _mm_shuffle_epi8(up4, sh2);
	carry[2] = _mm_shuffle_epi8(up4, sh3);
	carry[3] = _mm_shuffle_epi8(up4, sh4);
#if(PIPE>=5)
	carry[4] = _mm_shuffle_epi8(up8, sh1);
#endif
#if(PIPE>=6)
	carry[5] = _mm_shuffle_epi8(up8, sh2);
#endif
#if(PIPE>=7)
	carry[6] = _mm_shuffle_epi8(up8, sh3);
#endif
#if(PIPE==8)
	carry[7] = _mm_shuffle_epi8(up8, sh4);
#endif
	/* Raw per-qword left shifts; cross-qword bits are fixed by the carries. */
	dat[1] = _mm_slli_epi64(tmp, 1);
	dat[2] = _mm_slli_epi64(tmp, 2);
	dat[3] = _mm_slli_epi64(tmp, 3);
	dat[4] = _mm_slli_epi64(tmp, 4);
#if(PIPE>=5)
	dat[5] = _mm_slli_epi64(tmp, 5);
#endif
#if(PIPE>=6)
	dat[6] = _mm_slli_epi64(tmp, 6);
#endif
#if(PIPE>=7)
	dat[7] = _mm_slli_epi64(tmp, 7);
#endif
#if (PIPE==8)
	dat[8] = (_mm_slli_epi64(tmp, 8));
#endif
	/* Merge carries and swap back to the external byte order. */
	dat[1] = le(_mm_xor_si128(dat[1], carry[0]));
	dat[2] = le(_mm_xor_si128(dat[2], carry[1]));
	dat[3] = le(_mm_xor_si128(dat[3], carry[2]));
	dat[4] = le(_mm_xor_si128(dat[4], carry[3]));
#if(PIPE>=5)
	dat[5] = le(_mm_xor_si128(dat[5], carry[4]));
#endif
#if(PIPE>=6)
	dat[6] = le(_mm_xor_si128(dat[6], carry[5]));
#endif
#if(PIPE>=7)
	dat[7] = le(_mm_xor_si128(dat[7], carry[6]));
#endif
#if (PIPE==8)
	dat[8] = le(_mm_xor_si128(dat[8], carry[7]));
#endif
}
// Codegen test: the immediate of _mm_extract_epi8 is masked to its low
// 4 bits, so index 16 selects lane 0 (extractelement ... i32 0).
int test_extract_epi8(__m128i x) {
  // CHECK-LABEL: test_extract_epi8
  // CHECK: extractelement <16 x i8> %{{.*}}, i32 0
  // CHECK-ASM: pextrb
  return _mm_extract_epi8(x, 16);
}
// Combines one byte pair (lanes row+k and row+8+k, k = 0,1) from each of the
// four 128-bit state words through the precomputed Ax lookup tables, XOR-ing
// the eight 64-bit table entries per output half.
// NOTE(review): `row` and `Ax` come from the enclosing scope (not visible
// here); `row` must be a compile-time constant since _mm_extract_epi8
// requires an immediate index -- presumably a template parameter. Confirm.
TARGET_SSE4_1 static inline __m128i extract(__m128i xmm0, __m128i xmm1, __m128i xmm2, __m128i xmm3)
{
	qint64 r0, r1;

	// Low 64-bit half: table lookups driven by lanes row+0 / row+8.
	r0 = Ax[0][static_cast<quint8>(_mm_extract_epi8(xmm0, row + 0))];
	r0 ^= Ax[1][static_cast<quint8>(_mm_extract_epi8(xmm0, row + 8))];
	r0 ^= Ax[2][static_cast<quint8>(_mm_extract_epi8(xmm1, row + 0))];
	r0 ^= Ax[3][static_cast<quint8>(_mm_extract_epi8(xmm1, row + 8))];
	r0 ^= Ax[4][static_cast<quint8>(_mm_extract_epi8(xmm2, row + 0))];
	r0 ^= Ax[5][static_cast<quint8>(_mm_extract_epi8(xmm2, row + 8))];
	r0 ^= Ax[6][static_cast<quint8>(_mm_extract_epi8(xmm3, row + 0))];
	r0 ^= Ax[7][static_cast<quint8>(_mm_extract_epi8(xmm3, row + 8))];

	// High 64-bit half: same pattern, lanes row+1 / row+9.
	r1 = Ax[0][static_cast<quint8>(_mm_extract_epi8(xmm0, row + 1))];
	r1 ^= Ax[1][static_cast<quint8>(_mm_extract_epi8(xmm0, row + 9))];
	r1 ^= Ax[2][static_cast<quint8>(_mm_extract_epi8(xmm1, row + 1))];
	r1 ^= Ax[3][static_cast<quint8>(_mm_extract_epi8(xmm1, row + 9))];
	r1 ^= Ax[4][static_cast<quint8>(_mm_extract_epi8(xmm2, row + 1))];
	r1 ^= Ax[5][static_cast<quint8>(_mm_extract_epi8(xmm2, row + 9))];
	r1 ^= Ax[6][static_cast<quint8>(_mm_extract_epi8(xmm3, row + 1))];
	r1 ^= Ax[7][static_cast<quint8>(_mm_extract_epi8(xmm3, row + 9))];

	// Pack the two halves: r0 into the low qword, r1 into the high qword.
	return _mm_insert_epi64(_mm_cvtsi64_si128(r0), r1, 1);
}
// MMX variant of the table-combining step above: accumulates the same Ax
// lookups into two __m64 halves instead of 64-bit integers.
// NOTE(review): `row`, `Ax` and `_mm_xor_64` come from the enclosing scope;
// `row` must be a compile-time constant (immediate for _mm_extract_epi8),
// and `_mm_xor_64` is presumably a project alias for the MMX XOR -- confirm.
TARGET_SSE4_1 static inline __m128i extract(__m128i xmm0, __m128i xmm1, __m128i xmm2, __m128i xmm3)
{
	__m64 mm0, mm1;

	// Low half: lookups driven by lanes row+0 / row+8 of each state word.
	mm0 = _mm_cvtsi64_m64(Ax[0][static_cast<quint8>(_mm_extract_epi8(xmm0, row + 0))]);
	mm0 = _mm_xor_64(mm0, Ax[1][static_cast<quint8>(_mm_extract_epi8(xmm0, row + 8))]);
	mm0 = _mm_xor_64(mm0, Ax[2][static_cast<quint8>(_mm_extract_epi8(xmm1, row + 0))]);
	mm0 = _mm_xor_64(mm0, Ax[3][static_cast<quint8>(_mm_extract_epi8(xmm1, row + 8))]);
	mm0 = _mm_xor_64(mm0, Ax[4][static_cast<quint8>(_mm_extract_epi8(xmm2, row + 0))]);
	mm0 = _mm_xor_64(mm0, Ax[5][static_cast<quint8>(_mm_extract_epi8(xmm2, row + 8))]);
	mm0 = _mm_xor_64(mm0, Ax[6][static_cast<quint8>(_mm_extract_epi8(xmm3, row + 0))]);
	mm0 = _mm_xor_64(mm0, Ax[7][static_cast<quint8>(_mm_extract_epi8(xmm3, row + 8))]);

	// High half: lanes row+1 / row+9.
	mm1 = _mm_cvtsi64_m64(Ax[0][static_cast<quint8>(_mm_extract_epi8(xmm0, row + 1))]);
	mm1 = _mm_xor_64(mm1, Ax[1][static_cast<quint8>(_mm_extract_epi8(xmm0, row + 9))]);
	mm1 = _mm_xor_64(mm1, Ax[2][static_cast<quint8>(_mm_extract_epi8(xmm1, row + 1))]);
	mm1 = _mm_xor_64(mm1, Ax[3][static_cast<quint8>(_mm_extract_epi8(xmm1, row + 9))]);
	mm1 = _mm_xor_64(mm1, Ax[4][static_cast<quint8>(_mm_extract_epi8(xmm2, row + 1))]);
	mm1 = _mm_xor_64(mm1, Ax[5][static_cast<quint8>(_mm_extract_epi8(xmm2, row + 9))]);
	mm1 = _mm_xor_64(mm1, Ax[6][static_cast<quint8>(_mm_extract_epi8(xmm3, row + 1))]);
	mm1 = _mm_xor_64(mm1, Ax[7][static_cast<quint8>(_mm_extract_epi8(xmm3, row + 9))]);

	// mm0 becomes the low qword, mm1 the high qword of the result.
	return _mm_set_epi64(mm1, mm0);
}
// Codegen test: out-of-range immediate 16 is masked to its low 4 bits,
// so the intrinsic extracts lane 0.
int test_extract_epi8(__m128i __a) {
  // CHECK-LABEL: @test_extract_epi8
  // CHECK: extractelement <16 x i8> %{{.*}}, i32 0
  return _mm_extract_epi8(__a, 16);
}
/* Scatter the 16 signed bytes of vH (bias-corrected) into the column
 * buffer: element k of the vector goes to slot k*seglen+t. */
static inline void arr_store_col(
        int *col,
        vec128i vH,
        int32_t t,
        int32_t seglen,
        int32_t bias)
{
    int8_t lane[16];
    _mm_storeu_si128((__m128i*)lane, vH);
    for (int k = 0; k < 16; ++k) {
        col[k*seglen+t] = lane[k] - bias;
    }
}
void GarbledCct::gen_next_gate(const Gate ¤t_gate) { __m128i current_zero_key; if (current_gate.m_tag == Circuit::GEN_INP) { __m128i a[2]; // zero_key = m_prng.rand(Env::k()); static Bytes tmp; tmp = m_prng.rand(Env::k()); tmp.resize(16, 0); current_zero_key = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0])); // a[0] = m_M[2*m_gen_inp_ix+0].to_bytes().hash(Env::k()); tmp = m_M[2*m_gen_inp_ix+0].to_bytes().hash(Env::k()); tmp.resize(16, 0); a[0] = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0])); // a[1] = m_M[2*m_gen_inp_ix+1].to_bytes().hash(Env::k()); tmp = m_M[2*m_gen_inp_ix+1].to_bytes().hash(Env::k()); tmp.resize(16, 0); a[1] = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0])); // a[0] ^= zero_key; a[1] ^= zero_key ^ R; a[0] = _mm_xor_si128(a[0], current_zero_key); a[1] = _mm_xor_si128(a[1], _mm_xor_si128(current_zero_key, m_R)); uint8_t bit = m_gen_inp_mask.get_ith_bit(m_gen_inp_ix); // m_o_bufr += a[bit]; _mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), a[bit]); m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes()); // m_o_bufr += a[1-bit]; _mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), a[1-bit]); m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes()); m_gen_inp_ix++; } else if (current_gate.m_tag == Circuit::EVL_INP) { __m128i a[2]; // zero_key = m_prng.rand(Env::k()); static Bytes tmp; tmp = m_prng.rand(Env::k()); tmp.resize(16, 0); current_zero_key = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0])); // a[0] = (*m_ot_keys)[2*m_evl_inp_ix+0]; tmp = (*m_ot_keys)[2*m_evl_inp_ix+0]; tmp.resize(16, 0); a[0] = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0])); // a[1] = (*m_ot_keys)[2*m_evl_inp_ix+1]; tmp = (*m_ot_keys)[2*m_evl_inp_ix+1]; tmp.resize(16, 0); a[1] = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0])); // a[0] ^= zero_key; a[1] ^= zero_key ^ R; a[0] = _mm_xor_si128(a[0], current_zero_key); a[1] = _mm_xor_si128(a[1], _mm_xor_si128(current_zero_key, m_R)); // 
m_o_bufr += a[0]; _mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), a[0]); m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes()); // m_o_bufr += a[1]; _mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), a[1]); m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes()); m_evl_inp_ix++; } else { const vector<uint64_t> &inputs = current_gate.m_input_idx; assert(inputs.size() == 1 || inputs.size() == 2); #ifdef FREE_XOR if (is_xor(current_gate)) { current_zero_key = inputs.size() == 2? _mm_xor_si128(m_w[inputs[0]], m_w[inputs[1]]) : _mm_load_si128(m_w+inputs[0]); } else #endif if (inputs.size() == 2) // 2-arity gates { uint8_t bit; __m128i aes_key[2], aes_plaintext, aes_ciphertext; __m128i X[2], Y[2], Z[2]; static Bytes tmp(16, 0); aes_plaintext = _mm_set1_epi64x(m_gate_ix); X[0] = _mm_load_si128(m_w+inputs[0]); Y[0] = _mm_load_si128(m_w+inputs[1]); X[1] = _mm_xor_si128(X[0], m_R); // X[1] = X[0] ^ R Y[1] = _mm_xor_si128(Y[0], m_R); // Y[1] = Y[0] ^ R const uint8_t perm_x = _mm_extract_epi8(X[0], 0) & 0x01; // permutation bit for X const uint8_t perm_y = _mm_extract_epi8(Y[0], 0) & 0x01; // permutation bit for Y const uint8_t de_garbled_ix = (perm_y<<1)|perm_x; // encrypt the 0-th entry : (X[x], Y[y]) aes_key[0] = _mm_load_si128(X+perm_x); aes_key[1] = _mm_load_si128(Y+perm_y); KDF256((uint8_t*)&aes_plaintext, (uint8_t*)&aes_ciphertext, (uint8_t*)aes_key); aes_ciphertext = _mm_and_si128(aes_ciphertext, m_clear_mask); // clear extra bits so that only k bits left bit = current_gate.m_table[de_garbled_ix]; #ifdef GRR // GRR technique: using zero entry's key as one of the output keys _mm_store_si128(Z+bit, aes_ciphertext); Z[1-bit] = _mm_xor_si128(Z[bit], m_R); current_zero_key = _mm_load_si128(Z); #else tmp = m_prng.rand(Env::k()); tmp.resize(16, 0); Z[0] = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0])); Z[1] = _mm_xor_si128(Z[0], m_R); aes_ciphertext = _mm_xor_si128(aes_ciphertext, Z[bit]); 
_mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), aes_ciphertext); m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes()); #endif // encrypt the 1st entry : (X[1-x], Y[y]) aes_key[0] = _mm_xor_si128(aes_key[0], m_R); KDF256((uint8_t*)&aes_plaintext, (uint8_t*)&aes_ciphertext, (uint8_t*)aes_key); aes_ciphertext = _mm_and_si128(aes_ciphertext, m_clear_mask); bit = current_gate.m_table[0x01^de_garbled_ix]; aes_ciphertext = _mm_xor_si128(aes_ciphertext, Z[bit]); _mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), aes_ciphertext); m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes()); // encrypt the 2nd entry : (X[x], Y[1-y]) aes_key[0] = _mm_xor_si128(aes_key[0], m_R); aes_key[1] = _mm_xor_si128(aes_key[1], m_R); KDF256((uint8_t*)&aes_plaintext, (uint8_t*)&aes_ciphertext, (uint8_t*)aes_key); aes_ciphertext = _mm_and_si128(aes_ciphertext, m_clear_mask); bit = current_gate.m_table[0x02^de_garbled_ix]; aes_ciphertext = _mm_xor_si128(aes_ciphertext, Z[bit]); _mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), aes_ciphertext); m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes()); // encrypt the 3rd entry : (X[1-x], Y[1-y]) aes_key[0] = _mm_xor_si128(aes_key[0], m_R); KDF256((uint8_t*)&aes_plaintext, (uint8_t*)&aes_ciphertext, (uint8_t*)aes_key); aes_ciphertext = _mm_and_si128(aes_ciphertext, m_clear_mask); bit = current_gate.m_table[0x03^de_garbled_ix]; aes_ciphertext = _mm_xor_si128(aes_ciphertext, Z[bit]); _mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), aes_ciphertext); m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes()); } else // 1-arity gates { uint8_t bit; __m128i aes_key, aes_plaintext, aes_ciphertext; __m128i X[2], Z[2]; static Bytes tmp; tmp.assign(16, 0); aes_plaintext = _mm_set1_epi64x(m_gate_ix); X[0] = _mm_load_si128(m_w+inputs[0]); X[1] = _mm_xor_si128(X[0], m_R); const uint8_t perm_x = _mm_extract_epi8(X[0], 0) & 0x01; // 0-th 
entry : X[x] aes_key = _mm_load_si128(X+perm_x); KDF128((uint8_t*)&aes_plaintext, (uint8_t*)&aes_ciphertext, (uint8_t*)&aes_key); aes_ciphertext = _mm_and_si128(aes_ciphertext, m_clear_mask); bit = current_gate.m_table[perm_x]; #ifdef GRR _mm_store_si128(Z+bit, aes_ciphertext); Z[1-bit] = _mm_xor_si128(Z[bit], m_R); current_zero_key = _mm_load_si128(Z); #else tmp = m_prng.rand(Env::k()); tmp.resize(16, 0); Z[0] = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0])); Z[1] = _mm_xor_si128(Z[0], m_R); aes_ciphertext = _mm_xor_si128(aes_ciphertext, Z[bit]); _mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), aes_ciphertext); m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes()); #endif // 1-st entry : X[1-x] aes_key = _mm_xor_si128(aes_key, m_R); KDF128((uint8_t*)&aes_plaintext, (uint8_t*)&aes_ciphertext, (uint8_t*)&aes_key); aes_ciphertext = _mm_and_si128(aes_ciphertext, m_clear_mask); bit = current_gate.m_table[0x01^perm_x]; aes_ciphertext = _mm_xor_si128(aes_ciphertext, Z[bit]); _mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), aes_ciphertext); m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes()); } if (current_gate.m_tag == Circuit::EVL_OUT) { m_o_bufr.push_back(_mm_extract_epi8(current_zero_key, 0) & 0x01); // permutation bit } else if (current_gate.m_tag == Circuit::GEN_OUT) { m_o_bufr.push_back(_mm_extract_epi8(current_zero_key, 0) & 0x01); // permutation bit // // TODO: C[ix_0] = w[ix0] || randomness, C[ix_1] = w[ix1] || randomness // m_o_bufr += (key_pair[0] + m_prng.rand(Env::k())).hash(Env::k()); // m_o_bufr += (key_pair[1] + m_prng.rand(Env::k())).hash(Env::k()); } } _mm_store_si128(m_w+current_gate.m_idx, current_zero_key); m_gate_ix++; }
/* Runtime check of _mm_extract_epi8: fills a vector with the byte pattern
 * 0x01..0x10, extracts each lane via the msk0..msk15 immediates (defined by
 * the enclosing test harness -- not visible here), and verifies each result
 * against the same byte read through the union's char view. */
static void
TEST (void)
{
  union
    {
      __m128i x;
      int i[4];
      char c[16];
    } val1;
  int res[16], masks[16];
  int i;

  /* Bytes 0..15 hold 0x01..0x10 in increasing address order. */
  val1.i[0] = 0x04030201;
  val1.i[1] = 0x08070605;
  val1.i[2] = 0x0C0B0A09;
  val1.i[3] = 0x100F0E0D;

  res[0] = _mm_extract_epi8 (val1.x, msk0);
  res[1] = _mm_extract_epi8 (val1.x, msk1);
  res[2] = _mm_extract_epi8 (val1.x, msk2);
  res[3] = _mm_extract_epi8 (val1.x, msk3);
  res[4] = _mm_extract_epi8 (val1.x, msk4);
  res[5] = _mm_extract_epi8 (val1.x, msk5);
  res[6] = _mm_extract_epi8 (val1.x, msk6);
  res[7] = _mm_extract_epi8 (val1.x, msk7);
  res[8] = _mm_extract_epi8 (val1.x, msk8);
  res[9] = _mm_extract_epi8 (val1.x, msk9);
  res[10] = _mm_extract_epi8 (val1.x, msk10);
  res[11] = _mm_extract_epi8 (val1.x, msk11);
  res[12] = _mm_extract_epi8 (val1.x, msk12);
  res[13] = _mm_extract_epi8 (val1.x, msk13);
  res[14] = _mm_extract_epi8 (val1.x, msk14);
  res[15] = _mm_extract_epi8 (val1.x, msk15);

  /* Record the immediates so the reference check can index the union. */
  masks[0] = msk0;
  masks[1] = msk1;
  masks[2] = msk2;
  masks[3] = msk3;
  masks[4] = msk4;
  masks[5] = msk5;
  masks[6] = msk6;
  masks[7] = msk7;
  masks[8] = msk8;
  masks[9] = msk9;
  masks[10] = msk10;
  masks[11] = msk11;
  masks[12] = msk12;
  masks[13] = msk13;
  masks[14] = msk14;
  masks[15] = msk15;

  /* Each extracted lane must equal the byte at the same index. */
  for (i = 0; i < 16; i++)
    if (res[i] != val1.c [masks[i]])
      abort ();
}
// Extract byte 4 of x and sign-extend it (via char) to a 64-bit integer.
long long int foo8(__m128i x) {
  const char byte4 = (char) _mm_extract_epi8(x, 4);
  return byte4;
}
// Extract byte 4 of x; _mm_extract_epi8 zero-extends, so the result
// is the unsigned byte value 0..255.
unsigned long long int foo8(__m128i x) {
  const int byte4 = _mm_extract_epi8(x, 4);
  return byte4;
}
/* Scatter the 16 signed bytes of vH into the column buffer: element k of
 * the vector goes to slot k*seglen+t. */
static inline void arr_store_col(
        int *col,
        __m128i vH,
        int32_t t,
        int32_t seglen)
{
    int8_t lane[16];
    _mm_storeu_si128((__m128i*)lane, vH);
    for (int k = 0; k < 16; ++k) {
        col[k*seglen+t] = lane[k];
    }
}
/* Record the last-row / last-column cells touched by one anti-diagonal.
 * Vector lane 15 maps to cell (i, j), lane 14 to (i+1, j-1), ..., lane 0 to
 * (i+15, j-15).  A lane lands in `row` when its cell sits on the final row
 * of the table, and in `col` when it sits on the final column. */
static inline void arr_store_rowcol(
        int *row,
        int *col,
        vec128i vWH,
        int32_t i,
        int32_t s1Len,
        int32_t j,
        int32_t s2Len)
{
    int8_t lane[16];
    _mm_storeu_si128((__m128i*)lane, vWH);
    for (int k = 0; k < 16; ++k) {
        const int32_t r = i + k;
        const int32_t c = j - k;
        const int8_t v = lane[15-k];
        if (r == s1Len-1 && 0 <= c && c < s2Len) {
            row[c] = v;
        }
        if (c == s2Len-1 && 0 <= r && r < s1Len) {
            col[r] = v;
        }
    }
}