Exemple #1
0
static inline void
desc_to_ptype_v(__m128i descs[4], struct rte_mbuf **rx_pkts,
		uint32_t *ptype_tbl)
{
	__m128i ptype0 = _mm_unpackhi_epi64(descs[0], descs[1]);
	__m128i ptype1 = _mm_unpackhi_epi64(descs[2], descs[3]);

	ptype0 = _mm_srli_epi64(ptype0, 30);
	ptype1 = _mm_srli_epi64(ptype1, 30);

	rx_pkts[0]->packet_type = ptype_tbl[_mm_extract_epi8(ptype0, 0)];
	rx_pkts[1]->packet_type = ptype_tbl[_mm_extract_epi8(ptype0, 8)];
	rx_pkts[2]->packet_type = ptype_tbl[_mm_extract_epi8(ptype1, 0)];
	rx_pkts[3]->packet_type = ptype_tbl[_mm_extract_epi8(ptype1, 8)];
}
static inline int8_t _mm_hmax_epi8_rpl(__m128i a) {
    a = _mm_max_epi8(a, _mm_srli_si128(a, 8));
    a = _mm_max_epi8(a, _mm_srli_si128(a, 4));
    a = _mm_max_epi8(a, _mm_srli_si128(a, 2));
    a = _mm_max_epi8(a, _mm_srli_si128(a, 1));
    return _mm_extract_epi8(a, 0);
}
Exemple #3
0
void GarbledCct3::evl_next_gen_inp_com(const Bytes &row, size_t kx)
{
	Bytes out(m_gen_inp_decom[0].size());

	for (size_t jx = 0; jx < Env::circuit().gen_inp_cnt(); jx++)
	{
		if (row.get_ith_bit(jx)) { out ^= m_gen_inp_decom[jx]; }
	}

	byte bit = out.get_ith_bit(0);

	static Bytes tmp;

	Bytes::iterator it = m_i_bufr_ix + bit*Env::key_size_in_bytes();

	__m128i aes_key, aes_plaintext, aes_ciphertext, out_key;

	tmp.assign(out.begin(), out.begin()+Env::key_size_in_bytes());
	tmp.resize(16, 0);
	aes_key = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));

	aes_plaintext = _mm_set1_epi64x((uint64_t)kx);

	KDF128((uint8_t*)&aes_plaintext, (uint8_t*)&aes_ciphertext, (uint8_t*)&aes_key);
	aes_ciphertext = _mm_and_si128(aes_ciphertext, m_clear_mask);

	tmp.assign(it, it+Env::key_size_in_bytes());
	tmp.resize(16, 0);
	out_key = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));
	out_key = _mm_xor_si128(out_key, aes_ciphertext);

	bit = _mm_extract_epi8(out_key, 0) & 0x01;
	m_gen_inp_hash.set_ith_bit(kx, bit);

	m_i_bufr_ix += 2*Env::key_size_in_bytes();

//	tmp.resize(16);
//	_mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), out_key);
//	std::cout << "EVL " << m_gen_inp_hash_ix << " : " << tmp.to_hex() << std::endl;

	m_gen_inp_hash_ix++;
}
static inline void arr_store(
        int *array,
        vec128i vH,
        int32_t t,
        int32_t seglen,
        int32_t d,
        int32_t dlen,
        int32_t bias)
{
    array[1LL*( 0*seglen+t)*dlen + d] = (int8_t)_mm_extract_epi8(vH,  0) - bias;
    array[1LL*( 1*seglen+t)*dlen + d] = (int8_t)_mm_extract_epi8(vH,  1) - bias;
    array[1LL*( 2*seglen+t)*dlen + d] = (int8_t)_mm_extract_epi8(vH,  2) - bias;
    array[1LL*( 3*seglen+t)*dlen + d] = (int8_t)_mm_extract_epi8(vH,  3) - bias;
    array[1LL*( 4*seglen+t)*dlen + d] = (int8_t)_mm_extract_epi8(vH,  4) - bias;
    array[1LL*( 5*seglen+t)*dlen + d] = (int8_t)_mm_extract_epi8(vH,  5) - bias;
    array[1LL*( 6*seglen+t)*dlen + d] = (int8_t)_mm_extract_epi8(vH,  6) - bias;
    array[1LL*( 7*seglen+t)*dlen + d] = (int8_t)_mm_extract_epi8(vH,  7) - bias;
    array[1LL*( 8*seglen+t)*dlen + d] = (int8_t)_mm_extract_epi8(vH,  8) - bias;
    array[1LL*( 9*seglen+t)*dlen + d] = (int8_t)_mm_extract_epi8(vH,  9) - bias;
    array[1LL*(10*seglen+t)*dlen + d] = (int8_t)_mm_extract_epi8(vH, 10) - bias;
    array[1LL*(11*seglen+t)*dlen + d] = (int8_t)_mm_extract_epi8(vH, 11) - bias;
    array[1LL*(12*seglen+t)*dlen + d] = (int8_t)_mm_extract_epi8(vH, 12) - bias;
    array[1LL*(13*seglen+t)*dlen + d] = (int8_t)_mm_extract_epi8(vH, 13) - bias;
    array[1LL*(14*seglen+t)*dlen + d] = (int8_t)_mm_extract_epi8(vH, 14) - bias;
    array[1LL*(15*seglen+t)*dlen + d] = (int8_t)_mm_extract_epi8(vH, 15) - bias;
}
static inline void arr_store_si128(
        int8_t *array,
        vec128i vWH,
        int32_t i,
        int32_t s1Len,
        int32_t j,
        int32_t s2Len)
{
    if (0 <= i+0 && i+0 < s1Len && 0 <= j-0 && j-0 < s2Len) {
        array[1LL*(i+0)*s2Len + (j-0)] = (int8_t)_mm_extract_epi8(vWH, 15);
    }
    if (0 <= i+1 && i+1 < s1Len && 0 <= j-1 && j-1 < s2Len) {
        array[1LL*(i+1)*s2Len + (j-1)] = (int8_t)_mm_extract_epi8(vWH, 14);
    }
    if (0 <= i+2 && i+2 < s1Len && 0 <= j-2 && j-2 < s2Len) {
        array[1LL*(i+2)*s2Len + (j-2)] = (int8_t)_mm_extract_epi8(vWH, 13);
    }
    if (0 <= i+3 && i+3 < s1Len && 0 <= j-3 && j-3 < s2Len) {
        array[1LL*(i+3)*s2Len + (j-3)] = (int8_t)_mm_extract_epi8(vWH, 12);
    }
    if (0 <= i+4 && i+4 < s1Len && 0 <= j-4 && j-4 < s2Len) {
        array[1LL*(i+4)*s2Len + (j-4)] = (int8_t)_mm_extract_epi8(vWH, 11);
    }
    if (0 <= i+5 && i+5 < s1Len && 0 <= j-5 && j-5 < s2Len) {
        array[1LL*(i+5)*s2Len + (j-5)] = (int8_t)_mm_extract_epi8(vWH, 10);
    }
    if (0 <= i+6 && i+6 < s1Len && 0 <= j-6 && j-6 < s2Len) {
        array[1LL*(i+6)*s2Len + (j-6)] = (int8_t)_mm_extract_epi8(vWH, 9);
    }
    if (0 <= i+7 && i+7 < s1Len && 0 <= j-7 && j-7 < s2Len) {
        array[1LL*(i+7)*s2Len + (j-7)] = (int8_t)_mm_extract_epi8(vWH, 8);
    }
    if (0 <= i+8 && i+8 < s1Len && 0 <= j-8 && j-8 < s2Len) {
        array[1LL*(i+8)*s2Len + (j-8)] = (int8_t)_mm_extract_epi8(vWH, 7);
    }
    if (0 <= i+9 && i+9 < s1Len && 0 <= j-9 && j-9 < s2Len) {
        array[1LL*(i+9)*s2Len + (j-9)] = (int8_t)_mm_extract_epi8(vWH, 6);
    }
    if (0 <= i+10 && i+10 < s1Len && 0 <= j-10 && j-10 < s2Len) {
        array[1LL*(i+10)*s2Len + (j-10)] = (int8_t)_mm_extract_epi8(vWH, 5);
    }
    if (0 <= i+11 && i+11 < s1Len && 0 <= j-11 && j-11 < s2Len) {
        array[1LL*(i+11)*s2Len + (j-11)] = (int8_t)_mm_extract_epi8(vWH, 4);
    }
    if (0 <= i+12 && i+12 < s1Len && 0 <= j-12 && j-12 < s2Len) {
        array[1LL*(i+12)*s2Len + (j-12)] = (int8_t)_mm_extract_epi8(vWH, 3);
    }
    if (0 <= i+13 && i+13 < s1Len && 0 <= j-13 && j-13 < s2Len) {
        array[1LL*(i+13)*s2Len + (j-13)] = (int8_t)_mm_extract_epi8(vWH, 2);
    }
    if (0 <= i+14 && i+14 < s1Len && 0 <= j-14 && j-14 < s2Len) {
        array[1LL*(i+14)*s2Len + (j-14)] = (int8_t)_mm_extract_epi8(vWH, 1);
    }
    if (0 <= i+15 && i+15 < s1Len && 0 <= j-15 && j-15 < s2Len) {
        array[1LL*(i+15)*s2Len + (j-15)] = (int8_t)_mm_extract_epi8(vWH, 0);
    }
}
Exemple #6
0
int srslte_rm_turbo_rx_lut_sse_8bit(int8_t *input, int8_t *output, uint16_t *deinter, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx)
{
  if (rv_idx < 4 && cb_idx < SRSLTE_NOF_TC_CB_SIZES) {
    uint32_t out_len = 3*srslte_cbsegm_cbsize(cb_idx)+12;

    const __m128i* xPtr   = (const __m128i*) input;
    const __m128i* lutPtr = (const __m128i*) deinter;
    __m128i xVal, lutVal1, lutVal2;

    /* Simplify load if we do not need to wrap (ie high rates) */
    if (in_len <= out_len) {
      for (int i=0;i<in_len/16;i++) {
        xVal   = _mm_loadu_si128(xPtr);
        xPtr ++;
        lutVal1 = _mm_loadu_si128(lutPtr);
        lutPtr++;
        lutVal2 = _mm_loadu_si128(lutPtr);
        lutPtr ++;

        for (int j=0;j<8;j++) {
          int8_t x  = (int8_t)  _mm_extract_epi8(xVal,   j);
          uint16_t l = (uint16_t) _mm_extract_epi16(lutVal1, j);
          output[l] += x;
        }
        for (int j=0;j<8;j++) {
          int8_t x  = (int8_t)  _mm_extract_epi8(xVal,   j+8);
          uint16_t l = (uint16_t) _mm_extract_epi16(lutVal2, j);
          output[l] += x;
        }
      }
      for (int i=16*(in_len/16);i<in_len;i++) {
        output[deinter[i%out_len]] += input[i];
      }
    } else {
      int intCnt = 16;
      int inputCnt = 0;
      int nwrapps = 0;
      while(inputCnt < in_len - 16) {
        xVal   = _mm_loadu_si128(xPtr);
        xPtr ++;
        lutVal1 = _mm_loadu_si128(lutPtr);
        lutPtr++;
        lutVal2 = _mm_loadu_si128(lutPtr);
        lutPtr ++;

        for (int j=0;j<8;j++) {
          int8_t x  = (int8_t)  _mm_extract_epi8(xVal,   j);
          uint16_t l = (uint16_t) _mm_extract_epi16(lutVal1, j);
          output[l] += x;
        }
        for (int j=0;j<8;j++) {
          int8_t x  = (int8_t)  _mm_extract_epi8(xVal,   j+8);
          uint16_t l = (uint16_t) _mm_extract_epi16(lutVal2, j);
          output[l] += x;
        }
        intCnt   += 16;
        inputCnt += 16;
        if (intCnt >= out_len && inputCnt < in_len - 16) {
          /* Copy last elements */
          if ((out_len%16) == 12) {
            for (int j=(nwrapps+1)*out_len-12;j<(nwrapps+1)*out_len;j++) {
              output[deinter[j%out_len]] += input[j];
              inputCnt++;
            }
          } else {
            for (int j=(nwrapps+1)*out_len-4;j<(nwrapps+1)*out_len;j++) {
              output[deinter[j%out_len]] += input[j];
              inputCnt++;
            }
          }
          /* And wrap pointers */
          nwrapps++;
          intCnt = 16;
          xPtr   = (const __m128i*) &input[nwrapps*out_len];
          lutPtr = (const __m128i*) deinter;
        }
      }
      for (int i=inputCnt;i<in_len;i++) {
        output[deinter[i%out_len]] += input[i];
      }
    }


    return 0;
  } else {
    printf("Invalid inputs rv_idx=%d, cb_idx=%d\n", rv_idx, cb_idx);
    return SRSLTE_ERROR_INVALID_INPUTS;
  }
}
// useful for debugging
inline std::string str8(__m128i x)
{
    std::stringstream ret;

    ret << "[" << std::hex
	<< " " << _mm_extract_epi8(x,0)
	<< " " << _mm_extract_epi8(x,1)
	<< " " << _mm_extract_epi8(x,2)
	<< " " << _mm_extract_epi8(x,3)
	<< " " << _mm_extract_epi8(x,4)
	<< " " << _mm_extract_epi8(x,5)
	<< " " << _mm_extract_epi8(x,6)
	<< " " << _mm_extract_epi8(x,7)
	<< " " << _mm_extract_epi8(x,8)
	<< " " << _mm_extract_epi8(x,9)
	<< " " << _mm_extract_epi8(x,10)
	<< " " << _mm_extract_epi8(x,11)
	<< " " << _mm_extract_epi8(x,12)
	<< " " << _mm_extract_epi8(x,13)
	<< " " << _mm_extract_epi8(x,14)
	<< " " << _mm_extract_epi8(x,15)
	<< " ]";

    return ret.str();
}
Exemple #8
0
int test_mm_extract_epi8(__m128i x) {
  // CHECK-LABEL: test_mm_extract_epi8
  // CHECK: extractelement <16 x i8> %{{.*}}, i32 1
  // CHECK: zext i8 %{{.*}} to i32
  return _mm_extract_epi8(x, 1);
}
static inline void arr_store_si128(
        int *array,
        __m128i vH,
        int32_t t,
        int32_t seglen,
        int32_t d,
        int32_t dlen)
{
    array[( 0*seglen+t)*dlen + d] = (int8_t)_mm_extract_epi8(vH,  0);
    array[( 1*seglen+t)*dlen + d] = (int8_t)_mm_extract_epi8(vH,  1);
    array[( 2*seglen+t)*dlen + d] = (int8_t)_mm_extract_epi8(vH,  2);
    array[( 3*seglen+t)*dlen + d] = (int8_t)_mm_extract_epi8(vH,  3);
    array[( 4*seglen+t)*dlen + d] = (int8_t)_mm_extract_epi8(vH,  4);
    array[( 5*seglen+t)*dlen + d] = (int8_t)_mm_extract_epi8(vH,  5);
    array[( 6*seglen+t)*dlen + d] = (int8_t)_mm_extract_epi8(vH,  6);
    array[( 7*seglen+t)*dlen + d] = (int8_t)_mm_extract_epi8(vH,  7);
    array[( 8*seglen+t)*dlen + d] = (int8_t)_mm_extract_epi8(vH,  8);
    array[( 9*seglen+t)*dlen + d] = (int8_t)_mm_extract_epi8(vH,  9);
    array[(10*seglen+t)*dlen + d] = (int8_t)_mm_extract_epi8(vH, 10);
    array[(11*seglen+t)*dlen + d] = (int8_t)_mm_extract_epi8(vH, 11);
    array[(12*seglen+t)*dlen + d] = (int8_t)_mm_extract_epi8(vH, 12);
    array[(13*seglen+t)*dlen + d] = (int8_t)_mm_extract_epi8(vH, 13);
    array[(14*seglen+t)*dlen + d] = (int8_t)_mm_extract_epi8(vH, 14);
    array[(15*seglen+t)*dlen + d] = (int8_t)_mm_extract_epi8(vH, 15);
}
void GarbledCct::evl_next_gate(const Gate &current_gate)
{
	__m128i current_key, a;
	Bytes::const_iterator it;
	static Bytes tmp;

	if (current_gate.m_tag == Circuit::GEN_INP)
	{
		uint8_t bit = m_gen_inp_mask.get_ith_bit(m_gen_inp_ix);
		Bytes::iterator it = m_i_bufr_ix + bit*Env::key_size_in_bytes();

		tmp = m_M[m_gen_inp_ix].to_bytes().hash(Env::k());
		tmp.resize(16, 0);
		current_key = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));

		tmp.assign(it, it+Env::key_size_in_bytes());
		tmp.resize(16, 0);
		a = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));

		m_i_bufr_ix += Env::key_size_in_bytes()*2;

		current_key = _mm_xor_si128(current_key, a);

		m_gen_inp_ix++;
	}
	else if (current_gate.m_tag == Circuit::EVL_INP)
	{
		uint8_t bit = m_evl_inp.get_ith_bit(m_evl_inp_ix);
		Bytes::iterator it = m_i_bufr_ix + bit*Env::key_size_in_bytes();

		tmp = (*m_ot_keys)[m_evl_inp_ix];
		tmp.resize(16, 0);
		current_key = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));

		tmp.assign(it, it+Env::key_size_in_bytes());
		tmp.resize(16, 0);
		a = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));

		m_i_bufr_ix += Env::key_size_in_bytes()*2;

		current_key = _mm_xor_si128(current_key, a);

		m_evl_inp_ix++;
	}
	else
	{
        const vector<uint64_t> &inputs = current_gate.m_input_idx;

#ifdef FREE_XOR
		if (is_xor(current_gate))
		{
			current_key = inputs.size() == 2?
				_mm_xor_si128(m_w[inputs[0]], m_w[inputs[1]]) : _mm_load_si128(m_w+inputs[0]);
		}
		else
#endif
        if (inputs.size() == 2) // 2-arity gates
		{
        	__m128i aes_key[2], aes_plaintext, aes_ciphertext;

			aes_plaintext = _mm_set1_epi64x(m_gate_ix);

			aes_key[0] = _mm_load_si128(m_w+inputs[0]);
			aes_key[1] = _mm_load_si128(m_w+inputs[1]);

			const uint8_t perm_x = _mm_extract_epi8(aes_key[0], 0) & 0x01;
			const uint8_t perm_y = _mm_extract_epi8(aes_key[1], 0) & 0x01;

			KDF256((uint8_t*)&aes_plaintext, (uint8_t*)&aes_ciphertext, (uint8_t*)aes_key);
			aes_ciphertext = _mm_and_si128(aes_ciphertext, m_clear_mask);
			uint8_t garbled_ix = (perm_y<<1) | (perm_x<<0);

#ifdef GRR
			if (garbled_ix == 0)
			{
				current_key = _mm_load_si128(&aes_ciphertext);
			}
			else
			{
				it = m_i_bufr_ix+(garbled_ix-1)*Env::key_size_in_bytes();
				tmp.assign(it, it+Env::key_size_in_bytes());
				tmp.resize(16, 0);
				a = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));
				current_key = _mm_xor_si128(aes_ciphertext, a);
			}
			m_i_bufr_ix += 3*Env::key_size_in_bytes();
#else
			it = m_i_bufr_ix + garbled_ix*Env::key_size_in_bytes();
			tmp.assign(it, it+Env::key_size_in_bytes());
			tmp.resize(16, 0);
			current_key = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));
			current_key = _mm_xor_si128(current_key, aes_ciphertext);

			m_i_bufr_ix += 4*Env::key_size_in_bytes();
#endif
		}
		else // 1-arity gates
		{
        	__m128i aes_key, aes_plaintext, aes_ciphertext;

			aes_plaintext = _mm_set1_epi64x(m_gate_ix);
			aes_key = _mm_load_si128(m_w+inputs[0]);
			KDF128((uint8_t*)&aes_plaintext, (uint8_t*)&aes_ciphertext, (uint8_t*)&aes_key);
			aes_ciphertext = _mm_and_si128(aes_ciphertext, m_clear_mask);

			const uint8_t perm_x = _mm_extract_epi8(aes_key, 0) & 0x01;

#ifdef GRR
			if (perm_x == 0)
			{
				current_key = _mm_load_si128(&aes_ciphertext);
			}
			else
			{
				tmp.assign(m_i_bufr_ix, m_i_bufr_ix+Env::key_size_in_bytes());
				tmp.resize(16, 0);
				a = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));
				current_key = _mm_xor_si128(aes_ciphertext, a);
			}
			m_i_bufr_ix += Env::key_size_in_bytes();
#else
			it = m_i_bufr_ix + garbled_ix*Env::key_size_in_bytes();
			tmp.assign(it, it+Env::key_size_in_bytes());
			tmp.resize(16, 0);
			current_key = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));
			current_key = _mm_xor_si128(current_key, aes_ciphertext);

			m_i_bufr_ix += 2*Env::key_size_in_bytes();
#endif
		}

		if (current_gate.m_tag == Circuit::EVL_OUT)
		{
			uint8_t out_bit = _mm_extract_epi8(current_key, 0) & 0x01;
			out_bit ^= *m_i_bufr_ix;
			m_evl_out.set_ith_bit(m_evl_out_ix, out_bit);
			m_i_bufr_ix++;

			m_evl_out_ix++;
		}
		else if (current_gate.m_tag == Circuit::GEN_OUT)
		{
			// TODO: Ki08 implementation
			uint8_t out_bit = _mm_extract_epi8(current_key, 0) & 0x01;
			out_bit ^= *m_i_bufr_ix;
			m_gen_out.set_ith_bit(m_gen_out_ix, out_bit);
			m_i_bufr_ix++;

//			m_C[2*m_gen_out_ix+0] = Bytes(m_i_bufr_ix, m_i_bufr_ix+Env::key_size_in_bytes());
//			m_i_bufr_ix += Env::key_size_in_bytes();
//
//			m_C[2*m_gen_out_ix+1] = Bytes(m_i_bufr_ix, m_i_bufr_ix+Env::key_size_in_bytes());
//			m_i_bufr_ix += Env::key_size_in_bytes();

			m_gen_out_ix++;
		}
	}

	_mm_store_si128(m_w+current_gate.m_idx, current_key);

	update_hash(m_i_bufr);
	m_gate_ix++;
}
Exemple #11
0
/*
** batch doubling from [MSK15]
*/
__inline__ static void mul2_PIPE(__m128i *dat) {
	unsigned a, b;
	block tmp = le(dat[0]);
	block carry[PIPE];
	block up4, up8;

	const block sh1 = _mm_set_epi8(255, 255, 255, 255, 255, 255, 15, 14, 255, 255, 255, 255, 255, 255, 7, 6);
	const block sh2 = _mm_set_epi8(255, 255, 255, 255, 255, 255, 13, 12, 255, 255, 255, 255, 255, 255, 5, 4);
	const block sh3 = _mm_set_epi8(255, 255, 255, 255, 255, 255, 11, 10, 255, 255, 255, 255, 255, 255, 3, 2);
	const block sh4 = _mm_set_epi8(255, 255, 255, 255, 255, 255, 9, 8, 255, 255, 255, 255, 255, 255, 1, 0);
	const block *Txp = (const block*)TX;
	const block *Typ = (const block*)TY;

	a = _mm_extract_epi8(tmp, 15);
	b = _mm_extract_epi8(tmp, 7);
	up4 = _mm_unpacklo_epi64(Txp[a], Typ[b]);
	up8 = _mm_unpackhi_epi64(Txp[a], Typ[b]);
	carry[0] = _mm_shuffle_epi8(up4, sh1);
	carry[1] = _mm_shuffle_epi8(up4, sh2);
	carry[2] = _mm_shuffle_epi8(up4, sh3);
	carry[3] = _mm_shuffle_epi8(up4, sh4);
#if(PIPE>=5)
	carry[4] = _mm_shuffle_epi8(up8, sh1);
#endif
#if(PIPE>=6)
	carry[5] = _mm_shuffle_epi8(up8, sh2);
#endif
#if(PIPE>=7)
	carry[6] = _mm_shuffle_epi8(up8, sh3);
#endif
#if(PIPE==8)
	carry[7] = _mm_shuffle_epi8(up8, sh4);
#endif
	dat[1] = _mm_slli_epi64(tmp, 1);
	dat[2] = _mm_slli_epi64(tmp, 2);
	dat[3] = _mm_slli_epi64(tmp, 3);
	dat[4] = _mm_slli_epi64(tmp, 4);
#if(PIPE>=5)
	dat[5] = _mm_slli_epi64(tmp, 5);
#endif
#if(PIPE>=6)
	dat[6] = _mm_slli_epi64(tmp, 6);
#endif
#if(PIPE>=7)
	dat[7] = _mm_slli_epi64(tmp, 7);
#endif
#if (PIPE==8)
	dat[8] = (_mm_slli_epi64(tmp, 8));
#endif
	dat[1] = le(_mm_xor_si128(dat[1], carry[0]));
	dat[2] = le(_mm_xor_si128(dat[2], carry[1]));
	dat[3] = le(_mm_xor_si128(dat[3], carry[2]));
	dat[4] = le(_mm_xor_si128(dat[4], carry[3]));
#if(PIPE>=5)
	dat[5] = le(_mm_xor_si128(dat[5], carry[4]));
#endif
#if(PIPE>=6)
	dat[6] = le(_mm_xor_si128(dat[6], carry[5]));
#endif
#if(PIPE>=7)
	dat[7] = le(_mm_xor_si128(dat[7], carry[6]));
#endif
#if (PIPE==8)
	dat[8] = le(_mm_xor_si128(dat[8], carry[7]));
#endif
}
Exemple #12
0
int test_extract_epi8(__m128i x) {
  // CHECK-LABEL: test_extract_epi8
  // CHECK: extractelement <16 x i8> %{{.*}}, i32 0
  // CHECK-ASM: pextrb
  return _mm_extract_epi8(x, 16);
}
TARGET_SSE4_1 static inline __m128i extract(__m128i xmm0, __m128i xmm1, __m128i xmm2, __m128i xmm3)
{
	qint64 r0, r1;

	r0  = Ax[0][static_cast<quint8>(_mm_extract_epi8(xmm0, row + 0))];
	r0 ^= Ax[1][static_cast<quint8>(_mm_extract_epi8(xmm0, row + 8))];
	r0 ^= Ax[2][static_cast<quint8>(_mm_extract_epi8(xmm1, row + 0))];
	r0 ^= Ax[3][static_cast<quint8>(_mm_extract_epi8(xmm1, row + 8))];
	r0 ^= Ax[4][static_cast<quint8>(_mm_extract_epi8(xmm2, row + 0))];
	r0 ^= Ax[5][static_cast<quint8>(_mm_extract_epi8(xmm2, row + 8))];
	r0 ^= Ax[6][static_cast<quint8>(_mm_extract_epi8(xmm3, row + 0))];
	r0 ^= Ax[7][static_cast<quint8>(_mm_extract_epi8(xmm3, row + 8))];

	r1  = Ax[0][static_cast<quint8>(_mm_extract_epi8(xmm0, row + 1))];
	r1 ^= Ax[1][static_cast<quint8>(_mm_extract_epi8(xmm0, row + 9))];
	r1 ^= Ax[2][static_cast<quint8>(_mm_extract_epi8(xmm1, row + 1))];
	r1 ^= Ax[3][static_cast<quint8>(_mm_extract_epi8(xmm1, row + 9))];
	r1 ^= Ax[4][static_cast<quint8>(_mm_extract_epi8(xmm2, row + 1))];
	r1 ^= Ax[5][static_cast<quint8>(_mm_extract_epi8(xmm2, row + 9))];
	r1 ^= Ax[6][static_cast<quint8>(_mm_extract_epi8(xmm3, row + 1))];
	r1 ^= Ax[7][static_cast<quint8>(_mm_extract_epi8(xmm3, row + 9))];

	return _mm_insert_epi64(_mm_cvtsi64_si128(r0), r1, 1);
}
TARGET_SSE4_1 static inline __m128i extract(__m128i xmm0, __m128i xmm1, __m128i xmm2, __m128i xmm3)
{
	__m64 mm0, mm1;

	mm0 = _mm_cvtsi64_m64(Ax[0][static_cast<quint8>(_mm_extract_epi8(xmm0, row + 0))]);
	mm0 = _mm_xor_64(mm0, Ax[1][static_cast<quint8>(_mm_extract_epi8(xmm0, row + 8))]);
	mm0 = _mm_xor_64(mm0, Ax[2][static_cast<quint8>(_mm_extract_epi8(xmm1, row + 0))]);
	mm0 = _mm_xor_64(mm0, Ax[3][static_cast<quint8>(_mm_extract_epi8(xmm1, row + 8))]);
	mm0 = _mm_xor_64(mm0, Ax[4][static_cast<quint8>(_mm_extract_epi8(xmm2, row + 0))]);
	mm0 = _mm_xor_64(mm0, Ax[5][static_cast<quint8>(_mm_extract_epi8(xmm2, row + 8))]);
	mm0 = _mm_xor_64(mm0, Ax[6][static_cast<quint8>(_mm_extract_epi8(xmm3, row + 0))]);
	mm0 = _mm_xor_64(mm0, Ax[7][static_cast<quint8>(_mm_extract_epi8(xmm3, row + 8))]);

	mm1 = _mm_cvtsi64_m64(Ax[0][static_cast<quint8>(_mm_extract_epi8(xmm0, row + 1))]);
	mm1 = _mm_xor_64(mm1, Ax[1][static_cast<quint8>(_mm_extract_epi8(xmm0, row + 9))]);
	mm1 = _mm_xor_64(mm1, Ax[2][static_cast<quint8>(_mm_extract_epi8(xmm1, row + 1))]);
	mm1 = _mm_xor_64(mm1, Ax[3][static_cast<quint8>(_mm_extract_epi8(xmm1, row + 9))]);
	mm1 = _mm_xor_64(mm1, Ax[4][static_cast<quint8>(_mm_extract_epi8(xmm2, row + 1))]);
	mm1 = _mm_xor_64(mm1, Ax[5][static_cast<quint8>(_mm_extract_epi8(xmm2, row + 9))]);
	mm1 = _mm_xor_64(mm1, Ax[6][static_cast<quint8>(_mm_extract_epi8(xmm3, row + 1))]);
	mm1 = _mm_xor_64(mm1, Ax[7][static_cast<quint8>(_mm_extract_epi8(xmm3, row + 9))]);

	return _mm_set_epi64(mm1, mm0);
}
Exemple #15
0
int test_extract_epi8(__m128i __a) {
  // CHECK-LABEL: @test_extract_epi8
  // CHECK: extractelement <16 x i8> %{{.*}}, i32 0
  return _mm_extract_epi8(__a, 16);
}
static inline void arr_store_col(
        int *col,
        vec128i vH,
        int32_t t,
        int32_t seglen,
        int32_t bias)
{
    col[ 0*seglen+t] = (int8_t)_mm_extract_epi8(vH,  0) - bias;
    col[ 1*seglen+t] = (int8_t)_mm_extract_epi8(vH,  1) - bias;
    col[ 2*seglen+t] = (int8_t)_mm_extract_epi8(vH,  2) - bias;
    col[ 3*seglen+t] = (int8_t)_mm_extract_epi8(vH,  3) - bias;
    col[ 4*seglen+t] = (int8_t)_mm_extract_epi8(vH,  4) - bias;
    col[ 5*seglen+t] = (int8_t)_mm_extract_epi8(vH,  5) - bias;
    col[ 6*seglen+t] = (int8_t)_mm_extract_epi8(vH,  6) - bias;
    col[ 7*seglen+t] = (int8_t)_mm_extract_epi8(vH,  7) - bias;
    col[ 8*seglen+t] = (int8_t)_mm_extract_epi8(vH,  8) - bias;
    col[ 9*seglen+t] = (int8_t)_mm_extract_epi8(vH,  9) - bias;
    col[10*seglen+t] = (int8_t)_mm_extract_epi8(vH, 10) - bias;
    col[11*seglen+t] = (int8_t)_mm_extract_epi8(vH, 11) - bias;
    col[12*seglen+t] = (int8_t)_mm_extract_epi8(vH, 12) - bias;
    col[13*seglen+t] = (int8_t)_mm_extract_epi8(vH, 13) - bias;
    col[14*seglen+t] = (int8_t)_mm_extract_epi8(vH, 14) - bias;
    col[15*seglen+t] = (int8_t)_mm_extract_epi8(vH, 15) - bias;
}
void GarbledCct::gen_next_gate(const Gate &current_gate)
{
	__m128i current_zero_key;

	if (current_gate.m_tag == Circuit::GEN_INP)
	{
		__m128i a[2];

		// zero_key = m_prng.rand(Env::k());
		static Bytes tmp;

		tmp = m_prng.rand(Env::k());
		tmp.resize(16, 0);
		current_zero_key = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));

		// a[0] = m_M[2*m_gen_inp_ix+0].to_bytes().hash(Env::k());
		tmp = m_M[2*m_gen_inp_ix+0].to_bytes().hash(Env::k());
		tmp.resize(16, 0);
		a[0] = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));

		// a[1] = m_M[2*m_gen_inp_ix+1].to_bytes().hash(Env::k());
		tmp = m_M[2*m_gen_inp_ix+1].to_bytes().hash(Env::k());
		tmp.resize(16, 0);
		a[1] = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));

		// a[0] ^= zero_key; a[1] ^= zero_key ^ R;
		a[0] = _mm_xor_si128(a[0], current_zero_key);
		a[1] = _mm_xor_si128(a[1], _mm_xor_si128(current_zero_key, m_R));

		uint8_t bit = m_gen_inp_mask.get_ith_bit(m_gen_inp_ix);

		// m_o_bufr += a[bit];
		_mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), a[bit]);
		m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes());

		// m_o_bufr += a[1-bit];
		_mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), a[1-bit]);
		m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes());

		m_gen_inp_ix++;
	}
	else if (current_gate.m_tag == Circuit::EVL_INP)
	{
		__m128i a[2];

		// zero_key = m_prng.rand(Env::k());
		static Bytes tmp;

		tmp = m_prng.rand(Env::k());
		tmp.resize(16, 0);
		current_zero_key = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));

		// a[0] = (*m_ot_keys)[2*m_evl_inp_ix+0];
		tmp = (*m_ot_keys)[2*m_evl_inp_ix+0];
		tmp.resize(16, 0);
		a[0] = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));

		// a[1] = (*m_ot_keys)[2*m_evl_inp_ix+1];
		tmp = (*m_ot_keys)[2*m_evl_inp_ix+1];
		tmp.resize(16, 0);
		a[1] = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));

		// a[0] ^= zero_key; a[1] ^= zero_key ^ R;
		a[0] = _mm_xor_si128(a[0], current_zero_key);
		a[1] = _mm_xor_si128(a[1], _mm_xor_si128(current_zero_key, m_R));

		// m_o_bufr += a[0];
		_mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), a[0]);
		m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes());

		// m_o_bufr += a[1];
		_mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), a[1]);
		m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes());

		m_evl_inp_ix++;
	}
	else
	{
		const vector<uint64_t> &inputs = current_gate.m_input_idx;
		assert(inputs.size() == 1 || inputs.size() == 2);

#ifdef FREE_XOR
		if (is_xor(current_gate))
		{
			current_zero_key = inputs.size() == 2?
				_mm_xor_si128(m_w[inputs[0]], m_w[inputs[1]]) : _mm_load_si128(m_w+inputs[0]);
		}
		else
#endif
		if (inputs.size() == 2) // 2-arity gates
		{
			uint8_t bit;
			__m128i aes_key[2], aes_plaintext, aes_ciphertext;
			__m128i X[2], Y[2], Z[2];
			static Bytes tmp(16, 0);

			aes_plaintext = _mm_set1_epi64x(m_gate_ix);

			X[0] = _mm_load_si128(m_w+inputs[0]);
			Y[0] = _mm_load_si128(m_w+inputs[1]);

			X[1] = _mm_xor_si128(X[0], m_R); // X[1] = X[0] ^ R
			Y[1] = _mm_xor_si128(Y[0], m_R); // Y[1] = Y[0] ^ R

			const uint8_t perm_x = _mm_extract_epi8(X[0], 0) & 0x01; // permutation bit for X
			const uint8_t perm_y = _mm_extract_epi8(Y[0], 0) & 0x01; // permutation bit for Y
			const uint8_t de_garbled_ix = (perm_y<<1)|perm_x;

			// encrypt the 0-th entry : (X[x], Y[y])
			aes_key[0] = _mm_load_si128(X+perm_x);
			aes_key[1] = _mm_load_si128(Y+perm_y);

			KDF256((uint8_t*)&aes_plaintext, (uint8_t*)&aes_ciphertext, (uint8_t*)aes_key);
			aes_ciphertext = _mm_and_si128(aes_ciphertext, m_clear_mask); // clear extra bits so that only k bits left
			bit = current_gate.m_table[de_garbled_ix];

#ifdef GRR
			// GRR technique: using zero entry's key as one of the output keys
			_mm_store_si128(Z+bit, aes_ciphertext);
			Z[1-bit] = _mm_xor_si128(Z[bit], m_R);
			current_zero_key = _mm_load_si128(Z);
#else
			tmp = m_prng.rand(Env::k());
			tmp.resize(16, 0);
			Z[0] = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));
			Z[1] = _mm_xor_si128(Z[0], m_R);

			aes_ciphertext = _mm_xor_si128(aes_ciphertext, Z[bit]);
			_mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), aes_ciphertext);
			m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes());
#endif

			// encrypt the 1st entry : (X[1-x], Y[y])
			aes_key[0] = _mm_xor_si128(aes_key[0], m_R);

			KDF256((uint8_t*)&aes_plaintext, (uint8_t*)&aes_ciphertext, (uint8_t*)aes_key);
			aes_ciphertext = _mm_and_si128(aes_ciphertext, m_clear_mask);
			bit = current_gate.m_table[0x01^de_garbled_ix];
			aes_ciphertext = _mm_xor_si128(aes_ciphertext, Z[bit]);
			_mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), aes_ciphertext);
			m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes());

			// encrypt the 2nd entry : (X[x], Y[1-y])
			aes_key[0] = _mm_xor_si128(aes_key[0], m_R);
			aes_key[1] = _mm_xor_si128(aes_key[1], m_R);

			KDF256((uint8_t*)&aes_plaintext, (uint8_t*)&aes_ciphertext, (uint8_t*)aes_key);
			aes_ciphertext = _mm_and_si128(aes_ciphertext, m_clear_mask);
			bit = current_gate.m_table[0x02^de_garbled_ix];
			aes_ciphertext = _mm_xor_si128(aes_ciphertext, Z[bit]);
			_mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), aes_ciphertext);
			m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes());

			// encrypt the 3rd entry : (X[1-x], Y[1-y])
			aes_key[0] = _mm_xor_si128(aes_key[0], m_R);

			KDF256((uint8_t*)&aes_plaintext, (uint8_t*)&aes_ciphertext, (uint8_t*)aes_key);
			aes_ciphertext = _mm_and_si128(aes_ciphertext, m_clear_mask);
			bit = current_gate.m_table[0x03^de_garbled_ix];
			aes_ciphertext = _mm_xor_si128(aes_ciphertext, Z[bit]);
			_mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), aes_ciphertext);
			m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes());
		}
		else // 1-arity gates
		{
			uint8_t bit;
			__m128i aes_key, aes_plaintext, aes_ciphertext;
			__m128i X[2], Z[2];
			static Bytes tmp;

			tmp.assign(16, 0);

			aes_plaintext = _mm_set1_epi64x(m_gate_ix);

			X[0] = _mm_load_si128(m_w+inputs[0]);
			X[1] = _mm_xor_si128(X[0], m_R);

			const uint8_t perm_x = _mm_extract_epi8(X[0], 0) & 0x01;

			// 0-th entry : X[x]
			aes_key = _mm_load_si128(X+perm_x);
			KDF128((uint8_t*)&aes_plaintext, (uint8_t*)&aes_ciphertext, (uint8_t*)&aes_key);
			aes_ciphertext = _mm_and_si128(aes_ciphertext, m_clear_mask);
			bit = current_gate.m_table[perm_x];

#ifdef GRR
			_mm_store_si128(Z+bit, aes_ciphertext);
			Z[1-bit] = _mm_xor_si128(Z[bit], m_R);
			current_zero_key = _mm_load_si128(Z);
#else
			tmp = m_prng.rand(Env::k());
			tmp.resize(16, 0);
			Z[0] = _mm_loadu_si128(reinterpret_cast<__m128i*>(&tmp[0]));
			Z[1] = _mm_xor_si128(Z[0], m_R);

			aes_ciphertext = _mm_xor_si128(aes_ciphertext, Z[bit]);
			_mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), aes_ciphertext);
			m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes());
#endif

			// 1-st entry : X[1-x]
			aes_key = _mm_xor_si128(aes_key, m_R);

			KDF128((uint8_t*)&aes_plaintext, (uint8_t*)&aes_ciphertext, (uint8_t*)&aes_key);
			aes_ciphertext = _mm_and_si128(aes_ciphertext, m_clear_mask);
			bit = current_gate.m_table[0x01^perm_x];
			aes_ciphertext = _mm_xor_si128(aes_ciphertext, Z[bit]);
			_mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[0]), aes_ciphertext);
			m_o_bufr.insert(m_o_bufr.end(), tmp.begin(), tmp.begin()+Env::key_size_in_bytes());
		}

		if (current_gate.m_tag == Circuit::EVL_OUT)
		{
			m_o_bufr.push_back(_mm_extract_epi8(current_zero_key, 0) & 0x01); // permutation bit
		}
		else if (current_gate.m_tag == Circuit::GEN_OUT)
		{
			m_o_bufr.push_back(_mm_extract_epi8(current_zero_key, 0) & 0x01); // permutation bit

//			// TODO: C[ix_0] = w[ix0] || randomness, C[ix_1] = w[ix1] || randomness
//			m_o_bufr += (key_pair[0] + m_prng.rand(Env::k())).hash(Env::k());
//			m_o_bufr += (key_pair[1] + m_prng.rand(Env::k())).hash(Env::k());
		}
	}

	_mm_store_si128(m_w+current_gate.m_idx, current_zero_key);

	m_gate_ix++;
}
Exemple #18
0
static void
TEST (void)
{
  union
    {
      __m128i x;
      int i[4];
      char c[16];
    } val1;
  int res[16], masks[16];
  int i;

  val1.i[0] = 0x04030201;
  val1.i[1] = 0x08070605;
  val1.i[2] = 0x0C0B0A09;
  val1.i[3] = 0x100F0E0D;

  res[0] = _mm_extract_epi8 (val1.x, msk0);
  res[1] = _mm_extract_epi8 (val1.x, msk1);
  res[2] = _mm_extract_epi8 (val1.x, msk2);
  res[3] = _mm_extract_epi8 (val1.x, msk3);
  res[4] = _mm_extract_epi8 (val1.x, msk4);
  res[5] = _mm_extract_epi8 (val1.x, msk5);
  res[6] = _mm_extract_epi8 (val1.x, msk6);
  res[7] = _mm_extract_epi8 (val1.x, msk7);
  res[8] = _mm_extract_epi8 (val1.x, msk8);
  res[9] = _mm_extract_epi8 (val1.x, msk9);
  res[10] = _mm_extract_epi8 (val1.x, msk10);
  res[11] = _mm_extract_epi8 (val1.x, msk11);
  res[12] = _mm_extract_epi8 (val1.x, msk12);
  res[13] = _mm_extract_epi8 (val1.x, msk13);
  res[14] = _mm_extract_epi8 (val1.x, msk14);
  res[15] = _mm_extract_epi8 (val1.x, msk15);

  masks[0] = msk0;
  masks[1] = msk1;
  masks[2] = msk2;
  masks[3] = msk3;
  masks[4] = msk4;
  masks[5] = msk5;
  masks[6] = msk6;
  masks[7] = msk7;
  masks[8] = msk8;
  masks[9] = msk9;
  masks[10] = msk10;
  masks[11] = msk11;
  masks[12] = msk12;
  masks[13] = msk13;
  masks[14] = msk14;
  masks[15] = msk15;

  for (i = 0; i < 16; i++)
    if (res[i] != val1.c [masks[i]])
      abort ();
}
Exemple #19
0
long long int foo8(__m128i x) { return (char) _mm_extract_epi8(x, 4); }
Exemple #20
0
unsigned long long int foo8(__m128i x) { return _mm_extract_epi8(x, 4); }
static inline void arr_store_col(
        int *col,
        __m128i vH,
        int32_t t,
        int32_t seglen)
{
    col[ 0*seglen+t] = (int8_t)_mm_extract_epi8(vH,  0);
    col[ 1*seglen+t] = (int8_t)_mm_extract_epi8(vH,  1);
    col[ 2*seglen+t] = (int8_t)_mm_extract_epi8(vH,  2);
    col[ 3*seglen+t] = (int8_t)_mm_extract_epi8(vH,  3);
    col[ 4*seglen+t] = (int8_t)_mm_extract_epi8(vH,  4);
    col[ 5*seglen+t] = (int8_t)_mm_extract_epi8(vH,  5);
    col[ 6*seglen+t] = (int8_t)_mm_extract_epi8(vH,  6);
    col[ 7*seglen+t] = (int8_t)_mm_extract_epi8(vH,  7);
    col[ 8*seglen+t] = (int8_t)_mm_extract_epi8(vH,  8);
    col[ 9*seglen+t] = (int8_t)_mm_extract_epi8(vH,  9);
    col[10*seglen+t] = (int8_t)_mm_extract_epi8(vH, 10);
    col[11*seglen+t] = (int8_t)_mm_extract_epi8(vH, 11);
    col[12*seglen+t] = (int8_t)_mm_extract_epi8(vH, 12);
    col[13*seglen+t] = (int8_t)_mm_extract_epi8(vH, 13);
    col[14*seglen+t] = (int8_t)_mm_extract_epi8(vH, 14);
    col[15*seglen+t] = (int8_t)_mm_extract_epi8(vH, 15);
}
static inline void arr_store_rowcol(
        int *row,
        int *col,
        vec128i vWH,
        int32_t i,
        int32_t s1Len,
        int32_t j,
        int32_t s2Len)
{
    if (i+0 == s1Len-1 && 0 <= j-0 && j-0 < s2Len) {
        row[j-0] = (int8_t)_mm_extract_epi8(vWH, 15);
    }
    if (j-0 == s2Len-1 && 0 <= i+0 && i+0 < s1Len) {
        col[(i+0)] = (int8_t)_mm_extract_epi8(vWH, 15);
    }
    if (i+1 == s1Len-1 && 0 <= j-1 && j-1 < s2Len) {
        row[j-1] = (int8_t)_mm_extract_epi8(vWH, 14);
    }
    if (j-1 == s2Len-1 && 0 <= i+1 && i+1 < s1Len) {
        col[(i+1)] = (int8_t)_mm_extract_epi8(vWH, 14);
    }
    if (i+2 == s1Len-1 && 0 <= j-2 && j-2 < s2Len) {
        row[j-2] = (int8_t)_mm_extract_epi8(vWH, 13);
    }
    if (j-2 == s2Len-1 && 0 <= i+2 && i+2 < s1Len) {
        col[(i+2)] = (int8_t)_mm_extract_epi8(vWH, 13);
    }
    if (i+3 == s1Len-1 && 0 <= j-3 && j-3 < s2Len) {
        row[j-3] = (int8_t)_mm_extract_epi8(vWH, 12);
    }
    if (j-3 == s2Len-1 && 0 <= i+3 && i+3 < s1Len) {
        col[(i+3)] = (int8_t)_mm_extract_epi8(vWH, 12);
    }
    if (i+4 == s1Len-1 && 0 <= j-4 && j-4 < s2Len) {
        row[j-4] = (int8_t)_mm_extract_epi8(vWH, 11);
    }
    if (j-4 == s2Len-1 && 0 <= i+4 && i+4 < s1Len) {
        col[(i+4)] = (int8_t)_mm_extract_epi8(vWH, 11);
    }
    if (i+5 == s1Len-1 && 0 <= j-5 && j-5 < s2Len) {
        row[j-5] = (int8_t)_mm_extract_epi8(vWH, 10);
    }
    if (j-5 == s2Len-1 && 0 <= i+5 && i+5 < s1Len) {
        col[(i+5)] = (int8_t)_mm_extract_epi8(vWH, 10);
    }
    if (i+6 == s1Len-1 && 0 <= j-6 && j-6 < s2Len) {
        row[j-6] = (int8_t)_mm_extract_epi8(vWH, 9);
    }
    if (j-6 == s2Len-1 && 0 <= i+6 && i+6 < s1Len) {
        col[(i+6)] = (int8_t)_mm_extract_epi8(vWH, 9);
    }
    if (i+7 == s1Len-1 && 0 <= j-7 && j-7 < s2Len) {
        row[j-7] = (int8_t)_mm_extract_epi8(vWH, 8);
    }
    if (j-7 == s2Len-1 && 0 <= i+7 && i+7 < s1Len) {
        col[(i+7)] = (int8_t)_mm_extract_epi8(vWH, 8);
    }
    if (i+8 == s1Len-1 && 0 <= j-8 && j-8 < s2Len) {
        row[j-8] = (int8_t)_mm_extract_epi8(vWH, 7);
    }
    if (j-8 == s2Len-1 && 0 <= i+8 && i+8 < s1Len) {
        col[(i+8)] = (int8_t)_mm_extract_epi8(vWH, 7);
    }
    if (i+9 == s1Len-1 && 0 <= j-9 && j-9 < s2Len) {
        row[j-9] = (int8_t)_mm_extract_epi8(vWH, 6);
    }
    if (j-9 == s2Len-1 && 0 <= i+9 && i+9 < s1Len) {
        col[(i+9)] = (int8_t)_mm_extract_epi8(vWH, 6);
    }
    if (i+10 == s1Len-1 && 0 <= j-10 && j-10 < s2Len) {
        row[j-10] = (int8_t)_mm_extract_epi8(vWH, 5);
    }
    if (j-10 == s2Len-1 && 0 <= i+10 && i+10 < s1Len) {
        col[(i+10)] = (int8_t)_mm_extract_epi8(vWH, 5);
    }
    if (i+11 == s1Len-1 && 0 <= j-11 && j-11 < s2Len) {
        row[j-11] = (int8_t)_mm_extract_epi8(vWH, 4);
    }
    if (j-11 == s2Len-1 && 0 <= i+11 && i+11 < s1Len) {
        col[(i+11)] = (int8_t)_mm_extract_epi8(vWH, 4);
    }
    if (i+12 == s1Len-1 && 0 <= j-12 && j-12 < s2Len) {
        row[j-12] = (int8_t)_mm_extract_epi8(vWH, 3);
    }
    if (j-12 == s2Len-1 && 0 <= i+12 && i+12 < s1Len) {
        col[(i+12)] = (int8_t)_mm_extract_epi8(vWH, 3);
    }
    if (i+13 == s1Len-1 && 0 <= j-13 && j-13 < s2Len) {
        row[j-13] = (int8_t)_mm_extract_epi8(vWH, 2);
    }
    if (j-13 == s2Len-1 && 0 <= i+13 && i+13 < s1Len) {
        col[(i+13)] = (int8_t)_mm_extract_epi8(vWH, 2);
    }
    if (i+14 == s1Len-1 && 0 <= j-14 && j-14 < s2Len) {
        row[j-14] = (int8_t)_mm_extract_epi8(vWH, 1);
    }
    if (j-14 == s2Len-1 && 0 <= i+14 && i+14 < s1Len) {
        col[(i+14)] = (int8_t)_mm_extract_epi8(vWH, 1);
    }
    if (i+15 == s1Len-1 && 0 <= j-15 && j-15 < s2Len) {
        row[j-15] = (int8_t)_mm_extract_epi8(vWH, 0);
    }
    if (j-15 == s2Len-1 && 0 <= i+15 && i+15 < s1Len) {
        col[(i+15)] = (int8_t)_mm_extract_epi8(vWH, 0);
    }
}