コード例 #1
0
// High-bitdepth D117 intra predictor for an 8x8 block (SSSE3).
//
// dst/stride: destination block; every row is written with an aligned
//             128-bit store, so each row start must be 16-byte aligned.
// above:      row above the block; above[-1] (the top-left sample) is read.
// left:       column left of the block (8 samples, aligned load).
// bd:         bit depth, unused here.
//
// Rows are emitted in pairs: the even row starts from the 2-tap average of
// the above row, the odd row from the 3-tap average. After each pair both
// rows are shifted by one sample and refilled from the filtered left column.
// NOTE(review): rotr_epu16() is handed the address of the left-column vector,
// so it presumably rotates it in place between the two calls - confirm.
void vpx_highbd_d117_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
  const __m128i rot_mask = _mm_load_si128((const __m128i *)rotate_right_epu16);
  const __m128i above_m1 = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i above_0 = _mm_load_si128((const __m128i *)above);
  const __m128i left_0 = _mm_load_si128((const __m128i *)left);
  // left[0], above[-1], above[0..5]
  const __m128i above_m2 =
      _mm_alignr_epi8(above_m1, _mm_slli_si128(left_0, 14), 14);
  const __m128i above_avg3 = avg3_epu16(&above_0, &above_m1, &above_m2);
  const __m128i above_avg2 = _mm_avg_epu16(above_0, above_m1);
  // above[-1], left[0..6]
  const __m128i left_m1 =
      _mm_alignr_epi8(left_0, _mm_slli_si128(above_m1, 14), 14);
  // left[1..7], 0
  const __m128i left_p1 = _mm_srli_si128(left_0, 2);
  __m128i left_avg3 = avg3_epu16(&left_m1, &left_0, &left_p1);
  __m128i even_row = above_avg2;
  __m128i odd_row = above_avg3;
  int pair;
  (void)bd;
  for (pair = 0; pair < 4; ++pair) {
    uint16_t *const odd_dst = dst + stride;
    _mm_store_si128((__m128i *)dst, even_row);
    _mm_store_si128((__m128i *)odd_dst, odd_row);
    dst = odd_dst + stride;
    // Shift each row up by one sample; the new low sample comes from the
    // top of the rotated left-column vector.
    even_row = _mm_alignr_epi8(even_row, rotr_epu16(&left_avg3, &rot_mask), 14);
    odd_row = _mm_alignr_epi8(odd_row, rotr_epu16(&left_avg3, &rot_mask), 14);
  }
}
コード例 #2
0
// Packs three planar channels into 32-bit pixels, writing the image bottom-up.
//
// dstp[0]    - destination; the first source row lands in the last
//              destination row and the write pointer then moves up by dpitch.
// srcp[0..2] - R, G and B source planes; each row is expanded to float by
//              convert_half_to_float() before packing.
// dpitch     - destination row pitch in bytes.
// spitch     - source row pitch in bytes.
// width      - pixels per row; the packing loop advances four pixels at a
//              time, so rows are processed in whole groups of four.
// height     - number of rows.
// _buff      - scratch for three float planes of ((width + 7) & ~7) floats
//              each; must be 32-byte aligned.
//
// Each output pixel is B | (G << 8) | (R << 16); the top byte stays zero.
// NOTE(review): the byte shifts assume every scaled value rounds into
// 0..255. Nothing here clamps, so out-of-range input would spill into the
// neighbouring channel bytes - confirm the caller guarantees the range.
void __stdcall
planar_shader_to_rgb32_3_f16c(uint8_t** dstp, const uint8_t** srcp, const int dpitch,
    const int spitch, const int width, const int height, void* _buff) noexcept
{
    const uint8_t* sr = srcp[0];
    const uint8_t* sg = srcp[1];
    const uint8_t* sb = srcp[2];
    uint8_t* d = dstp[0] + (height - 1) * dpitch;

    // Floats per scratch plane: a multiple of 8 floats (32 bytes), so every
    // plane keeps the 32-byte alignment of _buff.
    const int plane_len = (width + 7) & ~7;
    float* bb = reinterpret_cast<float*>(_buff);
    float* bg = bb + plane_len;
    float* br = bg + plane_len;

    const __m128 coef = _mm_set1_ps(255.0f);

    for (int y = 0; y < height; ++y) {
        convert_half_to_float(br, sr, width);
        convert_half_to_float(bg, sg, width);
        convert_half_to_float(bb, sb, width);
        for (int x = 0; x < width; x += 4) {
            __m128i b = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(bb + x)));
            __m128i g = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(bg + x)));
            __m128i r = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(br + x)));
            // Whole-register byte shifts move G to byte 1 and R to byte 2
            // of every 32-bit lane.
            __m128i bgra = _mm_or_si128(b, _mm_slli_si128(g, 1));
            bgra = _mm_or_si128(bgra, _mm_slli_si128(r, 2));
            // Non-temporal store: d + x * 4 must be 16-byte aligned.
            _mm_stream_si128(reinterpret_cast<__m128i*>(d + x * 4), bgra);
        }
        sr += spitch;
        sg += spitch;
        sb += spitch;
        d -= dpitch;
    }
}
コード例 #3
0
ファイル: aes_ssse3.cpp プロジェクト: fxdupont/botan
// One round of an AES key schedule computed with byte shuffles (pshufb)
// rather than memory table lookups.
//
// rcon   - optional round-constant state. When non-null, a round constant is
//          folded into input2, *rcon is advanced for the next call, and
//          input1 is reduced to its rotated top word first.
// input1 - value that is pushed through the nibble-wise table lookups.
// input2 - value that is "smeared" (XORed with shifted copies of itself) and
//          XORed into the result.
//
// low_nibs, k_inv1, k_inv2, sb1u, sb1t and mm_xor3 are defined outside this
// function. NOTE(review): presumably the nibble mask, inversion tables and
// S-box output tables of a vector-permute AES - confirm at their definitions.
__m128i aes_schedule_round(__m128i* rcon, __m128i input1, __m128i input2)
   {
   if(rcon)
      {
      // Byte 15 of *rcon is moved to byte 0 (all other bytes zero) and XORed
      // into input2.
      input2 = _mm_xor_si128(_mm_alignr_epi8(_mm_setzero_si128(), *rcon, 15),
                             input2);

      *rcon = _mm_alignr_epi8(*rcon, *rcon, 15); // next rcon

      // Broadcast the top 32-bit word, then rotate the bytes by one position.
      input1 = _mm_shuffle_epi32(input1, 0xFF); // rotate
      input1 = _mm_alignr_epi8(input1, input1, 1);
      }

   // smeared = input2 XORed with copies of itself shifted by 4, 8 and 12
   // bytes, plus the constant 0x5B in every byte.
   __m128i smeared = _mm_xor_si128(input2, _mm_slli_si128(input2, 4));
   smeared = mm_xor3(smeared, _mm_slli_si128(smeared, 8), _mm_set1_epi8(0x5B));

   // Split every byte of input1 into its high nibble (t) and low nibble.
   __m128i t = _mm_srli_epi32(_mm_andnot_si128(low_nibs, input1), 4);

   input1 = _mm_and_si128(low_nibs, input1);

   __m128i t2 = _mm_shuffle_epi8(k_inv2, input1);

   // From here on input1 holds (low nibble XOR high nibble).
   input1 = _mm_xor_si128(input1, t);

   __m128i t3 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, t));
   __m128i t4 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, input1));

   __m128i t5 = _mm_xor_si128(input1, _mm_shuffle_epi8(k_inv1, t3));
   __m128i t6 = _mm_xor_si128(t, _mm_shuffle_epi8(k_inv1, t4));

   // Combine the two output-table lookups with the smeared input2.
   return mm_xor3(_mm_shuffle_epi8(sb1u, t5),
                  _mm_shuffle_epi8(sb1t, t6),
                  smeared);
   }
コード例 #4
0
ファイル: AES.cpp プロジェクト: netromdk/faes
    // AES-192 key-schedule assist step.
    //
    // tmp  - in/out: replaced by its running XOR over 32-bit words
    //        (w0, w0^w1, w0^w1^w2, w0^w1^w2^w3), then XORed with the
    //        broadcast word taken from *tmp2.
    // tmp2 - in/out: on entry the key-generation assist value; on exit the
    //        top word of the updated *tmp, broadcast to all four words.
    // tmp3 - in/out: XORed with itself shifted by one word and with the
    //        broadcast written to *tmp2.
    inline void Cryptor::assistKey192(__m128i *tmp, __m128i *tmp2,
                                      __m128i *tmp3) {
      // [1, 2, 3, 4] -> [2, 2, 2, 2]: replicate the second 32-bit word.
      *tmp2 = _mm_shuffle_epi32(*tmp2, SHUFFLE4_32(1, 1, 1, 1));

      // Fold in the original value shifted by one, two and three words.
      __m128i slide = *tmp;
      for (int pass = 0; pass < 3; ++pass) {
        slide = _mm_slli_si128(slide, 0x4);
        *tmp = _mm_xor_si128(*tmp, slide);
      }
      *tmp = _mm_xor_si128(*tmp, *tmp2);

      // Replicate the fourth 32-bit word of the updated *tmp.
      *tmp2 = _mm_shuffle_epi32(*tmp, SHUFFLE4_32(3, 3, 3, 3));

      slide = _mm_slli_si128(*tmp3, 0x4);
      *tmp3 = _mm_xor_si128(*tmp3, slide);
      *tmp3 = _mm_xor_si128(*tmp3, *tmp2);
    }
コード例 #5
0
ファイル: filter_sse.c プロジェクト: alexzeitgeist/YT2
/* Reverses the PNG "Sub" filter in place for rows of 4-byte pixels, handling
 * 16 bytes (four pixels) per iteration.
 *
 * row_info - only rowbytes is read; the block count is rowbytes rounded up
 *            to a multiple of 16.
 * row      - filtered row, overwritten with the reconstructed bytes. Aligned
 *            128-bit loads and stores are used, so the buffer must be 16-byte
 *            aligned and readable/writable up to the rounded-up length.
 * prev_row - unused by this filter.
 *
 * Within a block the running sum is built from byte-wise adds of the block
 * with copies shifted by one, two and three pixels; the last reconstructed
 * pixel of a block seeds the next one.
 */
void
png_read_filter_row_sub4_sse(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   __m128i* cursor = (__m128i*)(row);
   __m128i carry = _mm_setzero_si128();
   png_size_t remaining = (row_info->rowbytes + 15) >> 4;
   int step;

   PNG_UNUSED(prev_row)

   while (remaining > 0)
   {
      const __m128i raw = _mm_load_si128(cursor);
      __m128i sum;

      /* carry = [last pixel of previous block, first three raw pixels] */
#ifndef __SSSE3__
      carry = _mm_or_si128(_mm_srli_si128(carry, 12), _mm_slli_si128(raw, 4));
#else
      carry =  _mm_alignr_epi8(raw, carry, 12);
#endif

      sum = _mm_add_epi8(raw, carry);
      for (step = 0; step < 3; step++)
      {
         carry = _mm_slli_si128(carry, 4);
         sum = _mm_add_epi8(sum, carry);
      }

      _mm_store_si128(cursor, sum);
      cursor++;
      carry = sum;
      remaining--;
   }
}
コード例 #6
0
ファイル: botan_all_aesni.cpp プロジェクト: adolby/Kryvos
// One step of the AES-192 key expansion.
//
// K1, K2         - in/out key-schedule state; *K1 is always updated, *K2
//                  only when last is false.
// key2_with_rcon - key-generation assist value; its 32-bit word 1 is
//                  broadcast and XORed into the new K1.
// out            - receives the new K1 in out[0..3] and, unless last is set,
//                  the two low words of the new K2 in out[4] and out[5].
// last           - when true, stop after producing K1.
void aes_192_key_expansion(__m128i* K1, __m128i* K2, __m128i key2_with_rcon,
                           uint32_t out[], bool last)
   {
   __m128i k1 = *K1;
   __m128i k2 = *K2;
   const __m128i assist = _mm_shuffle_epi32(key2_with_rcon, _MM_SHUFFLE(1,1,1,1));

   // Running XOR across the four words of K1.
   for(int pass = 0; pass != 3; ++pass)
      k1 = _mm_xor_si128(k1, _mm_slli_si128(k1, 4));
   k1 = _mm_xor_si128(k1, assist);

   *K1 = k1;
   _mm_storeu_si128(reinterpret_cast<__m128i*>(out), k1);

   if(!last)
      {
      const __m128i k1_top = _mm_shuffle_epi32(k1, _MM_SHUFFLE(3,3,3,3));

      k2 = _mm_xor_si128(k2, _mm_slli_si128(k2, 4));
      k2 = _mm_xor_si128(k2, k1_top);

      *K2 = k2;
      out[4] = _mm_cvtsi128_si32(k2);
      out[5] = _mm_cvtsi128_si32(_mm_srli_si128(k2, 4));
      }
   }
コード例 #7
0
// Left-neighbour prediction over one line (the case preds == dst - 1 or
// preds == src - 1).
//
// inverse == 0: dst[i] = src[i] - src[i - 1]; src[-1] must be readable.
// inverse != 0: dst[i] = src[i] + dst[i - 1]; dst[-1] must be readable.
// Nothing is done when length <= 0. All arithmetic wraps modulo 256.
static void PredictLineLeft(const uint8_t* src, uint8_t* dst, int length,
                            int inverse) {
  int pos = 0;
  if (length <= 0) return;

  if (!inverse) {
    // Forward filter, 32 bytes per iteration.
    const int vec_end = length & ~31;
    while (pos < vec_end) {
      const uint8_t* const cur = src + pos;
      // All four loads are issued before either store.
      const __m128i cur_lo  = _mm_loadu_si128((const __m128i*)(cur));
      const __m128i prev_lo = _mm_loadu_si128((const __m128i*)(cur - 1));
      const __m128i cur_hi  = _mm_loadu_si128((const __m128i*)(cur + 16));
      const __m128i prev_hi = _mm_loadu_si128((const __m128i*)(cur + 16 - 1));
      _mm_storeu_si128((__m128i*)(dst + pos), _mm_sub_epi8(cur_lo, prev_lo));
      _mm_storeu_si128((__m128i*)(dst + pos + 16),
                       _mm_sub_epi8(cur_hi, prev_hi));
      pos += 32;
    }
    while (pos < length) {
      dst[pos] = src[pos] - src[pos - 1];
      ++pos;
    }
    return;
  }

  // Inverse filter: a byte-wise prefix sum, 8 bytes per iteration, seeded
  // with the previously reconstructed byte.
  {
    const int vec_end = length & ~7;
    __m128i carry = _mm_set_epi32(0, 0, 0, dst[-1]);
    for (; pos < vec_end; pos += 8) {
      __m128i sum = _mm_loadl_epi64((const __m128i*)(src + pos));
      sum = _mm_add_epi8(sum, carry);
      // Log-step prefix sum: add copies shifted by 1, 2 and 4 bytes.
      sum = _mm_add_epi8(sum, _mm_slli_si128(sum, 1));
      sum = _mm_add_epi8(sum, _mm_slli_si128(sum, 2));
      sum = _mm_add_epi8(sum, _mm_slli_si128(sum, 4));
      _mm_storel_epi64((__m128i*)(dst + pos), sum);
      // Byte 7 of the result seeds the next group.
      carry = _mm_srli_epi64(sum, 56);
    }
    for (; pos < length; ++pos) dst[pos] = src[pos] + dst[pos - 1];
  }
}
コード例 #8
0
ファイル: mul9k3.c プロジェクト: nesciens/gf2x
/* same as mul5, but assumes {d,2} contains a[4]*b[4] */
/* Multiplies the 5-word operands a[0..4] and b[0..4] and writes the 10-word
 * result to c[0..9]. Twelve one-word products are computed with mul1; the
 * thirteenth (a[4]*b[4]) is loaded from d[0..1] instead of being recomputed.
 */
GF2X_STORAGE_CLASS_mul5
void GF2X_FUNC(mul9k3_mul5b) (unsigned long *c, const unsigned long *a,
                        const unsigned long *b, const unsigned long *d)
{
  /* Montgomery formulae with 13 multiplications */
  unsigned long ta[3], tb[3], pa[8], pb[8];
  __v2di p0, p2, p4, p6, p8, p10, p12, p14, p16, p18, p20, p22, p24;
  __v2di t0, t2, t4, t6, t8, t10, t12;

  /* XOR combinations of the input words; the same combination is formed for
     a and for b on each line. */
  ta[0] = a[0]  ^ a[4]         ; tb[0] = b[0]  ^ b[4];
  ta[1] = a[1]  ^ a[2]         ; tb[1] = b[1]  ^ b[2];
  ta[2] = a[3]  ^ ta[0]        ; tb[2] = b[3]  ^ tb[0];
  pa[0] = ta[1] ^ ta[2]        ; pb[0] = tb[1] ^ tb[2];
  pa[1] = a[2]  ^ ta[2]        ; pb[1] = b[2]  ^ tb[2];
  pa[2] = ta[0] ^ ta[1]        ; pb[2] = tb[0] ^ tb[1];
  pa[3] = a[1]  ^ ta[2]        ; pb[3] = b[1]  ^ tb[2];
  pa[4] = a[0]  ^ a[2]  ^ a[3] ; pb[4] = b[0]  ^ b[2]  ^ b[3];
  pa[5] = a[4]  ^ ta[1]        ; pb[5] = b[4]  ^ tb[1];
  pa[6] = a[3]  ^ a[4]         ; pb[6] = b[3]  ^ b[4];
  pa[7] = a[0]  ^ a[1]         ; pb[7] = b[0]  ^ b[1];
  /* One-word products; each result is a 128-bit vector. */
  p0  = GF2X_FUNC(mul9k3_mul1)(pa[0], pb[0]);
  p2  = GF2X_FUNC(mul9k3_mul1)(pa[1], pb[1]);
  p4  = GF2X_FUNC(mul9k3_mul1)(pa[2], pb[2]);
  p6  = GF2X_FUNC(mul9k3_mul1)(pa[3], pb[3]);
  p8  = GF2X_FUNC(mul9k3_mul1)(pa[4], pb[4]);
  p10 = GF2X_FUNC(mul9k3_mul1)(pa[5], pb[5]);
  p12 = GF2X_FUNC(mul9k3_mul1)(pa[6], pb[6]);
  p14 = GF2X_FUNC(mul9k3_mul1)(pa[7], pb[7]);
  p16 = GF2X_FUNC(mul9k3_mul1)(ta[0], tb[0]);
  /* p18 = GF2X_FUNC(mul9k3_mul1)(a[4],  b[4]); */
  /* Precomputed by the caller; unaligned load of two words from d. */
  p18 = _mm_loadu_si128((__v2di *) d);
  p20 = GF2X_FUNC(mul9k3_mul1)(a[3],  b[3]);
  p22 = GF2X_FUNC(mul9k3_mul1)(a[1],  b[1]);
  p24 = GF2X_FUNC(mul9k3_mul1)(a[0],  b[0]);
  /* Sub-expressions shared by several result terms. */
  t0  = p14 ^ p24;
  t2  = p12 ^ p18;
  t4  = p2  ^ p16;
  t6  = p0  ^ p6;
  t8  = p4  ^ p16;
  t10 = p10 ^ t0;
  t12 = p8  ^ t2;

  /* ceN: term that starts at even word offset N of the result. */
  __v2di ce0 = p24;
  __v2di ce2 = p18 ^ t8  ^ t10;
  __v2di ce4 = p0  ^ p20 ^ p22 ^ t10 ^ t12;
  __v2di ce6 = p24 ^ t4  ^ t12;
  __v2di ce8 = p18;

  /* coN: term that starts at odd word offset N; it straddles two of the
     128-bit stores below, hence the 8-byte shifts. */
  __v2di co1 = p22 ^ t0;
  __v2di co3 = t2  ^ t4  ^ t6;
  __v2di co5 = t0  ^ t6  ^ t8;
  __v2di co7 = p20 ^ t2;

  /* Each store covers two result words: the even term, the high half of the
     previous odd term and the low half of the next odd term. */
  _mm_storeu_si128((__v2di*)(c),   ce0 ^ _mm_slli_si128(co1, 8));
  _mm_storeu_si128((__v2di*)(c+2), ce2 ^ _mm_srli_si128(co1, 8) ^ _mm_slli_si128(co3, 8));
  _mm_storeu_si128((__v2di*)(c+4), ce4 ^ _mm_srli_si128(co3, 8) ^ _mm_slli_si128(co5, 8));
  _mm_storeu_si128((__v2di*)(c+6), ce6 ^ _mm_srli_si128(co5, 8) ^ _mm_slli_si128(co7, 8));
  _mm_storeu_si128((__v2di*)(c+8), ce8 ^ _mm_srli_si128(co7, 8));
}
コード例 #9
0
ファイル: mul5clk_c.c プロジェクト: nesciens/gf2x
/* Multiplies the 5-word operands a[0..4] and b[0..4] and writes the 10-word
 * result to c[0..9], using thirteen one-word products computed by mul1.
 */
GF2X_STORAGE_CLASS_mul5
void gf2x_mul5 (unsigned long *c, const unsigned long *a,
        const unsigned long *b)
{
  /* Montgomery formulae with 13 multiplications, see
     Five, Six, and Seven-Term {K}aratsuba-Like Formulae,
     IEEE Transactions on Computers, volume 54, number 3, p. 362-369, 2005 */
  unsigned long ta[3], tb[3], pa[8], pb[8];
  __v2di p0, p2, p4, p6, p8, p10, p12, p14, p16, p18, p20, p22, p24;
  __v2di t0, t2, t4, t6, t8, t10, t12;
  /* XOR combinations of the input words; the same combination is formed for
     a and for b on each line. */
  ta[0] = a[0]  ^ a[4]         ; tb[0] = b[0]  ^ b[4];
  ta[1] = a[1]  ^ a[2]         ; tb[1] = b[1]  ^ b[2];
  ta[2] = a[3]  ^ ta[0]        ; tb[2] = b[3]  ^ tb[0];
  pa[0] = ta[1] ^ ta[2]        ; pb[0] = tb[1] ^ tb[2];
  pa[1] = a[2]  ^ ta[2]        ; pb[1] = b[2]  ^ tb[2];
  pa[2] = ta[0] ^ ta[1]        ; pb[2] = tb[0] ^ tb[1];
  pa[3] = a[1]  ^ ta[2]        ; pb[3] = b[1]  ^ tb[2];
  pa[4] = a[0]  ^ a[2]  ^ a[3] ; pb[4] = b[0]  ^ b[2]  ^ b[3];
  pa[5] = a[4]  ^ ta[1]        ; pb[5] = b[4]  ^ tb[1];
  pa[6] = a[3]  ^ a[4]         ; pb[6] = b[3]  ^ b[4];
  pa[7] = a[0]  ^ a[1]         ; pb[7] = b[0]  ^ b[1];
  /* The thirteen one-word products; each result is a 128-bit vector. */
  p0  = GF2X_FUNC(mul5clk_c_mul1)(pa[0], pb[0]);
  p2  = GF2X_FUNC(mul5clk_c_mul1)(pa[1], pb[1]);
  p4  = GF2X_FUNC(mul5clk_c_mul1)(pa[2], pb[2]);
  p6  = GF2X_FUNC(mul5clk_c_mul1)(pa[3], pb[3]);
  p8  = GF2X_FUNC(mul5clk_c_mul1)(pa[4], pb[4]);
  p10 = GF2X_FUNC(mul5clk_c_mul1)(pa[5], pb[5]);
  p12 = GF2X_FUNC(mul5clk_c_mul1)(pa[6], pb[6]);
  p14 = GF2X_FUNC(mul5clk_c_mul1)(pa[7], pb[7]);
  p16 = GF2X_FUNC(mul5clk_c_mul1)(ta[0], tb[0]);
  p18 = GF2X_FUNC(mul5clk_c_mul1)(a[4],  b[4]);
  p20 = GF2X_FUNC(mul5clk_c_mul1)(a[3],  b[3]);
  p22 = GF2X_FUNC(mul5clk_c_mul1)(a[1],  b[1]);
  p24 = GF2X_FUNC(mul5clk_c_mul1)(a[0],  b[0]);
  /* Sub-expressions shared by several result terms. */
  t0  = p14 ^ p24;
  t2  = p12 ^ p18;
  t4  = p2  ^ p16;
  t6  = p0  ^ p6;
  t8  = p4  ^ p16;
  t10 = p10 ^ t0;
  t12 = p8  ^ t2;

  /* ceN: term that starts at even word offset N of the result. */
  __v2di ce0 = p24;
  __v2di ce2 = p18 ^ t8  ^ t10;
  __v2di ce4 = p0  ^ p20 ^ p22 ^ t10 ^ t12;
  __v2di ce6 = p24 ^ t4  ^ t12;
  __v2di ce8 = p18;

  /* coN: term that starts at odd word offset N; it straddles two of the
     128-bit stores below, hence the 8-byte shifts. */
  __v2di co1 = p22 ^ t0;
  __v2di co3 = t2  ^ t4  ^ t6;
  __v2di co5 = t0  ^ t6  ^ t8;
  __v2di co7 = p20 ^ t2;

  /* Each store covers two result words: the even term, the high half of the
     previous odd term and the low half of the next odd term. */
  _mm_storeu_si128((__v2di*)(c),   ce0 ^ _mm_slli_si128(co1, 8));
  _mm_storeu_si128((__v2di*)(c+2), ce2 ^ _mm_srli_si128(co1, 8) ^ _mm_slli_si128(co3, 8));
  _mm_storeu_si128((__v2di*)(c+4), ce4 ^ _mm_srli_si128(co3, 8) ^ _mm_slli_si128(co5, 8));
  _mm_storeu_si128((__v2di*)(c+6), ce6 ^ _mm_srli_si128(co5, 8) ^ _mm_slli_si128(co7, 8));
  _mm_storeu_si128((__v2di*)(c+8), ce8 ^ _mm_srli_si128(co7, 8));
}
コード例 #10
0
ファイル: polyval.c プロジェクト: Shay-Gueron/AES-GCM-SIV
/*
 * One Horner step: carry-less multiply T by H and reduce the 256-bit product
 * back to 128 bits with two folds against POLY.
 */
static __m128i polyval_mul_reduce(__m128i T, __m128i H, __m128i POLY)
{
	__m128i lo, hi, mid, fold;

	/* Schoolbook 128x128 carry-less product from four 64x64 products. */
	lo  = _mm_clmulepi64_si128(T, H, 0x00);
	hi  = _mm_clmulepi64_si128(T, H, 0x11);
	mid = _mm_xor_si128(_mm_clmulepi64_si128(T, H, 0x10),
	                    _mm_clmulepi64_si128(T, H, 0x01));
	/* The middle term straddles both halves of the product. */
	lo = _mm_xor_si128(_mm_slli_si128(mid, 8), lo);
	hi = _mm_xor_si128(hi, _mm_srli_si128(mid, 8));

	/* Two reduction folds: multiply the low qword by the high qword of POLY
	   and add it to the qword-swapped value. */
	fold = _mm_clmulepi64_si128(lo, POLY, 0x10);
	lo = _mm_xor_si128(_mm_shuffle_epi32(lo, 78), fold);
	fold = _mm_clmulepi64_si128(lo, POLY, 0x10);
	lo = _mm_xor_si128(_mm_shuffle_epi32(lo, 78), fold);

	return _mm_xor_si128(hi, lo);
}

/*
 * Updates the 16-byte accumulator TAG in place by Horner evaluation over inp
 * with the 16-byte key pH: for every 16-byte block, TAG = (TAG ^ block) * H.
 * A trailing partial block is zero-padded to 16 bytes.
 *
 * TAG    - 16-byte running value, read and written (unaligned access).
 * pH     - 16-byte multiplier (unaligned access).
 * inp    - input bytes.
 * length - number of input bytes; when it is zero or negative the function
 *          returns without touching TAG (a negative value previously reached
 *          memcpy as a huge size).
 */
void Polyval_Horner(unsigned char* TAG,
					unsigned char* pH,
					unsigned char* inp,
					int length)
{
	__m128i T, POLY, H;
	uint8_t B[16]={0};
	int i, has_semi;

	if (length <= 0)
		return;
	has_semi = length%16;
	length /=16;

	H = _mm_loadu_si128(((__m128i*)pH));
	T = _mm_loadu_si128(((__m128i*)TAG));
	POLY = _mm_setr_epi32(0x1,0,0,0xc2000000);
	for (i=0; i< length; i++)
	{
		T = _mm_xor_si128(T, _mm_loadu_si128(&((__m128i*)inp)[i]));
		T = polyval_mul_reduce(T, H, POLY);
	}
	if (has_semi!=0)
	{
		/* B was zero-initialised, so the copy yields a zero-padded block. */
		memcpy(B, inp+length*16, (size_t)has_semi);
		T = _mm_xor_si128(T, _mm_loadu_si128((__m128i*)B));
		T = polyval_mul_reduce(T, H, POLY);
	}
	_mm_storeu_si128(((__m128i*)TAG), T);
}
コード例 #11
0
/*
 * Shifts the whole 128-bit register by shift_num bits using left-shift
 * intrinsics (towards higher lane/bit positions), carrying bits across the
 * 64-bit lane boundary.
 *
 * shift_num == 8 is a plain one-byte shift. For any other value only
 * shift_num % 8 bits are shifted, so the result is a true 128-bit shift only
 * for 0 <= shift_num <= 8.
 */
__m128i shift_right_sse1(__m128i vec, int shift_num) {
	const __m128i byte_shifted = _mm_slli_si128(vec, 1);

	if (shift_num != 8) {
		const int bits = shift_num % 8;
		/* The one-byte-shifted copy supplies the bits that cross from the
		   low 64-bit lane into the high one. */
		const __m128i spill = _mm_srli_epi64(byte_shifted, 8 - bits);
		const __m128i lanes = _mm_slli_epi64(vec, bits);
		return _mm_or_si128(lanes, spill);
	}
	return byte_shifted;
}
コード例 #12
0
ファイル: AESNI.c プロジェクト: 26618929/pycrypto
/*
 * AES key-expansion step.
 *
 * key      - previous round key; replaced by the running XOR of its 32-bit
 *            words (w0, w0^w1, w0^w1^w2, w0^w1^w2^w3).
 * keygened - key-generation assist value.
 * shuf     - word selector with the same encoding as the _mm_shuffle_epi32
 *            immediate: bits [2i+1:2i] pick the source word for result word i
 *            (0xff broadcasts word 3, 0xaa broadcasts word 2).
 *
 * Returns the transformed key XORed with the selected words of keygened.
 *
 * _mm_shuffle_epi32 needs a compile-time immediate, so passing the runtime
 * parameter shuf to it only builds when the compiler happens to inline and
 * constant-fold the call. The selection is therefore done on the four words
 * directly, which works for any runtime selector.
 */
static __m128i aes128_keyexpand(__m128i key, __m128i keygened, int shuf)
{
    const unsigned sel = (unsigned)shuf;
    int words[4];

    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));

    _mm_storeu_si128((__m128i *)words, keygened);
    keygened = _mm_set_epi32(words[(sel >> 6) & 3], words[(sel >> 4) & 3],
                             words[(sel >> 2) & 3], words[sel & 3]);
    return _mm_xor_si128(key, keygened);
}
コード例 #13
0
/*
 * AES-128 key-expansion assist.
 *
 * Returns the running XOR of the 32-bit words of a
 * (w0, w0^w1, w0^w1^w2, w0^w1^w2^w3), with the top word of b XORed into
 * every word.
 */
static __m128i assist128(__m128i a, __m128i b)
{
    const __m128i top_word = _mm_shuffle_epi32 (b, 0xff);
    const __m128i by_one   = _mm_slli_si128 (a, 0x04);
    const __m128i by_two   = _mm_slli_si128 (by_one, 0x04);
    const __m128i by_three = _mm_slli_si128 (by_two, 0x04);
    __m128i acc = _mm_xor_si128 (a, by_one);

    acc = _mm_xor_si128 (acc, by_two);
    acc = _mm_xor_si128 (acc, by_three);
    return _mm_xor_si128 (acc, top_word);
}
コード例 #14
0
ファイル: encrypt.c プロジェクト: 0x64616E69656C/supercop
/*
 * AES-128 key-expansion step.
 *
 * Returns the running XOR of the 32-bit words of key
 * (w0, w0^w1, w0^w1^w2, w0^w1^w2^w3), with the top word of keygened XORed
 * into every word.
 */
__m128i aes128_keyexpand(__m128i key, __m128i keygened)
{
	const __m128i top_word = _mm_shuffle_epi32(keygened, _MM_SHUFFLE(3,3,3,3));
	/* Three chained (x ^ x<<4) steps equal key ^ key<<4 ^ key<<8 ^ key<<12,
	   with shifts counted in bytes. */
	__m128i acc = _mm_xor_si128(key, _mm_slli_si128(key, 4));

	acc = _mm_xor_si128(acc, _mm_slli_si128(key, 8));
	acc = _mm_xor_si128(acc, _mm_slli_si128(key, 12));
	return _mm_xor_si128(acc, top_word);
}
コード例 #15
0
// High-bitdepth D153 intra predictor for a 16x16 block (SSSE3).
//
// dst/stride: destination block; each row is written as two aligned 128-bit
//             stores, so row starts must be 16-byte aligned.
// above:      row above the block; above[-1] .. above[14] are read.
// left:       column left of the block, 16 samples, read with aligned loads.
// bd:         bit depth, unused here.
//
// The first output row is built from the 3-tap filtered above row; every
// following row is the previous row shifted up by two samples, with one
// (avg2, avg3) pair of the filtered left column inserted at the bottom.
void vpx_highbd_d153_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  // A0 = above[-1..6], A1 = above[7..14].
  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
  // B* / C*: the above row advanced by one and two samples; the tail of the
  // second half is zero-filled.
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_srli_si128(A1, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_srli_si128(A1, 4);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i L0 = _mm_load_si128((const __m128i *)left);
  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
  // XL0  = above[-1], left[0..6]
  // AXL0 = above[0], above[-1], left[0..5]
  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
  const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14);
  // XL1 = left[7..14], AXL1 = left[6..13]
  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
  const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12);
  const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0);
  const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0);
  const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1);
  const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1);
  __m128i row_0 = avg3_0;
  __m128i row_1 = avg3_1;
  __m128i avg2_avg3_left[2][2];
  int i, j;
  (void)bd;

  // Interleave the 2-tap and 3-tap left results: each vector holds four
  // (avg2, avg3) pairs, i.e. the insert data for four output rows.
  avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0);
  avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0);
  avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1);
  avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1);

  for (j = 0; j < 2; ++j) {
    for (i = 0; i < 2; ++i) {
      const __m128i avg2_avg3 = avg2_avg3_left[j][i];
      // Four rows per vector. Each step moves the 16-sample row up by two
      // samples (row_1 takes the top two samples of row_0) and inserts the
      // next pair; the byte shifts 12, 8, 4 and 0 bring pairs 0, 1, 2 and 3
      // to the top of the vector so that alignr picks them up.
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      dst += stride;
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      dst += stride;
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      dst += stride;
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      dst += stride;
    }
  }
}
コード例 #16
0
// High-bitdepth D135 intra predictor for a 32x32 block (SSSE3).
//
// dst/stride: destination block; each row is written as four aligned 128-bit
//             stores, so row starts must be 16-byte aligned.
// above:      row above the block; above[-1] .. above[31] are read.
// left:       column left of the block, 32 samples, read with aligned loads.
// bd:         bit depth, unused here.
//
// The four row vectors start as the 3-tap filtered above row. For every
// output row they are shifted up by one sample and one sample of the filtered
// left column is inserted at the bottom.
// NOTE(review): rotr_epu16() is handed the address of avg_left, so it
// presumably rotates that vector in place on each call - confirm.
void vpx_highbd_d135_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
  // A*: above row starting at above[-1] (unaligned loads).
  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
  const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15));
  const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23));
  // B*: above row starting at above[0] (aligned loads).
  const __m128i B0 = _mm_load_si128((const __m128i *)above);
  const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i B2 = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i B3 = _mm_load_si128((const __m128i *)(above + 24));
  const __m128i L0 = _mm_load_si128((const __m128i *)left);
  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
  const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
  // C*: above row starting at above[1]; the last sample of C3 is zero.
  const __m128i C0 = _mm_alignr_epi8(B1, B0, 2);
  const __m128i C1 = _mm_alignr_epi8(B2, B1, 2);
  const __m128i C2 = _mm_alignr_epi8(B3, B2, 2);
  const __m128i C3 = _mm_srli_si128(B3, 2);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
  const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
  // XL*: left column delayed by one sample, with above[-1] in front.
  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
  const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
  const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
  // L*_: left column delayed by two samples, with above[0], above[-1] in
  // front.
  const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14);
  const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14);
  const __m128i L2_ = _mm_alignr_epi8(XL2, XL1, 14);
  const __m128i L3_ = _mm_alignr_epi8(XL3, XL2, 14);
  __m128i rowa_0 = avg3_0;
  __m128i rowa_1 = avg3_1;
  __m128i rowa_2 = avg3_2;
  __m128i rowa_3 = avg3_3;
  __m128i avg3_left[4];
  int i, j;
  (void)bd;
  avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_);
  avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_);
  avg3_left[2] = avg3_epu16(&L2, &XL2, &L2_);
  avg3_left[3] = avg3_epu16(&L3, &XL3, &L3_);
  // Outer loop: one filtered-left vector per eight output rows.
  for (i = 0; i < 4; ++i) {
    __m128i avg_left = avg3_left[i];
    for (j = 0; j < 8; ++j) {
      // Shift the 32-sample row up by one sample, highest vector first so
      // that each vector still sees the unshifted value of the one below it.
      rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14);
      rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14);
      rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
      rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
      _mm_store_si128((__m128i *)dst, rowa_0);
      _mm_store_si128((__m128i *)(dst + 8), rowa_1);
      _mm_store_si128((__m128i *)(dst + 16), rowa_2);
      _mm_store_si128((__m128i *)(dst + 24), rowa_3);
      dst += stride;
    }
  }
}
コード例 #17
0
ファイル: filter_sse.c プロジェクト: alexzeitgeist/YT2
/* Reverses the PNG "Average" filter in place for rows of 4-byte pixels,
 * handling 16 bytes (four pixels) per iteration.
 *
 * row_info - only rowbytes is read; the block count is rowbytes rounded up
 *            to a multiple of 16.
 * row      - filtered row, overwritten with the reconstructed bytes.
 * prev_row - previous (already reconstructed) row.
 *
 * Both rows are accessed with aligned 128-bit loads/stores over the
 * rounded-up length. Pixels depend on their left neighbour, so each block is
 * resolved one pixel at a time: the pixel in the low four bytes is
 * reconstructed by calculate_pixel_avg(), then the block is rotated so the
 * result moves to the top and the next raw pixel moves into the low bytes.
 */
void
png_read_filter_row_avg4_sse(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   const __m128i* above = (const __m128i*)prev_row;
   __m128i* cur = (__m128i*)row;
   __m128i left = _mm_setzero_si128();
   const __m128i mask = _mm_set1_epi8(0x01);
   png_size_t blocks;
   int px;

   for (blocks = (row_info->rowbytes + 15) >> 4; blocks > 0; blocks--)
   {
      __m128i up = _mm_load_si128(above++);
      __m128i raw = _mm_load_si128(cur);

      for (px = 0; px < 4; px++)
      {
         left = calculate_pixel_avg(raw, up, left, mask);
         /* Bring the next pixel of the row above into the low bytes. */
         up = _mm_srli_si128(up, 4);
         /* Drop the raw pixel just consumed and append the reconstructed
            one at the top of the block. */
#ifndef __SSSE3__
         raw = _mm_srli_si128(raw, 4);
         raw = _mm_or_si128(raw, _mm_slli_si128(left, 12));
#else
         raw = _mm_alignr_epi8(left, raw, 4);
#endif
      }

      _mm_store_si128(cur++, raw);
   }
}
コード例 #18
0
ファイル: botan_all_aesni.cpp プロジェクト: adolby/Kryvos
// AES-256 key-expansion step for the rounds that use no round constant.
//
// Returns the running XOR of the 32-bit words of key
// (w0, w0^w1, w0^w1^w2, w0^w1^w2^w3), with word 2 of
// AESKEYGENASSIST(key2, 0) XORed into every word.
__m128i aes_256_key_expansion(__m128i key, __m128i key2)
   {
   const __m128i assist =
      _mm_shuffle_epi32(_mm_aeskeygenassist_si128(key2, 0x00), _MM_SHUFFLE(2,2,2,2));

   __m128i acc = key;
   for(int pass = 0; pass != 3; ++pass)
      acc = _mm_xor_si128(acc, _mm_slli_si128(acc, 4));

   return _mm_xor_si128(acc, assist);
   }
コード例 #19
0
/*
 * AES-256 key-expansion helper.
 *
 * tmp1 - in/out: replaced by the running XOR of its 32-bit words
 *        (w0, w0^w1, w0^w1^w2, w0^w1^w2^w3), XORed with the broadcast top
 *        word of *tmp2.
 * tmp2 - in/out: its top 32-bit word is broadcast to all four words.
 */
static inline void ExpandAESKey256_sub1(__m128i *tmp1, __m128i *tmp2)
{
	__m128i slide;
	int pass;

	*tmp2 = _mm_shuffle_epi32(*tmp2, 0xFF);

	/* Fold in the original value shifted by one, two and three words. */
	slide = *tmp1;
	for (pass = 0; pass < 3; ++pass) {
		slide = _mm_slli_si128(slide, 0x04);
		*tmp1 = _mm_xor_si128(*tmp1, slide);
	}

	*tmp1 = _mm_xor_si128(*tmp1, *tmp2);
}
コード例 #20
0
ファイル: aesni.c プロジェクト: 6e6f36/hashkill
/*
 * AES-256 key-schedule assist, first half.
 *
 * temp1 - in/out: replaced by the running XOR of its 32-bit words
 *         (w0, w0^w1, w0^w1^w2, w0^w1^w2^w3), XORed with the broadcast top
 *         word of *temp2.
 * temp2 - in/out: its top 32-bit word is broadcast to all four words.
 */
static inline void KEY_256_ASSIST_1(__m128i* temp1, __m128i * temp2)
{
    __m128i slide;
    int remaining = 3;

    *temp2 = _mm_shuffle_epi32(*temp2, 0xff);

    slide = *temp1;
    while (remaining-- > 0) {
        /* Each pass shifts one more 32-bit word and folds it in. */
        slide = _mm_slli_si128 (slide, 0x4);
        *temp1 = _mm_xor_si128 (*temp1, slide);
    }

    *temp1 = _mm_xor_si128 (*temp1, *temp2);
}
コード例 #21
0
ファイル: aesni.c プロジェクト: behemot/pm
/*
 * AES-128 key-expansion assist.
 *
 * t1 - previous round key.
 * t2 - key-generation assist value; only its top 32-bit word is used.
 *
 * Returns the running XOR of the 32-bit words of t1
 * (w0, w0^w1, w0^w1^w2, w0^w1^w2^w3), with the top word of t2 XORed into
 * every word.
 *
 * Declared static inline: under C99/C11 rules a plain `inline` definition
 * provides no external definition, so any call the compiler does not inline
 * (for example in an unoptimised build) fails at link time.
 */
static inline __m128i aesni_128_assist(__m128i t1, __m128i t2)
{
	__m128i t3;
	t2 = _mm_shuffle_epi32(t2 ,0xff);
	/* Fold in t1 shifted by one, two and three 32-bit words. */
	t3 = _mm_slli_si128(t1, 0x4);
	t1 = _mm_xor_si128(t1, t3);
	t3 = _mm_slli_si128(t3, 0x4);
	t1 = _mm_xor_si128(t1, t3);
	t3 = _mm_slli_si128(t3, 0x4);
	t1 = _mm_xor_si128(t1, t3);
	t1 = _mm_xor_si128(t1, t2);
	return t1;
}
コード例 #22
0
ファイル: aesni.c プロジェクト: 6e6f36/hashkill
/*
 * AES-256 key-schedule assist, second half.
 *
 * temp1 - input only: fed to AESKEYGENASSIST with round constant 0; word 2
 *         of the result is broadcast and used as the assist word.
 * temp3 - in/out: replaced by the running XOR of its 32-bit words
 *         (w0, w0^w1, w0^w1^w2, w0^w1^w2^w3), XORed with the assist word.
 */
static inline void KEY_256_ASSIST_2(__m128i* temp1, __m128i * temp3)
{
    const __m128i assist =
        _mm_shuffle_epi32(_mm_aeskeygenassist_si128 (*temp1, 0x0), 0xaa);
    __m128i slide = *temp3;
    int pass;

    for (pass = 0; pass < 3; ++pass) {
        slide = _mm_slli_si128 (slide, 0x4);
        *temp3 = _mm_xor_si128 (*temp3, slide);
    }

    *temp3 = _mm_xor_si128 (*temp3, assist);
}
コード例 #23
0
ファイル: siv.c プロジェクト: medsec/riv
static __m128i aes_keygen_assist(__m128i temp1, __m128i temp2)
{
    __m128i temp3;
    temp2 = _mm_shuffle_epi32(temp2, 0xff);
    temp3 = _mm_slli_si128(temp1, 0x4);
    temp1 = vxor(temp1, temp3);
    temp3 = _mm_slli_si128(temp3, 0x4);
    temp1 = vxor(temp1, temp3);
    temp3 = _mm_slli_si128(temp3, 0x4);
    temp1 = vxor(temp1, temp3);
    temp1 = vxor(temp1, temp2);
    return temp1;
}
コード例 #24
0
ファイル: aes_key_expansion.c プロジェクト: sduc/AES-hashfunc
/*
 * AES-128 key-expansion assist.
 *
 * temp1 - previous round key.
 * temp2 - key-generation assist value; only its top 32-bit word is used.
 *
 * Returns the running XOR of the 32-bit words of temp1
 * (w0, w0^w1, w0^w1^w2, w0^w1^w2^w3), with the top word of temp2 XORed into
 * every word.
 *
 * Declared static inline: under C99/C11 rules a plain `inline` definition
 * provides no external definition, so any call the compiler does not inline
 * (for example in an unoptimised build) fails at link time.
 */
static inline __m128i AES_128_ASSIST (__m128i temp1, __m128i temp2)
        {
            __m128i temp3;
            temp2 = _mm_shuffle_epi32 (temp2 ,0xff);
            /* Fold in temp1 shifted by one, two and three 32-bit words. */
            temp3 = _mm_slli_si128 (temp1, 0x4);
            temp1 = _mm_xor_si128 (temp1, temp3);
            temp3 = _mm_slli_si128 (temp3, 0x4);
            temp1 = _mm_xor_si128 (temp1, temp3);
            temp3 = _mm_slli_si128 (temp3, 0x4);
            temp1 = _mm_xor_si128 (temp1, temp3);
            temp1 = _mm_xor_si128 (temp1, temp2);
            return temp1;
        }
コード例 #25
0
ファイル: ghash_pclmulqdq_impl.c プロジェクト: nomaster/fastd
/**
 * Performs a carryless multiplication of two 128bit integers modulo \f$ x^{128} + x^7 + x^2 + x + 1 \f$
 *
 * The 256-bit product is built from three 64x64 carry-less multiplications
 * (Karatsuba), shifted left by one bit, and then reduced to 128 bits.
 *
 * shl() and shr() are helpers defined outside this function.
 * NOTE(review): presumably 128-bit logical shifts by the given bit count -
 * confirm at their definitions.
 */
static __m128i gmul(__m128i v, __m128i h) {
	/* multiply */
	__m128i z0, z1, z2, tmp;
	/* z0: product of the high qwords, z2: product of the low qwords */
	z0 = _mm_clmulepi64_si128(v, h, 0x11);
	z2 = _mm_clmulepi64_si128(v, h, 0x00);

	/* low qword of tmpv / tmph = high qword XOR low qword */
	__m128i tmpv = _mm_srli_si128(v, 8);
	tmpv = _mm_xor_si128(tmpv, v);

	__m128i tmph = _mm_srli_si128(h, 8);
	tmph = _mm_xor_si128(tmph, h);

	/* Karatsuba middle term: (vh^vl)*(hh^hl) ^ z0 ^ z2 */
	z1 = _mm_clmulepi64_si128(tmpv, tmph, 0x00);
	z1 = _mm_xor_si128(z1, z0);
	z1 = _mm_xor_si128(z1, z2);

	/* The middle term straddles both 128-bit halves of the product. */
	tmp = _mm_srli_si128(z1, 8);
	__m128i pl = _mm_xor_si128(z0, tmp);

	tmp = _mm_slli_si128(z1, 8);
	__m128i ph = _mm_xor_si128(z2, tmp);

	/* Shift the 256-bit value (pl:ph) by one bit: the top bit of the high
	   qword of ph is carried into the lowest bit of pl. */
	tmp = _mm_srli_epi64(ph, 63);
	tmp = _mm_srli_si128(tmp, 8);

	pl = shl(pl, 1);
	pl = _mm_xor_si128(pl, tmp);

	ph = shl(ph, 1);

	/* reduce */
	__m128i b, c;
	/* b and c start as the low qword of ph moved into the high qword. */
	b = c = _mm_slli_si128(ph, 8);

	b = _mm_slli_epi64(b, 62);
	c = _mm_slli_epi64(c, 57);

	tmp = _mm_xor_si128(b, c);
	__m128i d = _mm_xor_si128(ph, tmp);

	/* Fold d into pl together with its copies shifted by 1, 2 and 7 bits,
	   matching the x, x^2 and x^7 terms of the modulus. */
	__m128i e = shr(d, 1);
	__m128i f = shr(d, 2);
	__m128i g = shr(d, 7);

	pl = _mm_xor_si128(pl, d);
	pl = _mm_xor_si128(pl, e);
	pl = _mm_xor_si128(pl, f);
	pl = _mm_xor_si128(pl, g);

	return pl;
}
コード例 #26
0
ファイル: AES.cpp プロジェクト: netromdk/faes
 // AES-256 key-schedule assist, first half.
 //
 // tmp  - in/out: replaced by the running XOR of its 32-bit words
 //        (w0, w0^w1, w0^w1^w2, w0^w1^w2^w3), XORed with the broadcast
 //        written to *tmp2.
 // tmp2 - in/out: its fourth 32-bit word is replicated into all four words.
 void Cryptor::assistKey256_1(__m128i *tmp, __m128i *tmp2) {
   *tmp2 = _mm_shuffle_epi32(*tmp2, SHUFFLE4_32(3, 3, 3, 3));

   // Fold in the original value shifted by one, two and three words.
   __m128i slide = *tmp;
   for (int pass = 0; pass < 3; ++pass) {
     slide = _mm_slli_si128(slide, 0x4);
     *tmp = _mm_xor_si128(*tmp, slide);
   }

   *tmp = _mm_xor_si128(*tmp, *tmp2);
 }
コード例 #27
0
/*
 * AES-256 key-expansion helper, second half.
 *
 * tmp1 - input only: fed to AESKEYGENASSIST with round constant 0; word 2 of
 *        the result is broadcast and used as the assist word.
 * tmp3 - in/out: replaced by the running XOR of its 32-bit words
 *        (w0, w0^w1, w0^w1^w2, w0^w1^w2^w3), XORed with the assist word.
 */
static inline void ExpandAESKey256_sub2(__m128i *tmp1, __m128i *tmp3)
{
	const __m128i assist =
		_mm_shuffle_epi32(_mm_aeskeygenassist_si128(*tmp1, 0x00), 0xAA);
	__m128i slide = *tmp3;
	int remaining = 3;

	while (remaining-- > 0) {
		slide = _mm_slli_si128(slide, 0x04);
		*tmp3 = _mm_xor_si128(*tmp3, slide);
	}

	*tmp3 = _mm_xor_si128(*tmp3, assist);
}
コード例 #28
0
ファイル: siphash_ssse3.c プロジェクト: dbremner/smhasher
/*
 * Computes a 64-bit hash of m[0..len-1] under the 16-byte key.
 *
 * One sipcompress() is applied per 8-byte message word and three after the
 * finalisation constant is mixed in.
 *
 * sipcompress() is a macro defined outside this function and is invoked
 * without arguments, so it operates on the locals declared here by name.
 * NOTE(review): v20, v11 and v33 are not referenced directly in this
 * function and are presumably scratch registers for that macro - confirm
 * before renaming or removing any local.
 */
uint64_t
siphash13(const unsigned char key[16], const unsigned char *m, size_t len) {
	xmmi k,v02,v20,v13,v11,v33,mi;
	uint64_t last7;
	uint32_t lo, hi;
	size_t i, blocks;

	/* Initial state: the constants XORed with the key halves
	   (low half into v02, high half into v13). */
	k = _mm_loadu_si128((xmmi *)(key + 0));
	v02 = siphash_init[0].v;
	v13 = siphash_init[1].v;
	v02 = _mm_xor_si128(v02, _mm_unpacklo_epi64(k, k));
	v13 = _mm_xor_si128(v13, _mm_unpackhi_epi64(k, k));

	/* The final word carries the low byte of len in its top byte. */
	last7 = (uint64_t)(len & 0xff) << 56;

	/* Whole 8-byte words: XOR into the high qword of v13 before the
	   compression and into the low qword of v02 after it. */
	for (i = 0, blocks = (len & ~7); i < blocks; i += 8) {
		mi = _mm_loadl_epi64((xmmi *)(m + i));
		v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8));
		sipcompress()
		v02 = _mm_xor_si128(v02, mi);
	}

	/* Gather the 0..7 trailing bytes, little-endian, into last7.
	   Every case intentionally falls through to the ones below it. */
	switch (len - blocks) {
		case 7: last7 |= (uint64_t)m[i + 6] << 48;
		case 6: last7 |= (uint64_t)m[i + 5] << 40;
		case 5: last7 |= (uint64_t)m[i + 4] << 32;
		case 4: last7 |= (uint64_t)m[i + 3] << 24;
		case 3: last7 |= (uint64_t)m[i + 2] << 16;
		case 2: last7 |= (uint64_t)m[i + 1] <<  8;
		case 1: last7 |= (uint64_t)m[i + 0]      ;
		case 0:
		default:;
	};

	/* Build the final 64-bit word in the low qword of mi from two 32-bit
	   halves, then process it like a regular message word. */
	mi = _mm_unpacklo_epi32(_mm_cvtsi32_si128((uint32_t)last7),_mm_cvtsi32_si128((uint32_t)(last7 >> 32)));
	v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8));
	sipcompress()
	v02 = _mm_xor_si128(v02, mi);
	/* Finalisation: mix in the final constant, then three more rounds. */
	v02 = _mm_xor_si128(v02, siphash_final.v);
	sipcompress()
	sipcompress()
	sipcompress()

	/* Fold the state: XOR v13 into v02, then XOR the two qwords of v02
	   together; the low 64 bits are the hash. */
	v02 = _mm_xor_si128(v02, v13);
	v02 = _mm_xor_si128(v02, _mm_shuffle_epi32(v02, _MM_SHUFFLE(1,0,3,2)));
	lo = _mm_cvtsi128_si32(v02);
	hi = _mm_cvtsi128_si32(_mm_srli_si128(v02, 4));
	return ((uint64_t)hi << 32) | lo;
}
コード例 #29
0
ファイル: SimdSse2BgrToBgra.cpp プロジェクト: 4144/Simd
        // Converts three planar channels (blue, green, red) into an
        // interleaved 32-bit BGRA image with a constant alpha.
        //
        // width must be at least HA. Every row is processed in steps of HA
        // pixels by the per-block Bgr48pToBgra32 overload; when width is not
        // a multiple of HA, the last HA pixels of the row are converted again
        // with an unaligned call so the tail is covered.
        // When align is true, all plane pointers and strides must satisfy
        // Aligned().
        template <bool align> void Bgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height,
            const uint8_t * green, size_t greenStride, const uint8_t * red, size_t redStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha)
        {
            assert(width >= HA);
            if(align)
            {
                assert(Aligned(blue) && Aligned(blueStride));
                assert(Aligned(green) && Aligned(greenStride));
                assert(Aligned(red) && Aligned(redStride));
                assert(Aligned(bgra) && Aligned(bgraStride));
            }

            // Alpha replicated into the odd bytes of the register (the
            // one-byte shift moves it from the low to the high byte of each
            // 16-bit lane).
            __m128i alphaHigh = _mm_slli_si128(_mm_set1_epi16(alpha), 1);
            const size_t bodyWidth = AlignLo(width, HA);
            const bool hasTail = (width != bodyWidth);

            for(size_t row = 0; row < height; ++row)
            {
                size_t srcOffset = 0;
                size_t dstOffset = 0;
                for(size_t col = 0; col < bodyWidth; col += HA)
                {
                    Bgr48pToBgra32<align>(bgra + dstOffset, blue, green, red, srcOffset, alphaHigh);
                    srcOffset += A;
                    dstOffset += DA;
                }
                if(hasTail)
                    Bgr48pToBgra32<false>(bgra + (width - HA)*4, blue, green, red, (width - HA)*2, alphaHigh);

                blue += blueStride;
                green += greenStride;
                red += redStride;
                bgra += bgraStride;
            }
        }
コード例 #30
0
ファイル: cn_gpu_ssse3.cpp プロジェクト: mike2001/xmrig
// Runs single_compute() on the four inputs, rotates the 128-bit result by
// `rot` bytes and XORs it into `out`. `sum` is passed through to
// single_compute().
//
// NOTE(review): `rot` is not declared anywhere in the visible text. It is
// used as a template argument and as an intrinsic shift count, so it is
// presumably a non-type template parameter whose `template<...>` line
// precedes this definition - confirm against the original file.
inline void single_compute_wrap(__m128 n0, __m128 n1, __m128 n2,  __m128 n3, float cnt, __m128 rnd_c, __m128& sum, __m128i& out)
{
    // The boolean template argument is true for odd values of rot.
    __m128i r = single_compute<rot % 2 != 0>(n0, n1, n2, n3, cnt, rnd_c, sum);
    // Byte-wise rotation of the whole register built from two byte shifts;
    // skipped for rot == 0, where one of the shifts would be by 16 bytes.
    if(rot != 0)
        r = _mm_or_si128(_mm_slli_si128(r, 16 - rot), _mm_srli_si128(r, rot));
    out = _mm_xor_si128(out, r);
}