Example #1
void fb_sqrl_low(dig_t *c, const dig_t *a) {
	__m128i m0, m1, m2, m3, m4, m5, m6, mask;
	__m128i t0;

	t0 = _mm_set_epi32(0x55545150, 0x45444140, 0x15141110, 0x05040100);
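	/* t0 maps each 4-bit nibble to its "spread" byte (bit i -> bit 2i): squaring
	 * in a binary field just spreads the coefficient bits apart, so the low and
	 * high nibbles of every byte are looked up separately below and then
	 * re-interleaved with unpacklo/unpackhi */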
	mask = _mm_set_epi32(0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F);

	m0 = _mm_load_si128((__m128i *)(a));
	m1 = _mm_and_si128(m0, mask);
	m1 = _mm_shuffle_epi8(t0, m1);
	m2 = _mm_srli_epi64(m0, 4);
	m2 = _mm_and_si128(m2, mask);
	m2 = _mm_shuffle_epi8(t0, m2);
	m3 = _mm_unpacklo_epi8(m1, m2);
	m4 = _mm_unpackhi_epi8(m1, m2);

	m0 = _mm_load_si128((__m128i *)(a+2));
	m1 = _mm_and_si128(m0, mask);
	m1 = _mm_shuffle_epi8(t0, m1);
	m2 = _mm_srli_epi64(m0, 4);
	m2 = _mm_and_si128(m2, mask);
	m2 = _mm_shuffle_epi8(t0, m2);
	m5 = _mm_unpacklo_epi8(m1, m2);
	m6 = _mm_unpackhi_epi8(m1, m2);

	_mm_store_si128((__m128i *)(c + 0), m3);
	_mm_store_si128((__m128i *)(c + 2), m4);
	_mm_store_si128((__m128i *)(c + 4), m5);
	_mm_store_si128((__m128i *)(c + 6), m6);
}
Example #2
static WEBP_INLINE void ProcessRow(const __m128i* const A0,
                                   const __m128i* const A1,
                                   const __m128i* const A2,
                                   const __m128i* const A3,
                                   const __m128i* const mult,
                                   uint8_t* const dst) {
  const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
  const __m128i mask = _mm_set_epi32(0xffffffffu, 0, 0xffffffffu, 0);
  const __m128i B0 = _mm_mul_epu32(*A0, *mult);
  const __m128i B1 = _mm_mul_epu32(*A1, *mult);
  const __m128i B2 = _mm_mul_epu32(*A2, *mult);
  const __m128i B3 = _mm_mul_epu32(*A3, *mult);
  const __m128i C0 = _mm_add_epi64(B0, rounder);
  const __m128i C1 = _mm_add_epi64(B1, rounder);
  const __m128i C2 = _mm_add_epi64(B2, rounder);
  const __m128i C3 = _mm_add_epi64(B3, rounder);
  const __m128i D0 = _mm_srli_epi64(C0, WEBP_RESCALER_RFIX);
  const __m128i D1 = _mm_srli_epi64(C1, WEBP_RESCALER_RFIX);
  const __m128i D2 = _mm_and_si128(C2, mask);
  const __m128i D3 = _mm_and_si128(C3, mask);
  const __m128i E0 = _mm_or_si128(D0, D2);
  const __m128i E1 = _mm_or_si128(D1, D3);
  const __m128i F = _mm_packs_epi32(E0, E1);
  const __m128i G = _mm_packus_epi16(F, F);
  _mm_storel_epi64((__m128i*)dst, G);
}
Example #3
static WEBP_INLINE void ProcessRow_Floor_SSE2(const __m128i* const A0,
                                              const __m128i* const A1,
                                              const __m128i* const A2,
                                              const __m128i* const A3,
                                              const __m128i* const mult,
                                              uint8_t* const dst) {
  const __m128i mask = _mm_set_epi32(0xffffffffu, 0, 0xffffffffu, 0);
  const __m128i B0 = _mm_mul_epu32(*A0, *mult);
  const __m128i B1 = _mm_mul_epu32(*A1, *mult);
  const __m128i B2 = _mm_mul_epu32(*A2, *mult);
  const __m128i B3 = _mm_mul_epu32(*A3, *mult);
  const __m128i D0 = _mm_srli_epi64(B0, WEBP_RESCALER_RFIX);
  const __m128i D1 = _mm_srli_epi64(B1, WEBP_RESCALER_RFIX);
#if (WEBP_RESCALER_RFIX < 32)
  const __m128i D2 =
      _mm_and_si128(_mm_slli_epi64(B2, 32 - WEBP_RESCALER_RFIX), mask);
  const __m128i D3 =
      _mm_and_si128(_mm_slli_epi64(B3, 32 - WEBP_RESCALER_RFIX), mask);
#else
  const __m128i D2 = _mm_and_si128(B2, mask);
  const __m128i D3 = _mm_and_si128(B3, mask);
#endif
  const __m128i E0 = _mm_or_si128(D0, D2);
  const __m128i E1 = _mm_or_si128(D1, D3);
  const __m128i F = _mm_packs_epi32(E0, E1);
  const __m128i G = _mm_packus_epi16(F, F);
  _mm_storel_epi64((__m128i*)dst, G);
}
Example #4
unsigned int vp9_avg_8x8_sse2(const uint8_t *s, int p) {
  __m128i s0, s1, u0;
  unsigned int avg = 0;
  u0  = _mm_setzero_si128();
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);

  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8));
  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 32));
  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
  avg = _mm_extract_epi16(s0, 0);
  return (avg + 32) >> 6;
}
Example #5
static void ConvertBGRAToBGR_SSE2(const uint32_t* src,
                                  int num_pixels, uint8_t* dst) {
  const __m128i mask_l = _mm_set_epi32(0, 0x00ffffff, 0, 0x00ffffff);
  const __m128i mask_h = _mm_set_epi32(0x00ffffff, 0, 0x00ffffff, 0);
  const __m128i* in = (const __m128i*)src;
  const uint8_t* const end = dst + num_pixels * 3;
  // the last storel_epi64 below writes 8 bytes starting at offset 18
  while (dst + 26 <= end) {
    const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
    const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
    const __m128i a0l = _mm_and_si128(bgra0, mask_l);   // bgr0|0|bgr0|0
    const __m128i a4l = _mm_and_si128(bgra4, mask_l);   // bgr0|0|bgr0|0
    const __m128i a0h = _mm_and_si128(bgra0, mask_h);   // 0|bgr0|0|bgr0
    const __m128i a4h = _mm_and_si128(bgra4, mask_h);   // 0|bgr0|0|bgr0
    const __m128i b0h = _mm_srli_epi64(a0h, 8);         // 000b|gr00|000b|gr00
    const __m128i b4h = _mm_srli_epi64(a4h, 8);         // 000b|gr00|000b|gr00
    const __m128i c0 = _mm_or_si128(a0l, b0h);          // rgbrgb00|rgbrgb00
    const __m128i c4 = _mm_or_si128(a4l, b4h);          // rgbrgb00|rgbrgb00
    const __m128i c2 = _mm_srli_si128(c0, 8);
    const __m128i c6 = _mm_srli_si128(c4, 8);
    _mm_storel_epi64((__m128i*)(dst +   0), c0);
    _mm_storel_epi64((__m128i*)(dst +   6), c2);
    _mm_storel_epi64((__m128i*)(dst +  12), c4);
    _mm_storel_epi64((__m128i*)(dst +  18), c6);
    dst += 24;
    num_pixels -= 8;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, dst);
  }
}
Example #6
static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) {
  int x_out;
  uint8_t* const dst = wrk->dst;
  rescaler_t* const irow = wrk->irow;
  const int x_out_max = wrk->dst_width * wrk->num_channels;
  const rescaler_t* const frow = wrk->frow;
  const __m128i mult = _mm_set_epi32(0, wrk->fy_scale, 0, wrk->fy_scale);

  assert(!WebPRescalerOutputDone(wrk));
  assert(wrk->y_accum <= 0 && wrk->y_sub + wrk->y_accum >= 0);
  assert(wrk->y_expand);
  if (wrk->y_accum == 0) {
    for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
      __m128i A0, A1, A2, A3;
      LoadDispatchAndMult(frow + x_out, NULL, &A0, &A1, &A2, &A3);
      ProcessRow(&A0, &A1, &A2, &A3, &mult, dst + x_out);
    }
    for (; x_out < x_out_max; ++x_out) {
      const uint32_t J = frow[x_out];
      const int v = (int)MULT_FIX(J, wrk->fy_scale);
      assert(v >= 0 && v <= 255);
      dst[x_out] = v;
    }
  } else {
    const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
    const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
    const __m128i mA = _mm_set_epi32(0, A, 0, A);
    const __m128i mB = _mm_set_epi32(0, B, 0, B);
    const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
    for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
      __m128i A0, A1, A2, A3, B0, B1, B2, B3;
      LoadDispatchAndMult(frow + x_out, &mA, &A0, &A1, &A2, &A3);
      LoadDispatchAndMult(irow + x_out, &mB, &B0, &B1, &B2, &B3);
      {
        const __m128i C0 = _mm_add_epi64(A0, B0);
        const __m128i C1 = _mm_add_epi64(A1, B1);
        const __m128i C2 = _mm_add_epi64(A2, B2);
        const __m128i C3 = _mm_add_epi64(A3, B3);
        const __m128i D0 = _mm_add_epi64(C0, rounder);
        const __m128i D1 = _mm_add_epi64(C1, rounder);
        const __m128i D2 = _mm_add_epi64(C2, rounder);
        const __m128i D3 = _mm_add_epi64(C3, rounder);
        const __m128i E0 = _mm_srli_epi64(D0, WEBP_RESCALER_RFIX);
        const __m128i E1 = _mm_srli_epi64(D1, WEBP_RESCALER_RFIX);
        const __m128i E2 = _mm_srli_epi64(D2, WEBP_RESCALER_RFIX);
        const __m128i E3 = _mm_srli_epi64(D3, WEBP_RESCALER_RFIX);
        ProcessRow(&E0, &E1, &E2, &E3, &mult, dst + x_out);
      }
    }
    for (; x_out < x_out_max; ++x_out) {
      const uint64_t I = (uint64_t)A * frow[x_out]
                       + (uint64_t)B * irow[x_out];
      const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
      const int v = (int)MULT_FIX(J, wrk->fy_scale);
      assert(v >= 0 && v <= 255);
      dst[x_out] = v;
    }
  }
}
Example #7
static inline __m128i
v4_interpolate_color_sse2(__m128i a, __m128i c0, __m128i c1)
{
   const __m128i rb_mask = _mm_set1_epi32(0xFF00FF00);
   const __m128i zero = _mm_setzero_si128();

   __m128i a_l = a;
   __m128i a_h = a;
   a_l = _mm_unpacklo_epi16(a_l, a_l);
   a_h = _mm_unpackhi_epi16(a_h, a_h);

   __m128i a_t = _mm_slli_epi64(a_l, 32);
   __m128i a_t0 = _mm_slli_epi64(a_h, 32);

   a_l = _mm_add_epi32(a_l, a_t);
   a_h = _mm_add_epi32(a_h, a_t0);

   __m128i c0_l = c0;
   __m128i c0_h = c0;

   c0_l = _mm_unpacklo_epi8(c0_l, zero);
   c0_h = _mm_unpackhi_epi8(c0_h, zero);

   __m128i c1_l = c1;
   __m128i c1_h = c1;

   c1_l = _mm_unpacklo_epi8(c1_l, zero);
   c1_h = _mm_unpackhi_epi8(c1_h, zero);

   __m128i cl_sub = _mm_sub_epi16(c0_l, c1_l);
   __m128i ch_sub = _mm_sub_epi16(c0_h, c1_h);

   cl_sub = _mm_mullo_epi16(cl_sub, a_l);
   ch_sub = _mm_mullo_epi16(ch_sub, a_h);

   __m128i c1ls = _mm_slli_epi16(c1_l, 8);
   __m128i c1hs = _mm_slli_epi16(c1_h, 8);

   cl_sub = _mm_add_epi16(cl_sub, c1ls);
   ch_sub = _mm_add_epi16(ch_sub, c1hs);

   cl_sub = _mm_and_si128(cl_sub, rb_mask);
   ch_sub = _mm_and_si128(ch_sub, rb_mask);

   cl_sub = _mm_srli_epi64(cl_sub, 8);
   ch_sub = _mm_srli_epi64(ch_sub, 8);

   cl_sub = _mm_packus_epi16(cl_sub, cl_sub);
   ch_sub = _mm_packus_epi16(ch_sub, ch_sub);

   return  (__m128i) _mm_shuffle_ps( (__m128)cl_sub, (__m128)ch_sub, 0x44);
}
Example #8
static inline void
desc_to_ptype_v(__m128i descs[4], struct rte_mbuf **rx_pkts,
		uint32_t *ptype_tbl)
{
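	/* gather the high qword of each pair of descriptors, shift so that the
	 * packet-type index lands in the low byte of each 64-bit half, then pick
	 * it out with _mm_extract_epi8 at byte offsets 0 and 8 and map it through
	 * ptype_tbl */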
	__m128i ptype0 = _mm_unpackhi_epi64(descs[0], descs[1]);
	__m128i ptype1 = _mm_unpackhi_epi64(descs[2], descs[3]);

	ptype0 = _mm_srli_epi64(ptype0, 30);
	ptype1 = _mm_srli_epi64(ptype1, 30);

	rx_pkts[0]->packet_type = ptype_tbl[_mm_extract_epi8(ptype0, 0)];
	rx_pkts[1]->packet_type = ptype_tbl[_mm_extract_epi8(ptype0, 8)];
	rx_pkts[2]->packet_type = ptype_tbl[_mm_extract_epi8(ptype1, 0)];
	rx_pkts[3]->packet_type = ptype_tbl[_mm_extract_epi8(ptype1, 8)];
}
Example #9
// Special case for left-based prediction (when preds==dst-1 or preds==src-1).
static void PredictLineLeft(const uint8_t* src, uint8_t* dst, int length,
                            int inverse) {
  int i;
  if (length <= 0) return;
  if (inverse) {
    const int max_pos = length & ~7;
    __m128i last = _mm_set_epi32(0, 0, 0, dst[-1]);
    for (i = 0; i < max_pos; i += 8) {
      const __m128i A0 = _mm_loadl_epi64((const __m128i*)(src + i));
      const __m128i A1 = _mm_add_epi8(A0, last);
      const __m128i A2 = _mm_slli_si128(A1, 1);
      const __m128i A3 = _mm_add_epi8(A1, A2);
      const __m128i A4 = _mm_slli_si128(A3, 2);
      const __m128i A5 = _mm_add_epi8(A3, A4);
      const __m128i A6 = _mm_slli_si128(A5, 4);
      const __m128i A7 = _mm_add_epi8(A5, A6);
      _mm_storel_epi64((__m128i*)(dst + i), A7);
      last = _mm_srli_epi64(A7, 56);
    }
    for (; i < length; ++i) dst[i] = src[i] + dst[i - 1];
  } else {
    const int max_pos = length & ~31;
    for (i = 0; i < max_pos; i += 32) {
      const __m128i A0 = _mm_loadu_si128((const __m128i*)(src + i +  0    ));
      const __m128i B0 = _mm_loadu_si128((const __m128i*)(src + i +  0 - 1));
      const __m128i A1 = _mm_loadu_si128((const __m128i*)(src + i + 16    ));
      const __m128i B1 = _mm_loadu_si128((const __m128i*)(src + i + 16 - 1));
      const __m128i C0 = _mm_sub_epi8(A0, B0);
      const __m128i C1 = _mm_sub_epi8(A1, B1);
      _mm_storeu_si128((__m128i*)(dst + i +  0), C0);
      _mm_storeu_si128((__m128i*)(dst + i + 16), C1);
    }
    for (; i < length; ++i) dst[i] = src[i] - src[i - 1];
  }
}
Example #10
inline __m128i Convert8DigitsSSE2(uint32_t value) {
	assert(value <= 99999999);

	// abcd, efgh = abcdefgh divmod 10000 
	const __m128i abcdefgh = _mm_cvtsi32_si128(value);
	const __m128i abcd = _mm_srli_epi64(_mm_mul_epu32(abcdefgh, reinterpret_cast<const __m128i*>(kDiv10000Vector)[0]), 45);
	const __m128i efgh = _mm_sub_epi32(abcdefgh, _mm_mul_epu32(abcd, reinterpret_cast<const __m128i*>(k10000Vector)[0]));

	// v1 = [ abcd, efgh, 0, 0, 0, 0, 0, 0 ]
	const __m128i v1 = _mm_unpacklo_epi16(abcd, efgh);

	// v1a = v1 * 4 = [ abcd * 4, efgh * 4, 0, 0, 0, 0, 0, 0 ]
	const __m128i v1a = _mm_slli_epi64(v1, 2);

	// v2 = [ abcd * 4, abcd * 4, abcd * 4, abcd * 4, efgh * 4, efgh * 4, efgh * 4, efgh * 4 ]
	const __m128i v2a = _mm_unpacklo_epi16(v1a, v1a);
	const __m128i v2 = _mm_unpacklo_epi32(v2a, v2a);

	// v4 = v2 div 10^3, 10^2, 10^1, 10^0 = [ a, ab, abc, abcd, e, ef, efg, efgh ]
	const __m128i v3 = _mm_mulhi_epu16(v2, reinterpret_cast<const __m128i*>(kDivPowersVector)[0]);
	const __m128i v4 = _mm_mulhi_epu16(v3, reinterpret_cast<const __m128i*>(kShiftPowersVector)[0]);

	// v5 = v4 * 10 = [ a0, ab0, abc0, abcd0, e0, ef0, efg0, efgh0 ]
	const __m128i v5 = _mm_mullo_epi16(v4, reinterpret_cast<const __m128i*>(k10Vector)[0]);

	// v6 = v5 << 16 = [ 0, a0, ab0, abc0, 0, e0, ef0, efg0 ]
	const __m128i v6 = _mm_slli_epi64(v5, 16);

	// v7 = v4 - v6 = { a, b, c, d, e, f, g, h }
	const __m128i v7 = _mm_sub_epi16(v4, v6);

	return v7;
}
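Convert8DigitsSSE2 only produces the digit values; here is a minimal usage sketch (hypothetical, not part of the original source: the helper name and buffer handling are assumptions) that emits those digits as ASCII:

/* Hypothetical helper: writes exactly eight ASCII digits, so values below
 * 10^7 come out with leading zeros. */
inline void Write8DigitsSSE2(uint32_t value, char* buffer) {
	const __m128i digits = Convert8DigitsSSE2(value);        // eight 16-bit digit values
	const __m128i ascii = _mm_add_epi8(_mm_packus_epi16(digits, digits),
	                                   _mm_set1_epi8('0'));  // narrow to bytes, add '0'
	_mm_storel_epi64((__m128i*)buffer, ascii);               // store 8 characters
}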
Example #11
static uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src,
                                                int stride) {
  const __m128i v_val_0_w =
      _mm_loadl_epi64((const __m128i *)(src + 0 * stride));
  const __m128i v_val_1_w =
      _mm_loadl_epi64((const __m128i *)(src + 1 * stride));
  const __m128i v_val_2_w =
      _mm_loadl_epi64((const __m128i *)(src + 2 * stride));
  const __m128i v_val_3_w =
      _mm_loadl_epi64((const __m128i *)(src + 3 * stride));

  const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
  const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
  const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
  const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
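  // each madd above squares four 16-bit samples and sums adjacent pairs of squares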

  const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
  const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
  const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);

  const __m128i v_sum_d =
      _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32));

  return (uint64_t)_mm_cvtsi128_si32(v_sum_d);
}
Example #12
/** Right shift on a 128bit integer */
static inline __m128i shr(__m128i v, int a) {
	__m128i tmpr = _mm_srli_epi64(v, a);
	__m128i tmpl = _mm_slli_epi64(v, 64-a);
	tmpl = _mm_srli_si128(tmpl, 8);

	return _mm_xor_si128(tmpr, tmpl);
}
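gmul in Example #25 below also relies on a companion shl() that is not included in this listing; a minimal sketch, assuming it simply mirrors shr above (left shift of the full 128-bit value, for 0 < a < 64):

/** Left shift on a 128bit integer (assumed mirror of shr, not from the
 *  original source) */
static inline __m128i shl(__m128i v, int a) {
	__m128i tmpl = _mm_slli_epi64(v, a);
	__m128i tmpr = _mm_srli_epi64(v, 64-a);
	tmpr = _mm_slli_si128(tmpr, 8);

	return _mm_xor_si128(tmpl, tmpr);
}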
Example #13
int aom_satd_sse2(const tran_low_t *coeff, int length) {
  int i;
  const __m128i zero = _mm_setzero_si128();
  __m128i accum = zero;

  for (i = 0; i < length; i += 8) {
    const __m128i src_line = load_tran_low(coeff);
    const __m128i inv = _mm_sub_epi16(zero, src_line);
    const __m128i abs = _mm_max_epi16(src_line, inv);  // abs(src_line)
    const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero);
    const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero);
    const __m128i sum = _mm_add_epi32(abs_lo, abs_hi);
    accum = _mm_add_epi32(accum, sum);
    coeff += 8;
  }

  {  // cascading summation of accum
    __m128i hi = _mm_srli_si128(accum, 8);
    accum = _mm_add_epi32(accum, hi);
    hi = _mm_srli_epi64(accum, 32);
    accum = _mm_add_epi32(accum, hi);
  }

  return _mm_cvtsi128_si32(accum);
}
Example #14
void
maddrc4_shuffle_ssse3(uint8_t* region1, const uint8_t* region2, uint8_t constant, 
								size_t length)
{
	uint8_t *end;
	register __m128i in1, in2, out, t1, t2, m1, m2, l, h;

	if (constant == 0)
		return;

	if (constant == 1) {
		xorr_sse2(region1, region2, length);
		return;
	}
	
	t1 = _mm_loadu_si128((void *)tl[constant]);
	t2 = _mm_slli_epi64(t1, 4);
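	/* t1 is the 16-entry table of constant * x for every 4-bit field element x,
	 * indexed via pshufb; t2 holds the same products pre-shifted into the high
	 * nibble so the shifted-down high nibbles can be looked up directly */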
	m1 = _mm_set1_epi8(0x0f);
	m2 = _mm_set1_epi8(0xf0);

	for (end=region1+length; region1<end; region1+=16, region2+=16) {
		in2 = _mm_load_si128((void *)region2);
		in1 = _mm_load_si128((void *)region1);
		l = _mm_and_si128(in2, m1);
		l = _mm_shuffle_epi8(t1, l);
		h = _mm_and_si128(in2, m2);
		h = _mm_srli_epi64(h, 4);
		h = _mm_shuffle_epi8(t2, h);
		out = _mm_xor_si128(h,l);
		out = _mm_xor_si128(out, in1);
		_mm_store_si128((void *)region1, out);
	}
}
Example #15
static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
  int x = 0;
  if (!inverse) {
    const int kSpan = 2;
    const __m128i zero = _mm_setzero_si128();
    const __m128i kRound =
        _mm_set_epi16(0, 1 << 7, 1 << 7, 1 << 7, 0, 1 << 7, 1 << 7, 1 << 7);
    const __m128i kMult =
        _mm_set_epi16(0, 0x0101, 0x0101, 0x0101, 0, 0x0101, 0x0101, 0x0101);
    const __m128i kOne64 = _mm_set_epi16(1u << 8, 0, 0, 0, 1u << 8, 0, 0, 0);
    const int w2 = width & ~(kSpan - 1);
    for (x = 0; x < w2; x += kSpan) {
      const __m128i argb0 = _mm_loadl_epi64((__m128i*)&ptr[x]);
      const __m128i argb1 = _mm_unpacklo_epi8(argb0, zero);
      const __m128i tmp0 = _mm_shufflelo_epi16(argb1, _MM_SHUFFLE(3, 3, 3, 3));
      const __m128i tmp1 = _mm_shufflehi_epi16(tmp0, _MM_SHUFFLE(3, 3, 3, 3));
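      // tmp1 now holds each pixel's alpha value broadcast to all four 16-bit lanes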
      const __m128i tmp2 = _mm_srli_epi64(tmp1, 16);
      const __m128i scale0 = _mm_mullo_epi16(tmp1, kMult);
      const __m128i scale1 = _mm_or_si128(tmp2, kOne64);
      const __m128i argb2 = _mm_mulhi_epu16(argb1, scale0);
      const __m128i argb3 = _mm_mullo_epi16(argb1, scale1);
      const __m128i argb4 = _mm_adds_epu16(argb2, argb3);
      const __m128i argb5 = _mm_adds_epu16(argb4, kRound);
      const __m128i argb6 = _mm_srli_epi16(argb5, 8);
      const __m128i argb7 = _mm_packus_epi16(argb6, zero);
      _mm_storel_epi64((__m128i*)&ptr[x], argb7);
    }
  }
  width -= x;
  if (width > 0) WebPMultARGBRowC(ptr + x, width, inverse);
}
Example #16
void
mulrc4_shuffle_ssse3(uint8_t *region, uint8_t constant, size_t length)
{
	uint8_t *end;
	register __m128i in, out, t1, t2, m1, m2, l, h;

	if (constant == 0) {
		memset(region, 0, length);
		return;
	}

	if (constant == 1)
		return;

	t1 = _mm_loadu_si128((void *)tl[constant]);
	t2 = _mm_slli_epi64(t1, 4);
	m1 = _mm_set1_epi8(0x0f);
	m2 = _mm_set1_epi8(0xf0);

	for (end=region+length; region<end; region+=16) {
		in = _mm_load_si128((void *)region);
		l = _mm_and_si128(in, m1);
		l = _mm_shuffle_epi8(t1, l);
		h = _mm_and_si128(in, m2);
		h = _mm_srli_epi64(h, 4);
		h = _mm_shuffle_epi8(t2, h);
		out = _mm_xor_si128(h, l);
		_mm_store_si128((void *)region, out);
	}
}
Example #17
__m128i test_mm_srli_epi64(__m128i A) {
  // DAG-LABEL: test_mm_srli_epi64
  // DAG: call <2 x i64> @llvm.x86.sse2.psrli.q
  //
  // ASM-LABEL: test_mm_srli_epi64
  // ASM: psrlq
  return _mm_srli_epi64(A, 1);
}
Example #18
__m128i shift_left_sse1(__m128i vec, int shift_num) {
	if(shift_num == 8)
		return _mm_srli_si128(vec, 1);
	__m128i carryover = _mm_srli_si128(vec, 1);
	carryover = _mm_slli_epi64(carryover, 8 - (shift_num % 8));
	vec = _mm_srli_epi64(vec, shift_num % 8);
	return _mm_or_si128(vec, carryover);
}
Example #19
static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) {
  const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
  __m128i v_acc0_q = _mm_setzero_si128();
  __m128i v_acc1_q = _mm_setzero_si128();

  const int16_t *const end = src + n;

  assert(n % 64 == 0);

  while (src < end) {
    const __m128i v_val_0_w = xx_load_128(src);
    const __m128i v_val_1_w = xx_load_128(src + 8);
    const __m128i v_val_2_w = xx_load_128(src + 16);
    const __m128i v_val_3_w = xx_load_128(src + 24);
    const __m128i v_val_4_w = xx_load_128(src + 32);
    const __m128i v_val_5_w = xx_load_128(src + 40);
    const __m128i v_val_6_w = xx_load_128(src + 48);
    const __m128i v_val_7_w = xx_load_128(src + 56);

    const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
    const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
    const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
    const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
    const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w);
    const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w);
    const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w);
    const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w);

    const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
    const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
    const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d);
    const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d);

    const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
    const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d);

    const __m128i v_sum_d = _mm_add_epi32(v_sum_0123_d, v_sum_4567_d);

    v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_and_si128(v_sum_d, v_zext_mask_q));
    v_acc1_q = _mm_add_epi64(v_acc1_q, _mm_srli_epi64(v_sum_d, 32));

    src += 64;
  }

  v_acc0_q = _mm_add_epi64(v_acc0_q, v_acc1_q);
  v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8));

#if ARCH_X86_64
  return (uint64_t)_mm_cvtsi128_si64(v_acc0_q);
#else
  {
    uint64_t tmp;
    _mm_storel_epi64((__m128i *)&tmp, v_acc0_q);
    return tmp;
  }
#endif
}
Example #20
AYCW_INLINE __m128i BS_SHR(__m128i v, int n)
{
   __m128i v1, v2;

   if ((n) >= 64)
   {
      v1 = _mm_srli_si128(v, 8);
      v1 = _mm_srli_epi64(v1, (n)-64);
   }
   else
   {
      v1 = _mm_srli_epi64(v, n);
      v2 = _mm_srli_si128(v, 8);
      v2 = _mm_slli_epi64(v2, 64 - (n));
      v1 = _mm_or_si128(v1, v2);
   }
   return v1;
}
Example #21
__m64 _m_psrlqi(__m64 _M, int _Count)
{
    __m128i lhs = {0};
    lhs.m128i_i64[0] = _M.m64_i64;

    lhs = _mm_srli_epi64(lhs, _Count);

    _M.m64_i64 = lhs.m128i_i64[0];
    return _M;
}
Example #22
static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk,
                                         const uint8_t* src) {
  const int x_sub = wrk->x_sub;
  int accum = 0;
  const __m128i zero = _mm_setzero_si128();
  const __m128i mult0 = _mm_set1_epi16(x_sub);
  const __m128i mult1 = _mm_set1_epi32(wrk->fx_scale);
  const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
  __m128i sum = zero;
  rescaler_t* frow = wrk->frow;
  const rescaler_t* const frow_end = wrk->frow + 4 * wrk->dst_width;

  if (wrk->num_channels != 4 || wrk->x_add > (x_sub << 7)) {
    WebPRescalerImportRowShrink_C(wrk, src);
    return;
  }
  assert(!WebPRescalerInputDone(wrk));
  assert(!wrk->x_expand);

  for (; frow < frow_end; frow += 4) {
    __m128i base = zero;
    accum += wrk->x_add;
    while (accum > 0) {
      const __m128i A = _mm_cvtsi32_si128(WebPMemToUint32(src));
      src += 4;
      base = _mm_unpacklo_epi8(A, zero);
      // To avoid overflow, we need: base * x_add / x_sub < 32768
      // => x_add < x_sub << 7. That's a 1/128 reduction ratio limit.
      sum = _mm_add_epi16(sum, base);
      accum -= x_sub;
    }
    {    // Emit next horizontal pixel.
      const __m128i mult = _mm_set1_epi16(-accum);
      const __m128i frac0 = _mm_mullo_epi16(base, mult);  // 16b x 16b -> 32b
      const __m128i frac1 = _mm_mulhi_epu16(base, mult);
      const __m128i frac = _mm_unpacklo_epi16(frac0, frac1);  // frac is 32b
      const __m128i A0 = _mm_mullo_epi16(sum, mult0);
      const __m128i A1 = _mm_mulhi_epu16(sum, mult0);
      const __m128i B0 = _mm_unpacklo_epi16(A0, A1);      // sum * x_sub
      const __m128i frow_out = _mm_sub_epi32(B0, frac);   // sum * x_sub - frac
      const __m128i D0 = _mm_srli_epi64(frac, 32);
      const __m128i D1 = _mm_mul_epu32(frac, mult1);      // 32b x 16b -> 64b
      const __m128i D2 = _mm_mul_epu32(D0, mult1);
      const __m128i E1 = _mm_add_epi64(D1, rounder);
      const __m128i E2 = _mm_add_epi64(D2, rounder);
      const __m128i F1 = _mm_shuffle_epi32(E1, 1 | (3 << 2));
      const __m128i F2 = _mm_shuffle_epi32(E2, 1 | (3 << 2));
      const __m128i G = _mm_unpacklo_epi32(F1, F2);
      sum = _mm_packs_epi32(G, zero);
      _mm_storeu_si128((__m128i*)frow, frow_out);
    }
  }
  assert(accum == 0);
}
Example #23
/* constant-time doubling in GF(2^128) */
static __m128i gf128_mul2(const __m128i x)
{
    const __m128i REDPOLY = _mm_set_epi64x(0, 0x87);
    const __m128i ZERO = _mm_setzero_si128();
    __m128i x2;

    __m128i mask = _mm_cmpgt_epi32(ZERO, x);
    mask = _mm_shuffle_epi32(mask, 0xff);
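    /* mask is now all-ones iff bit 127 of x was set: the signed 32-bit compare
     * captures each lane's sign and the shuffle broadcasts the top lane, so the
     * reduction polynomial below is XORed in without a branch */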

    x2 = _mm_slli_epi64(x, 1) | _mm_srli_epi64(_mm_slli_si128(x, 8), 63);

    return x2 ^ (REDPOLY & mask);
}
Example #24
// load *src as epi64, multiply by mult and store result in [out0 ... out3]
static WEBP_INLINE void LoadDispatchAndMult(const rescaler_t* const src,
                                            const __m128i* const mult,
                                            __m128i* const out0,
                                            __m128i* const out1,
                                            __m128i* const out2,
                                            __m128i* const out3) {
  const __m128i A0 = _mm_loadu_si128((const __m128i*)(src + 0));
  const __m128i A1 = _mm_loadu_si128((const __m128i*)(src + 4));
  const __m128i A2 = _mm_srli_epi64(A0, 32);
  const __m128i A3 = _mm_srli_epi64(A1, 32);
  if (mult != NULL) {
    *out0 = _mm_mul_epu32(A0, *mult);
    *out1 = _mm_mul_epu32(A1, *mult);
    *out2 = _mm_mul_epu32(A2, *mult);
    *out3 = _mm_mul_epu32(A3, *mult);
  } else {
    *out0 = A0;
    *out1 = A1;
    *out2 = A2;
    *out3 = A3;
  }
}
Example #25
/** Performs a carryless multiplication of two 128bit integers modulo \f$ x^{128} + x^7 + x^2 + x + 1 \f$ */
static __m128i gmul(__m128i v, __m128i h) {
	/* multiply */
	__m128i z0, z1, z2, tmp;
	z0 = _mm_clmulepi64_si128(v, h, 0x11);
	z2 = _mm_clmulepi64_si128(v, h, 0x00);

	__m128i tmpv = _mm_srli_si128(v, 8);
	tmpv = _mm_xor_si128(tmpv, v);

	__m128i tmph = _mm_srli_si128(h, 8);
	tmph = _mm_xor_si128(tmph, h);

	z1 = _mm_clmulepi64_si128(tmpv, tmph, 0x00);
	z1 = _mm_xor_si128(z1, z0);
	z1 = _mm_xor_si128(z1, z2);

	tmp = _mm_srli_si128(z1, 8);
	__m128i pl = _mm_xor_si128(z0, tmp);

	tmp = _mm_slli_si128(z1, 8);
	__m128i ph = _mm_xor_si128(z2, tmp);

	tmp = _mm_srli_epi64(ph, 63);
	tmp = _mm_srli_si128(tmp, 8);

	pl = shl(pl, 1);
	pl = _mm_xor_si128(pl, tmp);

	ph = shl(ph, 1);

	/* reduce */
	__m128i b, c;
	b = c = _mm_slli_si128(ph, 8);

	b = _mm_slli_epi64(b, 62);
	c = _mm_slli_epi64(c, 57);

	tmp = _mm_xor_si128(b, c);
	__m128i d = _mm_xor_si128(ph, tmp);

	__m128i e = shr(d, 1);
	__m128i f = shr(d, 2);
	__m128i g = shr(d, 7);

	pl = _mm_xor_si128(pl, d);
	pl = _mm_xor_si128(pl, e);
	pl = _mm_xor_si128(pl, f);
	pl = _mm_xor_si128(pl, g);

	return pl;
}
Example #26
/**
 * This function represents the recursion formula.
 * @param r output 128-bit
 * @param a a 128-bit part of the internal state array
 * @param b a 128-bit part of the internal state array
 * @param u a 128-bit part of the internal state array (I/O)
 */
static inline void do_recursion(w128_t *r, w128_t *a, w128_t *b, w128_t *u) {
  __m128i x = a->si;
  __m128i z = _mm_slli_epi64(x, DSFMT_SL1);
  __m128i y = _mm_shuffle_epi32(u->si, SSE2_SHUFF);
  z = _mm_xor_si128(z, b->si);
  y = _mm_xor_si128(y, z);

  __m128i v = _mm_srli_epi64(y, DSFMT_SR);
  __m128i w = _mm_and_si128(y, sse2_param_mask.i128);
  v = _mm_xor_si128(v, x);
  v = _mm_xor_si128(v, w);
  r->si = v;
  u->si = y;
}
Example #27
void fb_sqrm_low(dig_t *c, const dig_t *a) {
	__m128i t0, t1, m0, m1, m2, m3, m4, m5, m6, m8, m9, mask;
	align dig_t x[2];

	t0 = _mm_set_epi32(0x55545150, 0x45444140, 0x15141110, 0x05040100);
	mask = _mm_set_epi32(0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F);

	m0 = _mm_load_si128((__m128i *)(a));
	m1 = _mm_and_si128(m0, mask);
	m1 = _mm_shuffle_epi8(t0, m1);
	m2 = _mm_srli_epi64(m0, 4);
	m2 = _mm_and_si128(m2, mask);
	m2 = _mm_shuffle_epi8(t0, m2);
	m3 = _mm_unpacklo_epi8(m1, m2);
	m4 = _mm_unpackhi_epi8(m1, m2);

	m0 = _mm_load_si128((__m128i *)(a+2));
	m1 = _mm_and_si128(m0, mask);
	m1 = _mm_shuffle_epi8(t0, m1);
	m2 = _mm_srli_epi64(m0, 4);
	m2 = _mm_and_si128(m2, mask);
	m2 = _mm_shuffle_epi8(t0, m2);
	m5 = _mm_unpacklo_epi8(m1, m2);
	m6 = _mm_unpackhi_epi8(m1, m2);

	m0 = m3;
	m1 = m4;
	m2 = m5;
	m3 = m6;

	REDUCE();
	_mm_store_si128((__m128i *) c + 0, m0);
	_mm_store_si128((__m128i *) x, m1);
	c[2] = x[0];
	c[3] = x[1] & 0x07FFFFFFFFFFFFFF;
}
Example #28
/** Return the significand and the exponent both in double precision **/
__m128d frexp_sse(__m128d x, __m128d *e)
{
    /* Integer exponent */
    __m128i ei;

    /* Save the exponent */
    ei = _mm_and_si128(_mm_castpd_si128(x), *(__m128i*)pi64_mantissa_mask);
    ei = _mm_srli_epi64(ei, 52);
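    /* 216 == _MM_SHUFFLE(3,1,2,0): move the two exponents (now in dwords 0
     * and 2) into the low two lanes before the 32-bit bias subtraction */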
    ei = _mm_shuffle_epi32(ei,216);
    ei = _mm_sub_epi32(ei, *(__m128i*)pi32_bias4i);

    *e = _mm_cvtepi32_pd(ei);

    /* Save the significand */
    x = _mm_and_pd(x, *(__m128d*)pi64_inv_mantissa_mask);
    x = _mm_or_pd(x, *(__m128d*)pd_half_mask);

    return x;
}
Example #29
__m128i shift_left_sse11(__m128i vec, __m128i next_vec, int shift_num) {
	if (shift_num % 4 == 0)
		return left_alignr_helper(vec, next_vec, shift_num / 4);

	__m128i carryover;
	__m128i shiftee;
	__m128i mask;

	carryover = left_alignr_helper(vec, next_vec, shift_num / 4 + 1);
//	print128_bit(carryover);
	carryover = _mm_slli_epi64(carryover, (4 - (shift_num % 4)) * 2);
//	print128_bit(carryover);

	if (shift_num > 4)
		shiftee = left_alignr_helper(vec, next_vec, shift_num / 4);
	else
		shiftee = vec;

	shiftee = _mm_srli_epi64(shiftee, (shift_num % 4) * 2);

	return _mm_or_si128(shiftee, carryover);
}
Example #30
void fb_slvn_low(dig_t *c, const dig_t *a) {
	int i;
	dig_t *p, u0, u1, u2, u3;
	void *tab = fb_poly_get_slv();
	__m128i m0, m1, m2, m3, m4, sqrt0, sqrt1, mask0, mask1, mask2, r0, r1, t0, t1, perm;

	perm = _mm_set_epi32(0x0F0D0B09, 0x07050301, 0x0E0C0A08, 0x06040200);
	mask2 = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000);
	mask1 = _mm_set_epi32(0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0);
	mask0 = _mm_set_epi32(0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F);
	sqrt0 = _mm_set_epi32(0x03020302, 0x01000100, 0x03020302, 0x01000100);
	sqrt1 = _mm_set_epi32(0x0c080c08, 0x04000400, 0x0c080c08, 0x04000400);

	t0 = _mm_load_si128((__m128i *)a);
	t1 = _mm_load_si128((__m128i *)(a + 2));
	r0 = r1 = _mm_setzero_si128();

	m0 = _mm_shuffle_epi8(t1, perm);
	m1 = _mm_and_si128(m0, mask0);
	m2 = _mm_and_si128(m0, mask1);
	m2 = _mm_srli_epi64(m2, 4);
	m2 = _mm_shuffle_epi8(sqrt1, m2);
	m1 = _mm_shuffle_epi8(sqrt0, m1);
	m1 = _mm_xor_si128(m1, m2);

	m2 = _mm_slli_si128(m1, 8);
	m1 = _mm_and_si128(m1, mask2);
	m1 = _mm_slli_epi64(m1, 4);
	m1 = _mm_xor_si128(m1, m2);
	t0 = _mm_xor_si128(t0, m1);
	r0 = _mm_xor_si128(r0, m1);

	m0 = _mm_and_si128(t0, mask2);
	m0 = _mm_shuffle_epi8(m0, perm);
	m1 = _mm_and_si128(m0, mask0);
	m2 = _mm_and_si128(m0, mask1);
	m2 = _mm_srli_epi64(m2, 4);
	m2 = _mm_shuffle_epi8(sqrt1, m2);
	m1 = _mm_shuffle_epi8(sqrt0, m1);
	m1 = _mm_xor_si128(m1, m2);

	m2 = _mm_srli_si128(m1, 8);
	m1 = _mm_andnot_si128(mask2, m1);
	m2 = _mm_slli_epi64(m2, 4);
	m1 = _mm_xor_si128(m1, m2);
	t0 = _mm_xor_si128(t0, m1);
	r0 = _mm_xor_si128(r0, m1);

	m1 = _mm_srli_si128(t0, 4);
	m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0xFFFFFFFF));
	m0 = _mm_shuffle_epi8(m1, perm);
	m1 = _mm_and_si128(m0, mask0);
	m2 = _mm_and_si128(m0, mask1);
	m2 = _mm_srli_epi64(m2, 4);
	m2 = _mm_shuffle_epi8(sqrt1, m2);
	m1 = _mm_shuffle_epi8(sqrt0, m1);
	m1 = _mm_xor_si128(m1, m2);
	m2 = _mm_slli_si128(m1, 8);
	m1 = _mm_slli_epi64(m1, 4);
	m1 = _mm_xor_si128(m1, m2);
	m1 = _mm_srli_si128(m1, 6);
	t0 = _mm_xor_si128(t0, m1);
	r0 = _mm_xor_si128(r0, m1);

	m1 = _mm_srli_si128(t0, 2);
	m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0xFFFF));
	m0 = _mm_shuffle_epi8(m1, perm);
	m1 = _mm_and_si128(m0, mask0);
	m2 = _mm_and_si128(m0, mask1);
	m2 = _mm_srli_epi64(m2, 4);
	m2 = _mm_shuffle_epi8(sqrt1, m2);
	m1 = _mm_shuffle_epi8(sqrt0, m1);
	m1 = _mm_xor_si128(m1, m2);
	m2 = _mm_slli_si128(m1, 8);
	m1 = _mm_slli_epi64(m1, 4);
	m1 = _mm_xor_si128(m1, m2);
	m1 = _mm_srli_si128(m1, 7);
	t0 = _mm_xor_si128(t0, m1);
	r0 = _mm_xor_si128(r0, m1);

	m1 = _mm_srli_si128(t0, 1);
	m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x55));
	m1 = _mm_or_si128(m1, _mm_srli_epi64(m1, 1));
	m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x33));
	m1 = _mm_or_si128(m1, _mm_srli_epi64(m1, 2));
	m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x0F));
	m1 = _mm_slli_epi64(m1, 4);
	t0 = _mm_xor_si128(t0, m1);
	r0 = _mm_xor_si128(r0, m1);

	m1 = _mm_srli_epi64(t0, 4);
	m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x5));
	m1 = _mm_or_si128(m1, _mm_srli_epi64(m1, 1));
	m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x3));
	m1 = _mm_slli_epi64(m1, 2);
	t0 = _mm_xor_si128(t0, m1);
	r0 = _mm_xor_si128(r0, m1);

	m1 = _mm_srli_epi64(t0, 2);
	m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x1));
	m1 = _mm_slli_epi64(m1, 1);
	t0 = _mm_xor_si128(t0, m1);
	r0 = _mm_xor_si128(r0, m1);

	sqrt0 = _mm_set_epi32(0x03030202, 0x03030202, 0x01010000, 0x01010000);
	sqrt1 = _mm_set_epi32(0x0C0C0808, 0x0C0C0808, 0x04040000, 0x04040000);

	m1 = _mm_and_si128(t0, mask0);
	m2 = _mm_and_si128(t0, mask1);
	m3 = _mm_and_si128(t1, mask0);
	m4 = _mm_and_si128(t1, mask1);
	m2 = _mm_srli_epi64(m2, 4);
	m4 = _mm_srli_epi64(m4, 4);
	m2 = _mm_shuffle_epi8(sqrt1, m2);
	m1 = _mm_shuffle_epi8(sqrt0, m1);
	m4 = _mm_shuffle_epi8(sqrt1, m4);
	m3 = _mm_shuffle_epi8(sqrt0, m3);
	m1 = _mm_or_si128(m1, m2);
	m3 = _mm_or_si128(m3, m4);
#ifndef __PCLMUL__
	align dig_t x[2];
	_mm_store_si128((__m128i *)x, m1);
	u0 = x[0];
	u1 = x[1];
	_mm_store_si128((__m128i *)x, m3);
	u2 = x[0];
	u3 = x[1];
#else
	u0 = _mm_extract_epi64(m1, 0);
	u1 = _mm_extract_epi64(m1, 1);
	u2 = _mm_extract_epi64(m3, 0);
	u3 = _mm_extract_epi64(m3, 1);
#endif

	for (i = 0; i < 8; i++) {
		p = (dig_t *)(tab + (16 * i + (u0 & 0x0F)) * sizeof(fb_st));
		r0 = _mm_xor_si128(r0, *(__m128i *)(p));
		r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2));
		u0 >>= 8;
		p = (dig_t *)(tab + (16 * (i + 8) + (u1 & 0x0F)) * sizeof(fb_st));
		r0 = _mm_xor_si128(r0, *(__m128i *)(p));
		r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2));
		u1 >>= 8;
		p = (dig_t *)(tab + (16 * (i + 16) + (u2 & 0x0F)) * sizeof(fb_st));
		r0 = _mm_xor_si128(r0, *(__m128i *)(p));
		r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2));
		u2 >>= 8;
		p = (dig_t *)(tab + (16 * (i + 24) + (u3 & 0xF)) * sizeof(fb_st));
		r0 = _mm_xor_si128(r0, *(__m128i *)(p));
		r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2));
		u3 >>= 8;
	}

	_mm_store_si128((__m128i *)c, r0);
	_mm_store_si128((__m128i *)(c + 2), r1);
}