// D207 directional intra predictor for a 16x16 block of 16-bit samples.
//
// Only the 16 samples of `left` are read (two aligned 128-bit loads); `above`
// and `bd` are unused. For every left sample a 2-tap average with its
// successor and a 3-tap average (via avg3_epu16, defined elsewhere) with its
// two successors are computed, with left[15] replicated past the end of the
// column. The interleaved (avg2, avg3) pairs form four 8-lane segments that
// are handed, three consecutive segments at a time, to d207_store_4x16()
// (defined elsewhere; it receives &dst, so it presumably writes rows and
// advances dst).
void vpx_highbd_d207_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i col_lo = _mm_load_si128((const __m128i *)left);
  const __m128i col_hi = _mm_load_si128((const __m128i *)(left + 8));
  // Replicate lane 7 of col_hi (left[15]) into all eight lanes.
  const __m128i last_x4 = _mm_shufflehi_epi16(col_hi, 0xff);
  const __m128i last = _mm_unpackhi_epi64(last_x4, last_x4);
  // The column advanced by one and by two samples, padded with `last`.
  const __m128i next1_lo = _mm_alignr_epi8(col_hi, col_lo, 2);
  const __m128i next1_hi = _mm_alignr_epi8(last, col_hi, 2);
  const __m128i next2_lo = _mm_alignr_epi8(col_hi, col_lo, 4);
  const __m128i next2_hi = _mm_alignr_epi8(last, col_hi, 4);
  const __m128i triple_lo = avg3_epu16(&col_lo, &next1_lo, &next2_lo);
  const __m128i triple_hi = avg3_epu16(&col_hi, &next1_hi, &next2_hi);
  const __m128i pair_lo = _mm_avg_epu16(col_lo, next1_lo);
  const __m128i pair_hi = _mm_avg_epu16(col_hi, next1_hi);
  // seg[0..3] are the interleaved averages; seg[4..5] are padding.
  __m128i seg[6];
  int k;
  (void)above;
  (void)bd;
  seg[0] = _mm_unpacklo_epi16(pair_lo, triple_lo);
  seg[1] = _mm_unpackhi_epi16(pair_lo, triple_lo);
  seg[2] = _mm_unpacklo_epi16(pair_hi, triple_hi);
  seg[3] = _mm_unpackhi_epi16(pair_hi, triple_hi);
  seg[4] = last;
  seg[5] = last;
  // Each call consumes a window of three consecutive segments.
  for (k = 0; k < 4; ++k) {
    d207_store_4x16(&dst, stride, &seg[k], &seg[k + 1], &seg[k + 2]);
  }
}
示例#2
0
// One mixing round of a vectorised AES key schedule.
//
// rcon   - optional pointer to the round-constant vector. When non-null its
//          byte 15 is folded into input2, the vector is byte-rotated in place
//          for the next call, and input1 is rotated as described below. When
//          null, all three of those steps are skipped.
// input1 - vector that is split into nibbles and pushed through the table
//          lookups.
// input2 - vector that is "smeared" (prefix-XORed across its 32-bit words)
//          and XORed into the result.
// Returns the XOR of the two final table lookups and the smeared input2.
//
// NOTE(review): low_nibs, k_inv1, k_inv2, sb1u, sb1t and mm_xor3 are defined
// outside this snippet. Presumably low_nibs is 0x0F in every byte, the k_inv*
// tables implement a nibble-wise field inversion, sb1u/sb1t are S-box output
// tables, and mm_xor3 XORs its three arguments - confirm against their
// definitions.
__m128i aes_schedule_round(__m128i* rcon, __m128i input1, __m128i input2)
   {
   if(rcon)
      {
      // alignr(zero, *rcon, 15) yields byte 15 of *rcon in byte 0 and zeros
      // elsewhere; XOR that single byte into input2.
      input2 = _mm_xor_si128(_mm_alignr_epi8(_mm_setzero_si128(), *rcon, 15),
                             input2);

      // Byte-rotate *rcon so that old byte 15 becomes byte 0.
      *rcon = _mm_alignr_epi8(*rcon, *rcon, 15); // next rcon

      // Broadcast the highest 32-bit word of input1 to all four words, then
      // rotate the whole vector down by one byte.
      input1 = _mm_shuffle_epi32(input1, 0xFF); // rotate
      input1 = _mm_alignr_epi8(input1, input1, 1);
      }

   // After these two steps each 32-bit word i of `smeared` is the XOR of
   // words 0..i of input2, additionally XORed with 0x5B in every byte.
   __m128i smeared = _mm_xor_si128(input2, _mm_slli_si128(input2, 4));
   smeared = mm_xor3(smeared, _mm_slli_si128(smeared, 8), _mm_set1_epi8(0x5B));

   // t = bits of input1 NOT selected by low_nibs, shifted right by 4 within
   // each 32-bit word (presumably the high nibble of every byte).
   __m128i t = _mm_srli_epi32(_mm_andnot_si128(low_nibs, input1), 4);

   // input1 now keeps only the bits selected by low_nibs.
   input1 = _mm_and_si128(low_nibs, input1);

   // pshufb table lookups: the second argument supplies per-byte indices
   // into the 16-entry table given as the first argument.
   __m128i t2 = _mm_shuffle_epi8(k_inv2, input1);

   input1 = _mm_xor_si128(input1, t);

   __m128i t3 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, t));
   __m128i t4 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, input1));

   __m128i t5 = _mm_xor_si128(input1, _mm_shuffle_epi8(k_inv1, t3));
   __m128i t6 = _mm_xor_si128(t, _mm_shuffle_epi8(k_inv1, t4));

   // Combine both output-table lookups with the smeared input2.
   return mm_xor3(_mm_shuffle_epi8(sb1u, t5),
                  _mm_shuffle_epi8(sb1t, t6),
                  smeared);
   }
// D117 directional intra predictor for an 8x8 block of 16-bit samples.
//
// Reads above[-1..7] and left[0..7]; `bd` is unused. Even output rows start
// from the 2-tap average of above[i] and above[i-1], odd rows from the 3-tap
// average (avg3_epu16, defined elsewhere) of above[i], above[i-1] and
// above[i-2] (with left[0] standing in for above[-2]). After each pair of
// rows both row vectors are shifted up by one lane and lane 0 is refilled
// from the 3-tap filtered left column.
//
// NOTE(review): rotr_epu16() and rotate_right_epu16 are defined elsewhere;
// rotr_epu16 receives a non-const pointer, so it presumably rotates
// left_avg3 in place on every call - the two calls per iteration must stay
// in this order.
void vpx_highbd_d117_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
  const __m128i rot_mask = _mm_load_si128((const __m128i *)rotate_right_epu16);
  const __m128i top = _mm_load_si128((const __m128i *)above);
  const __m128i top_m1 = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i col = _mm_load_si128((const __m128i *)left);
  // { left[0], above[-1], above[0], ..., above[5] }
  const __m128i top_m2 =
      _mm_alignr_epi8(top_m1, _mm_slli_si128(col, 14), 14);
  // { above[-1], left[0], ..., left[6] }
  const __m128i col_m1 =
      _mm_alignr_epi8(col, _mm_slli_si128(top_m1, 14), 14);
  // { left[1], ..., left[7], 0 }
  const __m128i col_p1 = _mm_srli_si128(col, 2);
  __m128i left_avg3 = avg3_epu16(&col_m1, &col, &col_p1);
  __m128i even_row = _mm_avg_epu16(top, top_m1);
  __m128i odd_row = avg3_epu16(&top, &top_m1, &top_m2);
  int pair;
  (void)bd;
  for (pair = 0; pair < 4; ++pair) {
    _mm_store_si128((__m128i *)dst, even_row);
    _mm_store_si128((__m128i *)(dst + stride), odd_row);
    dst += 2 * stride;
    // Shift each row up one lane; the top lane of the rotated left-column
    // vector enters at lane 0.
    even_row =
        _mm_alignr_epi8(even_row, rotr_epu16(&left_avg3, &rot_mask), 14);
    odd_row = _mm_alignr_epi8(odd_row, rotr_epu16(&left_avg3, &rot_mask), 14);
  }
}
// For 16 byte pairs (a[i], b[i]) computes the squared differences
// sq[i] = (a[i] - b[i])^2 as 16-bit values and, for every position, the
// saturating sum of the three neighbouring squares
//   sq[i - 1] + sq[i] + sq[i + 1]
// where sq[-1] and sq[16] are taken as 0.
//
// a, b   - 16 readable bytes each (unaligned loads are used).
// sum_0  - receives the eight 16-bit sums for positions 0..7.
// sum_1  - receives the eight 16-bit sums for positions 8..15.
static void sum_16(const uint8_t *a, const uint8_t *b, __m128i *sum_0,
                   __m128i *sum_1) {
  const __m128i src_a = _mm_loadu_si128((const __m128i *)a);
  const __m128i src_b = _mm_loadu_si128((const __m128i *)b);
  const __m128i zeros = _mm_setzero_si128();

  // Widen to 16 bits (low half via zero-extend, high half via unpack with
  // zero) and subtract.
  const __m128i delta_lo =
      _mm_sub_epi16(_mm_cvtepu8_epi16(src_a), _mm_cvtepu8_epi16(src_b));
  const __m128i delta_hi = _mm_sub_epi16(_mm_unpackhi_epi8(src_a, zeros),
                                         _mm_unpackhi_epi8(src_b, zeros));

  // |delta| <= 255, so the low 16 bits of the product hold the full square.
  const __m128i sq_lo = _mm_mullo_epi16(delta_lo, delta_lo);
  const __m128i sq_hi = _mm_mullo_epi16(delta_hi, delta_hi);

  // Neighbour vectors: "prev" holds sq[i - 1], "next" holds sq[i + 1].
  // _mm_alignr_epi8() carries the element across the lo/hi boundary.
  const __m128i prev_lo = _mm_slli_si128(sq_lo, 2);
  const __m128i next_lo = _mm_alignr_epi8(sq_hi, sq_lo, 2);
  const __m128i prev_hi = _mm_alignr_epi8(sq_hi, sq_lo, 14);
  const __m128i next_hi = _mm_srli_si128(sq_hi, 2);

  *sum_0 = _mm_adds_epu16(_mm_adds_epu16(sq_lo, prev_lo), next_lo);
  *sum_1 = _mm_adds_epu16(_mm_adds_epu16(sq_hi, prev_hi), next_hi);
}
// D45 directional intra predictor for a 16x16 block of 16-bit samples.
//
// Only above[0..15] is read; `left` and `bd` are unused. Row 0 is the 3-tap
// average (avg3_epu16, defined elsewhere) of each above sample with its two
// right-hand neighbours, with above[15] replicated past the end. Each of the
// remaining 15 rows is produced by d45_store_16(), which shifts the running
// row down by one lane, pads with above[15], stores it and advances dst.
void vpx_highbd_d45_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const __m128i top_lo = _mm_load_si128((const __m128i *)above);
  const __m128i top_hi = _mm_load_si128((const __m128i *)(above + 8));
  // Replicate lane 7 of top_hi (above[15]) into all eight lanes.
  const __m128i last_x4 = _mm_shufflehi_epi16(top_hi, 0xff);
  const __m128i last = _mm_unpackhi_epi64(last_x4, last_x4);
  const __m128i next1_lo = _mm_alignr_epi8(top_hi, top_lo, 2);
  const __m128i next1_hi = _mm_alignr_epi8(last, top_hi, 2);
  const __m128i next2_lo = _mm_alignr_epi8(top_hi, top_lo, 4);
  const __m128i next2_hi = _mm_alignr_epi8(last, top_hi, 4);
  __m128i row_lo = avg3_epu16(&top_lo, &next1_lo, &next2_lo);
  __m128i row_hi = avg3_epu16(&top_hi, &next1_hi, &next2_hi);
  int r;
  (void)left;
  (void)bd;
  // Row 0 is stored as computed.
  _mm_store_si128((__m128i *)dst, row_lo);
  _mm_store_si128((__m128i *)(dst + 8), row_hi);
  dst += stride;
  // Rows 1..15: shift-and-store.
  for (r = 1; r < 16; ++r) {
    d45_store_16(&dst, stride, &row_lo, &row_hi, &last);
  }
}
示例#6
0
// Reverses the PNG "Average" row filter for 4-byte pixels, 16 bytes (four
// pixels) per iteration, in place.
//
// row_info - only rowbytes is read; the loop runs ceil(rowbytes / 16) times.
// row      - filtered row, overwritten with the result. Accessed with
//            aligned 128-bit loads/stores in whole 16-byte blocks.
// prev_row - previous row, accessed with aligned 128-bit loads.
//
// NOTE(review): calculate_pixel_avg() is defined elsewhere; presumably it
// reconstructs one pixel from the raw bytes, the pixel above and the
// previously reconstructed pixel - confirm against its definition.
void
png_read_filter_row_avg4_sse(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   const __m128i lsb_mask = _mm_set1_epi8(0x01);
   const __m128i* above = (const __m128i*)prev_row;
   __m128i* cur = (__m128i*)row;
   __m128i recon = _mm_setzero_si128();
   png_size_t blocks;

   for (blocks = (row_info->rowbytes + 15) >> 4; blocks > 0; blocks--)
   {
      __m128i up = _mm_load_si128(above++);
      __m128i raw = _mm_load_si128(cur);
      int px;

      for (px = 0; px < 4; px++)
      {
         recon = calculate_pixel_avg(raw, up, recon, lsb_mask);

         // Expose the next pixel of the previous row (not needed after the
         // last pixel of the block).
         if (px < 3)
            up = _mm_srli_si128(up, 4);

         // Drop the pixel just processed from the bottom of `raw` and push
         // the low 4 bytes of `recon` in at the top; after four rounds
         // `raw` holds the four reconstructed pixels in order.
#ifdef __SSSE3__
         raw = _mm_alignr_epi8(recon, raw, 4);
#else
         raw = _mm_srli_si128(raw, 4);
         raw = _mm_or_si128(raw, _mm_slli_si128(recon, 12));
#endif
      }

      _mm_store_si128(cur++, raw);
   }
}
// Advances a 16-lane row (held as two 8-lane vectors) by one 16-bit lane and
// stores it.
//
// The pair (*row_0, *row_1) is shifted down by one lane: lane 0 of *row_1
// moves into lane 7 of *row_0 and lane 0 of *ar moves into lane 7 of *row_1.
// The updated vectors are written back through the pointers, stored with
// aligned stores at *dst and *dst + 8, and *dst is advanced by `stride`
// elements.
static INLINE void d45_store_16(uint16_t **dst, const ptrdiff_t stride,
                                __m128i *row_0, __m128i *row_1,
                                const __m128i *ar) {
  __m128i *out;
  *row_0 = _mm_alignr_epi8(*row_1, *row_0, 2);
  *row_1 = _mm_alignr_epi8(*ar, *row_1, 2);
  out = (__m128i *)*dst;
  _mm_store_si128(out, *row_0);
  _mm_store_si128(out + 1, *row_1);  // out + 1 == *dst + 8 samples
  *dst += stride;
}
// Stores four 8-lane rows taken from the 16-lane sequence (*a, *b).
//
// Row r (r = 0..3) is the 8-lane window starting 2 * r lanes into the
// concatenation of *a followed by *b. Rows are written with aligned stores,
// `stride` elements apart, and *dst is left pointing 4 * stride elements
// past its starting value.
static INLINE void d207_store_4x8(uint16_t **dst, const ptrdiff_t stride,
                                  const __m128i *a, const __m128i *b) {
  uint16_t *row = *dst;
  _mm_store_si128((__m128i *)row, *a);
  row += stride;
  _mm_store_si128((__m128i *)row, _mm_alignr_epi8(*b, *a, 4));
  row += stride;
  _mm_store_si128((__m128i *)row, _mm_alignr_epi8(*b, *a, 8));
  row += stride;
  _mm_store_si128((__m128i *)row, _mm_alignr_epi8(*b, *a, 12));
  row += stride;
  *dst = row;
}
// D63 directional intra predictor for an 8x8 block of 16-bit samples.
//
// Only above[0..7] is read; `left` and `bd` are unused. Even rows come from
// the 2-tap average of each above sample with its right neighbour, odd rows
// from the 3-tap average (avg3_epu16, defined elsewhere) with its two right
// neighbours; above[7] is replicated past the end. d63_store_4x8() writes
// four rows per call and shifts both running vectors as it goes.
void vpx_highbd_d63_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  const __m128i top = _mm_load_si128((const __m128i *)above);
  // Replicate lane 7 (above[7]) into all eight lanes.
  const __m128i last_x4 = _mm_shufflehi_epi16(top, 0xff);
  const __m128i last = _mm_unpackhi_epi64(last_x4, last_x4);
  const __m128i next1 = _mm_alignr_epi8(last, top, 2);
  const __m128i next2 = _mm_alignr_epi8(last, top, 4);
  __m128i even_rows = _mm_avg_epu16(top, next1);
  __m128i odd_rows = avg3_epu16(&top, &next1, &next2);
  int half;
  (void)left;
  (void)bd;
  // Two calls of four rows each cover the 8 output rows.
  for (half = 0; half < 2; ++half) {
    d63_store_4x8(&dst, stride, &even_rows, &odd_rows, &last);
  }
}
// Stores four 8-lane rows alternating between two running vectors.
//
// Twice in a row: *a is stored, then *b is stored (aligned stores, `stride`
// elements apart), and afterwards both vectors are shifted down by one
// 16-bit lane with lane 0 of *ar entering at lane 7. On return *a and *b
// have therefore been shifted by two lanes in total and *dst has advanced by
// 4 * stride elements.
static INLINE void d63_store_4x8(uint16_t **dst, const ptrdiff_t stride,
                                 __m128i *a, __m128i *b, const __m128i *ar) {
  int pass;
  for (pass = 0; pass < 2; ++pass) {
    _mm_store_si128((__m128i *)*dst, *a);
    *dst += stride;
    _mm_store_si128((__m128i *)*dst, *b);
    *dst += stride;
    *a = _mm_alignr_epi8(*ar, *a, 2);
    *b = _mm_alignr_epi8(*ar, *b, 2);
  }
}
示例#11
0
// Reverses the PNG "Sub" row filter for 4-byte pixels, 16 bytes (four
// pixels) per iteration, in place.
//
// row_info - only rowbytes is read; the loop runs ceil(rowbytes / 16) times.
// row      - filtered row, overwritten with the result. Accessed with
//            aligned 128-bit loads/stores in whole 16-byte blocks.
// prev_row - unused.
//
// Each output pixel is the byte-wise (wrapping) sum of its raw value and the
// previous output pixel; within a block this is evaluated as a prefix sum
// built from four shifted additions.
void
png_read_filter_row_sub4_sse(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   __m128i* cur = (__m128i*)(row);
   __m128i carry = _mm_setzero_si128();
   png_size_t blocks;

   PNG_UNUSED(prev_row)

   for (blocks = (row_info->rowbytes + 15) >> 4; blocks > 0; blocks--)
   {
      __m128i sum = _mm_load_si128(cur);
      int step;

      // carry = { last pixel of previous output, raw px0, raw px1, raw px2 }
#ifdef __SSSE3__
      carry =  _mm_alignr_epi8(sum, carry, 12);
#else
      carry = _mm_srli_si128(carry, 12);
      carry = _mm_or_si128(carry, _mm_slli_si128(sum, 4));
#endif

      // Add the carry shifted by 0, 1, 2 and 3 pixels.
      sum = _mm_add_epi8(sum, carry);
      for (step = 0; step < 3; step++)
      {
         carry = _mm_slli_si128(carry, 4);
         sum = _mm_add_epi8(sum, carry);
      }

      // The finished block seeds the next iteration.
      carry = sum;

      _mm_store_si128(cur++, sum);
   }
}
// Merges two sorted vectors of four unsigned 32-bit integers.
//
// Given *vInput1 and *vInput2, each sorted in ascending lane order, writes
// the four smallest of the eight values to *vecMin and the four largest to
// *vecMax, both in ascending lane order. Originally written for a SIMD merge
// sort; see e.g. Inoue and Taura, "SIMD- and Cache-Friendly Algorithm for
// Sorting an Array of Structures".
//
// The network is one lane-wise min/max followed by three rounds of
// "rotate the low vector by one lane, then min/max against the high vector";
// a final one-lane rotation puts the low vector back in ascending order.
static void sse_merge(__m128i *vInput1, __m128i *vInput2, // input 1 & 2
                      __m128i *vecMin, __m128i *vecMax) { // output
    __m128i low = _mm_min_epu32(*vInput1, *vInput2);
    __m128i high = _mm_max_epu32(*vInput1, *vInput2);
    int round;
    for (round = 0; round < 3; ++round) {
        const __m128i rotated = _mm_alignr_epi8(low, low, 4);
        low = _mm_min_epu32(rotated, high);
        high = _mm_max_epu32(rotated, high);
    }
    *vecMin = _mm_alignr_epi8(low, low, 4);
    *vecMax = high;
}
// D153 directional intra predictor for an 8x8 block of 16-bit samples.
//
// Reads above[-1..6] (one unaligned load) and left[0..7]; `bd` is unused.
// The left column is filtered with a 2-tap and a 3-tap average (avg3_epu16,
// defined elsewhere) and the results are interleaved into (avg2, avg3)
// pairs. Row 0 consists of the first pair followed by the first six lanes of
// the 3-tap filtered top row; every following row is the previous row
// shifted up by two lanes with the next pair inserted at lanes 0 and 1.
void vpx_highbd_d153_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
  // { above[-1], above[0], ..., above[6] } and its one/two lane shifts
  // (zeros enter at the top).
  const __m128i top_m1 = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i top_0 = _mm_srli_si128(top_m1, 2);
  const __m128i top_p1 = _mm_srli_si128(top_m1, 4);
  const __m128i top_avg3 = avg3_epu16(&top_p1, &top_0, &top_m1);
  const __m128i col_0 = _mm_load_si128((const __m128i *)left);
  // { above[-1], left[0], ..., left[6] }
  const __m128i col_m1 =
      _mm_alignr_epi8(col_0, _mm_slli_si128(top_m1, 14), 14);
  // { above[0], above[-1], left[0], ..., left[5] }
  const __m128i col_m2 =
      _mm_alignr_epi8(col_m1, _mm_slli_si128(top_m1, 12), 14);
  const __m128i col_avg3 = avg3_epu16(&col_0, &col_m1, &col_m2);
  const __m128i col_avg2 = _mm_avg_epu16(col_0, col_m1);
  // Interleaved (avg2, avg3) pairs for left rows 0..3 and 4..7.
  const __m128i mix_lo = _mm_unpacklo_epi16(col_avg2, col_avg3);
  const __m128i mix_hi = _mm_unpackhi_epi16(col_avg2, col_avg3);
  __m128i rows[8];
  int r;
  (void)bd;
  // The byte shift applied to mix_* selects which pair ends up in its top two
  // lanes, i.e. which pair alignr inserts at the bottom of the new row.
  rows[0] = _mm_alignr_epi8(top_avg3, _mm_slli_si128(mix_lo, 12), 12);
  rows[1] = _mm_alignr_epi8(rows[0], _mm_slli_si128(mix_lo, 8), 12);
  rows[2] = _mm_alignr_epi8(rows[1], _mm_slli_si128(mix_lo, 4), 12);
  rows[3] = _mm_alignr_epi8(rows[2], mix_lo, 12);
  rows[4] = _mm_alignr_epi8(rows[3], _mm_slli_si128(mix_hi, 12), 12);
  rows[5] = _mm_alignr_epi8(rows[4], _mm_slli_si128(mix_hi, 8), 12);
  rows[6] = _mm_alignr_epi8(rows[5], _mm_slli_si128(mix_hi, 4), 12);
  rows[7] = _mm_alignr_epi8(rows[6], mix_hi, 12);
  for (r = 0; r < 8; ++r) {
    _mm_store_si128((__m128i *)dst, rows[r]);
    dst += stride;
  }
}
示例#14
0
文件: arch.hpp 项目: huoyao/Hydro
// Shifts the two-lane double vector *v0 down by one lane, pulling the first
// lane of v1 in at the top.
//
// Example: *v0 = {1.0, 2.0}, v1 = {3.0, 4.0}  ->  *v0 = {2.0, 3.0}
inline void rotate_left_wm1(F64vec2 *v0, const F64vec2 v1)
{
    // Work on the raw 128-bit patterns so the byte-wise alignr can be used.
    const __m128i lowBits = _mm_castpd_si128(*v0);
    const __m128i highBits = _mm_castpd_si128(v1);

    // Take 16 bytes starting 8 bytes into the concatenation (highBits:lowBits).
    *v0 = _mm_castsi128_pd(_mm_alignr_epi8(highBits, lowBits, 8));
}
示例#15
0
/**
 * Internal subroutine for cnsResponse.
 * Expands the 16 bytes in src into two groups of eight 16-bit pixels and
 * also produces copies shifted by one pixel to the left and to the right,
 * for later filter computation.
 *
 * Interpreting each __m128i as unsigned short[8], for i = 0..7:
 *   imgL[i]  = src[i-1]    img[i]  = src[i]      imgR[i]  = src[i+1]
 *   imgL2[i] = src[i+7]    img2[i] = src[i+8]    imgR2[i] = src[i+9]
 * where src[-1] is the last byte of lastSrc and src[16] is the first byte of
 * the vector at nextSrcP.
 *
 * lastSrc is the __m128i directly before src, which is directly followed by
 * the vector at nextSrcP (read with an aligned load). On return the window
 * has advanced by one vector: lastSrc holds the old src and src holds the
 * newly loaded vector.
 */
ALWAYSINLINE static void load2x8PixelUsingSSE(__m128i& imgL, __m128i& img, __m128i& imgR,
  __m128i& imgL2, __m128i& img2, __m128i& imgR2,
  __m128i& lastSrc, __m128i& src,  const __m128i* const nextSrcP)
{
  const __m128i following = _mm_load_si128(nextSrcP);
  const __m128i zero = _mm_setzero_si128();

  // First eight pixels. Unpacking with zero widens the low 8 bytes to 16 bit.
  const __m128i leftOfLow = _mm_alignr_epi8(src, lastSrc, 15);
  imgL = _mm_unpacklo_epi8(leftOfLow, zero);
  img = _mm_unpacklo_epi8(src, zero);
  const __m128i rightOfLow = _mm_srli_si128(src, 1);
  imgR = _mm_unpacklo_epi8(rightOfLow, zero);

  // Second eight pixels.
  const __m128i leftOfHigh = _mm_srli_si128(src, 7);
  imgL2 = _mm_unpacklo_epi8(leftOfHigh, zero);
  const __m128i high = _mm_srli_si128(src, 8);
  img2 = _mm_unpacklo_epi8(high, zero);
  const __m128i rightOfHigh = _mm_alignr_epi8(following, src, 9);
  imgR2 = _mm_unpacklo_epi8(rightOfHigh, zero);

  // Slide the window.
  lastSrc = src;
  src = following;
}
// D117 directional intra predictor for a 16x16 block of 16-bit samples.
//
// Reads above[-1..15] and left[0..15]; `bd` is unused. Even output rows
// start from the 2-tap average of above[i] and above[i-1], odd rows from the
// 3-tap average (avg3_epu16, defined elsewhere) of above[i], above[i-1] and
// above[i-2] (left[0] stands in for above[-2]). After each pair of rows the
// 16-lane row vectors are shifted up by one lane and lane 0 is refilled from
// the 3-tap filtered left column: its first eight samples feed rows 0..7,
// the last eight feed rows 8..15.
//
// NOTE(review): rotr_epu16() and rotate_right_epu16 are defined elsewhere;
// rotr_epu16 receives a non-const pointer, so it presumably rotates `feed`
// in place on every call - the two calls per iteration must stay in order.
void vpx_highbd_d117_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i rot_mask = _mm_load_si128((const __m128i *)rotate_right_epu16);
  const __m128i top_lo = _mm_load_si128((const __m128i *)above);
  const __m128i top_hi = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i top_m1_lo = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i top_m1_hi = _mm_loadu_si128((const __m128i *)(above + 7));
  const __m128i col_lo = _mm_load_si128((const __m128i *)left);
  const __m128i col_hi = _mm_load_si128((const __m128i *)(left + 8));
  // Top row shifted by two samples; left[0] enters at lane 0.
  const __m128i top_m2_lo =
      _mm_alignr_epi8(top_m1_lo, _mm_slli_si128(col_lo, 14), 14);
  const __m128i top_m2_hi = _mm_alignr_epi8(top_m1_hi, top_m1_lo, 14);
  // Left column shifted back by one sample; above[-1] enters at lane 0.
  const __m128i col_m1_lo =
      _mm_alignr_epi8(col_lo, _mm_slli_si128(top_m1_lo, 14), 14);
  const __m128i col_m1_hi = _mm_alignr_epi8(col_hi, col_lo, 14);
  // Left column advanced by one sample; zero enters at the very end.
  const __m128i col_p1_lo = _mm_alignr_epi8(col_hi, col_lo, 2);
  const __m128i col_p1_hi = _mm_srli_si128(col_hi, 2);
  __m128i even_row[2];
  __m128i odd_row[2];
  __m128i col_avg3[2];
  int half, pair;
  (void)bd;
  even_row[0] = _mm_avg_epu16(top_lo, top_m1_lo);
  even_row[1] = _mm_avg_epu16(top_hi, top_m1_hi);
  odd_row[0] = avg3_epu16(&top_lo, &top_m1_lo, &top_m2_lo);
  odd_row[1] = avg3_epu16(&top_hi, &top_m1_hi, &top_m2_hi);
  col_avg3[0] = avg3_epu16(&col_m1_lo, &col_lo, &col_p1_lo);
  col_avg3[1] = avg3_epu16(&col_m1_hi, &col_hi, &col_p1_hi);
  for (half = 0; half < 2; ++half) {
    __m128i feed = col_avg3[half];
    for (pair = 0; pair < 4; ++pair) {
      _mm_store_si128((__m128i *)dst, even_row[0]);
      _mm_store_si128((__m128i *)(dst + 8), even_row[1]);
      dst += stride;
      _mm_store_si128((__m128i *)dst, odd_row[0]);
      _mm_store_si128((__m128i *)(dst + 8), odd_row[1]);
      dst += stride;
      // The upper half must be shifted first: it needs the old lower half.
      even_row[1] = _mm_alignr_epi8(even_row[1], even_row[0], 14);
      even_row[0] =
          _mm_alignr_epi8(even_row[0], rotr_epu16(&feed, &rot_mask), 14);
      odd_row[1] = _mm_alignr_epi8(odd_row[1], odd_row[0], 14);
      odd_row[0] =
          _mm_alignr_epi8(odd_row[0], rotr_epu16(&feed, &rot_mask), 14);
    }
  }
}
// D135 directional intra predictor for an 8x8 block of 16-bit samples.
//
// Reads above[-1..7] and left[0..7]; `bd` is unused. Both the top row and
// the left column are smoothed with the 3-tap average avg3_epu16 (defined
// elsewhere). Starting from the filtered top row, every output row is the
// running row shifted up by one lane with one sample of the filtered left
// column inserted at lane 0; the shift is applied before each store,
// including the first.
//
// NOTE(review): rotr_epu16() and rotate_right_epu16 are defined elsewhere;
// rotr_epu16 receives a non-const pointer, so it presumably rotates
// col_avg3 in place on every call.
void vpx_highbd_d135_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
  const __m128i rot_mask = _mm_load_si128((const __m128i *)rotate_right_epu16);
  const __m128i top_m1 = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i top_0 = _mm_load_si128((const __m128i *)above);
  // { above[1], ..., above[7], 0 }
  const __m128i top_p1 = _mm_srli_si128(top_0, 2);
  const __m128i col_0 = _mm_load_si128((const __m128i *)left);
  // { above[-1], left[0], ..., left[6] }
  const __m128i col_m1 =
      _mm_alignr_epi8(col_0, _mm_slli_si128(top_m1, 14), 14);
  // { above[0], above[-1], left[0], ..., left[5] }
  const __m128i col_m2 =
      _mm_alignr_epi8(col_m1, _mm_slli_si128(top_0, 14), 14);
  __m128i col_avg3 = avg3_epu16(&col_0, &col_m1, &col_m2);
  __m128i row = avg3_epu16(&top_m1, &top_0, &top_p1);
  int remaining;
  (void)bd;
  for (remaining = 8; remaining > 0; --remaining) {
    row = _mm_alignr_epi8(row, rotr_epu16(&col_avg3, &rot_mask), 14);
    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
  }
}
示例#18
0
// Compiler diagnostic test: _mm_alignr_epi8 requires a compile-time constant
// as its third argument. The body is only compiled when __SSSE3__ is
// defined. The trailing annotation comment on the _mm_alignr_epi8 line
// records the diagnostic the compiler is meant to emit there, so that line
// is intentionally ill-formed and must be left exactly as it is.
//
// NOTE(review): `i` is not declared anywhere in this snippet - presumably a
// non-constant integer declared in a part of the file that is not shown.
int main ()
{
#if defined( __SSSE3__ )

  // 128-bit vector of eight 16-bit integers.
  typedef int16_t     vSInt16         __attribute__ ((__vector_size__ (16)));

  short   dtbl[] = {1,2,3,4,5,6,7,8};
  vSInt16 *vdtbl = (vSInt16*) dtbl;

  vSInt16 v0;
  v0 = *vdtbl;
  // Non-constant byte count passed on purpose.
  v0 = _mm_alignr_epi8(v0, v0, i); // expected-error {{argument to '__builtin_ia32_palignr128' must be a constant integer}}

  return 0;
#endif
}
// D207 directional intra predictor for a 32x32 block of 16-bit samples.
//
// Only the 32 samples of `left` are read (four aligned 128-bit loads);
// `above` and `bd` are unused. For every left sample a 2-tap average with
// its successor and a 3-tap average (avg3_epu16, defined elsewhere) with its
// two successors are computed, with left[31] replicated past the end of the
// column. The interleaved (avg2, avg3) pairs form eight 8-lane segments that
// are handed, five consecutive segments at a time, to d207_store_4x32()
// (defined elsewhere; it receives &dst, so it presumably writes rows and
// advances dst).
void vpx_highbd_d207_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  // col[0..3]: the left column; col[4]: left[31] in every lane.
  __m128i col[5];
  // seg[0..7]: interleaved averages; seg[8..11]: padding with left[31].
  __m128i seg[12];
  __m128i last_x4;
  int k;
  (void)above;
  (void)bd;
  col[0] = _mm_load_si128((const __m128i *)left);
  col[1] = _mm_load_si128((const __m128i *)(left + 8));
  col[2] = _mm_load_si128((const __m128i *)(left + 16));
  col[3] = _mm_load_si128((const __m128i *)(left + 24));
  last_x4 = _mm_shufflehi_epi16(col[3], 0xff);
  col[4] = _mm_unpackhi_epi64(last_x4, last_x4);
  for (k = 0; k < 4; ++k) {
    // Column advanced by one and by two samples.
    const __m128i next1 = _mm_alignr_epi8(col[k + 1], col[k], 2);
    const __m128i next2 = _mm_alignr_epi8(col[k + 1], col[k], 4);
    const __m128i triple = avg3_epu16(&col[k], &next1, &next2);
    const __m128i pair = _mm_avg_epu16(col[k], next1);
    seg[2 * k] = _mm_unpacklo_epi16(pair, triple);
    seg[2 * k + 1] = _mm_unpackhi_epi16(pair, triple);
  }
  for (k = 8; k < 12; ++k) {
    seg[k] = col[4];
  }
  // Each call consumes a window of five consecutive segments.
  for (k = 0; k < 8; ++k) {
    d207_store_4x32(&dst, stride, &seg[k], &seg[k + 1], &seg[k + 2],
                    &seg[k + 3], &seg[k + 4]);
  }
}
// Substring search that uses MPSADBW to find candidate positions where the
// first four bytes of the needle match, then confirms each candidate with a
// caller-supplied comparison of the remaining bytes.
//
// s          - haystack.
// n          - loop bound on the start offset in s (asserted > 0).
// needle     - pattern; 16 bytes are loaded from it unconditionally.
// memcmp_fun - called as memcmp_fun(s + pos + 4, needle + 4); a truthy result
//              is treated as "the rest of the needle matches".
// Returns the offset of the first confirmed match, or std::string::npos.
//
// NOTE(review): `k` and `MEMCMP` are not declared in this snippet -
// presumably template parameters (needle length and comparator type) from a
// `template <...>` header that is not shown; confirm.
// NOTE(review): every iteration loads 16 bytes at s + i + 16, so up to 32
// bytes starting at s + i are read, and a candidate at i + bitpos is not
// checked against n - presumably the caller guarantees enough readable
// padding and validates the result; confirm.
// NOTE(review): bits::get_first_bit_set and bits::clear_leftmost_set are
// defined elsewhere; from their use here they presumably return the index of
// the lowest set bit and clear that bit, respectively.
size_t sse4_strstr_unrolled_memcmp(const char* s, size_t n, const char* needle, MEMCMP memcmp_fun) {

    assert(k > 4);
    assert(n > 0);

    // With immediate 0, MPSADBW only uses bytes 0..3 of this operand.
    const __m128i prefix = _mm_loadu_si128(reinterpret_cast<const __m128i*>(needle));
    const __m128i zeros  = _mm_setzero_si128();

    __m128i prev = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s));
    __m128i curr;

    // 16 candidate start positions are examined per iteration.
    for (size_t i = 0; i < n; i += 16) {

        curr  = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i + 16));

        // data0 starts at s + i, data1 at s + i + 8. Each MPSADBW yields eight
        // 16-bit sums of absolute differences between the 4-byte prefix and the
        // 4-byte windows at offsets 0..7 of its data operand.
        const __m128i data0   = prev;
        const __m128i data1   = _mm_alignr_epi8(curr, prev, 8);
        const __m128i result0 = _mm_mpsadbw_epu8(data0, prefix, 0);
        const __m128i result1 = _mm_mpsadbw_epu8(data1, prefix, 0);
        prev = curr;

        // Narrow the 16 sums to bytes (saturating); a zero byte means the four
        // prefix bytes match exactly at that position.
        const __m128i result  = _mm_packus_epi16(result0, result1);
        const __m128i cmp     = _mm_cmpeq_epi8(result, zeros);

        // Bit j set <=> prefix match at s + i + j.
        unsigned mask = _mm_movemask_epi8(cmp);

        while (mask != 0) {

            const auto bitpos = bits::get_first_bit_set(mask);

            // The first four bytes are already known to match; check the rest.
            if (memcmp_fun(s + i + bitpos + 4, needle + 4)) {
                return i + bitpos;
            }

            mask = bits::clear_leftmost_set(mask);
        }
    }

    return std::string::npos;
}
// D63 directional intra predictor for a 16x16 block of 16-bit samples.
//
// Only above[0..15] is read; `left` and `bd` are unused. Even rows come from
// the 2-tap average of each above sample with its right neighbour, odd rows
// from the 3-tap average (avg3_epu16, defined elsewhere) with its two right
// neighbours; above[15] is replicated past the end. After every pair of rows
// both 16-lane vectors are shifted down by one lane and padded with
// above[15].
void vpx_highbd_d63_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const __m128i top_lo = _mm_load_si128((const __m128i *)above);
  const __m128i top_hi = _mm_load_si128((const __m128i *)(above + 8));
  // Replicate lane 7 of top_hi (above[15]) into all eight lanes.
  const __m128i last_x4 = _mm_shufflehi_epi16(top_hi, 0xff);
  const __m128i last = _mm_unpackhi_epi64(last_x4, last_x4);
  const __m128i next1_lo = _mm_alignr_epi8(top_hi, top_lo, 2);
  const __m128i next1_hi = _mm_alignr_epi8(last, top_hi, 2);
  const __m128i next2_lo = _mm_alignr_epi8(top_hi, top_lo, 4);
  const __m128i next2_hi = _mm_alignr_epi8(last, top_hi, 4);
  __m128i even_lo = _mm_avg_epu16(top_lo, next1_lo);
  __m128i even_hi = _mm_avg_epu16(top_hi, next1_hi);
  __m128i odd_lo = avg3_epu16(&top_lo, &next1_lo, &next2_lo);
  __m128i odd_hi = avg3_epu16(&top_hi, &next1_hi, &next2_hi);
  int pair;
  (void)left;
  (void)bd;
  for (pair = 0; pair < 8; ++pair) {
    _mm_store_si128((__m128i *)dst, even_lo);
    _mm_store_si128((__m128i *)(dst + 8), even_hi);
    dst += stride;
    _mm_store_si128((__m128i *)dst, odd_lo);
    _mm_store_si128((__m128i *)(dst + 8), odd_hi);
    // Nothing follows the last pair of rows.
    if (pair == 7) break;
    dst += stride;
    even_lo = _mm_alignr_epi8(even_hi, even_lo, 2);
    even_hi = _mm_alignr_epi8(last, even_hi, 2);
    odd_lo = _mm_alignr_epi8(odd_hi, odd_lo, 2);
    odd_hi = _mm_alignr_epi8(last, odd_hi, 2);
  }
}
示例#22
0
// Reverses the PNG "Sub" row filter for 3-byte pixels, in place.
//
// row_info - only rowbytes is read.
// row      - filtered row, overwritten with the result.
// prev_row - unused.
//
// Each iteration advances by 15 bytes (five pixels) but loads and stores a
// full 16-byte vector. The first load is aligned, all later loads and all
// stores are unaligned, and the vector for the next iteration is fetched
// from cursor + 15 before the current one is stored.
// NOTE(review): this reads and writes past rowbytes - presumably the row
// buffer is padded by the caller; confirm.
void
png_read_filter_row_sub3_sse(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep cursor = row;
   png_size_t done = 0;
   __m128i carry = _mm_setzero_si128();
   __m128i lookahead;

   PNG_UNUSED(prev_row)

   lookahead = _mm_load_si128((__m128i*)(cursor));

   while (done < row_info->rowbytes)
   {
      __m128i sum = lookahead;
      int step;

      // carry = { bytes 12..14 of the previous output (its last complete
      //           pixel), raw bytes 0..12 of this block }
#ifdef __SSSE3__
      lookahead = _mm_lddqu_si128((__m128i*)(cursor + 15));
      carry =  _mm_alignr_epi8(sum, _mm_slli_si128(carry, 1), 13);
#else
      lookahead = _mm_loadu_si128((__m128i*)(cursor + 15));
      carry = _mm_srli_si128(_mm_slli_si128(carry, 1), 13);
      carry = _mm_or_si128(carry, _mm_slli_si128(sum, 3));
#endif

      // Prefix sum: add the carry shifted by 0, 1, 2, 3 and 4 pixels.
      sum = _mm_add_epi8(sum, carry);
      for (step = 0; step < 4; step++)
      {
         carry = _mm_slli_si128(carry, 3);
         sum = _mm_add_epi8(sum, carry);
      }

      // The finished block seeds the next iteration.
      carry = sum;

      _mm_storeu_si128((__m128i*)cursor, sum);

      done += 15;
      cursor += 15;
   }
}
示例#23
0
QT_BEGIN_NAMESPACE

// Converts one scanline of packed 3-byte RGB888 pixels (src) into 32-bit
// RGB32 pixels (dst) with the alpha byte forced to 0xff.
// src must be at least len * 3 bytes
// dst must be at least len * 4 bytes
//
// The work is split into a scalar prologue that brings dst to a 16-byte
// boundary, a vector loop converting 16 pixels (48 source bytes) per round,
// and a scalar epilogue for whatever is left.
Q_GUI_EXPORT void QT_FASTCALL qt_convert_rgb888_to_rgb32_ssse3(quint32 *dst, const uchar *src, int len)
{
    quint32 *const end = dst + len;

    // Number of pixels needed to reach 16-byte alignment of dst. dst is the
    // pointer that gets aligned because each round does 4 stores but only
    // 3 loads.
    const int pixelsToAlign = (4 - ((reinterpret_cast<quintptr>(dst) >> 2) & 0x3)) & 0x3;
    const int prologLength = qMin(len, pixelsToAlign);

    for (int remaining = prologLength; remaining > 0; --remaining) {
        *dst++ = qRgb(src[0], src[1], src[2]);
        src += 3;
    }

    // Byte shuffle turning source bytes 0..11 (four RGB triples) into four
    // 32-bit pixels; the 0xff entries zero the fourth byte of each pixel.
    const __m128i shuffleMask = _mm_set_epi8(char(0xff), 9, 10, 11, char(0xff), 6, 7, 8, char(0xff), 3, 4, 5, char(0xff), 0, 1, 2);

    // Same shuffle for the four triples stored in source bytes 4..15.
    const __m128i shuffleMaskEnd = _mm_set_epi8(char(0xff), 13, 14, 15, char(0xff), 10, 11, 12, char(0xff), 7, 8, 9, char(0xff), 4, 5, 6);

    // OR-ed into every output pixel to set alpha = 0xff.
    const __m128i alphaMask = _mm_set1_epi32(0xff000000);

    __m128i *in = (__m128i *)src;
    __m128i *out = (__m128i *)dst;

    /*
     A 16-byte vector of RGB888 holds 5 pixels plus 1 byte of the next one.
     Three consecutive vectors therefore hold exactly 16 pixels; palignr is
     used to splice neighbouring vectors so that each group of four pixels
     starts at byte 0, and pshufb performs the actual byte permutation.
     Loads and stores are interleaved in the same order as pixels are consumed.
     */
    for (int round = (len - prologLength) / 16; round > 0; --round) {
        // Pixels 0..3: bytes 0..11 of the first vector.
        const __m128i bytes0 = _mm_lddqu_si128(in);
        _mm_store_si128(out, _mm_or_si128(_mm_shuffle_epi8(bytes0, shuffleMask), alphaMask));

        // Pixels 4..7: last 4 bytes of the first vector + first 8 of the second.
        const __m128i bytes1 = _mm_lddqu_si128(in + 1);
        const __m128i splice01 = _mm_alignr_epi8(bytes1, bytes0, 12);
        _mm_store_si128(out + 1, _mm_or_si128(_mm_shuffle_epi8(splice01, shuffleMask), alphaMask));

        // Pixels 8..11: last 8 bytes of the second vector + first 4 of the third.
        const __m128i bytes2 = _mm_lddqu_si128(in + 2);
        const __m128i splice12 = _mm_alignr_epi8(bytes2, bytes1, 8);
        _mm_store_si128(out + 2, _mm_or_si128(_mm_shuffle_epi8(splice12, shuffleMask), alphaMask));

        // Pixels 12..15: the last 12 bytes of the third vector, selected
        // directly by the offset shuffle mask.
        _mm_store_si128(out + 3, _mm_or_si128(_mm_shuffle_epi8(bytes2, shuffleMaskEnd), alphaMask));

        in += 3;
        out += 4;
    }
    src = (uchar *)in;
    dst = (quint32 *)out;

    // Scalar epilogue for the remaining pixels.
    while (dst != end) {
        *dst++ = qRgb(src[0], src[1], src[2]);
        src += 3;
    }
}
void av1_highbd_wiener_convolve_add_src_ssse3(
    const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,
    ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
    const int16_t *filter_y, int y_step_q4, int w, int h,
    const ConvolveParams *conv_params, int bd) {
  assert(x_step_q4 == 16 && y_step_q4 == 16);
  assert(!(w & 7));
  assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
  (void)x_step_q4;
  (void)y_step_q4;

  const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);

  DECLARE_ALIGNED(16, uint16_t,
                  temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
  int intermediate_height = h + SUBPEL_TAPS - 1;
  int i, j;
  const int center_tap = ((SUBPEL_TAPS - 1) / 2);
  const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap;

  const __m128i zero = _mm_setzero_si128();
  // Add an offset to account for the "add_src" part of the convolve function.
  const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3);

  /* Horizontal filter */
  {
    const __m128i coeffs_x =
        _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset);

    // coeffs 0 1 0 1 2 3 2 3
    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
    // coeffs 4 5 4 5 6 7 6 7
    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);

    // coeffs 0 1 0 1 0 1 0 1
    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
    // coeffs 2 3 2 3 2 3 2 3
    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
    // coeffs 4 5 4 5 4 5 4 5
    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
    // coeffs 6 7 6 7 6 7 6 7
    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);

    const __m128i round_const = _mm_set1_epi32(
        (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1)));

    for (i = 0; i < intermediate_height; ++i) {
      for (j = 0; j < w; j += 8) {
        const __m128i data =
            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
        const __m128i data2 =
            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]);

        // Filter even-index pixels
        const __m128i res_0 = _mm_madd_epi16(data, coeff_01);
        const __m128i res_2 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23);
        const __m128i res_4 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45);
        const __m128i res_6 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67);

        __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
                                         _mm_add_epi32(res_2, res_6));
        res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const),
                                  conv_params->round_0);

        // Filter odd-index pixels
        const __m128i res_1 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01);
        const __m128i res_3 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23);
        const __m128i res_5 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45);
        const __m128i res_7 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67);

        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
                                        _mm_add_epi32(res_3, res_7));
        res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const),
                                 conv_params->round_0);

        // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
        const __m128i maxval =
            _mm_set1_epi16((WIENER_CLAMP_LIMIT(conv_params->round_0, bd)) - 1);
        __m128i res = _mm_packs_epi32(res_even, res_odd);
        res = _mm_min_epi16(_mm_max_epi16(res, zero), maxval);
        _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res);
      }
    }
  }

  /* Vertical filter */
  {
    const __m128i coeffs_y =
        _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset);

    // coeffs 0 1 0 1 2 3 2 3
    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
    // coeffs 4 5 4 5 6 7 6 7
    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);

    // coeffs 0 1 0 1 0 1 0 1
    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
    // coeffs 2 3 2 3 2 3 2 3
    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
    // coeffs 4 5 4 5 4 5 4 5
    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
    // coeffs 6 7 6 7 6 7 6 7
    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);

    const __m128i round_const =
        _mm_set1_epi32((1 << (conv_params->round_1 - 1)) -
                       (1 << (bd + conv_params->round_1 - 1)));

    for (i = 0; i < h; ++i) {
      for (j = 0; j < w; j += 8) {
        // Filter even-index pixels
        const uint16_t *data = &temp[i * MAX_SB_SIZE + j];
        const __m128i src_0 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
                               *(__m128i *)(data + 1 * MAX_SB_SIZE));
        const __m128i src_2 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
                               *(__m128i *)(data + 3 * MAX_SB_SIZE));
        const __m128i src_4 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
                               *(__m128i *)(data + 5 * MAX_SB_SIZE));
        const __m128i src_6 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
                               *(__m128i *)(data + 7 * MAX_SB_SIZE));

        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);

        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
                                               _mm_add_epi32(res_4, res_6));

        // Filter odd-index pixels
        const __m128i src_1 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
                               *(__m128i *)(data + 1 * MAX_SB_SIZE));
        const __m128i src_3 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
                               *(__m128i *)(data + 3 * MAX_SB_SIZE));
        const __m128i src_5 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
                               *(__m128i *)(data + 5 * MAX_SB_SIZE));
        const __m128i src_7 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
                               *(__m128i *)(data + 7 * MAX_SB_SIZE));

        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);

        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
                                              _mm_add_epi32(res_5, res_7));

        // Rearrange pixels back into the order 0 ... 7
        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);

        const __m128i res_lo_round = _mm_srai_epi32(
            _mm_add_epi32(res_lo, round_const), conv_params->round_1);
        const __m128i res_hi_round = _mm_srai_epi32(
            _mm_add_epi32(res_hi, round_const), conv_params->round_1);

        const __m128i maxval = _mm_set1_epi16((1 << bd) - 1);
        __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
        res_16bit = _mm_min_epi16(_mm_max_epi16(res_16bit, zero), maxval);

        __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
        _mm_storeu_si128(p, res_16bit);
      }
    }
  }
}
示例#25
0
/* Rotate the 16 bytes of `a` right by four byte positions.

   _mm_alignr_epi8 is given `a` as both operands, so the four bytes shifted
   out at the bottom re-enter at the top: 32-bit lanes {a0, a1, a2, a3}
   come back as {a1, a2, a3, a0}.  */
__m128i foo (__m128i a)
{
  return _mm_alignr_epi8(a, a, 4);
}
示例#26
0
/* Unaligned 16-byte load.  Builds without __SSSE3__ use the SSE2 load;
 * builds with it use lddqu, exactly as the row filter below always did.
 */
static __m128i
avg3_sse_loadu(const void *p)
{
#ifndef __SSSE3__
   return _mm_loadu_si128((const __m128i*)p);
#else
   return _mm_lddqu_si128((const __m128i*)p);
#endif
}

/* Drop the three lowest bytes of `rb` and place the three lowest bytes of
 * `pixel` in the freed top positions (bytes 13..15).
 */
static __m128i
avg3_sse_push_pixel(__m128i rb, __m128i pixel)
{
#ifndef __SSSE3__
   return _mm_or_si128(_mm_srli_si128(rb, 3), _mm_slli_si128(pixel, 13));
#else
   return _mm_alignr_epi8(pixel, rb, 3);
#endif
}

/* Process `row` in place, five 3-byte pixels (15 bytes) per iteration, using
 * the matching bytes of `prev_row`.
 *
 * The per-pixel arithmetic lives in calculate_pixel_avg(), which is not
 * visible here; presumably it reverses the PNG "Average" filter for one
 * pixel - confirm.  `pixel` starts at zero and carries the most recent
 * result into the next call, across iterations as well.
 *
 * The first load of `row` is an aligned load, so `row` must be 16-byte
 * aligned.
 *
 * NOTE(review): every iteration, including the last, loads 16 bytes at
 * rp + 15 and at prp, and stores 16 bytes at rp (15 result bytes plus one
 * zero byte).  When rowbytes is not a multiple of 15, or at the end of the
 * row, this touches bytes past rowbytes; assumes both buffers are padded -
 * confirm against the allocator.
 */
void
png_read_filter_row_avg3_sse(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_size_t i;
   int k;
   png_bytep rp = row;
   png_const_bytep prp = prev_row;
   __m128i nrb = _mm_load_si128((__m128i*)(rp));
   __m128i pixel = _mm_setzero_si128();
   const __m128i mask = _mm_set1_epi8(0x01);

   for (i = 0; i < row_info->rowbytes; i += 15, rp += 15, prp += 15)
   {
      __m128i prb = avg3_sse_loadu(prp);
      __m128i rb = nrb;

      // Pixels one to four: compute, advance the previous-row window by one
      // pixel, and rotate the result into the top of rb.
      for (k = 0; k < 4; k++)
      {
         pixel = calculate_pixel_avg(rb, prb, pixel, mask);
         prb = _mm_srli_si128(prb, 3);
         rb = avg3_sse_push_pixel(rb, pixel);
      }

      // Fifth pixel.  The next 16 input bytes are fetched before the store
      // below overwrites rp[15].
      pixel = calculate_pixel_avg(rb, prb, pixel, mask);
      nrb = avg3_sse_loadu(rp + 15);
      rb = avg3_sse_push_pixel(rb, pixel);

      // rb now holds one leftover input byte followed by the five results;
      // shift the leftover out so the results start at byte 0.
      _mm_storeu_si128((__m128i*)rp, _mm_srli_si128(rb, 1));
   }
}
示例#27
0
// Codegen fixture: byte shift count 15 (less than 16). The directive line
// below looks like an LLVM FileCheck pattern expecting a palignr instruction
// in the generated assembly; keep it byte-for-byte.
// NOTE(review): `int4` and `_mm_alignr_epi8` are defined outside this excerpt
// (presumably a 16-byte int vector type and the palignr intrinsic) - confirm.
// CHECK: palignr
int4 align1(int4 a, int4 b) { return _mm_alignr_epi8(a, b, 15); }
示例#28
0
// Codegen fixture: byte shift count 32. The directive line below looks like
// an LLVM FileCheck pattern expecting an xor, i.e. presumably the result is
// materialised as an all-zero vector because every input byte is shifted out.
// NOTE(review): `int4` and `_mm_alignr_epi8` are defined outside this excerpt
// - confirm their definitions before relying on this description.
// CHECK: xor
int4 align4(int4 a, int4 b) { return _mm_alignr_epi8(a, b, 32); }
示例#29
0
// Codegen fixture: byte shift count 17 (between 16 and 32). The directive
// line below looks like an LLVM FileCheck pattern expecting psrldq, i.e.
// presumably the operation reduces to a plain byte shift of one operand.
// NOTE(review): `int4` and `_mm_alignr_epi8` are defined outside this excerpt
// - confirm their definitions before relying on this description.
// CHECK: psrldq
int4 align3(int4 a, int4 b) { return _mm_alignr_epi8(a, b, 17); }
示例#30
0
// Codegen fixture: byte shift count of exactly 16. The directive lines below
// look like LLVM FileCheck patterns that match two `ret` instructions and
// then require that no palignr follows; presumably a 16-byte shift simply
// selects one operand, so no palignr should be emitted for this function.
// NOTE(review): `int4` and `_mm_alignr_epi8` are defined outside this excerpt
// - confirm their definitions before relying on this description.
// CHECK: ret
// CHECK: ret
// CHECK-NOT: palignr
int4 align2(int4 a, int4 b) { return _mm_alignr_epi8(a, b, 16); }