// Load values from 'a' and 'b'. Compute the difference squared and sum
// neighboring values such that:
// sum[1] = (a[0]-b[0])^2 + (a[1]-b[1])^2 + (a[2]-b[2])^2
// Values to the left and right of the row are set to 0.
// The values are returned in sum_0 and sum_1 as *unsigned* 16 bit values.
static void sum_16(const uint8_t *a, const uint8_t *b, __m128i *sum_0,
                   __m128i *sum_1) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i a_u8 = _mm_loadu_si128((const __m128i *)a);
  const __m128i b_u8 = _mm_loadu_si128((const __m128i *)b);

  const __m128i a_0_u16 = _mm_cvtepu8_epi16(a_u8);
  const __m128i a_1_u16 = _mm_unpackhi_epi8(a_u8, zero);
  const __m128i b_0_u16 = _mm_cvtepu8_epi16(b_u8);
  const __m128i b_1_u16 = _mm_unpackhi_epi8(b_u8, zero);

  const __m128i diff_0_s16 = _mm_sub_epi16(a_0_u16, b_0_u16);
  const __m128i diff_1_s16 = _mm_sub_epi16(a_1_u16, b_1_u16);
  const __m128i diff_sq_0_u16 = _mm_mullo_epi16(diff_0_s16, diff_0_s16);
  const __m128i diff_sq_1_u16 = _mm_mullo_epi16(diff_1_s16, diff_1_s16);

  __m128i shift_left = _mm_slli_si128(diff_sq_0_u16, 2);
  // Use _mm_alignr_epi8() to "shift in" diff_sq_u16[8].
  __m128i shift_right = _mm_alignr_epi8(diff_sq_1_u16, diff_sq_0_u16, 2);

  __m128i sum_u16 = _mm_adds_epu16(diff_sq_0_u16, shift_left);
  sum_u16 = _mm_adds_epu16(sum_u16, shift_right);

  *sum_0 = sum_u16;

  shift_left = _mm_alignr_epi8(diff_sq_1_u16, diff_sq_0_u16, 14);
  shift_right = _mm_srli_si128(diff_sq_1_u16, 2);

  sum_u16 = _mm_adds_epu16(diff_sq_1_u16, shift_left);
  sum_u16 = _mm_adds_epu16(sum_u16, shift_right);

  *sum_1 = sum_u16;
}
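The _mm_alignr_epi8() calls above stitch the two 8-lane halves together so each 16-bit lane can read its neighbor across the seam: concatenating (hi:lo) and shifting right by 2 bytes pulls diff_sq_1_u16[0] in behind diff_sq_0_u16[7], and shifting by 14 bytes brings the left neighbor across the seam the other way. A scalar model of that lane movement (an illustrative sketch, not part of the original source):

// Model of _mm_alignr_epi8(hi, lo, 2) viewed as 16-bit lanes: treat lo then
// hi as one 16-lane array and keep lanes 1..8.
static void alignr2_u16_model(const uint16_t lo[8], const uint16_t hi[8],
                              uint16_t out[8]) {
  uint16_t cat[16];
  for (int i = 0; i < 8; ++i) cat[i] = lo[i];
  for (int i = 0; i < 8; ++i) cat[8 + i] = hi[i];
  for (int i = 0; i < 8; ++i) out[i] = cat[i + 1];
}
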
// Load values from 'a' and 'b'. Compute the difference squared and sum
// neighboring values such that:
// sum[1] = (a[0]-b[0])^2 + (a[1]-b[1])^2 + (a[2]-b[2])^2
// Values to the left and right of the row are set to 0.
// The values are returned in sum as *unsigned* 16 bit values.
static void sum_8(const uint8_t *a, const uint8_t *b, __m128i *sum) {
  const __m128i a_u8 = _mm_loadl_epi64((const __m128i *)a);
  const __m128i b_u8 = _mm_loadl_epi64((const __m128i *)b);

  const __m128i a_u16 = _mm_cvtepu8_epi16(a_u8);
  const __m128i b_u16 = _mm_cvtepu8_epi16(b_u8);

  const __m128i diff_s16 = _mm_sub_epi16(a_u16, b_u16);
  const __m128i diff_sq_u16 = _mm_mullo_epi16(diff_s16, diff_s16);

  // Shift all the values one place to the left/right so we can efficiently sum
  // diff_sq_u16[i - 1] + diff_sq_u16[i] + diff_sq_u16[i + 1].
  const __m128i shift_left = _mm_slli_si128(diff_sq_u16, 2);
  const __m128i shift_right = _mm_srli_si128(diff_sq_u16, 2);

// It becomes necessary to treat the values as unsigned at this point:
// 255^2 fits in uint16_t but not int16_t. Use saturating adds from this point
  // forward since the filter is only applied to smooth small pixel changes.
  // Once the value has saturated to uint16_t it is well outside the useful
  // range.
  __m128i sum_u16 = _mm_adds_epu16(diff_sq_u16, shift_left);
  sum_u16 = _mm_adds_epu16(sum_u16, shift_right);

  *sum = sum_u16;
}
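For reference, sum_8() and sum_16() implement the following scalar computation, with w = 8 or 16 (a sketch; the per-step saturating adds collapse to a final clamp because every term is non-negative):

// Scalar reference for sum_8()/sum_16(): 3-tap sum of squared differences,
// zeros assumed beyond the row edges, clamped to UINT16_MAX.
static void sum_w_model(const uint8_t *a, const uint8_t *b, int w,
                        uint16_t *sum) {
  for (int i = 0; i < w; ++i) {
    uint32_t s = 0;
    for (int j = i - 1; j <= i + 1; ++j) {
      if (j < 0 || j >= w) continue;  // out-of-row neighbors contribute 0
      const int d = a[j] - b[j];
      s += (uint32_t)(d * d);
    }
    sum[i] = (s > 65535) ? 65535 : (uint16_t)s;
  }
}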
Example #3
static void satd_8bit_4x4_dual_avx2(const pred_buffer preds,
                                    const kvz_pixel * const orig,
                                    unsigned num_modes, unsigned *satds_out)
{

  __m256i original = _mm256_broadcastsi128_si256(_mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)orig)));
  __m256i pred = _mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)preds[0]));
  pred = _mm256_inserti128_si256(pred, _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)preds[1])), 1);

  __m256i diff_lo = _mm256_sub_epi16(pred, original);

  original = _mm256_broadcastsi128_si256(_mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(orig + 8))));
  pred = _mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(preds[0] + 8)));
  pred = _mm256_inserti128_si256(pred, _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(preds[1] + 8))), 1);

  __m256i diff_hi = _mm256_sub_epi16(pred, original);

  //Hor
  __m256i row0 = _mm256_hadd_epi16(diff_lo, diff_hi);
  __m256i row1 = _mm256_hsub_epi16(diff_lo, diff_hi);

  __m256i row2 = _mm256_hadd_epi16(row0, row1);
  __m256i row3 = _mm256_hsub_epi16(row0, row1);

  //Ver
  row0 = _mm256_hadd_epi16(row2, row3);
  row1 = _mm256_hsub_epi16(row2, row3);

  row2 = _mm256_hadd_epi16(row0, row1);
  row3 = _mm256_hsub_epi16(row0, row1);

  //Abs and sum
  row2 = _mm256_abs_epi16(row2);
  row3 = _mm256_abs_epi16(row3);

  row3 = _mm256_add_epi16(row2, row3);

  row3 = _mm256_add_epi16(row3, _mm256_shuffle_epi32(row3, KVZ_PERMUTE(2, 3, 0, 1) ));
  row3 = _mm256_add_epi16(row3, _mm256_shuffle_epi32(row3, KVZ_PERMUTE(1, 0, 1, 0) ));
  row3 = _mm256_add_epi16(row3, _mm256_shufflelo_epi16(row3, KVZ_PERMUTE(1, 0, 1, 0) ));

  unsigned sum1 = _mm_extract_epi16(_mm256_castsi256_si128(row3), 0);
  sum1 = (sum1 + 1) >> 1;

  unsigned sum2 = _mm_extract_epi16(_mm256_extracti128_si256(row3, 1), 0);
  sum2 = (sum2 + 1) >> 1;

  satds_out[0] = sum1;
  satds_out[1] = sum2;
}
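KVZ_PERMUTE is a kvazaar macro that is not shown in this excerpt; it packs four 2-bit lane selectors into the shuffle immediate, with its arguments in ascending lane order (the reverse of _MM_SHUFFLE). A plausible definition, stated here as an assumption:

// Assumed definition of the macro used above (not part of this excerpt).
#define KVZ_PERMUTE(a, b, c, d) ((a) | ((b) << 2) | ((c) << 4) | ((d) << 6))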
Example #4
static WEBP_INLINE __m128i SubtractAndAccumulate(const __m128i a,
                                                 const __m128i b) {
    // take abs(a-b) in 8b
    const __m128i a_b = _mm_subs_epu8(a, b);
    const __m128i b_a = _mm_subs_epu8(b, a);
    const __m128i abs_a_b = _mm_or_si128(a_b, b_a);
    // zero-extend to 16b
    const __m128i C0 = _mm_cvtepu8_epi16(abs_a_b);
    const __m128i C1 = _mm_cvtepu8_epi16(_mm_srli_si128(abs_a_b, 8));
    // multiply with self
    const __m128i D0 = _mm_madd_epi16(C0, C0);
    const __m128i D1 = _mm_madd_epi16(C1, C1);
    // accumulate
    const __m128i sum = _mm_add_epi32(D0, D1);
    return sum;
}
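In scalar terms, the four 32-bit lanes of the returned vector are partial sums of squared byte differences; their horizontal sum equals the total computed by this illustrative sketch:

// Scalar reference for SubtractAndAccumulate(): total sum of squared
// differences over the 16 input bytes (the vector result splits this total
// across four 32-bit lanes following the _mm_madd_epi16 pairing).
static uint32_t subtract_and_accumulate_model(const uint8_t a[16],
                                              const uint8_t b[16]) {
  uint32_t total = 0;
  for (int i = 0; i < 16; ++i) {
    const int d = (int)a[i] - (int)b[i];
    total += (uint32_t)(d * d);
  }
  return total;
}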
Example #5
// Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.'
static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred,
                                   uint16_t *count, uint32_t *accumulator) {
  const __m128i pred_u8 = _mm_loadl_epi64((const __m128i *)pred);
  const __m128i zero = _mm_setzero_si128();
  __m128i count_u16 = _mm_loadu_si128((const __m128i *)count);
  __m128i pred_u16 = _mm_cvtepu8_epi16(pred_u8);
  __m128i pred_0_u32, pred_1_u32;
  __m128i accum_0_u32, accum_1_u32;

  count_u16 = _mm_adds_epu16(count_u16, sum_u16);
  _mm_storeu_si128((__m128i *)count, count_u16);

  pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16);

  pred_0_u32 = _mm_cvtepu16_epi32(pred_u16);
  pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero);

  accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
  accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));

  accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
  accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);

  _mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
  _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
}
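A hedged sketch of how sum_8() and accumulate_and_store_8() compose into one row step of a temporal filter; the name filter_row_8 and the omission of the modifier scaling that the real filter applies between the two calls are assumptions for illustration:

// Hypothetical glue code: compute the 3-tap squared-difference sums for one
// 8-pixel row, then fold them into the running count/accumulator buffers.
static void filter_row_8(const uint8_t *a, const uint8_t *b,
                         const uint8_t *pred, uint16_t *count,
                         uint32_t *accumulator) {
  __m128i sum;
  sum_8(a, b, &sum);
  accumulate_and_store_8(sum, pred, count, accumulator);
}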
Example #6
static unsigned satd_8bit_4x4_avx2(const kvz_pixel *org, const kvz_pixel *cur)
{

  __m128i original = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)org));
  __m128i current = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)cur));

  __m128i diff_lo = _mm_sub_epi16(current, original);

  original = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(org + 8)));
  current = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(cur + 8)));

  __m128i diff_hi = _mm_sub_epi16(current, original);


  //Hor
  __m128i row0 = _mm_hadd_epi16(diff_lo, diff_hi);
  __m128i row1 = _mm_hsub_epi16(diff_lo, diff_hi);

  __m128i row2 = _mm_hadd_epi16(row0, row1);
  __m128i row3 = _mm_hsub_epi16(row0, row1);

  //Ver
  row0 = _mm_hadd_epi16(row2, row3);
  row1 = _mm_hsub_epi16(row2, row3);

  row2 = _mm_hadd_epi16(row0, row1);
  row3 = _mm_hsub_epi16(row0, row1);

  //Abs and sum
  row2 = _mm_abs_epi16(row2);
  row3 = _mm_abs_epi16(row3);

  row3 = _mm_add_epi16(row2, row3);

  row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, KVZ_PERMUTE(2, 3, 0, 1) ));
  row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, KVZ_PERMUTE(1, 0, 1, 0) ));
  row3 = _mm_add_epi16(row3, _mm_shufflelo_epi16(row3, KVZ_PERMUTE(1, 0, 1, 0) ));

  unsigned sum = _mm_extract_epi16(row3, 0);
  unsigned satd = (sum + 1) >> 1;

  return satd;
}
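The hadd/hsub ladder above is a butterfly realization of the 4x4 Hadamard transform; the same ladder, duplicated across two 128-bit lanes, underlies the dual version in Example #3. A scalar reference for the SATD value it produces (a sketch; the intrinsic version's butterfly differs only by lane permutations and sign flips, which do not change the sum of absolute values):

// Scalar SATD of a 4x4 block of 16-bit differences, with the same
// (sum + 1) >> 1 normalization as above. Illustrative only.
static unsigned satd_4x4_model(const int16_t d[4][4]) {
  int m[4][4], t[4][4];
  for (int i = 0; i < 4; ++i) {  // horizontal 4-point Hadamard per row
    const int a0 = d[i][0] + d[i][2], a1 = d[i][1] + d[i][3];
    const int a2 = d[i][0] - d[i][2], a3 = d[i][1] - d[i][3];
    m[i][0] = a0 + a1; m[i][1] = a0 - a1;
    m[i][2] = a2 + a3; m[i][3] = a2 - a3;
  }
  for (int j = 0; j < 4; ++j) {  // vertical 4-point Hadamard per column
    const int a0 = m[0][j] + m[2][j], a1 = m[1][j] + m[3][j];
    const int a2 = m[0][j] - m[2][j], a3 = m[1][j] - m[3][j];
    t[0][j] = a0 + a1; t[1][j] = a0 - a1;
    t[2][j] = a2 + a3; t[3][j] = a2 - a3;
  }
  unsigned sum = 0;
  for (int i = 0; i < 4; ++i)
    for (int j = 0; j < 4; ++j)
      sum += (unsigned)(t[i][j] < 0 ? -t[i][j] : t[i][j]);
  return (sum + 1) >> 1;
}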
Example #7
size_t sse4_strstr_unrolled_len3(const char* s, size_t n, const char* needle) {

    const __m128i prefix = _mm_loadu_si128(reinterpret_cast<const __m128i*>(needle));
    const __m128i zeros  = _mm_setzero_si128();

    for (size_t i = 0; i < n; i += 8) {

        const __m128i data     = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i));
        const __m128i lastbyte = _mm_cvtepu8_epi16(_mm_srli_si128(data, 3));
        const __m128i result   = _mm_mpsadbw_epu8(data, prefix, 0);

        const __m128i cmp    = _mm_cmpeq_epi16(_mm_sub_epi16(result, lastbyte), zeros);

        unsigned mask = _mm_movemask_epi8(cmp) & 0x5555;

        if (mask != 0) {

            return i + bits::get_first_bit_set(mask)/2;
        }
    }

    return std::string::npos;
}
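How the zero test works: _mm_mpsadbw_epu8(data, prefix, 0) produces, for each offset j in 0..7, the sum of absolute differences between data[j..j+3] and the first four needle bytes. For a 3-byte needle the fourth prefix byte is the terminating '\0', so each SAD carries an extra data[j+3] term; subtracting lastbyte cancels it, and a zero lane marks a 3-byte match (bits::get_first_bit_set is the library's find-first-set helper). A scalar model of the per-offset test, offered as an illustrative assumption:

// Scalar model of one lane of the comparison above: 'p' points at the
// candidate position (s + i + j) and needle[3] == '\0'.
static int matches_len3_at(const unsigned char *p,
                           const unsigned char *needle) {
  unsigned sad = 0;
  for (int k = 0; k < 4; ++k)
    sad += (unsigned)(p[k] > needle[k] ? p[k] - needle[k]
                                       : needle[k] - p[k]);
  return sad - p[3] == 0;  // remove the |p[3] - '\0'| term
}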
Example #8
INLINE static __m128i diff_row_avx2(const kvz_pixel *buf1, const kvz_pixel *buf2)
{
  __m128i buf1_row = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)buf1));
  __m128i buf2_row = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)buf2));
  return _mm_sub_epi16(buf1_row, buf2_row);
}
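Scalar equivalent of the helper above (a sketch): widen eight bytes from each buffer and subtract.

// Eight 16-bit differences of zero-extended bytes.
static void diff_row_model(const uint8_t *buf1, const uint8_t *buf2,
                           int16_t out[8]) {
  for (int i = 0; i < 8; ++i)
    out[i] = (int16_t)((int)buf1[i] - (int)buf2[i]);
}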
Example #9
// Hadamard transform
// Returns the difference between the weighted sums of the absolute values of
// the transformed coefficients of inA and inB.
static int TTransform(const uint8_t* inA, const uint8_t* inB,
                      const uint16_t* const w) {
    __m128i tmp_0, tmp_1, tmp_2, tmp_3;

    // Load, combine and transpose inputs.
    {
        const __m128i inA_0 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 0]);
        const __m128i inA_1 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 1]);
        const __m128i inA_2 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 2]);
        const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]);
        const __m128i inB_0 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 0]);
        const __m128i inB_1 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 1]);
        const __m128i inB_2 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 2]);
        const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]);

        // Combine inA and inB (we'll do two transforms in parallel).
        const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0);
        const __m128i inAB_1 = _mm_unpacklo_epi8(inA_1, inB_1);
        const __m128i inAB_2 = _mm_unpacklo_epi8(inA_2, inB_2);
        const __m128i inAB_3 = _mm_unpacklo_epi8(inA_3, inB_3);
        // a00 b00 a01 b01 a02 b02 a03 b03   0 0 0 0 0 0 0 0
        // a10 b10 a11 b11 a12 b12 a13 b13   0 0 0 0 0 0 0 0
        // a20 b20 a21 b21 a22 b22 a23 b23   0 0 0 0 0 0 0 0
        // a30 b30 a31 b31 a32 b32 a33 b33   0 0 0 0 0 0 0 0

        // Transpose the two 4x4, discarding the filling zeroes.
        const __m128i transpose0_0 = _mm_unpacklo_epi8(inAB_0, inAB_2);
        const __m128i transpose0_1 = _mm_unpacklo_epi8(inAB_1, inAB_3);
        // a00 a20  b00 b20  a01 a21  b01 b21  a02 a22  b02 b22  a03 a23  b03 b23
        // a10 a30  b10 b30  a11 a31  b11 b31  a12 a32  b12 b32  a13 a33  b13 b33
        const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);
        const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);
        // a00 a10 a20 a30  b00 b10 b20 b30  a01 a11 a21 a31  b01 b11 b21 b31
        // a02 a12 a22 a32  b02 b12 b22 b32  a03 a13 a23 a33  b03 b13 b23 b33

        // Convert to 16b.
        tmp_0 = _mm_cvtepu8_epi16(transpose1_0);
        tmp_1 = _mm_cvtepu8_epi16(_mm_srli_si128(transpose1_0, 8));
        tmp_2 = _mm_cvtepu8_epi16(transpose1_1);
        tmp_3 = _mm_cvtepu8_epi16(_mm_srli_si128(transpose1_1, 8));
        // a00 a10 a20 a30   b00 b10 b20 b30
        // a01 a11 a21 a31   b01 b11 b21 b31
        // a02 a12 a22 a32   b02 b12 b22 b32
        // a03 a13 a23 a33   b03 b13 b23 b33
    }

    // Horizontal pass and subsequent transpose.
    {
        // Calculate a and b (two 4x4 at once).
        const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
        const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
        const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
        const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
        const __m128i b0 = _mm_add_epi16(a0, a1);
        const __m128i b1 = _mm_add_epi16(a3, a2);
        const __m128i b2 = _mm_sub_epi16(a3, a2);
        const __m128i b3 = _mm_sub_epi16(a0, a1);
        // a00 a01 a02 a03   b00 b01 b02 b03
        // a10 a11 a12 a13   b10 b11 b12 b13
        // a20 a21 a22 a23   b20 b21 b22 b23
        // a30 a31 a32 a33   b30 b31 b32 b33

        // Transpose the two 4x4.
        const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1);
        const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3);
        const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1);
        const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3);
        // a00 a10 a01 a11   a02 a12 a03 a13
        // a20 a30 a21 a31   a22 a32 a23 a33
        // b00 b10 b01 b11   b02 b12 b03 b13
        // b20 b30 b21 b31   b22 b32 b23 b33
        const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
        const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
        const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
        const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
        // a00 a10 a20 a30 a01 a11 a21 a31
        // b00 b10 b20 b30 b01 b11 b21 b31
        // a02 a12 a22 a32 a03 a13 a23 a33
        // b02 b12 b22 b32 b03 b13 b23 b33
        tmp_0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
        tmp_1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
        tmp_2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
        tmp_3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
        // a00 a10 a20 a30   b00 b10 b20 b30
        // a01 a11 a21 a31   b01 b11 b21 b31
        // a02 a12 a22 a32   b02 b12 b22 b32
        // a03 a13 a23 a33   b03 b13 b23 b33
    }

    // Vertical pass and difference of weighted sums.
    {
        // Load all inputs.
        const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]);
        const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]);

        // Calculate a and b (two 4x4 at once).
        const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
        const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
        const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
        const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
        const __m128i b0 = _mm_add_epi16(a0, a1);
        const __m128i b1 = _mm_add_epi16(a3, a2);
        const __m128i b2 = _mm_sub_epi16(a3, a2);
        const __m128i b3 = _mm_sub_epi16(a0, a1);

        // Separate the transforms of inA and inB.
        __m128i A_b0 = _mm_unpacklo_epi64(b0, b1);
        __m128i A_b2 = _mm_unpacklo_epi64(b2, b3);
        __m128i B_b0 = _mm_unpackhi_epi64(b0, b1);
        __m128i B_b2 = _mm_unpackhi_epi64(b2, b3);

        A_b0 = _mm_abs_epi16(A_b0);
        A_b2 = _mm_abs_epi16(A_b2);
        B_b0 = _mm_abs_epi16(B_b0);
        B_b2 = _mm_abs_epi16(B_b2);

        // weighted sums
        A_b0 = _mm_madd_epi16(A_b0, w_0);
        A_b2 = _mm_madd_epi16(A_b2, w_8);
        B_b0 = _mm_madd_epi16(B_b0, w_0);
        B_b2 = _mm_madd_epi16(B_b2, w_8);
        A_b0 = _mm_add_epi32(A_b0, A_b2);
        B_b0 = _mm_add_epi32(B_b0, B_b2);

        // difference of weighted sums
        A_b2 = _mm_sub_epi32(A_b0, B_b0);
        // cascading summation of the differences
        B_b0 = _mm_hadd_epi32(A_b2, A_b2);
        B_b2 = _mm_hadd_epi32(B_b0, B_b0);
        return _mm_cvtsi128_si32(B_b2);
    }
}
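Examples #9 and #10 compute the same quantity; Example #10 merely runs the vertical pass first, which is legal because w[] is symmetric. A scalar sketch of one side of the distortion, with BPS as the source stride; the exact coefficient-to-weight pairing below is an assumption (w's symmetry makes row/column orientation irrelevant):

// Weighted sum of the absolute 4x4 Hadamard coefficients of one block.
// TTransform(inA, inB, w) then corresponds to
//   ttransform_model(inA, w) - ttransform_model(inB, w).
static int ttransform_model(const uint8_t *in, const uint16_t *w) {
  int tmp[16];
  for (int i = 0; i < 4; ++i) {  // horizontal pass, same butterfly as above
    const uint8_t *row = &in[BPS * i];
    const int a0 = row[0] + row[2], a1 = row[1] + row[3];
    const int a2 = row[1] - row[3], a3 = row[0] - row[2];
    tmp[4 * i + 0] = a0 + a1;
    tmp[4 * i + 1] = a3 + a2;
    tmp[4 * i + 2] = a3 - a2;
    tmp[4 * i + 3] = a0 - a1;
  }
  int sum = 0;
  for (int j = 0; j < 4; ++j) {  // vertical pass plus weighted accumulation
    const int a0 = tmp[0 + j] + tmp[8 + j], a1 = tmp[4 + j] + tmp[12 + j];
    const int a2 = tmp[4 + j] - tmp[12 + j], a3 = tmp[0 + j] - tmp[8 + j];
    const int b0 = a0 + a1, b1 = a3 + a2, b2 = a3 - a2, b3 = a0 - a1;
    sum += w[0 + j] * (b0 < 0 ? -b0 : b0);
    sum += w[4 + j] * (b1 < 0 ? -b1 : b1);
    sum += w[8 + j] * (b2 < 0 ? -b2 : b2);
    sum += w[12 + j] * (b3 < 0 ? -b3 : b3);
  }
  return sum;
}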
Example #10
// Hadamard transform
// Returns the difference between the weighted sums of the absolute values of
// the transformed coefficients of inA and inB.
// w[] contains a row-major 4 by 4 symmetric matrix.
static int TTransform_SSE41(const uint8_t* inA, const uint8_t* inB,
                            const uint16_t* const w) {
  int32_t sum[4];
  __m128i tmp_0, tmp_1, tmp_2, tmp_3;

  // Load and combine inputs.
  {
    const __m128i inA_0 = _mm_loadu_si128((const __m128i*)&inA[BPS * 0]);
    const __m128i inA_1 = _mm_loadu_si128((const __m128i*)&inA[BPS * 1]);
    const __m128i inA_2 = _mm_loadu_si128((const __m128i*)&inA[BPS * 2]);
    // In SSE4.1, with gcc 4.8 at least (maybe other versions),
    // _mm_loadu_si128 is faster than _mm_loadl_epi64. But for the last lump
    // of inA and inB, _mm_loadl_epi64 is still used to avoid an
    // out-of-bounds read.
    const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]);
    const __m128i inB_0 = _mm_loadu_si128((const __m128i*)&inB[BPS * 0]);
    const __m128i inB_1 = _mm_loadu_si128((const __m128i*)&inB[BPS * 1]);
    const __m128i inB_2 = _mm_loadu_si128((const __m128i*)&inB[BPS * 2]);
    const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]);

    // Combine inA and inB (we'll do two transforms in parallel).
    const __m128i inAB_0 = _mm_unpacklo_epi32(inA_0, inB_0);
    const __m128i inAB_1 = _mm_unpacklo_epi32(inA_1, inB_1);
    const __m128i inAB_2 = _mm_unpacklo_epi32(inA_2, inB_2);
    const __m128i inAB_3 = _mm_unpacklo_epi32(inA_3, inB_3);
    tmp_0 = _mm_cvtepu8_epi16(inAB_0);
    tmp_1 = _mm_cvtepu8_epi16(inAB_1);
    tmp_2 = _mm_cvtepu8_epi16(inAB_2);
    tmp_3 = _mm_cvtepu8_epi16(inAB_3);
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33
  }

  // Vertical pass first to avoid a transpose (vertical and horizontal passes
  // are commutative because w/kWeightY is symmetric) and subsequent transpose.
  {
    // Calculate a and b (two 4x4 at once).
    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
    const __m128i b0 = _mm_add_epi16(a0, a1);
    const __m128i b1 = _mm_add_epi16(a3, a2);
    const __m128i b2 = _mm_sub_epi16(a3, a2);
    const __m128i b3 = _mm_sub_epi16(a0, a1);
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33

    // Transpose the two 4x4.
    VP8Transpose_2_4x4_16b(&b0, &b1, &b2, &b3, &tmp_0, &tmp_1, &tmp_2, &tmp_3);
  }

  // Horizontal pass and difference of weighted sums.
  {
    // Load all inputs.
    const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]);
    const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]);

    // Calculate a and b (two 4x4 at once).
    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
    const __m128i b0 = _mm_add_epi16(a0, a1);
    const __m128i b1 = _mm_add_epi16(a3, a2);
    const __m128i b2 = _mm_sub_epi16(a3, a2);
    const __m128i b3 = _mm_sub_epi16(a0, a1);

    // Separate the transforms of inA and inB.
    __m128i A_b0 = _mm_unpacklo_epi64(b0, b1);
    __m128i A_b2 = _mm_unpacklo_epi64(b2, b3);
    __m128i B_b0 = _mm_unpackhi_epi64(b0, b1);
    __m128i B_b2 = _mm_unpackhi_epi64(b2, b3);

    A_b0 = _mm_abs_epi16(A_b0);
    A_b2 = _mm_abs_epi16(A_b2);
    B_b0 = _mm_abs_epi16(B_b0);
    B_b2 = _mm_abs_epi16(B_b2);

    // weighted sums
    A_b0 = _mm_madd_epi16(A_b0, w_0);
    A_b2 = _mm_madd_epi16(A_b2, w_8);
    B_b0 = _mm_madd_epi16(B_b0, w_0);
    B_b2 = _mm_madd_epi16(B_b2, w_8);
    A_b0 = _mm_add_epi32(A_b0, A_b2);
    B_b0 = _mm_add_epi32(B_b0, B_b2);

    // difference of weighted sums
    A_b2 = _mm_sub_epi32(A_b0, B_b0);
    _mm_storeu_si128((__m128i*)&sum[0], A_b2);
  }
  return sum[0] + sum[1] + sum[2] + sum[3];
}
Example #11
__m128i test_mm_cvtepu8_epi16(__m128i a) {
  // CHECK-LABEL: test_mm_cvtepu8_epi16
  // CHECK: call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> {{.*}})
  // CHECK-ASM: pmovzxbw %xmm{{.*}}, %xmm{{.*}}
  return _mm_cvtepu8_epi16(a);
}
Example #12
__m128i test_mm_cvtepu8_epi16(__m128i a) {
  // CHECK-LABEL: test_mm_cvtepu8_epi16
  // CHECK: shufflevector <16 x i8> {{.*}}, <16 x i8> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  // CHECK: zext <8 x i8> {{.*}} to <8 x i16>
  return _mm_cvtepu8_epi16(a);
}
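Both tests pin down the same semantics from different angles: with SSE4.1 enabled the intrinsic lowers to a single pmovzxbw, and in the generic lowering it becomes a shuffle of the low eight bytes followed by a zext. As a scalar model (a sketch):

// Scalar model of _mm_cvtepu8_epi16: zero-extend the low 8 bytes of the
// 16-byte source to eight 16-bit lanes; the high 8 bytes are ignored.
static void cvtepu8_epi16_model(const uint8_t src[16], uint16_t dst[8]) {
  for (int i = 0; i < 8; ++i) dst[i] = (uint16_t)src[i];
}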