// Load 16 values each from 'a' and 'b'. Compute the difference squared and
// sum neighboring values as described for sum_8() below. The results are
// returned in sum_0 and sum_1 as *unsigned* 16 bit values.
static void sum_16(const uint8_t *a, const uint8_t *b, __m128i *sum_0,
                   __m128i *sum_1) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i a_u8 = _mm_loadu_si128((const __m128i *)a);
  const __m128i b_u8 = _mm_loadu_si128((const __m128i *)b);

  const __m128i a_0_u16 = _mm_cvtepu8_epi16(a_u8);
  const __m128i a_1_u16 = _mm_unpackhi_epi8(a_u8, zero);
  const __m128i b_0_u16 = _mm_cvtepu8_epi16(b_u8);
  const __m128i b_1_u16 = _mm_unpackhi_epi8(b_u8, zero);

  const __m128i diff_0_s16 = _mm_sub_epi16(a_0_u16, b_0_u16);
  const __m128i diff_1_s16 = _mm_sub_epi16(a_1_u16, b_1_u16);
  const __m128i diff_sq_0_u16 = _mm_mullo_epi16(diff_0_s16, diff_0_s16);
  const __m128i diff_sq_1_u16 = _mm_mullo_epi16(diff_1_s16, diff_1_s16);

  __m128i shift_left = _mm_slli_si128(diff_sq_0_u16, 2);
  // Use _mm_alignr_epi8() to "shift in" diff_sq_u16[8].
  __m128i shift_right = _mm_alignr_epi8(diff_sq_1_u16, diff_sq_0_u16, 2);

  __m128i sum_u16 = _mm_adds_epu16(diff_sq_0_u16, shift_left);
  sum_u16 = _mm_adds_epu16(sum_u16, shift_right);

  *sum_0 = sum_u16;

  shift_left = _mm_alignr_epi8(diff_sq_1_u16, diff_sq_0_u16, 14);
  shift_right = _mm_srli_si128(diff_sq_1_u16, 2);

  sum_u16 = _mm_adds_epu16(diff_sq_1_u16, shift_left);
  sum_u16 = _mm_adds_epu16(sum_u16, shift_right);

  *sum_1 = sum_u16;
}
// Load values from 'a' and 'b'. Compute the difference squared and sum
// neighboring values such that:
// sum[1] = (a[0]-b[0])^2 + (a[1]-b[1])^2 + (a[2]-b[2])^2
// Values to the left and right of the row are set to 0.
// The values are returned in 'sum' as *unsigned* 16 bit values.
static void sum_8(const uint8_t *a, const uint8_t *b, __m128i *sum) {
  const __m128i a_u8 = _mm_loadl_epi64((const __m128i *)a);
  const __m128i b_u8 = _mm_loadl_epi64((const __m128i *)b);

  const __m128i a_u16 = _mm_cvtepu8_epi16(a_u8);
  const __m128i b_u16 = _mm_cvtepu8_epi16(b_u8);

  const __m128i diff_s16 = _mm_sub_epi16(a_u16, b_u16);
  const __m128i diff_sq_u16 = _mm_mullo_epi16(diff_s16, diff_s16);

  // Shift all the values one place to the left/right so we can efficiently sum
  // diff_sq_u16[i - 1] + diff_sq_u16[i] + diff_sq_u16[i + 1].
  const __m128i shift_left = _mm_slli_si128(diff_sq_u16, 2);
  const __m128i shift_right = _mm_srli_si128(diff_sq_u16, 2);

  // It becomes necessary to treat the values as unsigned at this point. A
  // value of 255^2 fits in uint16_t but not int16_t. Use saturating adds from
  // this point forward since the filter is only applied to smooth small pixel
  // changes. Once the value has saturated to uint16_t it is well outside the
  // useful range.
  __m128i sum_u16 = _mm_adds_epu16(diff_sq_u16, shift_left);
  sum_u16 = _mm_adds_epu16(sum_u16, shift_right);

  *sum = sum_u16;
}
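For reference, here is a minimal scalar sketch of the same neighbour sum of squared differences that sum_8() computes. It is illustrative only, not taken from the original source; the function name is made up, it assumes 8 valid pixels in 'a' and 'b', and it mimics the saturating 16-bit adds above by clamping the total.

// Illustrative scalar equivalent of sum_8() (hypothetical helper, not from
// the original source). Out-of-row neighbours contribute 0, and the result is
// saturated to 16 bits like _mm_adds_epu16.
static void sum_8_scalar_sketch(const uint8_t *a, const uint8_t *b,
                                uint16_t sum[8]) {
  uint32_t diff_sq[8];
  int i;
  for (i = 0; i < 8; ++i) {
    const int d = a[i] - b[i];
    diff_sq[i] = (uint32_t)(d * d);
  }
  for (i = 0; i < 8; ++i) {
    uint32_t s = diff_sq[i];
    if (i > 0) s += diff_sq[i - 1];
    if (i < 7) s += diff_sq[i + 1];
    sum[i] = (s > 65535) ? 65535 : (uint16_t)s;  // saturate, as the SIMD adds do
  }
}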
static void satd_8bit_4x4_dual_avx2(
    const pred_buffer preds, const kvz_pixel * const orig,
    unsigned num_modes, unsigned *satds_out)
{
  __m256i original = _mm256_broadcastsi128_si256(_mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)orig)));
  __m256i pred = _mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)preds[0]));
  pred = _mm256_inserti128_si256(pred, _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)preds[1])), 1);

  __m256i diff_lo = _mm256_sub_epi16(pred, original);

  original = _mm256_broadcastsi128_si256(_mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(orig + 8))));
  pred = _mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(preds[0] + 8)));
  pred = _mm256_inserti128_si256(pred, _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(preds[1] + 8))), 1);

  __m256i diff_hi = _mm256_sub_epi16(pred, original);

  //Hor
  __m256i row0 = _mm256_hadd_epi16(diff_lo, diff_hi);
  __m256i row1 = _mm256_hsub_epi16(diff_lo, diff_hi);

  __m256i row2 = _mm256_hadd_epi16(row0, row1);
  __m256i row3 = _mm256_hsub_epi16(row0, row1);

  //Ver
  row0 = _mm256_hadd_epi16(row2, row3);
  row1 = _mm256_hsub_epi16(row2, row3);

  row2 = _mm256_hadd_epi16(row0, row1);
  row3 = _mm256_hsub_epi16(row0, row1);

  //Abs and sum
  row2 = _mm256_abs_epi16(row2);
  row3 = _mm256_abs_epi16(row3);

  row3 = _mm256_add_epi16(row2, row3);

  row3 = _mm256_add_epi16(row3, _mm256_shuffle_epi32(row3, KVZ_PERMUTE(2, 3, 0, 1)));
  row3 = _mm256_add_epi16(row3, _mm256_shuffle_epi32(row3, KVZ_PERMUTE(1, 0, 1, 0)));
  row3 = _mm256_add_epi16(row3, _mm256_shufflelo_epi16(row3, KVZ_PERMUTE(1, 0, 1, 0)));

  unsigned sum1 = _mm_extract_epi16(_mm256_castsi256_si128(row3), 0);
  sum1 = (sum1 + 1) >> 1;

  unsigned sum2 = _mm_extract_epi16(_mm256_extracti128_si256(row3, 1), 0);
  sum2 = (sum2 + 1) >> 1;

  satds_out[0] = sum1;
  satds_out[1] = sum2;
}
static WEBP_INLINE __m128i SubtractAndAccumulate(const __m128i a,
                                                 const __m128i b) {
  // take abs(a-b) in 8b
  const __m128i a_b = _mm_subs_epu8(a, b);
  const __m128i b_a = _mm_subs_epu8(b, a);
  const __m128i abs_a_b = _mm_or_si128(a_b, b_a);
  // zero-extend to 16b
  const __m128i C0 = _mm_cvtepu8_epi16(abs_a_b);
  const __m128i C1 = _mm_cvtepu8_epi16(_mm_srli_si128(abs_a_b, 8));
  // multiply with self
  const __m128i D0 = _mm_madd_epi16(C0, C0);
  const __m128i D1 = _mm_madd_epi16(C1, C1);
  // accumulate
  const __m128i sum = _mm_add_epi32(D0, D1);
  return sum;
}
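Because _mm_madd_epi16 pairs adjacent 16-bit products into 32-bit lanes, the routine above returns four partial sums that the caller still reduces horizontally. The fully reduced value is just the sum of squared differences over the 16 bytes, as in this scalar sketch (illustrative only; the function name is invented, not part of the library):

// Scalar equivalent of a fully reduced SubtractAndAccumulate() result
// (hypothetical helper for illustration).
static uint32_t subtract_and_accumulate_scalar(const uint8_t a[16],
                                               const uint8_t b[16]) {
  uint32_t sum = 0;
  int i;
  for (i = 0; i < 16; ++i) {
    const int d = a[i] - b[i];        // |a-b| squared equals (a-b) squared
    sum += (uint32_t)(d * d);
  }
  return sum;
}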
// Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator'.
static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred,
                                   uint16_t *count, uint32_t *accumulator) {
  const __m128i pred_u8 = _mm_loadl_epi64((const __m128i *)pred);
  const __m128i zero = _mm_setzero_si128();
  __m128i count_u16 = _mm_loadu_si128((const __m128i *)count);
  __m128i pred_u16 = _mm_cvtepu8_epi16(pred_u8);
  __m128i pred_0_u32, pred_1_u32;
  __m128i accum_0_u32, accum_1_u32;

  count_u16 = _mm_adds_epu16(count_u16, sum_u16);
  _mm_storeu_si128((__m128i *)count, count_u16);

  pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16);

  pred_0_u32 = _mm_cvtepu16_epi32(pred_u16);
  pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero);

  accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
  accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));

  accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
  accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);

  _mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
  _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
}
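A rough scalar sketch of the same update, assuming the weights in 'sum_u16' have already been unpacked into a uint16_t array (the helper name and that unpacking are illustrative assumptions, not part of the original source):

// Hypothetical scalar equivalent of accumulate_and_store_8().
static void accumulate_and_store_8_scalar(const uint16_t sum_u16[8],
                                          const uint8_t *pred,
                                          uint16_t *count,
                                          uint32_t *accumulator) {
  int i;
  for (i = 0; i < 8; ++i) {
    const uint32_t c = (uint32_t)count[i] + sum_u16[i];
    count[i] = (c > 65535) ? 65535 : (uint16_t)c;  // saturating, like _mm_adds_epu16
    // _mm_mullo_epi16 keeps only the low 16 bits of the product.
    accumulator[i] += (uint32_t)(uint16_t)(sum_u16[i] * pred[i]);
  }
}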
static unsigned satd_8bit_4x4_avx2(const kvz_pixel *org, const kvz_pixel *cur)
{
  __m128i original = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)org));
  __m128i current = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)cur));

  __m128i diff_lo = _mm_sub_epi16(current, original);

  original = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(org + 8)));
  current = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(cur + 8)));

  __m128i diff_hi = _mm_sub_epi16(current, original);

  //Hor
  __m128i row0 = _mm_hadd_epi16(diff_lo, diff_hi);
  __m128i row1 = _mm_hsub_epi16(diff_lo, diff_hi);

  __m128i row2 = _mm_hadd_epi16(row0, row1);
  __m128i row3 = _mm_hsub_epi16(row0, row1);

  //Ver
  row0 = _mm_hadd_epi16(row2, row3);
  row1 = _mm_hsub_epi16(row2, row3);

  row2 = _mm_hadd_epi16(row0, row1);
  row3 = _mm_hsub_epi16(row0, row1);

  //Abs and sum
  row2 = _mm_abs_epi16(row2);
  row3 = _mm_abs_epi16(row3);

  row3 = _mm_add_epi16(row2, row3);

  row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, KVZ_PERMUTE(2, 3, 0, 1)));
  row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, KVZ_PERMUTE(1, 0, 1, 0)));
  row3 = _mm_add_epi16(row3, _mm_shufflelo_epi16(row3, KVZ_PERMUTE(1, 0, 1, 0)));

  unsigned sum = _mm_extract_epi16(row3, 0);
  unsigned satd = (sum + 1) >> 1;

  return satd;
}
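The hadd/hsub cascade above is a 4x4 Hadamard transform of the pixel differences; the SATD is the rounded half-sum of the absolute transformed values. A plain-C sketch of the same quantity is shown below. It is illustrative only (not kvazaar's fallback path), assumes kvz_pixel is an 8-bit type, and assumes the 4x4 block is stored contiguously as 16 pixels, as in the SIMD loads above.

// Hypothetical scalar 4x4 SATD matching the SIMD result above.
static unsigned satd_4x4_scalar_sketch(const uint8_t *org, const uint8_t *cur)
{
  int diff[16], m[16];
  unsigned sum = 0;
  int i, r, c;
  for (i = 0; i < 16; ++i) diff[i] = cur[i] - org[i];

  // Horizontal (per-row) 4-point Hadamard.
  for (r = 0; r < 4; ++r) {
    const int *d = diff + 4 * r;
    const int a0 = d[0] + d[1], a1 = d[0] - d[1];
    const int a2 = d[2] + d[3], a3 = d[2] - d[3];
    m[4 * r + 0] = a0 + a2;
    m[4 * r + 1] = a1 + a3;
    m[4 * r + 2] = a0 - a2;
    m[4 * r + 3] = a1 - a3;
  }
  // Vertical (per-column) 4-point Hadamard, then sum of absolute values.
  for (c = 0; c < 4; ++c) {
    const int a0 = m[c] + m[c + 4], a1 = m[c] - m[c + 4];
    const int a2 = m[c + 8] + m[c + 12], a3 = m[c + 8] - m[c + 12];
    const int v0 = a0 + a2, v1 = a1 + a3, v2 = a0 - a2, v3 = a1 - a3;
    sum += (unsigned)((v0 < 0 ? -v0 : v0) + (v1 < 0 ? -v1 : v1) +
                      (v2 < 0 ? -v2 : v2) + (v3 < 0 ? -v3 : v3));
  }
  return (sum + 1) >> 1;
}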
size_t sse4_strstr_unrolled_len3(const char* s, size_t n, const char* needle) {

    const __m128i prefix = _mm_loadu_si128(reinterpret_cast<const __m128i*>(needle));
    const __m128i zeros  = _mm_setzero_si128();

    for (size_t i = 0; i < n; i += 8) {

        const __m128i data     = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i));
        const __m128i lastbyte = _mm_cvtepu8_epi16(_mm_srli_si128(data, 3));
        const __m128i result   = _mm_mpsadbw_epu8(data, prefix, 0);

        const __m128i cmp = _mm_cmpeq_epi16(_mm_sub_epi16(result, lastbyte), zeros);

        unsigned mask = _mm_movemask_epi8(cmp) & 0x5555;

        if (mask != 0) {

            return i + bits::get_first_bit_set(mask)/2;
        }
    }

    return std::string::npos;
}
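For each offset k in 0..7, _mm_mpsadbw_epu8 with mode 0 yields the sum |s[i+k]-needle[0]| + |s[i+k+1]-needle[1]| + |s[i+k+2]-needle[2]| + |s[i+k+3]-needle[3]|. With a needle of exactly 3 characters, needle[3] is the terminating '\0', so the last term is simply s[i+k+3]; subtracting 'lastbyte' (the bytes at offset 3 onward, zero-extended by _mm_cvtepu8_epi16) cancels it, and a zero lane then means an exact 3-byte match at offset k. A scalar reference with the same result, illustrative only (the function name is invented, and (size_t)-1 stands in for std::string::npos):

size_t scalar_strstr_len3(const char* s, size_t n, const char* needle) {
    size_t i;
    for (i = 0; i + 3 <= n; i++) {
        if (s[i] == needle[0] && s[i + 1] == needle[1] && s[i + 2] == needle[2]) {
            return i;
        }
    }
    return (size_t)-1;  // same value as std::string::npos
}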
INLINE static __m128i diff_row_avx2(const kvz_pixel *buf1, const kvz_pixel *buf2)
{
  __m128i buf1_row = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)buf1));
  __m128i buf2_row = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)buf2));
  return _mm_sub_epi16(buf1_row, buf2_row);
}
// Hadamard transform
// Returns the difference between the weighted sum of the absolute value of
// transformed coefficients.
static int TTransform(const uint8_t* inA, const uint8_t* inB,
                      const uint16_t* const w) {
  __m128i tmp_0, tmp_1, tmp_2, tmp_3;

  // Load, combine and transpose inputs.
  {
    const __m128i inA_0 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 0]);
    const __m128i inA_1 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 1]);
    const __m128i inA_2 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 2]);
    const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]);
    const __m128i inB_0 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 0]);
    const __m128i inB_1 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 1]);
    const __m128i inB_2 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 2]);
    const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]);

    // Combine inA and inB (we'll do two transforms in parallel).
    const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0);
    const __m128i inAB_1 = _mm_unpacklo_epi8(inA_1, inB_1);
    const __m128i inAB_2 = _mm_unpacklo_epi8(inA_2, inB_2);
    const __m128i inAB_3 = _mm_unpacklo_epi8(inA_3, inB_3);
    // a00 b00 a01 b01 a02 b02 a03 b03   0 0 0 0 0 0 0 0
    // a10 b10 a11 b11 a12 b12 a13 b13   0 0 0 0 0 0 0 0
    // a20 b20 a21 b21 a22 b22 a23 b23   0 0 0 0 0 0 0 0
    // a30 b30 a31 b31 a32 b32 a33 b33   0 0 0 0 0 0 0 0

    // Transpose the two 4x4, discarding the filling zeroes.
    const __m128i transpose0_0 = _mm_unpacklo_epi8(inAB_0, inAB_2);
    const __m128i transpose0_1 = _mm_unpacklo_epi8(inAB_1, inAB_3);
    // a00 a20 b00 b20 a01 a21 b01 b21 a02 a22 b02 b22 a03 a23 b03 b23
    // a10 a30 b10 b30 a11 a31 b11 b31 a12 a32 b12 b32 a13 a33 b13 b33
    const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);
    const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);
    // a00 a10 a20 a30 b00 b10 b20 b30 a01 a11 a21 a31 b01 b11 b21 b31
    // a02 a12 a22 a32 b02 b12 b22 b32 a03 a13 a23 a33 b03 b13 b23 b33

    // Convert to 16b.
    tmp_0 = _mm_cvtepu8_epi16(transpose1_0);
    tmp_1 = _mm_cvtepu8_epi16(_mm_srli_si128(transpose1_0, 8));
    tmp_2 = _mm_cvtepu8_epi16(transpose1_1);
    tmp_3 = _mm_cvtepu8_epi16(_mm_srli_si128(transpose1_1, 8));
    // a00 a10 a20 a30   b00 b10 b20 b30
    // a01 a11 a21 a31   b01 b11 b21 b31
    // a02 a12 a22 a32   b02 b12 b22 b32
    // a03 a13 a23 a33   b03 b13 b23 b33
  }

  // Horizontal pass and subsequent transpose.
  {
    // Calculate a and b (two 4x4 at once).
    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
    const __m128i b0 = _mm_add_epi16(a0, a1);
    const __m128i b1 = _mm_add_epi16(a3, a2);
    const __m128i b2 = _mm_sub_epi16(a3, a2);
    const __m128i b3 = _mm_sub_epi16(a0, a1);
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33

    // Transpose the two 4x4.
    const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1);
    const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3);
    const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1);
    const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3);
    // a00 a10 a01 a11   a02 a12 a03 a13
    // a20 a30 a21 a31   a22 a32 a23 a33
    // b00 b10 b01 b11   b02 b12 b03 b13
    // b20 b30 b21 b31   b22 b32 b23 b33
    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
    // a00 a10 a20 a30   a01 a11 a21 a31
    // b00 b10 b20 b30   b01 b11 b21 b31
    // a02 a12 a22 a32   a03 a13 a23 a33
    // b02 b12 b22 b32   b03 b13 b23 b33
    tmp_0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
    tmp_1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
    tmp_2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
    tmp_3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
    // a00 a10 a20 a30   b00 b10 b20 b30
    // a01 a11 a21 a31   b01 b11 b21 b31
    // a02 a12 a22 a32   b02 b12 b22 b32
    // a03 a13 a23 a33   b03 b13 b23 b33
  }

  // Vertical pass and difference of weighted sums.
  {
    // Load all inputs.
    const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]);
    const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]);

    // Calculate a and b (two 4x4 at once).
    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
    const __m128i b0 = _mm_add_epi16(a0, a1);
    const __m128i b1 = _mm_add_epi16(a3, a2);
    const __m128i b2 = _mm_sub_epi16(a3, a2);
    const __m128i b3 = _mm_sub_epi16(a0, a1);

    // Separate the transforms of inA and inB.
    __m128i A_b0 = _mm_unpacklo_epi64(b0, b1);
    __m128i A_b2 = _mm_unpacklo_epi64(b2, b3);
    __m128i B_b0 = _mm_unpackhi_epi64(b0, b1);
    __m128i B_b2 = _mm_unpackhi_epi64(b2, b3);

    A_b0 = _mm_abs_epi16(A_b0);
    A_b2 = _mm_abs_epi16(A_b2);
    B_b0 = _mm_abs_epi16(B_b0);
    B_b2 = _mm_abs_epi16(B_b2);

    // weighted sums
    A_b0 = _mm_madd_epi16(A_b0, w_0);
    A_b2 = _mm_madd_epi16(A_b2, w_8);
    B_b0 = _mm_madd_epi16(B_b0, w_0);
    B_b2 = _mm_madd_epi16(B_b2, w_8);
    A_b0 = _mm_add_epi32(A_b0, A_b2);
    B_b0 = _mm_add_epi32(B_b0, B_b2);

    // difference of weighted sums
    A_b2 = _mm_sub_epi32(A_b0, B_b0);
    // cascading summation of the differences
    B_b0 = _mm_hadd_epi32(A_b2, A_b2);
    B_b2 = _mm_hadd_epi32(B_b0, B_b0);
    return _mm_cvtsi128_si32(B_b2);
  }
}
// Hadamard transform
// Returns the difference between the weighted sums of the absolute values of
// the transformed coefficients of inA and inB.
// w[] contains a row-major 4 by 4 symmetric matrix.
static int TTransform_SSE41(const uint8_t* inA, const uint8_t* inB,
                            const uint16_t* const w) {
  int32_t sum[4];
  __m128i tmp_0, tmp_1, tmp_2, tmp_3;

  // Load and combine inputs.
  {
    const __m128i inA_0 = _mm_loadu_si128((const __m128i*)&inA[BPS * 0]);
    const __m128i inA_1 = _mm_loadu_si128((const __m128i*)&inA[BPS * 1]);
    const __m128i inA_2 = _mm_loadu_si128((const __m128i*)&inA[BPS * 2]);
    // In SSE4.1, with gcc 4.8 at least (maybe other versions),
    // _mm_loadu_si128 is faster than _mm_loadl_epi64. But for the last lump
    // of inA and inB, _mm_loadl_epi64 is still used not to have an out of
    // bound read.
    const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]);
    const __m128i inB_0 = _mm_loadu_si128((const __m128i*)&inB[BPS * 0]);
    const __m128i inB_1 = _mm_loadu_si128((const __m128i*)&inB[BPS * 1]);
    const __m128i inB_2 = _mm_loadu_si128((const __m128i*)&inB[BPS * 2]);
    const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]);

    // Combine inA and inB (we'll do two transforms in parallel).
    const __m128i inAB_0 = _mm_unpacklo_epi32(inA_0, inB_0);
    const __m128i inAB_1 = _mm_unpacklo_epi32(inA_1, inB_1);
    const __m128i inAB_2 = _mm_unpacklo_epi32(inA_2, inB_2);
    const __m128i inAB_3 = _mm_unpacklo_epi32(inA_3, inB_3);
    tmp_0 = _mm_cvtepu8_epi16(inAB_0);
    tmp_1 = _mm_cvtepu8_epi16(inAB_1);
    tmp_2 = _mm_cvtepu8_epi16(inAB_2);
    tmp_3 = _mm_cvtepu8_epi16(inAB_3);
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33
  }

  // Vertical pass first to avoid a transpose (vertical and horizontal passes
  // are commutative because w/kWeightY is symmetric) and subsequent transpose.
  {
    // Calculate a and b (two 4x4 at once).
    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
    const __m128i b0 = _mm_add_epi16(a0, a1);
    const __m128i b1 = _mm_add_epi16(a3, a2);
    const __m128i b2 = _mm_sub_epi16(a3, a2);
    const __m128i b3 = _mm_sub_epi16(a0, a1);
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33

    // Transpose the two 4x4.
    VP8Transpose_2_4x4_16b(&b0, &b1, &b2, &b3, &tmp_0, &tmp_1, &tmp_2,
                           &tmp_3);
  }

  // Horizontal pass and difference of weighted sums.
  {
    // Load all inputs.
    const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]);
    const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]);

    // Calculate a and b (two 4x4 at once).
    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
    const __m128i b0 = _mm_add_epi16(a0, a1);
    const __m128i b1 = _mm_add_epi16(a3, a2);
    const __m128i b2 = _mm_sub_epi16(a3, a2);
    const __m128i b3 = _mm_sub_epi16(a0, a1);

    // Separate the transforms of inA and inB.
    __m128i A_b0 = _mm_unpacklo_epi64(b0, b1);
    __m128i A_b2 = _mm_unpacklo_epi64(b2, b3);
    __m128i B_b0 = _mm_unpackhi_epi64(b0, b1);
    __m128i B_b2 = _mm_unpackhi_epi64(b2, b3);

    A_b0 = _mm_abs_epi16(A_b0);
    A_b2 = _mm_abs_epi16(A_b2);
    B_b0 = _mm_abs_epi16(B_b0);
    B_b2 = _mm_abs_epi16(B_b2);

    // weighted sums
    A_b0 = _mm_madd_epi16(A_b0, w_0);
    A_b2 = _mm_madd_epi16(A_b2, w_8);
    B_b0 = _mm_madd_epi16(B_b0, w_0);
    B_b2 = _mm_madd_epi16(B_b2, w_8);
    A_b0 = _mm_add_epi32(A_b0, A_b2);
    B_b0 = _mm_add_epi32(B_b0, B_b2);

    // difference of weighted sums
    A_b2 = _mm_sub_epi32(A_b0, B_b0);
    _mm_storeu_si128((__m128i*)&sum[0], A_b2);
  }
  return sum[0] + sum[1] + sum[2] + sum[3];
}
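Both TTransform variants compute, for each input block, the weighted sum of the absolute values of its 4x4 Hadamard coefficients, and return the difference of those two sums. A scalar sketch of the per-block quantity is below; it is illustrative only (the helper name is invented, BPS is assumed to be the row stride used by the snippets above), so the SIMD results should match ttransform_one_block_sketch(inA, w) - ttransform_one_block_sketch(inB, w).

// Hypothetical scalar weighted Hadamard sum for a single 4x4 block.
static int ttransform_one_block_sketch(const uint8_t* in,
                                       const uint16_t* const w) {
  int tmp[16];
  int sum = 0;
  int r, c;
  // Vertical (per-column) butterfly.
  for (c = 0; c < 4; ++c) {
    const int a0 = in[c + BPS * 0] + in[c + BPS * 2];
    const int a1 = in[c + BPS * 1] + in[c + BPS * 3];
    const int a2 = in[c + BPS * 1] - in[c + BPS * 3];
    const int a3 = in[c + BPS * 0] - in[c + BPS * 2];
    tmp[c + 0]  = a0 + a1;
    tmp[c + 4]  = a3 + a2;
    tmp[c + 8]  = a3 - a2;
    tmp[c + 12] = a0 - a1;
  }
  // Horizontal (per-row) butterfly and weighted sum of absolute values.
  for (r = 0; r < 4; ++r) {
    const int* const t = tmp + 4 * r;
    const int a0 = t[0] + t[2];
    const int a1 = t[1] + t[3];
    const int a2 = t[1] - t[3];
    const int a3 = t[0] - t[2];
    const int b0 = a0 + a1, b1 = a3 + a2, b2 = a3 - a2, b3 = a0 - a1;
    sum += w[4 * r + 0] * (b0 < 0 ? -b0 : b0);
    sum += w[4 * r + 1] * (b1 < 0 ? -b1 : b1);
    sum += w[4 * r + 2] * (b2 < 0 ? -b2 : b2);
    sum += w[4 * r + 3] * (b3 < 0 ? -b3 : b3);
  }
  return sum;
}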
__m128i test_mm_cvtepu8_epi16(__m128i a) {
  // CHECK-LABEL: test_mm_cvtepu8_epi16
  // CHECK: call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> {{.*}})
  // CHECK-ASM: pmovzxbw %xmm{{.*}}, %xmm{{.*}}
  return _mm_cvtepu8_epi16(a);
}
__m128i test_mm_cvtepu8_epi16(__m128i a) {
  // CHECK-LABEL: test_mm_cvtepu8_epi16
  // CHECK: shufflevector <16 x i8> {{.*}}, <16 x i8> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  // CHECK: zext <8 x i8> {{.*}} to <8 x i16>
  return _mm_cvtepu8_epi16(a);
}
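For completeness, a minimal standalone program (not part of the test file above) showing what _mm_cvtepu8_epi16 does: the low 8 bytes of the source register are zero-extended into eight 16-bit lanes and the upper 8 bytes are ignored.

#include <stdio.h>
#include <stdint.h>
#include <smmintrin.h>  // SSE4.1

int main(void) {
  const uint8_t src[16] = { 0, 1, 2, 250, 251, 252, 253, 255,
                            9, 9, 9, 9, 9, 9, 9, 9 };
  const __m128i v = _mm_loadu_si128((const __m128i *)src);
  const __m128i widened = _mm_cvtepu8_epi16(v);  // pmovzxbw

  uint16_t out[8];
  _mm_storeu_si128((__m128i *)out, widened);
  for (int i = 0; i < 8; ++i) {
    printf("%u ", out[i]);  // prints: 0 1 2 250 251 252 253 255
  }
  printf("\n");
  return 0;
}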