static void satd_8bit_4x4_dual_avx2(const pred_buffer preds, const kvz_pixel * const orig, unsigned num_modes, unsigned *satds_out)
{
  // Widen the first 8 original pixels to 16 bits and broadcast them to both
  // 128-bit lanes; pair them with the first 8 pixels of predictions 0 and 1.
  __m256i original = _mm256_broadcastsi128_si256(_mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)orig)));
  __m256i pred = _mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)preds[0]));
  pred = _mm256_inserti128_si256(pred, _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)preds[1])), 1);
  __m256i diff_lo = _mm256_sub_epi16(pred, original);

  // Same for the remaining 8 pixels of the two 4x4 blocks.
  original = _mm256_broadcastsi128_si256(_mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(orig + 8))));
  pred = _mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(preds[0] + 8)));
  pred = _mm256_inserti128_si256(pred, _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(preds[1] + 8))), 1);
  __m256i diff_hi = _mm256_sub_epi16(pred, original);

  // Horizontal Hadamard pass via paired hadd/hsub.
  __m256i row0 = _mm256_hadd_epi16(diff_lo, diff_hi);
  __m256i row1 = _mm256_hsub_epi16(diff_lo, diff_hi);
  __m256i row2 = _mm256_hadd_epi16(row0, row1);
  __m256i row3 = _mm256_hsub_epi16(row0, row1);

  // Vertical Hadamard pass.
  row0 = _mm256_hadd_epi16(row2, row3);
  row1 = _mm256_hsub_epi16(row2, row3);
  row2 = _mm256_hadd_epi16(row0, row1);
  row3 = _mm256_hsub_epi16(row0, row1);

  // Absolute values, then reduce each lane to a single 16-bit sum.
  row2 = _mm256_abs_epi16(row2);
  row3 = _mm256_abs_epi16(row3);

  row3 = _mm256_add_epi16(row2, row3);
  row3 = _mm256_add_epi16(row3, _mm256_shuffle_epi32(row3, KVZ_PERMUTE(2, 3, 0, 1)));
  row3 = _mm256_add_epi16(row3, _mm256_shuffle_epi32(row3, KVZ_PERMUTE(1, 0, 1, 0)));
  row3 = _mm256_add_epi16(row3, _mm256_shufflelo_epi16(row3, KVZ_PERMUTE(1, 0, 1, 0)));

  // Low lane holds the SATD of preds[0], high lane that of preds[1].
  unsigned sum1 = _mm_extract_epi16(_mm256_castsi256_si128(row3), 0);
  sum1 = (sum1 + 1) >> 1;

  unsigned sum2 = _mm_extract_epi16(_mm256_extracti128_si256(row3, 1), 0);
  sum2 = (sum2 + 1) >> 1;

  satds_out[0] = sum1;
  satds_out[1] = sum2;
}
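For reference, here is a minimal scalar sketch of the 4x4 SATD that each lane of the AVX2 routine computes (the function name and layout assumption are this sketch's, not kvazaar's; it assumes the 4x4 blocks are stored contiguously row by row, as the 8-byte loads above imply). A 4-point Hadamard transform is applied to the prediction-minus-original differences in both directions, absolute values are summed, and the same (sum + 1) >> 1 normalization is applied at the end. The hadd/hsub pairs in the AVX2 version produce the same coefficients up to ordering, which the absolute-value sum does not depend on.

#include <stdlib.h>

static unsigned satd_4x4_scalar_ref(const unsigned char *orig, const unsigned char *pred)
{
  int d[4][4], t[4][4];
  for (int y = 0; y < 4; ++y)
    for (int x = 0; x < 4; ++x)
      d[y][x] = (int)pred[4 * y + x] - (int)orig[4 * y + x];

  // Horizontal 4-point Hadamard butterfly on each row.
  for (int y = 0; y < 4; ++y) {
    int a = d[y][0] + d[y][1], b = d[y][0] - d[y][1];
    int c = d[y][2] + d[y][3], e = d[y][2] - d[y][3];
    t[y][0] = a + c; t[y][1] = a - c; t[y][2] = b + e; t[y][3] = b - e;
  }

  // Vertical pass fused with the absolute-value sum.
  unsigned sum = 0;
  for (int x = 0; x < 4; ++x) {
    int a = t[0][x] + t[1][x], b = t[0][x] - t[1][x];
    int c = t[2][x] + t[3][x], e = t[2][x] - t[3][x];
    sum += abs(a + c) + abs(a - c) + abs(b + e) + abs(b - e);
  }
  return (sum + 1) >> 1;
}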
char *_base64_encode_avx2(char *out, const unsigned char *in, size_t n, int options)
{
    size_t i;
    size_t o = 0;
    const char (*alphabet)[2] = _base64_alphabet_precombined;
    if (options & Base64UseUrlAlphabet)
        alphabet = _base64url_alphabet_precombined;

    for (i = 0; n - i >= 48; i += 48) {
        // read 48 bytes and duplicate each 16-byte chunk in the high part of the register
        __m256i chunk1 = _mm256_broadcastsi128_si256(*(const __m128i *)&in[i+0]);
        __m256i chunk2 = _mm256_broadcastsi128_si256(*(const __m128i *)&in[i+16]);
        __m256i chunk3 = _mm256_broadcastsi128_si256(*(const __m128i *)&in[i+32]);

        // first chunk of 12 bytes
        do_encode_12bytes(alphabet, out + o, chunk1);
        o += 16;

        // second chunk: 4 bytes left in chunk1
        do_encode_12bytes(alphabet, out + o, _mm256_alignr_epi8(chunk2, chunk1, 12));
        o += 16;

        // third chunk: 8 bytes left in chunk2
        do_encode_12bytes(alphabet, out + o, _mm256_alignr_epi8(chunk3, chunk2, 8));
        o += 16;

        // fourth chunk: 12 final bytes in chunk3
        do_encode_12bytes(alphabet, out + o, _mm256_srli_si256(chunk3, 4));
        o += 16;

        if (options & Base64InsertLineBreaks)
            out[o++] = '\n';
    }

    return _base64_encode_tail(out, o, in, n, options);
}
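The alphabet's type, const char (*)[2], suggests a "precombined" table: each 12-bit slice of input selects two output characters at once, so a 3-byte group needs two lookups instead of four. The following is a hypothetical scalar sketch of that idea (the table and function names are invented here, and the actual layout of _base64_alphabet_precombined may differ):

#include <string.h>

static const char base64_chars[65] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

// 4096 entries: every possible 12-bit value maps to its two base64 digits.
static char pair_table[4096][2];

static void init_pair_table(void)
{
    for (int i = 0; i < 4096; ++i) {
        pair_table[i][0] = base64_chars[i >> 6];
        pair_table[i][1] = base64_chars[i & 63];
    }
}

// Encode one full 3-byte group into 4 output characters.
static void encode_group(char out[4], const unsigned char in[3])
{
    unsigned bits = (unsigned)in[0] << 16 | (unsigned)in[1] << 8 | in[2];
    memcpy(&out[0], pair_table[bits >> 12], 2);   // high 12 bits
    memcpy(&out[2], pair_table[bits & 0xfff], 2); // low 12 bits
}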
__m256i test_mm256_broadcastsi128_si256(__m128i a) {
  // CHECK-LABEL: test_mm256_broadcastsi128_si256
  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  return _mm256_broadcastsi128_si256(a);
}
__m256i test_mm256_broadcastsi128_si256(__m128i a) {
  // CHECK: @llvm.x86.avx2.vbroadcasti128
  return _mm256_broadcastsi128_si256(a);
}
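Outside of tests, a common use of the intrinsic is replicating a 16-byte lookup table into both 128-bit lanes so that _mm256_shuffle_epi8, which indexes each lane independently, sees the same table in both halves. A minimal self-contained sketch of that pattern (the hex table and names below are this example's own, not from any of the code above; compile with -mavx2):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    const __m128i hex_lut = _mm_setr_epi8('0', '1', '2', '3', '4', '5', '6', '7',
                                          '8', '9', 'a', 'b', 'c', 'd', 'e', 'f');
    // Same 16 bytes in the low and high lane.
    __m256i lut = _mm256_broadcastsi128_si256(hex_lut);

    // 32 nibble values (0..15) to translate to ASCII hex digits.
    __m256i nibbles = _mm256_setr_epi8(
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
        15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    __m256i ascii = _mm256_shuffle_epi8(lut, nibbles);

    char buf[33] = { 0 };
    _mm256_storeu_si256((__m256i *)buf, ascii);
    printf("%s\n", buf); // prints 0123456789abcdeffedcba9876543210
    return 0;
}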