/**
 * Sum an array of signed 8-bit integers using AVX2.
 *
 * Whole 32-byte chunks are sign-extended to 32-bit lanes and accumulated in a
 * vector register; the remaining (size % 32) bytes are added by a scalar loop,
 * so the function never reads beyond array[size - 1].
 *
 * @param array  pointer to the first byte; may be unaligned. Not dereferenced
 *               when size is 0.
 * @param size   number of elements in array.
 * @return the sum of all elements. Arithmetic is carried out modulo 2^32 (as
 *         the vector lanes do), so the result is exact whenever the true sum
 *         fits in an int32_t.
 */
int32_t avx2_sumsignedbytes(int8_t* array, size_t size) {
    __m256i accumulator = _mm256_setzero_si256();
    size_t i = 0;

    /* Vector part: only while a full 32-byte load stays inside the buffer.
     * Written as (size - i >= 32) so the test cannot overflow; i <= size
     * holds throughout. */
    for (; size - i >= 32; i += 32) {
        const __m256i v = _mm256_loadu_si256((const __m256i *)(array + i));
        const __m128i lo = _mm256_extracti128_si256(v, 0);
        const __m128i hi = _mm256_extracti128_si256(v, 1);

        /* _mm256_cvtepi8_epi32 widens only the low 8 bytes of its operand,
         * so each 16-byte half is converted twice: once as-is and once
         * shifted right by 8 bytes to expose its upper 8 bytes. */
        const __m256i t0 = _mm256_cvtepi8_epi32(lo);
        const __m256i t1 = _mm256_cvtepi8_epi32(hi);
        const __m256i t2 = _mm256_cvtepi8_epi32(_mm_bsrli_si128(lo, 8));
        const __m256i t3 = _mm256_cvtepi8_epi32(_mm_bsrli_si128(hi, 8));

        accumulator = _mm256_add_epi32(accumulator, t0);
        accumulator = _mm256_add_epi32(accumulator, t1);
        accumulator = _mm256_add_epi32(accumulator, t2);
        accumulator = _mm256_add_epi32(accumulator, t3);
    }

    /* Horizontal reduction of the eight 32-bit lanes. Unsigned arithmetic is
     * used so that wrap-around matches the vector adds instead of being
     * undefined behaviour. */
    int32_t lanes[8];
    _mm256_storeu_si256((__m256i *)lanes, accumulator);

    uint32_t total = 0;
    for (int k = 0; k < 8; k++) {
        total += (uint32_t)lanes[k];
    }

    /* Scalar tail: the 0..31 bytes that did not fill a whole vector. */
    for (; i < size; i++) {
        total += (uint32_t)(int32_t)array[i];
    }

    return (int32_t)total;
}
/*
 * Codegen test wrapper around _mm256_cvtepi8_epi32: forwards its argument to
 * the intrinsic and returns the result unchanged.
 *
 * The line comment inside the body is a test directive (presumably consumed
 * by LLVM FileCheck - confirm against the test's RUN line); it names the
 * llvm.x86.avx2.pmovsxbd intrinsic as the expected output. It must stay on a
 * line of its own so that it does not comment out the return statement.
 */
__m256i test_mm256_cvtepi8_epi32(__m128i a) {
  // CHECK: @llvm.x86.avx2.pmovsxbd
  return _mm256_cvtepi8_epi32(a);
}
/*
 * Codegen test wrapper around _mm256_cvtepi8_epi32: forwards its argument to
 * the intrinsic and returns the result unchanged.
 *
 * The line comments inside the body are test directives (presumably consumed
 * by LLVM FileCheck - confirm against the test's RUN line). They describe the
 * expected output as a shufflevector selecting elements 0..7 of a <16 x i8>
 * value, followed by a sext from <8 x i8> to <8 x i32>. Each directive must
 * stay on a line of its own so that it does not comment out the code or the
 * directives that follow it.
 */
__m256i test_mm256_cvtepi8_epi32(__m128i a) {
  // CHECK-LABEL: test_mm256_cvtepi8_epi32
  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  // CHECK: sext <8 x i8> %{{.*}} to <8 x i32>
  return _mm256_cvtepi8_epi32(a);
}