Esempio n. 1
0
int32_t avx2_sumsignedbytes(int8_t* array, size_t size) {

    __m256i accumulator = _mm256_setzero_si256();

    for (size_t i=0; i < size; i += 32) {
        const __m256i v = _mm256_loadu_si256((__m256i*)(array + i));

        const __m128i lo = _mm256_extracti128_si256(v, 0);
        const __m128i hi = _mm256_extracti128_si256(v, 1);

        const __m256i t0 = _mm256_cvtepi8_epi32(lo);
        const __m256i t1 = _mm256_cvtepi8_epi32(hi);
        const __m256i t2 = _mm256_cvtepi8_epi32(_mm_bsrli_si128(lo, 8));
        const __m256i t3 = _mm256_cvtepi8_epi32(_mm_bsrli_si128(hi, 8));

        accumulator = _mm256_add_epi32(accumulator, t0);
        accumulator = _mm256_add_epi32(accumulator, t1);
        accumulator = _mm256_add_epi32(accumulator, t2);
        accumulator = _mm256_add_epi32(accumulator, t3);
    }

    return int32_t(_mm256_extract_epi32(accumulator, 0)) +
           int32_t(_mm256_extract_epi32(accumulator, 1)) +
           int32_t(_mm256_extract_epi32(accumulator, 2)) +
           int32_t(_mm256_extract_epi32(accumulator, 3)) +
           int32_t(_mm256_extract_epi32(accumulator, 4)) +
           int32_t(_mm256_extract_epi32(accumulator, 5)) +
           int32_t(_mm256_extract_epi32(accumulator, 6)) +
           int32_t(_mm256_extract_epi32(accumulator, 7));
}
Esempio n. 2
0
uint32_t sse_sumbytes(uint8_t* array, size_t size) {

    __m128i accumulator = _mm_setzero_si128();

    for (size_t i=0; i < size; i += 16) {
        const __m128i v = _mm_loadu_si128((__m128i*)(array + i));
        const __m128i v0_3   = v;
        const __m128i v4_7   = _mm_bsrli_si128(v, 1*4);
        const __m128i v8_11  = _mm_bsrli_si128(v, 2*4);
        const __m128i v12_15 = _mm_bsrli_si128(v, 3*4);
        
        const __m128i t0 = _mm_cvtepu8_epi32(v0_3);
        const __m128i t1 = _mm_cvtepu8_epi32(v4_7);
        const __m128i t2 = _mm_cvtepu8_epi32(v8_11);
        const __m128i t3 = _mm_cvtepu8_epi32(v12_15);

        const __m128i t01 = _mm_add_epi32(t0, t1);
        const __m128i t23 = _mm_add_epi32(t2, t3);

        accumulator = _mm_add_epi32(accumulator, t01);
        accumulator = _mm_add_epi32(accumulator, t23);
    }

    return uint32_t(_mm_extract_epi32(accumulator, 0)) +
           uint32_t(_mm_extract_epi32(accumulator, 1)) +
           uint32_t(_mm_extract_epi32(accumulator, 2)) +
           uint32_t(_mm_extract_epi32(accumulator, 3));
}
Esempio n. 3
0
__m128i test_mm_bsrli_si128(__m128i A) {
  // DAG-LABEL: test_mm_bsrli_si128
  // DAG: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
  //
  // ASM-LABEL: test_mm_bsrli_si128
  // ASM: psrldq $5, %xmm{{.*}}
  return _mm_bsrli_si128(A, 5);
}
Esempio n. 4
0
__m128 test_mm_bsrli_si128(__m128 a) {
  // CHECK-LABEL: @test_mm_bsrli_si128
  // CHECK: shufflevector <16 x i8> {{.*}}, <16 x i8> {{.*}}, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
  return _mm_bsrli_si128(a, 5);
}