/* Shuffle bits within the bytes of eight element blocks. */
int64_t bshuf_shuffle_bit_eightelem_AVX(void* in, void* out, const size_t size,
         const size_t elem_size) {

    CHECK_MULT_EIGHT(size);

    // With a bit of care, this could be written such that it is
    // in_buf = out_buf safe.
    char* in_b = (char*) in;
    char* out_b = (char*) out;

    size_t ii, jj, kk;
    size_t nbyte = elem_size * size;

    __m256i ymm;
    int32_t bt;

    if (elem_size % 4) {
        return bshuf_shuffle_bit_eightelem_SSE(in, out, size, elem_size);
    } else {
        for (jj = 0; jj + 31 < 8 * elem_size; jj += 32) {
            for (ii = 0; ii + 8 * elem_size - 1 < nbyte;
                    ii += 8 * elem_size) {
                ymm = _mm256_loadu_si256((__m256i *) &in_b[ii + jj]);
                for (kk = 0; kk < 8; kk++) {
                    bt = _mm256_movemask_epi8(ymm);
                    ymm = _mm256_slli_epi16(ymm, 1);
                    size_t ind = (ii + jj / 8 + (7 - kk) * elem_size);
                    * (int32_t *) &out_b[ind] = bt;
                }
            }
        }
    }
    return size * elem_size;
}
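A minimal calling sketch for the routine above. The prototype is copied from the signature shown here; the buffer sizes (size a multiple of 8, elem_size a multiple of 4 so the AVX path is taken rather than the SSE fallback) follow the checks in the function body and are otherwise an assumption about how the bitshuffle API is used.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int64_t bshuf_shuffle_bit_eightelem_AVX(void* in, void* out, const size_t size,
                                        const size_t elem_size);

int main(void) {
    const size_t size = 64;       /* element count, must be a multiple of 8 */
    const size_t elem_size = 4;   /* bytes per element; a multiple of 4 keeps the AVX path */
    uint8_t *in  = malloc(size * elem_size);
    uint8_t *out = malloc(size * elem_size);
    if (!in || !out) return 1;

    for (size_t i = 0; i < size * elem_size; i++)
        in[i] = (uint8_t)i;

    int64_t n = bshuf_shuffle_bit_eightelem_AVX(in, out, size, elem_size);
    printf("processed %lld bytes\n", (long long)n);

    free(in);
    free(out);
    return 0;
}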
Example #2
/*!
 * \brief Multiply the two given vectors of bytes
 */
ETL_STATIC_INLINE(avx_simd_byte) mul(avx_simd_byte lhs, avx_simd_byte rhs) {
    auto aodd    = _mm256_srli_epi16(lhs.value, 8);
    auto bodd    = _mm256_srli_epi16(rhs.value, 8);
    auto muleven = _mm256_mullo_epi16(lhs.value, rhs.value);
    auto mulodd  = _mm256_slli_epi16(_mm256_mullo_epi16(aodd, bodd), 8);
    return _mm256_blendv_epi8(mulodd, muleven, _mm256_set1_epi32(0x00FF00FF));
}
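AVX2 has no 8-bit multiply, so the routine above multiplies the even and odd bytes separately in 16-bit lanes and recombines them with a byte blend. Below is a small self-contained check of that trick against a scalar byte multiply; it works on raw __m256i values because ETL's avx_simd_byte wrapper is not shown here.

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Same even/odd technique as mul() above, on plain __m256i operands. */
static __m256i mul_bytes_avx2(__m256i lhs, __m256i rhs) {
    __m256i aodd    = _mm256_srli_epi16(lhs, 8);               /* odd bytes moved to the low half of each word */
    __m256i bodd    = _mm256_srli_epi16(rhs, 8);
    __m256i muleven = _mm256_mullo_epi16(lhs, rhs);            /* low byte of each word = even-byte product */
    __m256i mulodd  = _mm256_slli_epi16(_mm256_mullo_epi16(aodd, bodd), 8);
    return _mm256_blendv_epi8(mulodd, muleven, _mm256_set1_epi32(0x00FF00FF));
}

int main(void) {
    uint8_t a[32], b[32], ref[32], got[32];
    for (int i = 0; i < 32; i++) {
        a[i] = (uint8_t)(i * 7 + 3);
        b[i] = (uint8_t)(255 - i * 5);
        ref[i] = (uint8_t)(a[i] * b[i]);                       /* truncated scalar product */
    }

    __m256i va = _mm256_loadu_si256((const __m256i *)a);
    __m256i vb = _mm256_loadu_si256((const __m256i *)b);
    _mm256_storeu_si256((__m256i *)got, mul_bytes_avx2(va, vb));

    printf("%s\n", memcmp(ref, got, 32) == 0 ? "match" : "mismatch");
    return 0;
}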
/* Transpose bits within bytes. */
int64_t bshuf_trans_bit_byte_AVX(void* in, void* out, const size_t size,
         const size_t elem_size) {

    size_t ii, kk;
    char* in_b = (char*) in;
    char* out_b = (char*) out;
    int32_t* out_i32;

    size_t nbyte = elem_size * size;

    int64_t count;

    __m256i ymm;
    int32_t bt;

    for (ii = 0; ii + 31 < nbyte; ii += 32) {
        ymm = _mm256_loadu_si256((__m256i *) &in_b[ii]);
        for (kk = 0; kk < 8; kk++) {
            bt = _mm256_movemask_epi8(ymm);
            ymm = _mm256_slli_epi16(ymm, 1);
            out_i32 = (int32_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
            *out_i32 = bt;
        }
    }
    count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
            nbyte - nbyte % 32);
    return count;
}
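The inner loop above relies on _mm256_movemask_epi8 to gather bit 7 of all 32 bytes into a 32-bit mask, and on _mm256_slli_epi16 to bring the next lower bit into that position on each pass (a 16-bit shift is used because AVX2 has no 8-bit shift; the bit that leaks into the neighbouring byte never reaches bit 7 within the eight iterations). A scalar sketch of the same bit-plane transpose, inferred from the indexing above (the output buffer is assumed to be zero-initialized):

#include <stddef.h>
#include <stdint.h>

/* Gather bit b of every input byte into one contiguous bit plane,
 * matching out_b[((7 - kk) * nbyte + ii) / 8] above with b == 7 - kk.
 * nbyte is assumed to be a multiple of 8; out must be zeroed first. */
static void trans_bit_byte_scalar(const uint8_t *in, uint8_t *out, size_t nbyte) {
    for (size_t i = 0; i < nbyte; i++) {
        for (unsigned b = 0; b < 8; b++) {
            size_t pos = (size_t)b * nbyte + i;   /* destination bit index */
            out[pos / 8] |= (uint8_t)(((in[i] >> b) & 1u) << (pos % 8));
        }
    }
}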
// 16bpp general R/G/B, usually 5/6/5 or 5/5/5
static inline __m256i stretchblt_line_bilinear_pixel_blend_avx_rgb16(
        const __m256i cur, const __m256i nxt, const __m256i mul,
        const __m256i rmask, const uint16_t rshift,
        const __m256i gmask, const uint16_t gshift,
        const __m256i bmask, const uint16_t bshift) {
    __m256i rc,gc,bc;
    __m256i rn,gn,bn;
    __m256i d,sum;

    rc = _mm256_and_si256(_mm256_srli_epi16(cur,rshift),rmask);
    gc = _mm256_and_si256(_mm256_srli_epi16(cur,gshift),gmask);
    bc = _mm256_and_si256(_mm256_srli_epi16(cur,bshift),bmask);

    rn = _mm256_and_si256(_mm256_srli_epi16(nxt,rshift),rmask);
    gn = _mm256_and_si256(_mm256_srli_epi16(nxt,gshift),gmask);
    bn = _mm256_and_si256(_mm256_srli_epi16(nxt,bshift),bmask);

    d = _mm256_sub_epi16(rn,rc);
    sum = _mm256_slli_epi16(_mm256_add_epi16(rc,_mm256_mulhi_epi16(_mm256_add_epi16(d,d),mul)),rshift);

    d = _mm256_sub_epi16(gn,gc);
    sum = _mm256_add_epi16(_mm256_slli_epi16(_mm256_add_epi16(gc,_mm256_mulhi_epi16(_mm256_add_epi16(d,d),mul)),gshift),sum);

    d = _mm256_sub_epi16(bn,bc);
    sum = _mm256_add_epi16(_mm256_slli_epi16(_mm256_add_epi16(bc,_mm256_mulhi_epi16(_mm256_add_epi16(d,d),mul)),bshift),sum);

    return sum;
}
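A usage sketch for the 5/6/5 layout mentioned in the comment above. The masks and shifts follow directly from how the function unpacks each channel; treating mul as a Q15 blend fraction (0x4000 for an even 50/50 blend) is an inference from the mulhi(2*d, mul) pattern, not something stated in the original code.

/* Blend two rows of RGB565 pixels with equal weight. */
static inline __m256i blend_rgb565_half(__m256i cur, __m256i nxt) {
    const __m256i rmask = _mm256_set1_epi16(0x1F);   /* red:   bits 11..15 */
    const __m256i gmask = _mm256_set1_epi16(0x3F);   /* green: bits  5..10 */
    const __m256i bmask = _mm256_set1_epi16(0x1F);   /* blue:  bits  0..4  */
    const __m256i mul   = _mm256_set1_epi16(0x4000); /* assumed Q15 weight: 0.5 */
    return stretchblt_line_bilinear_pixel_blend_avx_rgb16(cur, nxt, mul,
            rmask, 11, gmask, 5, bmask, 0);
}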
// 32bpp optimized for 8-bit ARGB/RGBA. rmask should be 0x00FF,0x00FF,... etc
static inline __m256i stretchblt_line_bilinear_pixel_blend_avx_argb8(
        const __m256i cur, const __m256i nxt, const __m256i mul,
        const __m256i rmask) {
    __m256i rc,gc;
    __m256i rn,gn;
    __m256i d,sum;

    rc = _mm256_and_si256(                  cur   ,rmask);
    gc = _mm256_and_si256(_mm256_srli_epi16(cur,8),rmask);

    rn = _mm256_and_si256(               nxt   ,rmask);
    gn = _mm256_and_si256(_mm256_srli_epi16(nxt,8),rmask);

    d = _mm256_sub_epi16(rn,rc);
    sum = _mm256_add_epi16(rc,_mm256_mulhi_epi16(_mm256_add_epi16(d,d),mul));

    d = _mm256_sub_epi16(gn,gc);
    sum = _mm256_add_epi16(_mm256_slli_epi16(_mm256_add_epi16(gc,_mm256_mulhi_epi16(_mm256_add_epi16(d,d),mul)),8),sum);

    return sum;
}
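The same kind of usage sketch for the 32bpp path: a single 0x00FF mask covers the even bytes of each pixel and the shifted pass covers the odd bytes, so all four channels are blended in two passes. The 0x4000 Q15 weight is the same assumption as above.

/* Blend two rows of 8-bit ARGB/RGBA pixels with equal weight. */
static inline __m256i blend_argb8_half(__m256i cur, __m256i nxt) {
    const __m256i rmask = _mm256_set1_epi16(0x00FF); /* even bytes of every 16-bit lane */
    const __m256i mul   = _mm256_set1_epi16(0x4000); /* assumed Q15 weight: 0.5 */
    return stretchblt_line_bilinear_pixel_blend_avx_argb8(cur, nxt, mul, rmask);
}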
Example #6
int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw)
{
    const int AB_BITS = MAX(10, (int)INTER_BITS);
    int x1 = 0;
    __m256i fxy_mask = _mm256_set1_epi32(INTER_TAB_SIZE - 1);
    __m256i XX = _mm256_set1_epi32(X0), YY = _mm256_set1_epi32(Y0);
    for (; x1 <= bw - 16; x1 += 16)
    {
        __m256i tx0, tx1, ty0, ty1;
        tx0 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(adelta + x1)), XX);
        ty0 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(bdelta + x1)), YY);
        tx1 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(adelta + x1 + 8)), XX);
        ty1 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(bdelta + x1 + 8)), YY);

        tx0 = _mm256_srai_epi32(tx0, AB_BITS - INTER_BITS);
        ty0 = _mm256_srai_epi32(ty0, AB_BITS - INTER_BITS);
        tx1 = _mm256_srai_epi32(tx1, AB_BITS - INTER_BITS);
        ty1 = _mm256_srai_epi32(ty1, AB_BITS - INTER_BITS);

        __m256i fx_ = _mm256_packs_epi32(_mm256_and_si256(tx0, fxy_mask),
            _mm256_and_si256(tx1, fxy_mask));
        __m256i fy_ = _mm256_packs_epi32(_mm256_and_si256(ty0, fxy_mask),
            _mm256_and_si256(ty1, fxy_mask));
        tx0 = _mm256_packs_epi32(_mm256_srai_epi32(tx0, INTER_BITS),
            _mm256_srai_epi32(tx1, INTER_BITS));
        ty0 = _mm256_packs_epi32(_mm256_srai_epi32(ty0, INTER_BITS),
            _mm256_srai_epi32(ty1, INTER_BITS));
        fx_ = _mm256_adds_epi16(fx_, _mm256_slli_epi16(fy_, INTER_BITS));
        fx_ = _mm256_permute4x64_epi64(fx_, (3 << 6) + (1 << 4) + (2 << 2) + 0);

        _mm256_storeu_si256((__m256i*)(xy + x1 * 2), _mm256_unpacklo_epi16(tx0, ty0));
        _mm256_storeu_si256((__m256i*)(xy + x1 * 2 + 16), _mm256_unpackhi_epi16(tx0, ty0));
        _mm256_storeu_si256((__m256i*)(alpha + x1), fx_);
    }
    _mm256_zeroupper();
    return x1;
}
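For reference, a scalar sketch of what the vectorized loop stores per element, inferred from the intrinsics above; it ignores the saturation applied by _mm256_packs_epi32/_mm256_adds_epi16 and the lane reordering of the packed stores. The macro values below are the usual OpenCV definitions and are an assumption here.

#ifndef INTER_BITS
#define INTER_BITS      5                      /* assumed OpenCV value */
#define INTER_TAB_SIZE  (1 << INTER_BITS)
#endif
#ifndef MAX
#define MAX(a,b)        ((a) > (b) ? (a) : (b))
#endif

static void warp_affine_one_element(const int *adelta, const int *bdelta,
                                    short *xy, short *alpha,
                                    int X0, int Y0, int x1)
{
    const int AB_BITS = MAX(10, (int)INTER_BITS);
    int X = (X0 + adelta[x1]) >> (AB_BITS - INTER_BITS);
    int Y = (Y0 + bdelta[x1]) >> (AB_BITS - INTER_BITS);

    xy[x1 * 2]     = (short)(X >> INTER_BITS);           /* integer pixel coordinates */
    xy[x1 * 2 + 1] = (short)(Y >> INTER_BITS);
    alpha[x1] = (short)((Y & (INTER_TAB_SIZE - 1)) * INTER_TAB_SIZE +
                        (X & (INTER_TAB_SIZE - 1)));     /* index into the interpolation table */
}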
Example #7
__m256i test_mm256_slli_epi16(__m256i a) {
  // CHECK: @llvm.x86.avx2.pslli.w
  return _mm256_slli_epi16(a, 3);
}
__m256i test_mm256_slli_epi16(__m256i a) {
  // CHECK-LABEL: test_mm256_slli_epi16
  // CHECK: call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %{{.*}}, i32 %{{.*}})
  return _mm256_slli_epi16(a, 3);
}
Example #9
#include <immintrin.h>

__m256i x;

void extern
avx2_test (void)
{
  x = _mm256_slli_epi16 (x, 13);
}