Esempio n. 1
0
int32_t sse_sadbw_unrolled4_sumsignedbytes(int8_t* array, size_t size) {

    const __m128i zero = _mm_setzero_si128();
    __m128i positive = zero;
    __m128i negative = zero;

    for (size_t i=0; i < size; i += 16*4) {
        const __m128i v0 = _mm_loadu_si128((__m128i*)(array + i + 0*16));
        const __m128i v1 = _mm_loadu_si128((__m128i*)(array + i + 1*16));
        const __m128i v2 = _mm_loadu_si128((__m128i*)(array + i + 2*16));
        const __m128i v3 = _mm_loadu_si128((__m128i*)(array + i + 3*16));

        {
            const __m128i v   = v0;
            const __m128i m   = _mm_cmplt_epi8(v, zero);
            const __m128i t0  = _mm_sad_epu8(_mm_andnot_si128(m, v), zero);
            const __m128i va = _mm_abs_epi8(v);
            const __m128i t1 = _mm_sad_epu8(_mm_and_si128(m, va), zero);
            positive = _mm_add_epi32(positive, t0);
            negative = _mm_sub_epi32(negative, t1);
        }

        {
            const __m128i v   = v1;
            const __m128i m   = _mm_cmplt_epi8(v, zero);
            const __m128i t0  = _mm_sad_epu8(_mm_andnot_si128(m, v), zero);
            const __m128i va = _mm_abs_epi8(v);
            const __m128i t1 = _mm_sad_epu8(_mm_and_si128(m, va), zero);
            positive = _mm_add_epi32(positive, t0);
            negative = _mm_sub_epi32(negative, t1);
        }

        {
            const __m128i v   = v2;
            const __m128i m   = _mm_cmplt_epi8(v, zero);
            const __m128i t0  = _mm_sad_epu8(_mm_andnot_si128(m, v), zero);
            const __m128i va = _mm_abs_epi8(v);
            const __m128i t1 = _mm_sad_epu8(_mm_and_si128(m, va), zero);
            positive = _mm_add_epi32(positive, t0);
            negative = _mm_sub_epi32(negative, t1);
        }

        {
            const __m128i v   = v3;
            const __m128i m   = _mm_cmplt_epi8(v, zero);
            const __m128i t0  = _mm_sad_epu8(_mm_andnot_si128(m, v), zero);
            const __m128i va = _mm_abs_epi8(v);
            const __m128i t1 = _mm_sad_epu8(_mm_and_si128(m, va), zero);
            positive = _mm_add_epi32(positive, t0);
            negative = _mm_sub_epi32(negative, t1);
        }
    }

    const __m128i accumulator = _mm_add_epi32(positive, negative);

    return int32_t(_mm_extract_epi32(accumulator, 0)) +
           int32_t(_mm_extract_epi32(accumulator, 2));
}
Esempio n. 2
0
// Returns _mm_abs_epi8 applied to `a`.
//
// NOTE(review): the comment lines inside the body look like FileCheck
// directives matched against the compiler's IR output (the RUN line is not
// visible here, so confirm). They describe a "0 - a" subtraction, a signed
// "a > 0" comparison and a select between `a` and the negated value, all on
// <16 x i8>. Keep those lines byte-identical when editing this function.
__m128i test_mm_abs_epi8(__m128i a) {
  // CHECK-LABEL: test_mm_abs_epi8
  // CHECK: [[SUB:%.+]] = sub <16 x i8> zeroinitializer, [[A:%.+]]
  // CHECK: [[CMP:%.+]] = icmp sgt <16 x i8> [[A]], zeroinitializer
  // CHECK: %{{.*}} = select <16 x i1> [[CMP]], <16 x i8> [[A]], <16 x i8> [[SUB]]
  return _mm_abs_epi8(a);
}
Esempio n. 3
0
int32_t sse_sadbw_sumsignedbytes(int8_t* array, size_t size) {

    const __m128i zero = _mm_setzero_si128();
    __m128i positive = zero;
    __m128i negative = zero;

    for (size_t i=0; i < size; i += 16) {
        const __m128i v  = _mm_loadu_si128((__m128i*)(array + i));
        const __m128i m  = _mm_cmplt_epi8(v, zero);
        const __m128i va = _mm_abs_epi8(v);

        // sum just positive numbers
        const __m128i t0 = _mm_sad_epu8(_mm_andnot_si128(m, v), zero);

        // sum just negative numbers
        const __m128i t1 = _mm_sad_epu8(_mm_and_si128(m, va), zero);
        
        positive = _mm_add_epi32(positive, t0);
        negative = _mm_sub_epi32(negative, t1);
    }

    const __m128i accumulator = _mm_add_epi32(positive, negative);

    return int32_t(_mm_extract_epi32(accumulator, 0)) +
           int32_t(_mm_extract_epi32(accumulator, 2));
}
Esempio n. 4
0
// Calls _mm_abs_epi8 and _mm_sign_epi16 on a constant vector, discards the
// results and exits with status 0.
// NOTE(review): presumably a build-time probe that these intrinsics compile
// and link; confirm against the build scripts.
int main(int, char**)
{
    // 42 in every 32-bit lane
    volatile __m128i input = _mm_set1_epi32(42);

    // result intentionally unused
    _mm_abs_epi8(input);

    volatile __m128i signed_lanes = _mm_sign_epi16(input, _mm_set1_epi32(64));
    (void)signed_lanes;  // mark as deliberately unused

    return 0;
}
Esempio n. 5
0
// Soft-demodulates 16QAM symbols into byte LLRs, four per symbol.
//
// For symbol k the output is, in order:
//   llr[4*k+0] = -(scaled real part)
//   llr[4*k+1] = -(scaled imaginary part)
//   llr[4*k+2] = |scaled real part|      - 2*SCALE_BYTE_CONV_QAM16/sqrt(10)
//   llr[4*k+3] = |scaled imaginary part| - 2*SCALE_BYTE_CONV_QAM16/sqrt(10)
//
// symbols   input symbols, read as interleaved (real, imaginary) floats
// llr       output buffer; 4 bytes are written per symbol
// nsymbols  number of input symbols
//
// Groups of 8 symbols go through the SSE loop, which uses the aligned
// load/store intrinsics (_mm_load_ps, _mm_store_si128) on `symbols` and
// `llr`. The remaining nsymbols % 8 symbols go through the scalar loop.
//
// NOTE(review): the two paths do not quantize identically. The SSE path
// converts with _mm_cvtps_epi32 and saturating packs and subtracts an offset
// that was already converted to a byte, while the scalar path uses plain C
// casts and subtracts the offset in floating point. Confirm this is intended.
void demod_16qam_lte_b_sse(const cf_t *symbols, int8_t *llr, int nsymbols) {
  const float *in  = (const float*) symbols;
  __m128i     *out = (__m128i*) llr;

  // Scaling by the negated factor yields the sign-flipped values directly.
  const __m128  neg_scale = _mm_set1_ps(-SCALE_BYTE_CONV_QAM16);
  const __m128i threshold = _mm_set1_epi8(2*SCALE_BYTE_CONV_QAM16/sqrt(10));

  // Byte-shuffle masks, listed in output byte order (index 0 first).
  // 0xff selects a zero byte, so the two shuffled vectors can be OR-ed.
  // "hard" masks place the sign-flipped pair, "mag" masks the offset pair.
  const __m128i sel_hard_lo = _mm_setr_epi8(0,1,0xff,0xff,2,3,0xff,0xff,4,5,0xff,0xff,6,7,0xff,0xff);
  const __m128i sel_mag_lo  = _mm_setr_epi8(0xff,0xff,0,1,0xff,0xff,2,3,0xff,0xff,4,5,0xff,0xff,6,7);

  const __m128i sel_hard_hi = _mm_setr_epi8(8,9,0xff,0xff,10,11,0xff,0xff,12,13,0xff,0xff,14,15,0xff,0xff);
  const __m128i sel_mag_hi  = _mm_setr_epi8(0xff,0xff,8,9,0xff,0xff,10,11,0xff,0xff,12,13,0xff,0xff,14,15);

  const int nblocks = nsymbols/8;

  for (int blk = 0; blk < nblocks; blk++) {
    // 8 symbols = 16 floats = four 4-float vectors
    const __m128 f0 = _mm_load_ps(in + 0);
    const __m128 f1 = _mm_load_ps(in + 4);
    const __m128 f2 = _mm_load_ps(in + 8);
    const __m128 f3 = _mm_load_ps(in + 12);
    in += 16;

    const __m128i q0 = _mm_cvtps_epi32(_mm_mul_ps(f0, neg_scale));
    const __m128i q1 = _mm_cvtps_epi32(_mm_mul_ps(f1, neg_scale));
    const __m128i q2 = _mm_cvtps_epi32(_mm_mul_ps(f2, neg_scale));
    const __m128i q3 = _mm_cvtps_epi32(_mm_mul_ps(f3, neg_scale));

    // Narrow 32 -> 16 -> 8 bits; 16 signed bytes in input order.
    const __m128i q01  = _mm_packs_epi32(q0, q1);
    const __m128i q23  = _mm_packs_epi32(q2, q3);
    const __m128i hard = _mm_packs_epi16(q01, q23);

    const __m128i mag  = _mm_sub_epi8(_mm_abs_epi8(hard), threshold);

    // Symbols 0..3 of the group
    const __m128i lo = _mm_or_si128(_mm_shuffle_epi8(hard, sel_hard_lo),
                                    _mm_shuffle_epi8(mag,  sel_mag_lo));
    // Symbols 4..7 of the group
    const __m128i hi = _mm_or_si128(_mm_shuffle_epi8(hard, sel_hard_hi),
                                    _mm_shuffle_epi8(mag,  sel_mag_hi));

    _mm_store_si128(out + 0, lo);
    _mm_store_si128(out + 1, hi);
    out += 2;
  }

  // Demodulate last symbols
  for (int s = 8*nblocks; s < nsymbols; s++) {
    short re_q = (int8_t) (SCALE_BYTE_CONV_QAM16*crealf(symbols[s]));
    short im_q = (int8_t) (SCALE_BYTE_CONV_QAM16*cimagf(symbols[s]));

    int8_t *dst = llr + 4*s;

    dst[0] = -re_q;
    dst[1] = -im_q;
    dst[2] = abs(re_q)-2*SCALE_BYTE_CONV_QAM16/sqrt(10);
    dst[3] = abs(im_q)-2*SCALE_BYTE_CONV_QAM16/sqrt(10);
  }
}
Esempio n. 6
0
// Soft-demodulates 64QAM symbols into byte LLRs, six per symbol.
//
// For symbol k the output is, in order:
//   llr[6*k+0] = -(scaled real part)
//   llr[6*k+1] = -(scaled imaginary part)
//   llr[6*k+2] = |scaled real part|      - 4*SCALE_BYTE_CONV_QAM64/sqrt(42)
//   llr[6*k+3] = |scaled imaginary part| - 4*SCALE_BYTE_CONV_QAM64/sqrt(42)
//   llr[6*k+4] = |llr[6*k+2]|            - 2*SCALE_BYTE_CONV_QAM64/sqrt(42)
//   llr[6*k+5] = |llr[6*k+3]|            - 2*SCALE_BYTE_CONV_QAM64/sqrt(42)
//
// symbols   input symbols, read as interleaved (real, imaginary) floats
// llr       output buffer; 6 bytes are written per symbol
// nsymbols  number of input symbols
//
// Groups of 8 symbols go through the SSE loop, which uses the aligned
// load/store intrinsics (_mm_load_ps, _mm_store_si128) on `symbols` and
// `llr`. The remaining nsymbols % 8 symbols go through the scalar loop.
//
// NOTE(review): the two paths do not quantize identically. The SSE path
// converts with _mm_cvtps_epi32 and saturating packs and subtracts offsets
// that were already converted to bytes, while the scalar path uses plain C
// casts and subtracts the offsets in floating point. Confirm this is intended.
void demod_64qam_lte_b_sse(const cf_t *symbols, int8_t *llr, int nsymbols)
{
  const float *in  = (const float*) symbols;
  __m128i     *out = (__m128i*) llr;

  // Scaling by the negated factor yields the sign-flipped values directly.
  const __m128  neg_scale   = _mm_set1_ps(-SCALE_BYTE_CONV_QAM64);
  const __m128i outer_thres = _mm_set1_epi8(4*SCALE_BYTE_CONV_QAM64/sqrt(42));
  const __m128i inner_thres = _mm_set1_epi8(2*SCALE_BYTE_CONV_QAM64/sqrt(42));

  // Byte-shuffle masks, listed in output byte order (index 0 first).
  // 0xff selects a zero byte, so the three shuffled vectors can be OR-ed.
  // Suffix a/b/c = first/second/third 16-byte output vector of a group.
  // "hard" masks place the sign-flipped pair, "lvl1" the first offset pair,
  // "lvl2" the second offset pair.
  const __m128i sel_hard_a = _mm_setr_epi8(0,1,0xff,0xff,0xff,0xff,2,3,0xff,0xff,0xff,0xff,4,5,0xff,0xff);
  const __m128i sel_hard_b = _mm_setr_epi8(0xff,0xff,6,7,0xff,0xff,0xff,0xff,8,9,0xff,0xff,0xff,0xff,10,11);
  const __m128i sel_hard_c = _mm_setr_epi8(0xff,0xff,0xff,0xff,12,13,0xff,0xff,0xff,0xff,14,15,0xff,0xff,0xff,0xff);

  const __m128i sel_lvl1_a = _mm_setr_epi8(0xff,0xff,0,1,0xff,0xff,0xff,0xff,2,3,0xff,0xff,0xff,0xff,4,5);
  const __m128i sel_lvl1_b = _mm_setr_epi8(0xff,0xff,0xff,0xff,6,7,0xff,0xff,0xff,0xff,8,9,0xff,0xff,0xff,0xff);
  const __m128i sel_lvl1_c = _mm_setr_epi8(10,11,0xff,0xff,0xff,0xff,12,13,0xff,0xff,0xff,0xff,14,15,0xff,0xff);

  const __m128i sel_lvl2_a = _mm_setr_epi8(0xff,0xff,0xff,0xff,0,1,0xff,0xff,0xff,0xff,2,3,0xff,0xff,0xff,0xff);
  const __m128i sel_lvl2_b = _mm_setr_epi8(4,5,0xff,0xff,0xff,0xff,6,7,0xff,0xff,0xff,0xff,8,9,0xff,0xff);
  const __m128i sel_lvl2_c = _mm_setr_epi8(0xff,0xff,10,11,0xff,0xff,0xff,0xff,12,13,0xff,0xff,0xff,0xff,14,15);

  const int nblocks = nsymbols/8;

  for (int blk = 0; blk < nblocks; blk++) {
    // 8 symbols = 16 floats = four 4-float vectors
    const __m128 f0 = _mm_load_ps(in + 0);
    const __m128 f1 = _mm_load_ps(in + 4);
    const __m128 f2 = _mm_load_ps(in + 8);
    const __m128 f3 = _mm_load_ps(in + 12);
    in += 16;

    const __m128i q0 = _mm_cvtps_epi32(_mm_mul_ps(f0, neg_scale));
    const __m128i q1 = _mm_cvtps_epi32(_mm_mul_ps(f1, neg_scale));
    const __m128i q2 = _mm_cvtps_epi32(_mm_mul_ps(f2, neg_scale));
    const __m128i q3 = _mm_cvtps_epi32(_mm_mul_ps(f3, neg_scale));

    // Narrow 32 -> 16 -> 8 bits; 16 signed bytes in input order.
    const __m128i q01  = _mm_packs_epi32(q0, q1);
    const __m128i q23  = _mm_packs_epi32(q2, q3);
    const __m128i hard = _mm_packs_epi16(q01, q23);

    const __m128i lvl1 = _mm_sub_epi8(_mm_abs_epi8(hard), outer_thres);
    const __m128i lvl2 = _mm_sub_epi8(_mm_abs_epi8(lvl1), inner_thres);

    const __m128i vec_a = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(hard, sel_hard_a),
                                                    _mm_shuffle_epi8(lvl1, sel_lvl1_a)),
                                       _mm_shuffle_epi8(lvl2, sel_lvl2_a));
    const __m128i vec_b = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(hard, sel_hard_b),
                                                    _mm_shuffle_epi8(lvl1, sel_lvl1_b)),
                                       _mm_shuffle_epi8(lvl2, sel_lvl2_b));
    const __m128i vec_c = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(hard, sel_hard_c),
                                                    _mm_shuffle_epi8(lvl1, sel_lvl1_c)),
                                       _mm_shuffle_epi8(lvl2, sel_lvl2_c));

    // 8 symbols * 6 LLRs = 48 bytes = three output vectors
    _mm_store_si128(out + 0, vec_a);
    _mm_store_si128(out + 1, vec_b);
    _mm_store_si128(out + 2, vec_c);
    out += 3;
  }

  // Scalar path for the symbols that do not fill a group of 8.
  for (int s = 8*nblocks; s < nsymbols; s++) {
    float re_q = (int8_t) (SCALE_BYTE_CONV_QAM64*crealf(symbols[s]));
    float im_q = (int8_t) (SCALE_BYTE_CONV_QAM64*cimagf(symbols[s]));

    int8_t *dst = llr + 6*s;

    dst[0] = -re_q;
    dst[1] = -im_q;
    dst[2] = abs(re_q)-4*SCALE_BYTE_CONV_QAM64/sqrt(42);
    dst[3] = abs(im_q)-4*SCALE_BYTE_CONV_QAM64/sqrt(42);
    // The last pair is derived from the stored (already byte-sized) values.
    dst[4] = abs(dst[2])-2*SCALE_BYTE_CONV_QAM64/sqrt(42);
    dst[5] = abs(dst[3])-2*SCALE_BYTE_CONV_QAM64/sqrt(42);
  }
}
Esempio n. 7
0
// Returns _mm_abs_epi8 applied to `a`.
//
// NOTE(review): the comment lines inside the body look like FileCheck
// directives matched against the compiler's IR output (the RUN line is not
// visible here, so confirm). They expect a call to the
// llvm.x86.ssse3.pabs.b.128 intrinsic returning <16 x i8>. Keep those lines
// byte-identical when editing this function.
__m128i test_mm_abs_epi8(__m128i a) {
  // CHECK-LABEL: test_mm_abs_epi8
  // CHECK: call <16 x i8> @llvm.x86.ssse3.pabs.b.128
  return _mm_abs_epi8(a);
}