int32_t sse_sadbw_unrolled4_sumsignedbytes(int8_t* array, size_t size) { const __m128i zero = _mm_setzero_si128(); __m128i positive = zero; __m128i negative = zero; for (size_t i=0; i < size; i += 16*4) { const __m128i v0 = _mm_loadu_si128((__m128i*)(array + i + 0*16)); const __m128i v1 = _mm_loadu_si128((__m128i*)(array + i + 1*16)); const __m128i v2 = _mm_loadu_si128((__m128i*)(array + i + 2*16)); const __m128i v3 = _mm_loadu_si128((__m128i*)(array + i + 3*16)); { const __m128i v = v0; const __m128i m = _mm_cmplt_epi8(v, zero); const __m128i t0 = _mm_sad_epu8(_mm_andnot_si128(m, v), zero); const __m128i va = _mm_abs_epi8(v); const __m128i t1 = _mm_sad_epu8(_mm_and_si128(m, va), zero); positive = _mm_add_epi32(positive, t0); negative = _mm_sub_epi32(negative, t1); } { const __m128i v = v1; const __m128i m = _mm_cmplt_epi8(v, zero); const __m128i t0 = _mm_sad_epu8(_mm_andnot_si128(m, v), zero); const __m128i va = _mm_abs_epi8(v); const __m128i t1 = _mm_sad_epu8(_mm_and_si128(m, va), zero); positive = _mm_add_epi32(positive, t0); negative = _mm_sub_epi32(negative, t1); } { const __m128i v = v2; const __m128i m = _mm_cmplt_epi8(v, zero); const __m128i t0 = _mm_sad_epu8(_mm_andnot_si128(m, v), zero); const __m128i va = _mm_abs_epi8(v); const __m128i t1 = _mm_sad_epu8(_mm_and_si128(m, va), zero); positive = _mm_add_epi32(positive, t0); negative = _mm_sub_epi32(negative, t1); } { const __m128i v = v3; const __m128i m = _mm_cmplt_epi8(v, zero); const __m128i t0 = _mm_sad_epu8(_mm_andnot_si128(m, v), zero); const __m128i va = _mm_abs_epi8(v); const __m128i t1 = _mm_sad_epu8(_mm_and_si128(m, va), zero); positive = _mm_add_epi32(positive, t0); negative = _mm_sub_epi32(negative, t1); } } const __m128i accumulator = _mm_add_epi32(positive, negative); return int32_t(_mm_extract_epi32(accumulator, 0)) + int32_t(_mm_extract_epi32(accumulator, 2)); }
// Clang IR-generation lit test: verifies that _mm_abs_epi8 is lowered to a
// generic sub/icmp/select sequence (not an x86 intrinsic call), which the
// FileCheck directives below match against the emitted LLVM IR.
// The directive lines must not be edited — the test harness parses them.
__m128i test_mm_abs_epi8(__m128i a) {
  // CHECK-LABEL: test_mm_abs_epi8
  // CHECK: [[SUB:%.+]] = sub <16 x i8> zeroinitializer, [[A:%.+]]
  // CHECK: [[CMP:%.+]] = icmp sgt <16 x i8> [[A]], zeroinitializer
  // CHECK: %{{.*}} = select <16 x i1> [[CMP]], <16 x i8> [[A]], <16 x i8> [[SUB]]
  return _mm_abs_epi8(a);
}
int32_t sse_sadbw_sumsignedbytes(int8_t* array, size_t size) { const __m128i zero = _mm_setzero_si128(); __m128i positive = zero; __m128i negative = zero; for (size_t i=0; i < size; i += 16) { const __m128i v = _mm_loadu_si128((__m128i*)(array + i)); const __m128i m = _mm_cmplt_epi8(v, zero); const __m128i va = _mm_abs_epi8(v); // sum just positive numbers const __m128i t0 = _mm_sad_epu8(_mm_andnot_si128(m, v), zero); // sum just negative numbers const __m128i t1 = _mm_sad_epu8(_mm_and_si128(m, va), zero); positive = _mm_add_epi32(positive, t0); negative = _mm_sub_epi32(negative, t1); } const __m128i accumulator = _mm_add_epi32(positive, negative); return int32_t(_mm_extract_epi32(accumulator, 0)) + int32_t(_mm_extract_epi32(accumulator, 2)); }
// Compile/link smoke test: exercises the SSSE3 intrinsics _mm_abs_epi8 and
// _mm_sign_epi16 so a build failure (missing header, wrong target flags)
// surfaces here. The volatile qualifiers keep the optimizer from deleting
// the otherwise-unused values; no result is checked.
int main(int, char**) {
  volatile __m128i a = _mm_set1_epi32(42);
  _mm_abs_epi8(a);
  volatile __m128i result = _mm_sign_epi16(a, _mm_set1_epi32(64));
  (void)result;  // silence unused-variable warnings
  return 0;
}
/* Soft demodulator for 16-QAM LTE symbols, SSE/SSSE3 path, 8-bit LLR output.
 *
 * symbols:  input constellation points; accessed as interleaved re/im floats
 *           (cf_t is presumably a complex float — TODO confirm against the
 *           project typedef). Must be 16-byte aligned (_mm_load_ps is used).
 * llr:      output, 4 LLR bytes per symbol (llr[4*i+0..3]); must be 16-byte
 *           aligned (_mm_store_si128 is used).
 * nsymbols: symbol count; the vector loop handles 8 symbols per iteration,
 *           a scalar tail loop covers the remainder.
 *
 * NOTE(review): offset is a double expression narrowed to a signed byte by
 * _mm_set1_epi8 — assumes SCALE_BYTE_CONV_QAM16 keeps it in int8_t range;
 * verify against the macro definition. */
void demod_16qam_lte_b_sse(const cf_t *symbols, int8_t *llr, int nsymbols) {
  float *symbolsPtr = (float*) symbols;
  __m128i *resultPtr = (__m128i*) llr;
  __m128 symbol1, symbol2, symbol3, symbol4;
  __m128i symbol_i1, symbol_i2, symbol_i3, symbol_i4, symbol_i, symbol_abs, symbol_12, symbol_34;
  /* Decision threshold 2/sqrt(10) in fixed-point byte units. */
  __m128i offset = _mm_set1_epi8(2*SCALE_BYTE_CONV_QAM16/sqrt(10));
  __m128i result1n, result1a, result2n, result2a;
  /* Negative scale folds the final sign flip into the float->int conversion. */
  __m128 scale_v = _mm_set1_ps(-SCALE_BYTE_CONV_QAM16);
  /* pshufb tables interleaving sign-based and magnitude-based LLR bytes into
   * output order; 0xff entries (high bit set) yield zero bytes, which the
   * later OR fills from the complementary table. */
  __m128i shuffle_negated_1 = _mm_set_epi8(0xff,0xff,7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0);
  __m128i shuffle_abs_1 = _mm_set_epi8(7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0,0xff,0xff);
  __m128i shuffle_negated_2 = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8);
  __m128i shuffle_abs_2 = _mm_set_epi8(15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8,0xff,0xff);
  for (int i=0;i<nsymbols/8;i++) {
    /* Load 8 complex symbols (16 floats). */
    symbol1 = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    symbol2 = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    symbol3 = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    symbol4 = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    /* Scale (and negate) then round to int32. */
    symbol_i1 = _mm_cvtps_epi32(_mm_mul_ps(symbol1, scale_v));
    symbol_i2 = _mm_cvtps_epi32(_mm_mul_ps(symbol2, scale_v));
    symbol_i3 = _mm_cvtps_epi32(_mm_mul_ps(symbol3, scale_v));
    symbol_i4 = _mm_cvtps_epi32(_mm_mul_ps(symbol4, scale_v));
    /* Saturating narrow: int32 -> int16 -> int8, 16 bytes in symbol_i. */
    symbol_12 = _mm_packs_epi32(symbol_i1, symbol_i2);
    symbol_34 = _mm_packs_epi32(symbol_i3, symbol_i4);
    symbol_i = _mm_packs_epi16(symbol_12, symbol_34);
    /* Inner-bit LLRs: |y| - offset. */
    symbol_abs = _mm_abs_epi8(symbol_i);
    symbol_abs = _mm_sub_epi8(symbol_abs, offset);
    /* Interleave the two LLR families into output order and merge. */
    result1n = _mm_shuffle_epi8(symbol_i, shuffle_negated_1);
    result1a = _mm_shuffle_epi8(symbol_abs, shuffle_abs_1);
    result2n = _mm_shuffle_epi8(symbol_i, shuffle_negated_2);
    result2a = _mm_shuffle_epi8(symbol_abs, shuffle_abs_2);
    _mm_store_si128(resultPtr, _mm_or_si128(result1n, result1a)); resultPtr++;
    _mm_store_si128(resultPtr, _mm_or_si128(result2n, result2a)); resultPtr++;
  }
  // Demodulate last symbols
  for (int i=8*(nsymbols/8);i<nsymbols;i++) {
    short yre = (int8_t) (SCALE_BYTE_CONV_QAM16*crealf(symbols[i]));
    short yim = (int8_t) (SCALE_BYTE_CONV_QAM16*cimagf(symbols[i]));
    llr[4*i+0] = -yre;
    llr[4*i+1] = -yim;
    /* Implicit double -> int8_t truncation on assignment, matching the
     * fixed-point convention of the vector path. */
    llr[4*i+2] = abs(yre)-2*SCALE_BYTE_CONV_QAM16/sqrt(10);
    llr[4*i+3] = abs(yim)-2*SCALE_BYTE_CONV_QAM16/sqrt(10);
  }
}
/* Soft demodulator for 64-QAM LTE symbols, SSE/SSSE3 path, 8-bit LLR output.
 *
 * symbols:  input constellation points; accessed as interleaved re/im floats
 *           (cf_t is presumably a complex float — TODO confirm against the
 *           project typedef). Must be 16-byte aligned (_mm_load_ps).
 * llr:      output, 6 LLR bytes per symbol (llr[6*i+0..5]); must be 16-byte
 *           aligned (_mm_store_si128).
 * nsymbols: symbol count; vector loop covers 8 symbols per iteration,
 *           scalar tail loop the remainder.
 *
 * Same structure as the 16-QAM variant, with a second magnitude stage
 * (symbol_abs2) for the third bit pair and thresholds 4/sqrt(42), 2/sqrt(42). */
void demod_64qam_lte_b_sse(const cf_t *symbols, int8_t *llr, int nsymbols) {
  float *symbolsPtr = (float*) symbols;
  __m128i *resultPtr = (__m128i*) llr;
  __m128 symbol1, symbol2, symbol3, symbol4;
  __m128i symbol_i1, symbol_i2, symbol_i3, symbol_i4, symbol_i, symbol_abs, symbol_abs2,symbol_12, symbol_34;
  /* Decision thresholds in fixed-point byte units (double narrowed to int8_t
   * by _mm_set1_epi8 — assumes SCALE_BYTE_CONV_QAM64 keeps them in range). */
  __m128i offset1 = _mm_set1_epi8(4*SCALE_BYTE_CONV_QAM64/sqrt(42));
  __m128i offset2 = _mm_set1_epi8(2*SCALE_BYTE_CONV_QAM64/sqrt(42));
  /* Negative scale folds the sign flip into the float->int conversion. */
  __m128 scale_v = _mm_set1_ps(-SCALE_BYTE_CONV_QAM64);
  __m128i result11, result12, result13, result22, result21,result23, result31, result32, result33;
  /* pshufb tables spreading each LLR family to every third byte pair of the
   * three output vectors; 0xff entries (high bit set) yield zero bytes that
   * the final three-way OR fills from the other tables. */
  __m128i shuffle_negated_1 = _mm_set_epi8(0xff,0xff,5,4,0xff,0xff,0xff,0xff,3,2,0xff,0xff,0xff,0xff,1,0);
  __m128i shuffle_negated_2 = _mm_set_epi8(11,10,0xff,0xff,0xff,0xff,9,8,0xff,0xff,0xff,0xff,7,6,0xff,0xff);
  __m128i shuffle_negated_3 = _mm_set_epi8(0xff,0xff,0xff,0xff,15,14,0xff,0xff,0xff,0xff,13,12,0xff,0xff,0xff,0xff);
  __m128i shuffle_abs_1 = _mm_set_epi8(5,4,0xff,0xff,0xff,0xff,3,2,0xff,0xff,0xff,0xff,1,0,0xff,0xff);
  __m128i shuffle_abs_2 = _mm_set_epi8(0xff,0xff,0xff,0xff,9,8,0xff,0xff,0xff,0xff,7,6,0xff,0xff,0xff,0xff);
  __m128i shuffle_abs_3 = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,0xff,0xff,13,12,0xff,0xff,0xff,0xff,11,10);
  __m128i shuffle_abs2_1 = _mm_set_epi8(0xff,0xff,0xff,0xff,3,2,0xff,0xff,0xff,0xff,1,0,0xff,0xff,0xff,0xff);
  __m128i shuffle_abs2_2 = _mm_set_epi8(0xff,0xff,9,8,0xff,0xff,0xff,0xff,7,6,0xff,0xff,0xff,0xff,5,4);
  __m128i shuffle_abs2_3 = _mm_set_epi8(15,14,0xff,0xff,0xff,0xff,13,12,0xff,0xff,0xff,0xff,11,10,0xff,0xff);
  for (int i=0;i<nsymbols/8;i++) {
    /* Load 8 complex symbols (16 floats). */
    symbol1 = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    symbol2 = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    symbol3 = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    symbol4 = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    /* Scale (and negate), round to int32. */
    symbol_i1 = _mm_cvtps_epi32(_mm_mul_ps(symbol1, scale_v));
    symbol_i2 = _mm_cvtps_epi32(_mm_mul_ps(symbol2, scale_v));
    symbol_i3 = _mm_cvtps_epi32(_mm_mul_ps(symbol3, scale_v));
    symbol_i4 = _mm_cvtps_epi32(_mm_mul_ps(symbol4, scale_v));
    /* Saturating narrow: int32 -> int16 -> int8. */
    symbol_12 = _mm_packs_epi32(symbol_i1, symbol_i2);
    symbol_34 = _mm_packs_epi32(symbol_i3, symbol_i4);
    symbol_i = _mm_packs_epi16(symbol_12, symbol_34);
    /* Second bit pair: |y| - 4/sqrt(42); third: ||y|-4/sqrt(42)| - 2/sqrt(42). */
    symbol_abs = _mm_abs_epi8(symbol_i);
    symbol_abs = _mm_sub_epi8(symbol_abs, offset1);
    symbol_abs2 = _mm_sub_epi8(_mm_abs_epi8(symbol_abs), offset2);
    /* Spread the three LLR families into the three output vectors and merge. */
    result11 = _mm_shuffle_epi8(symbol_i, shuffle_negated_1);
    result12 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_1);
    result13 = _mm_shuffle_epi8(symbol_abs2, shuffle_abs2_1);
    result21 = _mm_shuffle_epi8(symbol_i, shuffle_negated_2);
    result22 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_2);
    result23 = _mm_shuffle_epi8(symbol_abs2, shuffle_abs2_2);
    result31 = _mm_shuffle_epi8(symbol_i, shuffle_negated_3);
    result32 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_3);
    result33 = _mm_shuffle_epi8(symbol_abs2, shuffle_abs2_3);
    _mm_store_si128(resultPtr, _mm_or_si128(_mm_or_si128(result11, result12),result13)); resultPtr++;
    _mm_store_si128(resultPtr, _mm_or_si128(_mm_or_si128(result21, result22),result23)); resultPtr++;
    _mm_store_si128(resultPtr, _mm_or_si128(_mm_or_si128(result31, result32),result33)); resultPtr++;
  }
  /* Scalar tail for the remaining nsymbols % 8 symbols.
   * NOTE(review): yre/yim are float here (the 16-QAM variant uses short), so
   * abs() gets a float argument — in C that implicitly truncates to int
   * before taking the absolute value. Likely intended to mirror the 16-QAM
   * short-based code; confirm before changing. */
  for (int i=8*(nsymbols/8);i<nsymbols;i++) {
    float yre = (int8_t) (SCALE_BYTE_CONV_QAM64*crealf(symbols[i]));
    float yim = (int8_t) (SCALE_BYTE_CONV_QAM64*cimagf(symbols[i]));
    llr[6*i+0] = -yre;
    llr[6*i+1] = -yim;
    llr[6*i+2] = abs(yre)-4*SCALE_BYTE_CONV_QAM64/sqrt(42);
    llr[6*i+3] = abs(yim)-4*SCALE_BYTE_CONV_QAM64/sqrt(42);
    llr[6*i+4] = abs(llr[6*i+2])-2*SCALE_BYTE_CONV_QAM64/sqrt(42);
    llr[6*i+5] = abs(llr[6*i+3])-2*SCALE_BYTE_CONV_QAM64/sqrt(42);
  }
}
// Clang CodeGen lit test: verifies that _mm_abs_epi8 lowers to a call to the
// x86 SSSE3 pabsb builtin in the emitted LLVM IR, matched by the FileCheck
// directives below. The directive lines must not be edited — the test
// harness parses them.
__m128i test_mm_abs_epi8(__m128i a) {
  // CHECK-LABEL: test_mm_abs_epi8
  // CHECK: call <16 x i8> @llvm.x86.ssse3.pabs.b.128
  return _mm_abs_epi8(a);
}