/*
 * Runtime check of _mm_min_epi8 (packed signed 8-bit minimum).
 *
 * Fills two NUM-byte buffers with values of alternating sign, computes the
 * packed minimum 16 bytes at a time, then recomputes every lane in scalar
 * code and calls abort () on the first lane that disagrees.
 *
 * The byte view is `signed char`, not plain `char`: the signedness of plain
 * `char` is implementation-defined, and the scalar reference must compare
 * the lanes as signed values to agree with the intrinsic.
 */
static void
sse4_1_test (void)
{
  union
    {
      __m128i x[NUM / 16];
      signed char i[NUM];
    } dst, src1, src2;
  int i, sign = 1;
  signed char min;

  /* Populate the inputs; each int expression is narrowed to one byte on
     assignment, and the sign flips on every element.  */
  for (i = 0; i < NUM; i++)
    {
      src1.i[i] = i * i * sign;
      src2.i[i] = (i + 20) * sign;
      sign = -sign;
    }

  /* Vector pass: one intrinsic call per 16-byte chunk.  */
  for (i = 0; i < NUM; i += 16)
    {
      dst.x[i / 16] = _mm_min_epi8 (src1.x[i / 16], src2.x[i / 16]);
    }

  /* Scalar pass: lane-by-lane signed minimum must match the vector result.  */
  for (i = 0; i < NUM; i++)
    {
      min = src1.i[i] >= src2.i[i] ? src2.i[i] : src1.i[i];
      if (min != dst.i[i])
        {
          abort ();
        }
    }
}
void nibble_sort_tom(unsigned long *buf) { for (int i = 0; i < TEST_SIZE; ++i) { __m128i x = _mm_and_si128(_mm_set_epi64x(buf[i] >> 4, buf[i]), g_mask); x = S(x, 0); x = S(x, 1); x = S(x, 0); x = S(x, 2); x = S(x, 3); x = S(x, 0); x = S(x, 4); x = S(x, 5); x = S(x, 3); /* Final step is different; the output is in the right layout * for reassembling for the final write. */ const __m128i a0 = _mm_shuffle_epi8(x, g_shuffles[0][0]); const __m128i b0 = _mm_shuffle_epi8(x, g_shuffles[0][1]); const __m128i a1 = _mm_min_epi8(a0, b0); const __m128i b1 = _mm_max_epi8(a0, b0); const __m128i out = _mm_or_si128(a1, _mm_slli_epi64(b1, 4)); _mm_storel_epi64((__m128i *)&buf[i], out); } }
/*
 * One min/max step driven by row i of g_shuffles.
 *
 * Entries [i][0] and [i][1] select the two operand arrangements of x.
 * Their byte-wise signed minimum is rearranged by entry [i][2] and their
 * byte-wise signed maximum by entry [i][3]; the two rearranged vectors are
 * OR-ed together and returned.  (Presumably one layer of a sorting network;
 * the exact pairing depends on the contents of g_shuffles.)
 */
static __m128i S(__m128i x, int i)
{
    const __m128i lhs = _mm_shuffle_epi8(x, g_shuffles[i][0]);
    const __m128i rhs = _mm_shuffle_epi8(x, g_shuffles[i][1]);

    const __m128i placed_min =
        _mm_shuffle_epi8(_mm_min_epi8(lhs, rhs), g_shuffles[i][2]);
    const __m128i placed_max =
        _mm_shuffle_epi8(_mm_max_epi8(lhs, rhs), g_shuffles[i][3]);

    return _mm_or_si128(placed_min, placed_max);
}
/*
 * Codegen test for _mm_min_epi8: returns the intrinsic's result for x and y.
 *
 * The directive comments inside the body are consumed by the FileCheck-based
 * test harness and must stay verbatim. They expect a call to the
 * llvm.x86.sse41.pminsb intrinsic in the IR and a pminsb instruction on xmm
 * registers in the assembly output.
 *
 * Each directive needs its own line: a `//` comment runs to the end of the
 * line, so when they share a line with the code, the return statement and
 * the closing brace are commented out.
 */
__m128i test_mm_min_epi8(__m128i x, __m128i y) {
  // CHECK-LABEL: test_mm_min_epi8
  // CHECK: call <16 x i8> @llvm.x86.sse41.pminsb
  // CHECK-ASM: pminsb %xmm{{.*}}, %xmm{{.*}}
  return _mm_min_epi8(x, y);
}
/*
 * Codegen test for _mm_min_epi8: returns the intrinsic's result for x and y.
 *
 * The directive comments inside the body are consumed by the FileCheck-based
 * test harness and must stay verbatim. They expect the intrinsic to lower to
 * a signed less-than compare on <16 x i8> immediately followed by a select
 * between the same two operands.
 *
 * Each directive needs its own line: a `//` comment runs to the end of the
 * line, so when they share a line with the code, the return statement and
 * the closing brace are commented out.
 */
__m128i test_mm_min_epi8(__m128i x, __m128i y) {
  // CHECK-LABEL: test_mm_min_epi8
  // CHECK: [[CMP:%.*]] = icmp slt <16 x i8> [[X:%.*]], [[Y:%.*]]
  // CHECK-NEXT: select <16 x i1> [[CMP]], <16 x i8> [[X]], <16 x i8> [[Y]]
  return _mm_min_epi8(x, y);
}