/* Compute both the min and the max of an arbitrary-length array of 32-bit
   unsigned integers: a vectorized pass over the full groups of four,
   followed by a scalar pass over the remainder. */
void simdmaxmin_length(const uint32_t * in, uint32_t length, uint32_t * getmin,
                       uint32_t * getmax) {
    uint32_t lengthdividedby4 = length / 4;
    uint32_t offset = lengthdividedby4 * 4;
    uint32_t k;
    *getmin = 0xFFFFFFFF;
    *getmax = 0;
    if (lengthdividedby4 > 0) {
        const __m128i* pin = (const __m128i*)(in);
        __m128i minaccumulator = _mm_loadu_si128(pin);
        __m128i maxaccumulator = minaccumulator;
        uint32_t k = 1;
        for (; 4*k < lengthdividedby4 * 4; ++k) {
            __m128i newvec = _mm_loadu_si128(pin + k);
            minaccumulator = _mm_min_epu32(minaccumulator, newvec);
            maxaccumulator = _mm_max_epu32(maxaccumulator, newvec);
        }
        *getmin = minasint(minaccumulator);
        *getmax = maxasint(maxaccumulator);
    }
    /* scalar tail for the last length % 4 values */
    for (k = offset; k < length; ++k) {
        if (in[k] < *getmin) *getmin = in[k];
        if (in[k] > *getmax) *getmax = in[k];
    }
}
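/* Usage sketch (not from the original source): min/max of an array whose
   length is not a multiple of 4, so the scalar tail above does real work.
   Assumes the minasint()/maxasint() helpers shown later in this listing
   are in scope. */
#include <smmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint32_t data[11] = {9, 4, 7, 1, 8, 2, 6, 3, 5, 0, 10};
    uint32_t lo, hi;
    simdmaxmin_length(data, 11, &lo, &hi);
    printf("min %u max %u\n", lo, hi); /* prints: min 0 max 10 */
    return 0;
}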
/* GCC testsuite-style check of _mm_min_epu32 against a scalar reference.
   Setting the sign bit on three of every four src2 values makes the
   unsigned result differ from what a signed comparison (pminsd) would
   give, so the test actually distinguishes the two instructions.
   NUM and abort() are provided by the surrounding test harness. */
static void
sse4_1_test (void)
{
  union
    {
      __m128i x[NUM / 4];
      unsigned int i[NUM];
    } dst, src1, src2;
  int i;
  unsigned int min;

  for (i = 0; i < NUM; i++)
    {
      src1.i[i] = i * i;
      src2.i[i] = i + 20;
      if ((i % 4))
        src2.i[i] |= 0x80000000;
    }

  for (i = 0; i < NUM; i += 4)
    dst.x[i / 4] = _mm_min_epu32 (src1.x[i / 4], src2.x[i / 4]);

  for (i = 0; i < NUM; i++)
    {
      min = src1.i[i] >= src2.i[i] ? src2.i[i] : src1.i[i];
      if (min != dst.i[i])
        abort ();
    }
}
// Assuming that vInput1 and vInput2 are sorted, produces a sorted output
// going from vecMin all the way to vecMax.
// Developed originally for a merge sort using SIMD instructions.
// Standard merge. See, e.g., Inoue and Taura, "SIMD- and Cache-Friendly
// Algorithm for Sorting an Array of Structures".
static void sse_merge(__m128i *vInput1, __m128i *vInput2, // input 1 & 2
                      __m128i *vecMin, __m128i *vecMax) { // output
    __m128i vecTmp;
    vecTmp = _mm_min_epu32(*vInput1, *vInput2);
    *vecMax = _mm_max_epu32(*vInput1, *vInput2);
    vecTmp = _mm_alignr_epi8(vecTmp, vecTmp, 4);
    *vecMin = _mm_min_epu32(vecTmp, *vecMax);
    *vecMax = _mm_max_epu32(vecTmp, *vecMax);
    vecTmp = _mm_alignr_epi8(*vecMin, *vecMin, 4);
    *vecMin = _mm_min_epu32(vecTmp, *vecMax);
    *vecMax = _mm_max_epu32(vecTmp, *vecMax);
    vecTmp = _mm_alignr_epi8(*vecMin, *vecMin, 4);
    *vecMin = _mm_min_epu32(vecTmp, *vecMax);
    *vecMax = _mm_max_epu32(vecTmp, *vecMax);
    *vecMin = _mm_alignr_epi8(*vecMin, *vecMin, 4);
}
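/* Hedged usage sketch (not from the original source): drive sse_merge with
   two sorted 4-lane runs and read back one sorted 8-lane run.  Assumes
   SSE4.1 (smmintrin.h also pulls in _mm_alignr_epi8 from SSSE3). */
#include <smmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint32_t a[4] = {1, 3, 5, 7};
    uint32_t b[4] = {2, 4, 6, 8};
    __m128i va = _mm_loadu_si128((const __m128i *)a);
    __m128i vb = _mm_loadu_si128((const __m128i *)b);
    __m128i lo, hi;
    sse_merge(&va, &vb, &lo, &hi);
    uint32_t out[8];
    _mm_storeu_si128((__m128i *)out, lo);
    _mm_storeu_si128((__m128i *)(out + 4), hi);
    for (int i = 0; i < 8; ++i)
        printf("%u ", out[i]); /* expected: 1 2 3 4 5 6 7 8 */
    printf("\n");
    return 0;
}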
/* Minimum of one fixed-size block of SIMDBlockSize 32-bit unsigned integers. */
uint32_t simdmin(const uint32_t * in) {
    const __m128i* pin = (const __m128i*)(in);
    __m128i accumulator = _mm_loadu_si128(pin);
    uint32_t k = 1;
    for (; 4*k < SIMDBlockSize; ++k) {
        __m128i newvec = _mm_loadu_si128(pin + k);
        accumulator = _mm_min_epu32(accumulator, newvec);
    }
    return minasint(accumulator);
}
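/* Usage sketch, not from the original source.  Assumes SIMDBlockSize is 128
   (its value in the originating library) and that the minasint() helper
   shown near the end of this listing is in scope. */
#include <smmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint32_t block[128]; /* one full SIMD block */
    for (uint32_t i = 0; i < 128; ++i)
        block[i] = 1000u - i; /* minimum is block[127] == 873 */
    printf("min = %u\n", simdmin(block));
    return 0;
}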
void
_mesa_uint_array_min_max(const unsigned *ui_indices, unsigned *min_index,
                         unsigned *max_index, const unsigned count)
{
   unsigned max_ui = 0;
   unsigned min_ui = ~0U;
   unsigned i = 0;
   unsigned aligned_count = count;

   /* handle the first few values without SSE until the pointer is aligned */
   while (((uintptr_t)ui_indices & 15) && aligned_count) {
      if (*ui_indices > max_ui)
         max_ui = *ui_indices;
      if (*ui_indices < min_ui)
         min_ui = *ui_indices;

      aligned_count--;
      ui_indices++;
   }

   /* TODO: The actual threshold for SSE being useful may be higher than 8.
    * Some careful microbenchmarks and measurement are required to
    * find the actual tipping point.
    */
   if (aligned_count >= 8) {
      unsigned max_arr[4] __attribute__ ((aligned (16)));
      unsigned min_arr[4] __attribute__ ((aligned (16)));
      unsigned vec_count;
      __m128i max_ui4 = _mm_setzero_si128();
      __m128i min_ui4 = _mm_set1_epi32(~0U);
      __m128i ui_indices4;
      __m128i *ui_indices_ptr;

      vec_count = aligned_count & ~0x3;
      ui_indices_ptr = (__m128i *)ui_indices;

      for (i = 0; i < vec_count / 4; i++) {
         ui_indices4 = _mm_load_si128(&ui_indices_ptr[i]);
         max_ui4 = _mm_max_epu32(ui_indices4, max_ui4);
         min_ui4 = _mm_min_epu32(ui_indices4, min_ui4);
      }

      _mm_store_si128((__m128i *)max_arr, max_ui4);
      _mm_store_si128((__m128i *)min_arr, min_ui4);

      for (i = 0; i < 4; i++) {
         if (max_arr[i] > max_ui)
            max_ui = max_arr[i];
         if (min_arr[i] < min_ui)
            min_ui = min_arr[i];
      }
      i = vec_count;
   }

   /* handle the last few values with plain scalar code */
   for (; i < aligned_count; i++) {
      if (ui_indices[i] > max_ui)
         max_ui = ui_indices[i];
      if (ui_indices[i] < min_ui)
         min_ui = ui_indices[i];
   }

   *min_index = min_ui;
   *max_index = max_ui;
}
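/* Hypothetical driver (not part of the Mesa source): find the index range
   of a small index buffer.  The routine handles unaligned input itself. */
#include <smmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    unsigned indices[100], lo, hi;
    for (unsigned j = 0; j < 100; ++j)
        indices[j] = (j * 37u) % 91u;
    _mesa_uint_array_min_max(indices, &lo, &hi, 100);
    printf("min %u max %u\n", lo, hi); /* prints: min 0 max 90 */
    return 0;
}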
/* Compute both the min and the max of one block of SIMDBlockSize 32-bit
   unsigned integers. */
void simdmaxmin(const uint32_t * in, uint32_t * getmin, uint32_t * getmax) {
    const __m128i* pin = (const __m128i*)(in);
    __m128i minaccumulator = _mm_loadu_si128(pin);
    __m128i maxaccumulator = minaccumulator;
    uint32_t k = 1;
    for (; 4*k < SIMDBlockSize; ++k) {
        __m128i newvec = _mm_loadu_si128(pin + k);
        minaccumulator = _mm_min_epu32(minaccumulator, newvec);
        maxaccumulator = _mm_max_epu32(maxaccumulator, newvec);
    }
    *getmin = minasint(minaccumulator);
    *getmax = maxasint(maxaccumulator);
}
/* Minimum of an arbitrary-length array: vectorized pass over the full
   groups of four, then a scalar pass over the remainder. */
uint32_t simdmin_length(const uint32_t * in, uint32_t length) {
    uint32_t currentmin = 0xFFFFFFFF;
    uint32_t lengthdividedby4 = length / 4;
    uint32_t offset = lengthdividedby4 * 4;
    uint32_t k;
    if (lengthdividedby4 > 0) {
        const __m128i* pin = (const __m128i*)(in);
        __m128i accumulator = _mm_loadu_si128(pin);
        uint32_t k = 1;
        for (; 4*k < lengthdividedby4 * 4; ++k) {
            __m128i newvec = _mm_loadu_si128(pin + k);
            accumulator = _mm_min_epu32(accumulator, newvec);
        }
        currentmin = minasint(accumulator);
    }
    for (k = offset; k < length; ++k)
        if (in[k] < currentmin)
            currentmin = in[k];
    return currentmin;
}
ConnectedComponent::ConnectedComponent(
        std::array<char, 8> value,
        boost::shared_ptr<pixel_list_type> pixelList,
        pixel_list_type::const_iterator begin,
        pixel_list_type::const_iterator end) :
    _pixels(pixelList),
    _value(value),
    _boundingBox(0, 0, 0, 0),
    _center(0, 0),
    _centerDirty(true),
    _pixelRange(begin, end),
    _bitmapDirty(true) {

#ifdef __SSE4_1__
    // if there is at least one pixel
    if (begin != end) {

        unsigned int*__restrict__ pixels    = (unsigned int*)&*begin;
        unsigned int*__restrict__ pixelsEnd = (unsigned int*)&*end;

        // Prepare aligned, packed integer values.
        typedef union {
            __m128i      v;
            unsigned int a[4];
        } xmm_uints;

        enum {X1, Y1, X2, Y2};

        __attribute__((aligned(16))) xmm_uints mins1;
        __attribute__((aligned(16))) xmm_uints maxs1;

        mins1.a[X1] = begin->x(); maxs1.a[X1] = begin->x();
        mins1.a[Y1] = begin->y(); maxs1.a[Y1] = begin->y();

        // Iterate through pixelList until 16-byte alignment is reached.
        while (((std::uintptr_t) pixels % 16) != 0 && pixels < pixelsEnd) {
            unsigned int x = pixels[X1];
            unsigned int y = pixels[Y1];
            mins1.a[X1] = std::min(mins1.a[X1], x);
            mins1.a[Y1] = std::min(mins1.a[Y1], y);
            maxs1.a[X1] = std::max(maxs1.a[X1], x);
            maxs1.a[Y1] = std::max(maxs1.a[Y1], y);
            pixels += 2;
        }

        // Guaranteed to have at least 8 XMM registers, so use 4 for cumulative
        // values and 2 for vector values. (Using 8+4 of 16 registers on 64-bit
        // arch yields no performance improvement.)
        mins1.a[X2] = mins1.a[X1]; mins1.a[Y2] = mins1.a[Y1];
        maxs1.a[X2] = maxs1.a[X1]; maxs1.a[Y2] = maxs1.a[Y1];
        __m128i mins2 = mins1.v;
        __m128i maxs2 = maxs1.v;

        // Vectorized loop. Strides two packed integer vectors, each containing
        // both X and Y for two pixels.
        while (pixels < pixelsEnd - 8) {
            __m128i pixelPair1 = _mm_load_si128((__m128i*)pixels);
            __m128i pixelPair2 = _mm_load_si128((__m128i*)(pixels + 4));
            pixels += 8;

            // Hint compiler to iterate while loads stall.
            _mm_prefetch(pixels, _MM_HINT_T0);

            mins1.v = _mm_min_epu32(mins1.v, pixelPair1);
            maxs1.v = _mm_max_epu32(maxs1.v, pixelPair1);
            mins2   = _mm_min_epu32(mins2, pixelPair2);
            maxs2   = _mm_max_epu32(maxs2, pixelPair2);
        }

        // Combine stride results.
        mins1.v = _mm_min_epu32(mins1.v, mins2);
        maxs1.v = _mm_max_epu32(maxs1.v, maxs2);

        // Iterate through any remaining pixels.
        while (pixels < pixelsEnd) {
            unsigned int x = pixels[X1];
            unsigned int y = pixels[Y1];
            mins1.a[X1] = std::min(mins1.a[X1], x);
            mins1.a[Y1] = std::min(mins1.a[Y1], y);
            maxs1.a[X1] = std::max(maxs1.a[X1], x);
            maxs1.a[Y1] = std::max(maxs1.a[Y1], y);
            pixels += 2;
        }

        // Readout packed vectors, compare with remaining results, and store.
        _boundingBox.min().x() = (int)std::min(mins1.a[X1], mins1.a[X2]);
        _boundingBox.min().y() = (int)std::min(mins1.a[Y1], mins1.a[Y2]);
        _boundingBox.max().x() = (int)std::max(maxs1.a[X1], maxs1.a[X2]) + 1;
        _boundingBox.max().y() = (int)std::max(maxs1.a[Y1], maxs1.a[Y2]) + 1;
    }
    // (non-SSE4.1 fallback and the remainder of the constructor are not
    // part of this excerpt)
#endif
}
/* merge "s+s" elements and return sorted result in "dest" array TODO(d'b): replace magic numbers with macro */ inline void bitonic_merge_kernel16n(float *dest, float *a, uint32_t sa, float *b /* must not be reversed*/, uint32_t sb) { __m128 ma[4]; __m128 mb[4]; __m128 lo[4]; __m128 hi[4]; #define LOAD16(arg) \ mb[3] = _mm_load_ps(arg); \ mb[2] = _mm_load_ps(arg + 4); \ mb[1] = _mm_load_ps(arg + 8); \ mb[0] = _mm_load_ps(arg + 12); arg+=16 float *last_a = a + sa; float *last_b = b + sb; float *last_dest = dest + sa + sb; ma[0] = _mm_load_ps(a); a+=4; ma[1] = _mm_load_ps(a); a+=4; ma[2] = _mm_load_ps(a); a+=4; ma[3] = _mm_load_ps(a); a+=4; for(; dest < (last_dest - 16); dest += 16) { /* Load either a or b */ if(a < last_a) { if(b < last_b) { if(*((uint32_t*)a) < *((uint32_t*)b)) { LOAD16(a); } else { LOAD16(b); } } else { LOAD16(a); } } else { LOAD16(b); } /* Reverse *b */ mb[0] = _mm_shuffle_ps(mb[0], mb[0], 0x1b); mb[1] = _mm_shuffle_ps(mb[1], mb[1], 0x1b); mb[2] = _mm_shuffle_ps(mb[2], mb[2], 0x1b); mb[3] = _mm_shuffle_ps(mb[3], mb[3], 0x1b); lo[0] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(ma[0]), _mm_castps_si128(mb[0]))); hi[0] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(ma[0]), _mm_castps_si128(mb[0]))); lo[1] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(ma[1]), _mm_castps_si128(mb[1]))); hi[1] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(ma[1]), _mm_castps_si128(mb[1]))); lo[2] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(ma[2]), _mm_castps_si128(mb[2]))); hi[2] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(ma[2]), _mm_castps_si128(mb[2]))); lo[3] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(ma[3]), _mm_castps_si128(mb[3]))); hi[3] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(ma[3]), _mm_castps_si128(mb[3]))); _mm_store_ps(&dest[0], lo[0]); _mm_store_ps(&dest[4], lo[1]); _mm_store_ps(&dest[8], lo[2]); _mm_store_ps(&dest[12], lo[3]); _mm_store_ps(&dest[16], hi[2]); _mm_store_ps(&dest[20], hi[3]); _mm_store_ps(&dest[24], hi[0]); _mm_store_ps(&dest[28], hi[1]); bitonic_merge_kernel8core(dest, dest + 8); bitonic_merge_kernel8core(dest + 16, dest + 24); ma[0] = _mm_load_ps(&dest[16]); ma[1] = _mm_load_ps(&dest[20]); ma[2] = _mm_load_ps(&dest[24]); ma[3] = _mm_load_ps(&dest[28]); } }
/* merge 2 sorted arrays (8 elements each) to 1 sorted array
   return result (16 elements) in the same arrays
   TODO(d'b): replace magic numbers with macro */
inline void bitonic_merge_kernel8core(float *a, float *b /* must be reversed */) {
    __m128 map[2];
    __m128 mbp[2];
    __m128 lo[2];
    __m128 hi[2];

    map[0] = _mm_load_ps(a);
    mbp[0] = _mm_load_ps(b);
    map[1] = _mm_load_ps(a + 4);
    mbp[1] = _mm_load_ps(b + 4);

    lo[0] = _mm_castsi128_ps(_mm_min_epu32(_mm_castps_si128(map[0]), _mm_castps_si128(mbp[0])));
    hi[0] = _mm_castsi128_ps(_mm_max_epu32(_mm_castps_si128(map[0]), _mm_castps_si128(mbp[0])));
    lo[1] = _mm_castsi128_ps(_mm_min_epu32(_mm_castps_si128(map[1]), _mm_castps_si128(mbp[1])));
    hi[1] = _mm_castsi128_ps(_mm_max_epu32(_mm_castps_si128(map[1]), _mm_castps_si128(mbp[1])));

    map[0] = lo[0];
    map[1] = lo[1];
    mbp[0] = hi[0];
    mbp[1] = hi[1];

    /* L1 processing */
    lo[0] = _mm_castsi128_ps(_mm_min_epu32(_mm_castps_si128(map[0]), _mm_castps_si128(map[1])));
    lo[1] = _mm_castsi128_ps(_mm_max_epu32(_mm_castps_si128(map[0]), _mm_castps_si128(map[1])));
    hi[0] = _mm_castsi128_ps(_mm_min_epu32(_mm_castps_si128(mbp[0]), _mm_castps_si128(mbp[1])));
    hi[1] = _mm_castsi128_ps(_mm_max_epu32(_mm_castps_si128(mbp[0]), _mm_castps_si128(mbp[1])));

    map[0] = _mm_shuffle_ps(lo[0], lo[1], 0xe4);
    map[1] = _mm_shuffle_ps(lo[0], lo[1], 0x4e);
    mbp[0] = _mm_shuffle_ps(hi[0], hi[1], 0xe4);
    mbp[1] = _mm_shuffle_ps(hi[0], hi[1], 0x4e);

    /* L2 processing */
    lo[0] = _mm_castsi128_ps(_mm_min_epu32(_mm_castps_si128(map[0]), _mm_castps_si128(map[1])));
    lo[1] = _mm_castsi128_ps(_mm_max_epu32(_mm_castps_si128(map[0]), _mm_castps_si128(map[1])));
    hi[0] = _mm_castsi128_ps(_mm_min_epu32(_mm_castps_si128(mbp[0]), _mm_castps_si128(mbp[1])));
    hi[1] = _mm_castsi128_ps(_mm_max_epu32(_mm_castps_si128(mbp[0]), _mm_castps_si128(mbp[1])));

    map[0] = _mm_shuffle_ps(lo[0], lo[1], 0xd8);
    map[1] = _mm_shuffle_ps(lo[0], lo[1], 0x8d);
    mbp[0] = _mm_shuffle_ps(hi[0], hi[1], 0xd8);
    mbp[1] = _mm_shuffle_ps(hi[0], hi[1], 0x8d);

    /* L3 processing */
    lo[0] = _mm_castsi128_ps(_mm_min_epu32(_mm_castps_si128(map[0]), _mm_castps_si128(map[1])));
    lo[1] = _mm_castsi128_ps(_mm_max_epu32(_mm_castps_si128(map[0]), _mm_castps_si128(map[1])));
    hi[0] = _mm_castsi128_ps(_mm_min_epu32(_mm_castps_si128(mbp[0]), _mm_castps_si128(mbp[1])));
    hi[1] = _mm_castsi128_ps(_mm_max_epu32(_mm_castps_si128(mbp[0]), _mm_castps_si128(mbp[1])));

    map[0] = _mm_shuffle_ps(lo[1], lo[0], 0x88);
    map[1] = _mm_shuffle_ps(lo[1], lo[0], 0xdd);
    mbp[0] = _mm_shuffle_ps(hi[1], hi[0], 0x88);
    mbp[1] = _mm_shuffle_ps(hi[1], hi[0], 0xdd);

    map[0] = _mm_shuffle_ps(map[0], map[0], 0x72);
    map[1] = _mm_shuffle_ps(map[1], map[1], 0x72);
    mbp[0] = _mm_shuffle_ps(mbp[0], mbp[0], 0x72);
    mbp[1] = _mm_shuffle_ps(mbp[1], mbp[1], 0x72);

    _mm_store_ps(&a[0], map[0]);
    _mm_store_ps(&a[4], map[1]);
    _mm_store_ps(&b[0], mbp[0]);
    _mm_store_ps(&b[4], mbp[1]);

    CHECK_RAWS(a, b, 8);
}
/* elements are given in 2 arrays (4 and 4), result will be returned
   in the same arrays with a straight order */
inline void bitonic_sort_kernel4(float *a, float *b) {
    __m128 ma;
    __m128 mb;
    __m128 map;
    __m128 mbp;
    __m128 lo;
    __m128 hi;

    /* load 8 elements to sse registers */
    ma = _mm_load_ps(a);
    mb = _mm_load_ps(b);

    /* in-register sort */
    map = _mm_shuffle_ps(ma, mb, _MM_SHUFFLE(2, 0, 2, 0)); /* 0x88 */
    mbp = _mm_shuffle_ps(ma, mb, _MM_SHUFFLE(3, 1, 3, 1)); /* 0xdd */
    lo = _mm_castsi128_ps(_mm_min_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp)));
    hi = _mm_castsi128_ps(_mm_max_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp)));

    map = _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 2, 0)); /* 0xd8 */
    mbp = _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 3, 1)); /* 0x8d */
    lo = _mm_castsi128_ps(_mm_min_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp)));
    hi = _mm_castsi128_ps(_mm_max_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp)));

    map = _mm_shuffle_ps(lo, lo, _MM_SHUFFLE(3, 1, 2, 0)); /* 0xd8 */
    mbp = _mm_shuffle_ps(hi, hi, _MM_SHUFFLE(1, 3, 0, 2)); /* 0x72 */
    lo = _mm_castsi128_ps(_mm_min_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp)));
    hi = _mm_castsi128_ps(_mm_max_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp)));

    map = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(1, 0, 0, 1)); /* 0x41 */
    mbp = _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 2, 2, 3)); /* 0xeb */
    lo = _mm_castsi128_ps(_mm_min_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp)));
    hi = _mm_castsi128_ps(_mm_max_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp)));

    map = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(3, 2, 1, 0)); /* 0xe4 */
    mbp = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(1, 0, 3, 2)); /* 0x4e */
    lo = _mm_castsi128_ps(_mm_min_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp)));
    hi = _mm_castsi128_ps(_mm_max_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp)));

    map = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(3, 1, 2, 0)); /* 0xd8 */
    mbp = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(2, 0, 3, 1)); /* 0x8d */
    lo = _mm_castsi128_ps(_mm_min_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp)));
    hi = _mm_castsi128_ps(_mm_max_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp)));

    map = _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0)); /* 0x88 */
    mbp = _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1)); /* 0xdd */
    map = _mm_shuffle_ps(map, map, _MM_SHUFFLE(1, 3, 0, 2)); /* 0x72 */
    mbp = _mm_shuffle_ps(mbp, mbp, _MM_SHUFFLE(1, 3, 0, 2)); /* 0x72 */

    /* unload sorted elements to memory */
    _mm_store_ps(a, map);
    _mm_store_ps(b, mbp);

    CHECK_RAWS(a, b, 4);
}
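/* Added note, not from the original source: the three kernels above order
   float data by comparing raw bit patterns with _mm_min_epu32/_mm_max_epu32.
   For IEEE-754 floats the unsigned-integer order matches the numeric order
   only when all inputs are non-negative; a negative float has its sign bit
   set and compares as a huge unsigned value.  Minimal illustration: */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

int main(void) {
    float f[3] = {1.5f, 2.5f, -1.0f};
    uint32_t u[3];
    memcpy(u, f, sizeof f); /* reinterpret the float bit patterns */
    printf("%d\n", u[0] < u[1]); /* 1: order preserved for non-negative floats */
    printf("%d\n", u[2] < u[0]); /* 0: -1.0f compares as a huge unsigned value */
    return 0;
}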
/* Horizontal minimum: reduce the four 32-bit lanes of `accumulator` to a
   single scalar. */
static uint32_t minasint(const __m128i accumulator) {
    /* min of (A,B,C,D) and its 8-byte shift (C,D,0,0): lane 0 holds min(A,C),
       lane 1 holds min(B,D) */
    const __m128i _tmp1 = _mm_min_epu32(_mm_srli_si128(accumulator, 8), accumulator);
    /* min with a further 4-byte shift: lane 0 now holds min(A,B,C,D) */
    const __m128i _tmp2 = _mm_min_epu32(_mm_srli_si128(_tmp1, 4), _tmp1);
    return _mm_cvtsi128_si32(_tmp2);
}
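/* The maxasint() helper used by simdmaxmin()/simdmaxmin_length() above is
   not part of this excerpt.  A sketch under the assumption that it mirrors
   minasint() with _mm_max_epu32; the zero lanes introduced by the byte
   shifts are harmless here, since unsigned max ignores them: */
static uint32_t maxasint(const __m128i accumulator) {
    const __m128i _tmp1 = _mm_max_epu32(_mm_srli_si128(accumulator, 8), accumulator);
    const __m128i _tmp2 = _mm_max_epu32(_mm_srli_si128(_tmp1, 4), _tmp1);
    return _mm_cvtsi128_si32(_tmp2);
}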
/* clang FileCheck test (older variant): the intrinsic is expected to lower
   to the target-specific @llvm.x86.sse41.pminud builtin. */
__m128i test_mm_min_epu32(__m128i x, __m128i y) {
  // CHECK-LABEL: test_mm_min_epu32
  // CHECK: call <4 x i32> @llvm.x86.sse41.pminud
  // CHECK-ASM: pminud %xmm{{.*}}, %xmm{{.*}}
  return _mm_min_epu32(x, y);
}
/* Later variant of the same clang test: the intrinsic now lowers to generic
   icmp/select IR instead of a target-specific builtin, giving the optimizer
   more freedom while still selecting down to pminud. */
__m128i test_mm_min_epu32(__m128i x, __m128i y) {
  // CHECK-LABEL: test_mm_min_epu32
  // CHECK: [[CMP:%.*]] = icmp ult <4 x i32> [[X:%.*]], [[Y:%.*]]
  // CHECK-NEXT: select <4 x i1> [[CMP]], <4 x i32> [[X]], <4 x i32> [[Y]]
  return _mm_min_epu32(x, y);
}
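/* For reference, a portable scalar equivalent of the lane-wise operation the
   two tests above check for (illustrative, not from the test suite): */
#include <stdint.h>

static inline void min_epu32_scalar(const uint32_t a[4], const uint32_t b[4],
                                    uint32_t out[4]) {
    for (int i = 0; i < 4; ++i)
        out[i] = (a[i] < b[i]) ? a[i] : b[i]; /* unsigned per-lane minimum */
}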