Example No. 1
void simdmaxmin_length(const uint32_t * in, uint32_t length, uint32_t * getmin, uint32_t * getmax) {
	uint32_t lengthdividedby4 = length / 4;
	uint32_t offset = lengthdividedby4 * 4;
	uint32_t k;
	*getmin = 0xFFFFFFFF;
	*getmax = 0;
	if (lengthdividedby4 > 0) {
		const __m128i* pin = (const __m128i*)(in);
		__m128i minaccumulator = _mm_loadu_si128(pin);
		__m128i maxaccumulator = minaccumulator;
		k = 1; /* vector 0 already loaded into the accumulators */
		for (; k < lengthdividedby4; ++k) {
			__m128i newvec = _mm_loadu_si128(pin+k);
			minaccumulator = _mm_min_epu32(minaccumulator,newvec);
			maxaccumulator = _mm_max_epu32(maxaccumulator,newvec);
		}
		*getmin = minasint(minaccumulator);
		*getmax = maxasint(maxaccumulator);
	}
	for (k = offset; k < length; ++k) {
		if (in[k] < *getmin)
			*getmin = in[k];
		if (in[k] > *getmax)
			*getmax = in[k];
	}
}
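The function above pairs a vectorized body with a scalar tail for the final length % 4 values. A minimal usage sketch, assuming the minasint/maxasint horizontal reductions (see Example No. 12) are in the same translation unit and the file is compiled with -msse4.1:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    /* 11 values: the first 8 go through the SIMD loop, the last 3 through the scalar tail */
    uint32_t data[11] = {7, 3, 9, 12, 5, 42, 1, 8, 30, 2, 17};
    uint32_t lo, hi;
    simdmaxmin_length(data, 11, &lo, &hi);
    printf("min=%u max=%u\n", lo, hi); /* expect min=1 max=42 */
    return 0;
}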
Example No. 2
static void
sse4_1_test (void)
{
  union
    {
      __m128i x[NUM / 4];
      unsigned int i[NUM];
    } dst, src1, src2;
  int i;
  unsigned int min;

  for (i = 0; i < NUM; i++)
    {
      src1.i[i] = i * i;
      src2.i[i] = i + 20;
      if ((i % 4))
	src2.i[i] |= 0x80000000;
    }

  for (i = 0; i < NUM; i += 4)
    dst.x[i / 4] = _mm_min_epu32 (src1.x[i / 4], src2.x[i / 4]);

  for (i = 0; i < NUM; i++)
    {
      min = src1.i[i] >= src2.i[i] ? src2.i[i] : src1.i[i];
      if (min != dst.i[i])
	abort ();
    }
}
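Note that the test ORs 0x80000000 into three of every four src2 lanes: values with the top bit set are exactly where unsigned and signed 32-bit ordering disagree, so a build that wrongly lowered to the signed pminsd would fail the check and abort. A small sketch of the difference (both intrinsics are SSE4.1):

#include <smmintrin.h>
#include <stdio.h>

int main(void) {
    __m128i a = _mm_set1_epi32(1);
    __m128i b = _mm_set1_epi32((int)0x80000000u); /* 2147483648 unsigned, INT_MIN signed */
    unsigned u = (unsigned)_mm_cvtsi128_si32(_mm_min_epu32(a, b)); /* pminud picks 1 */
    unsigned s = (unsigned)_mm_cvtsi128_si32(_mm_min_epi32(a, b)); /* pminsd picks 0x80000000 */
    printf("unsigned min: %u\nsigned min: 0x%x\n", u, s);
    return 0;
}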
Example No. 3
// Assuming that vInput1 and vInput2 are each sorted, produces a sorted output spanning vecMin through vecMax.
// Developed originally for a merge sort using SIMD instructions.
// Standard merge network. See, e.g., Inoue and Taura, "SIMD- and Cache-Friendly Algorithm for Sorting an Array of Structures".
static void sse_merge(__m128i *vInput1, __m128i *vInput2, // input 1 & 2
                      __m128i *vecMin, __m128i *vecMax) { // output
    __m128i vecTmp;
    vecTmp = _mm_min_epu32(*vInput1, *vInput2);
    *vecMax = _mm_max_epu32(*vInput1, *vInput2);
    vecTmp = _mm_alignr_epi8(vecTmp, vecTmp, 4);
    *vecMin = _mm_min_epu32(vecTmp, *vecMax);
    *vecMax = _mm_max_epu32(vecTmp, *vecMax);
    vecTmp = _mm_alignr_epi8(*vecMin, *vecMin, 4);
    *vecMin = _mm_min_epu32(vecTmp, *vecMax);
    *vecMax = _mm_max_epu32(vecTmp, *vecMax);
    vecTmp = _mm_alignr_epi8(*vecMin, *vecMin, 4);
    *vecMin = _mm_min_epu32(vecTmp, *vecMax);
    *vecMax = _mm_max_epu32(vecTmp, *vecMax);
    *vecMin = _mm_alignr_epi8(*vecMin, *vecMin, 4);
}
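A quick usage sketch, assuming sse_merge above is in the same translation unit and the file is compiled with -msse4.1: merging two sorted 4-lane vectors leaves the four smallest values in vecMin and the four largest in vecMax.

#include <smmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    __m128i a = _mm_setr_epi32(1, 4, 6, 9); /* sorted run 1 */
    __m128i b = _mm_setr_epi32(2, 3, 7, 8); /* sorted run 2 */
    __m128i lo, hi;
    sse_merge(&a, &b, &lo, &hi);
    uint32_t out[8];
    _mm_storeu_si128((__m128i *)out, lo);
    _mm_storeu_si128((__m128i *)(out + 4), hi);
    for (int i = 0; i < 8; i++)
        printf("%u ", out[i]); /* prints: 1 2 3 4 6 7 8 9 */
    printf("\n");
    return 0;
}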
Example No. 4
uint32_t simdmin(const uint32_t * in) {
    const __m128i* pin = (const __m128i*)(in);
    __m128i accumulator = _mm_loadu_si128(pin);
    uint32_t k = 1;
    for (; 4 * k < SIMDBlockSize; ++k) {
        __m128i newvec = _mm_loadu_si128(pin + k);
        accumulator = _mm_min_epu32(accumulator, newvec);
    }
    return minasint(accumulator);
}
Example No. 5
void
_mesa_uint_array_min_max(const unsigned *ui_indices, unsigned *min_index,
                         unsigned *max_index, const unsigned count)
{
   unsigned max_ui = 0;
   unsigned min_ui = ~0U;
   unsigned i = 0;
   unsigned aligned_count = count;

   /* handle the first few values without SSE until the pointer is aligned */
   while (((uintptr_t)ui_indices & 15) && aligned_count) {
      if (*ui_indices > max_ui)
         max_ui = *ui_indices;
      if (*ui_indices < min_ui)
         min_ui = *ui_indices;

      aligned_count--;
      ui_indices++;
   }

   /* TODO: The actual threshold at which SSE becomes useful may be higher
    * than 8. Careful microbenchmarking and measurement are required to
    * find the actual tipping point.
    */
   if (aligned_count >= 8) {
      unsigned max_arr[4] __attribute__ ((aligned (16)));
      unsigned min_arr[4] __attribute__ ((aligned (16)));
      unsigned vec_count;
      __m128i max_ui4 = _mm_setzero_si128();
      __m128i min_ui4 = _mm_set1_epi32(~0U);
      __m128i ui_indices4;
      __m128i *ui_indices_ptr;

      vec_count = aligned_count & ~0x3;
      ui_indices_ptr = (__m128i *)ui_indices;
      for (i = 0; i < vec_count / 4; i++) {
         ui_indices4 = _mm_load_si128(&ui_indices_ptr[i]);
         max_ui4 = _mm_max_epu32(ui_indices4, max_ui4);
         min_ui4 = _mm_min_epu32(ui_indices4, min_ui4);
      }

      _mm_store_si128((__m128i *)max_arr, max_ui4);
      _mm_store_si128((__m128i *)min_arr, min_ui4);

      for (i = 0; i < 4; i++) {
         if (max_arr[i] > max_ui)
            max_ui = max_arr[i];
         if (min_arr[i] < min_ui)
            min_ui = min_arr[i];
      }
      i = vec_count;
   }

   /* handle any values remaining after the vectorized loop */
   for (; i < aligned_count; i++) {
      if (ui_indices[i] > max_ui)
         max_ui = ui_indices[i];
      if (ui_indices[i] < min_ui)
         min_ui = ui_indices[i];
   }

   *min_index = min_ui;
   *max_index = max_ui;
}
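A minimal usage sketch for the routine above (hypothetical data; the function itself requires an SSE4.1 build):

#include <stdio.h>

int main(void) {
    unsigned idx[100], lo, hi;
    for (unsigned j = 0; j < 100; j++)
        idx[j] = j + 5; /* values 5..104 */
    _mesa_uint_array_min_max(idx, &lo, &hi, 100);
    printf("min=%u max=%u\n", lo, hi); /* expect min=5 max=104 */
    return 0;
}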
Example No. 6
void simdmaxmin(const uint32_t * in, uint32_t * getmin, uint32_t * getmax) {
    const __m128i* pin = (const __m128i*)(in);
    __m128i minaccumulator = _mm_loadu_si128(pin);
    __m128i maxaccumulator = minaccumulator;
    uint32_t k = 1;
    for (; 4 * k < SIMDBlockSize; ++k) {
        __m128i newvec = _mm_loadu_si128(pin + k);
        minaccumulator = _mm_min_epu32(minaccumulator, newvec);
        maxaccumulator = _mm_max_epu32(maxaccumulator, newvec);
    }
    *getmin = minasint(minaccumulator);
    *getmax = maxasint(maxaccumulator);
}
Example No. 7
uint32_t simdmin_length(const uint32_t * in, uint32_t length) {
	uint32_t currentmin = 0xFFFFFFFF;
	uint32_t lengthdividedby4 = length / 4;
	uint32_t offset = lengthdividedby4 * 4;
	uint32_t k;
	if (lengthdividedby4 > 0) {
		const __m128i* pin = (const __m128i*)(in);
		__m128i accumulator = _mm_loadu_si128(pin);
		k = 1; /* vector 0 already loaded into the accumulator */
		for (; k < lengthdividedby4; ++k) {
			__m128i newvec = _mm_loadu_si128(pin+k);
			accumulator = _mm_min_epu32(accumulator,newvec);
		}
		currentmin = minasint(accumulator);
	}
	for (k = offset; k < length; ++k)
		if (in[k] < currentmin)
			currentmin = in[k];
	return currentmin;
}
Example No. 8
ConnectedComponent::ConnectedComponent(
		std::array<char, 8> value,
		boost::shared_ptr<pixel_list_type> pixelList,
		pixel_list_type::const_iterator begin,
		pixel_list_type::const_iterator end) :

	_pixels(pixelList),
	_value(value),
	_boundingBox(0, 0, 0, 0),
	_center(0, 0),
	_centerDirty(true),
	_pixelRange(begin, end),
	_bitmapDirty(true) {

#ifdef __SSE4_1__

	// if there is at least one pixel
	if (begin != end) {

		unsigned int*__restrict__ pixels    = (unsigned int*)&*begin;
		unsigned int*__restrict__ pixelsEnd = (unsigned int*)&*end;

		// Prepare aligned, packed integer values.
		typedef union {
			__m128i v;
			unsigned int a[4];
		} xmm_uints;

		enum {X1, Y1, X2, Y2};

		__attribute__((aligned(16))) xmm_uints mins1;
		__attribute__((aligned(16))) xmm_uints maxs1;

		mins1.a[X1] = begin->x();
		maxs1.a[X1] = begin->x();
		mins1.a[Y1] = begin->y();
		maxs1.a[Y1] = begin->y();

		// Iterate through pixelList until 16-byte alignment is reached.
		while (((std::uintptr_t) pixels % 16) != 0 && pixels < pixelsEnd) {

			unsigned int x = pixels[X1];
			unsigned int y = pixels[Y1];

			mins1.a[X1] = std::min(mins1.a[X1], x);
			mins1.a[Y1] = std::min(mins1.a[Y1], y);
			maxs1.a[X1] = std::max(maxs1.a[X1], x);
			maxs1.a[Y1] = std::max(maxs1.a[Y1], y);

			pixels += 2;
		}

		// Guaranteed to have at least 8 XMM registers, so use 4 for cumulative
		// values and 2 for vector values. (Using 8+4 of 16 registers on 64-bit
		// arch yields no performance improvement.)
		mins1.a[X2] = mins1.a[X1];
		mins1.a[Y2] = mins1.a[Y1];
		maxs1.a[X2] = maxs1.a[X1];
		maxs1.a[Y2] = maxs1.a[Y1];
		__m128i mins2 = mins1.v;
		__m128i maxs2 = maxs1.v;

		// Vectorized loop. Strides two packed integer vectors, each containing
		// both X and Y for two pixels.
		while (pixels < pixelsEnd - 8) {

			__m128i pixelPair1 = _mm_load_si128((__m128i*)pixels);
			__m128i pixelPair2 = _mm_load_si128((__m128i*)(pixels + 4));
			pixels += 8; // Hint compiler to iterate while loads stall.
			_mm_prefetch((const char *)pixels, _MM_HINT_T0);
			mins1.v = _mm_min_epu32(mins1.v, pixelPair1);
			maxs1.v = _mm_max_epu32(maxs1.v, pixelPair1);
			mins2   = _mm_min_epu32(mins2,   pixelPair2);
			maxs2   = _mm_max_epu32(maxs2,   pixelPair2);
		}

		// Combine stride results.
		mins1.v = _mm_min_epu32(mins1.v, mins2);
		maxs1.v = _mm_max_epu32(maxs1.v, maxs2);

		// Iterate through any remaining pixels.
		while (pixels < pixelsEnd) {

			unsigned int x = pixels[X1];
			unsigned int y = pixels[Y1];

			mins1.a[X1] = std::min(mins1.a[X1], x);
			mins1.a[Y1] = std::min(mins1.a[Y1], y);
			maxs1.a[X1] = std::max(maxs1.a[X1], x);
			maxs1.a[Y1] = std::max(maxs1.a[Y1], y);

			pixels += 2;
		}

		// Readout packed vectors, compare with remaining results, and store.
		_boundingBox.min().x() = (int)std::min(mins1.a[X1], mins1.a[X2]);
		_boundingBox.min().y() = (int)std::min(mins1.a[Y1], mins1.a[Y2]);
		_boundingBox.max().x() = (int)std::max(maxs1.a[X1], maxs1.a[X2]) + 1;
		_boundingBox.max().y() = (int)std::max(maxs1.a[Y1], maxs1.a[Y2]) + 1;
	}

#endif // __SSE4_1__
}
Example No. 9
/* merge the "sa + sb" elements of two sorted arrays and return the sorted result in the "dest" array
   TODO(d'b): replace magic numbers with macro */
inline void bitonic_merge_kernel16n(float *dest, float *a, uint32_t sa, float *b /* must not be reversed */, uint32_t sb)
{
	__m128 ma[4];
	__m128 mb[4];
	__m128 lo[4];
	__m128 hi[4];

#define LOAD16(arg) \
	mb[3] = _mm_load_ps(arg); \
	mb[2] = _mm_load_ps(arg + 4); \
	mb[1] = _mm_load_ps(arg + 8); \
	mb[0] = _mm_load_ps(arg + 12); arg+=16

	float *last_a = a + sa;
	float *last_b = b + sb;
	float *last_dest = dest + sa + sb;

	ma[0] = _mm_load_ps(a); a+=4;
	ma[1] = _mm_load_ps(a); a+=4;
	ma[2] = _mm_load_ps(a); a+=4;
	ma[3] = _mm_load_ps(a); a+=4;

	for(; dest < (last_dest - 16); dest += 16)
	{
		/* Load either a or b */
		if(a < last_a)
		{
			if(b < last_b)
			{
				if(*((uint32_t*)a) < *((uint32_t*)b))
				{
					LOAD16(a);
				} else
				{
					LOAD16(b);
				}
			} else
			{
				LOAD16(a);
			}
		} else
		{
			LOAD16(b);
		}

		/* Reverse *b */
		mb[0] = _mm_shuffle_ps(mb[0], mb[0], 0x1b);
		mb[1] = _mm_shuffle_ps(mb[1], mb[1], 0x1b);
		mb[2] = _mm_shuffle_ps(mb[2], mb[2], 0x1b);
		mb[3] = _mm_shuffle_ps(mb[3], mb[3], 0x1b);

		lo[0] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(ma[0]), _mm_castps_si128(mb[0])));
		hi[0] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(ma[0]), _mm_castps_si128(mb[0])));
		lo[1] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(ma[1]), _mm_castps_si128(mb[1])));
		hi[1] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(ma[1]), _mm_castps_si128(mb[1])));
		lo[2] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(ma[2]), _mm_castps_si128(mb[2])));
		hi[2] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(ma[2]), _mm_castps_si128(mb[2])));
		lo[3] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(ma[3]), _mm_castps_si128(mb[3])));
		hi[3] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(ma[3]), _mm_castps_si128(mb[3])));

		_mm_store_ps(&dest[0], lo[0]);
		_mm_store_ps(&dest[4], lo[1]);
		_mm_store_ps(&dest[8], lo[2]);
		_mm_store_ps(&dest[12], lo[3]);
		_mm_store_ps(&dest[16], hi[2]);
		_mm_store_ps(&dest[20], hi[3]);
		_mm_store_ps(&dest[24], hi[0]);
		_mm_store_ps(&dest[28], hi[1]);

		bitonic_merge_kernel8core(dest, dest + 8);
		bitonic_merge_kernel8core(dest + 16, dest + 24);

		ma[0] = _mm_load_ps(&dest[16]);
		ma[1] = _mm_load_ps(&dest[20]);
		ma[2] = _mm_load_ps(&dest[24]);
		ma[3] = _mm_load_ps(&dest[28]);
	}
}
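Note why this kernel can compare floats with _mm_min_epu32/_mm_max_epu32 at all: for non-negative IEEE-754 floats, the unsigned integer order of the bit patterns matches the floating-point order, so integer min/max (and the *((uint32_t*)a) < *((uint32_t*)b) test above) stay correct as long as no negative values or NaNs appear. A small check of that property:

#include <smmintrin.h>
#include <stdio.h>

int main(void) {
    __m128 x = _mm_setr_ps(0.5f, 2.0f, 3.5f, 8.0f);
    __m128 y = _mm_setr_ps(1.0f, 1.5f, 4.0f, 7.0f);
    /* lane-wise minimum computed on the raw bit patterns */
    __m128 lo = _mm_castsi128_ps(
        _mm_min_epu32(_mm_castps_si128(x), _mm_castps_si128(y)));
    float out[4];
    _mm_storeu_ps(out, lo);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 0.5 1.5 3.5 7 */
    return 0;
}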
Example No. 10
/* merge 2 sorted arrays (8 elements each) to 1 sorted array
   return result (16 elements) in the same arrays
   TODO(d'b): replace magic numbers with macro */
inline void bitonic_merge_kernel8core(float *a, float *b /* must be reversed*/)
{
	__m128	map[2];
	__m128	mbp[2];
	__m128	lo[2];
	__m128	hi[2];

	map[0] = _mm_load_ps(a);
	mbp[0] = _mm_load_ps(b);

	map[1] = _mm_load_ps(a + 4);
	mbp[1] = _mm_load_ps(b + 4);

	lo[0] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(map[0]), _mm_castps_si128(mbp[0])));
	hi[0] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(map[0]), _mm_castps_si128(mbp[0])));

	lo[1] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(map[1]), _mm_castps_si128(mbp[1])));
	hi[1] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(map[1]), _mm_castps_si128(mbp[1])));

	map[0] = lo[0];
	map[1] = lo[1];
	mbp[0] = hi[0];
	mbp[1] = hi[1];

	/* L1 processing */
	lo[0] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(map[0]), _mm_castps_si128(map[1])));
	lo[1] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(map[0]), _mm_castps_si128(map[1])));
	hi[0] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(mbp[0]), _mm_castps_si128(mbp[1])));
	hi[1] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(mbp[0]), _mm_castps_si128(mbp[1])));

	map[0] = _mm_shuffle_ps(lo[0], lo[1], 0xe4);
	map[1] = _mm_shuffle_ps(lo[0], lo[1], 0x4e);
	mbp[0] = _mm_shuffle_ps(hi[0], hi[1], 0xe4);
	mbp[1] = _mm_shuffle_ps(hi[0], hi[1], 0x4e);

	/* L2 processing */
	lo[0] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(map[0]), _mm_castps_si128(map[1])));
	lo[1] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(map[0]), _mm_castps_si128(map[1])));
	hi[0] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(mbp[0]), _mm_castps_si128(mbp[1])));
	hi[1] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(mbp[0]), _mm_castps_si128(mbp[1])));

	map[0] = _mm_shuffle_ps(lo[0], lo[1], 0xd8);
	map[1] = _mm_shuffle_ps(lo[0], lo[1], 0x8d);
	mbp[0] = _mm_shuffle_ps(hi[0], hi[1], 0xd8);
	mbp[1] = _mm_shuffle_ps(hi[0], hi[1], 0x8d);

	/* L3 processing */
	lo[0] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(map[0]), _mm_castps_si128(map[1])));
	lo[1] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(map[0]), _mm_castps_si128(map[1])));
	hi[0] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(mbp[0]), _mm_castps_si128(mbp[1])));
	hi[1] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(mbp[0]), _mm_castps_si128(mbp[1])));

	map[0] = _mm_shuffle_ps(lo[1], lo[0], 0x88);
	map[1] = _mm_shuffle_ps(lo[1], lo[0], 0xdd);
	mbp[0] = _mm_shuffle_ps(hi[1], hi[0], 0x88);
	mbp[1] = _mm_shuffle_ps(hi[1], hi[0], 0xdd);

	map[0] = _mm_shuffle_ps(map[0], map[0], 0x72);
	map[1] = _mm_shuffle_ps(map[1], map[1], 0x72);
	mbp[0] = _mm_shuffle_ps(mbp[0], mbp[0], 0x72);
	mbp[1] = _mm_shuffle_ps(mbp[1], mbp[1], 0x72);

	_mm_store_ps(&a[0], map[0]);
	_mm_store_ps(&a[4], map[1]);
	_mm_store_ps(&b[0], mbp[0]);
	_mm_store_ps(&b[4], mbp[1]);

	CHECK_RAWS(a, b, 8);
}
Example No. 11
/* elements are given in 2 arrays (4 and 4),
   result will be returned in the same arrays with a straight order */
inline void bitonic_sort_kernel4(float *a, float *b)
{
	__m128	ma;
	__m128	mb;
	__m128	map;
	__m128	mbp;
	__m128	lo;
	__m128	hi;

	/* load 8 elements to sse registers */
	ma = _mm_load_ps(a);
	mb = _mm_load_ps(b);

	/* In-Register sort */
	map = _mm_shuffle_ps(ma, mb, _MM_SHUFFLE(2, 0, 2, 0)); /* 0x88: */
	mbp = _mm_shuffle_ps(ma, mb, _MM_SHUFFLE(3, 1, 3, 1)); /* 0xdd: */

	lo = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp)));
	hi = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp)));

	map = _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 2, 0)); /* 0xd8: */
	mbp = _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 3, 1)); /* 0x8d: */

	lo = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp)));
	hi = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp)));

	map = _mm_shuffle_ps(lo, lo, _MM_SHUFFLE(3, 1, 2, 0)); /* 0xd8: */
	mbp = _mm_shuffle_ps(hi, hi, _MM_SHUFFLE(1, 3, 0, 2)); /* 0x72: */

	lo = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp)));
	hi = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp)));

	map = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(1, 0, 0, 1)); /* 0x41: */
	mbp = _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 2, 2, 3)); /* 0xeb: */

	lo = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp)));
	hi = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp)));

	map = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(3, 2, 1, 0)); /* 0xe4: */
	mbp = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(1, 0, 3, 2)); /* 0x4e: */

	lo = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp)));
	hi = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp)));

	map = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(3, 1, 2, 0)); /* 0xd8: */
	mbp = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(2, 0, 3, 1)); /* 0x8d: */

	lo = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp)));
	hi = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp)));

	map = _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0)); /* 0x88: */
	mbp = _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1)); /* 0xdd: */

	map = _mm_shuffle_ps(map, map, _MM_SHUFFLE(1, 3, 0, 2)); /* 0x72: */
	mbp = _mm_shuffle_ps(mbp, mbp, _MM_SHUFFLE(1, 3, 0, 2)); /* 0x72: */

	/* unload sorted elements to memory */
	_mm_store_ps(a, map);
	_mm_store_ps(b, mbp);

	CHECK_RAWS(a, b, 4);
}
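A usage sketch for the kernel above, assuming it is compiled in the same unit with -msse4.1 and that CHECK_RAWS is defined (e.g., as a no-op in release builds). Per the kernel's stated contract, the eight non-negative values come back in ascending order across a then b:

#include <stdio.h>

int main(void) {
    __attribute__((aligned(16))) float a[4] = {3.0f, 1.0f, 7.0f, 5.0f};
    __attribute__((aligned(16))) float b[4] = {4.0f, 8.0f, 2.0f, 6.0f};
    bitonic_sort_kernel4(a, b);
    for (int i = 0; i < 4; i++) printf("%g ", a[i]);
    for (int i = 0; i < 4; i++) printf("%g ", b[i]);
    printf("\n"); /* expected, per the kernel's contract: 1 2 3 4 5 6 7 8 */
    return 0;
}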
Example No. 12
static uint32_t minasint(const __m128i accumulator) {
	/* (A,B,C,D) min (C,D,0,0): lanes 0 and 1 now hold min(A,C) and min(B,D) */
	const __m128i _tmp1 = _mm_min_epu32(_mm_srli_si128(accumulator, 8), accumulator);
	/* lane 0 now holds min(A,B,C,D) */
	const __m128i _tmp2 = _mm_min_epu32(_mm_srli_si128(_tmp1, 4), _tmp1);
	return _mm_cvtsi128_si32(_tmp2);
}
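Examples No. 1 and No. 6 also call maxasint, which this page does not show. A minimal counterpart sketch, assuming it simply mirrors minasint with _mm_max_epu32 (the zero lanes shifted in by _mm_srli_si128 are harmless under unsigned max):

static uint32_t maxasint(const __m128i accumulator) {
	/* lanes 0 and 1 now hold max(A,C) and max(B,D) */
	const __m128i _tmp1 = _mm_max_epu32(_mm_srli_si128(accumulator, 8), accumulator);
	/* lane 0 now holds max(A,B,C,D) */
	const __m128i _tmp2 = _mm_max_epu32(_mm_srli_si128(_tmp1, 4), _tmp1);
	return _mm_cvtsi128_si32(_tmp2);
}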
Example No. 13
__m128i test_mm_min_epu32(__m128i x, __m128i y) {
  // CHECK-LABEL: test_mm_min_epu32
  // CHECK: call <4 x i32> @llvm.x86.sse41.pminud
  // CHECK-ASM: pminud %xmm{{.*}}, %xmm{{.*}}
  return _mm_min_epu32(x, y);
}
Example No. 14
__m128i test_mm_min_epu32(__m128i x, __m128i y) {
  // CHECK-LABEL: test_mm_min_epu32
  // CHECK:       [[CMP:%.*]] = icmp ult <4 x i32> [[X:%.*]], [[Y:%.*]]
  // CHECK-NEXT:  select <4 x i1> [[CMP]], <4 x i32> [[X]], <4 x i32> [[Y]]
  return _mm_min_epu32(x, y);
}