Example #1
int _normoptimized32(const unsigned char* a, const unsigned char* b)
{
#ifdef USE_PENTIUM4
	return _normoptimized(a,b,32);
#else
	unsigned long int _dis0a, _dis0b, _dis1a, _dis1b;
	long int _cnt0a, _cnt0b, _cnt1a, _cnt1b;
	// load both 32-byte operands, XOR, and popcount the four 64-bit halves
	__m128i a0 = _mm_loadu_si128((const __m128i*)(a));
	__m128i a1 = _mm_loadu_si128((const __m128i*)(a+16));
	__m128i b0 = _mm_loadu_si128((const __m128i*)(b));
	__m128i b1 = _mm_loadu_si128((const __m128i*)(b+16));
	b0 = _mm_xor_si128(a0, b0);
	b1 = _mm_xor_si128(a1, b1);
	a0 = _mm_srli_si128(b0,8);
	a1 = _mm_srli_si128(b1,8);
	_dis0a = _mm_cvtsi128_si64(b0);
	_dis0b = _mm_cvtsi128_si64(a0);
	_dis1a = _mm_cvtsi128_si64(b1);
	_dis1b = _mm_cvtsi128_si64(a1);
	_cnt0a = _mm_popcnt_u64(_dis0a);
	_cnt0b = _mm_popcnt_u64(_dis0b);
	_cnt1a = _mm_popcnt_u64(_dis1a);
	_cnt1b = _mm_popcnt_u64(_dis1b);
	return _cnt0a + _cnt0b + _cnt1a + _cnt1b;
#endif
}
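For sanity-checking the unrolled SSE path above, a scalar equivalent is useful. This is a minimal sketch, assuming only <nmmintrin.h> and 32-byte inputs; the helper name norm32_scalar_reference is hypothetical:
#include <cstdint>
#include <cstring>
#include <nmmintrin.h>

// Hypothetical scalar cross-check: XOR the 32-byte blocks word by word and
// accumulate 64-bit popcounts; the result should match _normoptimized32(a, b).
int norm32_scalar_reference(const unsigned char* a, const unsigned char* b)
{
	uint64_t wa[4], wb[4];
	std::memcpy(wa, a, 32);
	std::memcpy(wb, b, 32);
	int total = 0;
	for (int i = 0; i < 4; ++i)
		total += (int)_mm_popcnt_u64(wa[i] ^ wb[i]);
	return total;
}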
Example #2
int hamming_distance(uint64_t* x, uint64_t* y, int64_t k) {
  int64_t niter = (k + 63) / 64;  // ceil(k / 64); both arrays must hold this many words
  int accum = 0;
  for (int64_t i = 0; i < niter; ++i) {
    accum += _mm_popcnt_u64(x[i] ^ y[i]);
  }
  return accum;
}
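A hedged usage sketch: with 256-bit descriptors, k = 256 gives niter = 4, so exactly four words per descriptor are compared (the values below are arbitrary):
// Hypothetical caller with two 256-bit descriptors packed as uint64_t[4].
uint64_t desc_x[4] = { 0x0123456789abcdefULL, 0, ~0ULL, 0 };
uint64_t desc_y[4] = { 0xfedcba9876543210ULL, 0, 0, 0 };
int d = hamming_distance(desc_x, desc_y, 256);  // == 128 for these values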
Example #3
void HammingBruteForce::match_all(const int64_t *a, int n_a, const int64_t *b, int n_b) {
    std::fill(index_ab, index_ab + n_a, -1);
    std::fill(distance_ab, distance_ab + n_a, threshold);
    for (int i = 0; i < n_a; i++) {
        for (int j = 0; j < n_b; j++) {
            int d_ab0 = _mm_popcnt_u64(a[i*4 + 0] ^ b[j*4 + 0]);
            int d_ab1 = _mm_popcnt_u64(a[i*4 + 1] ^ b[j*4 + 1]);
            int d_ab2 = _mm_popcnt_u64(a[i*4 + 2] ^ b[j*4 + 2]);
            int d_ab3 = _mm_popcnt_u64(a[i*4 + 3] ^ b[j*4 + 3]);
            int d_ab = d_ab0 + d_ab1 + d_ab2 + d_ab3;

            // Branchless argmin update: keep the previous best unless this
            // candidate is strictly closer.
            int is_closer = d_ab < distance_ab[i];
            index_ab[i] = is_closer ? j : index_ab[i];
            distance_ab[i] = is_closer ? d_ab : distance_ab[i];
        }
    }
}
Example #4
int _normoptimized(const unsigned char* a, const unsigned char* b, const int n)
{
#ifdef USE_PENTIUM4
	int _ddt = 0;
	unsigned int _dis, _dis2, _dis3, _dis4;
	long int _cnt, _cnt2, _cnt3, _cnt4;
	for (int _i = 0; _i < n; _i+=16){
		// xor 128 bits
		__m128i a0 = _mm_loadu_si128((const __m128i*)(a + _i));
		__m128i b0 = _mm_loadu_si128((const __m128i*)(b + _i));
		b0 = _mm_xor_si128(a0, b0);
		__m128i d = _mm_srli_si128(b0, 4);
		__m128i e = _mm_srli_si128(d, 4);
		__m128i f = _mm_srli_si128(e, 4);
		_dis  = _mm_cvtsi128_si32(b0);
		_dis2 = _mm_cvtsi128_si32(d);
		_dis3 = _mm_cvtsi128_si32(e);
		_dis4 = _mm_cvtsi128_si32(f);
		// now count the set bits in each 32-bit lane
		_cnt  = _mm_popcnt_u32(_dis);
		_cnt2 = _mm_popcnt_u32(_dis2);
		_cnt3 = _mm_popcnt_u32(_dis3);
		_cnt4 = _mm_popcnt_u32(_dis4);
		_ddt += _cnt + _cnt2 + _cnt3 + _cnt4;
	}
	return _ddt;
#else	
	int _ddt = 0;
	unsigned long int _dis, _dis2;
	long int _cnt, _cnt2;
	for (int _i = 0; _i < n; _i+=16){
		// xor 128 bits
		__m128i a0 = _mm_loadu_si128((const __m128i*)(a + _i));
		__m128i b0 = _mm_loadu_si128((const __m128i*)(b + _i));
		b0 = _mm_xor_si128(a0, b0);
		a0 = _mm_srli_si128(b0,8);
		_dis = _mm_cvtsi128_si64(b0);
		_dis2 = _mm_cvtsi128_si64(a0);
		_cnt = _mm_popcnt_u64(_dis);
		_cnt2 = _mm_popcnt_u64(_dis2);
		_ddt += _cnt + _cnt2;		// other commands don't give any advantage
	}
	return _ddt;
#endif
}
Example #5
size_t
BitKNN::popcount (std::vector<uint64_t> &x_vec)
{
  size_t c = 0;
  for (int i = 0; i < (int)x_vec.size(); i++) {
    c += _mm_popcnt_u64( x_vec[ i ] );
  }
  return c;
}
Example #6
int _normoptimized16(const unsigned char* a, const unsigned char* b)
{
#ifdef USE_PENTIUM4
	return _normoptimized(a,b,16);
#else
	unsigned long int _dis, _dis2;
	long int _cnt, _cnt2;
	__m128i a0 = _mm_loadu_si128((const __m128i*)(a));
	__m128i b0 = _mm_loadu_si128((const __m128i*)(b));
	__m128i c = _mm_xor_si128(a0, b0);
	__m128i d = _mm_srli_si128(c,8);
	_dis = _mm_cvtsi128_si64(c);
	_dis2 = _mm_cvtsi128_si64(d);
	_cnt = _mm_popcnt_u64(_dis);
	_cnt2 = _mm_popcnt_u64(_dis2);
	int _ddt = _cnt + _cnt2;		// other commands don't give any advantage
	return _ddt;
#endif
}
Example #7
void addKnight(solution *sol, int attackSquare, int* coverage) {
	knight* knightPrint = placements[attackSquare];
	if (knightPrint == NULL)
	{
		// Lazily build and cache this square's bit-board footprint.
		knightPrint = (knight*)calloc(1, sizeof(knight));
		int y = knightsYX[attackSquare][0];
		int x = knightsYX[attackSquare][1];
		knightPrint->placement[y] |= _rotr64(UNIT, x);
		knightPrint->coverage[y] |= _rotr64(UNIT, x);
		if ((y >= STEP_X) && (x >= STEP_Y))
		{
			knightPrint->coverage[y - STEP_X] |= _rotr64(UNIT, (x - STEP_Y));
		}
		if (y >= STEP_X) {
			knightPrint->coverage[y - STEP_X] |= _rotr64(UNIT, (x + STEP_Y));
		}
		if (x >= STEP_Y) {
			knightPrint->coverage[y + STEP_X] |= _rotr64(UNIT, (x - STEP_Y));
		}
		knightPrint->coverage[y + STEP_X] |= _rotr64(UNIT, (x + STEP_Y));

		if ((y >= STEP_Y) && (x >= STEP_X))
		{
			knightPrint->coverage[y - STEP_Y] |= _rotr64(UNIT, (x - STEP_X));
		}
		if (y >= STEP_Y)
		{
			knightPrint->coverage[y - STEP_Y] |= _rotr64(UNIT, (x + STEP_X));
		}
		if (x >= STEP_X)
		{
			knightPrint->coverage[y + STEP_Y] |= _rotr64(UNIT, (x - STEP_X));
		}
		knightPrint->coverage[y + STEP_Y] |= _rotr64(UNIT, (x + STEP_X));

		placements[attackSquare] = knightPrint;
	}
	// OR the knight's footprint into the solution and recount coverage.
	long long* solPtr = (long long*)sol;
	long long* knightPtr = (long long*)knightPrint;
	long long cover = 0;
	for (int i = sizeof(solution) / sizeof(long long); i > 0; i--, solPtr++, knightPtr++)
	{
		*solPtr |= *knightPtr;
		if (i <= 64)	// only the final 64 words count toward coverage
		{
			cover += _mm_popcnt_u64(*solPtr);
		}
	}
	*coverage = (int)cover;
}
Example #8
size_t
BitKNN::inner_prod (std::vector<uint64_t> &x_vec1, std::vector<uint64_t> &x_vec2)
{
  /*
  if (x_vec1.size() != x_vec2.size()) {
    std::cerr << "ERROR: The sizes of the two vectors are inconsistent." << std::endl;
    exit(1);
  }
  */

  size_t c = 0;
  for (int i = 0; i < (int)x_vec1.size(); i++) {
    uint64_t x = x_vec1[ i ] & x_vec2[ i ];
    c += _mm_popcnt_u64( x );
  }
  return c;
}
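Because AND-then-popcount counts bits set in both words, inner_prod of two packed 0/1 vectors is the size of their intersection. A small sketch with made-up values (the knn instance is hypothetical):
std::vector<uint64_t> x1 = { 0b1011, 0b0001 };
std::vector<uint64_t> x2 = { 0b0011, 0b0011 };
// knn.inner_prod(x1, x2) == 3: two shared bits in word 0, one in word 1.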
Example #9
int _normoptimized64(const unsigned char*a, const unsigned char*b){
#ifdef USE_PENTIUM4
	return _normoptimized(a,b,64);
#else	
	unsigned long int _dis0a, _dis0b, _dis1a, _dis1b, _dis2a, _dis2b, _dis3a, _dis3b;
	long int _cnt0a, _cnt0b, _cnt1a, _cnt1b, _cnt2a, _cnt2b, _cnt3a, _cnt3b;
	// load all 64 bytes of each operand, XOR, and popcount the eight 64-bit halves
	__m128i a0 = _mm_loadu_si128((const __m128i*)(a));
	__m128i a1 = _mm_loadu_si128((const __m128i*)(a+16));
	__m128i a2 = _mm_loadu_si128((const __m128i*)(a+32));
	__m128i a3 = _mm_loadu_si128((const __m128i*)(a+48));
	__m128i b0 = _mm_loadu_si128((const __m128i*)(b));
	__m128i b1 = _mm_loadu_si128((const __m128i*)(b+16));
	__m128i b2 = _mm_loadu_si128((const __m128i*)(b+32));
	__m128i b3 = _mm_loadu_si128((const __m128i*)(b+48));
	b0 = _mm_xor_si128(a0, b0);
	b1 = _mm_xor_si128(a1, b1);
	b2 = _mm_xor_si128(a2, b2);
	b3 = _mm_xor_si128(a3, b3);
	a0 = _mm_srli_si128(b0,8);
	a1 = _mm_srli_si128(b1,8);
	a2 = _mm_srli_si128(b2,8);
	a3 = _mm_srli_si128(b3,8);
	_dis0a = _mm_cvtsi128_si64(b0);
	_dis0b = _mm_cvtsi128_si64(a0);
	_dis1a = _mm_cvtsi128_si64(b1);
	_dis1b = _mm_cvtsi128_si64(a1);
	_dis2a = _mm_cvtsi128_si64(b2);
	_dis2b = _mm_cvtsi128_si64(a2);
	_dis3a = _mm_cvtsi128_si64(b3);
	_dis3b = _mm_cvtsi128_si64(a3);
	_cnt0a = _mm_popcnt_u64(_dis0a);
	_cnt0b = _mm_popcnt_u64(_dis0b);
	_cnt1a = _mm_popcnt_u64(_dis1a);
	_cnt1b = _mm_popcnt_u64(_dis1b);
	_cnt2a = _mm_popcnt_u64(_dis2a);
	_cnt2b = _mm_popcnt_u64(_dis2b);
	_cnt3a = _mm_popcnt_u64(_dis3a);
	_cnt3b = _mm_popcnt_u64(_dis3b);
	return _cnt0a + _cnt0b + _cnt1a + _cnt1b + _cnt2a + _cnt2b + _cnt3a + _cnt3b;
#endif
}
Example #10
uint64_t avx512f_harley_seal(const uint64_t * data, size_t size) {
  const unsigned int wordspervector = sizeof(__m512i) / sizeof(uint64_t);
  const unsigned int minvit = 16 * wordspervector;
  uint64_t total;
  size_t i;

  if (size >= minvit) {
    total = popcnt_harley_seal((const __m512i*) data, size / wordspervector);
    i = size - size % wordspervector;
  } else {
    total = 0;
    i = 0;
  }

  for (/**/; i < size; i++) {
    total += _mm_popcnt_u64(data[i]);
  }
  return total;
}
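popcnt_harley_seal itself is not shown above; its core is a carry-save adder (CSA) step that defers popcounts across many words. A scalar sketch of that step, for orientation (the csa name is hypothetical):
#include <cstdint>

// Carry-save adder: compresses three words into (sum, carry) so that
// popcount(a) + popcount(b) + popcount(c) == popcount(sum) + 2 * popcount(carry).
static inline void csa(uint64_t a, uint64_t b, uint64_t c,
                       uint64_t* sum, uint64_t* carry)
{
	uint64_t u = a ^ b;
	*sum   = u ^ c;
	*carry = (a & b) | (u & c);
}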
Example #11
/*
==================
Process_Fragments
==================
*/
void Process_Fragments(

	raster_output_& raster_output,
	shader_input_& shader_input
) {

	const __m128 zero = set_all(0.0f);

	shader_input.tile_mask_16x16 = 0x0;
	shader_input.tile_mask_64x64 = 0x0;

	//===============================================================================================

	{
		const __int32 n_fragments = raster_output.n_fragments[raster_output_::TRIVIAL_ACCEPT_64x64];
		for (__int32 i_fragment = 0; i_fragment < n_fragments; i_fragment++) {

			raster_fragment_& raster_fragment = raster_output.raster_fragment[raster_output_::TRIVIAL_ACCEPT_64x64][i_fragment];

			const __int32 i_buffer = raster_fragment.buffer_mask_packed >> 16;
			const unsigned __int32 coverage_mask = raster_fragment.buffer_mask_packed & 0xffff;

			Process_Fragment_64x64(

				raster_fragment.w,
				i_buffer,
				coverage_mask,
				raster_output,
				shader_input
			);
		}
	}
	//===============================================================================================
	{
		const __int32 n_fragments = raster_output.n_fragments[raster_output_::TRIVIAL_ACCEPT_16x16];
		for (__int32 i_fragment = 0; i_fragment < n_fragments; i_fragment++) {

			raster_fragment_& raster_fragment = raster_output.raster_fragment[raster_output_::TRIVIAL_ACCEPT_16x16][i_fragment];

			const __int32 i_buffer = raster_fragment.buffer_mask_packed >> 16;
			const unsigned __int32 coverage_mask = raster_fragment.buffer_mask_packed & 0xffff;

			Process_Fragment_16x16(

				raster_fragment.w,
				0,
				i_buffer,
				coverage_mask,
				raster_output,
				shader_input
			);
		}
	}
	//===============================================================================================
	{

		const __int32 n_fragments = raster_output.n_fragments[raster_output_::TRIVIAL_ACCEPT_4x4];
		for (__int32 i_fragment = 0; i_fragment < n_fragments; i_fragment++) {

			raster_fragment_& raster_fragment = raster_output.raster_fragment[raster_output_::TRIVIAL_ACCEPT_4x4][i_fragment];
			const __int32 i_buffer = raster_fragment.buffer_mask_packed >> 16;
			const unsigned __int32 coverage_mask = raster_fragment.buffer_mask_packed & 0xffff;
			Process_Fragment_4x4(raster_fragment.w, 0, i_buffer, coverage_mask, raster_output, shader_input);
		}
	}
	//===============================================================================================
	{
		//const __int32 start = raster_output_::MAX_FRAGMENTS - 1;
		//const __int32 end = raster_output.n_fragments[raster_output_::PARTIAL_ACCEPT_4x4];
		//for (__int32 i_fragment = start; i_fragment > end; i_fragment--) {


		//	raster_fragment_& raster_fragment = raster_output.raster_fragment[raster_output_::PARTIAL_ACCEPT_4x4][i_fragment];
		//	const __int32 i_buffer = raster_fragment.buffer_mask_packed >> 16;
		//	const unsigned __int32 coverage_mask = raster_fragment.buffer_mask_packed & 0xffff;
		//	Process_Fragment_4x4(raster_fragment.w, 0, i_buffer, coverage_mask, raster_output, shader_input);
		//}
	}
	//===============================================================================================
	{
		const __int32 n_fragments = raster_output.n_fragments_COMPLETE;
		__int32 n_depth_fragments = 0;
		for (__int32 i_fragment = 0; i_fragment < n_fragments; i_fragment++) {

			raster_fragment_complete_& raster_fragment = raster_output.raster_fragment_complete[i_fragment];
			const __int32 i_buffer = raster_fragment.buffer_mask_packed >> 16;
			const unsigned __int32 coverage_mask = raster_fragment.buffer_mask_packed & 0xffff;

			pixel_shader(i_buffer, coverage_mask, raster_fragment.bazza, shader_input);

			const __int32 i_buffer_depth_4x4 = i_buffer / (4 * 4);
			const __int32 i_buffer_depth_16x16 = i_buffer / (16 * 16);
			const __int32 i_buffer_depth_64x64 = i_buffer / (64 * 64);
			shader_input.depth_tiles_4x4[i_buffer_depth_4x4] = shader_input.z_max;
			shader_input.tile_mask_16x16 |= one_bit_64 << i_buffer_depth_16x16;
			shader_input.tile_mask_64x64 |= one_bit_64 << i_buffer_depth_64x64;
		}
	}
	//===============================================================================================
	{
		//printf_s(" %llu ", shader_input.tile_mask_16x16);

		// popcount of the dirty-tile mask gives the number of set bits to visit
		__int64 n_tiles = _mm_popcnt_u64(shader_input.tile_mask_16x16);

		for (__int32 i_bit = 0; i_bit < n_tiles; i_bit++) {

			unsigned long i_tile_16x16;
			_BitScanForward64(&i_tile_16x16, shader_input.tile_mask_16x16);
			shader_input.tile_mask_16x16 ^= one_bit_64 << i_tile_16x16;

			const __int32 i_tile_4x4 = i_tile_16x16 * (4 * 4);

			__m128 depth_4x4[4];
			depth_4x4[0] = load_u(shader_input.depth_tiles_4x4 + i_tile_4x4 + (0 * 4));
			depth_4x4[1] = load_u(shader_input.depth_tiles_4x4 + i_tile_4x4 + (1 * 4));
			depth_4x4[2] = load_u(shader_input.depth_tiles_4x4 + i_tile_4x4 + (2 * 4));
			depth_4x4[3] = load_u(shader_input.depth_tiles_4x4 + i_tile_4x4 + (3 * 4));

			__m128 z_max;
			z_max = depth_4x4[0];
			z_max = min_vec(depth_4x4[1], z_max);
			z_max = min_vec(depth_4x4[2], z_max);
			z_max = min_vec(depth_4x4[3], z_max);

			__m128 z_out = z_max;
			z_max = rotate_left(z_max);
			z_out = min_vec(z_max, z_out);
			z_max = rotate_left(z_max);
			z_out = min_vec(z_max, z_out);
			z_max = rotate_left(z_max);
			z_out = min_vec(z_max, z_out);

			shader_input.depth_tiles_16x16[i_tile_16x16] = store_s(z_out);
		}
	}
	{
		__int64 n_tiles = _mm_popcnt_u64(shader_input.tile_mask_64x64);

		//printf_s(" %llu ", n_tiles);

		for (__int32 i_bit = 0; i_bit < n_tiles; i_bit++) {

			unsigned long i_tile_64x64;
			_BitScanForward64(&i_tile_64x64, shader_input.tile_mask_64x64);
			shader_input.tile_mask_64x64 ^= one_bit_64 << i_tile_64x64;

			const __int32 i_tile_16x16 = i_tile_64x64 * (4 * 4);

			__m128 depth_16x16[4];
			depth_16x16[0] = load_u(shader_input.depth_tiles_16x16 + i_tile_16x16 + (0 * 4));
			depth_16x16[1] = load_u(shader_input.depth_tiles_16x16 + i_tile_16x16 + (1 * 4));
			depth_16x16[2] = load_u(shader_input.depth_tiles_16x16 + i_tile_16x16 + (2 * 4));
			depth_16x16[3] = load_u(shader_input.depth_tiles_16x16 + i_tile_16x16 + (3 * 4));

			__m128 z_max;
			z_max = depth_16x16[0];
			z_max = min_vec(depth_16x16[1], z_max);
			z_max = min_vec(depth_16x16[2], z_max);
			z_max = min_vec(depth_16x16[3], z_max);

			__m128 z_out = z_max;
			z_max = rotate_left(z_max);
			z_out = min_vec(z_max, z_out);
			z_max = rotate_left(z_max);
			z_out = min_vec(z_max, z_out);
			z_max = rotate_left(z_max);
			z_out = min_vec(z_max, z_out);

			shader_input.depth_tiles_64x64[i_tile_64x64] = store_s(z_out);
		}
	}
}
Example #12
 __INTRIN_INLINE uint64_t popcount(uint64_t x)
 {
     return uint64_t(_mm_popcnt_u64(x));
 }
Example #13
AbstractBuffer<int32_t> ADCensus::constructDisparityMap(const AbstractBuffer<pixel> *leftImage, const AbstractBuffer<pixel> *rightImage,
                                                        const AbstractBuffer<grayPixel> *leftGrayImage, const AbstractBuffer<grayPixel> *rightGrayImage) {
    // Initialization

    int width = leftImage->w;
    int height = leftImage->h;

    BaseTimeStatisticsCollector collector;
    Statistics outerStats;
    outerStats.setValue("H", height);
    outerStats.setValue("W", width);

    AbstractBuffer<int32_t> bestDisparities = AbstractBuffer<int32_t>(height, width);
    AbstractBuffer<COST_TYPE> minCosts = AbstractBuffer<COST_TYPE>(height, width);
    minCosts.fillWith(-1);

    // Disparity computation
    outerStats.startInterval();

    AbstractBuffer<int64_t> leftCensus  = AbstractBuffer<int64_t>(height, width);
    AbstractBuffer<int64_t> rightCensus = AbstractBuffer<int64_t>(height, width);
    makeCensus(leftGrayImage, leftCensus);
    makeCensus(rightGrayImage, rightCensus);
    outerStats.resetInterval("Making census");

    makeAggregationCrosses(leftImage);
    outerStats.resetInterval("Making aggregation crosses");

    for (uint i = 0; i < CORE_COUNT_OF(table1); i++)
    {
        table1[i] = robust(i, lambdaCT);
        table2[i] = robust(i, lambdaAD);
    }

    bool parallelDisp = true;

    parallelable_for(0, width / 3,
                     [this, &minCosts, &bestDisparities, &leftImage, &rightImage,
                     &leftCensus, &rightCensus, &collector, height, width, parallelDisp](const BlockedRange<int> &r)
    {
        for (int d = r.begin(); d != r.end(); ++d) {
            Statistics stats;
            stats.startInterval();
            AbstractBuffer<COST_TYPE> costs = AbstractBuffer<COST_TYPE>(height, width);
            stats.resetInterval("Matrix construction");

            parallelable_for(windowHh, height - windowHh,
                             [this, &costs, &leftImage, &rightImage, &leftCensus, &rightCensus, d, width](const BlockedRange<int> &r)
            {
                for (int y = r.begin(); y != r.end(); ++y) {
                    auto *im1 = &leftImage->element(y, windowWh + d);
                    auto *im2 = &rightImage->element(y, windowWh);

                    int64_t *cen1 = &leftCensus.element(y, windowWh + d);
                    int64_t *cen2 = &rightCensus.element(y, windowWh);

                    int x = windowWh + d;

#ifdef WITH_SSE
                    for (; x < width - windowWh; x += 8) {
                        FixedVector<Int16x8, 4> c1 = SSEReader8BBBB_DDDD::read((uint32_t *)im1);
                        FixedVector<Int16x8, 4> c2 = SSEReader8BBBB_DDDD::read((uint32_t *)im2);

                        UInt16x8 dr = SSEMath::difference(UInt16x8(c1[RGBColor::FIELD_R]), UInt16x8(c2[RGBColor::FIELD_R]));
                        UInt16x8 dg = SSEMath::difference(UInt16x8(c1[RGBColor::FIELD_G]), UInt16x8(c2[RGBColor::FIELD_G]));
                        UInt16x8 db = SSEMath::difference(UInt16x8(c1[RGBColor::FIELD_B]), UInt16x8(c2[RGBColor::FIELD_B]));

                        UInt16x8 ad = (dr + dg + db) >> 2;
                        Int16x8 cost_ad = Int16x8(robustLUTAD(ad[0]),
                                                  robustLUTAD(ad[1]),
                                                  robustLUTAD(ad[2]),
                                                  robustLUTAD(ad[3]),
                                                  robustLUTAD(ad[4]),
                                                  robustLUTAD(ad[5]),
                                                  robustLUTAD(ad[6]),
                                                  robustLUTAD(ad[7]));

                        Int64x2 cen10(&cen1[0]);
                        Int64x2 cen12(&cen1[2]);
                        Int64x2 cen14(&cen1[4]);
                        Int64x2 cen16(&cen1[6]);

                        Int64x2 cen20(&cen2[0]);
                        Int64x2 cen22(&cen2[2]);
                        Int64x2 cen24(&cen2[4]);
                        Int64x2 cen26(&cen2[6]);

                        Int64x2 diff0 = cen10 ^ cen20;
                        Int64x2 diff2 = cen12 ^ cen22;
                        Int64x2 diff4 = cen14 ^ cen24;
                        Int64x2 diff6 = cen16 ^ cen26;

                        Int16x8 cost_ct(robustLUTCen(_mm_popcnt_u64(diff0.getInt(0))), robustLUTCen(_mm_popcnt_u64(diff0.getInt(1))),
                                        robustLUTCen(_mm_popcnt_u64(diff2.getInt(0))), robustLUTCen(_mm_popcnt_u64(diff2.getInt(1))),
                                        robustLUTCen(_mm_popcnt_u64(diff4.getInt(0))), robustLUTCen(_mm_popcnt_u64(diff4.getInt(1))),
                                        robustLUTCen(_mm_popcnt_u64(diff6.getInt(0))), robustLUTCen(_mm_popcnt_u64(diff6.getInt(1))));

                        Int16x8 cost_total = cost_ad + cost_ct;
                        for (int i = 0; i < 8; ++i) {
                            costs.element(y, x + i) = cost_total[i];
                        }

                        im1 += 8;
                        im2 += 8;
                        cen1+= 8;
                        cen2+= 8;
                    }
#else
                    for (; x < width - windowWh; ++x) {
                        uint8_t c_ad = costAD(*im1, *im2);
                        uint8_t c_census = hammingDist(*cen1, *cen2);

                        costs.element(y, x) = robustLUTCen(c_census) + robustLUTAD(c_ad);

                        im1 ++;
                        im2 ++;
                        cen1++;
                        cen2++;
                    }
#endif
                }
            }, !parallelDisp
            );

            stats.resetInterval("Cost computation");

            aggregateCosts(&costs, windowWh + d, windowHh, width - windowWh, height - windowHh);

            stats.resetInterval("Cost aggregation");

            for (int x = windowWh + d; x < width - windowWh; ++x) {
                for (int y = windowHh; y < height - windowHh; ++y) {
                    // A named lock is required; an unnamed temporary unlocks immediately.
                    tbb::mutex::scoped_lock lock(bestDisparitiesMutex);
                    if(costs.element(y, x) < minCosts.element(y, x)) {
                        minCosts.element(y, x) = costs.element(y, x);
                        bestDisparities.element(y, x) = d;

                        //result.element(y,x) = (bestDisparities.element(y, x) / (double)width * 255 * 3);
                    }
                }
            }
            //BMPLoader().save("../../result.bmp", result);

            stats.endInterval("Comparing with previous minimum");
            collector.addStatistics(stats);

        }
    }, parallelDisp);

    return bestDisparities;
}
Example #14
 inline uint popCount(uint64_t value)
 {
     return static_cast<uint>(_mm_popcnt_u64(value));
 }
Example #15
int xm_popcount(XMM x)
{
    return _mm_popcnt_u64(((uint64_t const*)&x)[0])
         + _mm_popcnt_u64(((uint64_t const*)&x)[1]);	// high 64-bit half
}
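If XMM is a 128-bit SSE vector (as the casts suggest), the function should report 128 for an all-ones value. A self-contained check under that assumption (XMM_check and xm_popcount_check are hypothetical names):
#include <cassert>
#include <cstdint>
#include <nmmintrin.h>

// Hypothetical sanity check; XMM is assumed to be a 128-bit SSE vector here.
typedef __m128i XMM_check;

void xm_popcount_check()
{
    XMM_check ones = _mm_set1_epi8(-1);  // all 128 bits set
    assert(_mm_popcnt_u64(((uint64_t const*)&ones)[0])
         + _mm_popcnt_u64(((uint64_t const*)&ones)[1]) == 128);
}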