int _normoptimized32(const unsigned char* a, const unsigned char* b) {
#ifdef USE_PENTIUM4
    return _normoptimized(a, b, 32);
#else
    unsigned long int _dis0a, _dis0b, _dis1a, _dis1b;
    long int _cnt0a, _cnt0b, _cnt1a, _cnt1b;
    // first
    __m128i a0 = _mm_loadu_si128((const __m128i*)(a));
    __m128i a1 = _mm_loadu_si128((const __m128i*)(a + 16));
    __m128i b0 = _mm_loadu_si128((const __m128i*)(b));
    __m128i b1 = _mm_loadu_si128((const __m128i*)(b + 16));
    b0 = _mm_xor_si128(a0, b0);
    b1 = _mm_xor_si128(a1, b1);
    a0 = _mm_srli_si128(b0, 8);
    a1 = _mm_srli_si128(b1, 8);
    _dis0a = _mm_cvtsi128_si64(b0);
    _dis0b = _mm_cvtsi128_si64(a0);
    _dis1a = _mm_cvtsi128_si64(b1);
    _dis1b = _mm_cvtsi128_si64(a1);
    _cnt0a = _mm_popcnt_u64(_dis0a);
    _cnt0b = _mm_popcnt_u64(_dis0b);
    _cnt1a = _mm_popcnt_u64(_dis1a);
    _cnt1b = _mm_popcnt_u64(_dis1b);
    return _cnt0a + _cnt0b + _cnt1a + _cnt1b;
#endif
}
int hamming_distance(uint64_t* x, uint64_t* y, int64_t k) {
    int64_t niter = (k + 63) / 64;
    int accum = 0;
    for (int64_t i = 0; i < niter; ++i) {
        accum += _mm_popcnt_u64(x[i] ^ y[i]);
    }
    return accum;
}
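// A minimal usage sketch, not part of the original source: the hypothetical demo below
// compares two 256-bit descriptors stored as four uint64_t words each, with hamming_distance
// from above in scope. It assumes a CPU with POPCNT (compile with e.g. -mpopcnt or -msse4.2).
#include <nmmintrin.h>
#include <stdint.h>
#include <stdio.h>

void hamming_distance_demo(void) {
    uint64_t x[4] = { 0xFFFFFFFFFFFFFFFFull, 0, 0, 0 };  // descriptor A
    uint64_t y[4] = { 0, 0, 0, 1 };                      // descriptor B
    // k = 256 bits -> niter = 4 words, so the distance is 64 + 1 = 65
    printf("%d\n", hamming_distance(x, y, 256));
}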
void HammingBruteForce::match_all(const int64_t *a, int n_a, const int64_t *b, int n_b) {
    std::fill(index_ab, index_ab + n_a, -1);
    std::fill(distance_ab, distance_ab + n_a, threshold);

    for (int i = 0; i < n_a; i++) {
        for (int j = 0; j < n_b; j++) {
            int d_ab0 = _mm_popcnt_u64(a[i*4 + 0] ^ b[j*4 + 0]);
            int d_ab1 = _mm_popcnt_u64(a[i*4 + 1] ^ b[j*4 + 1]);
            int d_ab2 = _mm_popcnt_u64(a[i*4 + 2] ^ b[j*4 + 2]);
            int d_ab3 = _mm_popcnt_u64(a[i*4 + 3] ^ b[j*4 + 3]);
            int d_ab = d_ab0 + d_ab1 + d_ab2 + d_ab3;

            int is_closer = d_ab < distance_ab[i];
            index_ab[i] = is_closer ? j : index_ab[i];
            distance_ab[i] = is_closer ? d_ab : distance_ab[i];
        }
    }
}
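// In match_all each descriptor occupies four consecutive int64_t words (256 bits), and the
// "is_closer ? ... : ..." pair keeps the running best match without a data-dependent branch.
// A standalone sketch of the per-pair distance, with hypothetical names not taken from the
// original class:
#include <nmmintrin.h>
#include <stdint.h>

static inline int hamming256(const int64_t *p, const int64_t *q) {
    return (int)(_mm_popcnt_u64((uint64_t)(p[0] ^ q[0])) +
                 _mm_popcnt_u64((uint64_t)(p[1] ^ q[1])) +
                 _mm_popcnt_u64((uint64_t)(p[2] ^ q[2])) +
                 _mm_popcnt_u64((uint64_t)(p[3] ^ q[3])));
}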
int _normoptimized(const unsigned char* a, const unsigned char* b, const int n) {
#ifdef USE_PENTIUM4
    int _ddt = 0;
    unsigned int _dis, _dis2, _dis3, _dis4;
    long int _cnt, _cnt2, _cnt3, _cnt4;
    for (int _i = 0; _i < n; _i += 16) {
        // xor 128 bits
        __m128i a0 = _mm_loadu_si128((const __m128i*)(a + _i));
        __m128i b0 = _mm_loadu_si128((const __m128i*)(b + _i));
        b0 = _mm_xor_si128(a0, b0);
        __m128i d = _mm_srli_si128(b0, 4);
        __m128i e = _mm_srli_si128(d, 4);
        __m128i f = _mm_srli_si128(e, 4);
        _dis = _mm_cvtsi128_si32(b0);
        _dis2 = _mm_cvtsi128_si32(d);
        _dis3 = _mm_cvtsi128_si32(e);
        _dis4 = _mm_cvtsi128_si32(f);
        // now count
        _cnt = _mm_popcnt_u32(_dis);
        _cnt2 = _mm_popcnt_u32(_dis2);
        _cnt3 = _mm_popcnt_u32(_dis3);
        _cnt4 = _mm_popcnt_u32(_dis4);
        _ddt += _cnt + _cnt2 + _cnt3 + _cnt4;
    }
    return _ddt;
#else
    int _ddt = 0;
    unsigned long int _dis, _dis2;
    long int _cnt, _cnt2;
    for (int _i = 0; _i < n; _i += 16) {
        // xor 128 bits
        __m128i a0 = _mm_loadu_si128((const __m128i*)(a + _i));
        __m128i b0 = _mm_loadu_si128((const __m128i*)(b + _i));
        b0 = _mm_xor_si128(a0, b0);
        a0 = _mm_srli_si128(b0, 8);
        _dis = _mm_cvtsi128_si64(b0);
        _dis2 = _mm_cvtsi128_si64(a0);
        _cnt = _mm_popcnt_u64(_dis);
        _cnt2 = _mm_popcnt_u64(_dis2);
        _ddt += _cnt + _cnt2;
        // other commands don't give any advantage
    }
    return _ddt;
#endif
}
size_t BitKNN::popcount(std::vector<uint64_t> &x_vec) {
    size_t c = 0;
    for (int i = 0; i < (int)x_vec.size(); i++) {
        c += _mm_popcnt_u64(x_vec[i]);
    }
    return c;
}
int _normoptimized16(const unsigned char* a, const unsigned char* b) {
#ifdef USE_PENTIUM4
    return _normoptimized(a, b, 16);
#else
    unsigned long int _dis, _dis2;
    long int _cnt, _cnt2;
    __m128i a0 = _mm_loadu_si128((const __m128i*)(a));
    __m128i b0 = _mm_loadu_si128((const __m128i*)(b));
    __m128i c = _mm_xor_si128(a0, b0);
    __m128i d = _mm_srli_si128(c, 8);
    _dis = _mm_cvtsi128_si64(c);
    _dis2 = _mm_cvtsi128_si64(d);
    _cnt = _mm_popcnt_u64(_dis);
    _cnt2 = _mm_popcnt_u64(_dis2);
    int _ddt = _cnt + _cnt2;
    // other commands don't give any advantage
    return _ddt;
#endif
}
void addKnight(solution *sol, int attackSquare, int* coverage) {
    knight* knightPrint = placements[attackSquare];

    if (knightPrint == NULL) {
        knightPrint = (knight*)calloc(1, sizeof(knight));
        int y = knightsYX[attackSquare][0];
        int x = knightsYX[attackSquare][1];
        knightPrint->placement[y] |= _rotr64(UNIT, x);
        knightPrint->coverage[y] |= _rotr64(UNIT, x);
        if ((y >= STEP_X) && (x >= STEP_Y)) {
            knightPrint->coverage[y - STEP_X] |= _rotr64(UNIT, (x - STEP_Y));
        }
        if (y >= STEP_X) {
            knightPrint->coverage[y - STEP_X] |= _rotr64(UNIT, (x + STEP_Y));
        }
        if (x >= STEP_Y) {
            knightPrint->coverage[y + STEP_X] |= _rotr64(UNIT, (x - STEP_Y));
        }
        knightPrint->coverage[y + STEP_X] |= _rotr64(UNIT, (x + STEP_Y));
        if ((y >= STEP_Y) && (x >= STEP_X)) {
            knightPrint->coverage[y - STEP_Y] |= _rotr64(UNIT, (x - STEP_X));
        }
        if (y >= STEP_Y) {
            knightPrint->coverage[y - STEP_Y] |= _rotr64(UNIT, (x + STEP_X));
        }
        if (x >= STEP_X) {
            knightPrint->coverage[y + STEP_Y] |= _rotr64(UNIT, (x - STEP_X));
        }
        knightPrint->coverage[y + STEP_Y] |= _rotr64(UNIT, (x + STEP_X));
        placements[attackSquare] = knightPrint;
    }

    long long* solPtr = (long long*)sol;
    long long* knightPtr = (long long*)knightPrint;
    long long cover = 0;
    for (int i = sizeof(solution) / sizeof(long long); i > 0; i--, solPtr++, knightPtr++) {
        *solPtr |= *knightPtr;
        if (i <= 64) {
            cover += _mm_popcnt_u64(*solPtr);
        }
    }
    *coverage = (int)cover;
}
size_t BitKNN::inner_prod(std::vector<uint64_t> &x_vec1, std::vector<uint64_t> &x_vec2) {
    /*
    if (x_vec1.size() != x_vec2.size()) {
        std::cerr << "ERROR: The sizes of the two vectors are inconsistent." << std::endl;
        exit(1);
    }
    */
    size_t c = 0;
    for (int i = 0; i < (int)x_vec1.size(); i++) {
        uint64_t x = x_vec1[i] & x_vec2[i];
        c += _mm_popcnt_u64(x);
    }
    return c;
}
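// inner_prod above counts the positions set in both bit-vectors, i.e. the size of the
// intersection of two bitsets packed 64 bits per word. A free-function sketch of the same
// computation (hypothetical, not part of the BitKNN API), which also guards the size
// mismatch the original leaves to the commented-out check:
#include <nmmintrin.h>
#include <cstdint>
#include <vector>

static size_t bitset_intersection_size(const std::vector<uint64_t> &a,
                                       const std::vector<uint64_t> &b) {
    size_t n = a.size() < b.size() ? a.size() : b.size();
    size_t c = 0;
    for (size_t i = 0; i < n; ++i)
        c += _mm_popcnt_u64(a[i] & b[i]);   // bits common to both words
    return c;
}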
int _normoptimized64(const unsigned char* a, const unsigned char* b) {
#ifdef USE_PENTIUM4
    return _normoptimized(a, b, 64);
#else
    unsigned long int _dis0a, _dis0b, _dis1a, _dis1b, _dis2a, _dis2b, _dis3a, _dis3b;
    long int _cnt0a, _cnt0b, _cnt1a, _cnt1b, _cnt2a, _cnt2b, _cnt3a, _cnt3b;
    // first
    __m128i a0 = _mm_loadu_si128((const __m128i*)(a));
    __m128i a1 = _mm_loadu_si128((const __m128i*)(a + 16));
    __m128i a2 = _mm_loadu_si128((const __m128i*)(a + 32));
    __m128i a3 = _mm_loadu_si128((const __m128i*)(a + 48));
    __m128i b0 = _mm_loadu_si128((const __m128i*)(b));
    __m128i b1 = _mm_loadu_si128((const __m128i*)(b + 16));
    __m128i b2 = _mm_loadu_si128((const __m128i*)(b + 32));
    __m128i b3 = _mm_loadu_si128((const __m128i*)(b + 48));
    b0 = _mm_xor_si128(a0, b0);
    b1 = _mm_xor_si128(a1, b1);
    b2 = _mm_xor_si128(a2, b2);
    b3 = _mm_xor_si128(a3, b3);
    a0 = _mm_srli_si128(b0, 8);
    a1 = _mm_srli_si128(b1, 8);
    a2 = _mm_srli_si128(b2, 8);
    a3 = _mm_srli_si128(b3, 8);
    _dis0a = _mm_cvtsi128_si64(b0);
    _dis0b = _mm_cvtsi128_si64(a0);
    _dis1a = _mm_cvtsi128_si64(b1);
    _dis1b = _mm_cvtsi128_si64(a1);
    _dis2a = _mm_cvtsi128_si64(b2);
    _dis2b = _mm_cvtsi128_si64(a2);
    _dis3a = _mm_cvtsi128_si64(b3);
    _dis3b = _mm_cvtsi128_si64(a3);
    _cnt0a = _mm_popcnt_u64(_dis0a);
    _cnt0b = _mm_popcnt_u64(_dis0b);
    _cnt1a = _mm_popcnt_u64(_dis1a);
    _cnt1b = _mm_popcnt_u64(_dis1b);
    _cnt2a = _mm_popcnt_u64(_dis2a);
    _cnt2b = _mm_popcnt_u64(_dis2b);
    _cnt3a = _mm_popcnt_u64(_dis3a);
    _cnt3b = _mm_popcnt_u64(_dis3b);
    return _cnt0a + _cnt0b + _cnt1a + _cnt1b + _cnt2a + _cnt2b + _cnt3a + _cnt3b;
#endif
}
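// A minimal usage sketch for the _normoptimized* family (hypothetical buffers and demo
// function, not from the original source): the specialised 16/32/64-byte versions and the
// generic loop agree on the same inputs, so a caller can pick by descriptor length.
#include <nmmintrin.h>
#include <string.h>
#include <stdio.h>

void normoptimized_demo(void) {
    unsigned char a[64], b[64];
    memset(a, 0xFF, sizeof(a));   // all 512 bits set
    memset(b, 0x00, sizeof(b));   // all bits clear
    b[0] = 0x0F;                  // low nibble set: 4 of the 8 bits in byte 0 now match
    // 4 differing bits in byte 0 plus 63 * 8 elsewhere -> both calls print 508
    printf("%d %d\n", _normoptimized64(a, b), _normoptimized(a, b, 64));
}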
uint64_t avx512f_harley_seal(const uint64_t * data, size_t size) {
    const unsigned int wordspervector = sizeof(__m512i) / sizeof(uint64_t);
    const unsigned int minvit = 16 * wordspervector;
    uint64_t total;
    size_t i;

    if (size >= minvit) {
        total = popcnt_harley_seal((const __m512i*) data, size / wordspervector);
        i = size - size % wordspervector;
    } else {
        total = 0;
        i = 0;
    }

    for (/**/; i < size; i++) {
        total += _mm_popcnt_u64(data[i]);
    }
    return total;
}
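// avx512f_harley_seal only vectorises inputs of at least 16 ZMM words (128 uint64_t) and
// handles the tail with scalar POPCNT. A scalar reference for checking the vector path is
// simply that tail loop run over the whole buffer (a sketch, not from the original source;
// popcnt_harley_seal itself is defined elsewhere in that codebase).
#include <nmmintrin.h>
#include <stdint.h>
#include <stddef.h>

static uint64_t popcount_scalar_reference(const uint64_t *data, size_t size) {
    uint64_t total = 0;
    for (size_t i = 0; i < size; i++)
        total += _mm_popcnt_u64(data[i]);
    return total;
}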
/* ================== ================== */

void Process_Fragments(raster_output_& raster_output, shader_input_& shader_input) {

    const __m128 zero = set_all(0.0f);

    shader_input.tile_mask_16x16 = 0x0;
    shader_input.tile_mask_64x64 = 0x0;

    //===============================================================================================
    {
        const __int32 n_fragments = raster_output.n_fragments[raster_output_::TRIVIAL_ACCEPT_64x64];

        for (__int32 i_fragment = 0; i_fragment < n_fragments; i_fragment++) {

            raster_fragment_& raster_fragment = raster_output.raster_fragment[raster_output_::TRIVIAL_ACCEPT_64x64][i_fragment];
            const __int32 i_buffer = raster_fragment.buffer_mask_packed >> 16;
            const unsigned __int32 coverage_mask = raster_fragment.buffer_mask_packed & 0xffff;
            Process_Fragment_64x64(raster_fragment.w, i_buffer, coverage_mask, raster_output, shader_input);
        }
    }
    //===============================================================================================
    {
        const __int32 n_fragments = raster_output.n_fragments[raster_output_::TRIVIAL_ACCEPT_16x16];

        for (__int32 i_fragment = 0; i_fragment < n_fragments; i_fragment++) {

            raster_fragment_& raster_fragment = raster_output.raster_fragment[raster_output_::TRIVIAL_ACCEPT_16x16][i_fragment];
            const __int32 i_buffer = raster_fragment.buffer_mask_packed >> 16;
            const unsigned __int32 coverage_mask = raster_fragment.buffer_mask_packed & 0xffff;
            Process_Fragment_16x16(raster_fragment.w, 0, i_buffer, coverage_mask, raster_output, shader_input);
        }
    }
    //===============================================================================================
    {
        const __int32 n_fragments = raster_output.n_fragments[raster_output_::TRIVIAL_ACCEPT_4x4];

        for (__int32 i_fragment = 0; i_fragment < n_fragments; i_fragment++) {

            raster_fragment_& raster_fragment = raster_output.raster_fragment[raster_output_::TRIVIAL_ACCEPT_4x4][i_fragment];
            const __int32 i_buffer = raster_fragment.buffer_mask_packed >> 16;
            const unsigned __int32 coverage_mask = raster_fragment.buffer_mask_packed & 0xffff;
            Process_Fragment_4x4(raster_fragment.w, 0, i_buffer, coverage_mask, raster_output, shader_input);
        }
    }
    //===============================================================================================
    {
        //const __int32 start = raster_output_::MAX_FRAGMENTS - 1;
        //const __int32 end = raster_output.n_fragments[raster_output_::PARTIAL_ACCEPT_4x4];

        //for (__int32 i_fragment = start; i_fragment > end; i_fragment--) {

        //	raster_fragment_& raster_fragment = raster_output.raster_fragment[raster_output_::PARTIAL_ACCEPT_4x4][i_fragment];
        //	const __int32 i_buffer = raster_fragment.buffer_mask_packed >> 16;
        //	const unsigned __int32 coverage_mask = raster_fragment.buffer_mask_packed & 0xffff;
        //	Process_Fragment_4x4(raster_fragment.w, 0, i_buffer, coverage_mask, raster_output, shader_input);
        //}
    }
    //===============================================================================================
    {
        const __int32 n_fragments = raster_output.n_fragments_COMPLETE;
        __int32 n_depth_fragments = 0;

        for (__int32 i_fragment = 0; i_fragment < n_fragments; i_fragment++) {

            raster_fragment_complete_& raster_fragment = raster_output.raster_fragment_complete[i_fragment];
            const __int32 i_buffer = raster_fragment.buffer_mask_packed >> 16;
            const unsigned __int32 coverage_mask = raster_fragment.buffer_mask_packed & 0xffff;
            pixel_shader(i_buffer, coverage_mask, raster_fragment.bazza, shader_input);

            const __int32 i_buffer_depth_4x4 = i_buffer / (4 * 4);
            const __int32 i_buffer_depth_16x16 = i_buffer / (16 * 16);
            const __int32 i_buffer_depth_64x64 = i_buffer / (64 * 64);
            shader_input.depth_tiles_4x4[i_buffer_depth_4x4] = shader_input.z_max;
            shader_input.tile_mask_16x16 |= one_bit_64 << i_buffer_depth_16x16;
            shader_input.tile_mask_64x64 |= one_bit_64 << i_buffer_depth_64x64;
        }
    }
    //===============================================================================================
    {
        //printf_s(" %llu ", shader_input.tile_mask_16x16);

        __int64 n_tiles = _mm_popcnt_u64(shader_input.tile_mask_16x16);

        for (__int32 i_bit = 0; i_bit < n_tiles; i_bit++) {

            unsigned long i_tile_16x16;
            _BitScanForward64(&i_tile_16x16, shader_input.tile_mask_16x16);
            shader_input.tile_mask_16x16 ^= one_bit_64 << i_tile_16x16;

            const __int32 i_tile_4x4 = i_tile_16x16 * (4 * 4);

            __m128 depth_4x4[4];
            depth_4x4[0] = load_u(shader_input.depth_tiles_4x4 + i_tile_4x4 + (0 * 4));
            depth_4x4[1] = load_u(shader_input.depth_tiles_4x4 + i_tile_4x4 + (1 * 4));
            depth_4x4[2] = load_u(shader_input.depth_tiles_4x4 + i_tile_4x4 + (2 * 4));
            depth_4x4[3] = load_u(shader_input.depth_tiles_4x4 + i_tile_4x4 + (3 * 4));

            __m128 z_max;
            z_max = depth_4x4[0];
            z_max = min_vec(depth_4x4[1], z_max);
            z_max = min_vec(depth_4x4[2], z_max);
            z_max = min_vec(depth_4x4[3], z_max);

            __m128 z_out = z_max;
            z_max = rotate_left(z_max);
            z_out = min_vec(z_max, z_out);
            z_max = rotate_left(z_max);
            z_out = min_vec(z_max, z_out);
            z_max = rotate_left(z_max);
            z_out = min_vec(z_max, z_out);

            shader_input.depth_tiles_16x16[i_tile_16x16] = store_s(z_out);
        }
    }
    {
        __int64 n_tiles = _mm_popcnt_u64(shader_input.tile_mask_64x64);
        //printf_s(" %llu ", n_tiles);

        for (__int32 i_bit = 0; i_bit < n_tiles; i_bit++) {

            unsigned long i_tile_64x64;
            _BitScanForward64(&i_tile_64x64, shader_input.tile_mask_64x64);
            shader_input.tile_mask_64x64 ^= one_bit_64 << i_tile_64x64;

            const __int32 i_tile_16x16 = i_tile_64x64 * (4 * 4);

            __m128 depth_16x16[4];
            depth_16x16[0] = load_u(shader_input.depth_tiles_16x16 + i_tile_16x16 + (0 * 4));
            depth_16x16[1] = load_u(shader_input.depth_tiles_16x16 + i_tile_16x16 + (1 * 4));
            depth_16x16[2] = load_u(shader_input.depth_tiles_16x16 + i_tile_16x16 + (2 * 4));
            depth_16x16[3] = load_u(shader_input.depth_tiles_16x16 + i_tile_16x16 + (3 * 4));

            __m128 z_max;
            z_max = depth_16x16[0];
            z_max = min_vec(depth_16x16[1], z_max);
            z_max = min_vec(depth_16x16[2], z_max);
            z_max = min_vec(depth_16x16[3], z_max);

            __m128 z_out = z_max;
            z_max = rotate_left(z_max);
            z_out = min_vec(z_max, z_out);
            z_max = rotate_left(z_max);
            z_out = min_vec(z_max, z_out);
            z_max = rotate_left(z_max);
            z_out = min_vec(z_max, z_out);

            shader_input.depth_tiles_64x64[i_tile_64x64] = store_s(z_out);
        }
    }
}
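// Both tile passes above walk the set bits of a 64-bit mask with the same idiom:
// _mm_popcnt_u64 gives the trip count, _BitScanForward64 finds the lowest set bit, and an
// XOR clears it. A standalone sketch of that idiom with hypothetical names (MSVC intrinsics,
// as used above), not part of the original source:
#include <intrin.h>

static void for_each_set_bit(unsigned __int64 mask, void (*visit)(unsigned long index)) {
    const unsigned __int64 one_bit = 1;
    __int64 n_bits = _mm_popcnt_u64(mask);      // number of set bits = loop trip count
    for (__int32 i = 0; i < n_bits; i++) {
        unsigned long index;
        _BitScanForward64(&index, mask);        // index of the lowest set bit
        mask ^= one_bit << index;               // clear it
        visit(index);
    }
}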
__INTRIN_INLINE uint64_t popcount(uint64_t x) { return uint64_t(_mm_popcnt_u64(x)); }
AbstractBuffer<int32_t> ADCensus::constructDisparityMap(const AbstractBuffer<pixel> *leftImage, const AbstractBuffer<pixel> *rightImage,
                                                        const AbstractBuffer<grayPixel> *leftGrayImage, const AbstractBuffer<grayPixel> *rightGrayImage) {
    // Initialization
    int width = leftImage->w;
    int height = leftImage->h;

    BaseTimeStatisticsCollector collector;
    Statistics outerStats;
    outerStats.setValue("H", height);
    outerStats.setValue("W", width);

    AbstractBuffer<int32_t> bestDisparities = AbstractBuffer<int32_t>(height, width);
    AbstractBuffer<COST_TYPE> minCosts = AbstractBuffer<COST_TYPE>(height, width);
    minCosts.fillWith(-1);

    // Disparity computation
    outerStats.startInterval();
    AbstractBuffer<int64_t> leftCensus = AbstractBuffer<int64_t>(height, width);
    AbstractBuffer<int64_t> rightCensus = AbstractBuffer<int64_t>(height, width);
    makeCensus(leftGrayImage, leftCensus);
    makeCensus(rightGrayImage, rightCensus);
    outerStats.resetInterval("Making census");

    makeAggregationCrosses(leftImage);
    outerStats.resetInterval("Making aggregation crosses");

    for (uint i = 0; i < CORE_COUNT_OF(table1); i++) {
        table1[i] = robust(i, lambdaCT);
        table2[i] = robust(i, lambdaAD);
    }

    bool parallelDisp = true;

    parallelable_for(0, width / 3,
                     [this, &minCosts, &bestDisparities, &leftImage, &rightImage, &leftCensus, &rightCensus,
                      &collector, height, width, parallelDisp](const BlockedRange<int> &r) {
        for (int d = r.begin(); d != r.end(); ++d) {
            Statistics stats;
            stats.startInterval();
            AbstractBuffer<COST_TYPE> costs = AbstractBuffer<COST_TYPE>(height, width);
            stats.resetInterval("Matrix construction");

            parallelable_for(windowHh, height - windowHh,
                             [this, &costs, &leftImage, &rightImage, &leftCensus, &rightCensus, d, width](const BlockedRange<int> &r) {
                for (int y = r.begin(); y != r.end(); ++y) {
                    auto *im1 = &leftImage->element(y, windowWh + d);
                    auto *im2 = &rightImage->element(y, windowWh);

                    int64_t *cen1 = &leftCensus.element(y, windowWh + d);
                    int64_t *cen2 = &rightCensus.element(y, windowWh);

                    int x = windowWh + d;
#ifdef WITH_SSE
                    for (; x < width - windowWh; x += 8) {
                        FixedVector<Int16x8, 4> c1 = SSEReader8BBBB_DDDD::read((uint32_t *)im1);
                        FixedVector<Int16x8, 4> c2 = SSEReader8BBBB_DDDD::read((uint32_t *)im2);

                        UInt16x8 dr = SSEMath::difference(UInt16x8(c1[RGBColor::FIELD_R]), UInt16x8(c2[RGBColor::FIELD_R]));
                        UInt16x8 dg = SSEMath::difference(UInt16x8(c1[RGBColor::FIELD_G]), UInt16x8(c2[RGBColor::FIELD_G]));
                        UInt16x8 db = SSEMath::difference(UInt16x8(c1[RGBColor::FIELD_B]), UInt16x8(c2[RGBColor::FIELD_B]));

                        UInt16x8 ad = (dr + dg + db) >> 2;
                        Int16x8 cost_ad = Int16x8(robustLUTAD(ad[0]), robustLUTAD(ad[1]), robustLUTAD(ad[2]), robustLUTAD(ad[3]),
                                                  robustLUTAD(ad[4]), robustLUTAD(ad[5]), robustLUTAD(ad[6]), robustLUTAD(ad[7]));

                        Int64x2 cen10(&cen1[0]);
                        Int64x2 cen12(&cen1[2]);
                        Int64x2 cen14(&cen1[4]);
                        Int64x2 cen16(&cen1[6]);

                        Int64x2 cen20(&cen2[0]);
                        Int64x2 cen22(&cen2[2]);
                        Int64x2 cen24(&cen2[4]);
                        Int64x2 cen26(&cen2[6]);

                        Int64x2 diff0 = cen10 ^ cen20;
                        Int64x2 diff2 = cen12 ^ cen22;
                        Int64x2 diff4 = cen14 ^ cen24;
                        Int64x2 diff6 = cen16 ^ cen26;

                        Int16x8 cost_ct(robustLUTCen(_mm_popcnt_u64(diff0.getInt(0))), robustLUTCen(_mm_popcnt_u64(diff0.getInt(1))),
                                        robustLUTCen(_mm_popcnt_u64(diff2.getInt(0))), robustLUTCen(_mm_popcnt_u64(diff2.getInt(1))),
                                        robustLUTCen(_mm_popcnt_u64(diff4.getInt(0))), robustLUTCen(_mm_popcnt_u64(diff4.getInt(1))),
                                        robustLUTCen(_mm_popcnt_u64(diff6.getInt(0))), robustLUTCen(_mm_popcnt_u64(diff6.getInt(1))));

                        Int16x8 cost_total = cost_ad + cost_ct;
                        for (int i = 0; i < 8; ++i) {
                            costs.element(y, x + i) = cost_total[i];
                        }

                        im1 += 8;
                        im2 += 8;
                        cen1 += 8;
                        cen2 += 8;
                    }
#else
                    for (; x < width - windowWh; ++x) {
                        uint8_t c_ad = costAD(*im1, *im2);
                        uint8_t c_census = hammingDist(*cen1, *cen2);

                        costs.element(y, x) = robustLUTCen(c_census) + robustLUTAD(c_ad);

                        im1++;
                        im2++;
                        cen1++;
                        cen2++;
                    }
#endif
                }
            }, !parallelDisp);

            stats.resetInterval("Cost computation");

            aggregateCosts(&costs, windowWh + d, windowHh, width - windowWh, height - windowHh);
            stats.resetInterval("Cost aggregation");

            for (int x = windowWh + d; x < width - windowWh; ++x) {
                for (int y = windowHh; y < height - windowHh; ++y) {
                    // the lock must be named so it is held for the whole compare-and-update
                    tbb::mutex::scoped_lock lock(bestDisparitiesMutex);
                    if (costs.element(y, x) < minCosts.element(y, x)) {
                        minCosts.element(y, x) = costs.element(y, x);
                        bestDisparities.element(y, x) = d;
                        //result.element(y,x) = (bestDisparities.element(y, x) / (double)width * 255 * 3);
                    }
                }
            }
            //BMPLoader().save("../../result.bmp", result);
            stats.endInterval("Comparing with previous minimum");
            collector.addStatistics(stats);
        }
    }, parallelDisp);
inline uint popCount(uint64_t value) { return static_cast<uint>(_mm_popcnt_u64(value)); }
int xm_popcount(XMM x) {
    // count the bits in both 64-bit halves of the 128-bit register
    return _mm_popcnt_u64(((uint64_t const*)&x)[0])
         + _mm_popcnt_u64(((uint64_t const*)&x)[1]);
}
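// An equivalent that avoids the pointer cast, assuming XMM is (or is layout-compatible with)
// __m128i: extract the two 64-bit lanes with SSE intrinsics and popcount each. A sketch, not
// from the original source; _mm_extract_epi64 requires SSE4.1.
#include <nmmintrin.h>
#include <smmintrin.h>
#include <stdint.h>

static inline int popcount_m128i(__m128i x) {
    return (int)(_mm_popcnt_u64((uint64_t)_mm_cvtsi128_si64(x)) +     // low 64 bits
                 _mm_popcnt_u64((uint64_t)_mm_extract_epi64(x, 1)));  // high 64 bits
}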