/**
 * Micro-benchmark of stringstream construction, formatting and destruction.
 *
 * For each of `rep` repetitions the timer is reset, then `N` stringstreams
 * are created, each receiving "<i> <j>", and the elapsed time reported by
 * WallClockTimer::split() is accumulated. The address of each formatted
 * string is folded into `sum` and printed ("Ignore: ...") so the loop body
 * has an observable result.
 *
 * @param N   number of stringstreams built per repetition
 * @param rep number of timed repetitions
 */
void test2(int N, int rep) {
    WallClockTimer timer;
    uint64_t total = 0;
    uint64_t sum = 0;
    for (int j = 0; j < rep; ++j) {
        timer.reset();
        for (int i = 0; i < N; i++) {
            stringstream str;
            str << i << " " << j;
            sum += reinterpret_cast<size_t>(str.str().c_str());
        }
        total += timer.split();
    }
    // Widen before multiplying: rep * N evaluated in int overflows (undefined
    // behaviour) once the product exceeds INT_MAX. A signed 64-bit type keeps
    // the printed value identical for every input that did not overflow.
    const int64_t operations = static_cast<int64_t>(rep) * static_cast<int64_t>(N);
    cout << "Ignore: " << sum << endl;
    cout << " total # of construct/destruct/proc: " << operations
         << ", time " << total / 1e3 << " ms"
         << " Construct/destruct/proc per sec: " << (operations * 1e6 / total)
         << endl;
}
void test(size_t N ) { WallClockTimer time; for(int t = 0; t<2;++t) { cout <<" test # "<< t<<endl; vector<int> data = givemeanarray(N) ; { vector<int> copydata(data); cout << "min distance between ints is "<<mindist<<endl; time.reset(); delta<mindist>(data); cout<<"delta speed "<<N/(1000.0*time.split())<<endl; time.reset(); slowishinverseDelta1<mindist>(data); cout<<"Slowish(1) inverse delta speed "<<N/(1000.0*time.split())<<endl; if(data != copydata) throw runtime_error("bug!"); cout<<endl; } { vector<int> copydata(data); cout << "min distance between ints is "<<mindist<<endl; time.reset(); delta<mindist>(data); cout<<"delta speed "<<N/(1000.0*time.split())<<endl; time.reset(); slowishinverseDelta2<mindist>(data); cout<<"Slowish(2) inverse delta speed "<<N/(1000.0*time.split())<<endl; if(data != copydata) throw runtime_error("bug!"); cout<<endl; } { vector<int> copydata(data); cout << "min distance between ints is "<<mindist<<endl; time.reset(); delta<mindist>(data); cout<<"delta speed "<<N/(1000.0*time.split())<<endl; time.reset(); inverseDelta<mindist>(data); cout<<"Unroll2 inverse delta speed "<<N/(1000.0*time.split())<<endl; if(data != copydata) throw runtime_error("bug!"); cout<<endl; } { vector<int> copydata(data); cout << "min distance between ints is "<<mindist<<endl; time.reset(); delta<mindist>(data); cout<<"delta speed "<<N/(1000.0*time.split())<<endl; time.reset(); inverseDeltaMem<mindist>(data); cout<<"Unroll2 (mem) inverse delta speed "<<N/(1000.0*time.split())<<endl; if(data != copydata) throw runtime_error("bug!"); cout<<endl; } { vector<int> copydata(data); cout << "min distance between ints is "<<mindist<<endl; time.reset(); delta<mindist>(data); cout<<"delta speed "<<N/(1000.0*time.split())<<endl; time.reset(); inverseDeltaMy1<mindist>(data); cout<<"My1 inverse delta speed "<<N/(1000.0*time.split())<<endl; if(data != copydata) throw runtime_error("bug!"); cout<<endl; } #if 0 { vector<int> copydata(data); cout << "min distance between ints is 
"<<mindist<<endl; time.reset(); delta<mindist>(data); cout<<"delta speed "<<N/(1000.0*time.split())<<endl; time.reset(); inverseDeltaMy2<mindist>(data); cout<<"My2 inverse delta speed "<<N/(1000.0*time.split())<<endl; if(data != copydata) throw runtime_error("bug!"); cout<<endl; } { vector<int> copydata(data); cout << "min distance between ints is "<<mindist<<endl; time.reset(); delta<mindist>(data); cout<<"delta speed "<<N/(1000.0*time.split())<<endl; time.reset(); inverseDeltaMy3<mindist>(data); cout<<"My3 inverse delta speed "<<N/(1000.0*time.split())<<endl; if(data != copydata) throw runtime_error("bug!"); cout<<endl; } #endif { int* pCopyData = (int*)memalign(16, data.size() * sizeof data[0]); if (!pCopyData) { throw runtime_error("Not enough memory"); } memcpy(pCopyData, &data[0], data.size() * sizeof data[0]); cout << "min distance between ints is "<<mindist<<endl; time.reset(); deltaForSIMD<mindist>(pCopyData, data.size()); cout<<"for SIMD delta speed "<<N/(1000.0*time.split())<<endl; time.reset(); inverseDeltaSIMD<mindist>(pCopyData, data.size()); cout<<"SIMD inverse delta speed "<<N/(1000.0*time.split())<<endl; for (size_t i = 0; i < data.size(); ++i) { if (data[i] != pCopyData[i]) { cerr << "Elem index: " << i << " orig: " << data[i] << " obtained: " << pCopyData[i] << endl; throw runtime_error("bug"); } } free(pCopyData); cout<<endl; } { int* pCopyData = (int*)memalign(16, data.size() * sizeof data[0]); if (!pCopyData) { throw runtime_error("Not enough memory"); } memcpy(pCopyData, &data[0], data.size() * sizeof data[0]); cout << "min distance between ints is "<<mindist<<endl; time.reset(); deltaForSIMD<mindist>(pCopyData, data.size()); cout<<"for SIMD delta speed "<<N/(1000.0*time.split())<<endl; time.reset(); inverseDeltaSIMDUnrolled<mindist>(pCopyData, data.size()); cout<<"SIMD inverse UNROLLED delta speed "<<N/(1000.0*time.split())<<endl; for (size_t i = 0; i < data.size(); ++i) { if (data[i] != pCopyData[i]) { cerr << "Elem index: " << i << " orig: 
" << data[i] << " obtained: " << pCopyData[i] << endl; throw runtime_error("bug"); } } free(pCopyData); cout<<endl; } cout<<endl<<endl<<endl; } }
int main(int argc, char **argv) { size_t howmany = 0; size_t loop = 3; bool uniform = false; uint32_t Big = 22; float intersectionratio = 0.3f; uint32_t MaxBit = 26; int c; while ((c = getopt(argc, argv, "uns:m:R:M:S:l:h")) != -1) switch (c) { case 'h': printusage(); return 0; case 'S': Big = atoi(optarg); break; case 'R': intersectionratio = atof(optarg); break; case 'M': MaxBit = atoi(optarg); if (MaxBit < 1) { printusage(); return -1; } break; case 'm': howmany = atoi(optarg); if (howmany < 1) { printusage(); return -1; } break; case 'l': loop = atoi(optarg); if (loop < 1) { printusage(); return -1; } break; case 'u': uniform = true; break; default: abort(); } if (howmany == 0) { howmany = 5; } cout << "# howmany : " << howmany << endl; cout << "# loop : " << loop << endl; cout << "# distribution : " << (uniform ? "uniform" : "clustered") << endl; cout << "# Big : " << Big << endl; cout << "# intersectionratio : " << intersectionratio << endl; cout << "# MaxBit : " << MaxBit << endl; UniformDataGenerator udg; ClusteredDataGenerator cdg; WallClockTimer z; size_t bogus = 0; vector<uint32_t> buffer(2 * (1U << Big)); #ifdef LIKWID_MARKERS char currentMarker[64]; likwid_markerInit(); #endif cout << "# size-ratio\t"; for (string intername : IntersectionFactory::allNames()) { cout << intername << "\t"; } cout << " partioned (Schlegel et al.: improved, original) 16-bitV1 " "16-bitscalar "; cout << "relative-intersection-size " << endl; for (float ir = 1.001; ir <= 10000; ir = ir * sqrt(1.9)) { vector<pair<vector<uint32_t>, vector<uint32_t>>> data(howmany); uint32_t smallsize = static_cast<uint32_t>(round(static_cast<float>(1 << Big) / ir)); cout << "#generating data..."; cout.flush(); for (size_t k = 0; k < howmany; ++k) { data[k] = uniform ? getNaivePair(udg, smallsize, 1U << MaxBit, ir, intersectionratio) : getNaivePair(cdg, smallsize, 1U << MaxBit, ir, intersectionratio); } cout << "ok." 
<< endl; cout << "#partitions..."; vector<pair<vector<uint16_t>, vector<uint16_t>>> datapart(howmany); for (size_t k = 0; k < howmany; ++k) { vector<uint16_t> part1(data[k].first.size() * 4); size_t p1length = partition(data[k].first.data(), data[k].first.size(), part1.data(), part1.size()); part1.resize(p1length); part1.shrink_to_fit(); vector<uint16_t> part2(data[k].second.size() * 4); size_t p2length = partition(data[k].second.data(), data[k].second.size(), part2.data(), part2.size()); part2.resize(p2length); part2.shrink_to_fit(); datapart[k] = make_pair(part1, part2); } cout << "ok." << endl; cout << ir << "\t"; float aratio = 0.0f; for (string intername : IntersectionFactory::allNames()) { intersectionfunction interfnc = IntersectionFactory::getFromName(intername); size_t volume = 0; #ifdef LIKWID_MARKERS snprintf(currentMarker, sizeof(currentMarker), "%s %.2f", intername.c_str(), ir); likwid_markerStartRegion(currentMarker); #endif z.reset(); for (size_t k = 0; k < data.size(); ++k) { volume += (data[k].first.size() + data[k].second.size()) * loop; for (size_t L = 0; L < loop; ++L) { aratio = interfnc(data[k].first.data(), (data[k].first).size(), data[k].second.data(), (data[k].second).size(), buffer.data()); bogus += aratio; } } cout << setw(10) << setprecision(5) << (volume / (static_cast<double>(z.split()))) << "\t"; #ifdef LIKWID_MARKERS likwid_markerStopRegion(currentMarker); #endif } z.reset(); size_t volume = 0; for (size_t k = 0; k < data.size(); ++k) { volume += (data[k].first.size() + data[k].second.size()) * loop; for (size_t L = 0; L < loop; ++L) { aratio = intersect_partitioned( datapart[k].first.data(), (datapart[k].first).size(), datapart[k].second.data(), (datapart[k].second).size(), (uint16_t *)buffer.data()); bogus += aratio; } } cout << setw(10) << setprecision(5) << (volume / (static_cast<double>(z.split()))) << "\t"; z.reset(); volume = 0; for (size_t k = 0; k < data.size(); ++k) { volume += (data[k].first.size() + data[k].second.size()) 
* loop; for (size_t L = 0; L < loop; ++L) { aratio = original_intersect_partitioned( datapart[k].first.data(), (datapart[k].first).size(), datapart[k].second.data(), (datapart[k].second).size(), (uint16_t *)buffer.data()); bogus += aratio; } } cout << setw(10) << setprecision(5) << (volume / (static_cast<double>(z.split()))) << "\t"; z.reset(); volume = 0; for (size_t k = 0; k < data.size(); ++k) { volume += (data[k].first.size() + data[k].second.size()) * loop; for (size_t L = 0; L < loop; ++L) { aratio = intersect_partitionedV1( datapart[k].first.data(), (datapart[k].first).size(), datapart[k].second.data(), (datapart[k].second).size(), (uint16_t *)buffer.data()); bogus += aratio; } } cout << setw(10) << setprecision(5) << (volume / (static_cast<double>(z.split()))) << "\t"; z.reset(); volume = 0; for (size_t k = 0; k < data.size(); ++k) { volume += (data[k].first.size() + data[k].second.size()) * loop; for (size_t L = 0; L < loop; ++L) { aratio = intersect_partitionedscalar( datapart[k].first.data(), (datapart[k].first).size(), datapart[k].second.data(), (datapart[k].second).size(), (uint16_t *)buffer.data()); bogus += aratio; } } cout << setw(10) << setprecision(5) << (volume / (static_cast<double>(z.split()))) << "\t"; cout << "\t\t" << aratio / smallsize; cout << endl; } #ifdef LIKWID_MARKERS likwid_markerClose(); #endif cout << "# bogus = " << bogus << endl; }
int main() { assert(sizeof(long)==8); assert(sizeof(int)==4); WallClockTimer timer; int repeat = 100; int N = 10000; cout<<"# We report bits-per-integer speed-of-naive speed-of-popcnt1 speed-of-popcnt2 speed-of-table speed-of-tzcnt1 speed-of-tzcnt2 where speeds are in millions of integers per second "<<endl; for(int sb = 1; sb<=64; sb*=2) { int setbitsmax = sb*N; vector<long> bitmap(N); for (int k = 0; k < setbitsmax; ++k) { int bit = rand() % (N*64); bitmap[bit/64] |= (1L<<(bit%64)); } int bitcount = 0; for(int k = 0; k <N; ++k) { bitcount += __builtin_popcountl(bitmap[k]); } double bitsperinteger = N*sizeof(long)*8.0/bitcount; vector<int> outputnaive(bitcount); vector<int> outputpopcnt1(bitcount); vector<int> outputpopcnt2(bitcount); vector<int> outputtable(bitcount); vector<int> outputctz1(bitcount); vector<int> outputctz2(bitcount); cout<<"# Stored "<<bitcount<<" unary numbers in "; cout<< N*sizeof(long)<<" bytes " ; cout<<" ("<<bitsperinteger<<" bits per number)"<<endl; timer.reset(); int c0 = 0; for(int t1=0; t1<repeat; ++t1) c0 = bitscanunary_naive(bitmap.data(),N,outputnaive.data()); int tinaive = timer.split(); timer.reset(); int c1 = 0; for(int t1=0; t1<repeat; ++t1) c1 = bitscanunary_popcnt1(bitmap.data(),N,outputpopcnt1.data()); assert(c1 == c0); int tipopcnt1 = timer.split(); timer.reset(); int c12 = 0; for(int t1=0; t1<repeat; ++t1) c12 = bitscanunary_popcnt2(bitmap.data(),N,outputpopcnt2.data()); assert(c12 == c0); int tipopcnt2 = timer.split(); timer.reset(); int c2 = 0; for(int t1=0; t1<repeat; ++t1) c2 = bitscanunary_table(bitmap.data(),N,outputtable.data()); assert(c2 == c0); int titable = timer.split(); timer.reset(); int c3 = 0; for(int t1=0; t1<repeat; ++t1) c3 = bitscanunary_ctzl1(bitmap.data(),N,outputctz1.data()); assert(c3 == c0); int tictz1 = timer.split(); timer.reset(); int c32 = 0; for(int t1=0; t1<repeat; ++t1) c32 = bitscanunary_ctzl2(bitmap.data(),N,outputctz2.data()); assert(c32 == c0); int tictz2 = timer.split(); assert 
(outputnaive == outputpopcnt1); assert (outputnaive == outputpopcnt2); assert (outputnaive == outputtable); assert (outputnaive == outputctz1); assert (outputnaive == outputctz2); cout << bitsperinteger<<" " ; cout << bitcount * repeat * 0.001 /tinaive <<" "; cout << bitcount * repeat * 0.001 /tipopcnt1 <<" "; cout << bitcount * repeat * 0.001 /tipopcnt2 <<" "; cout << bitcount * repeat * 0.001 /titable <<" "; cout << bitcount * repeat * 0.001 /tictz1 <<" "; cout << bitcount * repeat * 0.001 /tictz2 <<" "; cout << endl ; } return 0; }
void simplebenchmark(uint32_t N = 1U << 16, uint32_t T = 1U << 9) { T = T + 1; // we have a warming up pass vector<uint32_t, cacheallocator> data = generateArray32(N); vector<uint32_t, cacheallocator> compressed(N, 0); vector<uint32_t, cacheallocator> recovered(N, 0); WallClockTimer z; double packtime, packtimewm, unpacktime; double simdpacktime, simdpacktimewm, simdunpacktime; double horizontalunpacktime; cout << "#million of integers per second: higher is better" << endl; cout << "#bit, pack, pack without mask, unpack" << endl; for (uint32_t bitindex = 0; bitindex < 32; ++bitindex) { uint32_t bit = 32 - bitindex; maskfnc(data, bit); for (uint32_t repeat = 0; repeat < 1; ++repeat) { packtime = 0; packtimewm = 0; unpacktime = 0; simdpacktime = 0; simdpacktimewm = 0; simdunpacktime = 0; horizontalunpacktime = 0; for (uint32_t t = 0; t < T; ++t) { compressed.clear(); compressed.resize(N * bit / 32, 0); recovered.clear(); recovered.resize(N, 0); simdpack(data, compressed, bit); simdunpack(compressed, recovered, bit); if (!equalOnFirstBits(data, recovered, bit)) { cout << " Bugs!" << bit << endl; return; } z.reset(); simdpack(data, compressed, bit); if (t > 0) simdpacktime += z.split(); simdunpack(compressed, recovered, bit); if (!equalOnFirstBits(data, recovered, bit)) { cout << " Bugs!" << bit << endl; return; } z.reset(); simdpackwithoutmask(data, compressed, bit); if (t > 0) simdpacktimewm += z.split(); z.reset(); simdunpack(compressed, recovered, bit); if (t > 0) simdunpacktime += z.split(); if (!equalOnFirstBits(data, recovered, bit)) { cout << " Bugs!" << bit << endl; return; } z.reset(); fastpack(data, compressed, bit); if (t > 0) packtime += z.split(); fastunpack(compressed, recovered, bit); if (!equalOnFirstBits(data, recovered, bit)) { cout << " Bug1!" 
<< endl; return; } z.reset(); fastpackwithoutmask(data, compressed, bit); if (t > 0) packtimewm += z.split(); z.reset(); fastunpack(compressed, recovered, bit); if (t > 0) unpacktime += z.split(); if (!equalOnFirstBits(data, recovered, bit)) { cout << " Bug1!" << endl; return; } z.reset(); horizontalunpack(compressed, recovered, bit); if (t > 0) horizontalunpacktime += z.split(); if (!equalOnFirstBits(data, recovered, bit)) { cout << " Bug1!" << endl; return; } } cout << std::setprecision(4) << bit << "\t\t" << N * (T - 1) / (packtime) << "\t\t" << N * (T - 1) / (packtimewm) << "\t\t\t" << N * (T - 1) / (unpacktime) << "\t\t"; cout << std::setprecision(4) << bit << "\t\t" << N * (T - 1) / (simdpacktime) << "\t\t" << N * (T - 1) / (simdpacktimewm) << "\t\t" << N * (T - 1) / (simdunpacktime) << "\t\t"; cout<< N * (T - 1) / (horizontalunpacktime) << "\t\t"; cout << endl; } } }
void simplebenchmark(uint32_t N = 1U << 16, uint32_t T = 1U << 9) { T = T + 1; // we have a warming up pass uint32_t bogus = 0; vector<uint32_t> data(N); vector<uint32_t> compressed(N); vector<uint32_t> icompressed(N); vector<uint32_t> recovered(N); WallClockTimer z; double unpacktime; double iunpacktime; cout << "#million of integers per second: higher is better" << endl; cout << "#bit, unpack,iunpack" << endl; for (uint32_t bitindex = 0; bitindex < 32; ++bitindex) { uint32_t bit = bitindex + 1; vector<uint32_t> initdata(N); for (size_t i = 0; 4 * i < data.size(); i += 4) { initdata[i] = random(bit) + (i >= 4 ? initdata[i - 4] : 0); for (size_t j = 1; j < 4; ++j) { initdata[i + j] = initdata[i]; } } const vector<uint32_t> refdata = initdata; vector<uint32_t>().swap(initdata); icompressed.clear(); // 4 * N should be enough for all schemes icompressed.resize(4 * N, 0); compressed.clear(); // 4 * N should be enough for all schemes compressed.resize(4 * N, 0); recovered.clear(); recovered.resize(N, 0); if (needPaddingTo128Bits(recovered.data())) { throw logic_error("Array is not aligned on 128 bit boundary!"); } if (needPaddingTo128Bits(icompressed.data())) { throw logic_error("Array is not aligned on 128 bit boundary!"); } if (needPaddingTo128Bits(compressed.data())) { throw logic_error("Array is not aligned on 128 bit boundary!"); } if (needPaddingTo128Bits(refdata.data())) { throw logic_error("Array is not aligned on 128 bit boundary!"); } for (uint32_t repeat = 0; repeat < 1; ++repeat) { unpacktime = 0; iunpacktime = 0; for (uint32_t t = 0; t <= T; ++t) { assert(data.size() == refdata.size()); fill(icompressed.begin(), icompressed.end(), 0); fill(recovered.begin(), recovered.end(), 0); memcpy(data.data(), refdata.data(), data.size() * sizeof(uint32_t)); // memcpy can be slow Helper::pack(data.data(), data.size(), icompressed.data(), bit); z.reset(); Helper::unpack(icompressed.data(), refdata.size(), recovered.data(), bit); if (t > 0) // we don't count the first 
run unpacktime += static_cast<double>(z.split()); if (!equalOnFirstBits(refdata, recovered, bit)) { cout << " Bug 1a " << bit << endl; return; } memcpy(data.data(), refdata.data(), data.size() * sizeof(uint32_t)); // memcpy can be slow Helper::pack(data.data(), data.size(), icompressed.data(), bit); z.reset(); Helper::iunpack(icompressed.data(), refdata.size(), recovered.data(), bit); if (t > 0) // we don't count the first run iunpacktime += static_cast<double>(z.split()); if (!equalOnFirstBits(refdata, recovered, bit)) { cout << " Bug 2 " << bit << endl; return; } } cout << std::setprecision(4) << bit << "\t\t"; cout << "\t\t" << N * (T - 1) / (unpacktime) << "\t\t"; cout << "\t\t" << N * (T - 1) / (iunpacktime); cout << endl; } } cout << "# ignore this " << bogus << endl; }