void testPow(int N, int rep) { cout << "================================ " << endl; vector<T> data1(N*4); vector<T> data2(N*4); for (int i = 0; i < 4*N; ++i) { data1[i] = 1 + (rand() % 10000) / 10.0; data2[i] = 1 + (rand() % 10000) / 10.0; } WallClockTimer timer; T sum = 0; for (int j = 0; j < rep; ++j) { for (int i = 0; i < N*4; i+=4) { sum += pow(data1[i], data2[i]); sum += pow(data1[i+1], data2[i+1]); sum += pow(data1[i+2], data2[i+2]); sum += pow(data1[i+3], data2[i+3]); } } timer.split(); uint64_t t = timer.elapsed(); uint64_t TotalQty = rep * N * 4; cout << "Ignore: " << sum << endl; cout << "Pows computed: " << TotalQty << ", time " << t / 1e3 << " ms, type: " << typeid(T).name() << endl; cout << "Milllions of Pows per sec: " << (float(TotalQty) / t) << endl; }
void testAtan(int N, int rep) { vector<T> data(N*4); for (int i = 0; i < 4*N; ++i) { data[i] = 1 + (rand() % 10000) / 1000.0; } WallClockTimer timer; T sum = 0; for (int j = 0; j < rep; ++j) { for (int i = 0; i < N*4; i+=4) { sum += atan(data[i]); sum += atan(data[i+1]); sum += atan(data[i+2]); sum += atan(data[i+3]); } sum /= N*4; } timer.split(); uint64_t t = timer.elapsed(); uint64_t TotalQty = rep * N * 4; cout << "Ignore: " << sum << endl; cout << "Atans computed: " << TotalQty << ", time " << t / 1e3 << " ms, type: " << typeid(T).name() << endl; cout << "Milllions of Atans per sec: " << (float(TotalQty) / t) << endl; }
void test3(int N, int rep) { WallClockTimer timer; uint64_t total = 0; uint64_t sum = 0; string emptyStr; stringstream str; for (int j = 0; j < rep; ++j) { timer.reset(); for (int i = 0; i < N; i++) { str.str(emptyStr); str << i << " " << j; sum += reinterpret_cast<size_t>(str.str().c_str()); } total += timer.split(); } cout << "Ignore: " << sum << endl; cout << " total # of proc without construct/deconstruct: " << rep * N << ", time " << total / 1e3 << " ms" << " proc per sec: " << (rep * N * 1e6 / total ) << endl; }
void testRoot(int N, size_t MaxRoot, int rep) { vector<T> data(N*4); for (int i = 0; i < 4*N; ++i) { data[i] = 1 + (rand() % (10 * MaxRoot)) / 10.0; } WallClockTimer timer; T sum = 0; for (int j = 0; j < rep; ++j) { for (int i = 0; i < N*4; i+=4) { sum += sqrt(data[i]); sum += sqrt(data[i+1]); sum += sqrt(data[i+2]); sum += sqrt(data[i+3]); } } timer.split(); uint64_t t = timer.elapsed(); uint64_t TotalQty = uint64_t(rep) * N * 4LL; cout << "Ignore: " << sum << endl; cout << "max root val.: " << MaxRoot << " Roots computed: " << TotalQty << ", time " << t / 1e3 << " ms, type: " << typeid(T).name() << endl; cout << "Milllions of Roots per sec: " << (float(TotalQty) / t) << endl; }
void testIntPowExplicitTemplate(int IntExp, int N, int rep) { cout << "================================ " << endl; vector<T> data(N*4); for (int i = 0; i < 4*N; ++i) { data[i] = 1 + (rand() % 10000) / 1000.0; } WallClockTimer timer; T sum = 0; for (int j = 0; j < rep; ++j) { for (int i = 0; i < N*4; i+=4) { sum += pow(data[i], (unsigned)IntExp); sum += pow(data[i+1], (unsigned)IntExp); sum += pow(data[i+2], (unsigned)IntExp); sum += pow(data[i+3], (unsigned)IntExp); } sum /= N*4; } timer.split(); uint64_t t = timer.elapsed(); uint64_t TotalQty = rep * N * 4; cout << "Ignore: " << sum << endl; cout << "Pows (expl arguments) computed, degree: " << IntExp << " TotalQty: " << TotalQty << ", time " << t / 1e3 << " ms, type: " << typeid(T).name() << endl; cout << "Milllions of integer Pows (expl arguments) per sec: " << (float(TotalQty) / t) << endl; }
void testEfficientFractPow(int N, int rep, unsigned FuncNumDig, unsigned DataNumDig, bool bRootOnly) { cout << "================================ " << endl; vector<T> data1(N*4); vector<T> data2(N*4); uint64_t MaxK = uint64_t(1)<<FuncNumDig; uint64_t DataMaxK = uint64_t(1)<<DataNumDig; for (int i = 0; i < 4*N; ++i) { data1[i] = 1 + (rand() % 10000) / 10.0; data2[i] = bRootOnly ? T(1) / T(DataMaxK):(rand() % MaxK) / T(DataMaxK); } WallClockTimer timer; T sum = 0; T fract = T(1)/N; for (int j = 0; j < rep; ++j) { for (int i = 0; i < N*4; i+=4) { sum += 0.01 * EfficientFractPow(data1[i], data2[i], FuncNumDig); sum += 0.01 * EfficientFractPow(data1[i+1], data2[i+1], FuncNumDig); sum += 0.01 * EfficientFractPow(data1[i+2], data2[i+2], FuncNumDig); sum += 0.01 * EfficientFractPow(data1[i+3], data2[i+3], FuncNumDig); } sum *= fract; } timer.split(); uint64_t t = timer.elapsed(); uint64_t TotalQty = rep * N * 4; cout << "Ignore: " << sum << endl; cout << "Pows computed: " << TotalQty << ", time " << t / 1e3 << " ms, type: " << typeid(T).name() << endl; cout << "Milllions of efficient fract Pows (bRootOnly = " << bRootOnly << " per sec: " << (float(TotalQty) / t) << " FuncNumDig = " << FuncNumDig << " DataNumDig = " << DataNumDig << endl; }
void testIntPowOptim2(int IntExp, int N, int rep) { cout << "================================ " << endl; vector<T> data(N*4); for (int i = 0; i < 4*N; ++i) { data[i] = 1 + (rand() % 10000) / 1000.0; } WallClockTimer timer; T sum = 0; for (int j = 0; j < rep; ++j) { for (int i = 0; i < N*4; i+=4) { sum += PowOptimPosExp2(data[i], IntExp); sum += PowOptimPosExp2(data[i+1], IntExp); sum += PowOptimPosExp2(data[i+2], IntExp); sum += PowOptimPosExp2(data[i+3], IntExp); } } timer.split(); uint64_t t = timer.elapsed(); uint64_t TotalQty = rep * N * 4; cout << "Ignore: " << sum << endl; cout << "Pows (optimized2) computed, degree: " << IntExp << " TotalQty: " << TotalQty << ", time " << t / 1e3 << " ms, type: " << typeid(T).name() << endl; cout << "Milllions of integer (optimized2) Pows per sec: " << (float(TotalQty) / t) << endl; }
void test(size_t N ) { WallClockTimer time; for(int t = 0; t<2;++t) { cout <<" test # "<< t<<endl; vector<short> data = givemeanarray(N) ; vector<short> copydata(data); time.reset(); straightsum(&data[0],N); cout<<"straight sum (C-like) "<<N/(1000.0*time.split())<<endl; time.reset(); slowishSum(data); cout<<"basic sum (C++-like) "<<N/(1000.0*time.split())<<endl; data = copydata; time.reset(); sum(data); cout<<"smarter sum "<<N/(1000.0*time.split())<<endl; data = copydata; time.reset(); fastSum(data); cout<<"fast sum "<<N/(1000.0*time.split())<<endl; cout<<endl<<endl<<endl; } }
int overall(size_t N) { int bogus = 0; WallClockTimer t; t.reset(); bogus += testSTL(N); int delay = t.split(); cout << "STL vector " << N /(delay * 1000.0) << endl; vector<double> idelays; for(size_t T = 0 ; T < 20 ; ++T ) { t.reset(); bogus += straight(N); int tdelay = t.split(); idelays.push_back(tdelay); } cout << "static array : " << N /(median(idelays) * 1000.0) << endl; for(size_t factor = 1; factor <= 6; ++ factor) { vector<double> delays; for(size_t T = 0 ; T < 20 ; ++T ) { t.reset(); bogus += testManual(N,2+factor,2); int tdelay = t.split(); delays.push_back(tdelay); } cout << "pointer-based "<< (factor +2)/2.0<< " : " << N /(median(delays) * 1000.0) << endl; } return bogus; }
int test(const size_t N) { int * a = new int[N]; for(size_t k = 0; k< N; ++k) a[k] = k - 2 + k * k; int fakecounter = 0; cout<<" Buffer size = "<< N*sizeof(int) /(1024.0*1024.0)<<" MB "<<endl; WallClockTimer t; double besttime1 = numeric_limits<double>::max(); double besttime2 = numeric_limits<double>::max(); double besttime3 = numeric_limits<double>::max(); for(int k = 0; k<20;++k) { t.reset(); fakecounter += totalsum(a,N); double thistime1 = t.split(); if(thistime1 < besttime1) besttime1 = thistime1; t.reset(); fakecounter += sum<2>(a,N); double thistime2 = t.split(); if(thistime2 < besttime2) besttime2 = thistime2; t.reset(); fakecounter += sum<16>(a,N); double thistime3 = t.split(); if(thistime3 < besttime3) besttime3 = thistime3; } cout<<" total sum speed = "<<N/(1000*1000*besttime1) <<" mis or "<< N*sizeof(int)/(1024.0*1024.0*besttime1)<<" MB/s"<<endl; cout<<" partial sum speed = "<<N/(1000*1000*besttime2) <<" mis or "<< N*sizeof(int)/(1024.0*1024.0*besttime2)<<" MB/s"<<endl; cout<<" speed ratio = "<< besttime1 /besttime2<<endl; cout<<" partial sum speed = "<<N/(1000*1000*besttime3) <<" mis or "<< N*sizeof(int)/(1024.0*1024.0*besttime3)<<" MB/s"<<endl; cout<<" speed ratio = "<< besttime1 /besttime3<<endl; return fakecounter; }
void testPackUnpackC(size_t N = 2048 * 32 * 2048) { WallClockTimer timer; bool* data = new bool[N]; for(size_t i = 0; i<N; ++i) data[i] = static_cast<bool>(i & 1); vector<char> comp(N/8); for(size_t t = 0; t< 3; ++t) { timer.reset(); pack(data, &comp[0], N); cout<<" pack time = "<<timer.split()<<endl; timer.reset(); unpack(&comp[0], data, N); cout<<" unpack time = "<<timer.split()<<endl; for(size_t i = 0; i<N; ++i) assert(data[i] == static_cast<bool>(i & 1)); } delete[] data; }
int testStoreLoadC(size_t M = 2048 * 4, size_t N = 2048 * 8, size_t repeat = 1) { WallClockTimer timer; vector<int> data; int bogus; for(size_t i = 0; i<M; ++i) data.push_back(i); vector<int> bigdata; bigdata.resize(M * N); for(size_t t = 0; t< 3; ++t) { timer.reset(); for (size_t r = 0; r < repeat; ++r) bogus += storeTestC(&data[0],&bigdata[0],N,M); if(t>0) cout<<" store time = "<<timer.split()<<endl; timer.reset(); for (size_t r = 0; r < repeat; ++r) bogus += loadTestC(&data[0],&bigdata[0],N,M); if(t>0) cout<<" load time = "<<timer.split()<<endl; for(int i = 0; i<M; ++i) assert(data[i] == i); } return bogus; }
void test(size_t N ) { cout << "min distance between ints is "<<mindist<<endl; WallClockTimer time; for(int t = 0; t<2;++t) { cout <<" test # "<< t<<endl; vector<int> data = givemeanarray(N) ; vector<int> copydata(data); time.reset(); cdelta<mindist>(&data[0],data.size()); cout<<"c delta speed "<<N/(1000.0*time.split())<<endl; time.reset(); cinverseDelta<mindist>(&data[0],data.size()); cout<<"c inverse delta speed "<<N/(1000.0*time.split())<<endl; if(data != copydata) throw runtime_error("bug!"); cout<<endl; time.reset(); delta<mindist>(data); cout<<"delta speed "<<N/(1000.0*time.split())<<endl; time.reset(); inverseDelta<mindist>(data); cout<<"inverse delta speed "<<N/(1000.0*time.split())<<endl; if(data != copydata) throw runtime_error("bug!"); cout<<endl; delta<mindist>(data); time.reset(); slowishinverseDelta<mindist>(data); cout<<"slowish inverse delta speed "<<N/(1000.0*time.split())<<endl; if(data != copydata) throw runtime_error("bug!"); cout<<endl; delta<mindist>(data); time.reset(); bufferedinverseDelta<mindist>(data); cout<<"buffered inverse delta speed "<<N/(1000.0*time.split())<<endl; if(data != copydata) throw runtime_error("bug!"); cout<<endl; delta<mindist>(data); time.reset(); inverseDeltaVolkov<mindist>(data); cout<<"inverse delta speed (volkov-lemire) "<<N/(1000.0*time.split())<<endl; if(data != copydata) throw runtime_error("bug!"); cout<<endl; cout<<endl<<endl<<endl; } }
int main(int argc, char * argv[]) { std::string usage = EXECUTABLE " in LTP " LTP_VERSION " - " LTP_COPYRIGHT "\n"; usage += DESCRIPTION "\n\n"; usage += "usage: ./" EXECUTABLE " <options>\n\n"; usage += "options"; options_description optparser = options_description(usage); optparser.add_options() ("threads", value<int>(), "The number of threads [default=1].") ("input", value<std::string>(), "The path to the input file. " "Input data should contain one sentence each line. " "Words should be separated by space with POS tag appended by " "'_' (e.g. \"w1_p1 w2_p2 w3_p3 w4_p4\").") ("ner-model", value<std::string>(), "The path to the postag model [default=ltp_data/ner.model].") ("help,h", "Show help information"); if (argc == 1) { std::cerr << optparser << std::endl; return 1; } variables_map vm; store(parse_command_line(argc, argv, optparser), vm); if (vm.count("help")) { std::cerr << optparser << std::endl; return 0; } int threads = 1; if (vm.count("threads")) { threads = vm["threads"].as<int>(); if (threads < 0) { std::cerr << "number of threads should not less than 0, reset to 1." << std::endl; threads = 1; } } std::string input = ""; if (vm.count("input")) { input = vm["input"].as<std::string>(); } std::string ner_model = "ltp_data/ner.model"; if (vm.count("ner-model")) { ner_model = vm["ner-model"].as<std::string>(); } void *engine = ner_create_recognizer(ner_model.c_str()); if (!engine) { return 1; } std::cerr << "TRACE: Model is loaded" << std::endl; std::cerr << "TRACE: Running " << threads << " thread(s)" << std::endl; std::ifstream ifs(input.c_str()); std::istream* is = NULL; if (!ifs.good()) { std::cerr << "WARN: Cann't open file! use stdin instead." << std::endl; is = (&std::cin); } else { is = (&ifs); } Dispatcher * dispatcher = new Dispatcher( engine, (*is), std::cout ); WallClockTimer t; std::list<tthread::thread *> thread_list; for (int i = 0; i < threads; ++ i) { tthread::thread * t = new tthread::thread( multithreaded_recognize, (void *)dispatcher ); thread_list.push_back( t ); } for (std::list<tthread::thread *>::iterator i = thread_list.begin(); i != thread_list.end(); ++ i) { tthread::thread * t = *i; t->join(); delete t; } std::cerr << "TRACE: consume " << t.elapsed() << " seconds." << std::endl; delete dispatcher; ner_release_recognizer(engine); return 0; }
int main() { assert(sizeof(long)==8); assert(sizeof(int)==4); WallClockTimer timer; int repeat = 100; int N = 10000; cout<<"# We report bits-per-integer speed-of-naive speed-of-popcnt1 speed-of-popcnt2 speed-of-table speed-of-tzcnt1 speed-of-tzcnt2 where speeds are in millions of integers per second "<<endl; for(int sb = 1; sb<=64; sb*=2) { int setbitsmax = sb*N; vector<long> bitmap(N); for (int k = 0; k < setbitsmax; ++k) { int bit = rand() % (N*64); bitmap[bit/64] |= (1L<<(bit%64)); } int bitcount = 0; for(int k = 0; k <N; ++k) { bitcount += __builtin_popcountl(bitmap[k]); } double bitsperinteger = N*sizeof(long)*8.0/bitcount; vector<int> outputnaive(bitcount); vector<int> outputpopcnt1(bitcount); vector<int> outputpopcnt2(bitcount); vector<int> outputtable(bitcount); vector<int> outputctz1(bitcount); vector<int> outputctz2(bitcount); cout<<"# Stored "<<bitcount<<" unary numbers in "; cout<< N*sizeof(long)<<" bytes " ; cout<<" ("<<bitsperinteger<<" bits per number)"<<endl; timer.reset(); int c0 = 0; for(int t1=0; t1<repeat; ++t1) c0 = bitscanunary_naive(bitmap.data(),N,outputnaive.data()); int tinaive = timer.split(); timer.reset(); int c1 = 0; for(int t1=0; t1<repeat; ++t1) c1 = bitscanunary_popcnt1(bitmap.data(),N,outputpopcnt1.data()); assert(c1 == c0); int tipopcnt1 = timer.split(); timer.reset(); int c12 = 0; for(int t1=0; t1<repeat; ++t1) c12 = bitscanunary_popcnt2(bitmap.data(),N,outputpopcnt2.data()); assert(c12 == c0); int tipopcnt2 = timer.split(); timer.reset(); int c2 = 0; for(int t1=0; t1<repeat; ++t1) c2 = bitscanunary_table(bitmap.data(),N,outputtable.data()); assert(c2 == c0); int titable = timer.split(); timer.reset(); int c3 = 0; for(int t1=0; t1<repeat; ++t1) c3 = bitscanunary_ctzl1(bitmap.data(),N,outputctz1.data()); assert(c3 == c0); int tictz1 = timer.split(); timer.reset(); int c32 = 0; for(int t1=0; t1<repeat; ++t1) c32 = bitscanunary_ctzl2(bitmap.data(),N,outputctz2.data()); assert(c32 == c0); int tictz2 = timer.split(); assert (outputnaive == outputpopcnt1); assert (outputnaive == outputpopcnt2); assert (outputnaive == outputtable); assert (outputnaive == outputctz1); assert (outputnaive == outputctz2); cout << bitsperinteger<<" " ; cout << bitcount * repeat * 0.001 /tinaive <<" "; cout << bitcount * repeat * 0.001 /tipopcnt1 <<" "; cout << bitcount * repeat * 0.001 /tipopcnt2 <<" "; cout << bitcount * repeat * 0.001 /titable <<" "; cout << bitcount * repeat * 0.001 /tictz1 <<" "; cout << bitcount * repeat * 0.001 /tictz2 <<" "; cout << endl ; } return 0; }
void CurlStreamFile::fillCache(std::streampos size){ #if 1 assert(size >= 0); if(! _running || _cached >=size){ return ; } fd_set readfd, writefd, exceptfd; int maxfd; CURLMcode mcode; timeval tv; //hard-coded slect timeout //this number is kept low to give more thread switch //opportunities while waitting for a load const long maxSleepUsec = 10000; //1/100 of a second const unsigned int userTimeout = 60000; WallClockTimer lastProgress; while(_running){ fillCacheNonBlocking(); if(_cached>=size || !_running) break; FD_ZERO(&readfd); FD_ZERO(&writefd); FD_ZERO(&exceptfd); mcode = curl_multi_fdset(_mCurlHandle, &readfd, &writefd, &exceptfd, &maxfd); if(mcode != CURLM_OK){ throw SnailException(curl_multi_strerror(mcode)); } if(maxfd<0){ //as of libcurl 7.21.x, the DNS resolving appears to be //going on in the background, so curl_multi_fdset fails to //return anything useful, So we use the user timeout value //to give DNS enough time to resolve the lookup if(userTimeout && lastProgress.elapsed()>userTimeout){ return ; }else{ continue; } }//if(maxfd<0) tv.tv_sec = 0; tv.tv_usec = maxSleepUsec; //wait for data on the filedescriptors until a timeout set in rc file int ret = select(maxfd+1, &readfd, &writefd, &exceptfd, &tv); #if !defined(WIN32) if(ret == -1){ if(errno == EINTR){ cout<<"select() was interrupted by a singal"<<endl; ret = 0; }else{ std::ostringstream os; os<<"error polling data from connection to"<<_url<<":"<<strerror(errno); throw SnailException(os.str()); } } #endif if(!ret){ //timeout check the clock to see //if we expired the user timeout if(userTimeout && lastProgress.elapsed() > userTimeout){ cout<<"timeout ("<<userTimeout<<") while loading from URL"<<_url<<endl; return ; } }else{ lastProgress.restart(); } }//while(.... processMessages(); #endif }
int main(int argc, char **argv) { size_t howmany = 0; size_t loop = 3; bool uniform = false; uint32_t Big = 22; float intersectionratio = 0.3f; uint32_t MaxBit = 26; int c; while ((c = getopt(argc, argv, "uns:m:R:M:S:l:h")) != -1) switch (c) { case 'h': printusage(); return 0; case 'S': Big = atoi(optarg); break; case 'R': intersectionratio = atof(optarg); break; case 'M': MaxBit = atoi(optarg); if (MaxBit < 1) { printusage(); return -1; } break; case 'm': howmany = atoi(optarg); if (howmany < 1) { printusage(); return -1; } break; case 'l': loop = atoi(optarg); if (loop < 1) { printusage(); return -1; } break; case 'u': uniform = true; break; default: abort(); } if (howmany == 0) { howmany = 5; } cout << "# howmany : " << howmany << endl; cout << "# loop : " << loop << endl; cout << "# distribution : " << (uniform ? "uniform" : "clustered") << endl; cout << "# Big : " << Big << endl; cout << "# intersectionratio : " << intersectionratio << endl; cout << "# MaxBit : " << MaxBit << endl; UniformDataGenerator udg; ClusteredDataGenerator cdg; WallClockTimer z; size_t bogus = 0; vector<uint32_t> buffer(2 * (1U << Big)); #ifdef LIKWID_MARKERS char currentMarker[64]; likwid_markerInit(); #endif cout << "# size-ratio\t"; for (string intername : IntersectionFactory::allNames()) { cout << intername << "\t"; } cout << " partioned (Schlegel et al.: improved, original) 16-bitV1 " "16-bitscalar "; cout << "relative-intersection-size " << endl; for (float ir = 1.001; ir <= 10000; ir = ir * sqrt(1.9)) { vector<pair<vector<uint32_t>, vector<uint32_t>>> data(howmany); uint32_t smallsize = static_cast<uint32_t>(round(static_cast<float>(1 << Big) / ir)); cout << "#generating data..."; cout.flush(); for (size_t k = 0; k < howmany; ++k) { data[k] = uniform ? getNaivePair(udg, smallsize, 1U << MaxBit, ir, intersectionratio) : getNaivePair(cdg, smallsize, 1U << MaxBit, ir, intersectionratio); } cout << "ok." << endl; cout << "#partitions..."; vector<pair<vector<uint16_t>, vector<uint16_t>>> datapart(howmany); for (size_t k = 0; k < howmany; ++k) { vector<uint16_t> part1(data[k].first.size() * 4); size_t p1length = partition(data[k].first.data(), data[k].first.size(), part1.data(), part1.size()); part1.resize(p1length); part1.shrink_to_fit(); vector<uint16_t> part2(data[k].second.size() * 4); size_t p2length = partition(data[k].second.data(), data[k].second.size(), part2.data(), part2.size()); part2.resize(p2length); part2.shrink_to_fit(); datapart[k] = make_pair(part1, part2); } cout << "ok." << endl; cout << ir << "\t"; float aratio = 0.0f; for (string intername : IntersectionFactory::allNames()) { intersectionfunction interfnc = IntersectionFactory::getFromName(intername); size_t volume = 0; #ifdef LIKWID_MARKERS snprintf(currentMarker, sizeof(currentMarker), "%s %.2f", intername.c_str(), ir); likwid_markerStartRegion(currentMarker); #endif z.reset(); for (size_t k = 0; k < data.size(); ++k) { volume += (data[k].first.size() + data[k].second.size()) * loop; for (size_t L = 0; L < loop; ++L) { aratio = interfnc(data[k].first.data(), (data[k].first).size(), data[k].second.data(), (data[k].second).size(), buffer.data()); bogus += aratio; } } cout << setw(10) << setprecision(5) << (volume / (static_cast<double>(z.split()))) << "\t"; #ifdef LIKWID_MARKERS likwid_markerStopRegion(currentMarker); #endif } z.reset(); size_t volume = 0; for (size_t k = 0; k < data.size(); ++k) { volume += (data[k].first.size() + data[k].second.size()) * loop; for (size_t L = 0; L < loop; ++L) { aratio = intersect_partitioned( datapart[k].first.data(), (datapart[k].first).size(), datapart[k].second.data(), (datapart[k].second).size(), (uint16_t *)buffer.data()); bogus += aratio; } } cout << setw(10) << setprecision(5) << (volume / (static_cast<double>(z.split()))) << "\t"; z.reset(); volume = 0; for (size_t k = 0; k < data.size(); ++k) { volume += (data[k].first.size() + data[k].second.size()) * loop; for (size_t L = 0; L < loop; ++L) { aratio = original_intersect_partitioned( datapart[k].first.data(), (datapart[k].first).size(), datapart[k].second.data(), (datapart[k].second).size(), (uint16_t *)buffer.data()); bogus += aratio; } } cout << setw(10) << setprecision(5) << (volume / (static_cast<double>(z.split()))) << "\t"; z.reset(); volume = 0; for (size_t k = 0; k < data.size(); ++k) { volume += (data[k].first.size() + data[k].second.size()) * loop; for (size_t L = 0; L < loop; ++L) { aratio = intersect_partitionedV1( datapart[k].first.data(), (datapart[k].first).size(), datapart[k].second.data(), (datapart[k].second).size(), (uint16_t *)buffer.data()); bogus += aratio; } } cout << setw(10) << setprecision(5) << (volume / (static_cast<double>(z.split()))) << "\t"; z.reset(); volume = 0; for (size_t k = 0; k < data.size(); ++k) { volume += (data[k].first.size() + data[k].second.size()) * loop; for (size_t L = 0; L < loop; ++L) { aratio = intersect_partitionedscalar( datapart[k].first.data(), (datapart[k].first).size(), datapart[k].second.data(), (datapart[k].second).size(), (uint16_t *)buffer.data()); bogus += aratio; } } cout << setw(10) << setprecision(5) << (volume / (static_cast<double>(z.split()))) << "\t"; cout << "\t\t" << aratio / smallsize; cout << endl; } #ifdef LIKWID_MARKERS likwid_markerClose(); #endif cout << "# bogus = " << bogus << endl; }
void test(size_t N ) { WallClockTimer time; for(int t = 0; t<2;++t) { cout <<" test # "<< t<<endl; vector<int> data = givemeanarray(N) ; { vector<int> copydata(data); cout << "min distance between ints is "<<mindist<<endl; time.reset(); delta<mindist>(data); cout<<"delta speed "<<N/(1000.0*time.split())<<endl; time.reset(); slowishinverseDelta1<mindist>(data); cout<<"Slowish(1) inverse delta speed "<<N/(1000.0*time.split())<<endl; if(data != copydata) throw runtime_error("bug!"); cout<<endl; } { vector<int> copydata(data); cout << "min distance between ints is "<<mindist<<endl; time.reset(); delta<mindist>(data); cout<<"delta speed "<<N/(1000.0*time.split())<<endl; time.reset(); slowishinverseDelta2<mindist>(data); cout<<"Slowish(2) inverse delta speed "<<N/(1000.0*time.split())<<endl; if(data != copydata) throw runtime_error("bug!"); cout<<endl; } { vector<int> copydata(data); cout << "min distance between ints is "<<mindist<<endl; time.reset(); delta<mindist>(data); cout<<"delta speed "<<N/(1000.0*time.split())<<endl; time.reset(); inverseDelta<mindist>(data); cout<<"Unroll2 inverse delta speed "<<N/(1000.0*time.split())<<endl; if(data != copydata) throw runtime_error("bug!"); cout<<endl; } { vector<int> copydata(data); cout << "min distance between ints is "<<mindist<<endl; time.reset(); delta<mindist>(data); cout<<"delta speed "<<N/(1000.0*time.split())<<endl; time.reset(); inverseDeltaMem<mindist>(data); cout<<"Unroll2 (mem) inverse delta speed "<<N/(1000.0*time.split())<<endl; if(data != copydata) throw runtime_error("bug!"); cout<<endl; } { vector<int> copydata(data); cout << "min distance between ints is "<<mindist<<endl; time.reset(); delta<mindist>(data); cout<<"delta speed "<<N/(1000.0*time.split())<<endl; time.reset(); inverseDeltaMy1<mindist>(data); cout<<"My1 inverse delta speed "<<N/(1000.0*time.split())<<endl; if(data != copydata) throw runtime_error("bug!"); cout<<endl; } #if 0 { vector<int> copydata(data); cout << "min distance between ints is "<<mindist<<endl; time.reset(); delta<mindist>(data); cout<<"delta speed "<<N/(1000.0*time.split())<<endl; time.reset(); inverseDeltaMy2<mindist>(data); cout<<"My2 inverse delta speed "<<N/(1000.0*time.split())<<endl; if(data != copydata) throw runtime_error("bug!"); cout<<endl; } { vector<int> copydata(data); cout << "min distance between ints is "<<mindist<<endl; time.reset(); delta<mindist>(data); cout<<"delta speed "<<N/(1000.0*time.split())<<endl; time.reset(); inverseDeltaMy3<mindist>(data); cout<<"My3 inverse delta speed "<<N/(1000.0*time.split())<<endl; if(data != copydata) throw runtime_error("bug!"); cout<<endl; } #endif { int* pCopyData = (int*)memalign(16, data.size() * sizeof data[0]); if (!pCopyData) { throw runtime_error("Not enough memory"); } memcpy(pCopyData, &data[0], data.size() * sizeof data[0]); cout << "min distance between ints is "<<mindist<<endl; time.reset(); deltaForSIMD<mindist>(pCopyData, data.size()); cout<<"for SIMD delta speed "<<N/(1000.0*time.split())<<endl; time.reset(); inverseDeltaSIMD<mindist>(pCopyData, data.size()); cout<<"SIMD inverse delta speed "<<N/(1000.0*time.split())<<endl; for (size_t i = 0; i < data.size(); ++i) { if (data[i] != pCopyData[i]) { cerr << "Elem index: " << i << " orig: " << data[i] << " obtained: " << pCopyData[i] << endl; throw runtime_error("bug"); } } free(pCopyData); cout<<endl; } { int* pCopyData = (int*)memalign(16, data.size() * sizeof data[0]); if (!pCopyData) { throw runtime_error("Not enough memory"); } memcpy(pCopyData, &data[0], data.size() * sizeof data[0]); cout << "min distance between ints is "<<mindist<<endl; time.reset(); deltaForSIMD<mindist>(pCopyData, data.size()); cout<<"for SIMD delta speed "<<N/(1000.0*time.split())<<endl; time.reset(); inverseDeltaSIMDUnrolled<mindist>(pCopyData, data.size()); cout<<"SIMD inverse UNROLLED delta speed "<<N/(1000.0*time.split())<<endl; for (size_t i = 0; i < data.size(); ++i) { if (data[i] != pCopyData[i]) { cerr << "Elem index: " << i << " orig: " << data[i] << " obtained: " << pCopyData[i] << endl; throw runtime_error("bug"); } } free(pCopyData); cout<<endl; } cout<<endl<<endl<<endl; } }
void simplebenchmark(uint32_t N = 1U << 16, uint32_t T = 1U << 9) { T = T + 1; // we have a warming up pass vector<uint32_t, cacheallocator> data = generateArray32(N); vector<uint32_t, cacheallocator> compressed(N, 0); vector<uint32_t, cacheallocator> recovered(N, 0); WallClockTimer z; double packtime, packtimewm, unpacktime; double simdpacktime, simdpacktimewm, simdunpacktime; double horizontalunpacktime; cout << "#million of integers per second: higher is better" << endl; cout << "#bit, pack, pack without mask, unpack" << endl; for (uint32_t bitindex = 0; bitindex < 32; ++bitindex) { uint32_t bit = 32 - bitindex; maskfnc(data, bit); for (uint32_t repeat = 0; repeat < 1; ++repeat) { packtime = 0; packtimewm = 0; unpacktime = 0; simdpacktime = 0; simdpacktimewm = 0; simdunpacktime = 0; horizontalunpacktime = 0; for (uint32_t t = 0; t < T; ++t) { compressed.clear(); compressed.resize(N * bit / 32, 0); recovered.clear(); recovered.resize(N, 0); simdpack(data, compressed, bit); simdunpack(compressed, recovered, bit); if (!equalOnFirstBits(data, recovered, bit)) { cout << " Bugs!" << bit << endl; return; } z.reset(); simdpack(data, compressed, bit); if (t > 0) simdpacktime += z.split(); simdunpack(compressed, recovered, bit); if (!equalOnFirstBits(data, recovered, bit)) { cout << " Bugs!" << bit << endl; return; } z.reset(); simdpackwithoutmask(data, compressed, bit); if (t > 0) simdpacktimewm += z.split(); z.reset(); simdunpack(compressed, recovered, bit); if (t > 0) simdunpacktime += z.split(); if (!equalOnFirstBits(data, recovered, bit)) { cout << " Bugs!" << bit << endl; return; } z.reset(); fastpack(data, compressed, bit); if (t > 0) packtime += z.split(); fastunpack(compressed, recovered, bit); if (!equalOnFirstBits(data, recovered, bit)) { cout << " Bug1!" << endl; return; } z.reset(); fastpackwithoutmask(data, compressed, bit); if (t > 0) packtimewm += z.split(); z.reset(); fastunpack(compressed, recovered, bit); if (t > 0) unpacktime += z.split(); if (!equalOnFirstBits(data, recovered, bit)) { cout << " Bug1!" << endl; return; } z.reset(); horizontalunpack(compressed, recovered, bit); if (t > 0) horizontalunpacktime += z.split(); if (!equalOnFirstBits(data, recovered, bit)) { cout << " Bug1!" << endl; return; } } cout << std::setprecision(4) << bit << "\t\t" << N * (T - 1) / (packtime) << "\t\t" << N * (T - 1) / (packtimewm) << "\t\t\t" << N * (T - 1) / (unpacktime) << "\t\t"; cout << std::setprecision(4) << bit << "\t\t" << N * (T - 1) / (simdpacktime) << "\t\t" << N * (T - 1) / (simdpacktimewm) << "\t\t" << N * (T - 1) / (simdunpacktime) << "\t\t"; cout<< N * (T - 1) / (horizontalunpacktime) << "\t\t"; cout << endl; } } }
void simplebenchmark(uint32_t N = 1U << 16, uint32_t T = 1U << 9) { T = T + 1; // we have a warming up pass uint32_t bogus = 0; vector<uint32_t> data(N); vector<uint32_t> compressed(N); vector<uint32_t> icompressed(N); vector<uint32_t> recovered(N); WallClockTimer z; double unpacktime; double iunpacktime; cout << "#million of integers per second: higher is better" << endl; cout << "#bit, unpack,iunpack" << endl; for (uint32_t bitindex = 0; bitindex < 32; ++bitindex) { uint32_t bit = bitindex + 1; vector<uint32_t> initdata(N); for (size_t i = 0; 4 * i < data.size(); i += 4) { initdata[i] = random(bit) + (i >= 4 ? initdata[i - 4] : 0); for (size_t j = 1; j < 4; ++j) { initdata[i + j] = initdata[i]; } } const vector<uint32_t> refdata = initdata; vector<uint32_t>().swap(initdata); icompressed.clear(); // 4 * N should be enough for all schemes icompressed.resize(4 * N, 0); compressed.clear(); // 4 * N should be enough for all schemes compressed.resize(4 * N, 0); recovered.clear(); recovered.resize(N, 0); if (needPaddingTo128Bits(recovered.data())) { throw logic_error("Array is not aligned on 128 bit boundary!"); } if (needPaddingTo128Bits(icompressed.data())) { throw logic_error("Array is not aligned on 128 bit boundary!"); } if (needPaddingTo128Bits(compressed.data())) { throw logic_error("Array is not aligned on 128 bit boundary!"); } if (needPaddingTo128Bits(refdata.data())) { throw logic_error("Array is not aligned on 128 bit boundary!"); } for (uint32_t repeat = 0; repeat < 1; ++repeat) { unpacktime = 0; iunpacktime = 0; for (uint32_t t = 0; t <= T; ++t) { assert(data.size() == refdata.size()); fill(icompressed.begin(), icompressed.end(), 0); fill(recovered.begin(), recovered.end(), 0); memcpy(data.data(), refdata.data(), data.size() * sizeof(uint32_t)); // memcpy can be slow Helper::pack(data.data(), data.size(), icompressed.data(), bit); z.reset(); Helper::unpack(icompressed.data(), refdata.size(), recovered.data(), bit); if (t > 0) // we don't count the first run unpacktime += static_cast<double>(z.split()); if (!equalOnFirstBits(refdata, recovered, bit)) { cout << " Bug 1a " << bit << endl; return; } memcpy(data.data(), refdata.data(), data.size() * sizeof(uint32_t)); // memcpy can be slow Helper::pack(data.data(), data.size(), icompressed.data(), bit); z.reset(); Helper::iunpack(icompressed.data(), refdata.size(), recovered.data(), bit); if (t > 0) // we don't count the first run iunpacktime += static_cast<double>(z.split()); if (!equalOnFirstBits(refdata, recovered, bit)) { cout << " Bug 2 " << bit << endl; return; } } cout << std::setprecision(4) << bit << "\t\t"; cout << "\t\t" << N * (T - 1) / (unpacktime) << "\t\t"; cout << "\t\t" << N * (T - 1) / (iunpacktime); cout << endl; } } cout << "# ignore this " << bogus << endl; }