int main() { const int N = 100 * 1000 * 1000; vector<int> array; array.resize(N); for(int k = 0; k<N;++k) array[k]=k; random_shuffle(array.begin(), array.end()); ZTimer z; for(int j = 0; j<10;++j) { int max = INT_MIN; int bestk = 0; for (int k = 0; k<N; ++k) { if(array[k]>max) { max = array[k]; bestk = k; } } cout<<" "<< bestk<<endl; } cout<<z.split()<<endl; z.reset(); for(int j = 0; j<10;++j) { int max = INT_MIN; for (int k = 0; k<N; ++k) { if(array[k]>max) max = array[k]; } for (int k = 0; k<N;++k) if(array[k] == max) { cout<<" "<< k<<endl; break; } } cout<<z.split()<<endl; return 0; }
double testSpeed(const T & hasher, const vector<INTEGER> & data, uint64 & answer, const uint mN) { ZTimer t; double timeelapsed; t.reset(); answer += hasher.hash(&data[0],&data[0]+mN); timeelapsed = t.split()/(1000.0); return timeelapsed; }
double testSpeedManyTimes(const T & hasher, const vector<INTEGER> & data, uint64 & answer, const uint mN,const uint times) { ZTimer t; double timeelapsed; t.reset(); for(uint k = 0; k<times;++k) answer += hasher.hash(&data[0],&data[0]+mN); timeelapsed = t.split()/(1000.0); return timeelapsed; }
// Loads the flat file into a RowStore<c>, closes the file, and prints a few
// summary lines (all prefixed with '#'):
//   - the load time and the row store size,
//   - countZeroes() divided by (number of rows * c),
//   - ff.numberOfAttributeValues(),
//   - c divided by ff.numberOfAttributeValues().
// `c` is expected to be in scope from the enclosing declaration.
void __displayStats(CSVFlatFile & ff) {
    ZTimer timer;
    cout << "#Loading into row store..." << endl;
    RowStore<c> rows(ff, 0);
    ff.close();

    cout << "# " << timer.split() << " ms to load " << rows.size()
         << " bytes into row store" << endl;

    cout << "# fraction of tuples with zeroes = "
         << rows.countZeroes() * 1. / (rows.data.size() * c)
         << endl;

    cout << "# number of attribute values = "
         << ff.numberOfAttributeValues()
         << endl;

    cout << "# excepted fraction = "
         << c * 1.0 / ff.numberOfAttributeValues()
         << endl;
}
void testCodec(SimpleCODEC & mycodec, MyNaiveColumnStore & n, vector<Results> & v, const uint smallsetrepeats) { const uint uncompressedsize = n.size(); cout << "# computing " << mycodec.name() << " ... " << endl; if (uncompressedsize == 0) return; Results r(mycodec.name()); for (uint columnindex = 0; columnindex < n.data.size(); ++columnindex) { uint compressiontime(0), decompressiontime(0); double sizeinmb(0); vector<uint> incolumn; const uint MAXSIZE=10*1024*1024;// 50 million or about 50MB for(uint64 begin = 0; begin<n.data[columnindex].size(); begin+=MAXSIZE) { uint64 end = begin+MAXSIZE; if(end > n.data[columnindex].size()) end = n.data[columnindex].size(); n.data[columnindex].loadACopy(incolumn,begin,end); ZTimer z; columntype out; for (uint k = 0; k < smallsetrepeats; ++k) { out.clear(); mycodec.compress(incolumn,out); } compressiontime += z.split(); sizeinmb += (out.size() * 1.0 / (1024.0 * 1024.0)); z.reset(); for (uint k = 0; k < smallsetrepeats; ++k) { columntype recovered; mycodec.uncompress(out, recovered); } decompressiontime += z.split(); } r.add(sizeinmb, compressiontime, decompressiontime); } v.push_back(r); }
// Loads the flat file into a row store, sorts the rows once using the column
// order produced by the requested heuristic, and then runs the test suite on
// growing prefixes of the data (in steps of 131072*20 rows).
//
// For every prefix size k two layouts are tested:
//   - "lexico": the column store is reloaded from the sorted row store with
//     the argument k;
//   - "multiple list": the top k rows are copied into a temporary row store,
//     reordered with MultipleListsSortRowsPerBlock (block size 131072), and
//     loaded into the column store.
// The run count is printed after each test. `c` is expected to be in scope
// from the enclosing declaration.
void __growCSV(CSVFlatFile & ff, int columnorderheuristic) {
    ZTimer timer;
    cout << "#Loading into row store..." << endl;
    RowStore<c> rs(ff, 0);
    ff.close();
    cout << "# " << timer.split() << " ms to load " << rs.size()
         << " bytes into row store" << endl;
    cout << "# detected " << c << " columns" << endl;

    vector<uint> indexes =
        ff.computeColumnOrderAndReturnColumnIndexes(columnorderheuristic);
    cout << "# clearing histogram memory..." << endl;
    ff.clear();

    rs.sortRows(indexes);

    NaiveColumnStore<c> ncs;
    const uint rowstep = 131072 * 20;
    for (uint k = rowstep; k <= rs.data.size(); k += rowstep) {
        ncs.reloadFromRowStore(rs, k);
        cout << "#=============================#" << endl;
        cout << "# number of rows = " << k << endl;
        cout << "#=============================#" << endl;

        // Layout 1: prefix of the lexicographically sorted rows.
        cout << "# lexico " << endl;
        runtests(ncs, true, true);
        cout << "# got RunCount = " << ncs.computeRunCount() << endl;
        ncs.clear();

        // Layout 2: the same prefix, reordered block by block.
        cout << "############################" << endl;
        cout << "# multiple list " << endl;
        RowStore<c> prefix;
        rs.top(k, prefix);
        prefix.MultipleListsSortRowsPerBlock(indexes, 131072);
        ncs.reloadFromRowStore(prefix);
        runtests(ncs, true, true);
        prefix.clear();
        cout << "# got RunCount = " << ncs.computeRunCount() << endl;
        ncs.clear();
        cout << endl;
    }
    rs.clear();
    ncs.clear();
}
int main(int params, char ** args) { uint N; if (params >= 2) N = atoi(args[1]); else { N = 1024*1024; } cout<<"#sizeof(uint64)="<<sizeof(uint64)<<endl; assert(sizeof(uint64)==8); cout<<"# Initializing data..."<<endl; vector<uint32>data(N); vector<uint64>randombuffer64(N+1); //MTRand mt; ZRandom zr; for(uint k = 0;k<N;++k) { data[k] = zr.getValue();//mt.randInt(); } data.push_back(1);// so that it never ends with a zero if( (data.size() & 1) != 0) data.push_back(0);// make sure it is even cout<<"# done generating data"<<endl; cout<<"# "<<data.size()*sizeof(uint)/(1024.0*1024.0)<<" MB"<<endl; ZTimer t; for(uint k = 0;k<=N+2;++k) { randombuffer64[k] =zr.getValue() & (static_cast<uint64>(zr.getValue())<<32 ) ; } double timeelapsed = t.split()/(1000.0); cout<<"# random generated in "<<timeelapsed<< " or "<<N*sizeof(uint64)/(1024.0*1024*1024*timeelapsed)<<" GB/s"<<endl; cout<<"# "<<data.size()*sizeof(uint64)/(1024.0*1024.0)<<" MB"<<endl; const uint times = 20; uint64 answer = 0; Silly silly; Thorup thorup(randombuffer64); StrongMultilinear sm(randombuffer64); PyramidalMultilinear pm(randombuffer64); XAMA xama(randombuffer64); #if defined (__PCLMUL__) && (__SSE2__) CLStrongMultilinear clsm(randombuffer64); #endif NoMultiplication testing(randombuffer64); StrongMultilinearTwoByTwo sm2by2(randombuffer64); const uint shorttimes =2000000; cout<<"# Starting tests... 
repeating each run "<<shorttimes<<" times"<<endl; cout<<"# N silly thorup09(not-strongly-universal) xama strong-multilinear strong-multilinear-2by2 pyramidalmultilinear clmulti*"<<endl; for(uint mN = 1024; mN<=2048; mN*=2) { vector<double> counter(6,0); counter[0]+=testSpeedManyTimes(silly,data,answer,mN,shorttimes); counter[1]+=testSpeedManyTimes(thorup,data,answer,mN,shorttimes); counter[2]+=testSpeedManyTimes(xama,data,answer,mN,shorttimes); counter[3]+=testSpeedManyTimes(sm,data,answer,mN,shorttimes); counter[4]+=testSpeedManyTimes(sm2by2,data,answer,mN,shorttimes); counter[5]+=testSpeedManyTimes(pm,data,answer,mN,shorttimes); #if defined (__PCLMUL__) && (__SSE2__) counter[6]+=testSpeedManyTimes(clsm,data,answer,mN,shorttimes); #endif cout<<mN<<" "; for(uint k = 0; k<counter.size();++k) cout<<counter[k]<<" "; cout<<endl; } cout<<endl; cout<<"# Starting tests... repeating each run "<<times<<" times"<<endl; cout<<"# N silly thorup09(not-strongly-universal) xama strong-multilinear strong-multilinear-2by2 pyramidalmultilinear clmulti*"<<endl; for(uint mN = 1048576; mN<=data.size(); mN*=2) { vector<double> counter(6,0); for(uint k = 0;k<times;++k) { counter[0]+=testSpeed(silly,data,answer,mN); counter[1]+=testSpeed(thorup,data,answer,mN); counter[2]+=testSpeedManyTimes(xama,data,answer,mN,shorttimes); counter[3]+=testSpeed(sm,data,answer,mN); counter[4]+=testSpeed(sm2by2,data,answer,mN); counter[5]+=testSpeed(pm,data,answer,mN); #if defined (__PCLMUL__) && (__SSE2__) counter[6]+=testSpeedManyTimes(clsm,data,answer,mN,shorttimes); #endif } cout<<mN<<" "; for(uint k = 0; k<counter.size();++k) cout<<counter[k]<<" "; cout<<endl; } cout<<endl; return answer; }
// Loads the flat file into a row store and measures how the column store
// behaves under different row orderings, using the INCREASINGCARDINALITY
// column order:
//   1) plain sortRows();
//   2) MultipleListsSortRowsPerBlock() for block sizes 16, 32, 64, ... up to
//      min(8388608, number of rows).
// After every reordering the rows are reloaded into a fresh column store,
// run counts are printed and runtests() is executed. A third variant based
// on vortexSortRows() is present but disabled.
// `c` and BLOCKSIZE are expected to be in scope from the enclosing file.
void __scaleCSV(CSVFlatFile & ff) {
    ZTimer timer;
    cout << "#Loading into row store..." << endl;
    RowStore<c> rs(ff, 0);
    ff.close();
    cout << "# " << timer.split() << " ms to load " << rs.size()
         << " bytes into row store" << endl;
    cout << "# detected " << c << " columns" << endl;

    vector<uint> indexes =
        ff.computeColumnOrderAndReturnColumnIndexes(INCREASINGCARDINALITY);
    cout << "# clearing histogram memory..." << endl;
    ff.clear();
    cout << "# sorting..." << endl;

    // Variant 1: rows sorted with sortRows().
    {
        timer.reset();
        rs.sortRows(indexes);
        cout << "# " << timer.split() << " ms to sort rows" << endl;

        timer.reset();
        NaiveColumnStore<c> columns;
        columns.reloadFromRowStore(rs);
        cout << "# " << timer.split() << " ms to reload " << columns.size()
             << " bytes into column store" << endl;
        cout << "# got RunCount = " << columns.computeRunCount() << endl;
        cout << "# got RunCount" << BLOCKSIZE << " = "
             << columns.computeRunCountp(BLOCKSIZE) << endl;
        cout << "# block size = " << BLOCKSIZE << endl;
        runtests(columns, true, true);
        cout << endl;
    }

    uint numberofrows = rs.data.size();
    cout << "# detected " << numberofrows << " rows" << endl;

    // Variant 2: block-wise multiple-lists order, doubling the block size.
    // The row store is kept (not cleared) because it is re-sorted each time.
    {
        for (uint blocksize = 16; blocksize <= min(8388608,numberofrows); blocksize *= 2) {
            cout << "# blocksize " << blocksize << " rows" << endl;

            timer.reset();
            rs.MultipleListsSortRowsPerBlock(indexes, blocksize);
            cout << "# " << timer.split()
                 << " ms to sort rows in multiplelists order with blocksize = "
                 << blocksize << endl;

            timer.reset();
            NaiveColumnStore<c> columns;
            columns.reloadFromRowStore(rs);
            cout << "# " << timer.split() << " ms to reload " << columns.size()
                 << " bytes into column store" << endl;
            cout << "# got RunCount = " << columns.computeRunCount() << endl;
            cout << "# got RunCount" << BLOCKSIZE << " = "
                 << columns.computeRunCountp(BLOCKSIZE) << endl;
            cout << "# block size = " << BLOCKSIZE << endl;
            runtests(columns, true, true);
            cout << endl;
        }
    }

    // Variant 3: vortex order. Disabled; kept so it can be switched back on.
    if (false) {
        timer.reset();
        rs.vortexSortRows(indexes);
        cout << "# " << timer.split() << " ms to sort rows in vortex order" << endl;

        timer.reset();
        NaiveColumnStore<c> columns;
        columns.reloadFromRowStore(rs);
        rs.clear();
        cout << "# " << timer.split() << " ms to reload " << columns.size()
             << " bytes into column store" << endl;
        cout << "# got RunCount = " << columns.computeRunCount() << endl;
        cout << "# got RunCount" << BLOCKSIZE << " = "
             << columns.computeRunCountp(BLOCKSIZE) << endl;
        cout << "# block size = " << BLOCKSIZE << endl;
        runtests(columns, true, true);
        cout << endl;
    }
}
void __readCSV(CSVFlatFile & ff, int sort, int columnorderheuristic, bool skiprepeats, const uint sample, const uint64 maxsize, const bool makeColumnIndependent) { ZTimer z; cout<<"#Loading into row store..."<<endl; //printMemoryUsage(); RowStore<c> rs(ff,0); ff.close(); cout << "# " << z.split() << " ms to load " << rs.size() << " bytes into row store" << endl; if(sample>0) { z.reset(); RowStore<c> rstmp; rs.fillWithSample(sample,rstmp); rs.data.swap(rstmp.data); cout << "# " << z.split() << " ms to extract sample containing "<< sample<<" tuples" << endl; } cout << "# detected " << c << " columns" << endl; vector<uint> indexes = ff.computeColumnOrderAndReturnColumnIndexes( columnorderheuristic); cout<<"# clearing histogram memory..."<<endl; ff.clear(); //cout<<"# fraction of tuples with zeroes = "<< rs.countZeroes() * 1. / (rs.data.size() * c)<<endl; cout<<"# sorting..."<<endl; NaiveColumnStore<c> ncs; if(makeColumnIndependent) { cout<<"# shuffling columns independently"<<endl; cout<<"# shuffling columns independently (part 1: loading into column store)"<<endl; ncs.reloadFromRowStore(rs); cout<<"# shuffling columns independently (part 2: shuffling)"<<endl; ncs.makeColumnsIndependent(); cout<<"# shuffling columns independently (part 3: copying back to row store)"<<endl; ncs.copyToRowStore(rs); } if (sort == LEXICO) { z.reset(); rs.sortRows(indexes); cout << "# " << z.split() << " ms to sort rows" << endl; if(maxsize>0) rs.top(maxsize,rs); z.reset(); ncs.reloadFromRowStore(rs); rs.clear(); cout << "# " << z.split() << " ms to reload " << ncs.size() << " bytes into column store" << endl; } else if (sort == MULTIPLELISTS) { cout << "not supported" << endl; } else if (sort == BLOCKWISEMULTIPLELISTS) { z.reset(); rs.sortRows(indexes); cout << "# " << z.split() << " ms to sort rows lexicographically" << endl; if(maxsize>0) rs.top(maxsize,rs); z.reset(); rs.MultipleListsSortRowsPerBlock(indexes, 131072);//65536); cout << "# " << z.split() << " ms to sort rows in 
multiplelists order" << endl; z.reset(); ncs.reloadFromRowStore(rs); rs.clear(); cout << "# " << z.split() << " ms to reload " << ncs.size() << " bytes into column store" << endl; } else if (sort == VORTEX) { z.reset(); rs.vortexSortRows(indexes); cout << "# " << z.split() << " ms to sort rows in vortex order" << endl; if(maxsize>0) rs.top(maxsize,rs); z.reset(); ncs.reloadFromRowStore(rs); rs.clear(); cout << "# " << z.split() << " ms to reload " << ncs.size() << " bytes into column store" << endl; } else if (sort == GRAYCODED) { cerr << "not supported" << endl; } else {// shuffling z.reset(); rs.shuffleRows(); cout << "# " << z.split() << " ms to shuffle rows" << endl; if(maxsize>0) rs.top(maxsize,rs); z.reset(); ncs.reloadFromRowStore(rs); rs.clear(); cout << "# " << z.split() << " ms to reload " << ncs.size() << " bytes into column store" << endl; } cout << "# got RunCount = " << ncs.computeRunCount() << endl; cout << "# got RunCount" << BLOCKSIZE << " = " << ncs.computeRunCountp( BLOCKSIZE) << endl; cout << "# block size = " << BLOCKSIZE << endl; runtests(ncs, skiprepeats); ncs.clear(); }