int main() { const int N = 100 * 1000 * 1000; vector<int> array; array.resize(N); for(int k = 0; k<N;++k) array[k]=k; random_shuffle(array.begin(), array.end()); ZTimer z; for(int j = 0; j<10;++j) { int max = INT_MIN; int bestk = 0; for (int k = 0; k<N; ++k) { if(array[k]>max) { max = array[k]; bestk = k; } } cout<<" "<< bestk<<endl; } cout<<z.split()<<endl; z.reset(); for(int j = 0; j<10;++j) { int max = INT_MIN; for (int k = 0; k<N; ++k) { if(array[k]>max) max = array[k]; } for (int k = 0; k<N;++k) if(array[k] == max) { cout<<" "<< k<<endl; break; } } cout<<z.split()<<endl; return 0; }
double testSpeed(const T & hasher, const vector<INTEGER> & data, uint64 & answer, const uint mN) { ZTimer t; double timeelapsed; t.reset(); answer += hasher.hash(&data[0],&data[0]+mN); timeelapsed = t.split()/(1000.0); return timeelapsed; }
double testSpeedManyTimes(const T & hasher, const vector<INTEGER> & data, uint64 & answer, const uint mN,const uint times) { ZTimer t; double timeelapsed; t.reset(); for(uint k = 0; k<times;++k) answer += hasher.hash(&data[0],&data[0]+mN); timeelapsed = t.split()/(1000.0); return timeelapsed; }
void testCodec(SimpleCODEC & mycodec, MyNaiveColumnStore & n, vector<Results> & v, const uint smallsetrepeats) { const uint uncompressedsize = n.size(); cout << "# computing " << mycodec.name() << " ... " << endl; if (uncompressedsize == 0) return; Results r(mycodec.name()); for (uint columnindex = 0; columnindex < n.data.size(); ++columnindex) { uint compressiontime(0), decompressiontime(0); double sizeinmb(0); vector<uint> incolumn; const uint MAXSIZE=10*1024*1024;// 50 million or about 50MB for(uint64 begin = 0; begin<n.data[columnindex].size(); begin+=MAXSIZE) { uint64 end = begin+MAXSIZE; if(end > n.data[columnindex].size()) end = n.data[columnindex].size(); n.data[columnindex].loadACopy(incolumn,begin,end); ZTimer z; columntype out; for (uint k = 0; k < smallsetrepeats; ++k) { out.clear(); mycodec.compress(incolumn,out); } compressiontime += z.split(); sizeinmb += (out.size() * 1.0 / (1024.0 * 1024.0)); z.reset(); for (uint k = 0; k < smallsetrepeats; ++k) { columntype recovered; mycodec.uncompress(out, recovered); } decompressiontime += z.split(); } r.add(sizeinmb, compressiontime, decompressiontime); } v.push_back(r); }
/**
 * Loads a CSV flat file into a row store and benchmarks the codecs (via
 * runtests) under several row orderings, printing progress and timings as
 * '#'-prefixed lines on cout.
 *
 * Stages, each followed by a reload into a fresh NaiveColumnStore, the run
 * counts, and runtests(ncs, true, true):
 *   - rows sorted with sortRows using the INCREASINGCARDINALITY column order;
 *   - rows sorted with MultipleListsSortRowsPerBlock for block sizes
 *     16, 32, 64, ... up to min(8388608, number of rows);
 *   - a vortex-order stage that is currently switched off.
 *
 * @param ff  the flat file; it is closed after loading and its histogram
 *            memory is cleared once the column order has been computed
 */
void __scaleCSV(CSVFlatFile & ff) {
    // Stage switches (the vortex stage is disabled).
    const bool runLexicographic = true;
    const bool runBlockwiseMultipleLists = true;
    const bool runVortex = false;

    ZTimer timer;
    cout << "#Loading into row store..." << endl;
    RowStore<c> rowstore(ff, 0);
    ff.close();
    cout << "# " << timer.split() << " ms to load " << rowstore.size()
         << " bytes into row store" << endl;
    cout << "# detected " << c << " columns" << endl;

    vector<uint> columnorder = ff.computeColumnOrderAndReturnColumnIndexes(
            INCREASINGCARDINALITY);
    cout << "# clearing histogram memory..." << endl;
    ff.clear();
    cout << "# sorting..." << endl;

    if (runLexicographic) {
        timer.reset();
        rowstore.sortRows(columnorder);
        cout << "# " << timer.split() << " ms to sort rows" << endl;

        timer.reset();
        NaiveColumnStore<c> columns;
        columns.reloadFromRowStore(rowstore);
        // The row store is kept: the later stages re-sort it.
        cout << "# " << timer.split() << " ms to reload " << columns.size()
             << " bytes into column store" << endl;
        cout << "# got RunCount = " << columns.computeRunCount() << endl;
        cout << "# got RunCount" << BLOCKSIZE << " = "
             << columns.computeRunCountp(BLOCKSIZE) << endl;
        cout << "# block size = " << BLOCKSIZE << endl;
        runtests(columns, true, true);
        cout << endl;
    }

    uint numberofrows = rowstore.data.size();
    cout << "# detected " << numberofrows << " rows" << endl;

    if (runBlockwiseMultipleLists) {
        // NOTE(review): min receives an int literal and a uint; std::min needs
        // both arguments to share one type -- confirm which min is in scope.
        const uint largestblock = min(8388608,numberofrows);
        for (uint blocksize = 16; blocksize <= largestblock; blocksize *= 2) {
            cout << "# blocksize " << blocksize << " rows" << endl;

            timer.reset();
            rowstore.MultipleListsSortRowsPerBlock(columnorder, blocksize);
            cout << "# " << timer.split()
                 << " ms to sort rows in multiplelists order with blocksize = "
                 << blocksize << endl;

            timer.reset();
            NaiveColumnStore<c> columns;
            columns.reloadFromRowStore(rowstore);
            // The row store is kept for the next block size.
            cout << "# " << timer.split() << " ms to reload " << columns.size()
                 << " bytes into column store" << endl;
            cout << "# got RunCount = " << columns.computeRunCount() << endl;
            cout << "# got RunCount" << BLOCKSIZE << " = "
                 << columns.computeRunCountp(BLOCKSIZE) << endl;
            cout << "# block size = " << BLOCKSIZE << endl;
            runtests(columns, true, true);
            cout << endl;
        }
    }

    if (runVortex) {
        timer.reset();
        rowstore.vortexSortRows(columnorder);
        cout << "# " << timer.split() << " ms to sort rows in vortex order"
             << endl;

        timer.reset();
        NaiveColumnStore<c> columns;
        columns.reloadFromRowStore(rowstore);
        rowstore.clear();  // last stage: the rows are no longer needed
        cout << "# " << timer.split() << " ms to reload " << columns.size()
             << " bytes into column store" << endl;
        cout << "# got RunCount = " << columns.computeRunCount() << endl;
        cout << "# got RunCount" << BLOCKSIZE << " = "
             << columns.computeRunCountp(BLOCKSIZE) << endl;
        cout << "# block size = " << BLOCKSIZE << endl;
        runtests(columns, true, true);
        cout << endl;
    }
}
void __readCSV(CSVFlatFile & ff, int sort, int columnorderheuristic, bool skiprepeats, const uint sample, const uint64 maxsize, const bool makeColumnIndependent) { ZTimer z; cout<<"#Loading into row store..."<<endl; //printMemoryUsage(); RowStore<c> rs(ff,0); ff.close(); cout << "# " << z.split() << " ms to load " << rs.size() << " bytes into row store" << endl; if(sample>0) { z.reset(); RowStore<c> rstmp; rs.fillWithSample(sample,rstmp); rs.data.swap(rstmp.data); cout << "# " << z.split() << " ms to extract sample containing "<< sample<<" tuples" << endl; } cout << "# detected " << c << " columns" << endl; vector<uint> indexes = ff.computeColumnOrderAndReturnColumnIndexes( columnorderheuristic); cout<<"# clearing histogram memory..."<<endl; ff.clear(); //cout<<"# fraction of tuples with zeroes = "<< rs.countZeroes() * 1. / (rs.data.size() * c)<<endl; cout<<"# sorting..."<<endl; NaiveColumnStore<c> ncs; if(makeColumnIndependent) { cout<<"# shuffling columns independently"<<endl; cout<<"# shuffling columns independently (part 1: loading into column store)"<<endl; ncs.reloadFromRowStore(rs); cout<<"# shuffling columns independently (part 2: shuffling)"<<endl; ncs.makeColumnsIndependent(); cout<<"# shuffling columns independently (part 3: copying back to row store)"<<endl; ncs.copyToRowStore(rs); } if (sort == LEXICO) { z.reset(); rs.sortRows(indexes); cout << "# " << z.split() << " ms to sort rows" << endl; if(maxsize>0) rs.top(maxsize,rs); z.reset(); ncs.reloadFromRowStore(rs); rs.clear(); cout << "# " << z.split() << " ms to reload " << ncs.size() << " bytes into column store" << endl; } else if (sort == MULTIPLELISTS) { cout << "not supported" << endl; } else if (sort == BLOCKWISEMULTIPLELISTS) { z.reset(); rs.sortRows(indexes); cout << "# " << z.split() << " ms to sort rows lexicographically" << endl; if(maxsize>0) rs.top(maxsize,rs); z.reset(); rs.MultipleListsSortRowsPerBlock(indexes, 131072);//65536); cout << "# " << z.split() << " ms to sort rows in 
multiplelists order" << endl; z.reset(); ncs.reloadFromRowStore(rs); rs.clear(); cout << "# " << z.split() << " ms to reload " << ncs.size() << " bytes into column store" << endl; } else if (sort == VORTEX) { z.reset(); rs.vortexSortRows(indexes); cout << "# " << z.split() << " ms to sort rows in vortex order" << endl; if(maxsize>0) rs.top(maxsize,rs); z.reset(); ncs.reloadFromRowStore(rs); rs.clear(); cout << "# " << z.split() << " ms to reload " << ncs.size() << " bytes into column store" << endl; } else if (sort == GRAYCODED) { cerr << "not supported" << endl; } else {// shuffling z.reset(); rs.shuffleRows(); cout << "# " << z.split() << " ms to shuffle rows" << endl; if(maxsize>0) rs.top(maxsize,rs); z.reset(); ncs.reloadFromRowStore(rs); rs.clear(); cout << "# " << z.split() << " ms to reload " << ncs.size() << " bytes into column store" << endl; } cout << "# got RunCount = " << ncs.computeRunCount() << endl; cout << "# got RunCount" << BLOCKSIZE << " = " << ncs.computeRunCountp( BLOCKSIZE) << endl; cout << "# block size = " << BLOCKSIZE << endl; runtests(ncs, skiprepeats); ncs.clear(); }