void test(size_t N ) {
    WallClockTimer time;
    for(int t = 0; t<2;++t) {
      cout <<" test # "<< t<<endl;
      vector<short> data = givemeanarray(N) ;
      vector<short> copydata(data);
      
      time.reset();
      straightsum(&data[0],N);
      cout<<"straight sum (C-like) "<<N/(1000.0*time.split())<<endl;   
      
 
      time.reset();
      slowishSum(data);
      cout<<"basic sum (C++-like) "<<N/(1000.0*time.split())<<endl;   
      
      data = copydata;

      time.reset();
      sum(data);
      cout<<"smarter sum "<<N/(1000.0*time.split())<<endl;   
 
      data = copydata;

      time.reset();
      fastSum(data);
      cout<<"fast sum "<<N/(1000.0*time.split())<<endl;   
 
      cout<<endl<<endl<<endl;

    }

}
int overall(size_t N) {
	int bogus = 0;
	WallClockTimer t;
	t.reset();
    bogus += testSTL(N);
    int delay = t.split();
    cout << "STL vector " << N /(delay * 1000.0) << endl;
    vector<double> idelays;
    for(size_t T = 0 ; T < 20 ; ++T ) {
          t.reset();
    	  bogus += straight(N);
    	  int tdelay = t.split();
    	  idelays.push_back(tdelay);
    }
    cout << "static array : " << N /(median(idelays) * 1000.0) << endl;
    for(size_t factor = 1; factor <= 6; ++ factor) {
        vector<double> delays;
        for(size_t T = 0 ; T < 20 ; ++T ) {
          t.reset();
    	  bogus += testManual(N,2+factor,2);
    	  int tdelay = t.split();
    	  delays.push_back(tdelay);
        }
    	cout << "pointer-based "<< (factor +2)/2.0<< " : " << N /(median(delays) * 1000.0) << endl;
    }
    return bogus;
}
// Compares totalsum(), sum<2>() and sum<16>() over a freshly filled buffer of
// N ints. Each routine is run 20 times and only its smallest timer split is
// kept; throughput figures and the ratios against totalsum are then printed.
//
// N: number of ints in the buffer.
// Returns the accumulated results of all benchmarked calls so that they are
// observably used.
int test(const size_t N) {
	int *  a = new int[N];
	for(size_t k = 0; k< N; ++k)
	  a[k] = k - 2 + k * k;
	int fakecounter = 0;
	cout<<" Buffer size = "<< N*sizeof(int) /(1024.0*1024.0)<<" MB "<<endl;

	WallClockTimer t;
	double besttime1 = numeric_limits<double>::max();
	double besttime2 = numeric_limits<double>::max();
	double besttime3 = numeric_limits<double>::max();
	for(int k = 0; k<20;++k) {
		// totalsum
		t.reset();
		fakecounter += totalsum(a,N);
		double thistime1 = t.split();
		if(thistime1 < besttime1) besttime1 = thistime1;
		// sum<2>
		t.reset();
		fakecounter += sum<2>(a,N);
		double thistime2 = t.split();
		if(thistime2 < besttime2) besttime2 = thistime2;
		// sum<16>
		t.reset();
		fakecounter += sum<16>(a,N);
		double thistime3 = t.split();
		if(thistime3 < besttime3) besttime3 = thistime3;
	}
	// The buffer is only needed by the timed calls; release it so repeated
	// invocations do not leak N * sizeof(int) bytes each.
	delete[] a;

    cout<<" total sum speed = "<<N/(1000*1000*besttime1) <<" mis or "<< N*sizeof(int)/(1024.0*1024.0*besttime1)<<" MB/s"<<endl;
    cout<<" partial sum speed = "<<N/(1000*1000*besttime2) <<" mis or "<< N*sizeof(int)/(1024.0*1024.0*besttime2)<<" MB/s"<<endl;
    cout<<" speed ratio = "<< besttime1 /besttime2<<endl;
    cout<<" partial sum speed = "<<N/(1000*1000*besttime3) <<" mis or "<< N*sizeof(int)/(1024.0*1024.0*besttime3)<<" MB/s"<<endl;
    cout<<" speed ratio = "<< besttime1 /besttime3<<endl;
    return fakecounter;
}
Пример #4
0
// Measures formatting two integers into a reused stringstream.
//
// For each of `rep` rounds, N strings of the form "<i> <j>" are built in a
// single stringstream that is reset (not reconstructed) before every use.
// The timer splits of all rounds are accumulated and reported together with
// the derived rate.
//
// N:   strings built per round.
// rep: number of rounds.
void test3(int N, int rep) {
  WallClockTimer timer;

  uint64_t total = 0;   // accumulated timer splits over all rounds
  uint64_t sum = 0;     // sink that keeps the formatted result observably used

  string       emptyStr;
  stringstream str;

  for (int j = 0; j < rep; ++j) {

    timer.reset();

    for (int i = 0; i < N; i++) {
      str.str(emptyStr);   // reset the stream contents instead of rebuilding it
      str << i << " " << j;
      // Only the address of the temporary's buffer is folded into the sink;
      // the characters themselves are never read.
      sum += reinterpret_cast<size_t>(str.str().c_str());
    }

    total += timer.split();
  }

  // rep * N in plain int overflows (undefined behaviour) for ordinary
  // benchmark sizes such as 1e6 x 1e4, so the count is formed in 64 bits.
  const int64_t procCount = static_cast<int64_t>(rep) * N;

  cout << "Ignore: " << sum << endl;
  cout << " total # of proc without construct/deconstruct: " << procCount << ", time " <<  total / 1e3 << " ms" << " proc per sec: " << (procCount * 1e6 / total ) << endl;
}
// Round-trips an alternating false/true pattern of N bools through pack()
// and unpack() three times, printing the timer split of each call and
// asserting after every round that the pattern survived.
//
// N: number of bools in the test pattern.
void testPackUnpackC(size_t N =  2048 * 32 * 2048) {
	WallClockTimer timer;
	// Packed buffer, sized as one bit per bool rounded UP to whole bytes
	// (assumes pack() emits one bit per input bool -- TODO confirm). The
	// previous N/8 rounded down and was one byte short whenever N was not a
	// multiple of 8. It is allocated before the raw array so that a failed
	// allocation here cannot leak `data`.
	vector<char> comp((N + 7) / 8);
	bool* data = new bool[N];
	for(size_t i = 0; i<N; ++i)
	  data[i] = static_cast<bool>(i & 1);
	for(size_t t = 0; t< 3; ++t) {
		timer.reset();
		pack(data, &comp[0], N);
		cout<<" pack time = "<<timer.split()<<endl;
		timer.reset();
		unpack(&comp[0], data, N);
		cout<<" unpack time = "<<timer.split()<<endl;
		// The unpacked data must equal the original alternating pattern.
		for(size_t i = 0; i<N; ++i)
			assert(data[i] == static_cast<bool>(i & 1));
	}
	delete[] data;
}
// Times storeTestC() and loadTestC() between a small array of M ints and a
// large array of M * N ints.
//
// Three rounds are executed; the timer splits of the first round are not
// printed. After every round the small array is asserted to still hold
// 0..M-1.
//
// M:      size of the small array.
// N:      the large array holds M * N ints.
// repeat: how many times each kernel is called inside one timed section.
// Returns the accumulated kernel results so that they are observably used.
int testStoreLoadC(size_t M =  2048 * 4, size_t N = 2048 * 8, size_t repeat = 1) {
	WallClockTimer timer;
	vector<int> data;
	// Must start at zero: it is only ever updated with += and then returned,
	// so leaving it uninitialized made the return value undefined.
	int bogus = 0;
	data.reserve(M);
	for(size_t i = 0; i<M; ++i)
	  data.push_back(static_cast<int>(i));
	vector<int> bigdata(M * N);
	for(size_t t = 0; t< 3; ++t) {
		timer.reset();
		for (size_t r = 0; r < repeat; ++r)
			bogus += storeTestC(&data[0],&bigdata[0],N,M);
		if(t>0) cout<<" store time = "<<timer.split()<<endl;
		timer.reset();
		for (size_t r = 0; r < repeat; ++r)
			bogus += loadTestC(&data[0],&bigdata[0],N,M);
		if(t>0) cout<<" load time = "<<timer.split()<<endl;
		// Index with size_t (M is unsigned) and compare as int to avoid the
		// signed/unsigned mismatch of the original loop.
		for(size_t i = 0; i<M; ++i)
			assert(data[i] == static_cast<int>(i));
	}
    return bogus;
}
void test(size_t N ) {
    cout << "min distance between ints is "<<mindist<<endl;
    WallClockTimer time;
    for(int t = 0; t<2;++t) {
      cout <<" test # "<< t<<endl;
      vector<int> data = givemeanarray(N) ;
      vector<int> copydata(data);
      
      time.reset();
      cdelta<mindist>(&data[0],data.size());
      cout<<"c delta speed "<<N/(1000.0*time.split())<<endl;   
      time.reset();
      cinverseDelta<mindist>(&data[0],data.size());
      cout<<"c inverse delta speed "<<N/(1000.0*time.split())<<endl;   
      if(data != copydata) throw runtime_error("bug!");
      cout<<endl;
 
      time.reset();
      delta<mindist>(data);
      cout<<"delta speed "<<N/(1000.0*time.split())<<endl;   
      time.reset();
      inverseDelta<mindist>(data);
      cout<<"inverse delta speed "<<N/(1000.0*time.split())<<endl;   
      if(data != copydata) throw runtime_error("bug!");
      cout<<endl;


      delta<mindist>(data);
      time.reset();
      slowishinverseDelta<mindist>(data);
      cout<<"slowish inverse delta speed "<<N/(1000.0*time.split())<<endl;   
      if(data != copydata) throw runtime_error("bug!");
      cout<<endl;

      delta<mindist>(data);
      time.reset();
      bufferedinverseDelta<mindist>(data);
      cout<<"buffered inverse delta speed "<<N/(1000.0*time.split())<<endl;   
      if(data != copydata) throw runtime_error("bug!");
      cout<<endl;


      delta<mindist>(data);
      time.reset();
      inverseDeltaVolkov<mindist>(data);
      cout<<"inverse delta speed (volkov-lemire) "<<N/(1000.0*time.split())<<endl;   
      if(data != copydata) throw runtime_error("bug!");
      cout<<endl;


      cout<<endl<<endl<<endl;
    }

}
// Benchmarks several inverse-delta implementations, all instantiated with
// <mindist>, on an array obtained from givemeanarray(N). Two rounds are run.
//
// Every section follows the same pattern: time the forward transform, time
// one inverse variant, then verify that the original values came back.
// The vector-based sections compare against a copy of the input and throw
// runtime_error("bug!") on mismatch. The two SIMD sections work on a
// 16-byte-aligned copy obtained from memalign and throw runtime_error("bug")
// after reporting the first differing element on cerr.
//
// Each printed figure is N divided by (1000 * the timer split).
void test(size_t N ) {
    WallClockTimer time;
    for(int t = 0; t<2;++t) {
      cout <<" test # "<< t<<endl;
      vector<int> data = givemeanarray(N) ;


      // --- slowishinverseDelta1 ---
      {
          vector<int> copydata(data);

          cout << "min distance between ints is "<<mindist<<endl;

          time.reset();
          delta<mindist>(data);
          cout<<"delta speed "<<N/(1000.0*time.split())<<endl;
          time.reset();
          slowishinverseDelta1<mindist>(data);
          cout<<"Slowish(1) inverse delta speed "<<N/(1000.0*time.split())<<endl;
          if(data != copydata) throw runtime_error("bug!");
          cout<<endl;

      }

      // --- slowishinverseDelta2 ---
      {
          vector<int> copydata(data);

          cout << "min distance between ints is "<<mindist<<endl;

          time.reset();
          delta<mindist>(data);
          cout<<"delta speed "<<N/(1000.0*time.split())<<endl;
          time.reset();
          slowishinverseDelta2<mindist>(data);
          cout<<"Slowish(2) inverse delta speed "<<N/(1000.0*time.split())<<endl;
          if(data != copydata) throw runtime_error("bug!");
          cout<<endl;

      }

      // --- inverseDelta ---
      {
          vector<int> copydata(data);

          cout << "min distance between ints is "<<mindist<<endl;

          time.reset();
          delta<mindist>(data);
          cout<<"delta speed "<<N/(1000.0*time.split())<<endl;
          time.reset();
          inverseDelta<mindist>(data);
          cout<<"Unroll2 inverse delta speed "<<N/(1000.0*time.split())<<endl;
          if(data != copydata) throw runtime_error("bug!");
          cout<<endl;

      }

      // --- inverseDeltaMem ---
      {
          vector<int> copydata(data);

          cout << "min distance between ints is "<<mindist<<endl;

          time.reset();
          delta<mindist>(data);
          cout<<"delta speed "<<N/(1000.0*time.split())<<endl;
          time.reset();
          inverseDeltaMem<mindist>(data);
          cout<<"Unroll2 (mem) inverse delta speed "<<N/(1000.0*time.split())<<endl;
          if(data != copydata) throw runtime_error("bug!");
          cout<<endl;

      }




      // --- inverseDeltaMy1 ---
      {
          vector<int> copydata(data);

          cout << "min distance between ints is "<<mindist<<endl;

          time.reset();
          delta<mindist>(data);
          cout<<"delta speed "<<N/(1000.0*time.split())<<endl;
          time.reset();
          inverseDeltaMy1<mindist>(data);
          cout<<"My1 inverse delta speed "<<N/(1000.0*time.split())<<endl;
          if(data != copydata) throw runtime_error("bug!");
          cout<<endl;

      }

// The My2 / My3 variants are compiled out.
#if 0
      {
          vector<int> copydata(data);

          cout << "min distance between ints is "<<mindist<<endl;

          time.reset();
          delta<mindist>(data);
          cout<<"delta speed "<<N/(1000.0*time.split())<<endl;
          time.reset();
          inverseDeltaMy2<mindist>(data);
          cout<<"My2 inverse delta speed "<<N/(1000.0*time.split())<<endl;
          if(data != copydata) throw runtime_error("bug!");
          cout<<endl;
      }


      {
          vector<int> copydata(data);

          cout << "min distance between ints is "<<mindist<<endl;

          time.reset();
          delta<mindist>(data);
          cout<<"delta speed "<<N/(1000.0*time.split())<<endl;
          time.reset();
          inverseDeltaMy3<mindist>(data);
          cout<<"My3 inverse delta speed "<<N/(1000.0*time.split())<<endl;
          if(data != copydata) throw runtime_error("bug!");
          cout<<endl;
      }
#endif


      // --- inverseDeltaSIMD: operates on a 16-byte-aligned copy; `data`
      //     itself stays untouched and serves as the reference ---
      {
          int* pCopyData = static_cast<int*>(memalign(16, data.size() * sizeof data[0]));
          if (!pCopyData) {
            throw runtime_error("Not enough memory");
          }
          memcpy(pCopyData, &data[0], data.size() * sizeof data[0]);

          cout << "min distance between ints is "<<mindist<<endl;

          time.reset();
          deltaForSIMD<mindist>(pCopyData, data.size());
          cout<<"for SIMD delta speed "<<N/(1000.0*time.split())<<endl;
          time.reset();
          inverseDeltaSIMD<mindist>(pCopyData, data.size());
          cout<<"SIMD inverse delta speed "<<N/(1000.0*time.split())<<endl;
          for (size_t i = 0; i < data.size(); ++i) {
            if (data[i] != pCopyData[i]) {
                cerr << "Elem index: " << i << " orig: " << data[i] << " obtained: " << pCopyData[i] << endl;
                // Release the aligned buffer before unwinding; nothing else
                // owns it, so throwing first would leak it.
                free(pCopyData);
                throw runtime_error("bug");
             }
          }
          free(pCopyData);
          cout<<endl;
      }

      // --- inverseDeltaSIMDUnrolled: same procedure as the section above ---
      {
          int* pCopyData = static_cast<int*>(memalign(16, data.size() * sizeof data[0]));
          if (!pCopyData) {
            throw runtime_error("Not enough memory");
          }
          memcpy(pCopyData, &data[0], data.size() * sizeof data[0]);

          cout << "min distance between ints is "<<mindist<<endl;

          time.reset();
          deltaForSIMD<mindist>(pCopyData, data.size());
          cout<<"for SIMD delta speed "<<N/(1000.0*time.split())<<endl;
          time.reset();
          inverseDeltaSIMDUnrolled<mindist>(pCopyData, data.size());
          cout<<"SIMD inverse UNROLLED delta speed "<<N/(1000.0*time.split())<<endl;
          for (size_t i = 0; i < data.size(); ++i) {
            if (data[i] != pCopyData[i]) {
                cerr << "Elem index: " << i << " orig: " << data[i] << " obtained: " << pCopyData[i] << endl;
                // Release the aligned buffer before unwinding (see above).
                free(pCopyData);
                throw runtime_error("bug");
             }
          }
          free(pCopyData);
          cout<<endl;
      }


      cout<<endl<<endl<<endl;
    }
}
int main(int argc, char **argv) {
  size_t howmany = 0;
  size_t loop = 3;
  bool uniform = false;
  uint32_t Big = 22;
  float intersectionratio = 0.3f;
  uint32_t MaxBit = 26;
  int c;
  while ((c = getopt(argc, argv, "uns:m:R:M:S:l:h")) != -1)
    switch (c) {
    case 'h':
      printusage();
      return 0;
    case 'S':
      Big = atoi(optarg);
      break;
    case 'R':
      intersectionratio = atof(optarg);
      break;
    case 'M':
      MaxBit = atoi(optarg);
      if (MaxBit < 1) {
        printusage();
        return -1;
      }
      break;
    case 'm':
      howmany = atoi(optarg);
      if (howmany < 1) {
        printusage();
        return -1;
      }
      break;
    case 'l':
      loop = atoi(optarg);
      if (loop < 1) {
        printusage();
        return -1;
      }
      break;
    case 'u':
      uniform = true;
      break;
    default:
      abort();
    }
  if (howmany == 0) {
    howmany = 5;
  }
  cout << "# howmany : " << howmany << endl;
  cout << "# loop : " << loop << endl;
  cout << "# distribution : " << (uniform ? "uniform" : "clustered") << endl;
  cout << "# Big : " << Big << endl;
  cout << "# intersectionratio : " << intersectionratio << endl;
  cout << "# MaxBit : " << MaxBit << endl;
  UniformDataGenerator udg;
  ClusteredDataGenerator cdg;
  WallClockTimer z;
  size_t bogus = 0;
  vector<uint32_t> buffer(2 * (1U << Big));
#ifdef LIKWID_MARKERS
  char currentMarker[64];
  likwid_markerInit();
#endif

  cout << "# size-ratio\t";
  for (string intername : IntersectionFactory::allNames()) {
    cout << intername << "\t";
  }
  cout << " partioned (Schlegel et al.: improved, original) 16-bitV1 "
          "16-bitscalar ";
  cout << "relative-intersection-size " << endl;

  for (float ir = 1.001; ir <= 10000; ir = ir * sqrt(1.9)) {
    vector<pair<vector<uint32_t>, vector<uint32_t>>> data(howmany);
    uint32_t smallsize =
        static_cast<uint32_t>(round(static_cast<float>(1 << Big) / ir));
    cout << "#generating data...";
    cout.flush();
    for (size_t k = 0; k < howmany; ++k) {
      data[k] = uniform ? getNaivePair(udg, smallsize, 1U << MaxBit, ir,
                                       intersectionratio)
                        : getNaivePair(cdg, smallsize, 1U << MaxBit, ir,
                                       intersectionratio);
    }
    cout << "ok." << endl;
    cout << "#partitions...";
    vector<pair<vector<uint16_t>, vector<uint16_t>>> datapart(howmany);
    for (size_t k = 0; k < howmany; ++k) {
      vector<uint16_t> part1(data[k].first.size() * 4);
      size_t p1length = partition(data[k].first.data(), data[k].first.size(),
                                  part1.data(), part1.size());
      part1.resize(p1length);
      part1.shrink_to_fit();
      vector<uint16_t> part2(data[k].second.size() * 4);
      size_t p2length = partition(data[k].second.data(), data[k].second.size(),
                                  part2.data(), part2.size());
      part2.resize(p2length);
      part2.shrink_to_fit();
      datapart[k] = make_pair(part1, part2);
    }
    cout << "ok." << endl;

    cout << ir << "\t";
    float aratio = 0.0f;
    for (string intername : IntersectionFactory::allNames()) {
      intersectionfunction interfnc =
          IntersectionFactory::getFromName(intername);
      size_t volume = 0;
#ifdef LIKWID_MARKERS
      snprintf(currentMarker, sizeof(currentMarker), "%s %.2f",
               intername.c_str(), ir);
      likwid_markerStartRegion(currentMarker);
#endif
      z.reset();
      for (size_t k = 0; k < data.size(); ++k) {
        volume += (data[k].first.size() + data[k].second.size()) * loop;
        for (size_t L = 0; L < loop; ++L) {
          aratio = interfnc(data[k].first.data(), (data[k].first).size(),
                            data[k].second.data(), (data[k].second).size(),
                            buffer.data());
          bogus += aratio;
        }
      }
      cout << setw(10) << setprecision(5)
           << (volume / (static_cast<double>(z.split()))) << "\t";
#ifdef LIKWID_MARKERS
      likwid_markerStopRegion(currentMarker);
#endif
    }
    z.reset();
    size_t volume = 0;
    for (size_t k = 0; k < data.size(); ++k) {
      volume += (data[k].first.size() + data[k].second.size()) * loop;
      for (size_t L = 0; L < loop; ++L) {
        aratio = intersect_partitioned(
            datapart[k].first.data(), (datapart[k].first).size(),
            datapart[k].second.data(), (datapart[k].second).size(),
            (uint16_t *)buffer.data());
        bogus += aratio;
      }
    }
    cout << setw(10) << setprecision(5)
         << (volume / (static_cast<double>(z.split()))) << "\t";
    z.reset();
    volume = 0;
    for (size_t k = 0; k < data.size(); ++k) {
      volume += (data[k].first.size() + data[k].second.size()) * loop;
      for (size_t L = 0; L < loop; ++L) {
        aratio = original_intersect_partitioned(
            datapart[k].first.data(), (datapart[k].first).size(),
            datapart[k].second.data(), (datapart[k].second).size(),
            (uint16_t *)buffer.data());
        bogus += aratio;
      }
    }
    cout << setw(10) << setprecision(5)
         << (volume / (static_cast<double>(z.split()))) << "\t";
    z.reset();
    volume = 0;
    for (size_t k = 0; k < data.size(); ++k) {
      volume += (data[k].first.size() + data[k].second.size()) * loop;
      for (size_t L = 0; L < loop; ++L) {
        aratio = intersect_partitionedV1(
            datapart[k].first.data(), (datapart[k].first).size(),
            datapart[k].second.data(), (datapart[k].second).size(),
            (uint16_t *)buffer.data());
        bogus += aratio;
      }
    }
    cout << setw(10) << setprecision(5)
         << (volume / (static_cast<double>(z.split()))) << "\t";
    z.reset();
    volume = 0;
    for (size_t k = 0; k < data.size(); ++k) {
      volume += (data[k].first.size() + data[k].second.size()) * loop;
      for (size_t L = 0; L < loop; ++L) {
        aratio = intersect_partitionedscalar(
            datapart[k].first.data(), (datapart[k].first).size(),
            datapart[k].second.data(), (datapart[k].second).size(),
            (uint16_t *)buffer.data());
        bogus += aratio;
      }
    }
    cout << setw(10) << setprecision(5)
         << (volume / (static_cast<double>(z.split()))) << "\t";
    cout << "\t\t" << aratio / smallsize;
    cout << endl;
  }
#ifdef LIKWID_MARKERS
  likwid_markerClose();
#endif

  cout << "# bogus = " << bogus << endl;
}
int main() {
    assert(sizeof(long)==8);
    assert(sizeof(int)==4);
    WallClockTimer timer;
    int repeat = 100;
    int N = 10000;
    cout<<"# We report bits-per-integer speed-of-naive speed-of-popcnt1 speed-of-popcnt2 speed-of-table speed-of-tzcnt1 speed-of-tzcnt2 where speeds are in millions of integers per second "<<endl;
    for(int sb = 1; sb<=64; sb*=2) {
        int setbitsmax = sb*N;
        vector<long> bitmap(N);
        for (int k = 0; k < setbitsmax; ++k) {
            int bit = rand() % (N*64);
            bitmap[bit/64] |= (1L<<(bit%64));
        }
        int bitcount = 0;
        for(int k = 0; k <N; ++k) {
            bitcount += __builtin_popcountl(bitmap[k]);
        }
        double bitsperinteger = N*sizeof(long)*8.0/bitcount;
        vector<int> outputnaive(bitcount);
        vector<int> outputpopcnt1(bitcount);
        vector<int> outputpopcnt2(bitcount);
        vector<int> outputtable(bitcount);
        vector<int> outputctz1(bitcount);
        vector<int> outputctz2(bitcount);
        cout<<"# Stored "<<bitcount<<" unary numbers in  ";
        cout<< N*sizeof(long)<<" bytes " ;
        cout<<" ("<<bitsperinteger<<" bits per number)"<<endl;
        timer.reset();
        int c0 = 0;
        for(int t1=0; t1<repeat; ++t1)
            c0 = bitscanunary_naive(bitmap.data(),N,outputnaive.data());
        int tinaive = timer.split();
        timer.reset();
        int c1 = 0;
        for(int t1=0; t1<repeat; ++t1)
            c1 = bitscanunary_popcnt1(bitmap.data(),N,outputpopcnt1.data());
        assert(c1 == c0);
        int tipopcnt1 = timer.split();
        timer.reset();
        int c12 = 0;
        for(int t1=0; t1<repeat; ++t1)
            c12 = bitscanunary_popcnt2(bitmap.data(),N,outputpopcnt2.data());
        assert(c12 == c0);
        int tipopcnt2 = timer.split();
        timer.reset();
        int c2 = 0;
        for(int t1=0; t1<repeat; ++t1)
            c2 = bitscanunary_table(bitmap.data(),N,outputtable.data());
        assert(c2 == c0);
        int titable = timer.split();
        timer.reset();
        int c3 = 0;
        for(int t1=0; t1<repeat; ++t1)
            c3 = bitscanunary_ctzl1(bitmap.data(),N,outputctz1.data());
        assert(c3 == c0);
        int tictz1 = timer.split();
        timer.reset();
        int c32 = 0;
        for(int t1=0; t1<repeat; ++t1)
            c32 = bitscanunary_ctzl2(bitmap.data(),N,outputctz2.data());
        assert(c32 == c0);
        int tictz2 = timer.split();

        assert (outputnaive == outputpopcnt1);
        assert (outputnaive == outputpopcnt2);
        assert (outputnaive == outputtable);
        assert (outputnaive == outputctz1);
        assert (outputnaive == outputctz2);        
        cout << bitsperinteger<<" " ;
        cout << bitcount * repeat * 0.001 /tinaive <<" ";
        cout << bitcount * repeat * 0.001 /tipopcnt1 <<" ";
        cout << bitcount * repeat * 0.001 /tipopcnt2 <<" ";
        cout << bitcount * repeat * 0.001 /titable <<" ";
        cout << bitcount * repeat * 0.001 /tictz1 <<" ";
        cout << bitcount * repeat * 0.001 /tictz2 <<" ";
        cout << endl ;
    }

    return 0;
}
Пример #11
0
void simplebenchmark(uint32_t N = 1U << 16, uint32_t T = 1U << 9) {
    T = T + 1; // we have a warming up pass
    vector<uint32_t, cacheallocator> data = generateArray32(N);
    vector<uint32_t, cacheallocator> compressed(N, 0);
    vector<uint32_t, cacheallocator> recovered(N, 0);
    WallClockTimer z;
    double packtime, packtimewm, unpacktime;
    double simdpacktime, simdpacktimewm, simdunpacktime;
    double horizontalunpacktime;

    cout << "#million of integers per second: higher is better" << endl;
    cout << "#bit, pack, pack without mask, unpack" << endl;
    for (uint32_t bitindex = 0; bitindex < 32; ++bitindex) {
        uint32_t bit = 32 - bitindex;
        maskfnc(data, bit);
        for (uint32_t repeat = 0; repeat < 1; ++repeat) {
            packtime = 0;
            packtimewm = 0;
            unpacktime = 0;
            simdpacktime = 0;
            simdpacktimewm = 0;
            simdunpacktime = 0;
            horizontalunpacktime = 0;

            for (uint32_t t = 0; t < T; ++t) {
                compressed.clear();
                compressed.resize(N * bit / 32, 0);
                recovered.clear();
                recovered.resize(N, 0);
                simdpack(data, compressed, bit);
                simdunpack(compressed, recovered, bit);
                if (!equalOnFirstBits(data, recovered, bit)) {
                    cout << " Bugs!" << bit << endl;
                    return;
                }

                z.reset();
                simdpack(data, compressed, bit);
                if (t > 0)
                    simdpacktime += z.split();
                simdunpack(compressed, recovered, bit);
                if (!equalOnFirstBits(data, recovered, bit)) {
                    cout << " Bugs!" << bit << endl;
                    return;
                }

                z.reset();
                simdpackwithoutmask(data, compressed, bit);
                if (t > 0)
                    simdpacktimewm += z.split();

                z.reset();
                simdunpack(compressed, recovered, bit);
                if (t > 0)
                    simdunpacktime += z.split();

                if (!equalOnFirstBits(data, recovered, bit)) {
                    cout << " Bugs!" << bit << endl;
                    return;
                }

                z.reset();
                fastpack(data, compressed, bit);
                if (t > 0)
                    packtime += z.split();
                fastunpack(compressed, recovered, bit);
                if (!equalOnFirstBits(data, recovered, bit)) {
                    cout << " Bug1!" << endl;
                    return;
                }

                z.reset();
                fastpackwithoutmask(data, compressed, bit);
                if (t > 0)
                    packtimewm += z.split();

                z.reset();
                fastunpack(compressed, recovered, bit);
                if (t > 0)
                    unpacktime += z.split();

                if (!equalOnFirstBits(data, recovered, bit)) {
                    cout << " Bug1!" << endl;
                    return;
                }

                z.reset();
                horizontalunpack(compressed, recovered, bit);
                if (t > 0)
                    horizontalunpacktime += z.split();

                if (!equalOnFirstBits(data, recovered, bit)) {
                    cout << " Bug1!" << endl;
                    return;
                }

            }

            cout << std::setprecision(4) << bit << "\t\t" << N * (T - 1)
                    / (packtime) << "\t\t" << N * (T - 1) / (packtimewm)
                    << "\t\t\t" << N * (T - 1) / (unpacktime) << "\t\t";

            cout << std::setprecision(4) << bit << "\t\t" << N * (T - 1)
                    / (simdpacktime) << "\t\t" << N * (T - 1)
                    / (simdpacktimewm) << "\t\t" << N * (T - 1)
                    / (simdunpacktime) << "\t\t";
            cout<< N * (T - 1)
                            / (horizontalunpacktime) << "\t\t";


            cout << endl;
        }

    }

}
// Benchmarks Helper::unpack against Helper::iunpack for every bit width from
// 1 to 32.
//
// N: number of integers in the test array.
// T: number of timed passes; one extra untimed warm-up pass is added.
//
// For each bit width a reference array is generated, packed with
// Helper::pack, and unpacked by both routines; each result is checked with
// equalOnFirstBits and the function returns early on a mismatch. One row of
// figures, N * (T - 1) divided by each accumulated time, is printed per bit
// width.
//
// Throws logic_error if needPaddingTo128Bits reports any of the working
// buffers as needing padding.
void simplebenchmark(uint32_t N = 1U << 16, uint32_t T = 1U << 9) {
  T = T + 1; // we have a warming up pass
  uint32_t bogus = 0;
  vector<uint32_t> data(N);
  vector<uint32_t> compressed(N);
  vector<uint32_t> icompressed(N);
  vector<uint32_t> recovered(N);
  WallClockTimer z;
  double unpacktime;
  double iunpacktime;

  cout << "#million of integers per second: higher is better" << endl;
  cout << "#bit,  unpack,iunpack" << endl;

  for (uint32_t bitindex = 0; bitindex < 32; ++bitindex) {
    uint32_t bit = bitindex + 1;
    // Input made of runs of four equal values, each run adding random(bit)
    // to the previous one.
    // NOTE(review): the index advances by 4 while the bound is
    // 4 * i < size, so only about the first quarter of the array is
    // written and the rest stays zero -- confirm this is intended.
    vector<uint32_t> initdata(N);
    for (size_t i = 0; 4 * i < data.size(); i += 4) {
      initdata[i] = random(bit) + (i >= 4 ? initdata[i - 4] : 0);
      for (size_t j = 1; j < 4; ++j) {
        initdata[i + j] = initdata[i];
      }
    }

    // Freeze the input as the reference and release the scratch vector.
    const vector<uint32_t> refdata = initdata;
    vector<uint32_t>().swap(initdata);

    icompressed.clear();
    // 4 * N should be enough for all  schemes
    icompressed.resize(4 * N, 0);
    compressed.clear();
    // 4 * N should be enough for all  schemes
    compressed.resize(4 * N, 0);
    recovered.clear();
    recovered.resize(N, 0);

    if (needPaddingTo128Bits(recovered.data())) {
      throw logic_error("Array is not aligned on 128 bit boundary!");
    }
    if (needPaddingTo128Bits(icompressed.data())) {
      throw logic_error("Array is not aligned on 128 bit boundary!");
    }
    if (needPaddingTo128Bits(compressed.data())) {
      throw logic_error("Array is not aligned on 128 bit boundary!");
    }
    if (needPaddingTo128Bits(refdata.data())) {
      throw logic_error("Array is not aligned on 128 bit boundary!");
    }

    for (uint32_t repeat = 0; repeat < 1; ++repeat) {

      unpacktime = 0;

      iunpacktime = 0;

      // T already includes the warm-up pass, so the loop must stop at
      // t < T: that yields exactly T - 1 timed passes, matching the
      // N * (T - 1) numerator used below. The former "t <= T" timed one
      // pass too many and under-reported every figure.
      for (uint32_t t = 0; t < T; ++t) {

        assert(data.size() == refdata.size());
        fill(icompressed.begin(), icompressed.end(), 0);
        fill(recovered.begin(), recovered.end(), 0);
        memcpy(data.data(), refdata.data(),
               data.size() * sizeof(uint32_t)); // memcpy can be slow
        Helper::pack(data.data(), data.size(), icompressed.data(), bit);
        z.reset();
        Helper::unpack(icompressed.data(), refdata.size(), recovered.data(),
                       bit);
        if (t > 0) // we don't count the first run
          unpacktime += static_cast<double>(z.split());
        if (!equalOnFirstBits(refdata, recovered, bit)) {
          cout << " Bug 1a " << bit << endl;
          return;
        }
        // Restore the input and pack it again before timing the second
        // routine.
        memcpy(data.data(), refdata.data(),
               data.size() * sizeof(uint32_t)); // memcpy can be slow
        Helper::pack(data.data(), data.size(), icompressed.data(), bit);

        z.reset();
        Helper::iunpack(icompressed.data(), refdata.size(), recovered.data(),
                        bit);
        if (t > 0) // we don't count the first run
          iunpacktime += static_cast<double>(z.split());
        if (!equalOnFirstBits(refdata, recovered, bit)) {
          cout << " Bug 2 " << bit << endl;
          return;
        }
      }

      cout << std::setprecision(4) << bit << "\t\t";
      cout << "\t\t" << N * (T - 1) / (unpacktime) << "\t\t";

      cout << "\t\t" << N * (T - 1) / (iunpacktime);

      cout << endl;
    }
  }
  cout << "# ignore this " << bogus << endl;
}