int main(int argc, char* argv[]) {

  // parse flags
  gflags::SetUsageMessage("This script builds a database for a given dataset of vectors\n"
        "Usage:\n"
        "    tool_createdb --c1 4 --c2 4 --p 2 --basename \"tmp\""
        " You should convert the fvecs beforehand using the accompanying convert script\n");
  gflags::SetVersionString("1.0.0");
  gflags::ParseCommandLineFlags(&argc, &argv, true);

  // select cuda device
  cudaSetDevice(FLAGS_device);
  cudaSetDeviceFlags (cudaDeviceMapHost);

  const string preName  = FLAGS_basename + "_" + std::to_string(FLAGS_dim) + "_" + std::to_string(FLAGS_p)
                        + "_" + std::to_string(FLAGS_c1) + "_" + std::to_string(FLAGS_c2);


  // read in dataset
  FileReader<float> DataReader = FileReader<float>(FLAGS_dataset);
  EXPECT_EQ((uint) FLAGS_dim, DataReader.dim());
  arr<float> data((int) FLAGS_chunksize * FLAGS_dim);
  data.mallocHost();
  data.mallocDevice();
  data.host = DataReader.data(FLAGS_chunksize);
  data.toDevice();

  // building the codebook
  // ==============================================================================================
  int k = 16;
  PerturbationProTree ppt(FLAGS_dim, FLAGS_p, FLAGS_p);
  const string codebook_file = preName + ".ppqt";

  ppt.createTree(FLAGS_c1, FLAGS_c2, data.device, 20000);
  ppt.writeTreeToFile(codebook_file);

  

  const string lineName   = preName + "_" + std::to_string(FLAGS_lineparts) + ".lines";
  const string prefixName = preName + ".prefix";
  const string countsName = preName + ".count";
  const string dbIdxName  = preName + ".dbIdx";

  const uint chunkMax = DataReader.num() / FLAGS_chunksize;
  const uint data_num = chunkMax * FLAGS_chunksize;

  uint* binPrefix = new uint[FLAGS_hashsize];
  uint* binCounts = new uint[FLAGS_hashsize];
  uint* dbIdx = new uint[data_num];

  // building the data base
  // ==============================================================================================
  uint* dbIdxSave = new uint[data_num];

  memset(binPrefix, 0, FLAGS_hashsize * sizeof(uint));
  memset(binCounts, 0, FLAGS_hashsize * sizeof(uint));
  memset(dbIdx, 0, data_num * sizeof(uint));

  uint* chBinPrefix = new uint[FLAGS_hashsize];
  uint* chBinCounts = new uint[FLAGS_hashsize];
  uint* chDBIdx     = new uint[FLAGS_chunksize];
  float* chLines    = new float[FLAGS_chunksize * FLAGS_lineparts];


  ppt.buildKBestDB(data.device, FLAGS_chunksize);
  ppt.lineDist(data.device, FLAGS_chunksize);

  // GPU -> CPU memory
  SAFE_CUDA_CALL(cudaMemcpy(chBinPrefix, ppt.getBinPrefix(), FLAGS_hashsize * sizeof(uint),  cudaMemcpyDeviceToHost));
  SAFE_CUDA_CALL(cudaMemcpy(chBinCounts, ppt.getBinCounts(), FLAGS_hashsize * sizeof(uint),  cudaMemcpyDeviceToHost));
  SAFE_CUDA_CALL(cudaMemcpy(chDBIdx,     ppt.getDBIdx(),     FLAGS_chunksize * sizeof(uint), cudaMemcpyDeviceToHost));
  SAFE_CUDA_CALL(cudaMemcpy(chLines,     ppt.getLine(),      FLAGS_chunksize * FLAGS_lineparts * sizeof(float), cudaMemcpyDeviceToHost));

  ofstream fLines(lineName.c_str(), std::ofstream::out | std::ofstream::binary);
  fLines.write((char*) chLines, FLAGS_chunksize * FLAGS_lineparts * sizeof(float));
  fLines.close();
  cout << "written " << lineName << endl;

  // prefixSum for bin-idx
  ofstream fprefix(prefixName.c_str(), std::ofstream::out | std::ofstream::binary);
  fprefix.write((char*) binPrefix, FLAGS_hashsize * sizeof(uint));
  fprefix.close();
  cout << "written " << prefixName << endl;

  // size of non-empty bins
  ofstream fcounts(countsName.c_str(), std::ofstream::out | std::ofstream::binary);
  fcounts.write((char*) binCounts, FLAGS_hashsize * sizeof(uint));
  fcounts.close();
  cout << "written " << countsName << endl;
  cout << "size: " << (FLAGS_hashsize * sizeof(uint)) << endl;

  // for each bin the ids of containing vectors
  ofstream fdb(dbIdxName.c_str(), std::ofstream::out | std::ofstream::binary);
  fdb.write((char*) dbIdx, data_num * sizeof(uint));
  fdb.close();
  cout << "written " << dbIdxName << endl;

  if (data.device)
    cudaFree(data.device);
  delete[] data.host;


  gflags::ShutDownCommandLineFlags();
  return 0;

}
int main(int argc, char* argv[]) {

  // parse flags
  gflags::SetUsageMessage("This script builds a database for a given dataset of vectors\n"
                          "Usage:\n"
                          "    tool_createdb --c1 4 --c2 4 --p 2 --basename \"tmp\""
                          " You should convert the fvecs beforehand using the accompanying convert script\n");
  gflags::SetVersionString("1.0.0");
  gflags::ParseCommandLineFlags(&argc, &argv, true);

  // select cuda device
  cudaSetDevice(FLAGS_device);
  cudaSetDeviceFlags (cudaDeviceMapHost);

  const string preName  = FLAGS_basename + "_" + std::to_string(FLAGS_dim) + "_" + std::to_string(FLAGS_p)
                          + "_" + std::to_string(FLAGS_c1) + "_" + std::to_string(FLAGS_c2);


  FileReader<float> DataReader = FileReader<float>(FLAGS_dataset);

  FileReader<float> QueryReader = FileReader<float>(FLAGS_queryset);
  EXPECT_EQ((uint) FLAGS_dim, QueryReader.dim());
  arr<float> query = arr<float>(FLAGS_chunksize * FLAGS_dim);
  query.mallocHost();
  query.mallocDevice();
  query.host = QueryReader.data(FLAGS_chunksize);
  query.toDevice();


  PerturbationProTree ppt(FLAGS_dim, FLAGS_p, FLAGS_p);
  const string codebook_file = preName + ".ppqt";

  if (!file_exists(codebook_file)) {
    cout << "you need to generate a codebook first. No codebook found in " << codebook_file << endl;
    return 1;
  } else {
    cout << "codebook exists, reading from " << codebook_file << endl;
    ppt.readTreeFromFile(codebook_file);
  }

  const uint base_num = DataReader.num();

  const string lineName   = preName + "_" + std::to_string(FLAGS_lineparts) + ".lines";
  const string prefixName = preName + ".prefix";
  const string countsName = preName + ".count";
  const string dbIdxName  = preName + ".dbIdx";

  const uint chunkMax = base_num / FLAGS_chunksize;
  const uint data_num = chunkMax * FLAGS_chunksize;

  uint* binPrefix = new uint[FLAGS_hashsize];
  uint* binCounts = new uint[FLAGS_hashsize];
  uint* dbIdx     = new uint[data_num];

  // read data base
  ifstream fprefix(prefixName.c_str(), std::ifstream::in | std::ofstream::binary);
  fprefix.read((char*) binPrefix, FLAGS_hashsize * sizeof(uint));
  fprefix.close();
  cout << "read " << prefixName << endl;

  ifstream fcounts(countsName.c_str(), std::ofstream::in | std::ofstream::binary);
  fcounts.read((char*) binCounts, HASH_SIZE * sizeof(uint));
  fcounts.close();
  cout << "read " << countsName << endl;

  size_t nfloats = DataReader.num();
  nfloats *= FLAGS_lineparts;

  float* hLines = nullptr;
  float* dLines;

  cudaHostAlloc((void **) &hLines, nfloats * sizeof(float), cudaHostAllocMapped);
  cudaHostGetDevicePointer((void **) &dLines, (void *) hLines, 0);
  if (!hLines) {
    cerr << " did not get hLine memory " << endl;
    exit(1);
  }

  ifstream fdb(dbIdxName.c_str(), std::ifstream::in | std::ofstream::binary);
  fdb.read((char*) dbIdx, base_num * sizeof(uint));
  fdb.close();
  cout << "read " << dbIdxName << endl;


  ppt.setDB(base_num, binPrefix, binCounts, dbIdx);

  // query
  vector<uint> resIdx;
  vector<float> resDist;
  
  for (int idxA = 0; idxA < FLAGS_chunksize; idxA += 4096) {
    const int len = min(4096, (int)(FLAGS_chunksize - idxA));
    ppt.queryKNN(resIdx, resDist, query.device + 4096 * idxA * FLAGS_dim, len, 4096);
    for (int r = 0; r < len; ++r) {
      const int queryVectorId = idxA*4096 + r;
      const int bestfoundBaseVectorId = resIdx[4096*r];
      const int secondbestfoundBaseVectorId = resIdx[4096*r+1];
    }
  }

  gflags::ShutDownCommandLineFlags();
  return 0;

}