/*
 * Run the full cascade of single-step dedispersion passes, with the
 * independent blocks of each pass spread over OpenMP threads.
 *
 *   inin, outout : two arrays of nchan row pointers, used as ping-pong
 *                  buffers (each pass reads one and writes the other).
 *   nchan        : number of rows; also the block size of the first pass.
 *   ndat         : per-row sample count forwarded to dedisperse_kernel.
 *
 * get_npass(nchan) passes are performed.  A pass calls dedisperse_kernel on
 * consecutive blocks of bs rows, after which bs is halved and the two
 * buffers trade roles.  On exit the most recently written buffer is copied
 * into the other one, so the result is available through both.
 */
void dedisperse_single(float **inin, float **outout, int nchan,int ndat)
{
  int npass=get_npass(nchan);
  int bs=nchan;
  float **in=inin;
  float **out=outout;

  for (int i=0;i<npass;i++) {
    //blocks within one pass touch disjoint row ranges, so they can run
    //concurrently; passes themselves stay sequential.
#pragma omp parallel for
    for (int j=0;j<nchan;j+=bs) {
      dedisperse_kernel(in+j,out+j,bs,ndat);
    }
    bs/=2;
    float **tmp=in;
    in=out;
    out=tmp;
  }

  //after the final swap "in" is the buffer that was written last.
  //The element count is widened to size_t before multiplying: nchan*ndat
  //evaluated in int overflows for large inputs (e.g. 8192 x 327680).
  //NOTE(review): this copy assumes each buffer is one contiguous run of
  //nchan*ndat floats starting at row 0 - confirm against the allocator.
  memcpy(out[0],in[0],(size_t)nchan*(size_t)ndat*sizeof(float));
}
/*
 * Run the dedispersion cascade two passes at a time, using
 * dedisperse_block_kernel_2pass, and finish with one single-step pass when
 * the total depth is odd.  Blocks inside each stage are spread over OpenMP
 * threads.
 *
 *   inin, outout : two arrays of nchan row pointers, used as ping-pong
 *                  buffers.
 *   nchan        : number of rows; also the block size of the first stage.
 *   ndat         : per-row sample count forwarded to the kernels.
 *
 * The total depth is get_npass(nchan).  After each two-pass stage the block
 * size is divided by 4 and the buffers trade roles.  On exit the most
 * recently written buffer is copied into the other one, so the result is
 * available through both.
 */
void dedisperse_dual(float **inin, float **outout, int nchan,int ndat)
{
  int npass=get_npass(nchan);
  int bs=nchan;
  float **in=inin;
  float **out=outout;

  //the npass-1 bound stops the paired stages in time to hand the final
  //pass to the single-step kernel in the event of an odd depth.
  for (int i=0;i<npass-1;i+=2) {
#pragma omp parallel for
    for (int j=0;j<nchan;j+=bs) {
      dedisperse_block_kernel_2pass((const float **)(in+j),out+j,bs,ndat);
    }
    bs/=4;
    float **tmp=in;
    in=out;
    out=tmp;
  }

  if (npass%2==1) {
    //do a single step if we come in with odd depth
#pragma omp parallel for
    for (int j=0;j<nchan;j+=bs)
      dedisperse_kernel(in+j,out+j,bs,ndat);
    float **tmp=in;
    in=out;
    out=tmp;
  }

  //after the final swap "in" is the buffer that was written last.
  //The element count is widened to size_t before multiplying: nchan*ndat
  //evaluated in int overflows for large inputs (e.g. 8192 x 327680).
  //NOTE(review): this copy assumes each buffer is one contiguous run of
  //nchan*ndat floats starting at row 0 - confirm against the allocator.
  memcpy(out[0],in[0],(size_t)nchan*(size_t)ndat*sizeof(float));
}
/*
 * Run the full cascade of single-step dedispersion passes on the calling
 * thread only (no OpenMP pragma is active in this variant).
 *
 *   inin, outout : two arrays of nchan row pointers, used as ping-pong
 *                  buffers (each pass reads one and writes the other).
 *   nchan        : number of rows; also the block size of the first pass.
 *   ndat         : per-row sample count forwarded to dedisperse_kernel.
 *
 * get_npass(nchan) passes are performed.  A pass calls dedisperse_kernel on
 * consecutive blocks of bs rows, after which bs is halved and the two
 * buffers trade roles.  On exit the most recently written buffer is copied
 * into the other one, so the result is available through both.
 */
void dedisperse(float **inin, float **outout, int nchan,int ndat)
{
  int npass=get_npass(nchan);
  int bs=nchan;
  float **in=inin;
  float **out=outout;

  for (int i=0;i<npass;i++) {
    for (int j=0;j<nchan;j+=bs) {
      dedisperse_kernel(in+j,out+j,bs,ndat);
    }
    bs/=2;
    float **tmp=in;
    in=out;
    out=tmp;
  }

  //after the final swap "in" is the buffer that was written last.
  //The element count is widened to size_t before multiplying: nchan*ndat
  //evaluated in int overflows for large inputs (e.g. 8192 x 327680).
  //NOTE(review): this copy assumes each buffer is one contiguous run of
  //nchan*ndat floats starting at row 0 - confirm against the allocator.
  memcpy(out[0],in[0],(size_t)nchan*(size_t)ndat*sizeof(float));
}
/*--------------------------------------------------------------------------------*/
/*
 * Two explicit levels of the cascade, written out by hand.
 *
 * Level one runs dedisperse_kernel over all nchan rows, dat -> dat2.
 * Level two runs it separately on the lower and upper halves (nchan/2 rows
 * each), dat2 -> dat.  The argument order (first buffer, second buffer) is
 * the same one the other callers in this file use for (input, output).
 */
void dedisperse_2pass(float **dat, float **dat2, int nchan, int ndat)
{
  const int half=nchan/2;

  dedisperse_kernel(dat,dat2,nchan,ndat);

  //lower half first (offset 0), then upper half (offset half)
  for (int part=0;part<2;part++) {
    const int off=part*half;
    dedisperse_kernel(dat2+off,dat+off,half,ndat);
  }
}
/*
 * Benchmark driver for dedisperse_blocked_cached.
 *
 * Usage: prog [nchan [ndat [nrep]]]
 *   nchan : number of rows      (default 1024)
 *   ndat  : samples per row     (default 327680)
 *   nrep  : calls per timed run (default 1)
 *
 * Two nchan x (ndat+nchan) buffers are requested from matrix(), a test
 * impulse is written into each row of the first, and then:
 *   1. one call is timed, followed by a timed find_peak;
 *   2. ten further runs of nrep calls each are timed, printing the elapsed
 *      time and get_npass(nchan)*nchan*ndat*nrep per second in units of 2^20.
 *
 * Returns 0 on success, 1 when an argument is not a positive integer or a
 * buffer could not be obtained.
 */
int main(int argc, char *argv[])
{
  int nchan=1024;
  int ndat=327680;
  int nrep=1;

  if (argc>1)
    nchan=atoi(argv[1]);
  if (argc>2)
    ndat=atoi(argv[2]);
  if (argc>3)
    nrep=atoi(argv[3]);

  //atoi yields 0 for non-numeric text, and zero or negative sizes make no
  //sense for the allocations and loops below, so reject them up front.
  if (nchan<=0 || ndat<=0 || nrep<=0) {
    fprintf(stderr,"usage: %s [nchan [ndat [nrep]]]  (all values must be positive integers)\n",
            (argc>0)?argv[0]:"prog");
    return 1;
  }

  const int ncol=ndat+nchan;
  float **dat=matrix(nchan,ncol);
  float **dat2=matrix(nchan,ncol);
  if (dat==NULL || dat2==NULL) {
    fprintf(stderr,"failed to allocate %d x %d buffers.\n",nchan,ncol);
    return 1;
  }

  //test-pattern selector: nonzero puts the impulse on a sloped track
  //across rows, zero puts it at column ndat/2 in every row.
  const int sloped_track=1;
  if (sloped_track) {
    for (int i=0;i<nchan;i++) {
      int col=(int)(0.8317*i+160.2);
      //for small ndat the track can run past the requested row width;
      //skip those rows instead of writing out of bounds.
      //NOTE(review): assumes matrix(n,m) provides rows of m floats - confirm.
      if (col<ncol)
        dat[i][col]=1;
    }
  }
  else {
    for (int i=0;i<nchan;i++)
      dat[i][ndat/2]=1;
  }

#if 0
  write_mat(dat,nchan,ndat,"dat_starting.dat");
  dedisperse_kernel(dat,dat2,nchan,ndat);
  write_mat(dat2,nchan,ndat,"dat_1pass.dat");
  dedisperse_2pass(dat,dat2,nchan,ndat);
  write_mat(dat,nchan,ndat,"dat_2pass.dat");
#endif

  //first call, timed on its own
  double t1=omp_get_wtime();
  dedisperse_blocked_cached(dat,dat2,nchan,ndat);
  double t2=omp_get_wtime();
  printf("took %12.4f seconds.\n",t2-t1);

  int ichan=0,idat=0;
  find_peak(dat,nchan,ndat,&ichan,&idat);
  t1=omp_get_wtime();
  printf("took %12.4f seconds to find peak.\n",t1-t2);

  //repeated timing runs
  const int ntrial=10;
  for (int i=0;i<ntrial;i++) {
    t1=omp_get_wtime();
    for (int j=0;j<nrep;j++) {
      dedisperse_blocked_cached(dat,dat2,nchan,ndat);
    }
    t2=omp_get_wtime();
    double nops=get_npass(nchan)*(nchan+0.0)*(ndat+0.0)*(nrep+0.0);
    printf("took %12.6f seconds at rate %12.6f.\n",t2-t1,nops/(t2-t1)/1024/1024);
  }

  return 0;
}