/*
 * Run dedisperse_blocked_cached() on the data held in `dat` and copy the rows
 * of dat->data into `outdata`.
 *
 * dat     - supplies the row pointers dat->data (nchan rows, each read for
 *           ndata samples) and the dimensions nchan and ndata.
 *           NOTE(review): the rows of dat->data are what end up in outdata,
 *           so the kernel presumably leaves its result in its first
 *           argument -- confirm against dedisperse_blocked_cached().
 * outdata - caller-owned buffer of at least nchan*ndata floats.  It is
 *           zeroed, lent to the kernel as its second (row-pointer) argument,
 *           and finally overwritten row by row from dat->data.
 *
 * Returns without touching outdata when nchan or ndata is not positive.  If
 * the temporary row-pointer table cannot be allocated, a message is printed
 * to stderr and outdata is likewise left untouched.
 */
void dedisperse_gbt_jon(Data *dat, float *outdata)
{
  /* Nothing to do for an empty array; also keeps the size_t casts below
     from turning a negative dimension into a huge length. */
  if (dat->nchan<=0 || dat->ndata<=0)
    return;

  /* Do all offset arithmetic in size_t so nchan*ndata cannot overflow int. */
  const size_t nchan=(size_t)dat->nchan;
  const size_t ndata=(size_t)dat->ndata;

  /* Row-pointer view onto the flat outdata buffer: row i starts at i*ndata. */
  float **tmp=(float **)malloc(nchan*sizeof(*tmp));
  if (tmp==NULL) {
    fprintf(stderr,"dedisperse_gbt_jon: failed to allocate %zu row pointers\n",nchan);
    return;
  }
  for (size_t i=0;i<nchan;i++)
    tmp[i]=outdata+i*ndata;

  /* Zero the whole buffer before handing it to the kernel. */
  memset(outdata,0,nchan*ndata*sizeof(*outdata));

  dedisperse_blocked_cached(dat->data,tmp,dat->nchan,dat->ndata);

  /* Copy the first ndata samples of every row of dat->data into outdata. */
  for (size_t i=0;i<nchan;i++)
    memcpy(outdata+i*ndata,dat->data[i],ndata*sizeof(*outdata));

  free(tmp);
}
/*
 * Benchmark driver for dedisperse_blocked_cached().
 *
 * Usage: <prog> [nchan [ndat [nrep]]]
 *   nchan - first matrix dimension, number of channels (default 1024)
 *   ndat  - samples per channel handed to the kernel   (default 327680)
 *   nrep  - kernel calls per timed trial               (default 1)
 *
 * Builds a test array with one unit impulse per channel, times a single
 * dedisperse_blocked_cached() call followed by find_peak(), then runs ten
 * timed trials of nrep kernel calls each and prints the elapsed time and a
 * rate for every trial.
 *
 * Returns 0 on success, 1 on invalid arguments or allocation failure.
 */
int main(int argc, char *argv[])
{
  int nchan=1024;
  int ndat=327680;
  int nrep=1;

  if (argc>1)
    nchan=atoi(argv[1]);
  if (argc>2)
    ndat=atoi(argv[2]);
  if (argc>3)
    nrep=atoi(argv[3]);

  /* atoi() yields 0 for non-numeric text, so this also rejects garbage. */
  if (nchan<=0 || ndat<=0 || nrep<=0) {
    fprintf(stderr,"usage: %s [nchan [ndat [nrep]]]  (all values must be positive integers)\n",argv[0]);
    return 1;
  }

  /* Each row is requested with nchan extra samples beyond ndat. */
  const int rowlen=ndat+nchan;
  float **dat=matrix(nchan,rowlen);
  float **dat2=matrix(nchan,rowlen);
  /* NOTE(review): matrix() is defined elsewhere; the NULL check is defensive
     in case it reports failure by returning NULL. */
  if (dat==NULL || dat2==NULL) {
    fprintf(stderr,"failed to allocate %d x %d work arrays\n",nchan,rowlen);
    return 1;
  }

  /* Test signal: one unit impulse per channel along a sloped track.  For
     small nchan/ndat the track can run past the end of a row, so channels
     whose impulse would fall outside the requested row length are skipped
     (assumes matrix(n,m) provides m samples per row -- confirm). */
  for (int i=0;i<nchan;i++) {
    int idx=(int)(0.8317*i+160.2);
    if (idx<rowlen)
      dat[i][idx]=1;
  }

#if 0
  write_mat(dat,nchan,ndat,"dat_starting.dat");
  dedisperse_kernel(dat,dat2,nchan,ndat);
  write_mat(dat2,nchan,ndat,"dat_1pass.dat");
  dedisperse_2pass(dat,dat2,nchan,ndat);
  write_mat(dat,nchan,ndat,"dat_2pass.dat");
#endif

  /* Single timed kernel call, then a timed peak search on dat. */
  double t1=omp_get_wtime();
  dedisperse_blocked_cached(dat,dat2,nchan,ndat);
  double t2=omp_get_wtime();
  printf("took %12.4f seconds.\n",t2-t1);

  int ichan=0,idat=0;
  find_peak(dat,nchan,ndat,&ichan,&idat);
  t1=omp_get_wtime();
  printf("took %12.4f seconds to find peak.\n",t1-t2);

  /* Ten timed trials of nrep kernel calls each. */
  for (int i=0;i<10;i++) {
    t1=omp_get_wtime();
    for (int j=0;j<nrep;j++)
      dedisperse_blocked_cached(dat,dat2,nchan,ndat);
    t2=omp_get_wtime();

    /* Work estimate: get_npass(nchan) * nchan * ndat * nrep; the printed
       rate is that count per second divided by 1024*1024. */
    double nops=get_npass(nchan)*(nchan+0.0)*(ndat+0.0)*(nrep+0.0);
    printf("took %12.6f seconds at rate %12.6f.\n",t2-t1,nops/(t2-t1)/1024/1024);
  }

  return 0;
}