/*
 * Lagged dedispersion driver.
 *
 * Runs get_npass(nchan) passes. In each pass dedisperse_kernel_lagged() is
 * called once per block of `width` channels (blocks are processed by an
 * OpenMP parallel for); afterwards the two row-pointer arrays swap roles and
 * the block width is halved.
 *
 * inin, outout: row-pointer arrays with nchan rows each. The asserts require
 *               rows 0 and 1 of each array to be at least nchan + ndat - 1
 *               floats apart.
 * nchan:        number of channels; asserted to be a power of two and >= 2.
 * ndat:         asserted to be >= 1.
 *
 * After the last pass the rows written most recently are copied into the
 * other array, row j receiving ndat + j floats.
 */
void dedisperse_lagged(float **inin, float **outout, int nchan, int ndat)
{
    assert(nchan >= 2);
    assert(ndat >= 1);

    // detects underallocation, in the common case where inin was allocated with matrix()
    assert(inin[1] - inin[0] >= nchan + ndat - 1);
    assert(outout[1] - outout[0] >= nchan + ndat - 1);

    int npass = get_npass(nchan);
    assert(nchan == (1 << npass));   // currently require nchan to be a power of two

    float **src = inin;
    float **dst = outout;
    int width = nchan;   // channels per block in the current pass

    int pass = 0;
    while (pass < npass) {
#pragma omp parallel for
        for (int first = 0; first < nchan; first += width) {
            int block = first / width;
            dedisperse_kernel_lagged(src + first, dst + first, width, ndat + block, block);
        }

        // the rows just written become the input of the next pass
        float **swap = src;
        src = dst;
        dst = swap;

        width /= 2;
        pass++;
    }

    // non-rectangular copy: row `row` carries ndat + row samples
    for (int row = 0; row < nchan; row++) {
        size_t nbytes = (ndat + row) * sizeof(float);
        memcpy(dst[row], src[row], nbytes);
    }
}
/*
 * Dedispersion driver using the single-pass kernel with OpenMP.
 *
 * Runs get_npass(nchan) passes. Each pass calls dedisperse_kernel() once per
 * block of bs channels (the block loop is an OpenMP parallel for), then halves
 * bs and swaps the two row-pointer arrays.
 *
 * inin, outout: row-pointer arrays with nchan rows each.
 * nchan, ndat:  passed through to the kernel as channel count per block and
 *               sample count.
 *
 * Finally nchan*ndat floats are copied, starting at row 0, from the array
 * written last into the other one. NOTE(review): a single memcpy from row 0
 * presumably relies on the rows being contiguous with stride ndat -- confirm
 * against matrix().
 */
void dedisperse_single(float **inin, float **outout, int nchan,int ndat)
{
  int npass=get_npass(nchan);
  int bs=nchan;
  float **in=inin;
  float **out=outout;

  for (int i=0;i<npass;i++) {
#pragma omp parallel for
    for (int j=0;j<nchan;j+=bs) {
      dedisperse_kernel(in+j,out+j,bs,ndat);
    }
    bs/=2;
    float **tmp=in;
    in=out;
    out=tmp;
  }

  // Do the size arithmetic in size_t: nchan*ndat evaluated as int overflows
  // (undefined behaviour) once the product exceeds INT_MAX.
  memcpy(out[0],in[0],(size_t)nchan*(size_t)ndat*sizeof(float));
}
/*--------------------------------------------------------------------------------*/
/*
 * Two-stage blocked dedispersion.
 *
 * Stage 1: dedisperse() is run on each group of nchan1 (64) consecutive
 *          channels, from dat into dat2.
 * Shuffle: row i*nchan1+j of dat, advanced by i*j samples, is copied into row
 *          j*nblock+i of dat2.
 * Stage 2: dedisperse() is run on each group of nchan2 = nchan/nchan1
 *          consecutive rows, from dat2 into dat.
 *
 * dat, dat2:   row-pointer arrays with nchan rows each.
 * nchan:       must be at least nchan1, because nchan2 is used as a divisor;
 *              presumably also a multiple of nchan1 -- TODO confirm.
 * ndat:        samples per row handed to dedisperse().
 */
void dedisperse_blocked(float **dat, float **dat2, int nchan, int ndat)
{
  int nchan1=64;
  int nchan2=nchan/nchan1;

  // nchan2==0 would make the nblock2 computation divide by zero
  assert(nchan>=nchan1);

  int nblock=nchan/nchan1;
  int nblock2=nchan/nchan2;

  for (int i=0;i<nblock;i++)
    dedisperse(dat+i*nchan1,dat2+i*nchan1,nchan1,ndat);

  for (int i=0;i<nblock;i++)
    for (int j=0;j<nchan1;j++) {
      int lag=i*j;
      // a lag of ndat or more leaves nothing to copy; a negative count would
      // turn into a huge size_t
      if (lag>=ndat)
        continue;
      // memcpy takes a byte count, so the sample count is scaled by sizeof(float)
      memcpy(dat2[j*nblock+i],dat[i*nchan1+j]+lag,(size_t)(ndat-lag)*sizeof(float));
    }

  for (int i=0;i<nblock2;i++)
    dedisperse(dat2+i*nchan2,dat+i*nchan2,nchan2,ndat);
}
/*
 * Dedispersion driver built on the two-pass block kernel.
 *
 * The main loop advances two passes at a time: it calls
 * dedisperse_block_kernel_2pass() once per block of bs channels (OpenMP
 * parallel for), divides bs by 4 and swaps the two row-pointer arrays. When
 * get_npass(nchan) is odd, one extra pass with dedisperse_kernel() follows.
 *
 * inin, outout: row-pointer arrays with nchan rows each.
 * nchan, ndat:  channel and sample counts handed to the kernels.
 *
 * Finally nchan*ndat floats are copied, starting at row 0, from the array
 * written last into the other one. NOTE(review): this presumably relies on
 * the rows being contiguous with stride ndat -- confirm against matrix().
 */
void dedisperse_dual(float **inin, float **outout, int nchan,int ndat)
{
  int npass=get_npass(nchan);
  int bs=nchan;
  float **in=inin;
  float **out=outout;

  //the npass-1 is so that we stop in time to hand the final pass to
  //the single-step kernel in the event of an odd depth.
  for (int i=0;i<npass-1;i+=2) {
#pragma omp parallel for
    for (int j=0;j<nchan;j+=bs) {
      dedisperse_block_kernel_2pass((const float **)(in+j),out+j,bs,ndat);
    }
    bs/=4;
    float **tmp=in;
    in=out;
    out=tmp;
  }

  if (npass%2==1) {
    //do a single step if we come in with odd depth
#pragma omp parallel for
    for (int j=0;j<nchan;j+=bs)
      dedisperse_kernel(in+j,out+j,bs,ndat);
    float **tmp=in;
    in=out;
    out=tmp;
  }

  // Do the size arithmetic in size_t: nchan*ndat evaluated as int overflows
  // (undefined behaviour) once the product exceeds INT_MAX.
  memcpy(out[0],in[0],(size_t)nchan*(size_t)ndat*sizeof(float));
}
/*
 * Serial dedispersion driver.
 *
 * Runs get_npass(nchan) passes. Each pass calls dedisperse_kernel() once per
 * block of bs channels, then halves bs and swaps the two row-pointer arrays.
 * The block loop carries no OpenMP pragma here; this function is itself
 * called from inside parallel regions elsewhere in this file.
 *
 * inin, outout: row-pointer arrays with nchan rows each.
 * nchan, ndat:  channel and sample counts handed to the kernel.
 *
 * Finally nchan*ndat floats are copied, starting at row 0, from the array
 * written last into the other one. NOTE(review): this presumably relies on
 * the rows being contiguous with stride ndat -- confirm against matrix().
 */
void dedisperse(float **inin, float **outout, int nchan,int ndat)
{
  int npass=get_npass(nchan);
  int bs=nchan;
  float **in=inin;
  float **out=outout;

  for (int i=0;i<npass;i++) {
    for (int j=0;j<nchan;j+=bs) {
      dedisperse_kernel(in+j,out+j,bs,ndat);
    }
    bs/=2;
    float **tmp=in;
    in=out;
    out=tmp;
  }

  // Do the size arithmetic in size_t: nchan*ndat evaluated as int overflows
  // (undefined behaviour) once the product exceeds INT_MAX.
  memcpy(out[0],in[0],(size_t)nchan*(size_t)ndat*sizeof(float));
}
/*
 * In-place dedispersion over a single row-pointer array.
 *
 * Forces a fixed OpenMP team of OMP_THREADS threads, then runs
 * get_npass(nchan) passes. Every pass refreshes the channel map with
 * generate_shift_group() and combines nchan/2 row pairs in parallel; radix
 * doubles after each pass. Once all passes are done the rows are put back in
 * order with fast_unshuffle().
 *
 * inin:  row-pointer array with nchan rows; modified in place.
 * nchan: number of rows.
 * m:     number of samples processed per row.
 *
 * If the channel map cannot be allocated, a message is printed to stderr and
 * the function returns without touching the data.
 */
void dedisperse_inplace(float **inin, int nchan, int m)
{
  omp_set_dynamic(0);
  omp_set_num_threads(OMP_THREADS);

  int npass=get_npass(nchan);

  float **in=inin;

  int radix = 1;
  int pairs = nchan/2;

  //initial channel map
  int *fmap = malloc((size_t)nchan*sizeof *fmap);
  if (!fmap) {
    fprintf(stderr,"dedisperse_inplace: could not allocate channel map\n");
    return;
  }

  for (int i=0; i<nchan;i++){
    fmap[i] = i;
  }

  // one scratch row per OpenMP thread, indexed by omp_get_thread_num()
  float **tmp = matrix(OMP_THREADS,m);

  for (int i=0;i<npass;i++) {

    generate_shift_group(fmap,radix,nchan);

    #pragma omp parallel for
    for (int j=0;j<pairs;j++) {
      int zero = 2*j;
      int zero_ind = 0;
      int id = omp_get_thread_num();
      //Inefficient, but it scans over at most n/2
      while(fmap[zero_ind] != zero - (zero % (nchan/radix))){
        zero_ind++;
      }
      zero_ind += radix*(zero % (nchan/radix));

      // partner row of this pair
      int comp_ind = zero_ind + radix;

      // lag applied to the shifted sum of this pair
      int jeff = j % (nchan/(radix*2));

      // keep the original first row: it is overwritten before the shifted
      // sum below needs it
      for(int k = 0; k < m; k++){
        tmp[id][k] = in[zero_ind][k];
      }
      for(int k = 0; k < m; k++){
       in[zero_ind][k] = in[zero_ind][k] + in[comp_ind][k];
      }
      for(int k = 0; k < m - jeff - 1; k++){
       in[comp_ind][k] = tmp[id][k + jeff] + in[comp_ind][k + jeff + 1];
      }
    }

    radix *=2;
  }

  fast_unshuffle(in,fmap,nchan,m);
  free(fmap);
  // release the data block before the row-pointer array; the reverse order
  // would read tmp[0] from memory that has already been freed
  free(tmp[0]);
  free(tmp);
}
/*--------------------------------------------------------------------------------*/
/*
 * Two-stage blocked dedispersion that works on small per-thread tiles.
 *
 * Stage 1: the channels are split into nblock groups of nchan1 (128) rows and
 *          the samples into chunks of chunk_size (1024). Each (group, chunk)
 *          tile is copied from dat into a per-thread scratch matrix together
 *          with up to nchan1 trailing samples (zero-filled where the tile
 *          would run past ndat), dedispersed with dedisperse(), and its first
 *          chunk_size samples are written to dat2.
 * Stage 2: the same scheme runs on groups of nchan2 = nchan/nchan1 rows, read
 *          through dat_shift and written to dat. dat_shift[j*nblock+i] points
 *          at row i*nchan1+j of dat2, advanced by i*j samples. chunk_size is
 *          recomputed so that a tile holds about as many elements as in
 *          stage 1.
 *
 * dat, dat2: row-pointer arrays with nchan rows each; the result ends up in
 *            dat, and dat2 holds the stage-1 output.
 * nchan:     must be at least nchan1, because nchan2 is used as a divisor;
 *            presumably also a multiple of nchan1 -- TODO confirm.
 * ndat:      samples per row; only ndat/chunk_size whole chunks are processed
 *            in each stage.
 *
 * If the dat_shift table cannot be allocated, a message is printed to stderr
 * and the function returns before modifying any data.
 */
void dedisperse_blocked_cached(float **dat, float **dat2, int nchan, int ndat)
{
  int nchan1=128;
  int chunk_size=1024;
  int nchunk=ndat/chunk_size;
  int nchan2=nchan/nchan1;

  // nchan2==0 would make the nblock2 computation divide by zero
  assert(nchan>=nchan1);

  int nblock=nchan/nchan1;
  int nblock2=nchan/nchan2;

  // Shifted row pointers for stage 2. Only addresses are stored, so the table
  // can be built before stage 1 fills dat2.
  float **dat_shift=malloc((size_t)nchan*sizeof *dat_shift);
  if (!dat_shift) {
    fprintf(stderr,"dedisperse_blocked_cached: could not allocate row table\n");
    return;
  }
  for (int i=0;i<nblock;i++)
    for (int j=0;j<nchan1;j++)
      dat_shift[j*nblock+i]=dat2[i*nchan1+j]+i*j;

#pragma omp parallel
  {
    // scratch tiles private to each thread
    float **tmp1=matrix(nchan1,chunk_size+nchan1);
    float **tmp2=matrix(nchan1,chunk_size+nchan1);

#pragma omp for collapse(2) schedule(dynamic,2)
    for (int i=0;i<nblock;i++) {
      for (int j=0;j<nchunk;j++) {
        int istart=j*chunk_size;
        int istop=(j+1)*chunk_size+nchan1;
        if (istop>ndat) {
          // the tile runs past the data: clear the tail so stale samples
          // from an earlier tile are not reused
          istop=ndat;
          for (int k=0;k<nchan1;k++)
            memset(tmp1[k]+chunk_size,0,sizeof(float)*nchan1);
        }
        for (int k=0;k<nchan1;k++)
          memcpy(tmp1[k],&(dat[i*nchan1+k][istart]),(istop-istart)*sizeof(float));

        dedisperse(tmp1,tmp2,nchan1,chunk_size+nchan1);

        // NOTE(review): this reads tmp1 whereas stage 2 reads tmp2;
        // dedisperse() ends by copying its last-written buffer into the other
        // one, so presumably both hold the result -- confirm.
        for (int k=0;k<nchan1;k++)
          memcpy(&(dat2[i*nchan1+k][istart]),tmp1[k],chunk_size*sizeof(float));
      }
    }
    free(tmp1[0]);
    free(tmp1);
    free(tmp2[0]);
    free(tmp2);
  }

  //recalculate block sizes to keep amount in cache about the same
  int nelem=nchan1*chunk_size;
  chunk_size=nelem/nchan2;
  nchunk=ndat/chunk_size;

#pragma omp parallel
  {
    float **tmp1=matrix(nchan2,chunk_size+nchan2);
    float **tmp2=matrix(nchan2,chunk_size+nchan2);

#pragma omp for  collapse(2) schedule(dynamic,4)
    for (int i=0;i<nblock2;i++) {
      for (int j=0;j<nchunk;j++) {
        int istart=j*chunk_size;
        int istop=(j+1)*chunk_size+nchan2;
        if (istop>ndat) {
          istop=ndat;
          for (int k=0;k<nchan2;k++)
            memset(tmp1[k]+chunk_size,0,sizeof(float)*nchan2);
        }
        for (int k=0;k<nchan2;k++) {
          memcpy(tmp1[k],dat_shift[i*nchan2+k]+istart,(istop-istart)*sizeof(float));
        }
        dedisperse(tmp1,tmp2,nchan2,chunk_size+nchan2);
        for (int k=0;k<nchan2;k++)
          memcpy(dat[i*nchan2+k]+istart,tmp2[k],chunk_size*sizeof(float));
      }
    }
    free(tmp1[0]);
    free(tmp1);
    free(tmp2[0]);
    free(tmp2);
  }

  // the table only holds pointers into dat2, so freeing it leaves the data intact
  free(dat_shift);
}
/*
 * Benchmark driver for dedisperse_blocked_cached().
 *
 * Usage: prog [nchan [ndat [nrep]]]
 *   nchan  number of channels            (default 1024)
 *   ndat   number of samples per channel (default 327680)
 *   nrep   dedispersions per timed round (default 1)
 *
 * Allocates two nchan x (ndat+nchan) matrices, writes one unit sample per
 * channel along a sloped track, times a single dedispersion plus find_peak(),
 * and then prints the time and rate of ten rounds of nrep dedispersions.
 *
 * Returns 0 on success, 1 if any argument is not a positive integer.
 */
int main(int argc, char *argv[])
{
  int nchan=1024;
  int ndat=327680;
  int nrep=1;

  if (argc>1)
    nchan=atoi(argv[1]);
  if (argc>2)
    ndat=atoi(argv[2]);
  if (argc>3)
    nrep=atoi(argv[3]);

  // atoi() yields 0 for non-numeric text; zero or negative sizes would reach
  // the allocator and the block-size divisions
  if (nchan<=0 || ndat<=0 || nrep<=0) {
    fprintf(stderr,"nchan, ndat and nrep must be positive integers (got %d, %d, %d)\n",
            nchan,ndat,nrep);
    return 1;
  }

  float **dat=matrix(nchan,ndat+nchan);
  float **dat2=matrix(nchan,ndat+nchan);

  // test signal: one unit sample per channel along a sloped track
  for (int i=0;i<nchan;i++) {
    int idx=(int)(0.8317*i+160.2);
    // rows are ndat+nchan long; skip samples that would land beyond the row
    if (idx<ndat+nchan)
      dat[i][idx]=1;
  }

  // dedisperse() or dedisperse_blocked() can be swapped in here to time the
  // other variants
  double t1=omp_get_wtime();
  dedisperse_blocked_cached(dat,dat2,nchan,ndat);
  double t2=omp_get_wtime();
  printf("took %12.4f seconds.\n",t2-t1);
  int ichan,idat;
  find_peak(dat,nchan,ndat,&ichan,&idat);
  t1=omp_get_wtime();
  printf("took %12.4f seconds to find peak.\n",t1-t2);

  for (int i=0;i<10;i++) {
    t1=omp_get_wtime();
    for (int j=0;j<nrep;j++) {
      dedisperse_blocked_cached(dat,dat2,nchan,ndat);
    }
    t2=omp_get_wtime();
    // one operation per pass, channel and sample; the rate is printed in
    // units of 1024*1024 operations per second
    double nops=get_npass(nchan)*(nchan+0.0)*(ndat+0.0)*(nrep+0.0);
    printf("took %12.6f seconds at rate %12.6f.\n",t2-t1,nops/(t2-t1)/1024/1024);
  }

  // same release order the rest of this file uses for matrix() results
  free(dat[0]);
  free(dat);
  free(dat2[0]);
  free(dat2);
  return 0;
}