/*--------------------------------------------------------------------------------*/
/*
 * Two-stage ("blocked") dedispersion built on top of dedisperse().
 *
 *   dat   - nchan row pointers, each addressing ndat samples; holds the input
 *           and is also used as scratch/output by the calls below.
 *   dat2  - nchan row pointers of the same shape, used as the second buffer.
 *   nchan - number of channels (rows).
 *   ndat  - number of samples per row.
 *
 * Stage 1 runs dedisperse() on every group of nchan1 consecutive rows.
 * The rows are then regrouped: row j of group i is copied, delayed by i*j
 * samples, into row j*nblock+i of dat2.  Stage 2 runs dedisperse() on every
 * group of nchan2 consecutive rows of dat2, with dat as the other buffer.
 *
 * NOTE(review): the regrouping step reads the stage-1 result from dat (the
 * first argument of dedisperse()), so this code relies on dedisperse()
 * leaving its result there - confirm against dedisperse().
 * NOTE(review): the row bookkeeping assumes nchan is a multiple of nchan1.
 */
void dedisperse_blocked(float **dat, float **dat2, int nchan, int ndat)
{
  int nchan1=64;

  if (nchan<=0 || ndat<=0)
    return;

  /* With fewer than nchan1 channels nchan/nchan1 is zero and the block
     counts below would divide by zero; there is nothing to split, so do the
     whole transform in a single call. */
  if (nchan<nchan1) {
    dedisperse(dat,dat2,nchan,ndat);
    return;
  }

  int nchan2=nchan/nchan1;
  int nblock=nchan/nchan1;
  int nblock2=nchan/nchan2;

  /* Stage 1: independent transform of each group of nchan1 rows. */
  for (int i=0;i<nblock;i++)
    dedisperse(dat+i*nchan1,dat2+i*nchan1,nchan1,ndat);

  /* Regroup rows and apply the per-row delay of i*j samples. */
  for (int i=0;i<nblock;i++) {
    for (int j=0;j<nchan1;j++) {
      int shift=i*j;
      /* Samples still available after the delay; never negative. */
      int ncopy=(shift<ndat)?(ndat-shift):0;
      float *dst=dat2[j*nblock+i];

      /* memcpy counts bytes, so the sample count must be scaled. */
      if (ncopy>0)
        memcpy(dst,dat[i*nchan1+j]+shift,ncopy*sizeof(float));
      /* The delay leaves the last `shift` samples without data; zero them
         (the same padding dedisperse_blocked_cached uses) so stage 2 does
         not consume stale values. */
      if (ncopy<ndat)
        memset(dst+ncopy,0,(ndat-ncopy)*sizeof(float));
    }
  }

  /* Stage 2: transform each group of nchan2 regrouped rows. */
  for (int i=0;i<nblock2;i++)
    dedisperse(dat2+i*nchan2,dat+i*nchan2,nchan2,ndat);
}
/*--------------------------------------------------------------------------------*/
/*
 * Fill dst[0..width) with src[offset..offset+width), treating src as a row
 * of nvalid samples: anything at or beyond nvalid is written as zero instead
 * of being read.
 */
static void dedisperse_load_padded(float *dst, const float *src, int offset, int nvalid, int width)
{
  int ncopy=nvalid-offset;
  if (ncopy>width)
    ncopy=width;
  if (ncopy<0)
    ncopy=0;
  if (ncopy>0)
    memcpy(dst,src+offset,ncopy*sizeof(float));
  if (ncopy<width)
    memset(dst+ncopy,0,(width-ncopy)*sizeof(float));
}

/*
 * Two-stage dedispersion that works on small tiles copied into per-thread
 * scratch buffers, parallelised with OpenMP.
 *
 *   dat   - nchan row pointers, each addressing ndat samples; input, and the
 *           destination of the second stage.
 *   dat2  - nchan row pointers of the same shape; receives the first-stage
 *           result.
 *   nchan - number of channels (rows).
 *   ndat  - number of samples per row.
 *
 * Stage 1 splits the rows into groups of nchan1 and the samples into chunks
 * of chunk1; each tile (plus nchan1 extra samples, zero-padded past ndat) is
 * copied to scratch, passed to dedisperse(), and the first chunk1 samples
 * are stored in dat2.  Stage 2 does the same on groups of nchan2 rows, where
 * row k of group i is read from dat2[k*nchan1+i] delayed by k*i samples, and
 * stores into dat.
 *
 * NOTE(review): stage 1 stores from tmp1 while stage 2 stores from tmp2, so
 * this code relies on dedisperse() leaving its result in both buffers -
 * confirm against dedisperse().
 * NOTE(review): the row bookkeeping assumes nchan is a multiple of nchan1.
 */
void dedisperse_blocked_cached(float **dat, float **dat2, int nchan, int ndat)
{
  int nchan1=128;
  int chunk1=1024;

  if (nchan<=0 || ndat<=0)
    return;

  /* With fewer than nchan1 channels nchan/nchan1 is zero and the block
     counts below would divide by zero; there is nothing to split, so do the
     whole transform in a single call. */
  if (nchan<nchan1) {
    dedisperse(dat,dat2,nchan,ndat);
    return;
  }

  int nchan2=nchan/nchan1;
  int nblock=nchan/nchan1;
  int nblock2=nchan/nchan2;

  int width1=chunk1+nchan1;
  /* Round up so a trailing partial chunk is processed rather than dropped. */
  int nchunk1=ndat/chunk1+(ndat%chunk1!=0);

#pragma omp parallel
  {
    float **tmp1=matrix(nchan1,width1);
    float **tmp2=matrix(nchan1,width1);

#pragma omp for collapse(2) schedule(dynamic,2)
    for (int i=0;i<nblock;i++) {
      for (int j=0;j<nchunk1;j++) {
        int istart=j*chunk1;
        /* Samples of this chunk that actually exist in the rows. */
        int nkeep=ndat-istart;
        if (nkeep>chunk1)
          nkeep=chunk1;

        for (int k=0;k<nchan1;k++)
          dedisperse_load_padded(tmp1[k],dat[i*nchan1+k],istart,ndat,width1);

        dedisperse(tmp1,tmp2,nchan1,width1);

        for (int k=0;k<nchan1;k++)
          memcpy(&(dat2[i*nchan1+k][istart]),tmp1[k],nkeep*sizeof(float));
      }
    }
    free(tmp1[0]);
    free(tmp1);
    free(tmp2[0]);
    free(tmp2);
  }

  //recalculate block sizes to keep amount in cache about the same
  int chunk2=(nchan1*chunk1)/nchan2;
  if (chunk2<1)
    chunk2=1;
  int width2=chunk2+nchan2;
  int nchunk2=ndat/chunk2+(ndat%chunk2!=0);

#pragma omp parallel
  {
    float **tmp1=matrix(nchan2,width2);
    float **tmp2=matrix(nchan2,width2);

#pragma omp for collapse(2) schedule(dynamic,4)
    for (int i=0;i<nblock2;i++) {
      for (int j=0;j<nchunk2;j++) {
        int istart=j*chunk2;
        int nkeep=ndat-istart;
        if (nkeep>chunk2)
          nkeep=chunk2;

        /* Row k of this group is first-stage row k*nchan1+i delayed by k*i
           samples.  The delay is folded into the read offset so the loader
           can stop at the real end of the row (ndat) and zero-fill, instead
           of reading up to k*i samples past it. */
        for (int k=0;k<nchan2;k++)
          dedisperse_load_padded(tmp1[k],dat2[k*nchan1+i],k*i+istart,ndat,width2);

        dedisperse(tmp1,tmp2,nchan2,width2);

        for (int k=0;k<nchan2;k++)
          memcpy(dat[i*nchan2+k]+istart,tmp2[k],nkeep*sizeof(float));
      }
    }
    free(tmp1[0]);
    free(tmp1);
    free(tmp2[0]);
    free(tmp2);
  }
}