/*--------------------------------------------------------------------------------*/
/*
 * Two-stage ("blocked") dedispersion of an nchan x ndat array of row pointers.
 *
 *   Stage 1: the rows are split into nblock = nchan/64 consecutive blocks of
 *            64 rows and dedisperse() is run on every block (dat -> dat2).
 *   Shuffle: row j of block i is copied from dat, starting i*j samples in,
 *            into row j*nblock+i of dat2.  The trailing i*j samples of the
 *            destination row are not written by this copy.
 *   Stage 2: dedisperse() is run on every group of nchan2 = nchan/64
 *            consecutive reshuffled rows (dat2 -> dat).
 *
 * dat    row pointers of the input; also receives the stage-2 calls' second
 *        buffer argument.
 * dat2   row pointers of a work array with the same dimensions as dat.
 * nchan  number of rows.  If nchan < 64 no full block exists and the function
 *        returns without touching either array.
 *        NOTE(review): the index arithmetic looks like it expects nchan to be
 *        a multiple of 64 - confirm against callers.
 * ndat   number of samples in each row.
 *
 * NOTE(review): the shuffle reads stage-1 results from dat, so this relies on
 * where dedisperse() leaves its output - confirm against dedisperse().
 */
void dedisperse_blocked(float **dat, float **dat2, int nchan, int ndat)
{
  const int nchan1 = 64;              /* rows per stage-1 block */
  const int nblock = nchan / nchan1;  /* number of stage-1 blocks */
  const int nchan2 = nblock;          /* rows per stage-2 group */

  /* Fewer than nchan1 rows would make nchan/nchan2 divide by zero. */
  if (nchan2 < 1)
    return;
  const int nblock2 = nchan / nchan2; /* number of stage-2 groups */

  for (int i = 0; i < nblock; i++)
    dedisperse(dat + i * nchan1, dat2 + i * nchan1, nchan1, ndat);

  for (int i = 0; i < nblock; i++) {
    for (int j = 0; j < nchan1; j++) {
      const int shift = i * j;
      const int ncopy = ndat - shift;
      /* memcpy takes a byte count, so the sample count must be scaled by
         sizeof(float); rows shorter than the shift have nothing to copy. */
      if (ncopy > 0)
        memcpy(dat2[j * nblock + i], dat[i * nchan1 + j] + shift,
               (size_t)ncopy * sizeof(float));
    }
  }

  for (int i = 0; i < nblock2; i++)
    dedisperse(dat2 + i * nchan2, dat + i * nchan2, nchan2, ndat);
}
/*--------------------------------------------------------------------------------*/
/*
 * Fill dst[0..width) from row[offset..]: copies as many samples as are
 * available before row[row_len] (at most width) and zero-fills the rest, so
 * nothing at or beyond row[row_len] is ever read.
 */
static void dedisperse_load_padded_row(float *dst, const float *row, int offset,
                                       int row_len, int width)
{
  int ncopy = row_len - offset;
  if (ncopy > width)
    ncopy = width;
  if (ncopy < 0)
    ncopy = 0;

  if (ncopy > 0)
    memcpy(dst, row + offset, (size_t)ncopy * sizeof(float));
  if (ncopy < width)
    memset(dst + ncopy, 0, (size_t)(width - ncopy) * sizeof(float));
}

/*
 * Chunked, OpenMP-parallel variant of the two-stage blocked dedispersion.
 *
 *   Stage 1: the rows are split into nblock = nchan/128 blocks of 128 rows.
 *            Each block is processed in chunks of 1024 samples: the chunk plus
 *            the following 128 samples (zero-padded past the end of the row)
 *            is copied into a per-thread scratch matrix, dedisperse() is run
 *            on it, and the first 1024 samples of every scratch row are
 *            stored into dat2.
 *   Stage 2: group i is assembled from row i of every stage-1 block k in
 *            dat2, read starting i*k samples later (zero-padded past the end
 *            of the row).  The chunk length is rescaled so that a scratch
 *            matrix holds about as many elements as in stage 1.  The first
 *            chunk-length samples of every result row are stored into dat.
 *
 * dat    row pointers of the input; overwritten with the stage-2 output.
 * dat2   row pointers of a work array with the same dimensions; receives the
 *        stage-1 output.
 * nchan  number of rows.  If nchan < 128 no full block exists and the
 *        function returns without touching either array.
 *        NOTE(review): the index arithmetic looks like it expects nchan to be
 *        a multiple of 128 - confirm against callers.
 * ndat   number of samples in each row.  Each stage only processes whole
 *        chunks (ndat / chunk length of them); samples after the last whole
 *        chunk are neither read as chunk starts nor written.
 *
 * NOTE(review): stage 1 takes its result from the first buffer passed to
 * dedisperse() and stage 2 from the second; this relies on where dedisperse()
 * leaves its output - confirm against dedisperse().
 * NOTE(review): the scratch matrices are released with free(m[0]); free(m),
 * which presumably matches how matrix() allocates - confirm.
 */
void dedisperse_blocked_cached(float **dat, float **dat2, int nchan, int ndat)
{
  const int nchan1 = 128;             /* rows per stage-1 block */
  const int chunk1 = 1024;            /* samples stored per stage-1 chunk */
  const int nblock = nchan / nchan1;  /* number of stage-1 blocks */
  const int nchan2 = nblock;          /* rows per stage-2 group */

  /* Fewer than nchan1 rows would make nchan/nchan2 divide by zero. */
  if (nchan2 < 1)
    return;
  const int nblock2 = nchan / nchan2; /* number of stage-2 groups */
  const int nchunk1 = ndat / chunk1;
  const int width1 = chunk1 + nchan1; /* scratch row length, stage 1 */

#pragma omp parallel
  {
    float **tmp1 = matrix(nchan1, width1);
    float **tmp2 = matrix(nchan1, width1);
#pragma omp for collapse(2) schedule(dynamic,2)
    for (int i = 0; i < nblock; i++) {
      for (int j = 0; j < nchunk1; j++) {
        const int istart = j * chunk1;
        for (int k = 0; k < nchan1; k++)
          dedisperse_load_padded_row(tmp1[k], dat[i * nchan1 + k], istart, ndat, width1);
        dedisperse(tmp1, tmp2, nchan1, width1);
        for (int k = 0; k < nchan1; k++)
          memcpy(dat2[i * nchan1 + k] + istart, tmp1[k], chunk1 * sizeof(float));
      }
    }
    free(tmp1[0]);
    free(tmp1);
    free(tmp2[0]);
    free(tmp2);
  }

  /* Recalculate the chunk length to keep the amount in cache about the same. */
  int chunk2 = (nchan1 * chunk1) / nchan2;
  if (chunk2 < 1)
    chunk2 = 1;                       /* avoids ndat/0 for very large nchan */
  const int nchunk2 = ndat / chunk2;
  const int width2 = chunk2 + nchan2; /* scratch row length, stage 2 */

#pragma omp parallel
  {
    float **tmp1 = matrix(nchan2, width2);
    float **tmp2 = matrix(nchan2, width2);
#pragma omp for collapse(2) schedule(dynamic,4)
    for (int i = 0; i < nblock2; i++) {
      for (int j = 0; j < nchunk2; j++) {
        const int istart = j * chunk2;
        /* Row k of group i is row i of stage-1 block k, advanced by i*k
           samples.  Indexing dat2 directly replaces a heap-allocated table
           of shifted pointers, and passing the shift as part of the offset
           keeps the read inside the ndat-long source row. */
        for (int k = 0; k < nchan2; k++)
          dedisperse_load_padded_row(tmp1[k], dat2[k * nchan1 + i], i * k + istart, ndat, width2);
        dedisperse(tmp1, tmp2, nchan2, width2);
        for (int k = 0; k < nchan2; k++)
          memcpy(dat[i * nchan2 + k] + istart, tmp2[k], chunk2 * sizeof(float));
      }
    }
    free(tmp1[0]);
    free(tmp1);
    free(tmp2[0]);
    free(tmp2);
  }
}