/*
 * Lagged dedispersion driver.
 *
 * Runs get_npass(nchan) passes of dedisperse_kernel_lagged(), ping-ponging
 * between the row-pointer arrays inin and outout.  The block size starts at
 * nchan and is halved after every pass; block number b of a pass is handed
 * ndat + b samples and the lag argument b.
 *
 * inin, outout : arrays of nchan row pointers; the asserts require the first
 *                two rows to be at least nchan + ndat - 1 floats apart.
 * nchan        : number of channels, must be a power of two and >= 2.
 * ndat         : number of samples, must be >= 1.
 *
 * After the last pass, the first ndat + j samples of row j are copied from
 * the buffer written last into the other buffer.
 */
void dedisperse_lagged(float **inin, float **outout, int nchan, int ndat)
{
    assert(nchan >= 2);
    assert(ndat >= 1);

    // detects underallocation, in the common case where inin was allocated with matrix()
    assert(inin[1] - inin[0] >= nchan + ndat - 1);
    assert(outout[1] - outout[0] >= nchan + ndat - 1);

    int npass = get_npass(nchan);
    assert(nchan == (1 << npass));   // currently require nchan to be a power of two

    float **src = inin;
    float **dst = outout;
    int block = nchan;

    for (int pass = 0; pass < npass; pass++, block /= 2) {
        #pragma omp parallel for
        for (int first = 0; first < nchan; first += block) {
            const int which = first / block;
            dedisperse_kernel_lagged(src + first, dst + first, block, ndat + which, which);
        }

        // the buffer just written becomes the input of the next pass
        float **swap = src;
        src = dst;
        dst = swap;
    }

    // non-rectangular copy: row j carries ndat + j samples
    for (int row = 0; row < nchan; row++) {
        const size_t nbytes = (ndat + row) * sizeof(float);
        memcpy(dst[row], src[row], nbytes);
    }
}
/*
 * Dedisperse using the single-pass kernel, with the blocks of each pass
 * distributed over OpenMP threads.
 *
 * Runs get_npass(nchan) passes of dedisperse_kernel(), ping-ponging between
 * inin and outout.  The block size starts at nchan and is halved after every
 * pass.
 *
 * inin, outout : arrays of nchan row pointers.
 * nchan        : number of channels.
 * ndat         : number of samples per channel.
 *
 * After the last pass, nchan*ndat floats are copied in one memcpy from the
 * buffer written last into the other buffer.
 * NOTE(review): that single copy assumes the rows are stored contiguously
 * starting at row 0 with a stride of ndat floats -- confirm against matrix().
 */
void dedisperse_single(float **inin, float **outout, int nchan, int ndat)
{
    int npass = get_npass(nchan);
    int bs = nchan;
    float **in = inin;
    float **out = outout;

    for (int i = 0; i < npass; i++) {
        #pragma omp parallel for
        for (int j = 0; j < nchan; j += bs) {
            dedisperse_kernel(in + j, out + j, bs, ndat);
        }
        bs /= 2;

        // the buffer just written becomes the input of the next pass
        float **tmp = in;
        in = out;
        out = tmp;
    }

    // Compute the byte count in size_t: nchan*ndat evaluated in int overflows
    // (undefined behaviour) once the product exceeds INT_MAX.
    memcpy(out[0], in[0], (size_t)nchan * (size_t)ndat * sizeof(float));
}
/*--------------------------------------------------------------------------------*/
/*
 * Two-stage blocked dedispersion.
 *
 * Stage 1 calls dedisperse() on nchan/64 groups of 64 consecutive channels.
 * The rows are then interleaved into dat2: row j of group i goes to row
 * j*nblock + i, read starting at sample offset i*j.  Stage 2 calls
 * dedisperse() on the interleaved rows in groups of nchan/64, writing back
 * towards dat.
 *
 * dat, dat2 : arrays of nchan row pointers (input/result and scratch).
 * nchan     : number of channels; must be at least 64, otherwise nchan2 is 0
 *             and the computation of nblock2 divides by zero.
 * ndat      : number of samples per channel.
 *
 * The interleaving copy moves only ndat - i*j samples per row; the remaining
 * samples of the destination row are left as they were.
 */
void dedisperse_blocked(float **dat, float **dat2, int nchan, int ndat)
{
    int nchan1 = 64;
    int nchan2 = nchan / nchan1;
    int nblock = nchan / nchan1;
    int nblock2 = nchan / nchan2;

    for (int i = 0; i < nblock; i++) {
        dedisperse(dat + i * nchan1, dat2 + i * nchan1, nchan1, ndat);
    }

    for (int i = 0; i < nblock; i++) {
        for (int j = 0; j < nchan1; j++) {
            int shift = i * j;
            if (shift >= ndat) {
                // nothing left to copy; a negative count would wrap to a huge size_t
                continue;
            }
            // memcpy takes a byte count, so the sample count must be scaled
            // by sizeof(float).
            memcpy(dat2[j * nblock + i], dat[i * nchan1 + j] + shift,
                   (size_t)(ndat - shift) * sizeof(float));
        }
    }

    for (int i = 0; i < nblock2; i++) {
        dedisperse(dat2 + i * nchan2, dat + i * nchan2, nchan2, ndat);
    }
}
/*
 * Dedisperse using the two-pass block kernel, with the blocks of each step
 * distributed over OpenMP threads.
 *
 * Each iteration of the main loop calls dedisperse_block_kernel_2pass() on
 * every block and then divides the block size by 4.  When get_npass(nchan)
 * is odd, one final step with dedisperse_kernel() finishes the job.  The
 * two row-pointer arrays are swapped after every step.
 *
 * inin, outout : arrays of nchan row pointers.
 * nchan        : number of channels.
 * ndat         : number of samples per channel.
 *
 * After the last step, nchan*ndat floats are copied in one memcpy from the
 * buffer written last into the other buffer.
 * NOTE(review): that single copy assumes the rows are stored contiguously
 * starting at row 0 with a stride of ndat floats -- confirm against matrix().
 */
void dedisperse_dual(float **inin, float **outout, int nchan, int ndat)
{
    int npass = get_npass(nchan);
    int bs = nchan;
    float **in = inin;
    float **out = outout;

    // the npass-1 is so that we stop in time to hand the final pass to
    // the single-step kernel in the event of an odd depth.
    for (int i = 0; i < npass - 1; i += 2) {
        #pragma omp parallel for
        for (int j = 0; j < nchan; j += bs) {
            dedisperse_block_kernel_2pass((const float **)(in + j), out + j, bs, ndat);
        }
        bs /= 4;

        float **tmp = in;
        in = out;
        out = tmp;
    }

    if (npass % 2 == 1) {
        // do a single step if we come in with odd depth
        #pragma omp parallel for
        for (int j = 0; j < nchan; j += bs) {
            dedisperse_kernel(in + j, out + j, bs, ndat);
        }

        float **tmp = in;
        in = out;
        out = tmp;
    }

    // Compute the byte count in size_t: nchan*ndat evaluated in int overflows
    // (undefined behaviour) once the product exceeds INT_MAX.
    memcpy(out[0], in[0], (size_t)nchan * (size_t)ndat * sizeof(float));
}
/*
 * Serial dedispersion driver.
 *
 * Runs get_npass(nchan) passes of dedisperse_kernel(), ping-ponging between
 * inin and outout.  The block size starts at nchan and is halved after every
 * pass.  The block loop is deliberately not parallelised here (the OpenMP
 * pragma is disabled in this variant).
 *
 * inin, outout : arrays of nchan row pointers.
 * nchan        : number of channels.
 * ndat         : number of samples per channel.
 *
 * After the last pass, nchan*ndat floats are copied in one memcpy from the
 * buffer written last into the other buffer.
 * NOTE(review): that single copy assumes the rows are stored contiguously
 * starting at row 0 with a stride of ndat floats -- confirm against matrix().
 */
void dedisperse(float **inin, float **outout, int nchan, int ndat)
{
    int npass = get_npass(nchan);
    int bs = nchan;
    float **in = inin;
    float **out = outout;

    for (int i = 0; i < npass; i++) {
        for (int j = 0; j < nchan; j += bs) {
            dedisperse_kernel(in + j, out + j, bs, ndat);
        }
        bs /= 2;

        // the buffer just written becomes the input of the next pass
        float **tmp = in;
        in = out;
        out = tmp;
    }

    // Compute the byte count in size_t: nchan*ndat evaluated in int overflows
    // (undefined behaviour) once the product exceeds INT_MAX.
    memcpy(out[0], in[0], (size_t)nchan * (size_t)ndat * sizeof(float));
}
/*
 * In-place dedispersion of the rows in inin.
 *
 * Fixes the OpenMP thread count to OMP_THREADS, then runs get_npass(nchan)
 * passes.  Each pass regenerates the channel map with
 * generate_shift_group() and combines nchan/2 row pairs in parallel; radix
 * doubles after every pass.  Finally fast_unshuffle() is applied to the rows
 * using the channel map.
 *
 * inin  : array of nchan row pointers, modified in place.
 * nchan : number of channels.
 * m     : number of samples per row.
 *
 * NOTE(review): the result of malloc() is not checked.
 */
void dedisperse_inplace(float **inin, int nchan, int m)
{
    omp_set_dynamic(0);
    omp_set_num_threads(OMP_THREADS);

    int npass = get_npass(nchan);
    float **in = inin;
    int radix = 1;
    int pairs = nchan / 2;

    //initial channel map
    int *fmap = malloc(sizeof(int) * nchan);
    for (int i = 0; i < nchan; i++) {
        fmap[i] = i;
    }

    // one scratch row of m floats per thread, indexed by omp_get_thread_num()
    float **tmp = matrix(OMP_THREADS, m);

    for (int i = 0; i < npass; i++) {
        generate_shift_group(fmap, radix, nchan);
        #pragma omp parallel for
        for (int j = 0; j < pairs; j++) {
            int zero = 2 * j;
            int zero_ind = 0;
            int id = omp_get_thread_num();

            //Inefficient, but it scans over at most n/2
            while (fmap[zero_ind] != zero - (zero % (nchan / radix))) {
                zero_ind++;
            }
            zero_ind += radix * (zero % (nchan / radix));
            int comp_ind = zero_ind + radix;
            int jeff = j % (nchan / (radix * 2));

            // keep the original first row: it is overwritten below but is
            // still needed for the shifted sum
            for (int k = 0; k < m; k++) {
                tmp[id][k] = in[zero_ind][k];
            }
            for (int k = 0; k < m; k++) {
                in[zero_ind][k] = in[zero_ind][k] + in[comp_ind][k];
            }
            for (int k = 0; k < m - jeff - 1; k++) {
                in[comp_ind][k] = tmp[id][k + jeff] + in[comp_ind][k + jeff + 1];
            }
        }
        radix *= 2;
    }

    fast_unshuffle(in, fmap, nchan, m);

    free(fmap);
    // Release the data block before the row-pointer array: reading tmp[0]
    // after free(tmp) would be a use-after-free.
    free(tmp[0]);
    free(tmp);
}
/*--------------------------------------------------------------------------------*/
/*
 * Two-stage blocked dedispersion that works on small per-thread tiles.
 *
 * Stage 1: for each group of 128 channels and each chunk of 1024 samples,
 * copy the chunk (plus up to 128 trailing samples) from dat into a
 * thread-private tile, call dedisperse() on it and store the first
 * chunk_size samples of every row into dat2.
 *
 * Between the stages, dat_shift is filled with pointers into dat2 that
 * interleave the groups (row j of group i becomes row j*nblock + i) and are
 * offset by i*j samples.
 *
 * Stage 2: the same tiling with groups of nchan/128 rows taken from
 * dat_shift, and the chunk size rescaled so that a tile holds about as many
 * elements as in stage 1.  Results are stored into dat.
 *
 * dat, dat2 : arrays of nchan row pointers (input/result and scratch).
 * nchan     : number of channels; must be at least 128, otherwise nchan2 is
 *             0 and the computation of nblock2 divides by zero.
 * ndat      : number of samples per channel.  Only ndat/chunk_size whole
 *             chunks are processed in each stage.
 *
 * NOTE(review): stage 2 reads up to i*j samples past position ndat of the
 * dat2 rows, so the rows must be allocated with that much padding -- confirm
 * against the caller.  The result of malloc() is not checked.
 */
void dedisperse_blocked_cached(float **dat, float **dat2, int nchan, int ndat)
{
    int nchan1 = 128;
    int chunk_size = 1024;
    int nchunk = ndat / chunk_size;
    int nchan2 = nchan / nchan1;
    int nblock = nchan / nchan1;
    int nblock2 = nchan / nchan2;

    #pragma omp parallel
    {
        // thread-private tiles
        float **tmp1 = matrix(nchan1, chunk_size + nchan1);
        float **tmp2 = matrix(nchan1, chunk_size + nchan1);

        #pragma omp for collapse(2) schedule(dynamic,2)
        for (int i = 0; i < nblock; i++) {
            for (int j = 0; j < nchunk; j++) {
                int istart = j * chunk_size;
                int istop = (j + 1) * chunk_size + nchan1;
                if (istop > ndat) {
                    // the trailing samples run past the data: zero the tail
                    // of the tile before the shortened copy
                    istop = ndat;
                    for (int k = 0; k < nchan1; k++) {
                        memset(tmp1[k] + chunk_size, 0, sizeof(float) * nchan1);
                    }
                }
                for (int k = 0; k < nchan1; k++) {
                    memcpy(tmp1[k], &(dat[i * nchan1 + k][istart]),
                           (istop - istart) * sizeof(float));
                }
                dedisperse(tmp1, tmp2, nchan1, chunk_size + nchan1);
                for (int k = 0; k < nchan1; k++) {
                    memcpy(&(dat2[i * nchan1 + k][istart]), tmp1[k],
                           chunk_size * sizeof(float));
                }
            }
        }

        free(tmp1[0]);
        free(tmp1);
        free(tmp2[0]);
        free(tmp2);
    }

    // interleaved, sample-shifted views of the stage-1 output rows
    float **dat_shift = (float **)malloc(sizeof(float *) * nchan);
    for (int i = 0; i < nblock; i++) {
        for (int j = 0; j < nchan1; j++) {
            dat_shift[j * nblock + i] = dat2[i * nchan1 + j] + i * j;
        }
    }

    //recalculate block sizes to keep amount in cache about the same
    int nelem = nchan1 * chunk_size;
    chunk_size = nelem / nchan2;
    nchunk = ndat / chunk_size;

    #pragma omp parallel
    {
        float **tmp1 = matrix(nchan2, chunk_size + nchan2);
        float **tmp2 = matrix(nchan2, chunk_size + nchan2);

        #pragma omp for collapse(2) schedule(dynamic,4)
        for (int i = 0; i < nblock2; i++) {
            for (int j = 0; j < nchunk; j++) {
                int istart = j * chunk_size;
                int istop = (j + 1) * chunk_size + nchan2;
                if (istop > ndat) {
                    istop = ndat;
                    for (int k = 0; k < nchan2; k++) {
                        memset(tmp1[k] + chunk_size, 0, sizeof(float) * nchan2);
                    }
                }
                for (int k = 0; k < nchan2; k++) {
                    memcpy(tmp1[k], dat_shift[i * nchan2 + k] + istart,
                           (istop - istart) * sizeof(float));
                }
                dedisperse(tmp1, tmp2, nchan2, chunk_size + nchan2);
                for (int k = 0; k < nchan2; k++) {
                    memcpy(dat[i * nchan2 + k] + istart, tmp2[k],
                           chunk_size * sizeof(float));
                }
            }
        }

        free(tmp1[0]);
        free(tmp1);
        free(tmp2[0]);
        free(tmp2);
    }

    // dat_shift only holds pointers into dat2; release the pointer table,
    // which was previously leaked on every call.
    free(dat_shift);
}
/*
 * Benchmark driver for dedisperse_blocked_cached().
 *
 * Usage: prog [nchan] [ndat] [nrep]
 *   nchan : number of channels          (default 1024)
 *   ndat  : number of samples           (default 327680)
 *   nrep  : calls per timed repetition  (default 1)
 *
 * Puts one unit sample per channel along a sloped track, runs and times one
 * dedispersion, times find_peak(), then prints the time and rate of ten
 * further repetitions.  Returns 0 on success, 1 on invalid arguments.
 */
int main(int argc, char *argv[])
{
    int nchan = 1024;
    int ndat = 327680;
    int nrep = 1;

    if (argc > 1) {
        nchan = atoi(argv[1]);
    }
    if (argc > 2) {
        ndat = atoi(argv[2]);
    }
    if (argc > 3) {
        nrep = atoi(argv[3]);
    }

    // atoi() yields 0 for non-numeric text; sizes that are zero or negative
    // cannot be used as allocation sizes or loop bounds.
    if (nchan <= 0 || ndat <= 0 || nrep <= 0) {
        fprintf(stderr, "usage: %s [nchan] [ndat] [nrep] (all must be positive integers)\n",
                argv[0]);
        return 1;
    }

    float **dat = matrix(nchan, ndat + nchan);
    float **dat2 = matrix(nchan, ndat + nchan);

    // one unit sample per channel; skip positions past the end of the row
    for (int i = 0; i < nchan; i++) {
        int pos = (int)(0.8317 * i + 160.2);
        if (pos < ndat + nchan) {
            dat[i][pos] = 1;
        }
    }

    double t1 = omp_get_wtime();
    dedisperse_blocked_cached(dat, dat2, nchan, ndat);
    double t2 = omp_get_wtime();
    printf("took %12.4f seconds.\n", t2 - t1);

    int ichan, idat;
    find_peak(dat, nchan, ndat, &ichan, &idat);
    t1 = omp_get_wtime();
    printf("took %12.4f seconds to find peak.\n", t1 - t2);

    for (int i = 0; i < 10; i++) {
        t1 = omp_get_wtime();
        for (int j = 0; j < nrep; j++) {
            dedisperse_blocked_cached(dat, dat2, nchan, ndat);
        }
        t2 = omp_get_wtime();
        double nops = get_npass(nchan) * (nchan + 0.0) * (ndat + 0.0) * (nrep + 0.0);
        printf("took %12.6f seconds at rate %12.6f.\n", t2 - t1, nops / (t2 - t1) / 1024 / 1024);
    }

    // matrix() results are released as data block first, then row pointers
    free(dat[0]);
    free(dat);
    free(dat2[0]);
    free(dat2);
    return 0;
}