Example #1
/**
   Partition the ray tracing by DM/destination combination, as well as by
   segments within each combination, for maximum efficiency.
*/
void calc_cachedm(SIM_T *simu){
    double tk_start=myclockd();
    if(simu->parms->sim.cachedm){
	long group=0;
	/*zero out the data. */
	for(int idm=0; idm<simu->parms->ndm; idm++){
	    dzero((dmat*)simu->cachedm->p[idm]);
	    /*do the multi-threaded ray tracing */
	    QUEUE_THREAD(group,(simu->cachedm_prop[idm]), 1);
	}
	WAIT_THREAD(group);
    }
    simu->tk_cache=myclockd()-tk_start;
}
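Here QUEUE_THREAD queues the per-DM propagation jobs (simu->cachedm_prop[idm]) into a thread group, and WAIT_THREAD blocks until the whole group has completed, so all DM surfaces are traced concurrently. The following is a minimal pthread-based analogue of that queue-then-wait pattern, not the MAOS thread pool itself; the names trace_task_t, trace_one_dm and NDM are hypothetical.

/*
 * Minimal sketch of the QUEUE/WAIT pattern using plain pthreads.
 * Build with: gcc -pthread sketch.c
 */
#include <pthread.h>
#include <stdio.h>

#define NDM 2

typedef struct{
    int idm;       /* which DM surface to trace */
    double result; /* stands in for the cached DM array */
}trace_task_t;

static void *trace_one_dm(void *arg){
    trace_task_t *t=(trace_task_t*)arg;
    /* placeholder for the real ray tracing of one DM */
    t->result=(double)t->idm;
    return NULL;
}

int main(void){
    pthread_t tid[NDM];
    trace_task_t task[NDM];
    /* "QUEUE": launch one independent job per DM */
    for(int idm=0; idm<NDM; idm++){
        task[idm].idm=idm;
        pthread_create(&tid[idm], NULL, trace_one_dm, &task[idm]);
    }
    /* "WAIT": block until every job in the group has finished */
    for(int idm=0; idm<NDM; idm++){
        pthread_join(tid[idm], NULL);
        printf("DM %d traced: %g\n", task[idm].idm, task[idm].result);
    }
    return 0;
}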
Example #2
void maos_isim(int isim){
    const PARMS_T *parms=global->parms;
    RECON_T *recon=global->recon;
    SIM_T   *simu =global->simu;
    int iseed=global->iseed;
    int simstart=parms->sim.start;
    int simend=parms->sim.end;
    if(isim==simstart+1){//skip slow first step.
	tk_atm=myclockd();
    }
    if(isim+2+parms->sim.dtrat_hi>=simend){
	draw_single=0;
    }
    double ck_0=myclockd();
    simu->isim=isim;
    simu->status->isim=isim;
    sim_update_etf(simu);
    if(parms->atm.frozenflow){
#if USE_CUDA
	if(parms->gpu.evl || parms->gpu.wfs){
	    /*may need to copy another part */
	    gpu_atm2gpu(simu->atm, simu->atmscale, parms, iseed, isim);
	}
#endif
    }else{
	//Do not put this one inside the parallel region
	genatm(simu);
	/*re-seed the atmosphere in case atm is loaded from shm/file */
	seed_rand(simu->atm_rand, lrand(simu->init_rand));
    }
    OMPTASK_SINGLE{
	if(parms->sim.dmproj){
	    /* temporarily disable FR.M so that Mfun is used. */
	    cell *FRM=recon->FR.M; recon->FR.M=NULL; 
	    muv_solve(&simu->dmproj, &recon->FL, &recon->FR, NULL);
	    recon->FR.M=FRM;/*set FR.M back*/
	    if(parms->save.dm){
		zfarr_dcell(simu->save->dmproj, simu->isim, simu->dmproj);
	    }
	    if(!parms->fit.square){
		/* Embed DM commands into a square array for fast ray tracing */
		for(int idm=0; idm<parms->ndm; idm++){
		    loc_embed(simu->dmprojsq->p[idm], recon->aloc->p[idm], simu->dmproj->p[idm]->p);
		}
	    }
#if USE_CUDA
	    if(parms->gpu.evl || parms->gpu.wfs){
		gpu_dmproj2gpu(simu->dmprojsq);
	    }
#endif
	}
	save_dmreal(simu);
	extern int NO_RECON, NO_WFS, NO_EVL;
	if(PARALLEL){
	    /*
	      We do the big loop in parallel to make better use of the
	      CPUs. Notice that the reconstructor works on gradients from the
	      last time step, so there is no conflict in data access.
	    */
	    /*when we want to apply the idealngs correction, wfsgrad needs to wait for perfevl. */
	    long group=0;
	    if(parms->gpu.evl && !NO_EVL){
		//Queue tasks on GPU, no stream sync is done
		QUEUE_THREAD(group, simu->perf_evl_pre, 0);
	    }
	    if(!parms->tomo.ahst_idealngs && parms->gpu.wfs && !NO_WFS){
		//task for each wfs
		QUEUE_THREAD(group, simu->wfs_grad_pre, 0);
	    }
	    if(!NO_RECON){
		//don't queue this first: it has CPU overhead in computing gradol
		QUEUE(group, reconstruct, simu, 1, 0);
	    }
	    if(!NO_EVL){
		if(parms->gpu.evl){
		    //wait for GPU tasks to be queued before calling sync
		    WAIT(group);
		}
		QUEUE(group, perfevl, simu, 1, 0);
	    }
	    if(!NO_WFS){
		if(parms->tomo.ahst_idealngs || (parms->gpu.wfs && !parms->gpu.evl)){
		    //in ahst_idealngs mode, wait for perfevl to finish.
		    //otherwise, wait for GPU tasks to be queued before calling sync
		    WAIT(group);
		}
		QUEUE(group, wfsgrad, simu, 1, 0);
	    }
	    if(!NO_RECON){
		//wait for all tasks to finish before modifying dmreal
		WAIT(group);
		shift_grad(simu);/*before filter_dm() */
		filter_dm(simu);/*updates dmreal, so it has to run after perfevl/wfsgrad are done. */
	    }
	    WAIT(group);
	}else{/*do the big loop in serial mode. */
	    if(parms->sim.closeloop){
		if(!NO_EVL) perfevl(simu);/*before wfsgrad so we can apply ideal NGS modes */
		if(!NO_WFS) wfsgrad(simu);/*output grads to gradcl, gradol */
		if(!NO_RECON) {
		    reconstruct(simu);/*uses grads from gradlastcl, gradlastol. */
		    shift_grad(simu);
		    filter_dm(simu);
		}
	    }else{/*in open-loop (OL) mode */
		if(!NO_WFS) wfsgrad(simu);
		if(!NO_RECON) {
		    shift_grad(simu);
		    reconstruct(simu);
		    filter_dm(simu);
		}
		if(!NO_EVL) perfevl(simu);
	    }
	}
    }
    double ck_end=myclockd();
    long steps_done=iseed*(simend-simstart)+(isim+1-simstart);
    long steps_rest=parms->sim.nseed*(simend-simstart)-steps_done;
    if(isim!=simstart){
	simu->status->rest=(long)((ck_end-tk_0-(tk_atm-tk_1)*(iseed+1))/steps_done*steps_rest
				  +(tk_atm-tk_1)*(parms->sim.nseed-iseed-1));
	simu->status->mean=(ck_end-tk_atm)/(double)(isim-simstart);
    }
    simu->status->laps=(long)(ck_end-tk_0);
    simu->status->tot  =ck_end-ck_0;
    simu->status->wfs  =simu->tk_wfs;
    simu->status->recon=simu->tk_recon;
    simu->status->other=simu->tk_cache;
    simu->status->eval =simu->tk_eval;
    simu->status->scale=1;
    if(simu->timing){
	simu->timing->p[isim*simu->timing->nx]=get_job_mem();
	simu->timing->p[isim*simu->timing->nx+1]=simu->status->tot;
	simu->timing->p[isim*simu->timing->nx+2]=simu->status->wfs;
	simu->timing->p[isim*simu->timing->nx+3]=simu->status->recon;
	simu->timing->p[isim*simu->timing->nx+4]=simu->status->eval;
    }
    double this_time=myclockd();
    if(this_time>simu->last_report_time+1 || isim+1==simend || parms->sim.pause){
	/*we don't print out or report too frequently. */
	simu->last_report_time=this_time;
#if defined(__linux__) || defined(__APPLE__)
	scheduler_report(simu->status);
#endif
	print_progress(simu);
    }
}
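The bookkeeping at the end of maos_isim estimates the remaining wall-clock time: it subtracts the per-seed setup cost (tk_atm-tk_1) from the total elapsed time, divides by the number of completed steps to get a mean per-step cost, extrapolates that cost over the remaining steps, and adds the setup cost for seeds not yet started. A standalone sketch of the same arithmetic follows; estimate_rest() is a hypothetical helper, and tk_0, tk_1 and tk_atm mirror the globals used above.

/*
 * Sketch of the remaining-time estimate used in maos_isim. In the real code
 * the same arithmetic is done inline and stored in simu->status->rest.
 */
#include <stdio.h>

static double estimate_rest(double now, double tk_0, double tk_1, double tk_atm,
                            int iseed, int nseed, int isim, int simstart, int simend){
    long steps_done=(long)iseed*(simend-simstart)+(isim+1-simstart);
    long steps_rest=(long)nseed*(simend-simstart)-steps_done;
    double setup=tk_atm-tk_1;                 /* per-seed setup cost (atmosphere generation, etc.) */
    double stepping=now-tk_0-setup*(iseed+1); /* time spent on simulation steps so far */
    return stepping/steps_done*steps_rest     /* extrapolate the mean per-step cost */
        +setup*(nseed-iseed-1);               /* plus setup for seeds not yet started */
}

int main(void){
    /* toy numbers: 2 seeds, 100 steps each, 10 s setup per seed, ~0.5 s per step */
    double rest=estimate_rest(/*now*/35.0, /*tk_0*/0.0, /*tk_1*/0.0, /*tk_atm*/10.0,
                              /*iseed*/0, /*nseed*/2, /*isim*/50, /*simstart*/0, /*simend*/100);
    printf("estimated remaining: %.1f s\n", rest);
    return 0;
}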