int main(int argc, char ** argv) { long lower, upper; int WGS; if (argc != 4) { printf("not 2 arguments\n"); return 1; } sscanf(argv[1], "%ld", &lower); sscanf(argv[2], "%ld", &upper); sscanf(argv[3], "%d", &WGS); long results_size = (upper*(upper-1))/2; long* results = (long *) malloc(sizeof(long)*WGS); int i; for(i = 0; i < WGS; i ++) results[i] = 0L; printf("%ld\n", results_size); FILE *fp; char *KernelSource; cl_kernel kernel; fp = fopen("totient_kernel.cl", "r"); if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(1); } KernelSource = (char*)malloc(MAX_SOURCE_SIZE); fread( KernelSource, 1, MAX_SOURCE_SIZE, fp); fclose( fp ); size_t local[1]; size_t global[1]; local[0] = WGS; global[0] = results_size; initGPU(); // Fill in here: kernel = setupKernel( KernelSource, "totient", 2, LongArr, WGS, results, IntConst, WGS); // Fill in here: runKernel( kernel, 1, global, local); long tot = 0; int l; for(l = 0; l < WGS; l ++) tot += results[l]; printf("C: Sum of Totients between [%ld..%ld] is %ld\n", lower, upper, tot); return 0; }
DispersalFunctions::DispersalFunctions(Config *settings0, Random* rand0, bool cuda) : sb_old(0), hb_old(0), settings(settings0), _rand(rand0), _cuda(cuda) { if (_cuda) { hb = settings->getHB(0); sb = settings->getSB(0); sb_old = sb; hb_old = hb; initGPU(sb->getF(), sb->getMaxX(), sb->getMaxY(), hb->getF(), hb->getMaxX(), hb->getMaxY()); } }
void initMMU(MMU * pmmu) { bzero((void*)pmmu,sizeof(MMU)); uint8_t bios[BIOS_SIZE] = __BIOS__ ; memcpy(pmmu->bios, bios, BIOS_SIZE); pmmu->bios_enabled = 1; pmmu->rom_bank = 1; pmmu->ram_bank_enable = 1; pmmu->mbc1_mode = MBC1_16_8_MODE; initGPU(&(pmmu->gpu)); #ifdef USE_ADDRESS_LUT fillAddressLUT(pmmu); #endif }
// ----------------------------------------------------------------------------- SceneManager::SceneManager(const glm::ivec2 screenSize, SceneContainer *container) : m_world(container->getWorld()), m_camera(container->getCamera()), #ifndef WINDOWS m_textManager(TextManager(FONT_PATH + FONT_FILE, FONT_HEIGHT, screenSize)), #endif m_drawer(screenSize), m_shadowManager(screenSize), m_BACKGROUND_COLOR(container->getBackgroundColor()), m_positionMutex(SDL_CreateMutex()) { m_lightMask = (2 << (m_world->getLightsNumber() - 1)) - 1; setupProjection(screenSize); initGPU(container); glClearColor(m_BACKGROUND_COLOR.x, m_BACKGROUND_COLOR.y, m_BACKGROUND_COLOR.z, m_BACKGROUND_COLOR.w); checkOpenGLError("GLInitializer: glClearColor"); }
void init_compression(queue * fifo,int maxit,int numb,int bsize) { maxiterations=maxit; numblocks=numb; blocksize=bsize; printf("Initializing the GPU\n"); initGPU(); //create consumer threades pthread_create (&congpu, NULL, gpu_consumer, fifo); pthread_create (&concpu, NULL, cpu_consumer, fifo); pthread_create (&consend, NULL, cpu_sender, fifo); return; }
int computeFitness(int * c_position, int * c_velocity, int * p_angle, int * p_velocity, int * fitness, int n) { if (!initiated) { initGPU(n); initiated = 1; } #pragma mark Writing memory // Allocate memory on the device to hold our data and store the results into buffer_size = sizeof(int) * n; err = clEnqueueWriteBuffer(cmd_queue, mem_c_position, CL_TRUE, 0, buffer_size, (void *) c_position, 0, NULL, NULL); err |= clEnqueueWriteBuffer(cmd_queue, mem_c_velocity, CL_TRUE, 0, buffer_size, (void *) c_velocity, 0, NULL, NULL); err |= clEnqueueWriteBuffer(cmd_queue, mem_p_angle, CL_TRUE, 0, buffer_size, (void *) p_angle, 0, NULL, NULL); err |= clEnqueueWriteBuffer(cmd_queue, mem_p_velocity, CL_TRUE, 0, buffer_size, (void *) p_velocity, 0, NULL, NULL); assert(err == CL_SUCCESS); // Get all of the stuff written and allocated clFinish(cmd_queue); #pragma mark Kernel Arguments // Now setup the arguments to our kernel err = clSetKernelArg(kernel[0], 0, sizeof(cl_mem), (void *) &mem_c_position); err |= clSetKernelArg(kernel[0], 1, sizeof(cl_mem), (void *) &mem_c_velocity); err |= clSetKernelArg(kernel[0], 2, sizeof(cl_mem), (void *) &mem_p_angle); err |= clSetKernelArg(kernel[0], 3, sizeof(cl_mem), (void *) &mem_p_velocity); err |= clSetKernelArg(kernel[0], 4, sizeof(cl_mem), (void *) &mem_fitness); assert(err == CL_SUCCESS); #pragma mark Execution and Reading memory // Run the calculation by enqueuing it and forcing the // command queue to complete the task size_t global_work_size = n; err = clEnqueueNDRangeKernel(cmd_queue, kernel[0], 1, NULL, &global_work_size, NULL, 0, NULL, NULL); assert(err == CL_SUCCESS); clFinish(cmd_queue); // Once finished read back the results from the answer // array into the results array err = clEnqueueReadBuffer(cmd_queue, mem_fitness, CL_TRUE, 0, buffer_size, fitness, 0, NULL, NULL); assert(err == CL_SUCCESS); clFinish(cmd_queue); return CL_SUCCESS; }
void gpu_init(H264Context *h) { GPUH264Context * const g = &h->gpu; MpegEncContext * const s = &h->s; int pic_width = 16*s->mb_width, pic_height = 16*s->mb_height; int i; tp_3d.texTarget = GL_TEXTURE_3D; tp_3d.texInternalFormat = GL_LUMINANCE8; tp_3d.texFormat = GL_LUMINANCE; if(g->init) { printf("gpu_init called twice (Why?)\n"); return; } //screenWidth = 1920, screenHeight = 1088; screenWidth = pic_width, screenHeight = pic_height; printf("Initializing GPU Context\n"); initGPU(screenWidth, screenHeight); initGPGPU(screenWidth, screenHeight); glEnable(tp_3d.texTarget); //RUDD TEMP DPB is fixed at 64 for now //Nearest Power of 2? g->dpb_tex = createTexture(screenWidth, screenHeight, 16, tp_3d); g->dpb_free = ~0x0; //RUDD TEST for comparison g->lum_residual = av_mallocz(s->linesize * s->mb_height * 16 * sizeof(short)); g->cr_residual = av_mallocz(s->uvlinesize * s->mb_height * 8 * sizeof(short)); g->cb_residual = av_mallocz(s->uvlinesize * s->mb_height * 8 * sizeof(short)); //RUDD TODO size? g->block_buffer = av_mallocz(9000*sizeof(H264mb)); g->init = 1; setup_shaders(g); }
int main(int argc, char* argv[]) { atexit(AtExit); if (argc < 2) { printf("Usage:\n"); #ifdef MODULE_SUPPORT printf("%s <in.ncch> [-d|-noscreen|-codepatch <code>|-modules <num> <in.ncch>|-overdrivlist <num> <services>|-sdmc <path>|-sysdata <path>|-sdwrite|-slotone|-configsave|-gdbport <port>]\n", argv[0]); #else printf("%s <in.ncch> [-d|-noscreen|-codepatch <code>|-sdmc <path>|-sysdata <path>|-sdwrite|-slotone|-configsave|-gdbport <port>]\n", argv[0]); #endif return 1; } //disasm = (argc > 2) && (strcmp(argv[2], "-d") == 0); //noscreen = (argc > 2) && (strcmp(argv[2], "-noscreen") == 0); for (int i = 2; i < argc; i++) { if ((strcmp(argv[i], "-d") == 0))disasm = true; else if ((strcmp(argv[i], "-noscreen") == 0))noscreen = true; else if ((strcmp(argv[i], "-codepatch") == 0)) { i++; codepath = malloc(strlen(argv[i])); strcpy(codepath, argv[i]); } else if ((strcmp(argv[i], "-sdmc") == 0)) { i++; strcpy(config_sdmc_path, argv[i]); config_has_sdmc = true; } else if ((strcmp(argv[i], "-sysdata") == 0)) { i++; strcpy(config_sysdataoutpath, argv[i]); config_usesys = true; } else if ((strcmp(argv[i], "-sdwrite") == 0))config_slotone = true; else if ((strcmp(argv[i], "-slotone") == 0))config_sdmcwriteable = true; else if ((strcmp(argv[i], "-configsave") == 0))config_nand_cfg_save = true; #ifdef GDB_STUB if ((strcmp(argv[i], "-gdbport") == 0)) { i++; global_gdb_port = atoi(argv[i]); if (global_gdb_port < 1 || global_gdb_port > 65535) { DEBUG("ARM9 GDB stub port must be in the range 1 to 65535\n"); exit(-1); } gdb_ctrl_iface.stall = stall_cpu; gdb_ctrl_iface.unstall = unstall_cpu; gdb_ctrl_iface.read_reg = read_cpu_reg; gdb_ctrl_iface.set_reg = set_cpu_reg; gdb_ctrl_iface.install_post_ex_fn = install_post_exec_fn; gdb_ctrl_iface.remove_post_ex_fn = remove_post_exec_fn; gdb_base_memory_iface.prefetch16 = gdb_prefetch16; gdb_base_memory_iface.prefetch32 = gdb_prefetch32; gdb_base_memory_iface.read32 = gdb_read32; gdb_base_memory_iface.write32 = gdb_write32; gdb_base_memory_iface.read16 = gdb_read16; gdb_base_memory_iface.write16 = gdb_write16; gdb_base_memory_iface.read8 = gdb_read8; gdb_base_memory_iface.write8 = gdb_write8; } #endif if (i >= argc)break; #ifdef MODULE_SUPPORT if ((strcmp(argv[i], "-modules") == 0)) { i++; modulenum = atoi(argv[i]); modulenames = malloc(sizeof(char*)*modulenum); i++; for (int j = 0; j < modulenum; j++) { *(modulenames + j) = malloc(strlen(argv[i])); strcpy(*(modulenames + j), argv[i]); i++; } } if (i >= argc)break; if ((strcmp(argv[i], "-overdrivlist") == 0)) { i++; overdrivnum = atoi(argv[i]); overdrivnames = malloc(sizeof(char*)*modulenum); i++; for (int j = 0; j < modulenum; j++) { *(overdrivnames + j) = malloc(strlen(argv[i])); strcpy(*(overdrivnames + j), argv[i]); i++; } } if (i >= argc)break; #endif } #ifdef MODULE_SUPPORT curprocesshandlelist = malloc(sizeof(u32)*(modulenum + 1)); ModuleSupport_MemInit(modulenum); #endif signal(SIGINT, AtSig); if (!noscreen) screen_Init(); hid_spvr_init(); hid_user_init(); initDSP(); mcu_GPU_init(); initGPU(); srv_InitGlobal(); arm11_Init(); #ifdef MODULE_SUPPORT int i; for (i = 0; i<modulenum; i++) { u32 handzwei = handle_New(HANDLE_TYPE_PROCESS, 0); curprocesshandle = handzwei; *(curprocesshandlelist + i) = handzwei; ModuleSupport_SwapProcessMem(i); u32 hand = handle_New(HANDLE_TYPE_THREAD, 0); threads_New(hand); // Load file. FILE* fd = fopen(*(modulenames + i), "rb"); if (fd == NULL) { perror("Error opening file"); return 1; } if (loader_LoadFile(fd) != 0) { fclose(fd); return 1; } } u32 handzwei = handle_New(HANDLE_TYPE_PROCESS, 0); *(curprocesshandlelist + modulenum) = handzwei; ModuleSupport_SwapProcessMem(modulenum); #else u32 handzwei = handle_New(HANDLE_TYPE_PROCESS, 0); curprocesshandle = handzwei; #endif FILE* fd = fopen(argv[1], "rb"); if (fd == NULL) { perror("Error opening file"); return 1; } u32 hand = handle_New(HANDLE_TYPE_THREAD, 0); threads_New(hand); // Load file. if (loader_LoadFile(fd) != 0) { fclose(fd); return 1; } #ifdef GDB_STUB if (global_gdb_port) { gdb_stub = createStub_gdb(global_gdb_port, &gdb_memio, &gdb_base_memory_iface); if (gdb_stub == NULL) { DEBUG("Failed to create ARM9 gdbstub on port %d\n", global_gdb_port); exit(-1); } activateStub_gdb(gdb_stub, &gdb_ctrl_iface); } #endif // Execute. while (running) { if (!noscreen) screen_HandleEvent(); #ifdef MODULE_SUPPORT int k; for (k = 0; k <= modulenum; k++) { ModuleSupport_SwapProcess(k); DEBUG("Process %X\n",k); } #else threads_Execute(); #endif if (!noscreen) screen_RenderGPU(); FPS_Lock(); //mem_Dbugdump(); } fclose(fd); return 0; }
void* CDFAlign::ThreadFunc_cuAlign(void* p) { CDFAlign *pThis=(CDFAlign *)p; APARA ¶=pThis->m_para; pThis->m_bRun=true; char str[512]; int i,j; bool bSuccess=false; //open stack file MRC stack; if(stack.open(pThis->m_fnStack,"rb")<=0) { sprintf(str,"Error: Failed to open stack %s .",pThis->m_fnStack); Message(str); pThis->m_bRun=false; return (void *)0; } //get image size int nx=stack.getNx(); int ny=stack.getNy(); int nz=stack.getNz(); sprintf(str,"\nInput Stack: Nx(%d) Ny(%d) Nz(%d)\n\n",nx,ny,nz); pThis->TextOutput(str); int bin=para.bin; if(bin<=0) return (void *)0; int offsetx=para.crop_offsetx; int offsety=para.crop_offsety; DIM nsamUnbin=verifyCropSize(nx,ny, offsetx, offsety, para.crop_nsam,bin); if(nsamUnbin.x<=0 || nsamUnbin.y<=0 ) { Message("Error: Wrong image Size."); pThis->m_bRun=false; return (void *)0; } DIM nsam=nsamUnbin/bin; int nsamb=nsam.width()+2; if(bin==1) sprintf(str,"Crop Image: Offset(%d %d) Dim(%d %d)\n\n", offsetx,offsety,nsamUnbin.x,nsamUnbin.y); else sprintf(str,"Crop Image: Offset(%d %d) RawDim(%d %d) BinnedDim(%d %d)\n\n", offsetx,offsety,nsamUnbin.x,nsamUnbin.y,nsam.x,nsam.y); pThis->TextOutput(str); pThis->m_nsam=nsam; //pThis->m_nsamRaw=nx/bin; //allocate memeory size_t size=nsam.width()*nsam.height(); size_t sizeb=nsamb*nsam.height(); int sizebUnbin=(nsamUnbin.width()+2)*nsamUnbin.height(); if(para.nStart<0) para.nStart=0; if(para.nEnd>=nz) para.nEnd=nz-1; int nframe=para.nEnd-para.nStart+1; pThis->UpdateDisplay(); //host memory float *bufmrc=new float[nx*ny]; float *bufmrcfft=new float[sizebUnbin]; float *htmp=new float[sizeb]; float *hbuf=new float[sizeb*nframe]; //host memory for entir stack float *hdisp=new float[sizeb]; float *hsumRaw=new float[sizeb]; float *hsumCorr=new float[sizeb]; float *hFSCRaw0=new float[sizeb]; //even number float *hFSCRaw1=new float[sizeb]; //odd float *hFSCCorr0=new float[sizeb]; //even number float *hFSCCorr1=new float[sizeb]; //odd size_t refsize=0; if(para.bDark) { if(pThis->m_pDark!=0) delete [] pThis->m_pDark; pThis->m_pDark=ReadRef(pThis->m_fnDark,nx,ny); if(pThis->m_pDark==0) { Message("Failed to get dark reference."); pThis->m_bRun=false; return (void *)0; } refsize+=nx*ny; } if(para.bGain) { if(pThis->m_pGain!=0) delete [] pThis->m_pGain; pThis->m_pGain=ReadRef(pThis->m_fnGain,nx,ny); if(pThis->m_pGain==0) { Message("Failed to get gain reference."); pThis->m_bRun=false; return (void *)0; } refsize+=nx*ny; } sprintf(str,"Allocate host memory: %f Gb\n",(nx*ny+sizeb*(nframe+8)+sizebUnbin+refsize)/256.0/1024.0/1024.0); pThis->TextOutput(str); if(hbuf==0) { if(bufmrc!=NULL) delete [] bufmrc; Message("Failed to allocate host memeory."); pThis->m_bRun=false; return (void *)0; } //device memory bool success=initGPU(para.GPUNum); if(!success) { sprintf(str,"Failed to initialize GPU #%d.",para.GPUNum); Message(str); delete [] bufmrc; delete [] hbuf; pThis->m_bRun=false; return (void *)0; } float *dsum=0; float *dsumcorr=0; float *dfft=0; float *dtmp=0; GPUMemAlloc((void **)&dsum,sizeof(float)*sizeb); GPUMemAlloc((void **)&dsumcorr,sizeof(float)*sizeb); GPUMemAlloc((void **)&dtmp,sizeof(float)*sizeb); cufftHandle fft_plan,ifft_plan; //prepare fft for unbinned image fft_plan=GPUFFTPlan(nsamUnbin); GPUSync(); GPUMemAlloc((void **)&dfft,sizeof(float)*sizebUnbin); //make a list int sizec=(nsam.width()/2+1)*nsam.height(); MASK *hPosList=new MASK[sizec]; MASK *dPosList=0; MkPosList(hPosList,nsam,para.bfactor); GPUMemAlloc((void **)&dPosList,sizeof(MASK)*sizec); GPUMemH2D((void **)dPosList,(void **)hPosList,sizeof(MASK)*sizec); size_t theFree, theTotal; GPUMemCheck(theFree,theTotal); sprintf(str,"GPU memory: free:%.0fMb total:%.0fMb\n", theFree/1024.0/1024.0, theTotal/1024.0/1024.0); pThis->TextOutput(str); //Read stack pThis->TextOutput("\nRead stack:\n"); float sx=0; float sy=0; float shiftx,shifty,cc; float avgcc=0.0; bool bFSCEven=true; //prepare frame sum numbers //Only the frame inside sum range is used for stack and sum output int nStartSum=para.nStartSum-para.nStart; int nEndSum=para.nEndSum-para.nStart; if(nStartSum<0) nStartSum=0; if(para.nEndSum>para.nEnd) nEndSum=para.nEnd-para.nStart+1; if(nEndSum<=nStartSum || nEndSum>=nframe) nEndSum=nframe-1; //1. calculate sum GPUMemZero((void **)&dsum,sizeof(float)*sizeb); GPUSync(); GPUMemZero((void **)&dsumcorr,sizeof(float)*sizeb); GPUSync(); for(j=para.nStart;j<=para.nEnd;j++) { //read from file and crop if(stack.read2DIm_32bit(bufmrc,j)!=stack.getImSize()) { sprintf(str,"Error when reading #%03d\n",j); pThis->TextOutput(str); } if(para.flp) { //flip image along y axis FlipYAxis(bufmrc, nx, ny); } //apply gain and dark reference if(!ApplyRef(bufmrc, para.bDark, pThis->m_pDark, para.bGain, pThis->m_pGain, nx*ny)) { sprintf(str,"Error when applying dark and/or gain to #%03d\n",j); pThis->TextOutput(str); } crop2fft(bufmrc,DIM(nx,ny),bufmrcfft,offsetx,offsety,nsamUnbin,bin); //copy to GPU GPUMemH2D((void *)dfft,(void *)bufmrcfft,sizeof(float)*sizebUnbin); //do fft GPUFFT2d(dfft,fft_plan); GPUSync(); //do binning if(bin>1) { GPUMemBinD2D(dtmp, dfft, nsam, nsamUnbin); GPUMemD2D(dfft, dtmp, sizeof(float)*sizeb); } //Sum if( (j-para.nStart)>=nStartSum && (j-para.nStart)<=nEndSum ) { if(bFSCEven) GPUAdd(dsum,dfft,sizeb); else GPUAdd(dsumcorr,dfft,sizeb); bFSCEven=!bFSCEven; } //copy ffted image to host GPUMemD2H((void *)(hbuf+(j-para.nStart)*sizeb),(void *)dfft,sizeof(float)*sizeb); GPUSync(); sprintf(str,"......Read and sum frame #%03d mean:%f\n",j,(hbuf+(j-para.nStart)*sizeb)[0]/nsam.x/nsam.y); pThis->TextOutput(str); } GPUMemD2H((void *)hFSCRaw0,(void *)dsum,sizeof(float)*sizeb); GPUMemD2H((void *)hFSCRaw1,(void *)dsumcorr,sizeof(float)*sizeb); GPUAdd(dsum,dsumcorr,sizeb); GPUSync(); //free memory for unbined image delete [] bufmrcfft; bufmrcfft=0; GPUMemFree((void **)&dfft); GPUFFTDestroy(fft_plan); fft_plan=0; //finish GPU memory allocate GPUMemAlloc((void **)&dfft,sizeof(float)*sizeb); GPUMemZero((void **)&dsumcorr,sizeof(float)*sizeb); GPUSync(); ifft_plan=GPUIFFTPlan(nsam); GPUSync(); //copy sum image to host for save and display if(para.bDispFFTRaw || para.bSaveRawSum) { GPUIFFT2d(dsum,ifft_plan); GPUSync(); GPUMultiplyNum(dsum,1.0/size,sizeb); GPUMemD2H((void *)hsumRaw,(void *)dsum,sizeof(float)*sizeb); fft2buf(bufmrc,hsumRaw,nsam); } //save MRC mrcraw; if(para.bSaveRawSum) { //write to file mrcraw.open(pThis->m_fnRawsum,"wb"); mrcraw.createMRC(bufmrc,nsam.width(),nsam.height(),1); //stats sprintf(str,"Mean=%f Min=%f Max=%f\n", mrcraw.m_header.dmean,mrcraw.m_header.dmin,mrcraw.m_header.dmax); pThis->TextOutput(str); mrcraw.close(); sprintf(str,"Save Uncorrected Sum to: %s\n",pThis->m_fnRawsum); pThis->TextOutput(str); } //save un-corrected stack MRC stackRaw; if(para.bSaveStackRaw) { pThis->TextOutput("\nWrite uncorrected stack:\n"); stackRaw.open(pThis->m_fnStackRaw,"wb"); stackRaw.m_header.nx=nsam.x; stackRaw.m_header.ny=nsam.y; stackRaw.m_header.nz=nEndSum-nStartSum+1; stackRaw.updateHeader(); for(j=nStartSum;j<=nEndSum;j++) { //copy to GPU GPUMemH2D((void *)dfft,(void *)(hbuf+j*sizeb),sizeof(float)*sizeb); //ifft GPUIFFT2d(dfft,ifft_plan); GPUSync(); GPUMultiplyNum(dfft,1.0/size,sizeb); GPUSync(); GPUMemD2H((void *)htmp,(void *)dfft,sizeof(float)*sizeb); fft2buf(bufmrc,htmp,nsam); stackRaw.write2DIm(bufmrc,j-nStartSum); sprintf(str,"......Write frame #%03d\n",j+para.nStart); pThis->TextOutput(str); } MinMaxMean(bufmrc,nsam.x*nsam.y, stackRaw.m_header.dmin, stackRaw.m_header.dmax, stackRaw.m_header.dmean); stackRaw.updateHeader(); sprintf(str,"Mean=%f Min=%f Max=%f\n", stackRaw.m_header.dmean,stackRaw.m_header.dmin,stackRaw.m_header.dmax); pThis->TextOutput(str); stackRaw.close(); sprintf(str,"Save Uncorrected Stack to: %s\n",pThis->m_fnStackRaw); pThis->TextOutput(str); } // added by Wen Jiang // generate running average frames for CC float *hbuf_orig=hbuf; float *hbuf_ra=0; if(para.nrw>1) { sprintf(str,"\nStart generating running average of %d frames\n",para.nrw); pThis->TextOutput(str); hbuf_ra = new float[sizeb*nframe]; //host memory for entire stack memcpy(hbuf_ra, hbuf, sizeof(float)*sizeb*nframe); // hbuf stores FFT of all frames for(int fi = 0; fi<nframe; fi++) { sprintf(str,"......Generating runing average of frame %d\n",fi); pThis->TextOutput(str); for(int ri=0; ri<para.nrw; ri++) { int fi2 = fi - para.nrw/2 + ri; if(fi2<0 || fi2==fi || fi2>=nframe) continue; for(size_t pi=0; pi<sizeb; pi++) hbuf_ra[sizeb*fi+pi]+=hbuf[sizeb*fi2+pi]; } } hbuf = hbuf_ra; sprintf(str,"Running average of %d frames are generated\n\n",para.nrw); pThis->TextOutput(str); } //2. frame to frame shift pThis->TextOutput("\nCalculate relative drift between frames\n"); Matrix<complex<double> > A; vector<complex<int> > compList; int ncomp=OD_SetEquation_All(A,compList, nframe, para.FrameDistOffset); Vector<complex<double> > b=Vector<complex<double> >(ncomp); int box=para.CCPeakSearchDim; float *hboxmap=new float[box*box*ncomp]; int par0,par1; for(j=0;j<ncomp;j++) { par0=compList[j].real(); par1=compList[j].imag(); //copy to GPU GPUMemH2D((void *)dsum,(void *)(hbuf+par0*sizeb),sizeof(float)*sizeb); GPUMemH2D((void *)dfft,(void *)(hbuf+par1*sizeb),sizeof(float)*sizeb); //shift and cc sx=0; sy=0; GPUShiftCC(dfft, dsum, dPosList,sx, sy, nsam); GPUSync(); //do ifft GPUIFFT2d(dfft,ifft_plan); GPUSync(); //find shift cc=FindShift(dfft,nsam, hboxmap+j*box*box, box, shiftx, shifty, para.NoisePeakSize-1); b[j]=complex<double>(shiftx,shifty); avgcc+=cc; sprintf(str,"......%03d Frame #%03d VS #%03d xy-shift: %8.4f %8.4f CC:%f\n",j,par0+para.nStart,par1+para.nStart,shiftx,shifty,cc); pThis->TextOutput(str); } // added by Wen Jiang // restore original stack buffer and delete running averages hbuf=hbuf_orig; if(hbuf_ra) delete[] hbuf_ra; //3. sovle overdetermined equation Vector<complex<double> > shift=lsSolver(A,b); Vector<double> ki=abs(A*shift-b); sprintf(str,"\n......ki: First round \n"); pThis->TextOutput(str); for(j=0;j<ki.size();j++) { par0=compList[j].real(); par1=compList[j].imag(); sprintf(str,"......ki #%03d of Frame #%03d VS #%03d: %8.4lf \n",j+para.nStart,par0+para.nStart,par1+para.nStart,ki[j]); pThis->TextOutput(str); } sprintf(str,"................................Average ki: %8.4lf \n\n",sum(ki)/ki.size()); pThis->TextOutput(str); //display CCMap if(para.bDispCCMap) { pThis->CCMapOutput(hboxmap,(void *)&ki); } //3.1 re-sovle overdetermined equation after removing large ki elments double kiThresh=para.kiThresh; vector<int> goodlist=OD_Threshold(A, b, ki, kiThresh); shift=lsSolver(A,b); ki=abs(A*shift-b); sprintf(str,"......ki: Second round \n"); pThis->TextOutput(str); for(j=0;j<ki.size();j++) { par0=compList[goodlist[j] ].real(); par1=compList[goodlist[j] ].imag(); sprintf(str,"......ki #%03d of Frame #%03d VS #%03d: %8.4f \n",j+para.nStart,par0+para.nStart,par1+para.nStart,ki[j]); pThis->TextOutput(str); } sprintf(str,"................................Average ki: %8.4lf \n\n",sum(ki)/ki.size()); pThis->TextOutput(str); //output final shift //calculate average shift double avgshift=0.0; for(j=0;j<shift.size();j++) avgshift+=abs(shift[j]); avgshift/=shift.size(); sprintf(str,"Final shift (Average %8.4lf pixels/frame):\n",avgshift); //output pThis->TextOutput(str); vector<complex<double> > shiftlist; complex<double> totalshift=0; sprintf(str,"......Shift of Frame #%03d : %8.4f %8.4f\n",para.nStart,totalshift.real(),totalshift.imag()); pThis->TextOutput(str); shiftlist.push_back(totalshift); for(j=0;j<shift.size();j++) { totalshift=totalshift+shift[j]; sprintf(str,"......Shift of Frame #%03d : %8.4f %8.4f\n",j+para.nStart+1,totalshift.real(),totalshift.imag()); pThis->TextOutput(str); shiftlist.push_back(totalshift); } pThis->PlotOutput(shiftlist); //save CCMap image if(para.bSaveCCmap) { buf2mrc(pThis->m_fnCCmap,hboxmap,box,box,ncomp); sprintf(str,"Save CC map to: %s\n",pThis->m_fnCCmap); pThis->TextOutput(str); } MRC stackCorr; if(para.bSaveStackCorr) { stackCorr.open(pThis->m_fnStackCorr,"wb"); stackCorr.m_header.nx=nsam.x; stackCorr.m_header.ny=nsam.y; stackCorr.m_header.nz=nEndSum-nStartSum+1; stackCorr.updateHeader(); } //3. correct xy-shift //reset memory GPUMemZero((void **)&dsum,sizeof(float)*sizeb); GPUSync(); GPUMemZero((void **)&dsumcorr,sizeof(float)*sizeb); GPUSync(); //calculate middle frame shift complex<double> midshift=0.0; int RefFrame=0; if(para.bAlignToMid==1) { if(para.nStart > 0 || para.nEnd > 0) { RefFrame = para.nStart + (para.nEnd - para.nStart) / 2; } else { RefFrame=nz/2+1; } } if(para.bAlignToMid<=0) RefFrame=abs(para.bAlignToMid); if(para.bAlignToMid!=0) { if(RefFrame<para.nStart) RefFrame=para.nStart; if(RefFrame>para.nEnd) RefFrame=para.nEnd; if(para.nStartSum>para.nEnd) para.nStartSum=para.nEnd; for(j=0;j<RefFrame-para.nStart;j++) midshift+=shift[j]; } sprintf(str,"\nSum Frame #%03d - #%03d (Reference Frame #%03d):\n",nStartSum+para.nStart,nEndSum+para.nStart,RefFrame); pThis->TextOutput(str); //Add(copy) first frame to GPU totalshift=0; for(j=1;j<nStartSum+1;j++) { totalshift+=shift[j-1]; } GPUMemH2D((void *)dsumcorr,(void *)(hbuf+nStartSum*sizeb),sizeof(float)*sizeb); if(para.bAlignToMid) GPUShift(dsumcorr,dPosList,-totalshift.real()+midshift.real(),-totalshift.imag()+midshift.imag(), nsam); GPUSync(); bFSCEven=false; sprintf(str,"......Add Frame #%03d with xy shift: %8.4lf %8.4lf\n",nStartSum+para.nStart,-totalshift.real()+midshift.real(),-totalshift.imag()+midshift.imag()); pThis->TextOutput(str); //Save stack if(para.bSaveStackCorr) { GPUMemD2D((void *)dfft,(void *)dsumcorr,sizeof(float)*sizeb); GPUIFFT2d(dfft,ifft_plan); GPUSync(); GPUMultiplyNum(dfft,1.0/size,sizeb); GPUSync(); GPUMemD2H((void *)htmp,(void *)dfft,sizeof(float)*sizeb); fft2buf(bufmrc,htmp,nsam); stackCorr.write2DIm(bufmrc,0); } //******* //sum other frame for(j=nStartSum+1;j<=nEndSum;j++) { totalshift+=shift[j-1]; //copy to GPU GPUMemH2D((void *)dfft,(void *)(hbuf+j*sizeb),sizeof(float)*sizeb); //shift GPUShift(dfft,dPosList,-totalshift.real()+midshift.real(),-totalshift.imag()+midshift.imag(), nsam); GPUSync(); //Sum if(bFSCEven) GPUAdd(dsumcorr,dfft,sizeb); else GPUAdd(dsum,dfft,sizeb); bFSCEven=!bFSCEven; sprintf(str,"......Add Frame #%03d with xy shift: %8.4lf %8.4lf\n",j+para.nStart,-totalshift.real()+midshift.real(),-totalshift.imag()+midshift.imag()); pThis->TextOutput(str); //save stack if(para.bSaveStackCorr) { GPUIFFT2d(dfft,ifft_plan); GPUSync(); GPUMultiplyNum(dfft,1.0/size,sizeb); GPUSync(); GPUMemD2H((void *)htmp,(void *)dfft,sizeof(float)*sizeb); fft2buf(bufmrc,htmp,nsam); stackCorr.write2DIm(bufmrc,j-nStartSum); } } if(para.bSaveStackCorr) { MinMaxMean(bufmrc,nsam.x*nsam.y,stackCorr.m_header.dmin, stackCorr.m_header.dmax, stackCorr.m_header.dmean); } //final sum GPUMemD2H((void *)hFSCCorr0,(void *)dsumcorr,sizeof(float)*sizeb); GPUMemD2H((void *)hFSCCorr1,(void *)dsum,sizeof(float)*sizeb); GPUAdd(dsumcorr,dsum,sizeb); GPUSync(); //copy binned sum to display if(para.bDispSumCorr) { DIM dispdim(DISPDIM,DISPDIM); if(nsam.x<nsam.y) dispdim.x=int(DISPDIM*float(nsam.x)/float(nsam.y)+0.5)/2*2; if(nsam.x>nsam.y) dispdim.y=int(DISPDIM*float(nsam.y)/float(nsam.x)+0.5)/2*2; pThis->m_dispdim=dispdim; GPUMemBinD2H(hdisp, dsumcorr, dispdim, nsam); pThis->ImageOutput(hdisp); } //copy sum image to host float *tsum=dsumcorr; GPUIFFT2d(tsum,ifft_plan); GPUMultiplyNum(tsum,1.0/size,sizeb); GPUMemD2H((void *)hsumCorr,(void *)tsum,sizeof(float)*sizeb); fft2buf(bufmrc,hsumCorr,nsam); //save MRC mrc; mrc.open(pThis->m_fnAlignsum,"wb"); mrc.createMRC(bufmrc,nsam.width(),nsam.height(),1); //stats sprintf(str,"Mean=%f Min=%f Max=%f\n",mrc.m_header.dmean,mrc.m_header.dmin,mrc.m_header.dmax); pThis->TextOutput(str); mrc.close(); sprintf(str,"Save Corrected Sum to: %s\n",pThis->m_fnAlignsum); pThis->TextOutput(str); //close save Corrected stack if(para.bSaveStackCorr) { stackCorr.updateHeader(); pThis->TextOutput("\nWrite corrected stack:\n"); sprintf(str,"Mean=%f Min=%f Max=%f\n", stackCorr.m_header.dmean,stackCorr.m_header.dmin,stackCorr.m_header.dmax); pThis->TextOutput(str); stackCorr.close(); sprintf(str,"Save Corrected Stack to: %s\n",pThis->m_fnStackCorr); pThis->TextOutput(str); } if(para.bLogFSC) { complex<double> avgshift=0.0; for(i=0;i<shift.size();i++) { avgshift+=shift[i]/abs(shift[i]); } pThis->PlotFSC((cuComplex *)hFSCRaw0, (cuComplex *)hFSCRaw1, (cuComplex *)hFSCCorr0, (cuComplex *)hFSCCorr1,hPosList,nsam,avgshift); } //free GPU FFT plan, new plan will be created for rectangular image //GPUFFTDestroy(fft_plan); GPUFFTDestroy(ifft_plan); /////////////////////////// DIM nsamsub=nsam.MinSquare(); //prepare new fft if(para.bDispFFTRaw || para.bDispFFTCorr) fft_plan=GPUFFTPlan(nsamsub); //Make Raw fft modulus for display if(para.bDispFFTRaw) { GPUMemH2D((void *)dsum,(void *)hsumRaw,sizeof(float)*sizeb); GPURectFFTLogModulus(dfft, dsum, dtmp, dsumcorr, nsam, para.fftscale,fft_plan); //copy to host, make pwr image GPUMemD2H(htmp,dfft,sizeof(float)*(nsamsub.width()/2+1)*nsamsub.height()); FFTModulusToDispBuf(htmp,hdisp, nsamsub); //copy back to device GPUMemH2D(dtmp,hdisp,sizeof(float)*(nsamsub.width()+2)*nsamsub.height()); //do binning GPUBinFFT(dfft, DISPDIM, dtmp, nsamsub, fft_plan); GPUMemD2H((void *)hdisp,(void *)dfft,sizeof(float)*(DISPDIM+2)*DISPDIM); //display pThis->FFTOutputRaw(hdisp); } //Make Corrected fft modulus for display if(para.bDispFFTCorr) { GPUMemH2D((void *)dsum,(void *)hsumCorr,sizeof(float)*sizeb); GPURectFFTLogModulus(dfft, dsum, dtmp, dsumcorr, nsam, para.fftscale,fft_plan); //copy to host, make pwr image GPUMemD2H(htmp,dfft,sizeof(float)*(nsamsub.width()/2+1)*nsamsub.height()); FFTModulusToDispBuf(htmp,hdisp, nsamsub); //copy back to device GPUMemH2D(dtmp,hdisp,sizeof(float)*(nsamsub.width()+2)*nsamsub.height()); //do binning GPUBinFFT(dfft, DISPDIM, dtmp, nsamsub, fft_plan); GPUMemD2H((void *)hdisp,(void *)dfft,sizeof(float)*(DISPDIM+2)*DISPDIM); //display pThis->FFTOutputCorr(hdisp); } //destory fft if(para.bDispFFTRaw || para.bDispFFTCorr) GPUFFTDestroy(fft_plan); ///////////////////////////////////////// delete [] bufmrc; delete [] hbuf; delete [] hPosList; GPUMemFree((void **)&dPosList); GPUMemFree((void **)&dsum); GPUMemFree((void **)&dsumcorr); GPUMemFree((void **)&dfft); GPUMemFree((void **)&dtmp); delete [] htmp; delete [] hboxmap; delete [] hdisp; delete [] hsumRaw; delete [] hsumCorr; delete [] hFSCRaw0; delete [] hFSCRaw1; delete [] hFSCCorr0; delete [] hFSCCorr1; ResetGPU(); pThis->Done(); sprintf(str,"Done.\n"); pThis->TextOutput(str); return (void *)0; }
void Host::initialize() { initCPU(); initGPU(); initOS(); }
int main(int argc, char** argv) { const int n = NN; const int m = NM; const int iter_max = 500; const double tol = 1.0e-6; double error = 1.0; memset(A, 0, n * m * sizeof(double)); memset(Anew, 0, n * m * sizeof(double)); for (int j = 0; j < n; j++) { A[j][0] = 1.0; Anew[j][0] = 1.0; } #ifdef _OPENACC initGPU(argc, argv); #endif printf("Jacobi relaxation Calculation: %d x %d mesh\n", n, m); StartTimer(); int iter = 0; #pragma acc data copy(A[0:NN][0:NM]), create(Anew[0:NN][0:NM]) while ( error > tol && iter < iter_max ) { error = 0.0; #pragma acc kernels present(A,Anew) { #pragma acc loop gang(512) for( int j = 1; j < n-1; j++) { for( int i = 1; i < m-1; i++ ) { Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1] + A[j-1][i] + A[j+1][i]); error = fmax( error, fabs(Anew[j][i] - A[j][i])); } } #pragma acc loop gang(512) for( int j = 1; j < n-1; j++) { for( int i = 1; i < m-1; i++ ) { A[j][i] = Anew[j][i]; } } } if(iter % 100 == 0) printf("%5d, %0.6f\n", iter, error); iter++; } double runtime = GetTimer(); /* int passed = checkSolution(error, &A[0][0], NN, NM, iter_max); if (passed == 0) { printf("Error check passed.\n"); } else if (passed > 0) { printf("Error check failed.\n"); } */ printf(" total : %f s\n", runtime / 1000); return 0; }