void SXFunctionInternal::evaluateOpenCL() { // OpenCL return flag cl_int ret; // Set OpenCL Kernel Parameters int kernel_arg = 0; // Pass inputs for (int i=0; i<nIn(); ++i) { ret = clSetKernelArg(kernel_, kernel_arg++, sizeof(cl_mem), static_cast<void *>(&input_memobj_[i])); casadi_assert(ret == CL_SUCCESS); } // Pass outputs for (int i=0; i<nOut(); ++i) { ret = clSetKernelArg(kernel_, kernel_arg++, sizeof(cl_mem), static_cast<void *>(&output_memobj_[i])); casadi_assert(ret == CL_SUCCESS); } // Execute OpenCL Kernel executeKernel(kernel_); // Get outputs for (int i=0; i<output_memobj_.size(); ++i) { ret = clEnqueueReadBuffer(sparsity_propagation_kernel_.command_queue, output_memobj_[i], CL_TRUE, 0, outputNoCheck(i).size() * sizeof(cl_double), reinterpret_cast<void*>(outputNoCheck(i).ptr()), 0, NULL, NULL); casadi_assert(ret == CL_SUCCESS); } }
// copy image and process using OpenCL //***************************************************************************** void processImage() { // activate destination buffer glBindBuffer(GL_PIXEL_PACK_BUFFER_ARB, pbo_source); //// read data into pbo. note: use BGRA format for optimal performance glReadPixels(0, 0, image_width, image_height, GL_BGRA, GL_UNSIGNED_BYTE, NULL); if (bPostprocess) { if (iProcFlag == 0) { pboRegister(); executeKernel(blur_radius); pboUnregister(); } else { // map the PBOs glBindBufferARB(GL_PIXEL_PACK_BUFFER_ARB, pbo_source); glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo_dest); unsigned int* source_ptr = (unsigned int*)glMapBufferARB(GL_PIXEL_PACK_BUFFER_ARB, GL_READ_ONLY_ARB); unsigned int* dest_ptr = (unsigned int*)glMapBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, GL_WRITE_ONLY_ARB); // Postprocessing on the CPU postprocessHost(source_ptr, dest_ptr, image_width, image_height, 0, blur_radius, 0.8f, 4.0f); // umap the PBOs glUnmapBufferARB(GL_PIXEL_PACK_BUFFER_ARB); glBindBufferARB(GL_PIXEL_PACK_BUFFER_ARB, 0); glUnmapBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB); glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0); } // download texture from PBO glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo_dest); glBindTexture(GL_TEXTURE_2D, tex_screen); glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, image_width, image_height, GL_BGRA, GL_UNSIGNED_BYTE, NULL); } else { // download texture from PBO glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo_source); glBindTexture(GL_TEXTURE_2D, tex_screen); glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, image_width, image_height, GL_BGRA, GL_UNSIGNED_BYTE, NULL); } }
void boot(void) { bootData.totalSectors = diskParameter.totalSectors; bootData.sectorSize = diskParameter.sectorSize; bootData.mbr = (MBR *) codeStart; /* 内存探测应该这最先,这要用于分配缓冲大小 */ detectMemory(); /* 加载内核 */ loadKernel(); /* 设置高分辨率模式应该在最后,因为前面有文字打印操作 */ setVbeMode(getPreferredResolution()); executeKernel(); }
void SXFunctionInternal::spEvaluateOpenCL(bool fwd) { // OpenCL return flag cl_int ret; // Select a kernel cl_kernel kernel = fwd ? sp_fwd_kernel_ : sp_adj_kernel_; // Set OpenCL Kernel Parameters int kernel_arg = 0; // Pass inputs for (int i=0; i<nIn(); ++i) { ret = clSetKernelArg(kernel, kernel_arg++, sizeof(cl_mem), static_cast<void *>(&sp_input_memobj_[i])); casadi_assert(ret == CL_SUCCESS); } // Pass outputs for (int i=0; i<nOut(); ++i) { ret = clSetKernelArg(kernel, kernel_arg++, sizeof(cl_mem), static_cast<void *>(&sp_output_memobj_[i])); casadi_assert(ret == CL_SUCCESS); } // Execute OpenCL Kernel executeKernel(kernel); // Get inputs for (int i=0; i<sp_input_memobj_.size(); ++i) { ret = clEnqueueReadBuffer(sparsity_propagation_kernel_.command_queue, sp_input_memobj_[i], CL_TRUE, 0, inputNoCheck(i).size() * sizeof(cl_ulong), reinterpret_cast<void*>(inputNoCheck(i).ptr()), 0, NULL, NULL); casadi_assert(ret == CL_SUCCESS); } // Get outputs for (int i=0; i<sp_output_memobj_.size(); ++i) { ret = clEnqueueReadBuffer(sparsity_propagation_kernel_.command_queue, sp_output_memobj_[i], CL_TRUE, 0, outputNoCheck(i).size() * sizeof(cl_ulong), reinterpret_cast<void*>(outputNoCheck(i).ptr()), 0, NULL, NULL); casadi_assert(ret == CL_SUCCESS); } }
// Run a test sequence without any GL //***************************************************************************** void TestNoGL() { // execute OpenCL kernel without GL interaction cl_pbos[0] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, 4 * image_width * image_height, NULL, &ciErrNum); cl_pbos[1] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, 4 * image_width * image_height, NULL, &ciErrNum); // set the args values ciErrNum |= clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void *) &(cl_pbos[0])); ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void *) &(cl_pbos[1])); ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(image_width), &image_width); ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(image_width), &image_height); // Start timer 0 and process n loops on the GPU executeKernel(blur_radius); // Cleanup and exit Cleanup(EXIT_SUCCESS); }
void scan(clContext *clCxt,cl_mem &ginput,cl_mem &goutput,Plan *plan,int elemnum) { cl_mem gadsys; int steplength = ((plan->registergroup + plan->localmemgroup) * plan->cta) / plan->vectorlength; int taillen = elemnum - steplength * plan->vectorlength * plan->localthread * plan->workgroup, tailgroup; tailgroup = (taillen+2047)>>11; int adsyslen = plan->workgroup+tailgroup+2; int *adsys = new int[adsyslen]; memset(adsys,0,adsyslen*sizeof(int)); create(clCxt,&gadsys,adsyslen*sizeof(int)); upload(clCxt,(void *)adsys,&gadsys,adsyslen * sizeof(int)); int registersize = plan->registergroup * plan->cta; int localmemsize = plan->localmemgroup * plan->cta; char build_options[200]; if(plan->coalesced==1){ sprintf(build_options , "-D NB_VEC_%d -D NB_L%d -D NB_G%d -D NB_CTA_%d -D STEP_NUM=%d -D NB_REG_GRP=%d -D NB_REG_SIZE=%d -D NB_LOCAL_GRP=%d -D NB_LOCAL_SIZE=%d", plan->vectorlength,plan->localthread,plan->workgroup,plan->cta,steplength,plan->registergroup, registersize,plan->localmemgroup,localmemsize + 1); } else{ sprintf(build_options , "-D NB_VEC_%d -D NB_L%d -D NB_G%d -D NB_CTA_%d -D STEP_NUM=%d -D NB_REG_GRP=%d -D NB_REG_SIZE=%d -D NB_LOCAL_GRP=%d -D NB_LOCAL_SIZE=%d", plan->vectorlength,plan->localthread,plan->workgroup,plan->cta,steplength,plan->registergroup, registersize,plan->localmemgroup,localmemsize); } cout<<build_options<<endl; vector<pair<size_t ,const void *> > args; args.push_back( make_pair( sizeof(cl_mem) , (void *)&ginput )); args.push_back( make_pair( sizeof(cl_mem) , (void *)&gadsys )); args.push_back( make_pair( sizeof(cl_mem) , (void *)&goutput )); args.push_back( make_pair( sizeof(cl_int) , (void *)&plan->workgroup )); size_t globalthreads[3] = {plan->localthread * plan->workgroup,1,1}; size_t localthreads[3] = {plan->localthread,1,1}; timeRcd.kerneltime = 0; timeRcd.totaltime = 0; executeKernel("scan.cl","scan",args,globalthreads,localthreads,build_options,clCxt); if(taillen!=0){ sprintf(build_options , "-D NB_VEC_TAIL -D NB_L64 -D NB_G%d -D NB_CTA_16 -D STEP_NUM=32 -D NB_REG_GRP=1 -D NB_REG_SIZE=16 -D NB_LOCAL_GRP=1 -D NB_LOCAL_SIZE=17", tailgroup); args.clear(); args.push_back( make_pair( sizeof(cl_mem) , (void *)&ginput)); args.push_back( make_pair( sizeof(cl_mem) , (void *)&gadsys)); args.push_back( make_pair( sizeof(cl_mem) , (void *)&goutput)); args.push_back( make_pair( sizeof(cl_int) , (void *)&elemnum)); args.push_back( make_pair( sizeof(cl_int) , (void *)&plan->workgroup)); args.push_back( make_pair( sizeof(cl_int) , (void *)&tailgroup)); args.push_back( make_pair( sizeof(cl_int) , (void *)&taillen)); globalthreads[0] = 64 * tailgroup; localthreads[0] = 64; //timeRcd.kerneltime = 0; //timeRcd.totaltime = 0; executeKernel("scan.cl","scantail",args,globalthreads,localthreads,build_options,clCxt); } //#define PRINT_A #if defined PRINT_A download(clCxt,&gadsys,(void *)adsys,adsyslen*sizeof(int)); for(int i=0,k=1;i< adsyslen;k++) { cout <<setiosflags(ios::left) << "line:"<<k<<" "; for(int j=0;j < 10 && i< adsyslen;j++,i++) cout << setiosflags(ios::left) << setw(10) << adsys[i]; cout << endl; } #endif delete adsys; clReleaseMemObject(gadsys); }
int main ( int argc, char *argv[]){ int i; int state; int sizeOfDimension; int success = 1; int FTI_APP_RANK; herr_t status; threeD ***ptr = allocateLinearMemory(XSIZE, YSIZE, ZSIZE ); threeD *devPtr; int result; MPI_Init(&argc, &argv); result = FTI_Init(argv[1], MPI_COMM_WORLD); if (result == FTI_NREC) { exit(RECOVERY_FAILED); } int crash = atoi(argv[2]); int level = atoi(argv[3]); memset(&ptr[0][0][0],0, sizeof(threeD) * (XSIZE * YSIZE * ZSIZE)); int numGpus = getProperties(); MPI_Comm_rank(FTI_COMM_WORLD,&FTI_APP_RANK); setDevice(FTI_APP_RANK%numGpus); dictionary *ini = iniparser_load( argv[1] ); int grank; MPI_Comm_rank(MPI_COMM_WORLD,&grank); int nbHeads = (int)iniparser_getint(ini, "Basic:head", -1); int finalTag = (int)iniparser_getint(ini, "Advanced:final_tag", 3107); int nodeSize = (int)iniparser_getint(ini, "Basic:node_size", -1); int headRank = grank - grank%nodeSize; FTIT_complexType coordinateDef; FTIT_type threeDType; FTI_AddSimpleField( &coordinateDef, &FTI_INTG, offsetof( threeD, x),0, "X"); FTI_AddSimpleField( &coordinateDef, &FTI_INTG, offsetof( threeD, y),1, "y"); FTI_AddSimpleField( &coordinateDef, &FTI_INTG, offsetof( threeD, z),2, "z"); FTI_AddSimpleField( &coordinateDef, &FTI_INTG, offsetof( threeD, id),3, "id"); FTI_InitComplexType(&threeDType, &coordinateDef, 4 , sizeof(threeD), "ThreeD", NULL); if ( (nbHeads<0) || (nodeSize<0) ) { printf("wrong configuration (for head or node-size settings)! %d %d\n",nbHeads, nodeSize); MPI_Abort(MPI_COMM_WORLD, -1); } allocateMemory((void **) &devPtr, (XSIZE * YSIZE * ZSIZE*sizeof(threeD))); FTI_Protect(0, devPtr, (XSIZE * YSIZE * ZSIZE),threeDType); int dimLength[3] = {ZSIZE,YSIZE,XSIZE}; if (grank == 0) for ( i =0 ; i < 3; i++){ printf("Dimension is %d size is %d\n", dimLength[i], XSIZE*YSIZE*ZSIZE*sizeof(threeDType) / (1024*1024)); } FTI_DefineDataset(0, 3, dimLength , "GPU TOPOLOGY" , NULL); state = FTI_Status(); if ( state == INIT ){ executeKernel(devPtr); FTI_Checkpoint(1,level); if ( crash ) { if( nbHeads > 0 ) { int value = FTI_ENDW; MPI_Send(&value, 1, MPI_INT, headRank, finalTag, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); } MPI_Finalize(); exit(0); } }else{ result = FTI_Recover(); if (result != FTI_SCES) { exit(RECOVERY_FAILED); } hostCopy(devPtr, &ptr[0][0][0],(XSIZE * YSIZE * ZSIZE*sizeof(threeD))); } threeD ***validationMemory= allocateLinearMemory(XSIZE, YSIZE, ZSIZE ); initData(&validationMemory[0][0][0]); if (state == RESTART || state == KEEP) { int tmp; result = memcmp(&validationMemory[0][0][0], &ptr[0][0][0],(XSIZE * YSIZE * ZSIZE*sizeof(threeD))); MPI_Allreduce(&result, &tmp, 1, MPI_INT, MPI_SUM, FTI_COMM_WORLD); result = tmp; } deallocateLinearMemory(ZSIZE , ptr); deallocateLinearMemory(ZSIZE , validationMemory); freeCuda(devPtr); if (FTI_APP_RANK == 0 && (state == RESTART || state == KEEP)) { if (result == 0) { printf("[SUCCESSFUL]\n"); } else { printf("[NOT SUCCESSFUL]\n"); success=0; } } MPI_Barrier(FTI_COMM_WORLD); FTI_Finalize(); MPI_Finalize(); if (success == 1) return 0; else exit(DATA_CORRUPT); }