int _main(void) { // Init and resolve libraries initKernel(); initLibc(); initNetwork(); // Connect to server and send message char socketName[] = "debug"; struct sockaddr_in server; server.sin_len = sizeof(server); server.sin_family = AF_INET; server.sin_addr.s_addr = IP(192, 168, 0, 4); server.sin_port = sceNetHtons(9023); memset(server.sin_zero, 0, sizeof(server.sin_zero)); int sock = sceNetSocket(socketName, AF_INET, SOCK_STREAM, 0); sceNetConnect(sock, (struct sockaddr *)&server, sizeof(server)); debug(sock, "PID: %d", syscall(20)); sceNetSocketClose(sock); // Return to browser return 0; }
int _main(void) { // Init and resolve libraries initKernel(); initLibc(); initNetwork(); initPthread(); // Init netdebug struct sockaddr_in server; server.sin_family = sceNetHtons(AF_INET); server.sin_addr.s_addr = IP(192, 168, 0, 4); server.sin_port = sceNetHtons(9023); memset(server.sin_zero, 0, sizeof(server.sin_zero)); netdbg_sock = sceNetSocket("netdebug", AF_INET, SOCK_STREAM, 0); sceNetConnect(netdbg_sock, (struct sockaddr *)&server, sizeof(server)); ftp_init(PS4_IP, PS4_PORT); //INFO("PS4 listening on IP %s Port %i\n", PS4_IP, PS4_PORT); while (1) { sceKernelUsleep(100 * 1000); } ftp_fini(); return 0; }
static RCCResult run(RCCWorker *self, RCCBoolean timedOut, RCCBoolean *newRunCondition) { Gaussian_blurProperties *p = self->properties; Gaussian_blurState *s = self->memories[0]; RCCPort *in = &self->ports[GAUSSIAN_BLUR_IN], *out = &self->ports[GAUSSIAN_BLUR_OUT]; const RCCContainer *c = self->container; (void)timedOut; // End state: just send the zero length message to indicate "done" // This will be unnecessary when EOS indication is fixed if (self->runCondition == &end) { out->output.length = 0; c->advance(out, 0); return RCC_DONE; } // Current buffer unsigned cur = s->inLine % HISTORY_SIZE; // First line: init if(s->inLine == 0) { initKernel(p->sigmaX, p->sigmaY); } // Second line else if(s->inLine == 1) { memset(out->current.data, 0, LINE_BYTES); out->output.length = LINE_BYTES; c->advance(out, LINE_BYTES); } // Middle line else if(s->inLine > 1) { doLine(s->buffers[(cur - 2 + HISTORY_SIZE) % HISTORY_SIZE].data, s->buffers[(cur - 1 + HISTORY_SIZE) % HISTORY_SIZE].data, in->current.data, out->current.data, p->width); out->output.length = LINE_BYTES; c->advance(out, LINE_BYTES); } // Go to next unsigned prev = (cur - 2 + HISTORY_SIZE) % HISTORY_SIZE; if(s->inLine < HISTORY_SIZE - 1) c->take(in, NULL, &s->buffers[cur]); else c->take(in, &s->buffers[prev], &s->buffers[cur]); s->inLine++; // Arrange to send the zero-length message after the last line of last image // This will be unnecessary when EOS indication is fixed if (in->input.length == 0) { self->runCondition = &end; *newRunCondition = 1; } return RCC_OK; }
int main(void) { initKernel(); //Threads anlegen usw. createKernelThreads(); startKernel(); return 0; }
void TestOpenCL::process() { //create opencl context and command queue if(!initOpenCl()) return; //create texture, must be done after openCl context created (at least for AMD) if(!initTexture()) return; //load and compile kernel if(!initKernel()) return; cl::Event event; cl_int status; cl::NDRange globalThreads(m_testImage.width(), m_testImage.height()); int pos=0; int outputIndex=0; while(m_process) { //setup kernel arguments status=m_kernel.setArg(0, *m_testImage.image()); if(status != CL_SUCCESS) assert(false); status=m_kernel.setArg(1, *m_outputImage[outputIndex].image()); if(status != CL_SUCCESS) assert(false); status=m_kernel.setArg(2, pos); if(status != CL_SUCCESS) assert(false); //execute kernel status=m_openCLComandQueue.enqueueNDRangeKernel(m_kernel, cl::NullRange, globalThreads, cl::NullRange, NULL, &event); if(status != CL_SUCCESS) assert(false); m_openCLComandQueue.flush(); pos++; if(pos > m_testImage.width()) pos=0; event.wait(); m_openCLComandQueue.finish(); m_mainView->display(&m_outputImage[outputIndex]); m_auxView->display(&m_outputImage[outputIndex]); outputIndex++; if(outputIndex >= m_outputImage.size()) outputIndex=0; } m_openCLComandQueue.finish(); }
int _main(void) { // Init and resolve libraries initKernel(); initLibc(); initNetwork(); // Connect to server char socketName[] = "debug"; struct sockaddr_in server; server.sin_len = sizeof(server); server.sin_family = AF_INET; server.sin_addr.s_addr = IP(192, 168, 0, 4); server.sin_port = sceNetHtons(9023); memset(server.sin_zero, 0, sizeof(server.sin_zero)); int sock = sceNetSocket(socketName, AF_INET, SOCK_STREAM, 0); sceNetConnect(sock, (struct sockaddr *)&server, sizeof(server)); // Get font path char path[256] = "/"; int length = 11; getSandboxDirectory(path + 1, &length); strcpy(path + 11, "/common/font/DFHEI5-SONY.ttf"); // Open for reading, and get size int fd = open(path, O_RDONLY, 0); struct stat s; fstat(fd, &s); // Allocate buffer and read char *buffer = malloc(s.st_size); read(fd, buffer, s.st_size); close(fd); // Send sceNetSend(sock, buffer, s.st_size, 0); free(buffer); sceNetSocketClose(sock); // Return to browser return 0; }
void btGpuDemo3dOCLWrap::initKernels() { initKernel(GPUDEMO3D_KERNEL_CLEAR_ACCUM_IMPULSE, "kClearAccumImpulse"); setKernelArg(GPUDEMO3D_KERNEL_CLEAR_ACCUM_IMPULSE, 1, sizeof(cl_mem), (void*)&m_dLambdaDtBox); setKernelArg(GPUDEMO3D_KERNEL_CLEAR_ACCUM_IMPULSE, 2, sizeof(int), (void*)&m_maxPointsPerConstr); initKernel(GPUDEMO3D_KERNEL_COLLISION_WITH_WALL, "kCollisionWithWallBox"); setKernelArg(GPUDEMO3D_KERNEL_COLLISION_WITH_WALL, 1, sizeof(cl_mem), (void*)&m_dTrans); setKernelArg(GPUDEMO3D_KERNEL_COLLISION_WITH_WALL, 2, sizeof(cl_mem), (void*)&m_dVel); setKernelArg(GPUDEMO3D_KERNEL_COLLISION_WITH_WALL, 3, sizeof(cl_mem), (void*)&m_dAngVel); setKernelArg(GPUDEMO3D_KERNEL_COLLISION_WITH_WALL, 4, sizeof(cl_mem), (void*)&m_dParams); initKernel(GPUDEMO3D_KERNEL_SOLVE_CONSTRAINTS, "kSolveConstraint"); setKernelArg(GPUDEMO3D_KERNEL_SOLVE_CONSTRAINTS, 1, sizeof(cl_mem), (void*)&m_dIds); setKernelArg(GPUDEMO3D_KERNEL_SOLVE_CONSTRAINTS, 2, sizeof(cl_mem), (void*)&m_dBatchIds); setKernelArg(GPUDEMO3D_KERNEL_SOLVE_CONSTRAINTS, 3, sizeof(cl_mem), (void*)&m_dTrans); setKernelArg(GPUDEMO3D_KERNEL_SOLVE_CONSTRAINTS, 4, sizeof(cl_mem), (void*)&m_dVel); setKernelArg(GPUDEMO3D_KERNEL_SOLVE_CONSTRAINTS, 5, sizeof(cl_mem), (void*)&m_dAngVel); setKernelArg(GPUDEMO3D_KERNEL_SOLVE_CONSTRAINTS, 6, sizeof(cl_mem), (void*)&m_dLambdaDtBox); setKernelArg(GPUDEMO3D_KERNEL_SOLVE_CONSTRAINTS, 7, sizeof(cl_mem), (void*)&m_dPositionConstraint); setKernelArg(GPUDEMO3D_KERNEL_SOLVE_CONSTRAINTS, 8, sizeof(cl_mem), (void*)&m_dNormal); setKernelArg(GPUDEMO3D_KERNEL_SOLVE_CONSTRAINTS, 9, sizeof(cl_mem), (void*)&m_dContact); initKernel(GPUDEMO3D_KERNEL_INTEGRATE_VELOCITIES, "kIntegrateVelocities"); setKernelArg(GPUDEMO3D_KERNEL_INTEGRATE_VELOCITIES, 1, sizeof(cl_mem), (void*)&m_dForceTorqueDamp); setKernelArg(GPUDEMO3D_KERNEL_INTEGRATE_VELOCITIES, 2, sizeof(cl_mem), (void*)&m_dInvInertiaMass); setKernelArg(GPUDEMO3D_KERNEL_INTEGRATE_VELOCITIES, 3, sizeof(cl_mem), (void*)&m_dVel); setKernelArg(GPUDEMO3D_KERNEL_INTEGRATE_VELOCITIES, 4, sizeof(cl_mem), (void*)&m_dAngVel); initKernel(GPUDEMO3D_KERNEL_INTEGRATE_TRANSFORMS, "kIntegrateTransforms"); setKernelArg(GPUDEMO3D_KERNEL_INTEGRATE_TRANSFORMS, 1, sizeof(cl_mem), (void*)&m_dTrans); setKernelArg(GPUDEMO3D_KERNEL_INTEGRATE_TRANSFORMS, 2, sizeof(cl_mem), (void*)&m_dVel); setKernelArg(GPUDEMO3D_KERNEL_INTEGRATE_TRANSFORMS, 3, sizeof(cl_mem), (void*)&m_dAngVel); setKernelArg(GPUDEMO3D_KERNEL_INTEGRATE_TRANSFORMS, 4, sizeof(cl_mem), (void*)&m_dInvInertiaMass); }
bool System::start() { be_ptr32_t<Thread> thread; // Initialize kernel initKernel(); // Create main thread ExCreateThread(reinterpret_cast<be_uint32_t*>(&thread), mBinary.header.defaultStackSize.size, nullptr, nullptr, reinterpret_cast<void*>(mBinary.header.entryPoint.address), nullptr, 0); // Swap pointer back to little endian memory... thread->join(); return true; }
double OpenCLPricer::priceImplTriangle(OptionSpec& optionSpec, int stepSize) { if (stepSize >= 512) { std::cerr << "[Error] Step size not valid." << "Cannot have more than 512 work items per work group" << std::endl; exit(5); } // ------------------------Derived Parameters------------------------------ float deltaT = optionSpec.yearsToMaturity / optionSpec.numSteps; float upFactor = exp(optionSpec.volatility * sqrt(deltaT)); float downFactor = 1.0f / upFactor; float discountFactor = exp(optionSpec.riskFreeRate * deltaT); float upWeight = (discountFactor - downFactor) / (upFactor - downFactor); float downWeight = 1.0f - upWeight; // Create buffers on the devices cl::Buffer valueBuffer(*context, CL_MEM_READ_WRITE, sizeof(float) * (optionSpec.numSteps + 1)); cl::Buffer triangleBuffer(*context, CL_MEM_READ_WRITE, sizeof(float) * (optionSpec.numSteps + 1)); // Create qeueue to push commands for the devices cl::CommandQueue queue(*context, *defaultDevice); // Build and run init kernel cl::Kernel initKernel(*program, "init"); initKernel.setArg(0, optionSpec.stockPrice); initKernel.setArg(1, optionSpec.strikePrice); initKernel.setArg(2, optionSpec.numSteps); initKernel.setArg(3, optionSpec.type); initKernel.setArg(4, deltaT); initKernel.setArg(5, upFactor); initKernel.setArg(6, downFactor); initKernel.setArg(7, valueBuffer); queue.enqueueNDRangeKernel(initKernel, cl::NullRange, cl::NDRange(optionSpec.numSteps + 1), cl::NullRange); // std::cout << "[INFO] Executing init kernel with " << optionSpec.numSteps + 1 // << " work items" << std::endl; // Block until init kernel finishes execution queue.enqueueBarrierWithWaitList(); // Note(disiok): Here we use work groups of size stepSize + 1 // so that after each iteration, the number of nodes is reduced by stepSize int groupSize = stepSize + 1; cl::Kernel upKernel(*program, "upTriangle"); upKernel.setArg(0, upWeight); upKernel.setArg(1, downWeight); upKernel.setArg(2, discountFactor); upKernel.setArg(3, valueBuffer); upKernel.setArg(4, cl::Local(sizeof(float) * groupSize)); upKernel.setArg(5, triangleBuffer); cl::Kernel downKernel(*program, "downTriangle"); downKernel.setArg(0, upWeight); downKernel.setArg(1, downWeight); downKernel.setArg(2, discountFactor); downKernel.setArg(3, valueBuffer); downKernel.setArg(4, cl::Local(sizeof(float) * groupSize)); downKernel.setArg(5, triangleBuffer); for (int i = 0; i < optionSpec.numSteps / stepSize; i ++) { int numWorkGroupsUp = optionSpec.numSteps / stepSize - i; int numWorkGroupsDown = numWorkGroupsUp - 1; int numWorkItemsUp = numWorkGroupsUp * groupSize; int numWorkItemsDown = numWorkGroupsDown * groupSize; queue.enqueueNDRangeKernel(upKernel, cl::NullRange, cl::NDRange(numWorkItemsUp)), cl::NDRange(groupSize); // std::cout << "[INFO] Executing up kernel with " << numWorkGroupsUp // << " work groups and " << groupSize << " work items per group" // << std::endl; queue.enqueueBarrierWithWaitList(); if (numWorkGroupsDown > 0) { queue.enqueueNDRangeKernel(downKernel, cl::NullRange, cl::NDRange(numWorkItemsDown)), cl::NDRange(groupSize); // std::cout << "[INFO] Executing down kernel with " << numWorkGroupsDown // << " work groups and " << groupSize << " work items per group" // << std::endl; queue.enqueueBarrierWithWaitList(); } } // Read results float* value = new float; queue.enqueueReadBuffer(valueBuffer, CL_TRUE, 0, sizeof(float), value); return *value; }
/** * Algorithm: * init kernel: * Use (optionSpec.numSteps + 1) work-items to compute the option values * at expiry * Only executed once * * group kernel: * Each work-item calculate the previous option values of (groupSize) * lattice points * Kernel executed (optionSpec.numSteps) times * Each execution reduces the number of lattice points by 1 */ double OpenCLPricer::priceImplGroup(OptionSpec& optionSpec, int groupSize) { // ------------------------Derived Parameters------------------------------ float deltaT = optionSpec.yearsToMaturity / optionSpec.numSteps; float upFactor = exp(optionSpec.volatility * sqrt(deltaT)); float downFactor = 1.0f / upFactor; float discountFactor = exp(optionSpec.riskFreeRate * deltaT); float upWeight = (discountFactor - downFactor) / (upFactor - downFactor); float downWeight = 1.0f - upWeight; // Create buffers on the devices cl::Buffer valueBufferA(*context, CL_MEM_READ_WRITE, sizeof(float) * (optionSpec.numSteps + 1)); cl::Buffer valueBufferB(*context, CL_MEM_READ_WRITE, sizeof(float) * (optionSpec.numSteps + 1)); // Create qeueue to push commands for the devices cl::CommandQueue queue(*context, *defaultDevice); // Build and run init kernel cl::Kernel initKernel(*program, "init"); initKernel.setArg(0, optionSpec.stockPrice); initKernel.setArg(1, optionSpec.strikePrice); initKernel.setArg(2, optionSpec.numSteps); initKernel.setArg(3, optionSpec.type); initKernel.setArg(4, deltaT); initKernel.setArg(5, upFactor); initKernel.setArg(6, downFactor); initKernel.setArg(7, valueBufferA); queue.enqueueNDRangeKernel(initKernel, cl::NullRange, cl::NDRange(optionSpec.numSteps + 1), cl::NullRange); // std::cout << "[INFO] Executing init kernel with " << optionSpec.numSteps + 1 // << " work items" << std::endl; // Block until init kernel finishes execution queue.enqueueBarrierWithWaitList(); // Build and run group kernel cl::Kernel groupKernel(*program, "group"); groupKernel.setArg(0, upWeight); groupKernel.setArg(1, downWeight); groupKernel.setArg(2, discountFactor); for (int i = 1; i <= optionSpec.numSteps; i ++) { int numLatticePoints = optionSpec.numSteps + 1 - i; int numWorkItems = ceil((float) numLatticePoints / groupSize); groupKernel.setArg(3, i % 2 == 1 ? valueBufferA : valueBufferB); groupKernel.setArg(4, i % 2 == 1 ? valueBufferB: valueBufferA); groupKernel.setArg(5, numLatticePoints); groupKernel.setArg(6, groupSize); queue.enqueueNDRangeKernel(groupKernel, cl::NullRange, cl::NDRange(numWorkItems), cl::NullRange); // std::cout << "[INFO] Executing group kernel with " << numWorkItems // << " work items" << std::endl; queue.enqueueBarrierWithWaitList(); } // Read results float* value = new float; queue.enqueueReadBuffer(optionSpec.numSteps % 2 == 1? valueBufferB : valueBufferA, CL_TRUE, 0, sizeof(float), value); return *value; }
TextureGpuNUFFTOperator(IndType kernelWidth, IndType sectorWidth, DType osf, Dimensions imgDims): GpuNUFFTOperator(kernelWidth,sectorWidth,osf,imgDims,false,TEXTURE),interpolationType(gpuNUFFT::TEXTURE2D_LOOKUP) { initKernel(); }
void btParticlesDynamicsWorld::initCLKernels(int argc, char** argv) { cl_int ciErrNum; if (!m_cxMainContext) { // m_cxMainContext = clCreateContextFromType(0, CL_DEVICE_TYPE_ALL, NULL, NULL, &ciErrNum); m_cxMainContext = btOclCommon::createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum); oclCHECKERROR(ciErrNum, CL_SUCCESS); m_cdDevice = btOclGetMaxFlopsDev(m_cxMainContext); btOclPrintDevInfo(m_cdDevice); // create a command-queue m_cqCommandQue = clCreateCommandQueue(m_cxMainContext, m_cdDevice, 0, &ciErrNum); oclCHECKERROR(ciErrNum, CL_SUCCESS); } // Program Setup size_t program_length; char* fileName = "ParticlesOCL.cl"; FILE * fp = fopen(fileName, "rb"); char newFileName[512]; if (fp == NULL) { sprintf(newFileName,"..//%s",fileName); fp = fopen(newFileName, "rb"); if (fp) fileName = newFileName; } if (fp == NULL) { sprintf(newFileName,"Demos//ParticlesOpenCL//%s",fileName); fp = fopen(newFileName, "rb"); if (fp) fileName = newFileName; } if (fp == NULL) { sprintf(newFileName,"..//..//..//..//..//Demos//ParticlesOpenCL//%s",fileName); fp = fopen(newFileName, "rb"); if (fp) fileName = newFileName; else { printf("cannot find %s\n",newFileName); exit(0); } } // char *source = oclLoadProgSource(".//Demos//SpheresGrid//SpheresGrid.cl", "", &program_length); //char *source = btOclLoadProgSource(".//Demos//SpheresOpenCL//Shared//SpheresGrid.cl", "", &program_length); char *source = btOclLoadProgSource(fileName, "", &program_length); if(source == NULL) { printf("ERROR : OpenCL can't load file %s\n", fileName); } // oclCHECKERROR (source == NULL, oclFALSE); btAssert(source != NULL); // create the program printf("OpenCL compiles %s ...", fileName); m_cpProgram = clCreateProgramWithSource(m_cxMainContext, 1, (const char**)&source, &program_length, &ciErrNum); oclCHECKERROR(ciErrNum, CL_SUCCESS); free(source); // build the program ciErrNum = clBuildProgram(m_cpProgram, 0, NULL, "-I .", NULL, NULL); if(ciErrNum != CL_SUCCESS) { // write out standard error // oclLog(LOGBOTH | ERRORMSG, (double)ciErrNum, STDERROR); // write out the build log and ptx, then exit char cBuildLog[10240]; // char* cPtx; // size_t szPtxLength; clGetProgramBuildInfo(m_cpProgram, btOclGetFirstDev(m_cxMainContext), CL_PROGRAM_BUILD_LOG, sizeof(cBuildLog), cBuildLog, NULL ); // oclGetProgBinary(m_cpProgram, oclGetFirstDev(m_cxMainContext), &cPtx, &szPtxLength); // oclLog(LOGBOTH | CLOSELOG, 0.0, "\n\nLog:\n%s\n\n\n\n\nPtx:\n%s\n\n\n", cBuildLog, cPtx); printf("\n\n%s\n\n\n", cBuildLog); printf("Press ENTER key to terminate the program\n"); getchar(); exit(-1); } printf("OK\n"); // create the kernels postInitDeviceData(); initKernel(PARTICLES_KERNEL_COMPUTE_CELL_ID, "kComputeCellId"); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_COMPUTE_CELL_ID].m_kernel, 1, sizeof(cl_mem), (void*) &m_dPos); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_COMPUTE_CELL_ID].m_kernel, 2, sizeof(cl_mem), (void*) &m_dPosHash); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_COMPUTE_CELL_ID].m_kernel, 3, sizeof(cl_mem), (void*) &m_dSimParams); oclCHECKERROR(ciErrNum, CL_SUCCESS); initKernel(PARTICLES_KERNEL_INTEGRATE_MOTION, "kIntegrateMotion"); ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_INTEGRATE_MOTION].m_kernel, 1, sizeof(cl_mem), (void *) &m_dPos); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_INTEGRATE_MOTION].m_kernel, 2, sizeof(cl_mem), (void *) &m_dVel); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_INTEGRATE_MOTION].m_kernel, 3, sizeof(cl_mem), (void *) &m_dSimParams); oclCHECKERROR(ciErrNum, CL_SUCCESS); initKernel(PARTICLES_KERNEL_CLEAR_CELL_START, "kClearCellStart"); ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_CLEAR_CELL_START].m_kernel, 0, sizeof(int), (void *) &m_numGridCells); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_CLEAR_CELL_START].m_kernel, 1, sizeof(cl_mem), (void*) &m_dCellStart); initKernel(PARTICLES_KERNEL_FIND_CELL_START, "kFindCellStart"); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 1, sizeof(cl_mem), (void*) &m_dPosHash); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 2, sizeof(cl_mem), (void*) &m_dCellStart); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 3, sizeof(cl_mem), (void*) &m_dPos); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 4, sizeof(cl_mem), (void*) &m_dVel); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 5, sizeof(cl_mem), (void*) &m_dSortedPos); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 6, sizeof(cl_mem), (void*) &m_dSortedVel); oclCHECKERROR(ciErrNum, CL_SUCCESS); initKernel(PARTICLES_KERNEL_COLLIDE_PARTICLES, "kCollideParticles"); ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 1, sizeof(cl_mem), (void*) &m_dVel); ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 2, sizeof(cl_mem), (void*) &m_dSortedPos); ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 3, sizeof(cl_mem), (void*) &m_dSortedVel); ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 4, sizeof(cl_mem), (void*) &m_dPosHash); ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 5, sizeof(cl_mem), (void*) &m_dCellStart); ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 6, sizeof(cl_mem), (void*) &m_dSimParams); initKernel(PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_LOCAL, "kBitonicSortCellIdLocal"); initKernel(PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_LOCAL_1, "kBitonicSortCellIdLocal1"); initKernel(PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_GLOBAL, "kBitonicSortCellIdMergeGlobal"); initKernel(PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_LOCAL, "kBitonicSortCellIdMergeLocal"); }
//------------------------------------------------------------------------------ int main(int argc, char** argv) { if(argc < 5) { std::cout << "usage: " << argv[0] << " <platform id(0, 1, ...)>" " <device type: default | cpu | gpu | acc>" " <device id(0, 1, ...)>" " <number of double prec. elements>\n"; exit(EXIT_FAILURE); } std::vector<cl::Platform> platforms; std::vector<cl::Device> devices; const int platformID = atoi(argv[1]); cl_device_type deviceType; const std::string kernelName(argv[4]); const std::string dt = std::string(argv[2]); if(dt == "default") deviceType = CL_DEVICE_TYPE_DEFAULT; else if(dt == "cpu") deviceType = CL_DEVICE_TYPE_CPU; else if(dt == "gpu") deviceType = CL_DEVICE_TYPE_GPU; else if(dt == "acc") deviceType = CL_DEVICE_TYPE_ACCELERATOR; else { std::cerr << "ERROR - unrecognized device type " << dt << std::endl; exit(EXIT_FAILURE); } const int deviceID = atoi(argv[3]); const size_t SIZE = atoll(argv[4]); const size_t BYTE_SIZE = SIZE * sizeof(real_t); // init MPI environment MPI_Init(&argc, &argv); int task = -1; MPI_Comm_rank(MPI_COMM_WORLD, &task); try { //OpenCL init cl::Platform::get(&platforms); if(platforms.size() <= platformID) { std::cerr << "Platform id " << platformID << " is not available\n"; exit(EXIT_FAILURE); } platforms[platformID].getDevices(deviceType, &devices); cl::Context context(devices); cl::CommandQueue queue(context, devices[deviceID], CL_QUEUE_PROFILING_ENABLE); std::vector< real_t > data(SIZE, -1); //device buffer #1: holds local data cl::Buffer devData(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR //<-- page locked memory | CL_MEM_COPY_HOST_PTR, //<-- copy data from 'data' BYTE_SIZE, const_cast< double* >(&data[0])); //device buffer #2: holds data received from other node cl::Buffer devRecvData(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, BYTE_SIZE); //process data on the GPU(set array elements to local MPI id) const char CLCODE_INIT[] = "#pragma OPENCL EXTENSION cl_khr_fp64: enable\n" "typedef double real_t;\n" "__kernel void arrayset(__global real_t* outputArray,\n" " real_t value) {\n" "//get global thread id for dimension 0\n" "const int id = get_global_id(0);\n" "outputArray[id] = value;\n" "}"; cl::Program::Sources initSource(1, std::make_pair(CLCODE_INIT, sizeof(CLCODE_INIT))); cl::Program initProgram(context, initSource); initProgram.build(devices); cl::Kernel initKernel(initProgram, "arrayset"); initKernel.setArg(0, devData); initKernel.setArg(1, real_t(task)); queue.enqueueNDRangeKernel(initKernel, cl::NDRange(0), cl::NDRange(SIZE), cl::NDRange(1)); //perform data exchange: //1) map device buffers to host memory void* sendHostPtr = queue.enqueueMapBuffer(devData, CL_FALSE, CL_MAP_READ, 0, BYTE_SIZE); if(sendHostPtr == 0) throw std::runtime_error("NULL mapped ptr"); void* recvHostPtr = queue.enqueueMapBuffer(devRecvData, CL_FALSE, CL_MAP_WRITE, 0, BYTE_SIZE); if(recvHostPtr == 0) throw std::runtime_error("NULL mapped ptr"); queue.finish(); //2) copy data to from remote process const int tag0to1 = 0x01; const int tag1to0 = 0x10; MPI_Request send_req; MPI_Request recv_req; int source = -1; int dest = -1; if(task == 0 ) { source = 1; dest = 1; } else { source = 0; dest = 0; } MPI_Status status; if(task == 0) { MPI_Isend(sendHostPtr, SIZE, MPI_DOUBLE, dest, tag0to1, MPI_COMM_WORLD, &send_req); MPI_Irecv(recvHostPtr, SIZE, MPI_DOUBLE, source, tag1to0, MPI_COMM_WORLD, &recv_req); } else { MPI_Isend(sendHostPtr, SIZE, MPI_DOUBLE, dest, tag1to0, MPI_COMM_WORLD, &send_req); MPI_Irecv(recvHostPtr, SIZE, MPI_DOUBLE, source, tag0to1, MPI_COMM_WORLD, &recv_req); } //3) as soon as data is copied do unmap buffers, indirectlry // triggering a host --> device copy MPI_Wait(&recv_req, &status); queue.enqueueUnmapMemObject(devRecvData, recvHostPtr); MPI_Wait(&send_req, &status); queue.enqueueUnmapMemObject(devData, sendHostPtr); //note that instead of having each process compile the code //you could e.g. send the size and content of the source buffer //to each process from root; or even send the precompiled code, //in this case all nodes of the clusted must be the same whereas //in the case of source code compilation hybrid systems are //automatically supported by OpenCL //process data on the GPU: increment local data array with value //received from other process const char CLCODE_COMPUTE[] = "#pragma OPENCL EXTENSION cl_khr_fp64: enable\n" "typedef double real_t;\n" "__kernel void sum( __global const real_t* in,\n" " __global real_t* inout) {\n" "const int id = get_global_id(0);\n" "inout[id] += in[id];\n" "}"; cl::Program::Sources computeSource(1, std::make_pair(CLCODE_COMPUTE, sizeof(CLCODE_COMPUTE))); cl::Program computeProgram(context, computeSource); computeProgram.build(devices); cl::Kernel computeKernel(computeProgram, "sum"); computeKernel.setArg(0, devRecvData); computeKernel.setArg(1, devData); queue.enqueueNDRangeKernel(computeKernel, cl::NDRange(0), cl::NDRange(SIZE), cl::NDRange(1)); //map device data to host memory for validation and output real_t* computedDataHPtr = reinterpret_cast< real_t* >( queue.enqueueMapBuffer(devData, CL_FALSE, CL_MAP_READ, 0, BYTE_SIZE)); if(computedDataHPtr == 0) throw std::runtime_error("NULL mapped ptr"); queue.finish(); const int value = 1; // task id 0 + task id 1 const std::vector< real_t > reference(SIZE, value); if(std::equal(computedDataHPtr, computedDataHPtr + SIZE, reference.begin())) { std::cout << '[' << task << "]: PASSED" << std::endl; } else { std::cout << '[' << task << "]: FAILED" << std::endl; } //release mapped pointer queue.enqueueUnmapMemObject(devData, computedDataHPtr); //release MPI resources MPI_Finalize(); } catch(cl::Error e) { std::cerr << e.what() << ": Error code " << e.err() << std::endl; MPI_Finalize(); exit(EXIT_FAILURE); } return 0; }
void btParticlesDynamicsWorld::initCLKernels(int argc, char** argv) { cl_int ciErrNum; if (!m_cxMainContext) { cl_device_type deviceType = CL_DEVICE_TYPE_ALL; m_cxMainContext = btOpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0, 0); int numDev = btOpenCLUtils::getNumDevices(m_cxMainContext); if (!numDev) { btAssert(0); exit(0);//this is just a demo, exit now } m_cdDevice = btOpenCLUtils::getDevice(m_cxMainContext,0); oclCHECKERROR(ciErrNum, CL_SUCCESS); btOpenCLDeviceInfo clInfo; btOpenCLUtils::getDeviceInfo(m_cdDevice,clInfo); btOpenCLUtils::printDeviceInfo(m_cdDevice); // create a command-queue m_cqCommandQue = clCreateCommandQueue(m_cxMainContext, m_cdDevice, 0, &ciErrNum); oclCHECKERROR(ciErrNum, CL_SUCCESS); } // Program Setup size_t program_length; #ifdef LOAD_FROM_MEMORY program_length = strlen(source); printf("OpenCL compiles ParticlesOCL.cl ... "); #else const char* fileName = "ParticlesOCL.cl"; FILE * fp = fopen(fileName, "rb"); char newFileName[512]; if (fp == NULL) { sprintf(newFileName,"..//%s",fileName); fp = fopen(newFileName, "rb"); if (fp) fileName = newFileName; } if (fp == NULL) { sprintf(newFileName,"Demos//ParticlesOpenCL//%s",fileName); fp = fopen(newFileName, "rb"); if (fp) fileName = newFileName; } if (fp == NULL) { sprintf(newFileName,"..//..//..//..//..//Demos//ParticlesOpenCL//%s",fileName); fp = fopen(newFileName, "rb"); if (fp) fileName = newFileName; else { printf("cannot find %s\n",newFileName); exit(0); } } // char *source = oclLoadProgSource(".//Demos//SpheresGrid//SpheresGrid.cl", "", &program_length); //char *source = btOclLoadProgSource(".//Demos//SpheresOpenCL//Shared//SpheresGrid.cl", "", &program_length); char *source = btOclLoadProgSource(fileName, "", &program_length); if(source == NULL) { printf("ERROR : OpenCL can't load file %s\n", fileName); } // oclCHECKERROR (source == NULL, oclFALSE); btAssert(source != NULL); // create the program printf("OpenCL compiles %s ...", fileName); #endif //LOAD_FROM_MEMORY //printf("%s\n", source); m_cpProgram = clCreateProgramWithSource(m_cxMainContext, 1, (const char**)&source, &program_length, &ciErrNum); oclCHECKERROR(ciErrNum, CL_SUCCESS); #ifndef LOAD_FROM_MEMORY free(source); #endif //LOAD_FROM_MEMORY //#define LOCAL_SIZE_LIMIT 1024U #define LOCAL_SIZE_MAX 1024U // Build the program with 'mad' Optimization option #ifdef MAC const char* flags = "-I. -DLOCAL_SIZE_MAX=1024U -cl-mad-enable -DMAC -DGUID_ARG"; #else const char* flags = "-I. -DLOCAL_SIZE_MAX=1024U -DGUID_ARG= "; #endif // build the program ciErrNum = clBuildProgram(m_cpProgram, 0, NULL, flags, NULL, NULL); if(ciErrNum != CL_SUCCESS) { // write out standard error // oclLog(LOGBOTH | ERRORMSG, (double)ciErrNum, STDERROR); // write out the build log and ptx, then exit char cBuildLog[10240]; // char* cPtx; // size_t szPtxLength; clGetProgramBuildInfo(m_cpProgram, m_cdDevice, CL_PROGRAM_BUILD_LOG, sizeof(cBuildLog), cBuildLog, NULL ); // oclGetProgBinary(m_cpProgram, oclGetFirstDev(m_cxMainContext), &cPtx, &szPtxLength); // oclLog(LOGBOTH | CLOSELOG, 0.0, "\n\nLog:\n%s\n\n\n\n\nPtx:\n%s\n\n\n", cBuildLog, cPtx); printf("\n\n%s\n\n\n", cBuildLog); printf("Press ENTER key to terminate the program\n"); getchar(); exit(-1); } printf("OK\n"); // create the kernels postInitDeviceData(); initKernel(PARTICLES_KERNEL_COMPUTE_CELL_ID, "kComputeCellId"); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_COMPUTE_CELL_ID].m_kernel, 1, sizeof(cl_mem), (void*) &m_dPos); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_COMPUTE_CELL_ID].m_kernel, 2, sizeof(cl_mem), (void*) &m_dPosHash); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_COMPUTE_CELL_ID].m_kernel, 3, sizeof(cl_mem), (void*) &m_dSimParams); oclCHECKERROR(ciErrNum, CL_SUCCESS); initKernel(PARTICLES_KERNEL_INTEGRATE_MOTION, "kIntegrateMotion"); ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_INTEGRATE_MOTION].m_kernel, 1, sizeof(cl_mem), (void *) &m_dPos); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_INTEGRATE_MOTION].m_kernel, 2, sizeof(cl_mem), (void *) &m_dVel); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_INTEGRATE_MOTION].m_kernel, 3, sizeof(cl_mem), (void *) &m_dSimParams); oclCHECKERROR(ciErrNum, CL_SUCCESS); initKernel(PARTICLES_KERNEL_CLEAR_CELL_START, "kClearCellStart"); ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_CLEAR_CELL_START].m_kernel, 0, sizeof(int), (void *) &m_numGridCells); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_CLEAR_CELL_START].m_kernel, 1, sizeof(cl_mem), (void*) &m_dCellStart); initKernel(PARTICLES_KERNEL_FIND_CELL_START, "kFindCellStart"); // ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 0, sizeof(int), (void*) &m_numParticles); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 1, sizeof(cl_mem), (void*) &m_dPosHash); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 2, sizeof(cl_mem), (void*) &m_dCellStart); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 3, sizeof(cl_mem), (void*) &m_dPos); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 4, sizeof(cl_mem), (void*) &m_dVel); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 5, sizeof(cl_mem), (void*) &m_dSortedPos); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 6, sizeof(cl_mem), (void*) &m_dSortedVel); oclCHECKERROR(ciErrNum, CL_SUCCESS); initKernel(PARTICLES_KERNEL_COLLIDE_PARTICLES, "kCollideParticles"); ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 1, sizeof(cl_mem), (void*) &m_dVel); ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 2, sizeof(cl_mem), (void*) &m_dSortedPos); ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 3, sizeof(cl_mem), (void*) &m_dSortedVel); ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 4, sizeof(cl_mem), (void*) &m_dPosHash); ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 5, sizeof(cl_mem), (void*) &m_dCellStart); ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 6, sizeof(cl_mem), (void*) &m_dSimParams); initKernel(PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_LOCAL, "kBitonicSortCellIdLocal"); initKernel(PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_LOCAL_1, "kBitonicSortCellIdLocal1"); initKernel(PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_GLOBAL, "kBitonicSortCellIdMergeGlobal"); initKernel(PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_LOCAL, "kBitonicSortCellIdMergeLocal"); }