//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char** argv) { numParticles = 1024; uint gridDim = 64; numIterations = 1; cutGetCmdLineArgumenti( argc, (const char**) argv, "n", (int *) &numParticles); cutGetCmdLineArgumenti( argc, (const char**) argv, "grid", (int *) &gridDim); gridSize.x = gridSize.y = gridSize.z = gridDim; printf("grid: %d x %d x %d = %d cells\n", gridSize.x, gridSize.y, gridSize.z, gridSize.x*gridSize.y*gridSize.z); bool benchmark = !cutCheckCmdLineFlag(argc, (const char**) argv, "noqatest") != 0; cutGetCmdLineArgumenti( argc, (const char**) argv, "i", &numIterations); cudaInit(argc, argv); glutInit(&argc, argv); glutInitDisplayMode(GLUT_RGB | GLUT_DEPTH | GLUT_DOUBLE); glutInitWindowSize(640, 480); glutCreateWindow("CUDA particles"); initGL(); init(numParticles, gridSize); initParams(); initMenus(); if (benchmark) { if (numIterations <= 0) numIterations = 300; runBenchmark(numIterations); } else { glutDisplayFunc(display); glutReshapeFunc(reshape); glutMouseFunc(mouse); glutMotionFunc(motion); glutKeyboardFunc(key); glutSpecialFunc(special); glutIdleFunc(idle); glutMainLoop(); } if (psystem) delete psystem; cudaThreadExit(); return 0; }
void bw(uint64_t size) { uint8_t *H, *D; int i; printf("%"PRIu64" ", size); cudaInit(); printf("0 "); // reg func H = (uint8_t*)malloc(sizeof(uint8_t)*size); time_begin(); cudaMalloc((void**)&D, sizeof(uint8_t)*size); printf("%u ", time_end()); for(i=0; i<size; i++) { H[i]=i%255; } time_begin(); cudaMemcpy(D, H, size*sizeof(uint8_t), cudaMemcpyHostToDevice); printf("%u ", time_end()); printf("0 "); // exec kernel for(i=0; i<size; i++) { H[i]=0; } time_begin(); cudaMemcpy(H, D, size*sizeof(uint8_t), cudaMemcpyDeviceToHost); printf("%u ", time_end()); for(i=0; i<size; i++) { if(H[i]!=i%255) printf("error %d\n", i); } free(H); time_begin(); cudaFree(D); printf("%u ", time_end()); cudaFini(); printf("\n"); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { // Write the xyz file char outfile[256]; sprintf(outfile, "outfile.xyz"); if ((fpout = fopen(outfile, "w")) == NULL) { printf("Cannot Open File\n"); exit(1); } printf("%s Starting...\n\n", sSDKsample); numParticles = NUM_PARTICLES; uint gridDim = GRID_SIZE; numIterations = 0; gridSize.x = gridSize.y = gridSize.z = gridDim; printf("grid: %d x %d x %d = %d cells\n", gridSize.x, gridSize.y, gridSize.z, gridSize.x*gridSize.y*gridSize.z); printf("particles: %d\n", numParticles); cudaInit(argc, argv); initParticleSystem(numParticles, gridSize); printf("%e\n", timestep); //psystem->dumpParticles(0, numParticles-1, 0); if (numIterations <= 0) { numIterations = (int)(2.0/timestep); } std::cout << "1. I am here \n"; runBenchmark(numIterations, argv[0]); std::cout << "2. I am here \n"; if (psystem) { delete psystem; cleanup(); } cudaDeviceReset(); exit(g_TotalErrors > 0 ? EXIT_FAILURE : EXIT_SUCCESS); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char** argv) { // numParticles = 65536*2; // numParticles = 65536; // numParticles = 32768; // numParticles = 8192; // numParticles = 4096; numParticles = 2048; // numParticles = 1024; // numParticles = 256; // numParticles = 32; // numParticles = 2; uint gridDim = 64; numIterations = 0; gridSize.x = gridSize.y = gridSize.z = gridDim; printf("grid: %d x %d x %d = %d cells\n", gridSize.x, gridSize.y, gridSize.z, gridSize.x*gridSize.y*gridSize.z); cudaInit(argc, argv); glutInit(&argc, argv); glutInitDisplayMode(GLUT_RGB | GLUT_DEPTH | GLUT_DOUBLE); glutInitWindowSize(640, 480); glutCreateWindow("CUDA particles"); initGL(); init(numParticles, gridSize); initParams(); initMenus(); glutDisplayFunc(display); glutReshapeFunc(reshape); glutMouseFunc(mouse); glutMotionFunc(motion); glutKeyboardFunc(key); glutSpecialFunc(special); glutIdleFunc(idle); glutMainLoop(); if (psystem) delete psystem; return 0; }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { numParticles = NUM_PARTICLES; uint gridDim = GRID_SIZE; numIterations = 0; if (argc > 1) { if (checkCmdLineFlag(argc, (const char **) argv, "n")) { numParticles = getCmdLineArgumentInt(argc, (const char **)argv, "n"); } if (checkCmdLineFlag(argc, (const char **) argv, "grid")) { gridDim = getCmdLineArgumentInt(argc, (const char **) argv, "grid"); } } gridSize.x = gridSize.y = gridSize.z = gridDim; printf("grid: %d x %d x %d = %d cells\n", gridSize.x, gridSize.y, gridSize.z, gridSize.x*gridSize.y*gridSize.z); printf("particles: %d\n", numParticles); if (checkCmdLineFlag(argc, (const char **) argv, "i")) { numIterations = getCmdLineArgumentInt(argc, (const char **) argv, "i"); } cudaInit(argc, argv); initParticleSystem(numParticles, gridSize); initParams(); if (numIterations <= 0) numIterations = 300; runBenchmark(numIterations, argv[0]); if (psystem) { delete psystem; } exit(g_TotalErrors > 0 ? EXIT_FAILURE : EXIT_SUCCESS); }
int main ( int argc, char **argv ) { #ifdef BUILD_CUDA // Initialize CUDA cudaInit( argc, argv ); #endif // set up the window glutInit( &argc, &argv[0] ); glutInitDisplayMode( GLUT_RGB | GLUT_DOUBLE | GLUT_DEPTH ); glutInitWindowPosition( 100, 100 ); glutInitWindowSize( (int) window_width, (int) window_height ); glutCreateWindow ( "Fluids v.1 (c) 2008, R. Hoetzlein (ZLib)" ); // glutFullScreen (); // initialize parameters init(); // wait for something to happen glutMainLoop(); return 0; }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { printf("%s Starting...\n\n", sSDKsample); numParticles = NUM_PARTICLES; maxNumParticles = MAX_NUM_PARTICLES; uint gridDim = GRID_SIZE; numIterations = 0; printf("Surely I can get this far\n"); if (argc > 1) { if (checkCmdLineFlag(argc, (const char **) argv, "n")) { numParticles = getCmdLineArgumentInt(argc, (const char **)argv, "n"); } if (checkCmdLineFlag(argc, (const char **) argv, "grid")) { gridDim = getCmdLineArgumentInt(argc, (const char **) argv, "grid"); } if (checkCmdLineFlag(argc, (const char **)argv, "file")) { getCmdLineArgumentString(argc, (const char **)argv, "file", &g_refFile); fpsLimit = frameCheckNumber; numIterations = 1; } } //******************************************************* // RMK Hard code for cylindrical coords (y=theta=1) // DomainSize //char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-nothot/ZVert.txt"; //char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-nothot/RVert.txt"; //char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-nothot-big/ZVert.txt"; //char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-nothot-big/RVert.txt"; //char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-big/ZVert.txt"; //char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-big/RVert.txt"; //char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-nothot-big-refine/ZVert.txt"; //char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-nothot-big-refine/RVert.txt"; char Zfile[] = "/home/rkeedy/CFD/BuoyantStrumJet85-big-refine-lighter/ZVert.txt"; char Rfile[] = "/home/rkeedy/CFD/BuoyantStrumJet85-big-refine-lighter/RVert.txt"; //char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet62-big-refine-lighter/ZVert.txt"; //char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet62-big-refine-lighter/RVert.txt"; //char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-big-refine/ZVert.txt"; //char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-big-refine/RVert.txt"; //char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet63-big-refine/ZVert.txt"; //char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet63-big-refine/RVert.txt"; numVelNodes.x = filecount(Rfile); //-1; numVelNodes.z = filecount(Zfile); //-1; numVelNodes.y = 1; numCells.x = 80; //47; //24; //29; numCells.y = 1; numCells.z = 160; //188; //95; //88; numParticles = numCells.x*numCells.z*20; //avgnumparticles = 40 srand( time( NULL ) ); //numParticles = numCells.x*numCells.z*40; printf("vel grid: %d x %d x %d = %d cells\n", numVelNodes.x, numVelNodes.y, numVelNodes.z, numVelNodes.x*numVelNodes.y*numVelNodes.z); printf(" grid: %d x %d x %d = %d cells\n", numCells.x, numCells.y, numCells.z, numCells.x*numCells.y*numCells.z); //printf("vel grid: %d x %d x %d = %d cells\n", gridSize.x, gridSize.y, gridSize.z, gridSize.x*gridSize.y*gridSize.z); bool benchmark = checkCmdLineFlag(argc, (const char **) argv, "benchmark") != 0; if (checkCmdLineFlag(argc, (const char **) argv, "i")) { numIterations = getCmdLineArgumentInt(argc, (const char **) argv, "i"); } if (g_refFile) { cudaInit(argc, argv); } else { if (checkCmdLineFlag(argc, (const char **)argv, "device")) { printf("[%s]\n", argv[0]); printf(" Does not explicitly support -device=n in OpenGL mode\n"); printf(" To use -device=n, the sample must be running w/o OpenGL\n\n"); printf(" > %s -device=n -file=<*.bin>\n", argv[0]); printf("exiting...\n"); exit(EXIT_SUCCESS); } initGL(&argc, argv); cudaGLInit(argc, argv); } // Moved code snippet to CellSystem //initCellSystem(gridSize); // now moved to particlesystem printf("Begin initialization\n"); //initParticleSystem(numParticles, gridSize, g_refFile==NULL); initParticleSystem(maxNumParticles, numParticles, numVelNodes, numCells, g_refFile==NULL); //printf("Finished with initParticleSystem, %d\n",g_refFile==NULL); //cin.ignore(); initParams(); printf("Finished with initialization\n"); if (!g_refFile) { initMenus(); } if (benchmark || g_refFile) { if (numIterations <= 0) { numIterations = 300; } runBenchmark(numIterations, argv[0]); } else { glutDisplayFunc(display); glutReshapeFunc(reshape); glutMouseFunc(mouse); glutMotionFunc(motion); glutKeyboardFunc(key); glutSpecialFunc(special); glutIdleFunc(idle); atexit(cleanup); glutMainLoop(); } if (psystem) { delete psystem; } cudaDeviceReset(); exit(g_TotalErrors > 0 ? EXIT_FAILURE : EXIT_SUCCESS); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { #if defined(__linux__) setenv ("DISPLAY", ":0", 0); #endif printf("%s Starting...\n\n", sSDKsample); printf("NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n\n"); numParticles = NUM_PARTICLES; uint gridDim = GRID_SIZE; numIterations = 0; if (argc > 1) { if (checkCmdLineFlag(argc, (const char **) argv, "n")) { numParticles = getCmdLineArgumentInt(argc, (const char **)argv, "n"); } if (checkCmdLineFlag(argc, (const char **) argv, "grid")) { gridDim = getCmdLineArgumentInt(argc, (const char **) argv, "grid"); } if (checkCmdLineFlag(argc, (const char **)argv, "file")) { getCmdLineArgumentString(argc, (const char **)argv, "file", &g_refFile); fpsLimit = frameCheckNumber; numIterations = 1; } } gridSize.x = gridSize.y = gridSize.z = gridDim; printf("grid: %d x %d x %d = %d cells\n", gridSize.x, gridSize.y, gridSize.z, gridSize.x*gridSize.y*gridSize.z); printf("particles: %d\n", numParticles); bool benchmark = checkCmdLineFlag(argc, (const char **) argv, "benchmark") != 0; if (checkCmdLineFlag(argc, (const char **) argv, "i")) { numIterations = getCmdLineArgumentInt(argc, (const char **) argv, "i"); } if (g_refFile) { cudaInit(argc, argv); } else { if (checkCmdLineFlag(argc, (const char **)argv, "device")) { printf("[%s]\n", argv[0]); printf(" Does not explicitly support -device=n in OpenGL mode\n"); printf(" To use -device=n, the sample must be running w/o OpenGL\n\n"); printf(" > %s -device=n -file=<*.bin>\n", argv[0]); printf("exiting...\n"); exit(EXIT_SUCCESS); } initGL(&argc, argv); cudaGLInit(argc, argv); } initParticleSystem(numParticles, gridSize, g_refFile==NULL); initParams(); if (!g_refFile) { initMenus(); } if (benchmark || g_refFile) { if (numIterations <= 0) { numIterations = 300; } runBenchmark(numIterations, argv[0]); } else { glutDisplayFunc(display); glutReshapeFunc(reshape); glutMouseFunc(mouse); glutMotionFunc(motion); glutKeyboardFunc(key); glutSpecialFunc(special); glutIdleFunc(idle); glutCloseFunc(cleanup); glutMainLoop(); } if (psystem) { delete psystem; } // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(g_TotalErrors > 0 ? EXIT_FAILURE : EXIT_SUCCESS); }
void MakeBid( std::shared_ptr<Packet_ServerBeginRound> roundInfo, // Information about this particular round const std::shared_ptr<Packet_ServerRequestBid> request, // The specific request we received double period, // How long this bidding period will last double skewEstimate, // An estimate of the time difference between us and the server (positive -> we are ahead) std::vector<uint32_t> &solution, // Our vector of indices describing the solution uint32_t *pProof // Will contain the "proof", which is just the value ) { double tSafetyMargin = 0.5; // accounts for uncertainty in network conditions /* This is when the server has said all bids must be produced by, plus the adjustment for clock skew, and the safety margin */ double tFinish = request->timeStampReceiveBids * 1e-9 + skewEstimate - tSafetyMargin; Log(Log_Verbose, "MakeBid - start, total period=%lg.", period); /* We will use this to track the best solution we have created so far. */ roundInfo->maxIndices = 4; std::vector<uint32_t> bestSolution(roundInfo->maxIndices); std::vector<uint32_t> gpuBestSolution(roundInfo->maxIndices); bigint_t bestProof, gpuBestProof; wide_ones(BIGINT_WORDS, bestProof.limbs); // Incorporate the existing block chain data - in a real system this is the // list of transactions we are signing. This is the FNV hash: // http://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function hash::fnv<64> hasher; uint64_t chainHash = hasher((const char *)&roundInfo->chainData[0], roundInfo->chainData.size()); bigint_t x; wide_x_init(&x.limbs[0], uint32_t(0), roundInfo->roundId, roundInfo->roundSalt, chainHash); std::vector<uint32_t> indices(roundInfo->maxIndices); //Define TBB arrays uint32_t *parallel_Indices = (uint32_t *)malloc(sizeof(uint32_t) * TBB_PARALLEL_COUNT); uint32_t *parallel_BestSolutions = (uint32_t *)malloc(sizeof(uint32_t) * TBB_PARALLEL_COUNT * roundInfo->maxIndices); uint32_t *parallel_Proofs = (uint32_t *)malloc(sizeof(uint32_t) * 8 * TBB_PARALLEL_COUNT); uint32_t *parallel_BestProofs = (uint32_t *)malloc(sizeof(uint32_t) * 8 * TBB_PARALLEL_COUNT); //Define GPU arrays uint32_t *d_ParallelBestSolutions; checkCudaErrors(cudaMalloc((void **)&d_ParallelBestSolutions, sizeof(uint32_t) * CUDA_DIM * CUDA_DIM * roundInfo->maxIndices)); checkCudaErrors(cudaMemcpy(d_hashConstant, &roundInfo->c[0], sizeof(uint32_t) * 4, cudaMemcpyHostToDevice)); unsigned gpuTrials = 0; unsigned cpuTrials = 0; unsigned maxNum = uint32_t(0xFFFFFFFF); auto runGPU = [ = , &gpuTrials] { cudaInit(CUDA_DIM, d_ParallelBestProofs); do { cudaIteration(d_ParallelIndices, d_ParallelProofs, d_ParallelBestProofs, d_ParallelBestSolutions, x, d_hashConstant, roundInfo->hashSteps, CUDA_DIM, gpuTrials, CUDA_TRIALS, roundInfo->maxIndices); gpuTrials += CUDA_TRIALS; } while ((tFinish - now() * 1e-9) > 0); }; std::thread runGPUThread(runGPU); auto tbbInitial = [ = ](unsigned i) { bigint_t ones; wide_ones(8, ones.limbs); wide_copy(8, ¶llel_BestProofs[i * 8], ones.limbs); }; tbb::parallel_for<unsigned>(0, TBB_PARALLEL_COUNT, tbbInitial); do { auto tbbIteration = [ = ](unsigned i) { uint32_t index = maxNum - (TBB_PARALLEL_COUNT<<2) - cpuTrials + (i<<1); bigint_t proof = tbbHash(roundInfo.get(), index, x); wide_copy(8, ¶llel_Proofs[i * 8], proof.limbs); parallel_Indices[i] = index; }; tbb::parallel_for<unsigned>(0, TBB_PARALLEL_COUNT, tbbIteration); auto tbbCrossHash = [ = ](unsigned i) { for (unsigned xorStride = 1; xorStride < TBB_PARALLEL_COUNT >> 2; xorStride++) { if (i + (roundInfo->maxIndices * xorStride) < TBB_PARALLEL_COUNT) { bigint_t candidateBestProof; wide_copy(8, candidateBestProof.limbs, ¶llel_Proofs[i * 8]); for (unsigned indexNum = 1; indexNum < roundInfo->maxIndices; indexNum++) { wide_xor(8, candidateBestProof.limbs, candidateBestProof.limbs, ¶llel_Proofs[(i + (indexNum * xorStride)) * 8]); } if (wide_compare(8, candidateBestProof.limbs, ¶llel_BestProofs[i * 8]) < 0) { wide_copy(8, ¶llel_BestProofs[i * 8], candidateBestProof.limbs); for (unsigned ID = 0; ID < roundInfo->maxIndices; ID++) { parallel_BestSolutions[(i * roundInfo->maxIndices) + ID] = parallel_Indices[i + (ID * xorStride)]; } } } } }; tbb::parallel_for<unsigned>(0, TBB_PARALLEL_COUNT, tbbCrossHash); cpuTrials += TBB_PARALLEL_COUNT; } while ((tFinish - now() * 1e-9) > 0); runGPUThread.join(); auto reduceGPU = [ = , &gpuBestSolution, &gpuBestProof] { cudaParallelReduce(CUDA_DIM, roundInfo->maxIndices, d_ParallelBestProofs, d_ParallelBestSolutions, &gpuBestSolution[0], gpuBestProof.limbs); }; std::thread reduceThread(reduceGPU); //TBB for (int toDo = TBB_PARALLEL_COUNT / 2; toDo >= 1; toDo >>= 1) { auto tbbReduce = [ = ](unsigned i) { if (wide_compare(BIGINT_WORDS, ¶llel_BestProofs[(i + toDo) * 8], ¶llel_BestProofs[i * 8]) < 0) { wide_copy(8, ¶llel_BestProofs[i * 8], ¶llel_BestProofs[(i + toDo) * 8]); wide_copy(roundInfo->maxIndices, ¶llel_BestSolutions[i * roundInfo->maxIndices], ¶llel_BestSolutions[(i + toDo) * roundInfo->maxIndices]); } }; tbb::parallel_for<unsigned>(0, toDo, tbbReduce); } wide_copy(BIGINT_WORDS, bestProof.limbs, ¶llel_BestProofs[0]); wide_copy(roundInfo->maxIndices, &bestSolution[0], ¶llel_BestSolutions[0]); reduceThread.join(); if (wide_compare(BIGINT_WORDS, gpuBestProof.limbs, bestProof.limbs) < 0) { Log(Log_Verbose, "Accepting GPU Solution"); wide_copy(8, bestProof.limbs, gpuBestProof.limbs); wide_copy(roundInfo->maxIndices, &bestSolution[0], &gpuBestSolution[0]); } solution = bestSolution; wide_copy(BIGINT_WORDS, pProof, bestProof.limbs); free(parallel_Indices); free(parallel_BestSolutions); free(parallel_Proofs); free(parallel_BestProofs); checkCudaErrors(cudaFree(d_ParallelBestSolutions)); Log(Log_Verbose, "MakeBid - finish. Total trials %d, cpu: %d, gpu %d", cpuTrials + gpuTrials, cpuTrials, gpuTrials); }