int main(int argc, char *argv[]) { int warpSize, pshift; hipDeviceProp_t devProp; hipDeviceGetProperties(&devProp, 0); if(strncmp(devProp.name,"Fiji",1)==0) {warpSize =64; pshift =6;} else {warpSize =32; pshift =5;} unsigned int Num_Threads_per_Block = 512; unsigned int Num_Blocks_per_Grid = 1; unsigned int Num_Warps_per_Block = Num_Threads_per_Block/warpSize; unsigned int Num_Warps_per_Grid = (Num_Threads_per_Block*Num_Blocks_per_Grid)/warpSize; unsigned int* host_ballot = (unsigned int*)malloc(Num_Warps_per_Grid*sizeof(unsigned int)); unsigned int* device_ballot; HIP_ASSERT(hipMalloc((void**)&device_ballot, Num_Warps_per_Grid*sizeof(unsigned int))); int divergent_count =0; for (int i=0; i<Num_Warps_per_Grid; i++) host_ballot[i] = 0; HIP_ASSERT(hipMemcpy(device_ballot, host_ballot, Num_Warps_per_Grid*sizeof(unsigned int), hipMemcpyHostToDevice)); hipLaunchKernel(gpu_ballot, dim3(Num_Blocks_per_Grid),dim3(Num_Threads_per_Block),0,0, device_ballot,Num_Warps_per_Block,pshift); HIP_ASSERT(hipMemcpy(host_ballot, device_ballot, Num_Warps_per_Grid*sizeof(unsigned int), hipMemcpyDeviceToHost)); for (int i=0; i<Num_Warps_per_Grid; i++) { if ((host_ballot[i] == 0)||(host_ballot[i]/warpSize == warpSize)) std::cout << "Warp " << i << " IS convergent- Predicate true for " << host_ballot[i]/warpSize << " threads\n"; else {std::cout << "Warp " << i << " IS divergent - Predicate true for " << host_ballot[i]/warpSize<< " threads\n"; divergent_count++;} } if (divergent_count==1) printf("PASSED\n"); else printf("FAILED\n"); return EXIT_SUCCESS; }
void runTest(int argc, char **argv) { hipDeviceProp_t deviceProp; deviceProp.major = 0; deviceProp.minor = 0; int dev = 0; hipDeviceGetProperties(&deviceProp, dev); // Statistics about the GPU device printf("> GPU device has %d Multi-Processors, " "SM %d.%d compute capabilities\n\n", deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor); int version = (deviceProp.major * 0x10 + deviceProp.minor); unsigned int numThreads = 256; unsigned int numBlocks = 64; unsigned int numData = 11; unsigned int memSize = sizeof(int) * numData; //allocate mem for the result on host side int *hOData = (int *) malloc(memSize); //initialize the memory for (unsigned int i = 0; i < numData; i++) hOData[i] = 0; //To make the AND and XOR tests generate something other than 0... hOData[8] = hOData[10] = 0xff; // allocate device memory for result int *dOData; hipMalloc((void **) &dOData, memSize); // copy host memory to device to initialize to zero hipMemcpy(dOData, hOData, memSize,hipMemcpyHostToDevice); // execute the kernel hipLaunchKernel(testKernel, dim3(numBlocks), dim3(numThreads), 0, 0, dOData); //Copy result from device to host hipMemcpy(hOData,dOData, memSize,hipMemcpyDeviceToHost); // Compute reference solution testResult = computeGold(hOData, numThreads * numBlocks); // Cleanup memory free(hOData); hipFree(dOData); }
// Picks a grid size for the current device.
//
// blocksPerCU      blocks to launch per multiprocessor reported by the device
// threadsPerBlock  threads in each block
// N                number of elements the launch has to cover
//
// Returns multiProcessorCount * blocksPerCU, reduced to
// ceil(N / threadsPerBlock) when that many blocks would provide more than N
// threads in total.
unsigned setNumBlocks(unsigned blocksPerCU, unsigned threadsPerBlock, size_t N)
{
    int device;
    HIPCHECK(hipGetDevice(&device));

    hipDeviceProp_t props;
    HIPCHECK(hipDeviceGetProperties(&props, device));

    unsigned blocks = props.multiProcessorCount * blocksPerCU;

    // Form the total thread count in size_t: where size_t is wider than
    // unsigned, a product done in unsigned could wrap around and wrongly skip
    // the clamp below.
    const size_t totalThreads = static_cast<size_t>(blocks) * threadsPerBlock;
    if (totalThreads > N) {
        // Round up so a partially filled last block is still launched. The
        // result is no larger than the previous block count, so it fits.
        blocks = static_cast<unsigned>((N + threadsPerBlock - 1) / threadsPerBlock);
    }

    return blocks;
}
int main(int argc, char *argv[]) { float *A_d, *C_d; float *A_h, *C_h; size_t N = 1000000; size_t Nbytes = N * sizeof(float); hipDeviceProp_t props; CHECK(hipDeviceGetProperties(&props, 0/*deviceID*/)); printf ("info: running on device %s\n", props.name); printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0); A_h = (float*)malloc(Nbytes); CHECK(A_h == 0 ? hipErrorMemoryAllocation : hipSuccess ); C_h = (float*)malloc(Nbytes); CHECK(C_h == 0 ? hipErrorMemoryAllocation : hipSuccess ); // Fill with Phi + i for (size_t i=0; i<N; i++) { A_h[i] = 1.618f + i; } printf ("info: allocate device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0); CHECK(hipMalloc(&A_d, Nbytes)); CHECK(hipMalloc(&C_d, Nbytes)); printf ("info: copy Host2Device\n"); CHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); const unsigned blocks = 512; const unsigned threadsPerBlock = 256; printf ("info: launch 'vector_square' kernel\n"); hipLaunchKernel(HIP_KERNEL_NAME(vector_square), dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N); printf ("info: copy Device2Host\n"); CHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); printf ("info: check result\n"); for (size_t i=0; i<N; i++) { if (C_h[i] != A_h[i] * A_h[i]) { CHECK(hipErrorUnknown); } } printf ("PASSED!\n"); }