/**
 * Calls hipGetDeviceProperties with three deliberately questionable argument
 * combinations and hands each returned status to HIP_PRINT_STATUS (a macro
 * defined elsewhere; presumably it reports the status -- confirm).
 */
int main()
{
    hipDeviceProp_t prop;

    // Case 1: negative device ordinal.
    HIP_PRINT_STATUS(hipGetDeviceProperties(&prop, -1));

    // Case 2: ordinal derived from the device count. cnt is initialised so
    // that cnt + 1 is still a defined value if hipGetDeviceCount fails and
    // leaves its output argument untouched.
    int cnt = 0;
    hipGetDeviceCount(&cnt);
    HIP_PRINT_STATUS(hipGetDeviceProperties(&prop, cnt + 1));

    // Case 3: null output pointer with ordinal 0.
    HIP_PRINT_STATUS(hipGetDeviceProperties(NULL, 0));
}
int main(int argc, char *argv[]) { int warpSize, pshift; hipDeviceProp_t devProp; hipGetDeviceProperties(&devProp, 0); if(strncmp(devProp.name,"Fiji",1)==0) { warpSize =64; pshift =6; } else {warpSize =32; pshift=5;} int anycount =0; int allcount =0; int Num_Threads_per_Block = 1024; int Num_Blocks_per_Grid = 1; int Num_Warps_per_Block = Num_Threads_per_Block/warpSize; int Num_Warps_per_Grid = (Num_Threads_per_Block*Num_Blocks_per_Grid)/warpSize; int * host_any = ( int*)malloc(Num_Warps_per_Grid*sizeof(int)); int * host_all = ( int*)malloc(Num_Warps_per_Grid*sizeof(int)); int *device_any; int *device_all; HIP_ASSERT(hipMalloc((void**)&device_any,Num_Warps_per_Grid*sizeof( int))); HIP_ASSERT(hipMalloc((void**)&device_all,Num_Warps_per_Grid*sizeof(int))); for (int i=0; i<Num_Warps_per_Grid; i++) { host_any[i] = 0; host_all[i] = 0; } HIP_ASSERT(hipMemcpy(device_any, host_any,sizeof(int), hipMemcpyHostToDevice)); HIP_ASSERT(hipMemcpy(device_all, host_all,sizeof(int), hipMemcpyHostToDevice)); hipLaunchKernel(warpvote, dim3(Num_Blocks_per_Grid),dim3(Num_Threads_per_Block),0,0, device_any, device_all ,Num_Warps_per_Block,pshift); HIP_ASSERT(hipMemcpy(host_any, device_any, Num_Warps_per_Grid*sizeof(int), hipMemcpyDeviceToHost)); HIP_ASSERT(hipMemcpy(host_all, device_all, Num_Warps_per_Grid*sizeof(int), hipMemcpyDeviceToHost)); for (int i=0; i<Num_Warps_per_Grid; i++) { printf("warp no. %d __any = %d \n",i,host_any[i]); printf("warp no. %d __all = %d \n",i,host_all[i]); if (host_all[i]!=1) ++allcount; #if defined (__HIP_PLATFORM_HCC__) && !defined ( NVCC_COMPAT ) if (host_any[i]!=64) ++anycount; #else if (host_any[i]!=1) ++anycount; #endif } #if defined (__HIP_PLATFORM_HCC__) && !defined ( NVCC_COMPAT ) if (anycount == 1 && allcount ==1) printf("PASSED\n"); else printf("FAILED\n"); #else if (anycount == 0 && allcount ==1) printf("PASSED\n"); else printf("FAILED\n"); #endif return EXIT_SUCCESS; }
/**
 * Picks a grid size for a problem of N elements.
 *
 * Starts from multiProcessorCount * blocksPerCU blocks for the current
 * device and, if that many blocks of threadsPerBlock threads would exceed N
 * threads in total, reduces the count to ceil(N / threadsPerBlock).
 *
 * @param blocksPerCU      blocks requested per multiprocessor
 * @param threadsPerBlock  threads in each block
 * @param N                number of elements to cover
 * @return the number of blocks to launch
 *
 * HIPCHECK is defined elsewhere; how it reacts to a failing call is not
 * visible here.
 */
unsigned setNumBlocks(unsigned blocksPerCU, unsigned threadsPerBlock, size_t N)
{
    int device = 0;
    HIPCHECK(hipGetDevice(&device));

    hipDeviceProp_t props;
    HIPCHECK(hipGetDeviceProperties(&props, device));

    unsigned blocks = props.multiProcessorCount * blocksPerCU;

    // Widen before multiplying: the product of two unsigned values can wrap
    // (where size_t is wider than unsigned), which would make the comparison
    // against N meaningless and skip the clamp.
    const size_t totalThreads = static_cast<size_t>(blocks) * threadsPerBlock;
    if (totalThreads > N) {
        // totalThreads > N implies threadsPerBlock != 0, and the rounded-up
        // quotient never exceeds the current value of blocks, so it fits.
        blocks = static_cast<unsigned>((N + threadsPerBlock - 1) / threadsPerBlock);
    }

    return blocks;
}
void runTest(int argc, char **argv) { hipDeviceProp_t deviceProp; deviceProp.major = 0; deviceProp.minor = 0; int dev = 0; hipGetDeviceProperties(&deviceProp, dev); // Statistics about the GPU device printf("> GPU device has %d Multi-Processors, " "SM %d.%d compute capabilities\n\n", deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor); int version = (deviceProp.major * 0x10 + deviceProp.minor); unsigned int numThreads = 256; unsigned int numBlocks = 64; unsigned int numData = 11; unsigned int memSize = sizeof(int) * numData; //allocate mem for the result on host side int *hOData = (int *) malloc(memSize); //initialize the memory for (unsigned int i = 0; i < numData; i++) hOData[i] = 0; //To make the AND and XOR tests generate something other than 0... hOData[8] = hOData[10] = 0xff; // allocate device memory for result int *dOData; hipMalloc((void **) &dOData, memSize); // copy host memory to device to initialize to zero hipMemcpy(dOData, hOData, memSize,hipMemcpyHostToDevice); // execute the kernel hipLaunchKernel(testKernel, dim3(numBlocks), dim3(numThreads), 0, 0, dOData); //Copy result from device to host hipMemcpy(hOData,dOData, memSize,hipMemcpyDeviceToHost); // Compute reference solution testResult = computeGold(hOData, numThreads * numBlocks); // Cleanup memory free(hOData); hipFree(dOData); }
int main(int argc, char *argv[]) { float *A_h, *A_d; float *Y_h, *Y_d; float *X_h, *X_d; hipDeviceProp_t props; CHECK(hipGetDeviceProperties(&props, 0/*deviceID*/)); printf ("info: running on device %s\n", props.name); //bug will appear if num_row is too big 3125*128 //for(int i=1 ; i < 1e3; i*=2) int i = 131072/NB_X; { size_t num_row = NB_X * i; no_cache(A_h, A_d, X_h, X_d, Y_h, Y_d, num_row); } }
int main(int argc, char *argv[]) { int warpSize, pshift; hipDeviceProp_t devProp; hipGetDeviceProperties(&devProp, 0); if(strncmp(devProp.name,"Fiji",1)==0) {warpSize = 64; pshift =6;} else {warpSize =32; pshift =5;} unsigned int Num_Threads_per_Block = 512; unsigned int Num_Blocks_per_Grid = 1; unsigned int Num_Warps_per_Block = Num_Threads_per_Block/warpSize; unsigned int Num_Warps_per_Grid = (Num_Threads_per_Block*Num_Blocks_per_Grid)/warpSize; unsigned int* host_ballot = (unsigned int*)malloc(Num_Warps_per_Grid*sizeof(unsigned int)); unsigned int* device_ballot; HIP_ASSERT(hipMalloc((void**)&device_ballot, Num_Warps_per_Grid*sizeof(unsigned int))); int divergent_count =0; for (int i=0; i<Num_Warps_per_Grid; i++) host_ballot[i] = 0; HIP_ASSERT(hipMemcpy(device_ballot, host_ballot, Num_Warps_per_Grid*sizeof(unsigned int), hipMemcpyHostToDevice)); hipLaunchKernel(gpu_ballot, dim3(Num_Blocks_per_Grid),dim3(Num_Threads_per_Block),0,0, device_ballot,Num_Warps_per_Block,pshift); HIP_ASSERT(hipMemcpy(host_ballot, device_ballot, Num_Warps_per_Grid*sizeof(unsigned int), hipMemcpyDeviceToHost)); for (int i=0; i<Num_Warps_per_Grid; i++) { if ((host_ballot[i] == 0)||(host_ballot[i]/warpSize == warpSize)) std::cout << "Warp " << i << " IS convergent- Predicate true for " << host_ballot[i]/warpSize << " threads\n"; else {std::cout << " Warp " << i << " IS divergent - Predicate true for " << host_ballot[i]/warpSize<< " threads\n"; divergent_count++;} } if (divergent_count==1) printf("PASSED\n"); else printf("FAILED\n"); return EXIT_SUCCESS; }