// Exercises two independent HIP streams concurrently: each performs
// H2H copy -> H2D copy -> Inc kernel -> D2D copy -> D2H copy, then one
// element of each result is verified on the host.
// Assumes `size` covers at least N floats (mirrors run1's usage).
void run(size_t size, hipStream_t stream1, hipStream_t stream2){
    float *Ah, *Bh, *Cd, *Dd, *Eh;       // buffers driven through stream1
    float *Ahh, *Bhh, *Cdd, *Ddd, *Ehh;  // buffers driven through stream2

    HIPCHECK(hipHostMalloc((void**)&Ah, size, hipHostMallocDefault));
    HIPCHECK(hipHostMalloc((void**)&Bh, size, hipHostMallocDefault));
    HIPCHECK(hipMalloc(&Cd, size));
    HIPCHECK(hipMalloc(&Dd, size));
    HIPCHECK(hipHostMalloc((void**)&Eh, size, hipHostMallocDefault));
    HIPCHECK(hipHostMalloc((void**)&Ahh, size, hipHostMallocDefault));
    HIPCHECK(hipHostMalloc((void**)&Bhh, size, hipHostMallocDefault));
    HIPCHECK(hipMalloc(&Cdd, size));
    HIPCHECK(hipMalloc(&Ddd, size));
    HIPCHECK(hipHostMalloc((void**)&Ehh, size, hipHostMallocDefault));

    // Bug fix: the source buffers were never initialized, so the final
    // verification read indeterminate data (masked by the '=' typo below).
    for (int i = 0; i < N; i++) {
        Ah[i] = 1.0f;
        Ahh[i] = 2.0f;
    }

    HIPCHECK(hipMemcpyAsync(Bh, Ah, size, hipMemcpyHostToHost, stream1));
    HIPCHECK(hipMemcpyAsync(Bhh, Ahh, size, hipMemcpyHostToHost, stream2));
    HIPCHECK(hipMemcpyAsync(Cd, Bh, size, hipMemcpyHostToDevice, stream1));
    HIPCHECK(hipMemcpyAsync(Cdd, Bhh, size, hipMemcpyHostToDevice, stream2));
    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream1, Cd);
    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream2, Cdd);
    HIPCHECK(hipMemcpyAsync(Dd, Cd, size, hipMemcpyDeviceToDevice, stream1));
    HIPCHECK(hipMemcpyAsync(Ddd, Cdd, size, hipMemcpyDeviceToDevice, stream2));
    HIPCHECK(hipMemcpyAsync(Eh, Dd, size, hipMemcpyDeviceToHost, stream1));
    HIPCHECK(hipMemcpyAsync(Ehh, Ddd, size, hipMemcpyDeviceToHost, stream2));
    HIPCHECK(hipDeviceSynchronize());

    // Bug fix: the original used '=' (assignment) instead of '==', so
    // these asserts could never fail regardless of the kernel's output.
    HIPASSERT(Eh[10] == Ah[10] + 1.0f);
    HIPASSERT(Ehh[10] == Ahh[10] + 1.0f);

    // Bug fix: the original leaked all ten buffers.
    HIPCHECK(hipHostFree(Ah));
    HIPCHECK(hipHostFree(Bh));
    HIPCHECK(hipFree(Cd));
    HIPCHECK(hipFree(Dd));
    HIPCHECK(hipHostFree(Eh));
    HIPCHECK(hipHostFree(Ahh));
    HIPCHECK(hipHostFree(Bhh));
    HIPCHECK(hipFree(Cdd));
    HIPCHECK(hipFree(Ddd));
    HIPCHECK(hipHostFree(Ehh));
}
int main(){ int A=0, *Ad; hipMalloc((void**)&Ad, SIZE); hipMemcpy(Ad, &A, SIZE, hipMemcpyHostToDevice); hipLaunchKernel(HIP_KERNEL_NAME(Iter), dim3(1), dim3(1), 0, 0, Ad); hipMemcpy(&A, Ad, SIZE, hipMemcpyDeviceToHost); }
// Times one pass of benchmark_func for float, double, and int at the current
// compile-time memory_ratio, then prints one CSV row containing, per type:
// ops/byte ratio, kernel time (ms), compute throughput, and memory bandwidth.
void runbench(double *cd, long size){
    // memory_ratio is a compile-time constant (used as a template argument
    // below); it must not exceed the kernel's unroll factor.
    if( memory_ratio>UNROLL_ITERATIONS ){
        fprintf(stderr, "ERROR: memory_ratio exceeds UNROLL_ITERATIONS\n");
        exit(1);
    }
    const long compute_grid_size = size/(UNROLLED_MEMORY_ACCESSES)/2;
    const int BLOCK_SIZE = 256;
    const int TOTAL_BLOCKS = compute_grid_size/BLOCK_SIZE;
    // Total MAD operations (x2 for multiply+add) and memory operations
    // issued across the whole grid.
    const long long computations = 2*(long long)(COMP_ITERATIONS)*REGBLOCK_SIZE*compute_grid_size;
    const long long memoryoperations = (long long)(COMP_ITERATIONS)*compute_grid_size;
    dim3 dimBlock(BLOCK_SIZE, 1, 1);
    dim3 dimGrid(TOTAL_BLOCKS, 1, 1);
    hipEvent_t start, stop;

    // Single-precision pass.
    initializeEvents(&start, &stop);
    hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func< float, BLOCK_SIZE, memory_ratio >), dim3(dimGrid), dim3(dimBlock ), 0, 0, 1.0f, (float*)cd);
    float kernel_time_mad_sp = finalizeEvents(start, stop);

    // Double-precision pass.
    initializeEvents(&start, &stop);
    hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func< double, BLOCK_SIZE, memory_ratio >), dim3(dimGrid), dim3(dimBlock ), 0, 0, 1.0, cd);
    float kernel_time_mad_dp = finalizeEvents(start, stop);

    // Integer pass.
    initializeEvents(&start, &stop);
    hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func< int, BLOCK_SIZE, memory_ratio >), dim3(dimGrid), dim3(dimBlock ), 0, 0, 1, (int*)cd);
    float kernel_time_mad_int = finalizeEvents(start, stop);

    // Fraction of the unrolled iterations doing memory traffic vs. compute.
    const double memaccesses_ratio = (double)(memory_ratio)/UNROLL_ITERATIONS;
    const double computations_ratio = 1.0-memaccesses_ratio;

    printf("         %4d,   %8.3f,%8.2f,%8.2f,%7.2f,   %8.3f,%8.2f,%8.2f,%7.2f,  %8.3f,%8.2f,%8.2f,%7.2f\n",
        UNROLL_ITERATIONS-memory_ratio,
        // float: ops/byte, time (ms), GFLOPS, GB/s
        (computations_ratio*(double)computations)/(memaccesses_ratio*(double)memoryoperations*sizeof(float)),
        kernel_time_mad_sp,
        (computations_ratio*(double)computations)/kernel_time_mad_sp*1000./(double)(1000*1000*1000),
        (memaccesses_ratio*(double)memoryoperations*sizeof(float))/kernel_time_mad_sp*1000./(1000.*1000.*1000.),
        // double: ops/byte, time (ms), GFLOPS, GB/s
        (computations_ratio*(double)computations)/(memaccesses_ratio*(double)memoryoperations*sizeof(double)),
        kernel_time_mad_dp,
        (computations_ratio*(double)computations)/kernel_time_mad_dp*1000./(double)(1000*1000*1000),
        (memaccesses_ratio*(double)memoryoperations*sizeof(double))/kernel_time_mad_dp*1000./(1000.*1000.*1000.),
        // int: ops/byte, time (ms), GIOPS, GB/s
        (computations_ratio*(double)computations)/(memaccesses_ratio*(double)memoryoperations*sizeof(int)),
        kernel_time_mad_int,
        (computations_ratio*(double)computations)/kernel_time_mad_int*1000./(double)(1000*1000*1000),
        (memaccesses_ratio*(double)memoryoperations*sizeof(int))/kernel_time_mad_int*1000./(1000.*1000.*1000.) );
}
int main(){ int A=0, *Ad; hipMalloc((void**)&Ad, SIZE); hipMemcpy(Ad, &A, SIZE, hipMemcpyHostToDevice); dim3 dimGrid, dimBlock; dimGrid.x = 1, dimGrid.y =1, dimGrid.z = 1; dimBlock.x = 1, dimBlock.y = 1, dimGrid.z = 1; hipLaunchKernel(HIP_KERNEL_NAME(Iter), dimGrid, dimBlock, 0, 0, Ad); hipMemcpy(&A, Ad, SIZE, hipMemcpyDeviceToHost); }
void runbench_warmup(double *cd, long size){ const long reduced_grid_size = size/(UNROLLED_MEMORY_ACCESSES)/32; const int BLOCK_SIZE = 256; const int TOTAL_REDUCED_BLOCKS = reduced_grid_size/BLOCK_SIZE; dim3 dimBlock(BLOCK_SIZE, 1, 1); dim3 dimReducedGrid(TOTAL_REDUCED_BLOCKS, 1, 1); hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func< short, BLOCK_SIZE, 0 >), dim3(dimReducedGrid), dim3(dimBlock ), 0, 0, (short)1, (short*)cd); CUDA_SAFE_CALL( hipGetLastError() ); CUDA_SAFE_CALL( hipDeviceSynchronize() ); }
// Times one pass of benchmark_func for float, double, and int at the current
// compute_iterations setting, then prints one CSV row containing, per type:
// ops/byte ratio, kernel time (ms), compute throughput, and memory bandwidth.
void runbench(double *cd, long size){
    const long compute_grid_size = size/ELEMENTS_PER_THREAD;
    const int BLOCK_SIZE = 256;
    const int TOTAL_BLOCKS = compute_grid_size/BLOCK_SIZE;
    // One op per element plus 2 (multiply+add) per compute iteration per element.
    const long long computations = ELEMENTS_PER_THREAD*(long long)compute_grid_size+(2*ELEMENTS_PER_THREAD*compute_iterations)*(long long)compute_grid_size;
    const long long memoryoperations = size;
    dim3 dimBlock(BLOCK_SIZE, 1, 1);
    dim3 dimGrid(TOTAL_BLOCKS, 1, 1);
    hipEvent_t start, stop;

    // Single-precision pass.
    initializeEvents(&start, &stop);
    hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func< float, BLOCK_SIZE, ELEMENTS_PER_THREAD, compute_iterations >), dim3(dimGrid), dim3(dimBlock ), 0, 0, 1.0f, (float*)cd);
    float kernel_time_mad_sp = finalizeEvents(start, stop);

    // Double-precision pass.
    initializeEvents(&start, &stop);
    hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func< double, BLOCK_SIZE, ELEMENTS_PER_THREAD, compute_iterations >), dim3(dimGrid), dim3(dimBlock ), 0, 0, 1.0, cd);
    float kernel_time_mad_dp = finalizeEvents(start, stop);

    // Integer pass.
    initializeEvents(&start, &stop);
    hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func< int, BLOCK_SIZE, ELEMENTS_PER_THREAD, compute_iterations >), dim3(dimGrid), dim3(dimBlock ), 0, 0, 1, (int*)cd);
    float kernel_time_mad_int = finalizeEvents(start, stop);

    printf("         %4d,   %8.3f,%8.2f,%8.2f,%7.2f,   %8.3f,%8.2f,%8.2f,%7.2f,  %8.3f,%8.2f,%8.2f,%7.2f\n",
        compute_iterations,
        // float: ops/byte, time (ms), GFLOPS, GB/s
        ((double)computations)/((double)memoryoperations*sizeof(float)),
        kernel_time_mad_sp,
        ((double)computations)/kernel_time_mad_sp*1000./(double)(1000*1000*1000),
        ((double)memoryoperations*sizeof(float))/kernel_time_mad_sp*1000./(1000.*1000.*1000.),
        // double: ops/byte, time (ms), GFLOPS, GB/s
        ((double)computations)/((double)memoryoperations*sizeof(double)),
        kernel_time_mad_dp,
        ((double)computations)/kernel_time_mad_dp*1000./(double)(1000*1000*1000),
        ((double)memoryoperations*sizeof(double))/kernel_time_mad_dp*1000./(1000.*1000.*1000.),
        // int: ops/byte, time (ms), GIOPS, GB/s
        ((double)computations)/((double)memoryoperations*sizeof(int)),
        kernel_time_mad_int,
        ((double)computations)/kernel_time_mad_int*1000./(double)(1000*1000*1000),
        ((double)memoryoperations*sizeof(int))/kernel_time_mad_int*1000./(1000.*1000.*1000.) );
}
// Fills a host buffer with 1.0f, runs the Kern increment kernel on the
// device copy, and verifies every element came back as 2.0f.
int main() {
    float *A, *Ad;
    // Bug fix: the original wrote through A before any allocation —
    // undefined behavior (uninitialized-pointer dereference).
    A = (float*)malloc(size);
    assert(A != NULL);
    for(int i=0;i<len;i++) {
        A[i] = 1.0f;
    }
    Ad = (float*)mallocHip(size);
    memcpyHipH2D(Ad, A, size);
    // Bug fix: the kernel was launched on the host pointer A; it must
    // operate on the device buffer Ad that was just populated.
    hipLaunchKernel(HIP_KERNEL_NAME(Kern), dim3(len/1024), dim3(1024), 0, 0, Ad);
    memcpyHipD2H(A, Ad, size);
    for(int i=0;i<len;i++) {
        assert(A[i] == 2.0f);
    }
    free(A);
    return 0;
}
int main(int argc, char *argv[]) { float *A_d, *C_d; float *A_h, *C_h; size_t N = 1000000; size_t Nbytes = N * sizeof(float); hipDeviceProp_t props; CHECK(hipDeviceGetProperties(&props, 0/*deviceID*/)); printf ("info: running on device %s\n", props.name); printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0); A_h = (float*)malloc(Nbytes); CHECK(A_h == 0 ? hipErrorMemoryAllocation : hipSuccess ); C_h = (float*)malloc(Nbytes); CHECK(C_h == 0 ? hipErrorMemoryAllocation : hipSuccess ); // Fill with Phi + i for (size_t i=0; i<N; i++) { A_h[i] = 1.618f + i; } printf ("info: allocate device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0); CHECK(hipMalloc(&A_d, Nbytes)); CHECK(hipMalloc(&C_d, Nbytes)); printf ("info: copy Host2Device\n"); CHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); const unsigned blocks = 512; const unsigned threadsPerBlock = 256; printf ("info: launch 'vector_square' kernel\n"); hipLaunchKernel(HIP_KERNEL_NAME(vector_square), dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N); printf ("info: copy Device2Host\n"); CHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); printf ("info: check result\n"); for (size_t i=0; i<N; i++) { if (C_h[i] != A_h[i] * A_h[i]) { CHECK(hipErrorUnknown); } } printf ("PASSED!\n"); }
// Pushes data through a single stream: H2H copy -> H2D copy -> Inc kernel
// -> D2D copy -> D2H copy, then verifies one element on the host.
// Assumes `size` covers at least N floats (the fill loop writes N entries).
void run1(size_t size, hipStream_t stream){
    float *Ah, *Bh, *Cd, *Dd, *Eh;
    HIPCHECK(hipHostMalloc((void**)&Ah, size, hipHostMallocDefault));
    HIPCHECK(hipHostMalloc((void**)&Bh, size, hipHostMallocDefault));
    HIPCHECK(hipMalloc(&Cd, size));
    HIPCHECK(hipMalloc(&Dd, size));
    HIPCHECK(hipHostMalloc((void**)&Eh, size, hipHostMallocDefault));

    for(int i=0;i<N;i++){
        Ah[i] = 1.0f;
    }

    HIPCHECK(hipMemcpyAsync(Bh, Ah, size, hipMemcpyHostToHost, stream));
    HIPCHECK(hipMemcpyAsync(Cd, Bh, size, hipMemcpyHostToDevice, stream));
    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Cd);
    HIPCHECK(hipMemcpyAsync(Dd, Cd, size, hipMemcpyDeviceToDevice, stream));
    HIPCHECK(hipMemcpyAsync(Eh, Dd, size, hipMemcpyDeviceToHost, stream));
    HIPCHECK(hipDeviceSynchronize());
    HIPASSERT(Eh[10] == Ah[10] + 1.0f);

    // Bug fix: the original leaked all five buffers.
    HIPCHECK(hipHostFree(Ah));
    HIPCHECK(hipHostFree(Bh));
    HIPCHECK(hipFree(Cd));
    HIPCHECK(hipFree(Dd));
    HIPCHECK(hipHostFree(Eh));
}
int main(){ hipError_t err; float *A, *Ad; A = new float[LEN]; for(int i=0;i<LEN;i++){ A[i] = 1.0f; } hipStream_t stream; err = hipStreamCreate(&stream); check("Creating stream",err); err = hipMalloc(&Ad, SIZE); check("Allocating Ad memory on device", err); err = hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice); check("Doing memory copy from A to Ad", err); float mS = 0; hipEvent_t start, stop; hipEventCreate(&start); hipEventCreate(&stop); ResultDatabase resultDB[8]; hipEventRecord(start); hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, 0, Ad); hipEventRecord(stop); hipEventElapsedTime(&mS, start, stop); resultDB[0].AddResult(std::string("First Kernel Launch"), "", "uS", mS*1000); // std::cout<<"First Kernel Launch: \t\t"<<mS*1000<<" uS"<<std::endl; resultDB[0].DumpSummary(std::cout); hipEventRecord(start); hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, 0, Ad); hipEventRecord(stop); hipEventElapsedTime(&mS, start, stop); resultDB[1].AddResult(std::string("Second Kernel Launch"), "", "uS", mS*1000); // std::cout<<"Second Kernel Launch: \t\t"<<mS*1000<<" uS"<<std::endl; resultDB[1].DumpSummary(std::cout); hipEventRecord(start); for(int i=0;i<ITER;i++){ hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, 0, Ad); } hipDeviceSynchronize(); hipEventRecord(stop); hipEventElapsedTime(&mS, start, stop); resultDB[2].AddResult(std::string("NULL Stream Sync dispatch wait"), "", "uS", mS*1000/ITER); resultDB[2].DumpSummary(std::cout); // std::cout<<"NULL Stream Sync dispatch wait: \t"<<mS*1000/ITER<<" uS"<<std::endl; hipDeviceSynchronize(); hipEventRecord(start); for(int i=0;i<ITER;i++){ hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, 0, Ad); } hipEventRecord(stop); hipDeviceSynchronize(); hipEventElapsedTime(&mS, start, stop); resultDB[3].AddResult(std::string("NULL Stream Async dispatch wait"), "", "uS", mS*1000/ITER); resultDB[3].DumpSummary(std::cout); // std::cout<<"NULL Stream Async dispatch wait: 
\t"<<mS*1000/ITER<<" uS"<<std::endl; hipDeviceSynchronize(); hipEventRecord(start); for(int i=0;i<ITER;i++){ hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, stream, Ad); hipDeviceSynchronize(); } hipEventRecord(stop); hipEventElapsedTime(&mS, start, stop); resultDB[4].AddResult(std::string("Stream Sync dispatch wait"), "", "uS", mS*1000/ITER); resultDB[4].DumpSummary(std::cout); // std::cout<<"Stream Sync dispatch wait: \t\t"<<mS*1000/ITER<<" uS"<<std::endl; hipDeviceSynchronize(); hipEventRecord(start); for(int i=0;i<ITER;i++){ hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, stream, Ad); } hipDeviceSynchronize(); hipEventRecord(stop); hipEventElapsedTime(&mS, start, stop); resultDB[5].AddResult(std::string("Stream Async dispatch wait"), "", "uS", mS*1000/ITER); // std::cout<<"Stream Async dispatch wait: \t\t"<<mS*1000/ITER<<" uS"<<std::endl; resultDB[5].DumpSummary(std::cout); hipDeviceSynchronize(); hipEventRecord(start); for(int i=0;i<ITER;i++){ hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, 0, Ad); } hipEventRecord(stop); hipEventElapsedTime(&mS, start, stop); resultDB[6].AddResult(std::string("NULL Stream No Wait"), "", "uS", mS*1000/ITER); resultDB[6].DumpSummary(std::cout); // std::cout<<"NULL Stream Dispatch No Wait: \t\t"<<mS*1000/ITER<<" uS"<<std::endl; hipDeviceSynchronize(); hipEventRecord(start); for(int i=0;i<ITER;i++){ hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, stream, Ad); } hipEventRecord(stop); hipEventElapsedTime(&mS, start, stop); resultDB[7].AddResult(std::string("Stream Dispatch No Wait"), "", "uS", mS*1000/ITER); resultDB[7].DumpSummary(std::cout); // std::cout<<"Stream Dispatch No Wait: \t\t"<<mS*1000/ITER<<" uS"<<std::endl; hipDeviceSynchronize(); }
// Smoke test: launch the Empty kernel once, wait for it to finish, and
// report success via passed().
int main(){
    hipLaunchKernel(HIP_KERNEL_NAME(Empty), dim3(1), dim3(1), 0, 0, 0);
    hipDeviceSynchronize();
    passed();
}
// Benchmarks and verifies a wavefront "shift left by 1 lane" operation
// (run_amdgcn_wave_lshift_1 kernel). Returns the number of verification
// mismatches (0 on success).
int test_amdgcn_wave_lshift_1 (const int n, const int blockSize, const int launch_iter=1, const int shfl_iter=1, const bool verify=true) {
    const int WIDTH = 64;  // lanes per wavefront modeled by the host check
    const int DELTA = 1;   // shift distance in lanes

    // Generate n random lane values; the async/wait pair makes this
    // effectively synchronous (kept as in the original).
    std::vector<int> input(n);
    std::future<void> inputFuture = std::async([&]() {
        std::default_random_engine generator;
        std::uniform_int_distribution<int> input_dist(0, WIDTH-1);
        std::generate(std::begin(input), std::end(input),[&]() { return input_dist(generator); });
    });
    inputFuture.wait();

    int* gpuInput;
    hipMalloc(&gpuInput, n * sizeof(int));
    hipMemcpy(gpuInput, input.data(), n * sizeof(int), hipMemcpyHostToDevice);
    int* gpuOutput;
    hipMalloc(&gpuOutput, n * sizeof(int));

    // warm up (timing discarded; time_ms is scoped to this block)
    {
        hipEvent_t start, stop;
        initializeEvents(&start, &stop);
        hipLaunchKernel(HIP_KERNEL_NAME(run_amdgcn_wave_lshift_1)
                        , dim3(n/blockSize), dim3(blockSize), 0, 0
                        , gpuInput, gpuOutput, shfl_iter);
        float time_ms = finalizeEvents(start, stop);
    }

    // measure the performance over launch_iter back-to-back launches
    hipEvent_t start, stop;
    initializeEvents(&start, &stop);
    for (int i = 0; i < launch_iter; i++) {
        hipLaunchKernel(HIP_KERNEL_NAME(run_amdgcn_wave_lshift_1)
                        , dim3(n/blockSize), dim3(blockSize), 0, 0
                        , gpuInput, gpuOutput, shfl_iter);
    }
    float time_ms = finalizeEvents(start, stop);

    std::vector<int> output(n);
    hipMemcpy(output.data(), gpuOutput, n * sizeof(int), hipMemcpyDeviceToHost);

    // verification: replay the shift on the host, one wavefront (WIDTH
    // lanes) at a time, mutating 'input' in place as the reference model.
    int errors = 0;
    if (verify) {
        for (int i = 0; i < n; i+=WIDTH) {
            int local_output[WIDTH];
            for (int j = 0; j < shfl_iter; j++) {
                for (int k = 0; k < WIDTH; k++) {
                    // Lanes at the upper edge keep their own value (clamped
                    // shift) — presumably mirrors the device intrinsic's
                    // edge behavior; confirm against the kernel.
                    unsigned int lane = ((k+(int)DELTA)<WIDTH)?(k+DELTA):k;
                    local_output[k] = input[i+lane];
                }
                for (int k = 0; k < WIDTH; k++) {
                    input[i+k] = local_output[k];
                }
            }
            for (int k = 0; k < WIDTH; k++) {
                if (input[i+k] != output[i+k]) {
                    errors++;
                }
            }
        }
    }
    std::cout << __FUNCTION__ << "<" << DELTA << "," << WIDTH << "> total(" << launch_iter << " launches, " << shfl_iter << " wavefront_shift_left/lane/kernel): " << time_ms << "ms, " << time_ms/(double)launch_iter << " ms/kernel, " << errors << " errors" << std::endl;
    hipFree(gpuInput);
    hipFree(gpuOutput);
    return errors;
}
void no_cache(float *A_h, float *A_d, float *X_h, float *X_d, float *Y_h, float *Y_d, size_t NUM_ROW, int p=0) { if(p) printf ("info: allocate host mem (%6.2f KB)\n", NUM_COLUMN*NUM_ROW*sizeof(float)/1024.0); A_h = (float*)malloc(NUM_ROW * NUM_COLUMN * sizeof(float)); CHECK(A_h == 0 ? hipErrorMemoryAllocation : hipSuccess ); X_h = (float*)malloc(NUM_COLUMN * sizeof(float)); CHECK(X_h == 0 ? hipErrorMemoryAllocation : hipSuccess ); Y_h = (float*)malloc(NUM_ROW * sizeof(float)); CHECK(Y_h == 0 ? hipErrorMemoryAllocation : hipSuccess ); // Fill with Phi + i for (size_t i=0; i<NUM_ROW * NUM_COLUMN; i++) { A_h[i] = 1.618f + (i % NB_X); } for (size_t i=0; i< NUM_COLUMN; i++) { X_h[i] = 1.618f + i; } if(p) printf ("info: allocate device mem (%6.2f KB)\n", NUM_ROW * NUM_COLUMN * sizeof(float)/1024.0); CHECK(hipMalloc(&A_d, NUM_ROW * NUM_COLUMN * sizeof(float))); CHECK(hipMalloc(&X_d, NUM_COLUMN * sizeof(float))); CHECK(hipMalloc(&Y_d, NUM_ROW * sizeof(float))); if(p) printf ("info: copy Host2Device\n"); CHECK ( hipMemcpy(A_d, A_h, NUM_ROW * NUM_COLUMN * sizeof(float), hipMemcpyHostToDevice)); CHECK ( hipMemcpy(X_d, X_h, NUM_COLUMN * sizeof(float), hipMemcpyHostToDevice)); const unsigned blocks = (NUM_ROW -1)/NB_X + 1; const unsigned threadsPerBlock = NB_X; if(p) printf ("info: launch 'gemv_kernel' kernel\n"); for(int i=1 ; i < 1e3; i*=2) { size_t num_row = NB_X * i; clock_t t; t = clock(); double time; time = rocblas_wtime(); hipLaunchKernelGGL(HIP_KERNEL_NAME(gemv_kernel), dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, X_d, Y_d, num_row); //time = rocblas_wtime() - time; hipDeviceSynchronize(); t = clock() - t; time = ((double)t)/CLOCKS_PER_SEC*1000 ; if(p) printf ("It took me %d clicks (%f miliseconds).\n",t, time); printf ("Row = %d, It took me (%f milliseconds), Gflops=%f\n",time, num_row, 2*num_row*NUM_COLUMN/(time)/10e6); } /* if(p) printf ("info: copy Device2Host\n"); CHECK ( hipMemcpy(Y_h, Y_d, NUM_ROW * sizeof(float), hipMemcpyDeviceToHost)); if(p) printf ("info: 
check result\n"); for (size_t i=0; i<NUM_ROW; i++) { float res = 0; for(int j=0; j<NUM_COLUMN; j++){ res += A_h[i + j * NUM_ROW] * X_h[j]; } if (Y_h[i] != res) { printf("i=%d, CPU result=%f, GPU result=%f\n", i, res, Y_h[i]); //CHECK(hipErrorUnknown); } } */ if(p) printf ("PASSED!\n"); hipFree(A_d); hipFree(Y_d); hipFree(X_d); free(A_h); free(Y_h); free(X_h); }