//////////////////////////////////////////////////////////////////////////////// //! Run a simple test matrix multiply using CUBLAS //////////////////////////////////////////////////////////////////////////////// int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size) { cudaDeviceProp deviceProp; checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); // use a larger block size for Fermi and above int block_size = (deviceProp.major < 2) ? 16 : 32; // set seed for rand() srand(2006); // allocate host memory for matrices A and B unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA; unsigned int mem_size_A = sizeof(float) * size_A; float *h_A = (float *)malloc(mem_size_A); unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB; unsigned int mem_size_B = sizeof(float) * size_B; float *h_B = (float *)malloc(mem_size_B); // set seed for rand() srand(2006); // initialize host memory randomInit(h_A, size_A); randomInit(h_B, size_B); // allocate device memory float *d_A, *d_B, *d_C; unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC; unsigned int mem_size_C = sizeof(float) * size_C; // allocate host memory for the result float *h_C = (float *) malloc(mem_size_C); float *h_CUBLAS = (float *) malloc(mem_size_C); checkCudaErrors(cudaMalloc((void **) &d_A, mem_size_A)); checkCudaErrors(cudaMalloc((void **) &d_B, mem_size_B)); checkCudaErrors(cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice)); checkCudaErrors(cudaMalloc((void **) &d_C, mem_size_C)); // setup execution parameters dim3 threads(block_size, block_size); dim3 grid(matrix_size.uiWC / threads.x, matrix_size.uiHC / threads.y); // create and start timer printf("Computing result using CUBLAS..."); // execute the kernel int nIter = 30; // CUBLAS version 2.0 { const float alpha = 1.0f; const float beta = 0.0f; cublasHandle_t handle; cudaEvent_t start, stop; checkCudaErrors(cublasCreate(&handle)); //Perform warmup operation with cublas checkCudaErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWA)); // Allocate CUDA events that we'll use for timing checkCudaErrors(cudaEventCreate(&start)); checkCudaErrors(cudaEventCreate(&stop)); // Record the start event checkCudaErrors(cudaEventRecord(start, NULL)); for (int j = 0; j < nIter; j++) { //note cublas is column primary! //need to transpose the order checkCudaErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWA)); } printf("done.\n"); // Record the stop event checkCudaErrors(cudaEventRecord(stop, NULL)); // Wait for the stop event to complete checkCudaErrors(cudaEventSynchronize(stop)); float msecTotal = 0.0f; checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop)); // Compute and print the performance float msecPerMatrixMul = msecTotal / nIter; double flopsPerMatrixMul = 2.0 * (double)matrix_size.uiWA * (double)matrix_size.uiHA * (double)matrix_size.uiWB; double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f); printf( "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n", gigaFlops, msecPerMatrixMul, flopsPerMatrixMul); // copy result from device to host checkCudaErrors(cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost)); // Destroy the handle checkCudaErrors(cublasDestroy(handle)); } // compute reference solution printf("Computing result using host CPU..."); float *reference = (float *)malloc(mem_size_C); matrixMulCPU(reference, h_A, h_B, matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiWB); printf("done.\n"); // check result (CUBLAS) bool resCUBLAS = sdkCompareL2fe(reference, h_CUBLAS, size_C, 1.0e-6f); if (resCUBLAS != true) { printDiff(reference, h_CUBLAS, matrix_size.uiWC, matrix_size.uiHC, 100, 1.0e-5f); } printf("Comparing CUBLAS Matrix Multiply with CPU results: %s\n", (true == resCUBLAS) ? "PASS" : "FAIL"); printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n"); // clean up memory free(h_A); free(h_B); free(h_C); free(reference); checkCudaErrors(cudaFree(d_A)); checkCudaErrors(cudaFree(d_B)); checkCudaErrors(cudaFree(d_C)); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); if (resCUBLAS == true) { return EXIT_SUCCESS; // return value = 1 } else { return EXIT_FAILURE; // return value = 0 } }
//////////////////////////////////////////////////////////////////////////////// //! Run a simple test matrix multiply using CUBLAS //////////////////////////////////////////////////////////////////////////////// int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size) { cudaDeviceProp deviceProp; cudaError_t error; error = cudaGetDeviceProperties(&deviceProp, devID); if (error != cudaSuccess) { printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); } // use a larger block size for Fermi and above int block_size = (deviceProp.major < 2) ? 16 : 32; // set seed for rand() srand(2006); // allocate host memory for matrices A and B unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA; unsigned int mem_size_A = sizeof(float) * size_A; float *h_A = (float *)malloc(mem_size_A); unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB; unsigned int mem_size_B = sizeof(float) * size_B; float *h_B = (float *)malloc(mem_size_B); // set seed for rand() srand(2006); // initialize host memory randomInit(h_A, size_A); randomInit(h_B, size_B); // allocate device memory float *d_A, *d_B, *d_C; unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC; unsigned int mem_size_C = sizeof(float) * size_C; // allocate host memory for the result float *h_C = (float *) malloc(mem_size_C); float *h_CUBLAS = (float *) malloc(mem_size_C); error = cudaMalloc((void **) &d_A, mem_size_A); if (error != cudaSuccess) { printf("cudaMalloc d_A returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); } error = cudaMalloc((void **) &d_B, mem_size_B); if (error != cudaSuccess) { printf("cudaMalloc d_B returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); } error = cudaMalloc((void **) &d_C, mem_size_C); if (error != cudaSuccess) { printf("cudaMalloc d_C returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); } // create and start timer StopWatchInterface *timerMemIn = NULL; sdkCreateTimer(&timerMemIn); // start the timer sdkStartTimer(&timerMemIn); // copy host memory to device error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice); if (error != cudaSuccess) { printf("cudaMemcpy d_A h_A returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); } error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice); if (error != cudaSuccess) { printf("cudaMemcpy d_B h_B returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); } sdkStopTimer(&timerMemIn); printf("\nMemory H2D Transferring time: %f (ms)\n", sdkGetTimerValue(&timerMemIn)); sdkDeleteTimer(&timerMemIn); // setup execution parameters dim3 threads(block_size, block_size); dim3 grid(matrix_size.uiWC / threads.x, matrix_size.uiHC / threads.y); // create and start timer printf("Computing result using CUBLAS..."); // execute the kernel int nIter = 30; // CUBLAS version 2.0 { cublasHandle_t handle; cublasStatus_t ret; ret = cublasCreate(&handle); if (ret != CUBLAS_STATUS_SUCCESS) { printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__); exit(EXIT_FAILURE); } const float alpha = 1.0f; const float beta = 0.0f; //Perform warmup operation with cublas ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWA); if (ret != CUBLAS_STATUS_SUCCESS) { printf("cublasSgemm returned error code %d, line(%d)\n", ret, __LINE__); exit(EXIT_FAILURE); } // Allocate CUDA events that we'll use for timing cudaEvent_t start; error = cudaEventCreate(&start); if (error != cudaSuccess) { fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(error)); exit(EXIT_FAILURE); } cudaEvent_t stop; error = cudaEventCreate(&stop); if (error != cudaSuccess) { fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(error)); exit(EXIT_FAILURE); } // Record the start event error = cudaEventRecord(start, NULL); if (error != cudaSuccess) { fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(error)); exit(EXIT_FAILURE); } for (int j = 0; j < nIter; j++) { //note cublas is column primary! //need to transpose the order ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWA); if (ret != CUBLAS_STATUS_SUCCESS) { printf("cublasSgemm returned error code %d, line(%d)\n", ret, __LINE__); exit(EXIT_FAILURE); } } printf("done.\n"); // Record the stop event error = cudaEventRecord(stop, NULL); if (error != cudaSuccess) { fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(error)); exit(EXIT_FAILURE); } // Wait for the stop event to complete error = cudaEventSynchronize(stop); if (error != cudaSuccess) { fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(error)); exit(EXIT_FAILURE); } float msecTotal = 0.0f; error = cudaEventElapsedTime(&msecTotal, start, stop); if (error != cudaSuccess) { fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(error)); exit(EXIT_FAILURE); } // Compute and print the performance float msecPerMatrixMul = msecTotal / nIter; double flopsPerMatrixMul = 2.0 * (double)matrix_size.uiWA * (double)matrix_size.uiHA * (double)matrix_size.uiWB; double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f); printf( "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n", gigaFlops, msecPerMatrixMul, flopsPerMatrixMul); // create and start timer StopWatchInterface *timerMemOut = NULL; sdkCreateTimer(&timerMemOut); // start the timer sdkStartTimer(&timerMemOut); // copy result from device to host error = cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost); sdkStopTimer(&timerMemOut); printf("\Memory D2H Transferring time: %f (ms)\n", sdkGetTimerValue(&timerMemOut)); sdkDeleteTimer(&timerMemOut); if (error != cudaSuccess) { printf("cudaMemcpy h_CUBLAS d_C returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); } checkError(cublasDestroy(handle), "cublasDestroy() error!\n"); } // compute reference solution printf("Computing result using host CPU..."); float *reference = (float *)malloc(mem_size_C); // create and start timer StopWatchInterface *timer = NULL; sdkCreateTimer(&timer); // start the timer sdkStartTimer(&timer); matrixMulCPU(reference, h_A, h_B, matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiWB); sdkStopTimer(&timer); printf("\nCPU Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); sdkDeleteTimer(&timer); printf("done.\n"); // check result (CUBLAS) bool resCUBLAS = sdkCompareL2fe(reference, h_CUBLAS, size_C, 1.0e-6f); if (resCUBLAS != true) { printDiff(reference, h_CUBLAS, matrix_size.uiWC, matrix_size.uiHC, 100, 1.0e-5f); } printf("Comparing CUBLAS Matrix Multiply with CPU results: %s\n", (true == resCUBLAS) ? "PASS" : "FAIL"); // clean up memory free(h_A); free(h_B); free(h_C); free(reference); cudaFree(d_A); cudaFree(d_B); cudaFree(d_C); cudaDeviceReset(); if (resCUBLAS == true) { return EXIT_SUCCESS; // return value = 1 } else { return EXIT_FAILURE; // return value = 0 } }