/* Main */ int test_cublas(void) { cublasStatus status; cudaError_t e; float* h_A; float* h_B; float* h_C; float* h_C_ref; float* d_A = 0; void *vp; float* d_B = 0; float* d_C = 0; float alpha = 1.0f; float beta = 0.0f; int n2 = N * N; int i; float error_norm; float ref_norm; float diff; /* Initialize CUBLAS */ printf("simpleCUBLAS test running..\n"); status = cublasInit(); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! CUBLAS initialization error\n"); return EXIT_FAILURE; } /* Allocate host memory for the matrices */ h_A = (float*)malloc(n2 * sizeof(h_A[0])); if (h_A == 0) { fprintf (stderr, "!!!! host memory allocation error (A)\n"); return EXIT_FAILURE; } h_B = (float*)malloc(n2 * sizeof(h_B[0])); if (h_B == 0) { fprintf (stderr, "!!!! host memory allocation error (B)\n"); return EXIT_FAILURE; } h_C = (float*)malloc(n2 * sizeof(h_C[0])); if (h_C == 0) { fprintf (stderr, "!!!! host memory allocation error (C)\n"); return EXIT_FAILURE; } /* Fill the matrices with test data */ for (i = 0; i < n2; i++) { h_A[i] = rand() / (float)RAND_MAX; h_B[i] = rand() / (float)RAND_MAX; h_C[i] = rand() / (float)RAND_MAX; } /* Allocate device memory for the matrices */ if (cudaMalloc(&vp, n2 * sizeof(d_A[0])) != cudaSuccess) { fprintf (stderr, "!!!! device memory allocation error (A)\n"); return EXIT_FAILURE; } d_A = (float *) vp; if (cudaMalloc(&vp, n2 * sizeof(d_B[0])) != cudaSuccess) { fprintf (stderr, "!!!! device memory allocation error (B)\n"); return EXIT_FAILURE; } d_B = (float *) vp; if (cudaMalloc(&vp, n2 * sizeof(d_C[0])) != cudaSuccess) { fprintf (stderr, "!!!! device memory allocation error (C)\n"); return EXIT_FAILURE; } d_C = (float *) vp; /* Initialize the device matrices with the host matrices */ status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device access error (write A)\n"); return EXIT_FAILURE; } status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device access error (write B)\n"); return EXIT_FAILURE; } status = cublasSetVector(n2, sizeof(h_C[0]), h_C, 1, d_C, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device access error (write C)\n"); return EXIT_FAILURE; } /* Performs operation using plain C code */ simple_sgemm(N, alpha, h_A, h_B, beta, h_C); h_C_ref = h_C; /* Clear last error */ cublasGetError(); /* Performs operation using cublas */ cublasSgemm('n', 'n', N, N, N, alpha, d_A, N, d_B, N, beta, d_C, N); status = cublasGetError(); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! kernel execution error.\n"); return EXIT_FAILURE; } /* Allocate host memory for reading back the result from device memory */ h_C = (float*)malloc(n2 * sizeof(h_C[0])); if (h_C == 0) { fprintf (stderr, "!!!! host memory allocation error (C)\n"); return EXIT_FAILURE; } /* Read the result back */ status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device access error (read C)\n"); return EXIT_FAILURE; } /* Check result against reference */ error_norm = 0; ref_norm = 0; for (i = 0; i < n2; ++i) { diff = h_C_ref[i] - h_C[i]; error_norm += diff * diff; ref_norm += h_C_ref[i] * h_C_ref[i]; } error_norm = (float)sqrt((double)error_norm); ref_norm = (float)sqrt((double)ref_norm); if (fabs(ref_norm) < 1e-7) { fprintf (stderr, "!!!! reference norm is 0\n"); return EXIT_FAILURE; } printf( "Test %s\n", (error_norm / ref_norm < 1e-6f) ? "PASSED" : "FAILED"); /* Memory clean up */ free(h_A); free(h_B); free(h_C); free(h_C_ref); e = cudaFree(d_A); if (e != cudaSuccess) { fprintf (stderr, "!!!! memory free error (A)\n"); return EXIT_FAILURE; } e = cudaFree(d_B); if (e != cudaSuccess) { fprintf (stderr, "!!!! memory free error (B)\n"); return EXIT_FAILURE; } e = cudaFree(d_C); if (e != cudaSuccess) { fprintf (stderr, "!!!! memory free error (C)\n"); return EXIT_FAILURE; } /* Shutdown */ status = cublasShutdown(); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! shutdown error (A)\n"); return EXIT_FAILURE; } return EXIT_SUCCESS; }
/* Main */ int main(int argc, char **argv) { cublasStatus_t status; float *h_A; float *h_B; float *h_C; float *h_C_rnd; float *h_C_ref; float *d_A = 0; float *d_B = 0; float *d_C = 0; float alpha = 1.0f; float beta = 0.0f; int n2 = N * N; int i; cublasHandle_t handle; int dev_id; cudaDeviceProp device_prop; bool do_device_api_test = false; float host_api_test_ratio, device_api_test_ratio; /* Initialize CUBLAS */ printf("simpleCUBLAS test running...\n"); dev_id = findCudaDevice(argc, (const char **) argv); checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id)); if ((device_prop.major << 4) + device_prop.minor >= 0x35) { printf("Host and device APIs will be tested.\n"); do_device_api_test = true; } /* else if ((device_prop.major << 4) + device_prop.minor >= 0x20) { printf("Host API will be tested.\n"); do_device_api_test = false; } */ else { fprintf(stderr, "simpleDevLibCUBLAS examples requires Compute Capability of SM 3.5 or higher\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_SUCCESS; } status = cublasCreate(&handle); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! CUBLAS initialization error\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } /* Allocate host memory for the matrices */ h_A = (float *)malloc(n2 * sizeof(h_A[0])); if (h_A == 0) { fprintf(stderr, "!!!! host memory allocation error (A)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } h_B = (float *)malloc(n2 * sizeof(h_B[0])); if (h_B == 0) { fprintf(stderr, "!!!! host memory allocation error (B)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } h_C_rnd = (float *)malloc(n2 * sizeof(h_C_rnd[0])); if (h_C_rnd == 0) { fprintf(stderr, "!!!! host memory allocation error (C_rnd)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } h_C = (float *)malloc(n2 * sizeof(h_C_ref[0])); if (h_C == 0) { fprintf(stderr, "!!!! host memory allocation error (C)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } /* Fill the matrices with test data */ for (i = 0; i < n2; i++) { h_A[i] = rand() / (float)RAND_MAX; h_B[i] = rand() / (float)RAND_MAX; h_C_rnd[i] = rand() / (float)RAND_MAX; h_C[i] = h_C_rnd[i]; } /* Allocate device memory for the matrices */ if (cudaMalloc((void **)&d_A, n2 * sizeof(d_A[0])) != cudaSuccess) { fprintf(stderr, "!!!! device memory allocation error (allocate A)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } if (cudaMalloc((void **)&d_B, n2 * sizeof(d_B[0])) != cudaSuccess) { fprintf(stderr, "!!!! device memory allocation error (allocate B)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } if (cudaMalloc((void **)&d_C, n2 * sizeof(d_C[0])) != cudaSuccess) { fprintf(stderr, "!!!! device memory allocation error (allocate C)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } /* Initialize the device matrices with the host matrices */ status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! device access error (write A)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! device access error (write B)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } status = cublasSetVector(n2, sizeof(h_C_rnd[0]), h_C_rnd, 1, d_C, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! device access error (write C)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } /* * Performs operation using plain C code */ simple_sgemm(N, alpha, h_A, h_B, beta, h_C); h_C_ref = h_C; /* * Performs operation using cublas */ status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A, N, d_B, N, &beta, d_C, N); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! kernel execution error\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } /* Allocate host memory for reading back the result from device memory */ h_C = (float *)malloc(n2 * sizeof(h_C[0])); if (h_C == 0) { fprintf(stderr, "!!!! host memory allocation error (C)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } /* Read the result back */ status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! device access error (read C)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } /* Check result against reference */ host_api_test_ratio = check_result(h_C, h_C_ref, n2); if (do_device_api_test) { /* Reset device resident C matrix */ status = cublasSetVector(n2, sizeof(h_C_rnd[0]), h_C_rnd, 1, d_C, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! device access error (write C)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } /* * Performs operation using the device API of CUBLAS library */ device_cublas_sgemm(N, alpha, d_A, d_B, beta, d_C); /* Read the result back */ status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! device access error (read C)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } /* Check result against reference */ device_api_test_ratio = check_result(h_C, h_C_ref, n2); } /* Memory clean up */ free(h_A); free(h_B); free(h_C); free(h_C_rnd); free(h_C_ref); if (cudaFree(d_A) != cudaSuccess) { fprintf(stderr, "!!!! memory free error (A)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } if (cudaFree(d_B) != cudaSuccess) { fprintf(stderr, "!!!! memory free error (B)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } if (cudaFree(d_C) != cudaSuccess) { fprintf(stderr, "!!!! memory free error (C)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } /* Shutdown */ status = cublasDestroy(handle); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! shutdown error (A)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); bool test_result = do_device_api_test ? host_api_test_ratio < 1e-6 && device_api_test_ratio < 1e-6 : host_api_test_ratio < 1e-6; printf("simpleCUBLAS completed, returned %s\n", test_result ? "OK" : "ERROR!"); exit(test_result ? EXIT_SUCCESS : EXIT_FAILURE); }
/* Main */ int main(int argc, char **argv) { cublasStatus_t status; float *h_A; float *h_B; float *h_C; float *h_C_ref; float *d_A = 0; float *d_B = 0; float *d_C = 0; float alpha = 1.0f; float beta = 0.0f; int n2 = N * N; int i; float error_norm; float ref_norm; float diff; cublasHandle_t handle; int dev = findCudaDevice(argc, (const char **) argv); if (dev == -1) { return EXIT_FAILURE; } /* Initialize CUBLAS */ printf("simpleCUBLAS test running..\n"); status = cublasCreate(&handle); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! CUBLAS initialization error\n"); return EXIT_FAILURE; } /* Allocate host memory for the matrices */ h_A = (float *)malloc(n2 * sizeof(h_A[0])); if (h_A == 0) { fprintf(stderr, "!!!! host memory allocation error (A)\n"); return EXIT_FAILURE; } h_B = (float *)malloc(n2 * sizeof(h_B[0])); if (h_B == 0) { fprintf(stderr, "!!!! host memory allocation error (B)\n"); return EXIT_FAILURE; } h_C = (float *)malloc(n2 * sizeof(h_C[0])); if (h_C == 0) { fprintf(stderr, "!!!! host memory allocation error (C)\n"); return EXIT_FAILURE; } /* Fill the matrices with test data */ for (i = 0; i < n2; i++) { h_A[i] = rand() / (float)RAND_MAX; h_B[i] = rand() / (float)RAND_MAX; h_C[i] = rand() / (float)RAND_MAX; } /* Allocate device memory for the matrices */ if (cudaMalloc((void **)&d_A, n2 * sizeof(d_A[0])) != cudaSuccess) { fprintf(stderr, "!!!! device memory allocation error (allocate A)\n"); return EXIT_FAILURE; } if (cudaMalloc((void **)&d_B, n2 * sizeof(d_B[0])) != cudaSuccess) { fprintf(stderr, "!!!! device memory allocation error (allocate B)\n"); return EXIT_FAILURE; } if (cudaMalloc((void **)&d_C, n2 * sizeof(d_C[0])) != cudaSuccess) { fprintf(stderr, "!!!! device memory allocation error (allocate C)\n"); return EXIT_FAILURE; } /* Initialize the device matrices with the host matrices */ status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! device access error (write A)\n"); return EXIT_FAILURE; } status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! device access error (write B)\n"); return EXIT_FAILURE; } status = cublasSetVector(n2, sizeof(h_C[0]), h_C, 1, d_C, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! device access error (write C)\n"); return EXIT_FAILURE; } /* Performs operation using plain C code */ simple_sgemm(N, alpha, h_A, h_B, beta, h_C); h_C_ref = h_C; /* Performs operation using cublas */ status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A, N, d_B, N, &beta, d_C, N); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! kernel execution error.\n"); return EXIT_FAILURE; } /* Allocate host memory for reading back the result from device memory */ h_C = (float *)malloc(n2 * sizeof(h_C[0])); if (h_C == 0) { fprintf(stderr, "!!!! host memory allocation error (C)\n"); return EXIT_FAILURE; } /* Read the result back */ status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! device access error (read C)\n"); return EXIT_FAILURE; } /* Check result against reference */ error_norm = 0; ref_norm = 0; for (i = 0; i < n2; ++i) { diff = h_C_ref[i] - h_C[i]; error_norm += diff * diff; ref_norm += h_C_ref[i] * h_C_ref[i]; } error_norm = (float)sqrt((double)error_norm); ref_norm = (float)sqrt((double)ref_norm); if (fabs(ref_norm) < 1e-7) { fprintf(stderr, "!!!! reference norm is 0\n"); return EXIT_FAILURE; } /* Memory clean up */ free(h_A); free(h_B); free(h_C); free(h_C_ref); if (cudaFree(d_A) != cudaSuccess) { fprintf(stderr, "!!!! memory free error (A)\n"); return EXIT_FAILURE; } if (cudaFree(d_B) != cudaSuccess) { fprintf(stderr, "!!!! memory free error (B)\n"); return EXIT_FAILURE; } if (cudaFree(d_C) != cudaSuccess) { fprintf(stderr, "!!!! memory free error (C)\n"); return EXIT_FAILURE; } /* Shutdown */ status = cublasDestroy(handle); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! shutdown error (A)\n"); return EXIT_FAILURE; } printf("CUBLAS program finished\n"); exit(error_norm / ref_norm < 1e-6f ? EXIT_SUCCESS : EXIT_FAILURE); }
static int benchmark_blas(const int N) { real *h_A; real *h_B; real *h_C; real *h_C_ref; real alpha = 1.0f; real beta = 0.0f; int n2 = N * N; int i; real error_norm; real ref_norm; real diff; /* Allocate host memory for the matrices */ h_A = (real *)malloc(n2 * sizeof(h_A[0])); h_B = (real *)malloc(n2 * sizeof(h_B[0])); h_C = (real *)malloc(n2 * sizeof(h_C[0])); h_C_ref = (real *)malloc(n2 * sizeof(h_C[0])); /* Fill the matrices with test data */ for (i = 0; i < n2; i++) { h_A[i] = rand() / (real)RAND_MAX; h_B[i] = rand() / (real)RAND_MAX; h_C[i] = rand() / (real)RAND_MAX; h_C_ref[i] = h_C[i]; } #ifdef VERIFY /* Performs operation using plain C code*/ clock_t c_start = clock(); simple_sgemm(N, alpha, h_A, h_B, beta, h_C_ref); clock_t c_end = clock(); #endif clock_t g_start = clock(); gpu_gemm(h_A, h_B, h_C, alpha, beta, N); clock_t g_end = clock(); double g_time = (double)(g_end - g_start) / CLOCKS_PER_SEC; std::cout << N << " " << g_time << " "<< 2.0 * pow(N, 3) / g_time / 1000 /1000 / 1000; #ifdef VERIFY std::cout<<" "<< 1000.0 * (c_end - c_start) / CLOCKS_PER_SEC << std::endl; #else std::cout << std::endl; #endif #ifdef VERIFY error_norm = 0; ref_norm = 0; for (i = 0; i < n2; ++i) { diff = h_C_ref[i] - h_C[i]; error_norm += diff * diff; ref_norm += h_C_ref[i] * h_C_ref[i]; } error_norm = (real)sqrt((double)error_norm); ref_norm = (real)sqrt((double)ref_norm); if (fabs(ref_norm) < 1e-7) { fprintf(stderr, "!!!! reference norm is 0\n"); return EXIT_FAILURE; } if (error_norm / ref_norm > 1e-6f) { printf("simpleCUBLAS test failed.\n"); exit(EXIT_FAILURE); } #endif /* Memory clean up */ free(h_A); free(h_B); free(h_C); free(h_C_ref); }