int main(int argc, char **argv)
{
    int i;
    int M, N, nz;
    int *I, *J;
    double *val;
    double *x;
    double *rhs;

    // Make sparse matrix (random symmetric tridiagonal, CSR format)
    M = N = 1048576;
    nz = (N - 2) * 3 + 4;
    I   = new int[N + 1];
    J   = new int[nz];
    val = new double[nz];
    x   = new double[N];
    rhs = new double[N];
    genTridiag(I, J, val, N, nz);

    for (int i = 0; i < N; i++)
        rhs[i] = 0.1;

    solver _sc;
    _sc.CallSetA(M, val, I, J);
    _sc.CallSetX(x);

    // CG
    for (i = 0; i < N; i++)
        x[i] = 0.0;
    //CG(M,N,nz,I,J,val,x,rhs);
    _sc.CallCG(rhs);
    std::cout << "[CG]" << std::endl;
    for (i = 0; i < 5; i++)
        std::cout << x[i] << std::endl;

    // BiCGSTAB
    for (i = 0; i < N; i++)
        x[i] = 0.0;
    //BiCGSTAB(M,N,nz,I,J,val,x,rhs);
    _sc.CallBiCGSTAB(rhs);
    std::cout << "[BiCGSTAB]" << std::endl;
    for (i = 0; i < 5; i++)
        std::cout << x[i] << std::endl;

    // GCR method
    for (i = 0; i < N; i++)
        x[i] = 0.0;
    //GCR(M,N,nz,I,J,val,x,rhs);
    _sc.CallGCR(rhs);
    std::cout << "[GCR]" << std::endl;
    for (i = 0; i < 5; i++)
        std::cout << x[i] << std::endl;

    delete[] I;
    delete[] J;
    delete[] val;
    delete[] x;
    delete[] rhs;

    return 0;
}
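The listings in this section all call a genTridiag helper that fills the CSR arrays with a random, symmetric, diagonally dominant tridiagonal matrix; the helper itself is not shown here. The sketch below is one plausible implementation, assuming the same signature and the nz = (N-2)*3 + 4 layout used above; the actual helper in the source may differ in details such as the random shift applied to the diagonal.

#include <cstdlib>

// Sketch of the assumed genTridiag helper: fills I (row pointers), J (column
// indices) and val with a random, symmetric, diagonally dominant tridiagonal
// matrix in CSR format. nz must equal (N-2)*3 + 4.
void genTridiag(int *I, int *J, double *val, int N, int nz)
{
    I[0] = 0;
    J[0] = 0;
    J[1] = 1;
    val[0] = (double)rand() / RAND_MAX + 10.0;   // diagonal, shifted for dominance
    val[1] = (double)rand() / RAND_MAX;          // upper neighbour of row 0
    I[1] = 2;

    for (int i = 1; i < N; i++)
    {
        int start = I[i];
        J[start] = i - 1;                        // lower neighbour
        val[start] = val[start - 1];             // mirror the previous row's upper entry (symmetry)
        J[start + 1] = i;                        // diagonal
        val[start + 1] = (double)rand() / RAND_MAX + 10.0;

        if (i < N - 1)                           // interior rows also have an upper neighbour
        {
            J[start + 2] = i + 1;
            val[start + 2] = (double)rand() / RAND_MAX;
            I[i + 1] = start + 3;
        }
        else
        {
            I[i + 1] = start + 2;
        }
    }
}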
int main(int argc, char **argv)
{
    int N = 0, nz = 0, *I = NULL, *J = NULL;
    float *val = NULL;
    const float tol = 1e-5f;
    const int max_iter = 10000;
    float *x;
    float *rhs;
    float a, b, na, r0, r1;
    float dot;
    float *r, *p, *Ax;
    int k;
    float alpha, beta, alpham1;

    printf("Starting [%s]...\n", sSDKname);

    // This will pick the best possible CUDA capable device
    cudaDeviceProp deviceProp;
    int devID = findCudaDevice(argc, (const char **)argv);
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));

#if defined(__APPLE__) || defined(MACOSX)
    fprintf(stderr, "Unified Memory not currently supported on OS X\n");
    cudaDeviceReset();
    exit(EXIT_WAIVED);
#endif

    if (sizeof(void *) != 8)
    {
        fprintf(stderr, "Unified Memory requires compiling for a 64-bit system.\n");
        cudaDeviceReset();
        exit(EXIT_WAIVED);
    }

    if (((deviceProp.major << 4) + deviceProp.minor) < 0x30)
    {
        fprintf(stderr, "%s requires Compute Capability of SM 3.0 or higher to run.\nexiting...\n", argv[0]);
        cudaDeviceReset();
        exit(EXIT_WAIVED);
    }

    // Statistics about the GPU device
    printf("> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n",
           deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor);

    /* Generate a random tridiagonal symmetric matrix in CSR format */
    N = 1048576;
    nz = (N - 2) * 3 + 4;
    cudaMallocManaged((void **)&I, sizeof(int) * (N + 1));
    cudaMallocManaged((void **)&J, sizeof(int) * nz);
    cudaMallocManaged((void **)&val, sizeof(float) * nz);
    genTridiag(I, J, val, N, nz);

    cudaMallocManaged((void **)&x, sizeof(float) * N);
    cudaMallocManaged((void **)&rhs, sizeof(float) * N);

    for (int i = 0; i < N; i++)
    {
        rhs[i] = 1.0;
        x[i] = 0.0;
    }

    /* Get handle to the CUBLAS context */
    cublasHandle_t cublasHandle = 0;
    cublasStatus_t cublasStatus;
    cublasStatus = cublasCreate(&cublasHandle);
    checkCudaErrors(cublasStatus);

    /* Get handle to the CUSPARSE context */
    cusparseHandle_t cusparseHandle = 0;
    cusparseStatus_t cusparseStatus;
    cusparseStatus = cusparseCreate(&cusparseHandle);
    checkCudaErrors(cusparseStatus);

    cusparseMatDescr_t descr = 0;
    cusparseStatus = cusparseCreateMatDescr(&descr);
    checkCudaErrors(cusparseStatus);

    cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);

    // temp memory for CG
    checkCudaErrors(cudaMallocManaged((void **)&r, N * sizeof(float)));
    checkCudaErrors(cudaMallocManaged((void **)&p, N * sizeof(float)));
    checkCudaErrors(cudaMallocManaged((void **)&Ax, N * sizeof(float)));

    cudaDeviceSynchronize();

    for (int i = 0; i < N; i++)
    {
        r[i] = rhs[i];
    }

    alpha = 1.0;
    alpham1 = -1.0;
    beta = 0.0;
    r0 = 0.;

    cusparseScsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nz, &alpha, descr, val, I, J, x, &beta, Ax);

    cublasSaxpy(cublasHandle, N, &alpham1, Ax, 1, r, 1);
    cublasStatus = cublasSdot(cublasHandle, N, r, 1, r, 1, &r1);

    k = 1;

    while (r1 > tol * tol && k <= max_iter)
    {
        if (k > 1)
        {
            b = r1 / r0;
            cublasStatus = cublasSscal(cublasHandle, N, &b, p, 1);
            cublasStatus = cublasSaxpy(cublasHandle, N, &alpha, r, 1, p, 1);
        }
        else
        {
            cublasStatus = cublasScopy(cublasHandle, N, r, 1, p, 1);
        }

        cusparseScsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nz, &alpha, descr, val, I, J, p, &beta, Ax);
        cublasStatus = cublasSdot(cublasHandle, N, p, 1, Ax, 1, &dot);
        a = r1 / dot;

        cublasStatus = cublasSaxpy(cublasHandle, N, &a, p, 1, x, 1);
        na = -a;
        cublasStatus = cublasSaxpy(cublasHandle, N, &na, Ax, 1, r, 1);

        r0 = r1;
        cublasStatus = cublasSdot(cublasHandle, N, r, 1, r, 1, &r1);
        cudaThreadSynchronize();

        printf("iteration = %3d, residual = %e\n", k, sqrt(r1));
        k++;
    }

    printf("Final residual: %e\n", sqrt(r1));

    fprintf(stdout, "&&&& uvm_cg test %s\n", (sqrt(r1) < tol) ? "PASSED" : "FAILED");

    float rsum, diff, err = 0.0;

    for (int i = 0; i < N; i++)
    {
        rsum = 0.0;

        for (int j = I[i]; j < I[i + 1]; j++)
        {
            rsum += val[j] * x[J[j]];
        }

        diff = fabs(rsum - rhs[i]);

        if (diff > err)
        {
            err = diff;
        }
    }

    cusparseDestroy(cusparseHandle);
    cublasDestroy(cublasHandle);

    cudaFree(I);
    cudaFree(J);
    cudaFree(val);
    cudaFree(x);
    cudaFree(r);
    cudaFree(p);
    cudaFree(Ax);

    cudaDeviceReset();

    printf("Test Summary: Error amount = %f, result = %s\n", err, (k <= max_iter) ? "SUCCESS" : "FAILURE");
    exit((k <= max_iter) ? EXIT_SUCCESS : EXIT_FAILURE);
}
int _tmain(int argc, _TCHAR* argv[])
{
    int M = 0, N = 0, nz = 0, *I = NULL, *J = NULL;
    cuDoubleComplex *val = NULL;
    cuDoubleComplex *x, *y;
    cuDoubleComplex *d_x, *d_y;
    double duration, duration_setup;

    std::clock_t setup_clock;
    setup_clock = std::clock();

    // This will pick the best possible CUDA capable device
    cudaDeviceProp deviceProp;
    int devID = findCudaDevice(argc, (const char **)argv);

    if (devID < 0)
    {
        printf("no devices found...\n");
        exit(EXIT_SUCCESS);
    }

    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));

    // Statistics about the GPU device
    printf("> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n",
           deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor);

    int version = (deviceProp.major * 0x10 + deviceProp.minor);

    if (version < 0x11)
    {
        printf("Requires a minimum CUDA compute 1.1 capability\n");
        cudaDeviceReset();
        exit(EXIT_SUCCESS);
    }

    M = N = 8388608;   //2 ^ 23
    //M = N = 4194304; //2 ^ 22
    //M = N = 2097152; //2 ^ 21
    //M = N = 1048576; //2 ^ 20
    //M = N = 524288;  //2 ^ 19

    nz = N * 8;
    I = (int *)malloc(sizeof(int) * (N + 1));
    J = (int *)malloc(sizeof(int) * nz);
    val = (cuDoubleComplex *)malloc(sizeof(cuDoubleComplex) * nz);
    genTridiag(I, J, val, N, nz);

    x = (cuDoubleComplex *)malloc(sizeof(cuDoubleComplex) * N);
    y = (cuDoubleComplex *)malloc(sizeof(cuDoubleComplex) * N);

    // x is the right-hand side of the solve, so give it a defined value;
    // y is the answer array and is zeroed for the test (could be random)
    for (int i = 0; i < N; i++)
    {
        x[i] = make_cuDoubleComplex(1.0, 0.0);
        y[i] = make_cuDoubleComplex(0.0, 0.0);
    }

    // Get handle to the CUBLAS context
    cublasHandle_t cublasHandle = 0;
    cublasStatus_t cublasStatus;
    cublasStatus = cublasCreate(&cublasHandle);
    checkCudaErrors(cublasStatus);

    // Get handle to the CUSPARSE context
    cusparseHandle_t cusparseHandle = 0;
    cusparseStatus_t cusparseStatus;
    cusparseStatus = cusparseCreate(&cusparseHandle);
    checkCudaErrors(cusparseStatus);

    // Get handle to a CUSPARSE matrix descriptor
    cusparseMatDescr_t descr = 0;
    cusparseStatus = cusparseCreateMatDescr(&descr);
    checkCudaErrors(cusparseStatus);

    // Get handle to a matrix_solve_info object
    cusparseSolveAnalysisInfo_t info = 0;
    cusparseStatus = cusparseCreateSolveAnalysisInfo(&info);
    checkCudaErrors(cusparseStatus);

    cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);

    duration_setup = (std::clock() - setup_clock) / (double)CLOCKS_PER_SEC;
    printf("setup_time: %f\r\n", duration_setup);

    std::clock_t start;
    start = std::clock();

    // Device copies of the CSR matrix and of the x/y vectors
    // (the cusparse calls below expect device pointers and cuDoubleComplex-sized buffers)
    int *d_row = NULL, *d_col = NULL;
    cuDoubleComplex *d_val = NULL;
    checkCudaErrors(cudaMalloc((void **)&d_row, (N + 1) * sizeof(int)));
    checkCudaErrors(cudaMalloc((void **)&d_col, nz * sizeof(int)));
    checkCudaErrors(cudaMalloc((void **)&d_val, nz * sizeof(cuDoubleComplex)));
    checkCudaErrors(cudaMalloc((void **)&d_x, N * sizeof(cuDoubleComplex)));
    checkCudaErrors(cudaMalloc((void **)&d_y, N * sizeof(cuDoubleComplex)));

    cudaMemcpy(d_row, I, (N + 1) * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_col, J, nz * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_val, val, nz * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);
    cudaMemcpy(d_x, x, N * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, y, N * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);

    // Analyze the matrix. The info variable is needed to perform additional operations on the matrix.
    cusparseStatus = cusparseZcsrsv_analysis(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                             N, nz, descr, d_val, d_row, d_col, info);

    // Use the info gathered from the analysis to solve d_y = alpha * inv(A) * d_x.
    cuDoubleComplex solve_alpha = make_cuDoubleComplex(1.0, 0.0);
    cusparseStatus = cusparseZcsrsv_solve(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                          N, &solve_alpha, descr, d_val, d_row, d_col, info, d_x, d_y);

    // Get the result back from the device
    cudaMemcpy(x, d_x, N * sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost);
    cudaMemcpy(y, d_y, N * sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost);

    duration = (std::clock() - start) / (double)CLOCKS_PER_SEC;
    printf("time elapsed: %f", duration);

    // Free up memory
    cusparseDestroy(cusparseHandle);
    cublasDestroy(cublasHandle);
    free(I);
    free(J);
    free(val);
    free(x);
    free(y);
    cudaFree(d_val);
    cudaFree(d_row);
    cudaFree(d_col);
    cudaFree(d_x);
    cudaFree(d_y);
    cudaDeviceReset();

    // Wait for user input so they can see the results
    char *s = (char *)malloc(sizeof(char) * 8);
    scanf("%7s", s);
    free(s);

    exit(0);
}
int main(int argc, char **argv)
{
    int M = 0, N = 0, nz = 0, *I = NULL, *J = NULL;
    float *val = NULL;
    const float tol = 1e-5f;
    const int max_iter = 10000;
    float *x;
    float *rhs;
    float a, b, na, r0, r1;
    int *d_col, *d_row;
    float *d_val, *d_x, dot;
    float *d_r, *d_p, *d_Ax;
    int k;
    float alpha, beta, alpham1;

    shrQAStart(argc, argv);

    // This will pick the best possible CUDA capable device
    cudaDeviceProp deviceProp;
    int devID = findCudaDevice(argc, (const char **)argv);

    if (devID < 0)
    {
        printf("exiting...\n");
        shrQAFinishExit(argc, (const char **)argv, QA_PASSED);
        exit(0);
    }

    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));

    // Statistics about the GPU device
    printf("> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n",
           deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor);

    int version = (deviceProp.major * 0x10 + deviceProp.minor);

    if (version < 0x11)
    {
        printf("%s: requires a minimum CUDA compute 1.1 capability\n", sSDKname);
        cudaDeviceReset();
        shrQAFinishExit(argc, (const char **)argv, QA_PASSED);
    }

    /* Generate a random tridiagonal symmetric matrix in CSR format */
    M = N = 1048576;
    nz = (N - 2) * 3 + 4;
    I = (int *)malloc(sizeof(int) * (N + 1));
    J = (int *)malloc(sizeof(int) * nz);
    val = (float *)malloc(sizeof(float) * nz);
    genTridiag(I, J, val, N, nz);

    x = (float *)malloc(sizeof(float) * N);
    rhs = (float *)malloc(sizeof(float) * N);

    for (int i = 0; i < N; i++)
    {
        rhs[i] = 1.0;
        x[i] = 0.0;
    }

    /* Get handle to the CUBLAS context */
    cublasHandle_t cublasHandle = 0;
    cublasStatus_t cublasStatus;
    cublasStatus = cublasCreate(&cublasHandle);

    if (checkCublasStatus(cublasStatus, "!!!! CUBLAS initialization error\n")) return EXIT_FAILURE;

    /* Get handle to the CUSPARSE context */
    cusparseHandle_t cusparseHandle = 0;
    cusparseStatus_t cusparseStatus;
    cusparseStatus = cusparseCreate(&cusparseHandle);

    if (checkCusparseStatus(cusparseStatus, "!!!! CUSPARSE initialization error\n")) return EXIT_FAILURE;

    cusparseMatDescr_t descr = 0;
    cusparseStatus = cusparseCreateMatDescr(&descr);

    if (checkCusparseStatus(cusparseStatus, "!!!! CUSPARSE cusparseCreateMatDescr error\n")) return EXIT_FAILURE;

    cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);

    checkCudaErrors(cudaMalloc((void **)&d_col, nz * sizeof(int)));
    checkCudaErrors(cudaMalloc((void **)&d_row, (N + 1) * sizeof(int)));
    checkCudaErrors(cudaMalloc((void **)&d_val, nz * sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_x, N * sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_r, N * sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_p, N * sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_Ax, N * sizeof(float)));

    cudaMemcpy(d_col, J, nz * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_row, I, (N + 1) * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_val, val, nz * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_r, rhs, N * sizeof(float), cudaMemcpyHostToDevice);

    alpha = 1.0;
    alpham1 = -1.0;
    beta = 0.0;
    r0 = 0.;

    cusparseScsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nz, &alpha, descr, d_val, d_row, d_col, d_x, &beta, d_Ax);

    cublasSaxpy(cublasHandle, N, &alpham1, d_Ax, 1, d_r, 1);
    cublasStatus = cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, &r1);

    k = 1;

    while (r1 > tol * tol && k <= max_iter)
    {
        if (k > 1)
        {
            b = r1 / r0;
            cublasStatus = cublasSscal(cublasHandle, N, &b, d_p, 1);
            cublasStatus = cublasSaxpy(cublasHandle, N, &alpha, d_r, 1, d_p, 1);
        }
        else
        {
            cublasStatus = cublasScopy(cublasHandle, N, d_r, 1, d_p, 1);
        }

        cusparseScsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nz, &alpha, descr, d_val, d_row, d_col, d_p, &beta, d_Ax);
        cublasStatus = cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, &dot);
        a = r1 / dot;

        cublasStatus = cublasSaxpy(cublasHandle, N, &a, d_p, 1, d_x, 1);
        na = -a;
        cublasStatus = cublasSaxpy(cublasHandle, N, &na, d_Ax, 1, d_r, 1);

        r0 = r1;
        cublasStatus = cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, &r1);
        cudaThreadSynchronize();

        printf("iteration = %3d, residual = %e\n", k, sqrt(r1));
        k++;
    }

    cudaMemcpy(x, d_x, N * sizeof(float), cudaMemcpyDeviceToHost);

    float rsum, diff, err = 0.0;

    for (int i = 0; i < N; i++)
    {
        rsum = 0.0;

        for (int j = I[i]; j < I[i + 1]; j++)
        {
            rsum += val[j] * x[J[j]];
        }

        diff = fabs(rsum - rhs[i]);

        if (diff > err) err = diff;
    }

    cusparseDestroy(cusparseHandle);
    cublasDestroy(cublasHandle);

    free(I);
    free(J);
    free(val);
    free(x);
    free(rhs);
    cudaFree(d_col);
    cudaFree(d_row);
    cudaFree(d_val);
    cudaFree(d_x);
    cudaFree(d_r);
    cudaFree(d_p);
    cudaFree(d_Ax);

    cudaDeviceReset();

    printf("Test Summary: Error amount = %f\n", err);
    shrQAFinishExit(argc, (const char **)argv, (k <= max_iter) ? QA_PASSED : QA_FAILED);
}