// Computes, entirely on the device via cuBLAS (v2 API) inside an OpenACC
// data region:
//   C    = A^T * B^T        (DGEMM, both operands transposed)
//   r    = -(C . B)         (DDOT, negated on the host)
//   C    = C + r*B          (DAXPY, r used as a host-mode scalar)
//   nrmC = ||C||_2          (DNRM2)
//   D    = C / nrmC         (DCOPY, then DSCAL by the host scalar 1/nrmC)
// A, B, C, D are device-resident (declared `present`); r and nrmC are host
// pointers that double as result targets and host-pointer-mode scalars.
// NOTE(review): a cublasHandle_t is created and destroyed on every call —
// expensive if this runs in a loop; consider caching the handle.
// NOTE(review): no cublasStatus_t return codes are checked, and *nrmC == 0
// would divide by zero below — confirm inputs guarantee a nonzero norm.
void gpu_cublas1(double *A, double *B, double *C, double *D, double *r, double *nrmC, int N, int N2) {
#pragma acc data present(A, B, C, D)
    {
#pragma acc host_data use_device(A, B, C, D)
        {
            cublasHandle_t handle;
            cublasCreate(&handle);
            const double alpha = 1.0;
            const double beta = 0.0;
            // C = alpha * op(A) * op(B) + beta * C, with op = transpose on both.
            cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T, N, N, N, &alpha, A, N, B, N, &beta, C, N);
            printf(" gpu gemm success \n");
            // r = C . B (host result pointer: the call blocks until r is ready).
            cublasDdot(handle, N2, C, 1, B, 1, r);
            printf(" gpu dot success \n");
            *r = -1.0 * *r; // negate on the host before using r as the axpy alpha
            cublasDaxpy(handle, N2, r, B, 1, C, 1);
            printf(" gpu axpy success \n");
            cublasDnrm2(handle, N2, C, 1, nrmC);
            printf(" gpu nrm2 success \n");
            cublasDcopy(handle, N2, C, 1, D, 1);
            printf(" gpu copy success \n");
            *nrmC = 1.0 / *nrmC; // reciprocal so the DSCAL below normalizes D
            cublasDscal(handle, N2, nrmC, D, 1);
            printf(" gpu scal success \n");
            cublasDestroy(handle);
            printf(" gpu destroy success \n");
        }
    }
}
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { // At least 2 arguments expected // Input and result if ((nrhs!=2) || (nlhs != 1)) mexErrMsgTxt("Wrong number of arguments"); if (init == 0) { gm = gmGetGPUmat(); init = 1; } /* mex parameters are: 0 array 1 1 array 2 This function is a wrapper for the CUBLAS double-prec dot product function pending newer matcuda */ GPUtype arrayA = gm->gputype.getGPUtype(prhs[0]); GPUtype arrayB = gm->gputype.getGPUtype(prhs[1]); int numElements = gm->gputype.getNumel(arrayA); if (numElements != gm->gputype.getNumel(arrayB)) mexErrMsgTxt("Arrays contain different numbers of elements.\n"); plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); //mxReal is our data-type double *AdotB = mxGetPr(plhs[0]); void *pointerA = (void *)gm->gputype.getGPUptr(arrayA); void *pointerB = (void *)gm->gputype.getGPUptr(arrayB); double *u = (double*)pointerA; double *v = (double *)pointerB; AdotB[0] = cublasDdot(numElements, u, 1, v, 1); }
// Dot product of two double-precision device vectors.
// Forwards to the legacy CUBLAS interface, which returns the result by
// value on the host; n elements are read with strides incx / incy.
double magma_ddot(
    magma_int_t n,
    const double *dx, magma_int_t incx,
    const double *dy, magma_int_t incy )
{
    const double result = cublasDdot( n, dx, incx, dy, incy );
    return result;
}
// Dot product for device arrays:
//  * 1-D x 1-D : scalar inner product via cublasDdot (1-element Darray result)
//  * 2-D x 2-D : matrix product via cublasDgemm (lhs.rows x rhs.cols result)
// Operands must live on the same device and have equal rank (< 3); any other
// combination returns a default-constructed Darray.
// NOTE(review): in the collapsed one-line form, the first `//` commented out
// the CHECK_* guards; the original line structure is restored here.
Darray<double> cudot (const Darray<double>& lhs, const Darray<double>& rhs) {
    // context check: same device, same (small) rank
    CHECK_EQ(lhs.getDeviceManager().getDeviceID(), rhs.getDeviceManager().getDeviceID());
    CHECK_EQ(lhs.ndim(), rhs.ndim());
    CHECK_LT(lhs.ndim(), 3);
    CHECK_LT(rhs.ndim(), 3);

    Darray<double> ret;
    if (lhs.ndim()==1 && rhs.ndim()==1) {
        // shape check
        CHECK_EQ(lhs.size(), rhs.size());
        ret = Darray<double>(lhs.getDeviceManager(), {1});
        // using cublas ddot
        // NOTE(review): this branch reads `.data` while the 2-D branch uses
        // `.dev_data` — confirm which member holds the device pointer.
        lhs.deviceSet();
        // Wrapped in CUBLAS_SAFE_CALL for consistency with the gemm branch.
        CUBLAS_SAFE_CALL(
            cublasDdot (DeviceManager::handle,
                        lhs.size(),
                        lhs.data, 1,
                        rhs.data, 1,
                        ret.data) );
    }
    // 2D matrix dot
    else if (lhs.ndim()==2 && rhs.ndim()==2) {
        // shape check: inner dimensions must agree
        CHECK_EQ(lhs.shape()[1], rhs.shape()[0]);
        ret = Darray<double>(lhs.getDeviceManager(), {lhs.shape()[0], rhs.shape()[1]});
        // using cublas dgemm (column-major, no transposition)
        lhs.deviceSet();
        const double alpha = 1.;
        const double beta = 0.;
        CUBLAS_SAFE_CALL(
            cublasDgemm (DeviceManager::handle,
                         CUBLAS_OP_N, CUBLAS_OP_N,
                         lhs.shape()[0], rhs.shape()[1], lhs.shape()[1],
                         &alpha,
                         lhs.dev_data, lhs.shape()[0],
                         rhs.dev_data, rhs.shape()[0],
                         &beta,
                         ret.dev_data, ret.shape()[0]) );
    }
    return ret;
}
void dot_gpu(double *x, double *y, double *result, int N) { #pragma acc data present(x, y) { #pragma acc host_data use_device(x, y) { cublasHandle_t h; cublasCreate(&h); cublasDdot(h, N, x, 1, y, 1, result); cublasDestroy(h); } } }
void mpla_ddot(double* xy, struct mpla_vector* x, struct mpla_vector* y, struct mpla_instance* instance) { // compute process-wise dot product double xy_tmp; cublasDdot(instance->cublas_handle, x->cur_proc_row_count, x->data, 1, y->data, 1, &xy_tmp); // create sub-communicator for each process column int remain_dims[2]; remain_dims[0]=1; remain_dims[1]=0; MPI_Comm column_comm; MPI_Cart_sub(instance->comm, remain_dims, &column_comm); // parallel summation and communication MPI_Allreduce(&xy_tmp, xy, 1, MPI_DOUBLE, MPI_SUM, column_comm); MPI_Comm_free(&column_comm); }
// R wrapper: GPU dot product of two numeric vectors via the legacy CUBLAS
// interface. Uses min(length(rx), length(ry)) elements with the requested
// strides and returns a length-1 REALSXP holding the result.
SEXP d_dot(SEXP rx, SEXP rincx, SEXP ry, SEXP rincy)
{
    double *x, *y;
    int nx, ny;
    unpackVector(rx, &nx, &x);
    unpackVector(ry, &ny, &y);

    const int incx = asInteger(rincx);
    const int incy = asInteger(rincy);
    const int n = imin2(nx, ny);   // guard against unequal vector lengths

    SEXP out;
    PROTECT(out = allocVector(REALSXP, 1));
    REAL(out)[0] = cublasDdot(n, x, incx, y, incy);
    checkCublasError("d_dot");     // raises an R error on CUBLAS failure
    UNPROTECT(1);
    return out;
}
long benchmark(int size) { long requestStart, requestEnd; int incx = 1, incy = 1, n = size; double *cuA, *cuB; cublasStatus status; double* a = random_array(size); double* b = random_array(size); status = cublasAlloc(n, sizeof(double),(void**)&cuA); checkStatus("A", status); status = cublasAlloc(n, sizeof(double),(void**)&cuB); checkStatus("B", status); status = cublasSetVector(n, sizeof(double), a, incx, cuA, incx); checkStatus("setA", status); status = cublasSetVector(n, sizeof(double), b, incy, cuB, incy); checkStatus("setB", status); requestStart = currentTimeNanos(); cublasDdot(n, cuA, incx, cuB, incy); requestEnd = currentTimeNanos(); status = cublasFree(cuA); checkStatus("freeA", status); status = cublasFree(cuB); checkStatus("freeB", status); free(a); free(b); return (requestEnd - requestStart); }
// Explicit double specialization of caffe_gpu_dot: *out = x . y on the GPU,
// using the framework-wide cuBLAS handle; CUBLAS_CHECK aborts on failure.
// Fix: an explicit specialization requires the `template <>` prefix, which
// was missing here.
template <>
void caffe_gpu_dot<double>(const int n, const double* x, const double* y,
                           double * out) {
  CUBLAS_CHECK(cublasDdot(Caffe::cublas_handle(), n, x, 1, y, 1, out));
}
int main(int argc, char* argv[]) { const int bufsize = 512; char buffer[bufsize]; int m,n,S; double time_st,time_end,time_avg; //omp_set_num_threads(2); // printf("\n-----------------\nnumber of threads fired = %d\n-----------------\n",(int)omp_get_num_threads()); if(argc!=2) { cout<<"Insufficient arguments"<<endl; return 1; } graph G; cerr<<"Start reading "; // time_st=dsecnd(); G.create_graph(argv[1]); // time_end=dsecnd(); // time_avg = (time_end-time_st); // cout<<"Success "<<endl; // cerr<<"Reading time "<<time_avg<<endl; cerr<<"Constructing Matrices "; // time_st=dsecnd(); G.construct_MNA(); // time_end=dsecnd(); // time_avg = (time_end-time_st); // cerr<<"Done "<<time_avg<<endl; // G.construct_sparse_MNA(); m=G.node_array.size()-1; n=G.voltage_edge_id.size(); cout<<endl; cout<<"MATRIX STAT:"<<endl; cout<<"Nonzero elements: "<<G.nonzero<<endl; cout<<"Number of Rows: "<<m+n<<endl; printf("\n Nonzero = %ld", G.nonzero); printf("\n Rows = %d", m+n); cout<<"MAT val: "<<endl; int i,j; // G.Mat_val[0] +=100; /* for(i=0;i<G.nonzero;i++) cout<<" "<<G.Mat_val[i]; cout<<endl; for(i=0;i<G.nonzero;i++) cout<<" "<<G.columns[i]; cout<<endl; for(i=0;i<m+n+1;i++) cout<<" "<<G.rowIndex[i]; cout<<endl; for(i=0;i<m+n;i++) { cout<<endl; int startindex=G.rowIndex[i]; int endindex=G.rowIndex[i+1]; for(j=startindex;j<endindex;j++) cout<<" "<<G.Mat_val[j]; cout<<endl; } */ /* for (i=0;i<m+n+1;i++) { //cout<<endl; if(G.rowIndex[i]==G.rowIndex[i+1]) break; for(j=G.rowIndex[i];j<G.rowIndex[i+1];j++) { if(G.Mat_val[j]>10) cout<<G.Mat_val[j]<<"\t"; } //cout<<endl; /*for(j=G.rowIndex[i];j<G.rowIndex[i+1];j++) { cout<<G.columns[j]<<"\t"; } //cout<<endl; } cout<<endl; */ //printing the matrix printf("\n Fine till here"); printf("\n"); // int* rowmIndex=(int*)calloc(m+1,sizeof(int)); printf("\n Fine till here"); printf("\n"); //int rowmIndex[5]={1,2,3,4,5}; /* for(i=0;i<m+1;i++) { rowmIndex[i]=G.rowIndex[i]; printf(" %d", rowmIndex[i]); } */ cerr<<"Solving Equations "<<endl; double r1, b, 
alpha, alpham1, beta, r0, a, na; const double tol = 0.1; const int max_iter = 1000000; int *d_col, *d_row; double *d_val, *d_x, dot; double *d_r, *d_p, *d_Ax; int k; cublasHandle_t cublasHandle = 0; cublasStatus_t cublasStatus; cublasStatus = cublasCreate(&cublasHandle); checkCudaErrors(cublasStatus); /* Get handle to the CUSPARSE context */ cusparseHandle_t cusparseHandle = 0; cusparseStatus_t cusparseStatus; cusparseStatus = cusparseCreate(&cusparseHandle); checkCudaErrors(cusparseStatus); cusparseMatDescr_t descr = 0; cusparseStatus = cusparseCreateMatDescr(&descr); checkCudaErrors(cusparseStatus); cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL); cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO); checkCudaErrors(cudaMalloc((void **)&d_col, G.nonzero*sizeof(int))); checkCudaErrors(cudaMalloc((void **)&d_row, (m+n+1)*sizeof(int))); checkCudaErrors(cudaMalloc((void **)&d_val, G.nonzero*sizeof(double))); checkCudaErrors(cudaMalloc((void **)&d_x, (m+n)*sizeof(double))); checkCudaErrors(cudaMalloc((void **)&d_r, (m+n)*sizeof(double))); checkCudaErrors(cudaMalloc((void **)&d_p, (m+n)*sizeof(double))); checkCudaErrors(cudaMalloc((void **)&d_Ax, (m+n)*sizeof(double))); cudaMemcpy(d_col, G.columns, G.nonzero*sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(d_row, G.rowIndex, (m+n+1)*sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(d_val, G.Mat_val, G.nonzero*sizeof(double), cudaMemcpyHostToDevice); cudaMemcpy(d_x, G.x, (m+n)*sizeof(double), cudaMemcpyHostToDevice); cudaMemcpy(d_r, G.b, (m+n)*sizeof(double), cudaMemcpyHostToDevice); alpha = 1.0; alpham1 = -1.0; beta = 0.0; r0 = 0.; printf("\n Data transferred\n"); cudaEvent_t start,stop; cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start, 0); cusparseDcsrmv(cusparseHandle,CUSPARSE_OPERATION_NON_TRANSPOSE, (m+n), (m+n), G.nonzero, &alpha, descr, d_val, d_row, d_col, d_x, &beta, d_Ax); cublasDaxpy(cublasHandle, (m+n), &alpham1, d_Ax, 1, d_r, 1); cublasStatus = cublasDdot(cublasHandle, (m+n), 
d_r, 1, d_r, 1, &r1); k = 1; while (r1 > tol && k <= max_iter) { if (k > 1) { b = r1 / r0; cublasStatus = cublasDscal(cublasHandle, (m+n), &b, d_p, 1); cublasStatus = cublasDaxpy(cublasHandle, (m+n), &alpha, d_r, 1, d_p, 1); } else { cublasStatus = cublasDcopy(cublasHandle, (m+n), d_r, 1, d_p, 1); } cusparseDcsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, (m+n), (m+n), G.nonzero, &alpha, descr, d_val, d_row, d_col, d_p, &beta, d_Ax); cublasStatus = cublasDdot(cublasHandle, (m+n), d_p, 1, d_Ax, 1, &dot); a = r1 / dot; cublasStatus = cublasDaxpy(cublasHandle, (m+n), &a, d_p, 1, d_x, 1); na = -a; cublasStatus = cublasDaxpy(cublasHandle, (m+n), &na, d_Ax, 1, d_r, 1); r0 = r1; cublasStatus = cublasDdot(cublasHandle, (m+n), d_r, 1, d_r, 1, &r1); // cudaThreadSynchronize(); // printf("iteration = %3d, residual = %e\n", k, sqrt(r1)); k++; } cudaEventRecord(stop, 0); cudaEventSynchronize(stop); float elapsedTime; cudaEventElapsedTime(&elapsedTime, start, stop); printf("Iterations = %3d\tTime : %.6f milli-seconds : \n", k, elapsedTime); cudaMemcpy(G.x, d_x, (m+n)*sizeof(double), cudaMemcpyDeviceToHost); /* printf("\n x = \n"); for(i=0;i<(m+n);i++) { printf("\n x[%d] = %.8f", i, G.x[i]); } */ float rsum, diff, err = 0.0; for (int i = 0; i < (m+n); i++) { rsum = 0.0; for (int j = G.rowIndex[i]; j < G.rowIndex[i+1]; j++) { rsum += G.Mat_val[j]*G.x[G.columns[j]]; } diff = fabs(rsum - G.b[i]); if (diff > err) { err = diff; } } cusparseDestroy(cusparseHandle); cublasDestroy(cublasHandle); /* free(I); free(J); free(val); free(x); free(rhs); */ cudaFree(d_col); cudaFree(d_row); cudaFree(d_val); cudaFree(d_x); cudaFree(d_r); cudaFree(d_p); cudaFree(d_Ax); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. 
Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); /* printf("\n X is:\n"); for(i=0;i<(m+n);i++) { printf("\n X[%d] = %.4f", i, G.x[i]); } */ printf("Test Summary: Error amount = %f\n", err); exit((k <= max_iter) ? 0 : 1); /* culaSparseHandle handle; if (culaSparseCreate(&handle) != culaSparseNoError) { // this should only fail under extreme conditions std::cout << "fatal error: failed to create library handle!" << std::endl; exit(EXIT_FAILURE); } StatusChecker sc(handle); culaSparsePlan plan; sc = culaSparseCreatePlan(handle, &plan); culaSparseCsrOptions formatOpts; culaSparseCsrOptionsInit(handle, &formatOpts); //formatOpts.indexing=1; sc = culaSparseSetDcsrData(handle, plan, &formatOpts, m+n, G.nonzero, &G.Mat_val[0], &G.rowIndex[0], &G.columns[0], &G.x[0], &G.b[0]); printf("\n Fine till here"); printf("\n"); culaSparseConfig config; sc = culaSparseConfigInit(handle, &config); config.relativeTolerance = 1e-2; config.maxIterations = 100000; config.divergenceTolerance = 20; culaSparseResult result; /* sc = culaSparseSetHostPlatform(handle, plan, 0); // set cg solver sc = culaSparseSetCgSolver(handle, plan, 0); printf("\n Fine till here"); printf("\n"); // perform solve (cg + no preconditioner on host) culaSparseResult result; sc = culaSparseExecutePlan(handle, plan, &config, &result); sc = culaSparseGetResultString(handle, &result, buffer, bufsize); std::cout << buffer << std::endl; if (culaSparsePreinitializeCuda(handle) == culaSparseNoError) { // change to cuda accelerated platform sc = culaSparseSetCudaPlatform(handle, plan, 0); // perform solve (cg + ilu0 on cuda) culaSparseGmresOptions solverOpts; culaSparseStatus status = culaSparseGmresOptionsInit(handle, &solverOpts); solverOpts.restart = 20; sc = culaSparseSetCgSolver(handle, plan, 0); // sc = culaSparseSetJacobiPreconditioner(handle, plan, 0); // sc = culaSparseSetIlu0Preconditioner(handle, plan, 0); sc = culaSparseExecutePlan(handle, plan, 
&config, &result); sc = culaSparseGetResultString(handle, &result, buffer, bufsize); std::cout << buffer << std::endl; // change preconditioner to fainv // this avoids data transfer by using data cached by the plan // sc = culaSparseSetIlu0Preconditioner(handle, plan, 0); // perform solve (cg + fainv on cuda) // these timing results should indicate minimal overhead /* sc = culaSparseExecutePlan(handle, plan, &config, &result); sc = culaSparseGetResultString(handle, &result, buffer, bufsize); std::cout << buffer << std::endl; // change solver // this avoids preconditioner regeneration by using data cached by the plan sc = culaSparseSetBicgstabSolver(handle, plan, 0); // perform solve (bicgstab + fainv on cuda) // the timing results should indicate minimal overhead and preconditioner generation time sc = culaSparseExecutePlan(handle, plan, &config, &result); sc = culaSparseGetResultString(handle, &result, buffer, bufsize); std::cout << buffer << std::endl; sc = culaSparseSetGmresSolver(handle, plan, 0); // perform solve (bicgstab + fainv on cuda) // the timing results should indicate minimal overhead and preconditioner generation time sc = culaSparseExecutePlan(handle, plan, &config, &result); sc = culaSparseGetResultString(handle, &result, buffer, bufsize); std::cout << buffer << std::endl; } else { std::cout << "alert: no cuda capable gpu found" << std::endl; } // cleanup plan culaSparseDestroyPlan(plan); // cleanup handle culaSparseDestroy(handle); FILE* myWriteFile; myWriteFile=fopen("result.txt","w"); for (i = 0; i < n; i++) { fprintf(myWriteFile,"%1f\n",G.x[i]); // printf ("\n x [%d] = % f", i, x[i]); } fprintf(myWriteFile,".end\n"); fclose(myWriteFile); printf ("\n"); // time_st=dsecnd(); // solver(G.rowIndex,G.columns,G.Mat_val,G.b,G.x,m+n,G.nonzero); // time_end=dsecnd(); // time_avg = (time_end-time_st); // printf("Successfully Solved in : %.6f secs\n",time_avg); cerr<<endl; cerr<<"Fillup Graph "; // time_st=dsecnd(); G.fillup_graph(); // time_end=dsecnd(); 
// time_avg = (time_end-time_st); // cerr<<"Done "<<time_avg<<endl; //G.output_graph_stdout(); cerr<<"Matching KCL "; // time_st=dsecnd(); G.check_kcl(); // time_end=dsecnd(); // time_avg = (time_end-time_st); // cerr<<"Done "<<time_avg<<endl; /*for (int i=0;i<m+n;i++) { cout<<"M"<<i<<endl; for (int j=0;j<m+n;j++) cout<<" "<<j<<"#"<<M[i][j]<<endl; }*/ }
// Type-dispatched dot product, double flavor: *result = x . y.
// Thin forwarder onto the v2 cuBLAS API using the globally cached handle,
// propagating the cuBLAS status code to the caller.
cublasStatus_t cublasXdot(int n, const double *x, int incx,
                          const double *y, int incy, double *result)
{
    cublasHandle_t handle = g_context->cublasHandle;
    return cublasDdot(handle, n, x, incx, y, incy, result);
}
// Inner product of two device vectors via the legacy CUBLAS interface.
// The vectors must have equal length; each Vector converts implicitly to
// its device pointer when passed to cublasDdot.
double dot(const Vector<double> &x, const Vector<double> &y)
{
    const auto len = x.getSize();
    assert(len == y.getSize());
    return cublasDdot(len, x, x.inc(), y, y.inc());
}
// // Overloaded function for dispatching to // * CUBLAS backend, and // * double value-type. // inline double dot( const int n, const double* x, const int incx, const double* y, const int incy ) { return cublasDdot( n, x, incx, y, incy ); }