void CUDABLAS1::scal( IndexType n, const double alpha, double* x_d, const IndexType incx, SyncToken* syncToken ) { LAMA_CHECK_CUDA_ACCESS cudaStream_t stream = NULL; if ( syncToken ) { CUDAStreamSyncToken* cudaStreamSyncToken = dynamic_cast<CUDAStreamSyncToken*>( syncToken ); LAMA_ASSERT_DEBUG( cudaStreamSyncToken, "no cuda stream sync token provided" ) stream = cudaStreamSyncToken->getCUDAStream(); } cublasSetKernelStream( stream ); LAMA_CHECK_CUBLAS_ERROR cublasDscal( n, alpha, x_d, incx ); // No error check here possible as kernel is started asynchronously if ( !syncToken ) { cudaStreamSynchronize( 0 ); LAMA_CHECK_CUDA_ERROR }
void gpu_cublas1(double *A, double *B, double *C, double *D, double *r, double *nrmC, int N, int N2) { #pragma acc data present(A, B, C, D) { #pragma acc host_data use_device(A, B, C, D) { cublasHandle_t handle; cublasCreate(&handle); const double alpha = 1.0; const double beta = 0.0; cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T, N, N, N, &alpha, A, N, B, N, &beta, C, N); printf(" gpu gemm success \n"); cublasDdot(handle, N2, C, 1, B, 1, r); printf(" gpu dot success \n"); *r = -1.0 * *r; cublasDaxpy(handle, N2, r, B, 1, C, 1); printf(" gpu axpy success \n"); cublasDnrm2(handle, N2, C, 1, nrmC); printf(" gpu nrm2 success \n"); cublasDcopy(handle, N2, C, 1, D, 1); printf(" gpu copy success \n"); *nrmC = 1.0 / *nrmC; cublasDscal(handle, N2, nrmC, D, 1); printf(" gpu scal success \n"); cublasDestroy(handle); printf(" gpu destroy success \n"); } } }
void magma_dscal( magma_int_t n, double alpha, double *dx, magma_int_t incx ) { cublasDscal( n, alpha, dx, incx ); }
void d_scal(SEXP ralpha, SEXP rx, SEXP rincx) { int n, incx = asInteger(rincx); double * x, alpha = asReal(ralpha); unpackVector(rx, &n, &x); cublasDscal(n, alpha, x, incx); checkCublasError("d_scal"); }
static vl::Error scal(vl::Context & context, ptrdiff_t n, type alpha, type *x, ptrdiff_t incx) { cublasHandle_t handle ; cublasStatus_t status ; status = context.getCudaHelper().getCublasHandle(&handle) ; if (status != CUBLAS_STATUS_SUCCESS) goto done ; status = cublasDscal(handle, (int)n, &alpha, x, (int)incx) ; done: return context.setError (context.getCudaHelper().catchCublasError(status, "cublasDscal"), __func__) ; }
void cube_blas_d_scal (cube_t *ctx, int n, const double *alpha, double *x, int incx) { cublasStatus_t status; if (! cube_context_check (ctx)) return; status = cublasDscal (ctx->h_blas, n, alpha, x, incx); cube_blas_check (ctx, status); }
void caffe_gpu_scale<double>(const int n, const double alpha, const double *x, double* y) { CUBLAS_CHECK(cublasDcopy(Caffe::cublas_handle(), n, x, 1, y, 1)); CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), n, &alpha, y, 1)); }
void caffe_gpu_scal<double>(const int N, const double alpha, double *X) { CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), N, &alpha, X, 1)); }
int main(int argc, char* argv[]) { const int bufsize = 512; char buffer[bufsize]; int m,n,S; double time_st,time_end,time_avg; //omp_set_num_threads(2); // printf("\n-----------------\nnumber of threads fired = %d\n-----------------\n",(int)omp_get_num_threads()); if(argc!=2) { cout<<"Insufficient arguments"<<endl; return 1; } graph G; cerr<<"Start reading "; // time_st=dsecnd(); G.create_graph(argv[1]); // time_end=dsecnd(); // time_avg = (time_end-time_st); // cout<<"Success "<<endl; // cerr<<"Reading time "<<time_avg<<endl; cerr<<"Constructing Matrices "; // time_st=dsecnd(); G.construct_MNA(); // time_end=dsecnd(); // time_avg = (time_end-time_st); // cerr<<"Done "<<time_avg<<endl; // G.construct_sparse_MNA(); m=G.node_array.size()-1; n=G.voltage_edge_id.size(); cout<<endl; cout<<"MATRIX STAT:"<<endl; cout<<"Nonzero elements: "<<G.nonzero<<endl; cout<<"Number of Rows: "<<m+n<<endl; printf("\n Nonzero = %ld", G.nonzero); printf("\n Rows = %d", m+n); cout<<"MAT val: "<<endl; int i,j; // G.Mat_val[0] +=100; /* for(i=0;i<G.nonzero;i++) cout<<" "<<G.Mat_val[i]; cout<<endl; for(i=0;i<G.nonzero;i++) cout<<" "<<G.columns[i]; cout<<endl; for(i=0;i<m+n+1;i++) cout<<" "<<G.rowIndex[i]; cout<<endl; for(i=0;i<m+n;i++) { cout<<endl; int startindex=G.rowIndex[i]; int endindex=G.rowIndex[i+1]; for(j=startindex;j<endindex;j++) cout<<" "<<G.Mat_val[j]; cout<<endl; } */ /* for (i=0;i<m+n+1;i++) { //cout<<endl; if(G.rowIndex[i]==G.rowIndex[i+1]) break; for(j=G.rowIndex[i];j<G.rowIndex[i+1];j++) { if(G.Mat_val[j]>10) cout<<G.Mat_val[j]<<"\t"; } //cout<<endl; /*for(j=G.rowIndex[i];j<G.rowIndex[i+1];j++) { cout<<G.columns[j]<<"\t"; } //cout<<endl; } cout<<endl; */ //printing the matrix printf("\n Fine till here"); printf("\n"); // int* rowmIndex=(int*)calloc(m+1,sizeof(int)); printf("\n Fine till here"); printf("\n"); //int rowmIndex[5]={1,2,3,4,5}; /* for(i=0;i<m+1;i++) { rowmIndex[i]=G.rowIndex[i]; printf(" %d", rowmIndex[i]); } */ cerr<<"Solving Equations "<<endl; double r1, b, alpha, alpham1, beta, r0, a, na; const double tol = 0.1; const int max_iter = 1000000; int *d_col, *d_row; double *d_val, *d_x, dot; double *d_r, *d_p, *d_Ax; int k; cublasHandle_t cublasHandle = 0; cublasStatus_t cublasStatus; cublasStatus = cublasCreate(&cublasHandle); checkCudaErrors(cublasStatus); /* Get handle to the CUSPARSE context */ cusparseHandle_t cusparseHandle = 0; cusparseStatus_t cusparseStatus; cusparseStatus = cusparseCreate(&cusparseHandle); checkCudaErrors(cusparseStatus); cusparseMatDescr_t descr = 0; cusparseStatus = cusparseCreateMatDescr(&descr); checkCudaErrors(cusparseStatus); cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL); cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO); checkCudaErrors(cudaMalloc((void **)&d_col, G.nonzero*sizeof(int))); checkCudaErrors(cudaMalloc((void **)&d_row, (m+n+1)*sizeof(int))); checkCudaErrors(cudaMalloc((void **)&d_val, G.nonzero*sizeof(double))); checkCudaErrors(cudaMalloc((void **)&d_x, (m+n)*sizeof(double))); checkCudaErrors(cudaMalloc((void **)&d_r, (m+n)*sizeof(double))); checkCudaErrors(cudaMalloc((void **)&d_p, (m+n)*sizeof(double))); checkCudaErrors(cudaMalloc((void **)&d_Ax, (m+n)*sizeof(double))); cudaMemcpy(d_col, G.columns, G.nonzero*sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(d_row, G.rowIndex, (m+n+1)*sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(d_val, G.Mat_val, G.nonzero*sizeof(double), cudaMemcpyHostToDevice); cudaMemcpy(d_x, G.x, (m+n)*sizeof(double), cudaMemcpyHostToDevice); cudaMemcpy(d_r, G.b, (m+n)*sizeof(double), cudaMemcpyHostToDevice); alpha = 1.0; alpham1 = -1.0; beta = 0.0; r0 = 0.; printf("\n Data transferred\n"); cudaEvent_t start,stop; cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start, 0); cusparseDcsrmv(cusparseHandle,CUSPARSE_OPERATION_NON_TRANSPOSE, (m+n), (m+n), G.nonzero, &alpha, descr, d_val, d_row, d_col, d_x, &beta, d_Ax); cublasDaxpy(cublasHandle, (m+n), &alpham1, d_Ax, 1, d_r, 1); cublasStatus = cublasDdot(cublasHandle, (m+n), d_r, 1, d_r, 1, &r1); k = 1; while (r1 > tol && k <= max_iter) { if (k > 1) { b = r1 / r0; cublasStatus = cublasDscal(cublasHandle, (m+n), &b, d_p, 1); cublasStatus = cublasDaxpy(cublasHandle, (m+n), &alpha, d_r, 1, d_p, 1); } else { cublasStatus = cublasDcopy(cublasHandle, (m+n), d_r, 1, d_p, 1); } cusparseDcsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, (m+n), (m+n), G.nonzero, &alpha, descr, d_val, d_row, d_col, d_p, &beta, d_Ax); cublasStatus = cublasDdot(cublasHandle, (m+n), d_p, 1, d_Ax, 1, &dot); a = r1 / dot; cublasStatus = cublasDaxpy(cublasHandle, (m+n), &a, d_p, 1, d_x, 1); na = -a; cublasStatus = cublasDaxpy(cublasHandle, (m+n), &na, d_Ax, 1, d_r, 1); r0 = r1; cublasStatus = cublasDdot(cublasHandle, (m+n), d_r, 1, d_r, 1, &r1); // cudaThreadSynchronize(); // printf("iteration = %3d, residual = %e\n", k, sqrt(r1)); k++; } cudaEventRecord(stop, 0); cudaEventSynchronize(stop); float elapsedTime; cudaEventElapsedTime(&elapsedTime, start, stop); printf("Iterations = %3d\tTime : %.6f milli-seconds : \n", k, elapsedTime); cudaMemcpy(G.x, d_x, (m+n)*sizeof(double), cudaMemcpyDeviceToHost); /* printf("\n x = \n"); for(i=0;i<(m+n);i++) { printf("\n x[%d] = %.8f", i, G.x[i]); } */ float rsum, diff, err = 0.0; for (int i = 0; i < (m+n); i++) { rsum = 0.0; for (int j = G.rowIndex[i]; j < G.rowIndex[i+1]; j++) { rsum += G.Mat_val[j]*G.x[G.columns[j]]; } diff = fabs(rsum - G.b[i]); if (diff > err) { err = diff; } } cusparseDestroy(cusparseHandle); cublasDestroy(cublasHandle); /* free(I); free(J); free(val); free(x); free(rhs); */ cudaFree(d_col); cudaFree(d_row); cudaFree(d_val); cudaFree(d_x); cudaFree(d_r); cudaFree(d_p); cudaFree(d_Ax); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); /* printf("\n X is:\n"); for(i=0;i<(m+n);i++) { printf("\n X[%d] = %.4f", i, G.x[i]); } */ printf("Test Summary: Error amount = %f\n", err); exit((k <= max_iter) ? 0 : 1); /* culaSparseHandle handle; if (culaSparseCreate(&handle) != culaSparseNoError) { // this should only fail under extreme conditions std::cout << "fatal error: failed to create library handle!" << std::endl; exit(EXIT_FAILURE); } StatusChecker sc(handle); culaSparsePlan plan; sc = culaSparseCreatePlan(handle, &plan); culaSparseCsrOptions formatOpts; culaSparseCsrOptionsInit(handle, &formatOpts); //formatOpts.indexing=1; sc = culaSparseSetDcsrData(handle, plan, &formatOpts, m+n, G.nonzero, &G.Mat_val[0], &G.rowIndex[0], &G.columns[0], &G.x[0], &G.b[0]); printf("\n Fine till here"); printf("\n"); culaSparseConfig config; sc = culaSparseConfigInit(handle, &config); config.relativeTolerance = 1e-2; config.maxIterations = 100000; config.divergenceTolerance = 20; culaSparseResult result; /* sc = culaSparseSetHostPlatform(handle, plan, 0); // set cg solver sc = culaSparseSetCgSolver(handle, plan, 0); printf("\n Fine till here"); printf("\n"); // perform solve (cg + no preconditioner on host) culaSparseResult result; sc = culaSparseExecutePlan(handle, plan, &config, &result); sc = culaSparseGetResultString(handle, &result, buffer, bufsize); std::cout << buffer << std::endl; if (culaSparsePreinitializeCuda(handle) == culaSparseNoError) { // change to cuda accelerated platform sc = culaSparseSetCudaPlatform(handle, plan, 0); // perform solve (cg + ilu0 on cuda) culaSparseGmresOptions solverOpts; culaSparseStatus status = culaSparseGmresOptionsInit(handle, &solverOpts); solverOpts.restart = 20; sc = culaSparseSetCgSolver(handle, plan, 0); // sc = culaSparseSetJacobiPreconditioner(handle, plan, 0); // sc = culaSparseSetIlu0Preconditioner(handle, plan, 0); sc = culaSparseExecutePlan(handle, plan, &config, &result); sc = culaSparseGetResultString(handle, &result, buffer, bufsize); std::cout << buffer << std::endl; // change preconditioner to fainv // this avoids data transfer by using data cached by the plan // sc = culaSparseSetIlu0Preconditioner(handle, plan, 0); // perform solve (cg + fainv on cuda) // these timing results should indicate minimal overhead /* sc = culaSparseExecutePlan(handle, plan, &config, &result); sc = culaSparseGetResultString(handle, &result, buffer, bufsize); std::cout << buffer << std::endl; // change solver // this avoids preconditioner regeneration by using data cached by the plan sc = culaSparseSetBicgstabSolver(handle, plan, 0); // perform solve (bicgstab + fainv on cuda) // the timing results should indicate minimal overhead and preconditioner generation time sc = culaSparseExecutePlan(handle, plan, &config, &result); sc = culaSparseGetResultString(handle, &result, buffer, bufsize); std::cout << buffer << std::endl; sc = culaSparseSetGmresSolver(handle, plan, 0); // perform solve (bicgstab + fainv on cuda) // the timing results should indicate minimal overhead and preconditioner generation time sc = culaSparseExecutePlan(handle, plan, &config, &result); sc = culaSparseGetResultString(handle, &result, buffer, bufsize); std::cout << buffer << std::endl; } else { std::cout << "alert: no cuda capable gpu found" << std::endl; } // cleanup plan culaSparseDestroyPlan(plan); // cleanup handle culaSparseDestroy(handle); FILE* myWriteFile; myWriteFile=fopen("result.txt","w"); for (i = 0; i < n; i++) { fprintf(myWriteFile,"%1f\n",G.x[i]); // printf ("\n x [%d] = % f", i, x[i]); } fprintf(myWriteFile,".end\n"); fclose(myWriteFile); printf ("\n"); // time_st=dsecnd(); // solver(G.rowIndex,G.columns,G.Mat_val,G.b,G.x,m+n,G.nonzero); // time_end=dsecnd(); // time_avg = (time_end-time_st); // printf("Successfully Solved in : %.6f secs\n",time_avg); cerr<<endl; cerr<<"Fillup Graph "; // time_st=dsecnd(); G.fillup_graph(); // time_end=dsecnd(); // time_avg = (time_end-time_st); // cerr<<"Done "<<time_avg<<endl; //G.output_graph_stdout(); cerr<<"Matching KCL "; // time_st=dsecnd(); G.check_kcl(); // time_end=dsecnd(); // time_avg = (time_end-time_st); // cerr<<"Done "<<time_avg<<endl; /*for (int i=0;i<m+n;i++) { cout<<"M"<<i<<endl; for (int j=0;j<m+n;j++) cout<<" "<<j<<"#"<<M[i][j]<<endl; }*/ }
cublasStatus_t cublasXscal(int n, const double* alpha, double* x, int incx) { return cublasDscal(g_context->cublasHandle, n, alpha, x, incx); }
void scal(double alpha, Vector<double> &x) { cublasDscal(x.getSize(), alpha, x, x.inc()); }
int CORE_dtstrf_cublas(int M, int N, int IB, int NB, double *U, int LDU, double *A, int LDA, double *L, int LDL, int *IPIV, double *WORK, int LDWORK, int *INFO) { static double zzero = 0.0; static double mzone =-1.0; cublasStatus_t status; cudaError_t err; double alpha; int i, j, ii, sb; int im, ip; #if CONFIG_VERBOSE fprintf(stdout, "%s: M=%d N=%d IB=%d NB=%d U=%p LDU=%d A=%p LDA=%d L=%p LDL=%d IPIV=%p WORK=%p LDWORK=%d\n", __FUNCTION__, M, N, IB, NB, U, LDU, A, LDA, L, LDL, IPIV, WORK, LDWORK); fflush(stdout); #endif /* Check input arguments */ *INFO = 0; if (M < 0) { coreblas_error(1, "Illegal value of M"); return -1; } if (N < 0) { coreblas_error(2, "Illegal value of N"); return -2; } if (IB < 0) { coreblas_error(3, "Illegal value of IB"); return -3; } if ((LDU < max(1,NB)) && (NB > 0)) { coreblas_error(6, "Illegal value of LDU"); return -6; } if ((LDA < max(1,M)) && (M > 0)) { coreblas_error(8, "Illegal value of LDA"); return -8; } if ((LDL < max(1,IB)) && (IB > 0)) { coreblas_error(10, "Illegal value of LDL"); return -10; } /* Quick return */ if ((M == 0) || (N == 0) || (IB == 0)) return PLASMA_SUCCESS; /* Set L to 0 */ err = cudaMemset(L, 0, LDL*N*sizeof(double)); PLASMA_CUDA_ASSERT(err); double* dev_ptr = 0; err = cudaMalloc((void**)&dev_ptr, 2*sizeof(double)); PLASMA_CUDA_ASSERT(err); double* host_ptr; err = cudaMallocHost((void**)&host_ptr, 2*sizeof(double)); PLASMA_CUDA_ASSERT(err); int* piv = kaapi_memory_get_host_pointer_and_validate(IPIV); ip = 0; for (ii = 0; ii < N; ii += IB) { sb = min(N-ii, IB); for (i = 0; i < sb; i++) { status = cublasIdamax(kaapi_cuda_cublas_handle(), M, &A[LDA*(ii+i)], 1, &im ); PLASMA_CUBLAS_ASSERT(status); /* get im */ err = cudaStreamSynchronize(kaapi_cuda_kernel_stream()); PLASMA_CUDA_ASSERT(err); /* ajust index, CUBLAS is 1-based indexing */ im--; piv[ip] = ii+i+1; core_dtstrf_cmp(kaapi_cuda_kernel_stream(), &A[LDA*(ii+i)+im], &U[LDU*(ii+i)+ii+i], dev_ptr, host_ptr); err = cudaStreamSynchronize(kaapi_cuda_kernel_stream()); PLASMA_CUDA_ASSERT(err); if (host_ptr[0] == 1.0f) { /* * Swap behind. */ status = cublasDswap(kaapi_cuda_cublas_handle(), i, &L[LDL*ii+i], LDL, &WORK[im], LDWORK ); PLASMA_CUBLAS_ASSERT(status); /* * Swap ahead. */ status = cublasDswap(kaapi_cuda_cublas_handle(), sb-i, &U[LDU*(ii+i)+ii+i], LDU, &A[LDA*(ii+i)+im], LDA ); PLASMA_CUBLAS_ASSERT(status); /* * Set IPIV. */ piv[ip] = NB + im + 1; core_dtstrf_set_zero(kaapi_cuda_kernel_stream(), A, LDA, i, ii, im, zzero ); } core_dtstrf_cmp_zzero_and_get_alpha(kaapi_cuda_kernel_stream(), &A[LDA*(ii+i)+im], &U[LDU*(ii+i)+ii+i], zzero, dev_ptr, host_ptr); err = cudaStreamSynchronize(kaapi_cuda_kernel_stream()); PLASMA_CUDA_ASSERT(err); if ((*INFO == 0) && (host_ptr[0] == 1.0f)) { *INFO = ii+i+1; } // alpha = ((double)1. / U[LDU*(ii+i)+ii+i]); alpha = host_ptr[1]; status = cublasDscal(kaapi_cuda_cublas_handle(), M, &alpha, &A[LDA*(ii+i)], 1 ); PLASMA_CUBLAS_ASSERT(status); status = cublasDcopy(kaapi_cuda_cublas_handle(), M, &A[LDA*(ii+i)], 1, &WORK[LDWORK*i], 1 ); PLASMA_CUBLAS_ASSERT(status); status = cublasDger(kaapi_cuda_cublas_handle(), M, sb-i-1, &mzone, &A[LDA*(ii+i)], 1, &U[LDU*(ii+i+1)+ii+i], LDU, &A[LDA*(ii+i+1)], LDA ); PLASMA_CUBLAS_ASSERT(status); ip = ip+1; } /* * Apply the subpanel to the rest of the panel. */ if(ii+i < N) { for(j = ii; j < ii+sb; j++) { if (piv[j] <= NB) { piv[j] = piv[j] - ii; } } CORE_dssssm_cublas_v2( NB, N-(ii+sb), M, N-(ii+sb), sb, sb, &U[LDU*(ii+sb)+ii], LDU, &A[LDA*(ii+sb)], LDA, &L[LDL*ii], LDL, WORK, LDWORK, &piv[ii] ); err = cudaStreamSynchronize(kaapi_cuda_kernel_stream()); PLASMA_CUDA_ASSERT(err); for(j = ii; j < ii+sb; j++) { if (piv[j] <= NB) { piv[j] = piv[j] + ii; } } } } cudaFreeHost(host_ptr); cudaFree(dev_ptr); return PLASMA_SUCCESS; }