/*
 * Download a symmetric band matrix, stored tile-by-tile on the GPU, into the
 * host-side storage h_matrix.
 *
 * The matrix is split into ceil(order / block_size) block rows. For tile i
 * (cur_bs rows by tile_len[i] columns, column-major), the data is copied from
 * device memory into a host staging buffer via cublasGetMatrix, then scattered
 * into h_matrix by set_host_matrix_tile. The staging buffer is allocated once,
 * sized (bs + hb) * bs doubles — presumably the widest possible tile is
 * bs + half_bandwith columns (TODO confirm against tile_len construction).
 *
 * NOTE(review): `handle` is accepted but never used (cublasGetMatrix does not
 * take a handle); the malloc result is not checked — confirm acceptable.
 */
void copyBandMatrixFromDevice(gpu_symm_band_matrix gpu_matrix, double* h_matrix, cublasHandle_t handle) {
    int num_tiles;
    int bs = gpu_matrix.block_size;
    int order = gpu_matrix.order;
    int hb = gpu_matrix.half_bandwith;
    int i;
    int cur_row;
    int cur_bs;
    double* temp_tile;
    num_tiles = (order + bs - 1) / bs;   /* ceil-div: number of block rows */
    temp_tile = (double*) malloc( (bs + hb) * bs *sizeof(double) );
    for (i = 0, cur_row = 0; i < num_tiles; i++, cur_row += bs) {
        cur_bs = bs;
        /* last block row may be short when bs does not divide order */
        if (cur_row + cur_bs > order) {
            cur_bs = order - cur_row;
        }
        /* device tile is packed with leading dim cur_bs; staging buffer keeps
           leading dim bs (>= cur_bs, as cublasGetMatrix requires) */
        checkCublasErrors(cublasGetMatrix(cur_bs, gpu_matrix.tile_len[i], sizeof(double), gpu_matrix.gpu_matrix_tiles[i], cur_bs, temp_tile, bs));
        set_host_matrix_tile( cur_bs, cur_row, hb, order, h_matrix, temp_tile, bs, gpu_matrix.tile_len[i]);
    }
    free(temp_tile);
}
// Download the device-resident matrix into the caller-provided host buffer
// and hand that buffer back for chaining.
// NOTE(review): the cublasGetMatrix status is discarded, so a failed
// transfer is silent — confirm callers expect best-effort semantics.
void* raw( pointer host ) const {
    const int ld = height_;   // both source and destination are tightly packed
    cublasGetMatrix( height_, width_, sizeof(value_type),
                     gpu_ptr_, ld,
                     host, ld );
    return host;
}
/**
 * Download the matrix contents from the device buffer into the host buffer.
 *
 * Transfers rows x cols cuDoubleComplex elements (both sides tightly packed,
 * leading dimension = rows). Aborts via qFatal() if the cuBLAS transfer
 * fails; on success clears the device-dirty flag.
 */
void CuBlasMatrix::updateHostData()
{
    cublasStatus_t status = cublasGetMatrix(rows, cols, sizeof(cuDoubleComplex),
                                            deviceRaw, rows,
                                            hostRaw, rows);
    if (status != CUBLAS_STATUS_SUCCESS) {
        // Fixed typo in fatal message ("faild" -> "failed").
        qFatal("Data download failed");
    }
    deviceDataChanged = false;
}
/*
 * Download the values of matrix |w| from device memory into a freshly
 * allocated host buffer.
 *
 * Returns a malloc'd array of w->rows * w->cols floats that the caller must
 * free, or NULL if host allocation or the cuBLAS transfer fails (the buffer
 * is released before returning on transfer failure).
 */
float* micronn_matrix_get_vals(micronn_matrix* w)
{
    cublasStatus_t stat;
    /* Cast the malloc result: required when this file is compiled as C++
       (.cu), harmless in C. Also guard against allocation failure instead of
       handing a NULL buffer to cublasGetMatrix. */
    float* vals = (float*) malloc(sizeof(float) * w->rows * w->cols);
    if (vals == NULL) {
        fprintf(stderr, "data download failed\n");
        return NULL;
    }
    stat = cublasGetMatrix(w->rows, w->cols, sizeof(float), w->devPtrvals, w->rows, vals, w->rows);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "data download failed\n");
        free(vals);
        return NULL;
    }
    return vals;
}
// Blocking device-to-host matrix copy: thin MAGMA wrapper around
// cublasGetMatrix, routing the status through the library's check_error().
void magma_getmatrix(
    magma_int_t m, magma_int_t n, size_t elemSize,
    void const* dA_src, magma_int_t lda,
    void*       hB_dst, magma_int_t ldb )
{
    cublasStatus_t rc = cublasGetMatrix( m, n, elemSize, dA_src, lda, hB_dst, ldb );
    check_error( rc );
}
// Download this GPU-resident matrix into |target|, which must already have
// matching dimensions (checked by assert). Throws std::runtime_error when
// the cuBLAS transfer fails.
void GPUMatrix::getMatrix(Matrix* target)
{
    assert(target->_rows == _rows && target->_cols == _cols);
    const cublasStatus_t rc = cublasGetMatrix(
        _rows, _cols, sizeof(*target->_data),
        _data, _rows,
        target->_data, target->_rows);
    if (rc != CUBLAS_STATUS_SUCCESS) {
        // TODO: define exception class with cublas return codes?
        throw std::runtime_error("CUBLAS error in GetMatrix");
    }
}
// Internal variant of magma_getmatrix: identical blocking device-to-host
// copy, but reports errors with the caller's function/file/line so the
// failure is attributed to the user-facing call site.
void magma_getmatrix_internal(
    magma_int_t m, magma_int_t n, magma_int_t elemSize,
    void const* dA_src, magma_int_t lda,
    void*       hB_dst, magma_int_t ldb,
    const char* func, const char* file, int line )
{
    cublasStatus_t rc = cublasGetMatrix( m, n, elemSize, dA_src, lda, hB_dst, ldb );
    check_xerror( rc, func, file, line );
}
// Single-precision typed wrapper: blocking device-to-host copy of an m x n
// float matrix, with error reporting attributed to the caller's
// function/file/line.
void magma_sgetmatrix_internal(
    magma_int_t m, magma_int_t n,
    float const* dA_src, magma_int_t lda,
    float*       hB_dst, magma_int_t ldb,
    const char* func, const char* file, int line )
{
    cublasStatus_t rc = cublasGetMatrix( m, n, sizeof(float), dA_src, lda, hB_dst, ldb );
    check_xerror( rc, func, file, line );
}
// -------------------- extern "C" void magma_zgetmatrix_internal( magma_int_t m, magma_int_t n, magmaDoubleComplex_const_ptr dA_src, magma_int_t lda, magmaDoubleComplex* hB_dst, magma_int_t ldb, const char* func, const char* file, int line ) { cublasStatus_t status; status = cublasGetMatrix( m, n, sizeof(magmaDoubleComplex), dA_src, lda, hB_dst, ldb ); check_xerror( status, func, file, line ); }
/*
 * Download the square max_dim x max_dim matrix from |device| into the
 * scratch matrix |work|, then store its transpose into |host|.
 * |work| must be exactly max_dim x max_dim.
 *
 * Returns CUBLAS_STATUS_SUCCESS on success, or CUBLAS_STATUS_INVALID_VALUE
 * on a size mismatch or a failed transfer.
 */
int double_copyMatrixGPU2Host_Transpose(PGM_Matriz_Double *host, PGM_Matriz_GPU* device, PGM_Matriz_Double *work){
    /* Guard clause: scratch matrix must match the device dimensions. */
    if (work->n_colunas != device->max_dim || work->n_linhas != device->max_dim)
        return CUBLAS_STATUS_INVALID_VALUE;

    /* Device -> host download of the full square matrix. */
    if (cublasGetMatrix(device->max_dim, device->max_dim, sizeof(double),
                        device->valor, device->max_dim,
                        work->valor, work->n_colunas) != CUBLAS_STATUS_SUCCESS)
        return CUBLAS_STATUS_INVALID_VALUE;

    /* Transpose into the destination: host(i,j) = work(j,i). */
    int lin, col;
    for (lin = 0; lin < host->n_linhas; lin++)
        for (col = 0; col < host->n_colunas; col++)
            PGM_ELEM_MATRIZ(host, lin, col) = PGM_ELEM_MATRIZ(work, col, lin);
    return CUBLAS_STATUS_SUCCESS;
}
/*
 * R gateway: download a double-precision device matrix (described by the R
 * object mList) into a freshly allocated R numeric matrix and return it.
 *
 * rld is the device leading dimension; the destination is tightly packed
 * (leading dimension = rows), so a sub-matrix of a larger device allocation
 * can be extracted when ld > rows.
 *
 * NOTE(review): the cublasGetMatrix return value is discarded and
 * checkCublasError("d_getMatrix") is relied on instead — presumably it
 * inspects a global/last-error state; confirm.
 */
SEXP d_getMatrix(SEXP mList, SEXP rld)
{
    int rows, cols, ld = asInteger(rld);
    double * dPtr;
    unpackMatrix(mList, &rows, &cols, &dPtr);
    SEXP out, dim;
    /* PROTECT both allocations from R's GC until attributes are attached. */
    PROTECT(out = allocVector(REALSXP, rows * cols));
    cublasGetMatrix(rows, cols, sizeof(double), dPtr, ld, REAL(out), rows);
    checkCublasError("d_getMatrix");
    PROTECT(dim = allocVector(INTSXP, 2));
    INTEGER(dim)[0] = rows;
    INTEGER(dim)[1] = cols;
    setAttrib(out, R_DimSymbol, dim);
    UNPROTECT(2);   /* balances the two PROTECT calls above */
    return out;
}
/*
 * Convenience wrapper: upload A, B, C to the device, run the supplied GEMM
 * functor (e.g. a cublas*gemm call) computing C = alpha*op(A)*op(B) + beta*C,
 * and download the result back into C.
 *
 * NOTE(review): the uploads/downloads hard-code leading dimensions m, k and m
 * while the gemm itself is invoked with the caller's lda/ldb/ldc, and the
 * dimensions assume non-transposed operands — this only lines up when
 * lda == m, ldb == k, ldc == m and transa/transb describe layouts compatible
 * with m x k / k x n host buffers. TODO confirm callers satisfy this.
 * NOTE(review): cudaMalloc and cublasSet/GetMatrix return codes are ignored.
 */
void cuda_gemm(const cublasHandle_t handle, const cublasOperation_t transa, const cublasOperation_t transb, int m, int n, int k, const T alpha, const T A[], int lda, const T B[], int ldb, const T beta, T C[], int ldc, GEMM gemm)
{
    T *d_A = NULL;
    cudaMalloc((void**)&d_A, m*k*sizeof(T));
    cublasSetMatrix(m, k, sizeof(T), A, m, d_A, m);
    T *d_B = NULL;
    cudaMalloc((void**)&d_B, k*n*sizeof(T));
    cublasSetMatrix(k, n, sizeof(T), B, k, d_B, k);
    T *d_C = NULL;
    cudaMalloc((void**)&d_C, m*n*sizeof(T));
    /* C is uploaded too because beta may be non-zero (accumulation). */
    cublasSetMatrix(m, n, sizeof(T), C, m, d_C, m);
    gemm(handle, transa, transb, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, ldc);
    /* blocking download of the result; also synchronizes with the gemm */
    cublasGetMatrix(m, n, sizeof(T), d_C, m, C, m);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
}
/*
 * Compute c = op_a(a) * op_b(b) + c on the GPU with cublasDgemm
 * (alpha = beta = 1, so the existing contents of c are accumulated into).
 *
 * NOTE(review): all cudaMalloc/cuBLAS status codes are assigned but never
 * checked; a cuBLAS handle is created and destroyed on every call (slow in
 * hot loops); and the upload dimensions/leading dims assume a is m x k and
 * b is k x n in column-major order even when op_a/op_b request a transpose —
 * TODO confirm callers only pass CUBLAS_OP_N or pre-shaped operands.
 */
void mat_prod_mat(const double* a, cublasOperation_t op_a, const double* b, cublasOperation_t op_b, double*c, int m, int n, int k){
    cudaError_t cudaStat ; // cudaMalloc status
    cublasStatus_t stat ; // CUBLAS functions status
    cublasHandle_t handle ; // CUBLAS context
    // on the device
    double* d_a; // d_a - a on the device
    double* d_b; // d_b - b on the device
    double* d_c; // d_c - c on the device
    cudaStat = cudaMalloc((void **)&d_a ,m*k*sizeof(*a)); // device memory alloc for a
    cudaStat = cudaMalloc((void **)&d_b ,k*n*sizeof(*b)); // device memory alloc for b
    cudaStat = cudaMalloc((void **)&d_c ,m*n*sizeof(*c)); // device memory alloc for c
    stat = cublasCreate(&handle); // initialize CUBLAS context
    // copy matrices from the host to the device
    stat = cublasSetMatrix (m,k, sizeof(*a) ,a,m,d_a ,m); //a -> d_a
    stat = cublasSetMatrix (k,n, sizeof(*b) ,b,k,d_b ,k); //b -> d_b
    stat = cublasSetMatrix (m,n, sizeof(*c) ,c,m,d_c ,m); //c -> d_c
    double al=1.0;
    double bet=1.0;
    // matrix - matrix multiplication : d_c = al*d_a *d_b + bet *d_c
    // d_a -mxk matrix , d_b -kxn matrix , d_c -mxn matrix ;
    // al ,bet -scalars
    stat=cublasDgemm(handle,op_a,op_b,m,n,k,&al,d_a,m,d_b,k,&bet,d_c,m);
    stat = cublasGetMatrix (m, n, sizeof(*c) ,d_c ,m,c,m); // cp d_c - >c
    cudaFree (d_a ); // free device memory
    cudaFree (d_b ); // free device memory
    cudaFree (d_c ); // free device memory
    cublasDestroy ( handle ); // destroy CUBLAS context
}
/*
 * Benchmark driver for gpu_blas_mmul: builds a 10x10 matrix a (column-major
 * via IDX2C, values 1..100) and a matrix b with 1.0 on the diagonal and 2.0
 * elsewhere, uploads both with cublasSetMatrix, times
 * gpu_blas_mmul(d_a, d_b, d_c, m, m, m) with CycleTimer, downloads d_c into
 * c and prints it.
 *
 * NOTE(review): cudaStat/stat return codes are never checked; the
 * "( float )" cast on a double store looks like a leftover from a
 * single-precision version; the printed banner still says "Strsm" although
 * the kernel run is a matrix multiply — confirm intended.
 */
int main(){
    int m = 10;
    int n = m;
    cudaError_t cudaStat ; // cudaMalloc status
    cublasStatus_t stat ; // CUBLAS functions status
    cublasHandle_t handle ; // CUBLAS context
    int i,j; // i-row index , j-col. index
    double * a; // mxm matrix a on the host
    double * b; // mxm matrix b on the host
    double * c; // mxm matrix c on the host
    a=( double *) malloc (m*m* sizeof ( double )); // host memory for a
    b=( double *) malloc (m*m* sizeof ( double )); // host memory for b
    c=( double *) malloc (m*m* sizeof ( double )); // host memory for c
    int ind =1;
    // a: dense fill with 1,2,3,... in column-major order
    for(j=0;j<m;j ++){
        for(i=0;i<m;i ++){
            // if(i >=j){
            a[ IDX2C(i,j,m)]=( float )ind ++;
            // }
        }
    }
    printf (" lower triangle of a:\n");
    for (i=0;i<m;i ++){
        for (j=0;j<m;j ++){
            // if(i >=j)
            printf (" %5.0f",a[ IDX2C(i,j,m)]);
        }
        printf ("\n");
    }
    ind =11;
    // b: 1.0 on the diagonal, 2.0 everywhere else
    for(j=0;j<n;j ++){
        for(i=0;i<m;i ++){
            if(i == j) b[ IDX2C(i,j,m)] = 1.0;
            else b[ IDX2C(i, j, m)] = 2.0; //ind ++;
        }
    }
    printf ("b:\n");
    for (i=0;i<m;i ++){
        for (j=0;j<n;j ++){
            printf (" %5.0f",b[IDX2C(i,j,m)]); // print b row by row
        }
        printf ("\n");
    }
    double * d_a; // d_a - a on the device
    double * d_b; // d_b - b on the device
    double * d_c; // d_c - c on the device
    cudaStat = cudaMalloc (( void **)& d_a ,m*m* sizeof (*a)); // device memory alloc for a
    cudaStat = cudaMalloc (( void **)& d_b ,m*m* sizeof (*b)); // device memory alloc for b
    cudaStat = cudaMalloc (( void **)& d_c ,m*m* sizeof (*c)); // device memory alloc for c
    stat = cublasCreate (& handle ); // initialize CUBLAS context
    stat = cublasSetMatrix (m,m, sizeof (*a) ,a,m,d_a ,m); //a -> d_a
    stat = cublasSetMatrix (m,m, sizeof (*b) ,b,m,d_b ,m); //b -> d_b
    double startime = CycleTimer::currentSeconds();
    gpu_blas_mmul(d_a, d_b, d_c, m, m, m);
    double endtime = CycleTimer::currentSeconds();
    stat = cublasGetMatrix (m,n, sizeof (*c) ,d_c ,m,c,m); // d_c -> c
    printf (" solution x from Strsm :\n");
    for(i=0;i<m;i ++){
        for(j=0;j<n;j ++){
            printf (" %11.5f",c[IDX2C(i,j,m )]);
        }
        printf ("\n");
    }
    cudaFree (d_a ); // free device memory
    cudaFree (d_b ); // free device memory
    cudaFree (d_c ); // free device memory
    cublasDestroy ( handle ); // destroy CUBLAS context
    free (a); // free host memory
    free (b); // free host memory
    free (c); // free host memory
    printf("Time taken: %lf\n", endtime - startime);
    return EXIT_SUCCESS ;
}
/*
 * Self-checking cuBLAS DGEMM example.
 *
 * Fills A (MATRIX_N x MATRIX_P) and B (MATRIX_P x MATRIX_M) with random
 * values in [0,10), computes C = A*B on the GPU with cublasDgemm, downloads
 * C with cublasGetMatrix, and verifies every element against a CPU triple
 * loop to within 1e-5. Exits with EXIT_FAILURE on any CUDA/cuBLAS error.
 *
 * NOTE(review): early-exit error paths leak previously allocated buffers and
 * the handle — acceptable here since the process terminates immediately.
 */
int main( int argc, char **argv )
{
    double *A, *B, *C;
    double *cu_A, *cu_B, *cu_C;
    cudaError_t cuError;
    cublasStatus_t cuStatus;
    cublasHandle_t cuHandle;
    // seed rand()
    srand(time(NULL));
    // allocate memory on CPU
    A = (double*)malloc(sizeof(double)*MATRIX_N*MATRIX_P);
    B = (double*)malloc(sizeof(double)*MATRIX_P*MATRIX_M);
    C = (double*)malloc(sizeof(double)*MATRIX_N*MATRIX_M);
    if( !A || !B || !C ) {
        perror("Can't allocate CPU matrices");
        exit(EXIT_FAILURE);
    }
    // generate matrices
    for( int i = 0; i < MATRIX_N*MATRIX_P; i++ )
        A[i] = 10.0*((double)rand())/RAND_MAX;
    for( int i = 0; i < MATRIX_P*MATRIX_M; i++ )
        B[i] = 10.0*((double)rand())/RAND_MAX;
    // allocate memory on GPU
    cuError = cudaMalloc( &cu_A, sizeof(double)*MATRIX_N*MATRIX_P );
    if( cuError != cudaSuccess ) {
        fprintf(stderr, "Can't allocate GPU matrices\n");
        exit(EXIT_FAILURE);
    }
    cuError = cudaMalloc( &cu_B, sizeof(double)*MATRIX_P*MATRIX_M );
    if( cuError != cudaSuccess ) {
        fprintf(stderr, "Can't allocate GPU matrices\n");
        exit(EXIT_FAILURE);
    }
    cuError = cudaMalloc( &cu_C, sizeof(double)*MATRIX_N*MATRIX_M );
    if( cuError != cudaSuccess ) {
        fprintf(stderr, "Can't allocate GPU matrices\n");
        exit(EXIT_FAILURE);
    }
    // setup cuBlas
    cuStatus = cublasCreate( &cuHandle );
    if( cuStatus != CUBLAS_STATUS_SUCCESS ) {
        fprintf(stderr, "Error initializing cuBlas\n");
        exit(EXIT_FAILURE);
    }
    // setup matrices
    cuStatus = cublasSetMatrix( MATRIX_N, MATRIX_P, sizeof(double), A, MATRIX_N, cu_A, MATRIX_N );
    if( cuStatus != CUBLAS_STATUS_SUCCESS ) {
        fprintf(stderr, "Error transferring matrix A\n");
        exit(EXIT_FAILURE);
    }
    cuStatus = cublasSetMatrix( MATRIX_P, MATRIX_M, sizeof(double), B, MATRIX_P, cu_B, MATRIX_P );
    if( cuStatus != CUBLAS_STATUS_SUCCESS ) {
        fprintf(stderr, "Error transferring matrix B\n");
        exit(EXIT_FAILURE);
    }
    // multiply
    double one = 1.0;
    double zero = 0.0;
    cuStatus = cublasDgemm( cuHandle, CUBLAS_OP_N, CUBLAS_OP_N, MATRIX_N, MATRIX_M, MATRIX_P, &one, cu_A, MATRIX_N, cu_B, MATRIX_P, &zero, cu_C, MATRIX_N );
    if( cuStatus != CUBLAS_STATUS_SUCCESS ) {
        fprintf(stderr, "Error executing matrix mult\n");
        exit(EXIT_FAILURE);
    }
    // get results
    cuStatus = cublasGetMatrix( MATRIX_N, MATRIX_M, sizeof(double), cu_C, MATRIX_N, C, MATRIX_N );
    if( cuStatus != CUBLAS_STATUS_SUCCESS ) {
        fprintf(stderr, "Error transferring results\n");
        exit(EXIT_FAILURE);
    }
    // check results against a CPU reference with an absolute tolerance
    bool good = true;
    for( int i = 0; i < MATRIX_N; i++ ) {
        for( int j = 0; j < MATRIX_M; j++ ) {
            double sum = 0.0;
            for( int k = 0; k < MATRIX_P; k++ ) {
                sum += A[IDX2C(i, k, MATRIX_N)]*B[IDX2C(k, j, MATRIX_P)];
            }
            // check
            if( fabs(sum - C[IDX2C(i,j,MATRIX_N)]) > 0.00001 ) {
                good = false;
                printf("(%i, %i) sum = %f\tcu_C = %f\tMISMATCH\n", i, j, sum, C[IDX2C(i,j,MATRIX_N)]);
            }
        }
    }
    if( good ) printf("Results Match\n");
    else printf("Results DO NOT Match\n");
    // cleanup
    free( A );
    free( B );
    free( C );
    cudaFree( cu_A );
    cudaFree( cu_B );
    cudaFree( cu_C );
    cublasDestroy( cuHandle );
    return 0;
}
/*
 * Copy a distributed (ScaLAPACK-style) single-precision submatrix between
 * device and host storage:
 *   B( ib:(ib+m-1), jb:(jb+n-1) ) <- A( ia:(ia+m-1), ja:(ja+n-1) )
 * where is_device_to_host selects the transfer direction (A on device when
 * nonzero, host-to-device otherwise).
 *
 * The function only supports perfectly aligned source and destination
 * distributions (same context, block sizes, process owners, block offsets and
 * local extents) — this is asserted rather than handled, so no inter-process
 * communication is needed and the copy reduces to one local
 * cublasGet/SetMatrix (or the async variants under USE_CUBLASV2).
 *
 * NOTE(review): many declared locals (T, descT, notrans, alpha/beta, bufsize,
 * ...) are never used here — presumably leftovers from a more general
 * communicating version; confirm before removing.
 */
void Cpsgecopy_general_async(int m, int n, void *A, int ia, int ja, int *descA,
                             void *B, int ib, int jb, int *descB,
                             int is_device_to_host)
{
/* Local-index address helpers into the (column-major) local storage. */
#define dA(i,j) (((float*)A) + IDX2F(i,j,descA[LLD_]))
#define dT(i,j) (((float *)T) + IDX2F(i,j,descT[LLD_]))
#define dB(i,j) (((float *)B) + IDX2F(i,j,descB[LLD_]))
    /* perform copy B( ib:(ib+m-1), jb:(jb+n-1)) <- A( ia:(ia+m-1),ja:(ja+n-1)) */
    const int use_MallocHost = FALSE;
    const int use_igsum2d = FALSE;
    cublasStatus cu_status;
    cudaError_t cuda_status;
    char notrans[] = "NoTrans";
    int descT[DLEN_];
    int ldA,ldB,ldT;
    int is_same_context, is_same_mb, is_same_nb;
    int is_same_p, is_same_q;
    int is_same_offset;
    int is_same_Locp, is_same_Locq;
    int is_aligned;
    int lrA1,lcA1, lrA2,lcA2;
    int lrT1,lcT1, lrT2,lcT2;
    int lrB1,lcB1, lrB2,lcB2;
    int rsrc,csrc;
    int rsrcA1,csrcA1, rsrcA2, csrcA2;
    int rsrcB1,csrcB1, rsrcB2, csrcB2;
    int iia,jja, iib,jjb;
    int icontxt, nprow,npcol, myprow,mypcol;
    int LocpA,LocqA, LocpB,LocqB, LocpT,LocqT;
    int mm,nn, lmm,lnn;
    size_t nbytes;
    float one_[REAL_PART+IMAG_PART+1];
    float *one = &(one_[0]);
    float zero_[REAL_PART+IMAG_PART+1];
    float *zero = &(zero_[0]);
    float alpha_[REAL_PART+IMAG_PART+1];
    float *alpha = &(alpha_[0]);
    float beta_[REAL_PART+IMAG_PART+1];
    float *beta = &(beta_[0]);
    int isize, isizeT;
    float *T = 0;
    int elemSize = sizeof(float);
    int nnb, jstart,jend,jsize;
    int is_ok;
    int nmax;
    const int bufsize = 1024*1024;
    const int use_simple = FALSE;;
    one[REAL_PART] = 1.0;
    one[IMAG_PART] = 0.0;
    zero[REAL_PART] = 0.0;
    zero[IMAG_PART] = 0.0;
    /* empty copy: nothing to do */
    if ((m <= 0) || (n <= 0)) {
        return;
    };
    T = 0;
    ldA = descA[LLD_];
    ldB = descB[LLD_];
    icontxt = descA[CTXT_];
    Cblacs_gridinfo( icontxt, &nprow,&npcol, &myprow, &mypcol);
    assert( nprow >= 1);
    assert( npcol >= 1);
    assert( (0 <= myprow) && (myprow < nprow));
    assert( (0 <= mypcol) && (mypcol < npcol));
    /* diagnose out-of-range requests before the hard asserts below */
    is_ok = (1 <= ia) && (ia + m-1 <= descA[M_]);
    if (!is_ok) {
        printf("Cpsgecopy (%d,%d) :ia %d m %d descA[M_] %d \n",
               myprow,mypcol, ia, m, descA[M_] );
        printf("Cpsgecopy (%d,%d) :ja %d n %d descA[N_] %d \n",
               myprow,mypcol, ja, n, descA[N_] );
        printf("Cpsgecopy (%d,%d) :ib %d jb %d descB[M_] %d descB[N_] %d\n",
               myprow,mypcol, ib, jb, descB[M_], descB[N_] );
    };
    assert( (1 <= ia) && (ia + m-1 <= descA[M_]));
    assert( (1 <= ja) && (ja + n-1 <= descA[N_]));
    assert( (1 <= ib) && (ib + m-1 <= descB[M_]));
    assert( (1 <= jb) && (jb + n-1 <= descB[N_]));
    /* check that A and B are identically distributed */
    is_same_context = (descA[CTXT_] == descB[CTXT_]);
    is_same_mb = (descA[MB_] == descB[MB_]);
    is_same_nb = (descA[NB_] == descB[NB_]);
    is_same_p = (Cindxg2p(ia,descA[MB_], myprow, descA[RSRC_],nprow) ==
                 Cindxg2p(ib,descB[MB_], myprow, descB[RSRC_],nprow) );
    is_same_q = (Cindxg2p(ja,descA[NB_], mypcol, descA[CSRC_],npcol) ==
                 Cindxg2p(jb,descB[NB_], mypcol, descB[CSRC_],npcol) );
    is_same_offset = (MOD(ia,descA[MB_]) == MOD(ib,descB[MB_])) &&
                     (MOD(ja,descA[NB_]) == MOD(jb,descB[NB_]));
    local_extent( m,n, ia,ja,descA, &LocpA,&LocqA, &lrA1,&lcA1, &lrA2,&lcA2 );
    local_extent( m,n, ib,jb,descB, &LocpB,&LocqB,&lrB1,&lcB1, &lrB2,&lcB2 );
    /*
    if ((LocpA >= 1) || (LocpB >= 1)) { is_same_Locp = (LocpA == LocpB); };
    if ((LocqA >= 1) || (LocqB >= 1)) { is_same_Locq = (LocqA == LocqB); };
    */
    is_same_Locq = (LocqA == LocqB);
    is_same_Locp = (LocpA == LocpB);
    is_aligned = is_same_context && is_same_mb && is_same_nb &&
                 is_same_p && is_same_q && is_same_offset &&
                 is_same_Locp && is_same_Locq;
    assert( is_same_q );
    assert( is_same_p );
    assert( is_same_offset );
    assert( is_same_Locp );
    assert( is_same_Locq );
    assert( is_aligned );
    /* no communication required copy from device to host */
    ldA = descA[LLD_];
    ldB = descB[LLD_];
    mm = LocpA;
    nn = LocqA;
    if (is_device_to_host) {
        /*
         * transfer from device to host
         */
        if ( (mm >= 1) && (nn >= 1) ) {
#ifdef USE_CUBLASV2
            {
                cublasStatus_t istatus;
                istatus = cublasGetMatrixAsync(mm, nn, elemSize,
                                               (void *) dA(lrA1,lcA1), ldA,
                                               (void *) dB(lrB1,lcB1), ldB,
                                               cublas_get_stream() );
                assert( istatus == CUBLAS_STATUS_SUCCESS );
            }
#else
            cu_status = cublasGetMatrix(mm,nn, elemSize,
                                        (void *) dA(lrA1,lcA1), ldA,
                                        (void *) dB(lrB1,lcB1),ldB );
            CHKERR(cu_status);
#endif
        };
    }
    else {
        /*
         * transfer from host to device
         */
        if ( (mm >= 1) && (nn >= 1) ) {
#ifdef USE_CUBLASV2
            {
                cublasStatus_t istatus;
                istatus = cublasSetMatrixAsync(mm,nn,elemSize,
                                               (void *) dA(lrA1,lcA1), ldA,
                                               (void *) dB(lrB1,lcB1),ldB,
                                               cublas_get_stream() );
                assert( istatus == CUBLAS_STATUS_SUCCESS );
            }
#else
            cu_status = cublasSetMatrix(mm,nn,elemSize,
                                        (void *) dA(lrA1,lcA1), ldA,
                                        (void *) dB(lrB1,lcB1),ldB );
            CHKERR(cu_status);
#endif
        };
    };
    return;
}
/*
 * Swap two distributed single-complex vectors that both live in GPU device
 * memory: download each local piece to a host staging buffer with
 * cublasGetMatrix, perform the distributed swap with scalapack_pcswap (which
 * handles inter-process communication), then upload the swapped pieces back
 * with cublasSetMatrix.
 *
 * incA/incB select row (inc != 1) vs column (inc == 1) vectors, mirroring
 * the P?SWAP convention.
 *
 * NOTE(review): the staging buffers are declared float* but sized/filled in
 * cuComplex elements (elemSize = sizeof(cuComplex)) — byte-wise this works,
 * but the types are misleading; confirm before changing.
 */
void Cpcswap_gpu( int n, cuComplex *A, int ia,int ja,int *descA, int incA,
                  cuComplex *B, int ib,int jb,int *descB, int incB )
{
    /* perform pcswap operation when both distributed arrays A and B are in device memory */
    /*
     * allocate temporary space on host
     * then use pcswap for communication
     */
    const int use_MallocHost = FALSE;
    cublasStatus cu_status;
    size_t nbytes;
    int elemSize = sizeof( cuComplex );
    float *Atmp = 0;
    float *Btmp = 0;
    int descAtmp[DLEN_];
    int descBtmp[DLEN_];
    int ldA, ldB, ldAtmp, ldBtmp;
    int nprow,npcol,myprow,mypcol;
    int Locp, Locq, lrindx, lcindx, mm,nn;
    int LocpA, LocqA, lrindxA, lcindxA;
    int LocpB, LocqB, lrindxB, lcindxB;
    int isizeA, isizeB, rsrc, csrc;
    int iia,jja, iib, jjb;
    int incAtmp, incBtmp;
    int lrA1,lcA1, lrA2,lcA2;
    int lrB1,lcB1, lrB2,lcB2;
    Cblacs_gridinfo( descA[CTXT_], &nprow, &npcol, &myprow, &mypcol );
    /*
     * allocate storage for vector from A
     */
    if (incA == 1) {
        /*
         * This is a column vector
         */
        mm = n; nn = 1;
    }
    else {
        /*
         * This is a row vector
         */
        mm = 1; nn = n;
    };
    setup_desc( mm,nn, ia,ja, descA, &isizeA, descAtmp );
    nbytes = elemSize;
    nbytes *= isizeA;
    if (use_MallocHost) {
        Atmp = (float *) MallocHost( nbytes );
    }
    else {
        Atmp = (float *) malloc( nbytes );
    };
    assert( Atmp != 0 );
    /*
     * copy vector from A
     */
    PROFSTART("swap:GetMatrix");
    local_extent( mm,nn,ia,ja,descA, &LocpA, &LocqA, &lrA1,&lcA1, &lrA2,&lcA2 );
    lrindxA = lrA1;
    lcindxA = lcA1;
    ldA = descA[LLD_];
    ldAtmp = descAtmp[LLD_];
    if ( (LocpA >= 1) && (LocqA >= 1)) {
        /*
         * copy from GPU device to host CPU
         */
        cu_status = cublasGetMatrix( LocpA,LocqA, elemSize,
                                     dA(lrindxA,lcindxA), ldA, Atmp, ldAtmp );
        CHKERR(cu_status);
    };
    /*
     * allocate storage for vector from B
     */
    Cblacs_gridinfo( descB[CTXT_], &nprow, &npcol, &myprow, &mypcol );
    if (incB == 1) {
        /*
         * This is a column vector
         */
        mm = n; nn = 1;
    }
    else {
        /*
         * This is a row vector
         */
        mm = 1; nn = n;
    };
    setup_desc( mm,nn, ib,jb,descB, &isizeB, descBtmp );
    ldBtmp = descBtmp[LLD_];
    ldB = descB[LLD_];
    nbytes = elemSize;
    nbytes *= isizeB;
    if (use_MallocHost) {
        Btmp = (float *) MallocHost( nbytes );
    }
    else {
        Btmp = (float *) malloc( nbytes );
    };
    assert( Btmp != 0 );
    /*
     * copy vector from B
     */
    local_extent( mm,nn,ib,jb,descB, &LocpB, &LocqB, &lrB1,&lcB1, &lrB2,&lcB2 );
    lrindxB = lrB1;
    lcindxB = lcB1;
    ldB = descB[LLD_];
    ldBtmp = descBtmp[LLD_];
    if ((LocpB >= 1) && (LocqB >= 1)) {
        /*
         * Copy from GPU to CPU host
         */
        cu_status = cublasGetMatrix(LocpB,LocqB,elemSize,
                                    dB(lrindxB,lcindxB), ldB, Btmp, ldBtmp );
        CHKERR(cu_status );
    };
    PROFEND("swap:GetMatrix");
    iia = 1; jja = 1;
    iib = 1; jjb = 1;
    if (incA == 1) { incAtmp = 1; } else { incAtmp = descAtmp[M_]; };
    if (incB == 1) { incBtmp = 1; } else { incBtmp = descBtmp[M_]; };
    PROFSTART("swap:pcswap");
    scalapack_pcswap( &n, Atmp, &iia, &jja, descAtmp, &incAtmp,
                      Btmp, &iib, &jjb, descBtmp, &incBtmp );
    PROFEND("swap:pcswap");
    /*
     * copy from host CPU back to GPU
     */
    PROFSTART("swap:SetMatrix");
    if ((LocpA >= 1) && (LocqA >= 1)) {
        /*
         * Copy from CPU host to GPU device
         */
        cu_status = cublasSetMatrix( LocpA, LocqA, elemSize,
                                     Atmp, ldAtmp, dA(lrindxA,lcindxA), ldA );
        CHKERR(cu_status);
    };
    if ((LocpB >= 1) && (LocqB >= 1)) {
        /*
         * Copy from CPU host to GPU device
         */
        cu_status = cublasSetMatrix( LocpB, LocqB, elemSize,
                                     Btmp, ldBtmp, dB(lrindxB,lcindxB), ldB );
        CHKERR(cu_status);
    };
    PROFEND("swap:SetMatrix");
    /*
     * clean up
     */
    if (Atmp != 0) {
        if (use_MallocHost) {
            FreeHost(Atmp);
        }
        else {
            free(Atmp);
        };
        Atmp = 0;
    };
    if (Btmp != 0) {
        if (use_MallocHost) {
            FreeHost(Btmp);
        }
        else {
            free(Btmp);
        };
        Btmp = 0;
    };
    return;
}
/* ========================================================================== */
/*
 * Scilab gateway for a GPU LU decomposition.
 *
 * Input 1: either a host matrix (uploaded to the device) or an existing GPU
 * pointer variable. Optional input 2 is a [opt0, opt1] pair:
 *   opt0 — where to return the result (only 0, host, is supported here);
 *   opt1 — whether to keep/return the device copy (0 = free it, 1 = return
 *          a GPU pointer variable).
 * The real path runs decomposeBlockedLU on the device and downloads the
 * result with cublasGetMatrix; the complex path is mostly stubbed out.
 *
 * Errors are thrown (C-string messages, SciErr, cuBLAS/CUDA statuses) and
 * handled by the catch blocks below, which report to Scilab and release what
 * can be released before returning EXIT_FAILURE.
 *
 * NOTE(review): `res`, `lrA/lcA`-style temporaries and the complex result
 * branch reference `resComplex` uninitialized on some paths — the complex
 * code path looks unfinished; confirm it is unreachable.
 */
int sci_gpuLU(char *fname)
{
    CheckRhs(1,2);
    CheckLhs(2,2);
#ifdef WITH_CUDA
    cublasStatus status;
#endif
    SciErr sciErr;
    int* piAddr_A = NULL;
    double* h_A = NULL;
    double* hi_A = NULL;
    int rows_A;
    int cols_A;
    int* piAddr_Opt = NULL;
    double* option = NULL;
    int rows_Opt;
    int cols_Opt;
    void* d_A = NULL;
    int na;
    void* pvPtr = NULL;
    int size_A = sizeof(double);
    bool bComplex_A = FALSE;
    int inputType_A;
    int inputType_Opt;
    double res;
    int posOutput = 1;
    try
    {
        sciErr = getVarAddressFromPosition(pvApiCtx, 1, &piAddr_A);
        if(sciErr.iErr) throw sciErr;
        if(Rhs == 2)
        {
            /* explicit option vector supplied by the caller */
            sciErr = getVarAddressFromPosition(pvApiCtx, 2, &piAddr_Opt);
            if(sciErr.iErr) throw sciErr;
            sciErr = getVarType(pvApiCtx, piAddr_Opt, &inputType_Opt);
            if(sciErr.iErr) throw sciErr;
            if(inputType_Opt == sci_matrix)
            {
                sciErr = getMatrixOfDouble(pvApiCtx, piAddr_Opt, &rows_Opt, &cols_Opt, &option);
                if(sciErr.iErr) throw sciErr;
            }
            else
                throw "Option syntax is [number,number].";
        }
        else
        {
            /* default options [0,0]: result on host, don't keep device copy */
            rows_Opt=1;
            cols_Opt=2;
            option = (double*)malloc(2*sizeof(double));
            option[0]=0;
            option[1]=0;
        }
        if(rows_Opt != 1 || cols_Opt != 2)
            throw "Option syntax is [number,number].";
        if((int)option[1] == 1 && !isGpuInit())
            throw "gpu is not initialised. Please launch gpuInit() before use this function.";
        sciErr = getVarType(pvApiCtx, piAddr_A, &inputType_A);
        if(sciErr.iErr) throw sciErr;
#ifdef WITH_CUDA
        if (useCuda())
        {
            if(inputType_A == sci_pointer)
            {
                /* input is already a device-resident matrix */
                sciErr = getPointer(pvApiCtx, piAddr_A, (void**)&pvPtr);
                if(sciErr.iErr) throw sciErr;
                gpuMat_CUDA* gmat;
                gmat = static_cast<gpuMat_CUDA*>(pvPtr);
                if(!gmat->useCuda)
                    throw "Please switch to OpenCL mode before use this data.";
                rows_A=gmat->rows;
                cols_A=gmat->columns;
                if(gmat->complex)
                {
                    bComplex_A = TRUE;
                    size_A = sizeof(cuDoubleComplex);
                    d_A=(cuDoubleComplex*)gmat->ptr->get_ptr();
                }
                else
                    d_A=(double*)gmat->ptr->get_ptr();
                // Initialize CUBLAS
                status = cublasInit();
                if (status != CUBLAS_STATUS_SUCCESS) throw status;
                na = rows_A * cols_A;
            }
            else if(inputType_A == 1)
            {
                // Get size and data
                if(isVarComplex(pvApiCtx, piAddr_A))
                {
                    sciErr = getComplexMatrixOfDouble(pvApiCtx, piAddr_A, &rows_A, &cols_A, &h_A, &hi_A);
                    if(sciErr.iErr) throw sciErr;
                    size_A = sizeof(cuDoubleComplex);
                    bComplex_A = TRUE;
                }
                else
                {
                    sciErr = getMatrixOfDouble(pvApiCtx, piAddr_A, &rows_A, &cols_A, &h_A);
                    if(sciErr.iErr) throw sciErr;
                }
                na = rows_A * cols_A;
                // Initialize CUBLAS
                status = cublasInit();
                if (status != CUBLAS_STATUS_SUCCESS) throw status;
                // Allocate device memory
                status = cublasAlloc(na, size_A, (void**)&d_A);
                if (status != CUBLAS_STATUS_SUCCESS) throw status;
                // Initialize the device matrices with the host matrices
                if(!bComplex_A)
                {
                    status = cublasSetMatrix(rows_A,cols_A, sizeof(double), h_A, rows_A, (double*)d_A, rows_A);
                    if (status != CUBLAS_STATUS_SUCCESS) throw status;
                }
                else
                    writecucomplex(h_A, hi_A, rows_A, cols_A, (cuDoubleComplex *)d_A);
            }
            else
                throw "Bad argument type.";
            cuDoubleComplex resComplex;
            // Performs operation
            if(!bComplex_A)
                status = decomposeBlockedLU(rows_A, cols_A, rows_A, (double*)d_A, 1);
            // else
            // resComplex = cublasZtrsm(na,(cuDoubleComplex*)d_A);
            if (status != CUBLAS_STATUS_SUCCESS) throw status;
            // Put the result in scilab
            switch((int)option[0])
            {
                case 2 :
                case 1 : sciprint("The first option must be 0 for this function. Considered as 0.\n");
                case 0 : // Keep the result on the Host.
                {
                    // Put the result in scilab
                    if(!bComplex_A)
                    {
                        double* h_res = NULL;
                        sciErr=allocMatrixOfDouble(pvApiCtx, Rhs + posOutput, rows_A, cols_A, &h_res);
                        if(sciErr.iErr) throw sciErr;
                        /* download the LU factors from the device */
                        status = cublasGetMatrix(rows_A,cols_A, sizeof(double), (double*)d_A, rows_A, h_res, rows_A);
                        if (status != CUBLAS_STATUS_SUCCESS) throw status;
                    }
                    else
                    {
                        sciErr = createComplexMatrixOfDouble(pvApiCtx, Rhs + posOutput, 1, 1, &resComplex.x,&resComplex.y);
                        if(sciErr.iErr) throw sciErr;
                    }
                    LhsVar(posOutput)=Rhs+posOutput;
                    posOutput++;
                    break;
                }
                default : throw "First option argument must be 0 or 1 or 2.";
            }
            switch((int)option[1])
            {
                case 0 : // Don't keep the data input on Device.
                {
                    if(inputType_A == sci_matrix)
                    {
                        status = cublasFree(d_A);
                        if (status != CUBLAS_STATUS_SUCCESS) throw status;
                        d_A = NULL;
                    }
                    break;
                }
                case 1 : // Keep data of the first argument on Device and return the Device pointer.
                {
                    if(inputType_A == sci_matrix)
                    {
                        gpuMat_CUDA* dptr;
                        gpuMat_CUDA tmp={getCudaContext()->genMatrix<double>(getCudaQueue(),rows_A*cols_A),rows_A,cols_A};
                        dptr=new gpuMat_CUDA(tmp);
                        dptr->useCuda = true;
                        dptr->ptr->set_ptr((double*)d_A);
                        if(bComplex_A)
                            dptr->complex=TRUE;
                        else
                            dptr->complex=FALSE;
                        sciErr = createPointer(pvApiCtx,Rhs+posOutput, (void*)dptr);
                        if(sciErr.iErr) throw sciErr;
                        LhsVar(posOutput)=Rhs+posOutput;
                    }
                    else
                        throw "The first input argument is already a GPU variable.";
                    posOutput++;
                    break;
                }
                default : throw "Second option argument must be 0 or 1.";
            }
            // Shutdown
            status = cublasShutdown();
            if (status != CUBLAS_STATUS_SUCCESS) throw status;
        }
#endif
#ifdef WITH_OPENCL
        if (!useCuda())
        {
            throw "not implemented with OpenCL.";
        }
#endif
        if(Rhs == 1)
        {
            /* option vector was malloc'd locally when defaulted */
            free(option);
            option = NULL;
        }
        if(posOutput < Lhs+1)
            throw "Too many output arguments.";
        if(posOutput > Lhs+1)
            throw "Too few output arguments.";
        PutLhsVar();
        return 0;
    }
    catch(const char* str)
    {
        Scierror(999,"%s\n",str);
    }
    catch(SciErr E)
    {
        printError(&E, 0);
    }
#ifdef WITH_CUDA
    catch(cudaError_t cudaE)
    {
        GpuError::treat_error<CUDAmode>((CUDAmode::Status)cudaE);
    }
    catch(cublasStatus CublasE)
    {
        GpuError::treat_error<CUDAmode>((CUDAmode::Status)CublasE,1);
    }
    /* error cleanup: release the device buffer we allocated for host input */
    if (useCuda())
    {
        if(inputType_A == 1 && d_A != NULL)
            cudaFree(d_A);
    }
#endif
#ifdef WITH_OPENCL
    if (!useCuda())
    {
        Scierror(999,"not implemented with OpenCL.\n");
    }
#endif
    if(Rhs == 1 && option != NULL)
        free(option);
    return EXIT_FAILURE;
}
/*
 * cuBLAS DTRSM timing example: builds a lower-triangular m x m matrix a
 * (values 11,12,... via IDX2C) and an m x n identity matrix b, uploads both,
 * then solves a * X = b with cublasDtrsm (so X = a^{-1} since b == I) and
 * downloads the solution back into b.
 *
 * NOTE(review): m and n are file-scope globals here (not declared in main);
 * no cudaStat/stat return code is checked; the timed region includes the
 * blocking cublasGetMatrix download; `al` is a double initialized from the
 * float literal 1.0f — harmless but looks like a leftover from an S-variant.
 */
int main ( void ){
    cudaError_t cudaStat ; // cudaMalloc status
    cublasStatus_t stat ; // CUBLAS functions status
    cublasHandle_t handle ; // CUBLAS context
    int i,j; // i-row index , j-col. index
    double * a; // mxm matrix a on the host
    double * b; // mxn matrix b on the host
    a=( double *) malloc (m*m* sizeof ( double )); // host memory for a
    b=( double *) malloc (m*n* sizeof ( double )); // host memory for b
    int ind =11;
    // a: lower triangle filled with 11,12,13,... column by column
    for(j=0;j<m;j ++){
        for(i=0;i<m;i ++){
            if(i >=j){
                a[ IDX2C(i,j,m)]=( double )ind ++;
            }
        }
    }
    printf (" lower triangle of a:\n");
    /* for (i=0;i<m;i ++){ for (j=0;j<m;j ++){ if(i >=j) printf (" %5.0f",a[ IDX2C(i,j,m)]); } printf ("\n"); } */
    ind =11;
    // b: identity matrix
    for(j=0;j<n;j ++){
        for(i=0;i<m;i ++){
            if(i == j) b[IDX2C(i,i,m)] = 1.0;
            else b[IDX2C(i,j,m)] = 0.0;
        }
    }
    // b[ IDX2C(i,j,m)] = ind++;
    /*if(i == j) b[ IDX2C(i,j,m)] = 1.0; else b[ IDX2C(i, j, m)] = 0.0;*/ //ind ++;
    printf ("b:\n");
    /* for (i=0;i<m;i ++){ for (j=0;j<n;j ++){ printf (" %5.0f",b[IDX2C(i,j,m)]); } printf ("\n"); } */
    double * d_a; // d_a - a on the device
    double * d_b; // d_b - b on the device
    cudaStat = cudaMalloc (( void **)& d_a ,m*m* sizeof (*a)); // device memory alloc for a
    cudaStat = cudaMalloc (( void **)& d_b ,m*n* sizeof (*b)); // device memory alloc for b
    stat = cublasCreate (& handle ); // initialize CUBLAS context
    stat = cublasSetMatrix (m,m, sizeof (*a) ,a,m,d_a ,m); //a -> d_a
    stat = cublasSetMatrix (m,n, sizeof (*b) ,b,m,d_b ,m); //b -> d_b
    double al =1.0f;
    double startime = CycleTimer::currentSeconds();
    // solve a * X = b in place on the device (result overwrites d_b)
    (cublasDtrsm(handle,CUBLAS_SIDE_LEFT,CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N,CUBLAS_DIAG_NON_UNIT,m,n,&al,d_a,m,d_b,m));
    stat = cublasGetMatrix (m,n, sizeof (*b) ,d_b ,m,b,m); // d_b -> b
    double endtime = CycleTimer::currentSeconds();
    printf (" solution x from Strsm :\n");
    /* for(i=0;i<m;i ++){ for(j=0;j<n;j ++){ printf (" %11.5f",b[IDX2C(i,j,m )]); } printf ("\n"); } */
    cudaFree (d_a ); // free device memory
    cudaFree (d_b ); // free device memory
    cublasDestroy ( handle ); // destroy CUBLAS context
    free (a); // free host memory
    free (b); // free host memory
    printf("Time taken: %lf\n", endtime - startime);
    return EXIT_SUCCESS ;
}
int main(int argc, char *argv[]) { int testN = 1; bool check_correctness = false; if (argc > 1) { testN = atoi(argv[1]); } if (argc > 2) { check_correctness = atoi(argv[2]); } std::cout << std::endl << "----------" << std::endl; std::cout << "Running sequential MM benchmark: testN: " << testN << ", check correctness: " << check_correctness << ", size: (" << S0 << ", " << S1 << ", " << S2 << ", " << S3 << ")" << std::endl; auto t1 = std::chrono::high_resolution_clock::now(); auto t2 = t1; float *A = (float*) malloc(S0 * S1 * sizeof(float)); float *B = (float*) malloc(S1 * S2 * sizeof(float)); float *C = (float*) malloc(S2 * S3 * sizeof(float)); // Initialize matrices with random values: for (int i = 0; i < S0 * S1; i++) A[i] = std::rand() % 10; for (int i = 0; i < S1 * S2; i++) B[i] = std::rand() % 10; for (int i = 0; i < S2 * S3; i++) C[i] = std::rand() % 10; std::cout << "Buffers initialized" << std::endl << std::flush; // Note that indices are flipped (see tutorial 2) Halide::Buffer<DATA_TYPE> A_buf(A, {S1, S0}); Halide::Buffer<DATA_TYPE> B_buf(B, {S2, S1}); Halide::Buffer<DATA_TYPE> C_buf(C, {S3, S2}); Halide::Buffer<DATA_TYPE> O_buf(S3, S0); // Make a dummy call to set up GPU (initalization takes time) matmul(A_buf.raw_buffer(), B_buf.raw_buffer(), C_buf.raw_buffer(), O_buf.raw_buffer()); // CPU Multiplication for correctness check if (check_correctness) { // Reference matrix multiplication std::cout << "Running CPU multiplication.." 
<< std::endl; Halide::Buffer<DATA_TYPE> O_val_buf(S3, S0); Halide::Buffer<DATA_TYPE> T_val_buf(S2, S0); t1 = std::chrono::high_resolution_clock::now(); for (int i = 0; i < S0; i++) { for (int k = 0; k < S2; k++) { // Note that indices are flipped (see tutorial 2) T_val_buf(k, i) = 0; } } for (int i = 0; i < S0; i++) { for (int l = 0; l < S3; l++) { // Note that indices are flipped (see tutorial 2) O_val_buf(l, i) = 0; } } for (int j = 0; j < S1; j++) { for (int i = 0; i < S0; i++) { for (int k = 0; k < S2; k++) { // Note that indices are flipped (see tutorial 2) T_val_buf(k, i) += A_buf(j, i) * B_buf(k, j); } } } for (int k = 0; k < S2; k++) { for (int i = 0; i < S0; i++) { for (int l = 0; l < S3; l++) { // Note that indices are flipped (see tutorial 2) O_val_buf(l, i) += T_val_buf(k, i) * C_buf(l, k); } } } t2 = std::chrono::high_resolution_clock::now(); std::cout << "CPU matmul done: " << (std::chrono::duration<double,std::milli>(t2 - t1)).count() << "ms" << std::endl << std::flush; compare_buffers("matmul", O_buf, O_val_buf); } // GPU Multiplication t1 = std::chrono::high_resolution_clock::now(); for (int i = 0; i < testN; i++) { matmul(A_buf.raw_buffer(), B_buf.raw_buffer(), C_buf.raw_buffer(), O_buf.raw_buffer()); } t2 = std::chrono::high_resolution_clock::now(); std::cout << "GPU matmul done: " << (std::chrono::duration<double,std::milli>(t2 - t1)).count() / testN << "ms" << std::endl << std::flush; // CUBLAS SGEMM // Transposed copies for cublas float *A_T = (float*) malloc(S0 * S1 * sizeof(float)); float *B_T = (float*) malloc(S1 * S2 * sizeof(float)); float *C_T = (float*) malloc(S2 * S3 * sizeof(float)); float *O_T = (float*) malloc(S0 * S3 * sizeof(float)); // Transpose for (int i = 0; i < S0; i++) for (int j = 0; j < S1; j++) A_T[i + j * S0] = A[i * S1 + j]; for (int i = 0; i < S1; i++) for (int j = 0; j < S2; j++) B_T[i + j * S1] = B[i * S2 + j]; for (int i = 0; i < S2; i++) for (int j = 0; j < S3; j++) C_T[i + j * S2] = C[i * S3 + j]; // Excluding 
handle creation which is time consuming cublasHandle_t handle; cublasCreate(&handle); t1 = std::chrono::high_resolution_clock::now(); for (int i = 0; i < testN; i++) { float *d_A; float *d_B; float *d_C; float *d_T; float *d_O; cudaMalloc((void**)&d_A, S0 * S1 * sizeof(*A)); cudaMalloc((void**)&d_B, S1 * S2 * sizeof(*A)); cudaMalloc((void**)&d_C, S2 * S3 * sizeof(*A)); cudaMalloc((void**)&d_T, S0 * S2 * sizeof(*A)); cudaMalloc((void**)&d_O, S0 * S3 * sizeof(*A)); cublasSetMatrix(S0, S1, sizeof(*A), A_T, S0, d_A, S0); cublasSetMatrix(S1, S2, sizeof(*B), B_T, S1, d_B, S1); cublasSetMatrix(S2, S3, sizeof(*C), C_T, S2, d_C, S2); float alpha_var = 1; float beta_var = 0; cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, S0, S2, S1, &alpha_var, d_A, S0, d_B, S1, &beta_var, d_T, S0); cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, S0, S3, S2, &alpha_var, d_T, S0, d_C, S2, &beta_var, d_O, S0); cublasGetMatrix(S0, S3, sizeof(*C), d_O, S0, O_T, S0); cudaFree(d_A); cudaFree(d_B); cudaFree(d_C); cudaFree(d_T); cudaFree(d_O); } t2 = std::chrono::high_resolution_clock::now(); std::cout << "cublas matmul done (excluding cublasHandle creation): " << (std::chrono::duration<double,std::milli>(t2 - t1) / testN).count() << "ms" << std::endl << std::flush; cublasDestroy(handle); bool check_cublas_difference = false; if (check_cublas_difference) { bool flag = true; for (int i = 0; i < S0 && flag; i++) { for (int j = 0; j < S3; j++) { if (O_buf(j, i) != O_T[i + j * S0]) { std::cout << "cublas validation mismatch:" << std::endl; std::cout << i << " " << j << " " << O_T[i + j * S0] << " " << O_buf(j, i) << std::endl; } } } if (flag) { std::cout << "cublas and validation match" << std::endl; } } free(A); free(B); free(C); free(A_T); free(B_T); free(C_T); free(O_T); std::cout << "----------" << std::endl << std::endl; return 0; }
extern "C" magma_int_t
magma_zgeqrf2(magma_context *cntxt, magma_int_t m, magma_int_t n,
              cuDoubleComplex *a, magma_int_t lda, cuDoubleComplex *tau,
              cuDoubleComplex *work, magma_int_t lwork, magma_int_t *info)
{
/*  -- MAGMA (version 1.5.0-beta3) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date July 2014

    Purpose
    =======
    ZGEQRF computes a QR factorization of a COMPLEX_16 M-by-N matrix A:
    A = Q * R.

    This version does not require work space on the GPU passed as input.
    GPU memory is allocated in the routine.

    Arguments
    =========
    CNTXT   (input) MAGMA_CONTEXT
            CNTXT specifies the MAGMA hardware context for this routine.

    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    A       (input/output) COMPLEX_16 array, dimension (LDA,N)
            On entry, the M-by-N matrix A.
            On exit, the elements on and above the diagonal of the array
            contain the min(M,N)-by-N upper trapezoidal matrix R (R is
            upper triangular if m >= n); the elements below the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of min(m,n) elementary reflectors (see Further
            Details).

            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using cudaMallocHost.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    TAU     (output) COMPLEX_16 array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    WORK    (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.

            Higher performance is achieved if WORK is in pinned memory, e.g.
            allocated using cudaMallocHost.

    LWORK   (input) INTEGER
            The dimension of the array WORK.  LWORK >= N*NB,
            where NB can be obtained through magma_get_zgeqrf_nb(M).

            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  if INFO = -8, the GPU memory allocation failed

    Further Details
    ===============
    The matrix Q is represented as a product of elementary reflectors

       Q = H(1) H(2) . . . H(k), where k = min(m,n).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a complex scalar, and v is a complex vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
    and tau in TAU(i).
    =====================================================================    */

   /* Column-major accessors into the host matrix (a) and device copy (da). */
   #define a_ref(a_1,a_2)  ( a+(a_2)*(lda) + (a_1))
   #define da_ref(a_1,a_2) (da+(a_2)*ldda + (a_1))

   int cnt=-1;                               // panel counter into qr_params->t / ->p / ->w
   cuDoubleComplex c_one = MAGMA_Z_ONE;

   int i, k, lddwork, old_i, old_ib;
   int nbmin, nx, ib, ldda;

   *info = 0;

   magma_qr_params *qr_params = (magma_qr_params *)cntxt->params;
   int nb = qr_params->nb;                   // blocking factor

   /* Report the optimal workspace size, then validate arguments. */
   int lwkopt = n * nb;
   work[0] = MAGMA_Z_MAKE( (double)lwkopt, 0 );
   long int lquery = (lwork == -1);
   if (m < 0) {
     *info = -1;
   } else if (n < 0) {
     *info = -2;
   } else if (lda < max(1,m)) {
     *info = -4;
   } else if (lwork < max(1,n) && ! lquery) {
     *info = -7;
   }
   if (*info != 0) {
     magma_xerbla( __func__, -(*info) );
     return MAGMA_ERR_ILLEGAL_VALUE;
   }
   else if (lquery)
     return MAGMA_SUCCESS;                   // workspace query only

   k = min(m,n);
   if (k == 0) {
     work[0] = c_one;                        // nothing to factor
     return MAGMA_SUCCESS;
   }

   cublasStatus status;
   /* NOTE(review): `static` streams persist across calls but are re-created
      on every invocation, and the cudaStreamCreate results are unchecked --
      this also makes the routine non-reentrant.  Confirm intent. */
   static cudaStream_t stream[2];
   cudaStreamCreate(&stream[0]);
   cudaStreamCreate(&stream[1]);

   nbmin = 2;
   nx = nb;                                  // crossover point to unblocked code

   /* Round leading dimensions up to multiples of 32 for the device copies. */
   lddwork = ((n+31)/32)*32;
   ldda    = ((m+31)/32)*32;

   cuDoubleComplex *da;
   status = cublasAlloc((n)*ldda + nb*lddwork, sizeof(cuDoubleComplex), (void**)&da);
   if (status != CUBLAS_STATUS_SUCCESS) {
     /* NOTE(review): this early return is `0` while the other paths return
        MAGMA_* codes, and the two streams created above are not destroyed
        here -- verify against the rest of the MAGMA error conventions. */
     *info = -8;
     return 0;
   }
   cuDoubleComplex *dwork = da + ldda*(n);   // device workspace after the matrix copy

   if (nb >= nbmin && nb < k && nx < k) {
      /* Use blocked code initially: ship columns nb..n-1 to the device
         asynchronously while the first panel is factored on the host. */
      cudaMemcpy2DAsync(da_ref(0,nb), ldda*sizeof(cuDoubleComplex),
                        a_ref(0,nb),  lda *sizeof(cuDoubleComplex),
                        sizeof(cuDoubleComplex)*(m), (n-nb),
                        cudaMemcpyHostToDevice,stream[0]);

      old_i = 0; old_ib = nb;
      for (i = 0; i < k-nx; i += nb) {
        ib = min(k-i, nb);                   // current panel width
        if (i>0){
          /* Pull the current panel (and the columns above it) back to the
             host; panel download on stream[1] so it can be waited on alone. */
          cudaMemcpy2DAsync( a_ref(i,i),  lda *sizeof(cuDoubleComplex),
                             da_ref(i,i), ldda*sizeof(cuDoubleComplex),
                             sizeof(cuDoubleComplex)*(m-i), ib,
                             cudaMemcpyDeviceToHost,stream[1]);
          cudaMemcpy2DAsync( a_ref(0,i),  lda *sizeof(cuDoubleComplex),
                             da_ref(0,i), ldda*sizeof(cuDoubleComplex),
                             sizeof(cuDoubleComplex)*i, ib,
                             cudaMemcpyDeviceToHost,stream[0]);

          /* Apply H' to A(i:m,i+2*ib:n) from the left (lookahead update of
             the trailing matrix using the previous panel's reflectors). */
          magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                            m-old_i, n-old_i-2*old_ib, old_ib,
                            da_ref(old_i, old_i), ldda, dwork, lddwork,
                            da_ref(old_i, old_i+2*old_ib), ldda,
                            dwork+old_ib, lddwork);
        }

        /* Wait for the panel download before factoring it on the host. */
        cudaStreamSynchronize(stream[1]);
        int rows = m-i;
        cnt++;

        /* Factor the current panel on the (multi-core) host. */
        cntxt->nb = qr_params->ib;
        magma_zgeqrf_mc(cntxt, &rows, &ib, a_ref(i,i), &lda, tau+i, work, &lwork, info);
        cntxt->nb = nb;

        /* Form the triangular factor of the block reflector
           H = H(i) H(i+1) . . . H(i+ib-1) */
        lapackf77_zlarft( MagmaForwardStr, MagmaColumnwiseStr,
                          &rows, &ib, a_ref(i,i), &lda, tau+i,
                          qr_params->t+cnt*nb*nb, &ib);
        if (cnt < qr_params->np_gpu) {
          qr_params->p[cnt]=a;
        }
        /* Temporarily replace the panel's upper triangle with V's unit
           triangle before uploading, then restore it if requested. */
        zpanel_to_q(MagmaUpper, ib, a_ref(i,i), lda,
                    qr_params->w+cnt*qr_params->nb*qr_params->nb);
        cublasSetMatrix(rows, ib, sizeof(cuDoubleComplex),
                        a_ref(i,i), lda, da_ref(i,i), ldda);
        if (qr_params->flag == 1)
          zq_to_panel(MagmaUpper, ib, a_ref(i,i), lda,
                      qr_params->w+cnt*qr_params->nb*qr_params->nb);

        if (i + ib < n) {
          /* Upload T and apply the new reflectors to the next block column
             (or the whole trailing matrix on the last blocked iteration). */
          cublasSetMatrix(ib, ib, sizeof(cuDoubleComplex),
                          qr_params->t+cnt*nb*nb, ib, dwork, lddwork);

          if (i+ib < k-nx)
            /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */
            magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                              rows, ib, ib,
                              da_ref(i, i   ), ldda, dwork,    lddwork,
                              da_ref(i, i+ib), ldda, dwork+ib, lddwork);
          else
            magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                              rows, n-i-ib, ib,
                              da_ref(i, i   ), ldda, dwork,    lddwork,
                              da_ref(i, i+ib), ldda, dwork+ib, lddwork);

          old_i = i;
          old_ib = ib;
        }
      }
   } else {
     i = 0;
   }

   /* Use unblocked code to factor the last or only block. */
   if (i < k) {
      ib = n-i;
      if (i!=0)
        cublasGetMatrix(m, ib, sizeof(cuDoubleComplex),
                        da_ref(0,i), ldda, a_ref(0,i), lda);
      int rows = m-i;
      cnt++;
      lapackf77_zgeqrf(&rows, &ib, a_ref(i,i), &lda, tau+i, work, &lwork, info);

      if (cnt < qr_params->np_gpu) {
        int ib2=min(ib,nb);
        lapackf77_zlarft( MagmaForwardStr, MagmaColumnwiseStr,
                          &rows, &ib2, a_ref(i,i), &lda, tau+i,
                          qr_params->t+cnt*nb*nb, &ib2);
        qr_params->p[cnt]=a;
      }
   }

   cudaStreamDestroy( stream[0] );
   cudaStreamDestroy( stream[1] );
   cublasFree(da);

   return MAGMA_SUCCESS;
} /* magma_zgeqrf */
int main(int argc, char* argv[]) { int i,j,k,index; // Linear dimension of matrices int dim = 100; // Number of A,B,C matrix sets int batch_count = 1000; // Allocate host storage for batch_count A,B,C square matrices double **A, **B, **C; A = (double**)malloc(batch_count*sizeof(double*)); B = (double**)malloc(batch_count*sizeof(double*)); C = (double**)malloc(batch_count*sizeof(double*)); for(i=0; i<batch_count; i++) { A[i] = (double*)malloc(dim*dim*sizeof(double)); B[i] = (double*)malloc(dim*dim*sizeof(double)); C[i] = (double*)malloc(dim*dim*sizeof(double)); } // Create host pointer array to device matrix storage double **d_A, **d_B, **d_C, **h_d_A, **h_d_B, **h_d_C; h_d_A = (double**)malloc(batch_count*sizeof(double*)); h_d_B = (double**)malloc(batch_count*sizeof(double*)); h_d_C = (double**)malloc(batch_count*sizeof(double*)); for(i=0; i<batch_count; i++) { cudaMalloc((void**)&h_d_A[i], dim*dim*sizeof(double)); cudaMalloc((void**)&h_d_B[i], dim*dim*sizeof(double)); cudaMalloc((void**)&h_d_C[i], dim*dim*sizeof(double)); } // Copy the host array of device pointers to the device cudaMalloc((void**)&d_A, batch_count*sizeof(double*)); cudaMalloc((void**)&d_B, batch_count*sizeof(double*)); cudaMalloc((void**)&d_C, batch_count*sizeof(double*)); cudaMemcpy(d_A, h_d_A, batch_count*sizeof(double*), cudaMemcpyHostToDevice); cudaMemcpy(d_B, h_d_B, batch_count*sizeof(double*), cudaMemcpyHostToDevice); cudaMemcpy(d_C, h_d_C, batch_count*sizeof(double*), cudaMemcpyHostToDevice); // Fill A,B diagonals with k*sin(i) data, C diagonal with k*cos(i)^2 // Matrices are arranged column major for(k=0; k<batch_count; k++) { for(j=0; j<dim; j++) { for(i=0; i<dim; i++) { index = j*dim + i; if(i==j) { (A[k])[index] = k*sin(index); (B[k])[index] = sin(index); (C[k])[index] = k*cos(index)*cos(index); } else { (A[k])[index] = 0.0; (B[k])[index] = 0.0; (C[k])[index] = 0.0; } } // i } // j } // k // Create cublas instance cublasHandle_t handle; cublasCreate(&handle); // Set input matrices on 
device for(i=0; i<batch_count; i++) { cublasSetMatrix(dim, dim, sizeof(double), A[i], dim, h_d_A[i], dim); cublasSetMatrix(dim, dim, sizeof(double), B[i], dim, h_d_B[i], dim); cublasSetMatrix(dim, dim, sizeof(double), C[i], dim, h_d_C[i], dim); } // Set matrix coefficients double alpha = 1.0; double beta = 1.0; // DGEMM: C = alpha*A*B + beta*C cublasDgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, dim, dim, dim, &alpha, (const double**)d_A, dim, (const double**)d_B, dim, &beta, d_C, dim, batch_count); // Retrieve result matrix from device for(i=0; i<batch_count; i++) cublasGetMatrix(dim, dim, sizeof(double), h_d_C[i], dim, C[i], dim); // Simple sanity test, sum up all elements double sum = 0; for(k=0; k<batch_count; k++) { for(j=0; j<dim; j++) { for(i=0; i<dim; i++) { index = j*dim + i; sum += (C[k])[index]; } } } printf("Element sum is: %f, should be: %d\n", sum, dim*(batch_count-1)*(batch_count)/2); // Clean up resources for(i=0; i<batch_count; i++) { free(A[i]); free(B[i]); free(C[i]); cudaFree(h_d_A[i]); cudaFree(h_d_B[i]); cudaFree(h_d_C[i]); } free(A); free(B); free(C); free(h_d_A); free(h_d_B); free(h_d_C); cudaFree(d_A); cudaFree(d_B); cudaFree(d_C); cublasDestroy(handle); return 0; }
cublasStatus cublasSetMatrix( int m, int n, int elemSize, void *A, int ldA, void *B, int ldB ) { return( cublasGetMatrix(m,n,elemSize, A,ldA, B, ldB ) ); }