void collect_final_result_zgemm(int *tasks_rs, int *tasks_rs_size, int switcher,
                                cudaStream_t *stream, cuDoubleComplex **C_dev,
                                int block_dim, int stream_num,
                                int x, int y, int z,
                                int nrowc, int ncolc, int ldc, cuDoubleComplex *C)
{
    switcher = 1 - switcher;
    int temp = 0;
    /* First drain the C tiles queued under the previous switcher phase. */
    for (temp = tasks_rs_size[switcher]; temp < tasks_rs_size[1-switcher]; temp++) {
        int prior_task     = tasks_rs[temp + stream_num*(1-switcher)];
        int i_pre          = prior_task / (y+1);
        int k_pre          = prior_task % (y+1);
        int current_stream = temp;
        int nrowc_dev_pre, ncolc_dev_pre;
        margin_adjustment(nrowc, ncolc, block_dim, i_pre, k_pre,
                          &nrowc_dev_pre, &ncolc_dev_pre);
        int nrow_offset_c_pre = i_pre * block_dim;
        int ncol_offset_c_pre = k_pre * block_dim;
        cuDoubleComplex *starting_point_C_pre =
            &C[nrow_offset_c_pre + ncol_offset_c_pre*ldc];
        assert( cublasGetMatrixAsync(nrowc_dev_pre, ncolc_dev_pre,
                                     sizeof(cuDoubleComplex),
                                     C_dev[current_stream + (1-switcher)*stream_num],
                                     block_dim,
                                     starting_point_C_pre, ldc,
                                     stream[current_stream]) == CUBLAS_STATUS_SUCCESS );
        assert( cudaStreamSynchronize(stream[current_stream]) == cudaSuccess );
    }
    /* Then drain the C tiles queued under the current switcher phase. */
    for (temp = 0; temp < tasks_rs_size[switcher]; temp++) {
        int prior_task     = tasks_rs[temp + stream_num*switcher];
        int i_pre          = prior_task / (y+1);
        int k_pre          = prior_task % (y+1);
        int current_stream = temp;
        int nrowc_dev_pre, ncolc_dev_pre;
        margin_adjustment(nrowc, ncolc, block_dim, i_pre, k_pre,
                          &nrowc_dev_pre, &ncolc_dev_pre);
        int nrow_offset_c_pre = i_pre * block_dim;
        int ncol_offset_c_pre = k_pre * block_dim;
        cuDoubleComplex *starting_point_C_pre =
            &C[nrow_offset_c_pre + ncol_offset_c_pre*ldc];
        assert( cublasGetMatrixAsync(nrowc_dev_pre, ncolc_dev_pre,
                                     sizeof(cuDoubleComplex),
                                     C_dev[current_stream + switcher*stream_num],
                                     block_dim,
                                     starting_point_C_pre, ldc,
                                     stream[current_stream]) == CUBLAS_STATUS_SUCCESS );
        assert( cudaStreamSynchronize(stream[current_stream]) == cudaSuccess );
    }
}
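/*
 * The two loops above drain a double-buffered ("switcher") pipeline: each
 * stream owns one device tile per buffer half, and every finished tile is
 * copied back into the column-major host matrix C. The sketch below isolates
 * that per-tile retrieval step; fetch_tile and its parameter names are
 * illustrative, not BLASX API.
 */
#include <assert.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

static void fetch_tile(cuDoubleComplex *C, int ldc,
                       const cuDoubleComplex *C_dev_tile, int block_dim,
                       int i, int k, int tile_rows, int tile_cols,
                       cudaStream_t s)
{
    /* Tile (i,k) starts at C[i*block_dim + k*block_dim*ldc] in column-major
       storage; ldc is the leading dimension of the full host matrix. */
    cuDoubleComplex *dst = &C[(size_t)i*block_dim + (size_t)k*block_dim*ldc];
    assert( cublasGetMatrixAsync(tile_rows, tile_cols, sizeof(cuDoubleComplex),
                                 C_dev_tile, block_dim, dst, ldc, s)
            == CUBLAS_STATUS_SUCCESS );
    /* The transfer overlaps other streams only if C is pinned; synchronizing
       here makes it safe to reuse C_dev_tile for the next phase. */
    assert( cudaStreamSynchronize(s) == cudaSuccess );
}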
void collect_final_result_dsyrk_syr2k(int *tasks_rs, int *tasks_rs_size, int switcher,
                                      cudaStream_t *stream, double **C_dev,
                                      int block_dim, int stream_num,
                                      int x, int y, int z,
                                      int nrowc, int ncolc, int ldc, double *C,
                                      enum CBLAS_UPLO Uplo)
{
    switcher = 1 - switcher;
    int temp = 0;
    /* Tiles queued under the previous switcher phase. */
    for (temp = tasks_rs_size[switcher]; temp < tasks_rs_size[1-switcher]; temp++) {
        int prior_task = tasks_rs[temp + stream_num*(1-switcher)];
        int i_pre, k_pre;
        blasx_get_index(prior_task, 0, x, &i_pre, &k_pre, Uplo, x);
        int current_stream = temp;
        int nrowc_dev_pre, ncolc_dev_pre;
        margin_adjustment(nrowc, ncolc, block_dim, i_pre, k_pre,
                          &nrowc_dev_pre, &ncolc_dev_pre);
        int nrow_offset_c_pre = i_pre * block_dim;
        int ncol_offset_c_pre = k_pre * block_dim;
        double *starting_point_C_pre = &C[nrow_offset_c_pre + ncol_offset_c_pre*ldc];
        cublasGetMatrixAsync(nrowc_dev_pre, ncolc_dev_pre, sizeof(double),
                             C_dev[current_stream + (1-switcher)*stream_num], block_dim,
                             starting_point_C_pre, ldc, stream[current_stream]);
        cudaStreamSynchronize(stream[current_stream]);
    }
    /* Tiles queued under the current switcher phase. */
    for (temp = 0; temp < tasks_rs_size[switcher]; temp++) { // assume 1-switcher
        int prior_task = tasks_rs[temp + stream_num*switcher];
        int i_pre, k_pre;
        blasx_get_index(prior_task, 0, x, &i_pre, &k_pre, Uplo, x);
        int current_stream = temp;
        int nrowc_dev_pre, ncolc_dev_pre;
        margin_adjustment(nrowc, ncolc, block_dim, i_pre, k_pre,
                          &nrowc_dev_pre, &ncolc_dev_pre);
        int nrow_offset_c_pre = i_pre * block_dim;
        int ncol_offset_c_pre = k_pre * block_dim;
        double *starting_point_C_pre = &C[nrow_offset_c_pre + ncol_offset_c_pre*ldc];
        cublasGetMatrixAsync(nrowc_dev_pre, ncolc_dev_pre, sizeof(double),
                             C_dev[current_stream + switcher*stream_num], block_dim,
                             starting_point_C_pre, ldc, stream[current_stream]);
        cudaStreamSynchronize(stream[current_stream]);
    }
}
void collect_final_result_dtrsm_mode_1(int *tasks_rs, int *tasks_rs_size, int switcher,
                                       int switcher_rs, cudaStream_t *stream,
                                       double **buffer_dev, int block_dim, int stream_num,
                                       int x, int y, int z,
                                       int nrowb, int ncolb, int ldb, double *B,
                                       int *switch_tracker)
{
    int temp = 0;
    /* Tiles queued under the previous switcher_rs phase. */
    for (temp = tasks_rs_size[switcher_rs]; temp < tasks_rs_size[1-switcher_rs]; temp++) {
        // printf("retrieving B[%d, %d] @stream=%d switcher:%d\n", z, tasks_rs[temp+STREAMNUM*(1-switcher_rs)], temp, switcher);
        int row = z;
        int col = tasks_rs[temp + stream_num*(1-switcher_rs)];
        int current_stream = temp;
        int nrow_offset = row * block_dim;
        int ncol_offset = col * block_dim;
        int nrow_dev, ncol_dev;
        margin_adjustment(nrowb, ncolb, block_dim, row, col, &nrow_dev, &ncol_dev);
        double *starting_point = &B[nrow_offset + ncol_offset*ldb];
        /* switch_tracker records which buffer half each stream last wrote. */
        cublasGetMatrixAsync(nrow_dev, ncol_dev, sizeof(double),
                             buffer_dev[current_stream + switch_tracker[temp]*stream_num],
                             block_dim, starting_point, ldb, stream[current_stream]);
        cudaStreamSynchronize(stream[current_stream]);
    }
    /* Tiles queued under the current switcher_rs phase. */
    for (temp = 0; temp < tasks_rs_size[switcher_rs]; temp++) { // assume 1-switcher
        // printf("retrieving B[%d, %d] @stream=%d\n", z, tasks_rs[temp+STREAMNUM*switcher_rs], temp);
        int row = z;
        int col = tasks_rs[temp + stream_num*switcher_rs];
        int current_stream = temp;
        int nrow_offset = row * block_dim;
        int ncol_offset = col * block_dim;
        int nrow_dev, ncol_dev;
        margin_adjustment(nrowb, ncolb, block_dim, row, col, &nrow_dev, &ncol_dev);
        double *starting_point = &B[nrow_offset + ncol_offset*ldb];
        cublasGetMatrixAsync(nrow_dev, ncol_dev, sizeof(double),
                             buffer_dev[current_stream + switch_tracker[temp]*stream_num],
                             block_dim, starting_point, ldb, stream[current_stream]);
        cudaStreamSynchronize(stream[current_stream]);
    }
}
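/*
 * margin_adjustment() is called by every collector above but not defined in
 * this section. Judging from the call sites, it clamps a tile's extents at
 * the matrix boundary so edge tiles can be smaller than block_dim. A minimal
 * sketch under that assumption (the real BLASX helper may differ):
 */
static void margin_adjustment_sketch(int nrow, int ncol, int block_dim,
                                     int row_idx, int col_idx,
                                     int *tile_rows, int *tile_cols)
{
    int rows_left = nrow - row_idx*block_dim;   /* rows left at this margin */
    int cols_left = ncol - col_idx*block_dim;
    *tile_rows = (rows_left < block_dim) ? rows_left : block_dim;
    *tile_cols = (cols_left < block_dim) ? cols_left : block_dim;
}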
void magma_getmatrix_async(
    magma_int_t m, magma_int_t n, size_t elemSize,
    void const* dA_src, magma_int_t lda,
    void*       hB_dst, magma_int_t ldb,
    cudaStream_t stream )
{
    cublasStatus_t status;
    status = cublasGetMatrixAsync(
        m, n, elemSize,
        dA_src, lda,
        hB_dst, ldb, stream );
    check_error( status );
}
void magma_getmatrix_async_internal(
    magma_int_t m, magma_int_t n, magma_int_t elemSize,
    void const* dA_src, magma_int_t lda,
    void*       hB_dst, magma_int_t ldb,
    cudaStream_t stream,
    const char* func, const char* file, int line )
{
    cublasStatus_t status;
    status = cublasGetMatrixAsync(
        m, n, elemSize,
        dA_src, lda,
        hB_dst, ldb, stream );
    check_xerror( status, func, file, line );
}
void magma_sgetmatrix_async_internal(
    magma_int_t m, magma_int_t n,
    float const* dA_src, magma_int_t lda,
    float*       hB_dst, magma_int_t ldb,
    cudaStream_t stream,
    const char* func, const char* file, int line )
{
    cublasStatus_t status;
    status = cublasGetMatrixAsync(
        m, n, sizeof(float),
        dA_src, lda,
        hB_dst, ldb, stream );
    check_xerror( status, func, file, line );
}
// --------------------
extern "C" void magma_zgetmatrix_async_internal(
    magma_int_t m, magma_int_t n,
    magmaDoubleComplex_const_ptr dA_src, magma_int_t lda,
    magmaDoubleComplex*          hB_dst, magma_int_t ldb,
    magma_queue_t queue,
    const char* func, const char* file, int line )
{
    cublasStatus_t status;
    status = cublasGetMatrixAsync(
        m, n, sizeof(magmaDoubleComplex),
        dA_src, lda,
        hB_dst, ldb, queue );
    check_xerror( status, func, file, line );
}
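/*
 * The _internal suffix plus the trailing (func, file, line) parameters
 * suggest the public MAGMA entry points are macros that capture the call
 * site for error reporting. A sketch of that convention (assumed, not
 * copied from this section):
 */
#define magma_zgetmatrix_async( m, n, dA_src, lda, hB_dst, ldb, queue ) \
        magma_zgetmatrix_async_internal( m, n, dA_src, lda, hB_dst, ldb, queue, \
                                         __func__, __FILE__, __LINE__ )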
/***************************************************************************//**
    @fn magma_getmatrix( m, n, elemSize, dA_src, ldda, hB_dst, ldb, queue )

    Copy all or part of matrix dA_src on GPU device to hB_dst on CPU host.
    Elements may be arbitrary size. Type-safe versions set elemSize
    appropriately.

    This version synchronizes the queue after the transfer.
    See magma_getmatrix_async() for an asynchronous version.

    @param[in]  m        Number of rows of matrix A. m >= 0.
    @param[in]  n        Number of columns of matrix A. n >= 0.
    @param[in]  elemSize Size of each element, e.g., sizeof(double).
    @param[in]  dA_src   Source array of dimension (ldda,n), on GPU device.
    @param[in]  ldda     Leading dimension of matrix A. ldda >= m.
    @param[out] hB_dst   Destination array of dimension (ldb,n), on CPU host.
    @param[in]  ldb      Leading dimension of matrix B. ldb >= m.
    @param[in]  queue    Queue to execute in.

    @ingroup magma_getmatrix
*******************************************************************************/
extern "C" void magma_getmatrix_q_internal(
    magma_int_t m, magma_int_t n, magma_int_t elemSize,
    magma_const_ptr dA_src, magma_int_t ldda,
    void*           hB_dst, magma_int_t ldb,
    magma_queue_t queue,
    const char* func, const char* file, int line )
{
    assert( queue != NULL );
    cublasStatus_t status;
    status = cublasGetMatrixAsync(
        int(m), int(n), int(elemSize),
        dA_src, int(ldda),
        hB_dst, int(ldb), queue->cuda_stream() );
    cudaStreamSynchronize( queue->cuda_stream() );
    check_xerror( status, func, file, line );
}
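/*
 * Usage sketch for the synchronous copy documented above, using the generic
 * signature from the @fn line. magma_queue_create/magma_queue_destroy are
 * assumed from MAGMA's public API; allocation and error checks are elided.
 */
#include "magma_v2.h"   /* assumed MAGMA 2.x public header */

void example_sync_download( const double *dA, magma_int_t m, magma_int_t n,
                            magma_int_t ldda, double *hB, magma_int_t ldb )
{
    magma_queue_t queue;
    magma_queue_create( 0 /* device */, &queue );
    /* Returns only after the m-by-n block has arrived in hB, since this
       version synchronizes the queue after the transfer. */
    magma_getmatrix( m, n, sizeof(double), dA, ldda, hB, ldb, queue );
    magma_queue_destroy( queue );
}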
/***************************************************************************//**
    @fn magma_getmatrix_async( m, n, elemSize, dA_src, ldda, hB_dst, ldb, queue )

    Copy all or part of matrix dA_src on GPU device to hB_dst on CPU host.
    Elements may be arbitrary size. Type-safe versions set elemSize
    appropriately.

    This version is asynchronous: it may return before the transfer finishes,
    if hB_dst is pinned CPU memory.
    See magma_getmatrix() for a synchronous version.

    @param[in]  m        Number of rows of matrix A. m >= 0.
    @param[in]  n        Number of columns of matrix A. n >= 0.
    @param[in]  elemSize Size of each element, e.g., sizeof(double).
    @param[in]  dA_src   Source array of dimension (ldda,n), on GPU device.
    @param[in]  ldda     Leading dimension of matrix A. ldda >= m.
    @param[out] hB_dst   Destination array of dimension (ldb,n), on CPU host.
    @param[in]  ldb      Leading dimension of matrix B. ldb >= m.
    @param[in]  queue    Queue to execute in.

    @ingroup magma_getmatrix
*******************************************************************************/
extern "C" void magma_getmatrix_async_internal(
    magma_int_t m, magma_int_t n, magma_int_t elemSize,
    magma_const_ptr dA_src, magma_int_t ldda,
    void*           hB_dst, magma_int_t ldb,
    magma_queue_t queue,
    const char* func, const char* file, int line )
{
    cudaStream_t stream = NULL;
    if ( queue != NULL ) {
        stream = queue->cuda_stream();
    }
    else {
        fprintf( stderr, "Warning: %s got NULL queue\n", __func__ );
    }
    cublasStatus_t status;
    status = cublasGetMatrixAsync(
        int(m), int(n), int(elemSize),
        dA_src, int(ldda),
        hB_dst, int(ldb), stream );
    check_xerror( status, func, file, line );
}
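/*
 * Async usage sketch: as the documentation above notes, the call only
 * returns early (and overlaps) when hB_dst is pinned host memory.
 * cudaMallocHost/cudaFreeHost allocate and release pinned memory;
 * magma_queue_sync is assumed from MAGMA's public API.
 */
#include <cuda_runtime.h>
#include "magma_v2.h"   /* assumed MAGMA 2.x public header */

void example_async_download( const double *dA, magma_int_t m, magma_int_t n,
                             magma_int_t ldda, magma_int_t ldb,
                             magma_queue_t queue )
{
    double *hB;
    cudaMallocHost( (void**)&hB, (size_t)ldb * n * sizeof(double) ); /* pinned */
    magma_getmatrix_async( m, n, sizeof(double), dA, ldda, hB, ldb, queue );
    /* ... independent host work may run here, overlapped with the copy ... */
    magma_queue_sync( queue );   /* wait before reading hB */
    cudaFreeHost( hB );
}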
void Cpsgecopy_general_async(int m, int n,
                             void *A, int ia, int ja, int *descA,
                             void *B, int ib, int jb, int *descB,
                             int is_device_to_host)
{
#define dA(i,j)  (((float *)A) + IDX2F(i,j,descA[LLD_]))
#define dT(i,j)  (((float *)T) + IDX2F(i,j,descT[LLD_]))
#define dB(i,j)  (((float *)B) + IDX2F(i,j,descB[LLD_]))

    /*
     * Perform the copy
     *   B( ib:(ib+m-1), jb:(jb+n-1) ) <- A( ia:(ia+m-1), ja:(ja+n-1) )
     */
    const int use_MallocHost = FALSE;
    const int use_igsum2d = FALSE;

    cublasStatus cu_status;
    cudaError_t cuda_status;

    char notrans[] = "NoTrans";

    int descT[DLEN_];

    int ldA, ldB, ldT;
    int is_same_context, is_same_mb, is_same_nb;
    int is_same_p, is_same_q;
    int is_same_offset;
    int is_same_Locp, is_same_Locq;
    int is_aligned;

    int lrA1, lcA1, lrA2, lcA2;
    int lrT1, lcT1, lrT2, lcT2;
    int lrB1, lcB1, lrB2, lcB2;
    int rsrc, csrc;
    int rsrcA1, csrcA1, rsrcA2, csrcA2;
    int rsrcB1, csrcB1, rsrcB2, csrcB2;
    int iia, jja, iib, jjb;
    int icontxt, nprow, npcol, myprow, mypcol;
    int LocpA, LocqA, LocpB, LocqB, LocpT, LocqT;
    int mm, nn, lmm, lnn;
    size_t nbytes;

    float one_[REAL_PART+IMAG_PART+1];
    float *one = &(one_[0]);
    float zero_[REAL_PART+IMAG_PART+1];
    float *zero = &(zero_[0]);
    float alpha_[REAL_PART+IMAG_PART+1];
    float *alpha = &(alpha_[0]);
    float beta_[REAL_PART+IMAG_PART+1];
    float *beta = &(beta_[0]);

    int isize, isizeT;
    float *T = 0;

    int elemSize = sizeof(float);
    int nnb, jstart, jend, jsize;
    int is_ok;
    int nmax;
    const int bufsize = 1024*1024;
    const int use_simple = FALSE;

    one[REAL_PART] = 1.0;
    one[IMAG_PART] = 0.0;
    zero[REAL_PART] = 0.0;
    zero[IMAG_PART] = 0.0;

    if ((m <= 0) || (n <= 0)) {
        return;
    };

    T = 0;

    ldA = descA[LLD_];
    ldB = descB[LLD_];

    icontxt = descA[CTXT_];
    Cblacs_gridinfo( icontxt, &nprow, &npcol, &myprow, &mypcol );
    assert( nprow >= 1 );
    assert( npcol >= 1 );
    assert( (0 <= myprow) && (myprow < nprow) );
    assert( (0 <= mypcol) && (mypcol < npcol) );

    is_ok = (1 <= ia) && (ia + m-1 <= descA[M_]);
    if (!is_ok) {
        printf("Cpsgecopy (%d,%d) :ia %d m %d descA[M_] %d \n",
               myprow, mypcol, ia, m, descA[M_] );
        printf("Cpsgecopy (%d,%d) :ja %d n %d descA[N_] %d \n",
               myprow, mypcol, ja, n, descA[N_] );
        printf("Cpsgecopy (%d,%d) :ib %d jb %d descB[M_] %d descB[N_] %d\n",
               myprow, mypcol, ib, jb, descB[M_], descB[N_] );
    };
    assert( (1 <= ia) && (ia + m-1 <= descA[M_]) );
    assert( (1 <= ja) && (ja + n-1 <= descA[N_]) );
    assert( (1 <= ib) && (ib + m-1 <= descB[M_]) );
    assert( (1 <= jb) && (jb + n-1 <= descB[N_]) );

    /* The fast path below requires the source and destination distributions
       to be identical: same grid, same blocking, same owners, same offsets. */
    is_same_context = (descA[CTXT_] == descB[CTXT_]);
    is_same_mb = (descA[MB_] == descB[MB_]);
    is_same_nb = (descA[NB_] == descB[NB_]);

    is_same_p = (Cindxg2p(ia, descA[MB_], myprow, descA[RSRC_], nprow) ==
                 Cindxg2p(ib, descB[MB_], myprow, descB[RSRC_], nprow));
    is_same_q = (Cindxg2p(ja, descA[NB_], mypcol, descA[CSRC_], npcol) ==
                 Cindxg2p(jb, descB[NB_], mypcol, descB[CSRC_], npcol));

    is_same_offset = (MOD(ia, descA[MB_]) == MOD(ib, descB[MB_])) &&
                     (MOD(ja, descA[NB_]) == MOD(jb, descB[NB_]));

    local_extent( m, n, ia, ja, descA, &LocpA, &LocqA, &lrA1, &lcA1, &lrA2, &lcA2 );
    local_extent( m, n, ib, jb, descB, &LocpB, &LocqB, &lrB1, &lcB1, &lrB2, &lcB2 );

    /*
    if ((LocpA >= 1) || (LocpB >= 1)) {
        is_same_Locp = (LocpA == LocpB);
    };
    if ((LocqA >= 1) || (LocqB >= 1)) {
        is_same_Locq = (LocqA == LocqB);
    };
    */
    is_same_Locq = (LocqA == LocqB);
    is_same_Locp = (LocpA == LocpB);

    is_aligned = is_same_context && is_same_mb && is_same_nb &&
                 is_same_p && is_same_q && is_same_offset &&
                 is_same_Locp && is_same_Locq;

    assert( is_same_q );
    assert( is_same_p );
    assert( is_same_offset );
    assert( is_same_Locp );
    assert( is_same_Locq );
    assert( is_aligned );

    /* No communication required: copy the local tiles directly. */
    ldA = descA[LLD_];
    ldB = descB[LLD_];

    mm = LocpA;
    nn = LocqA;

    if (is_device_to_host) {
        /* Transfer from device to host. */
        if ( (mm >= 1) && (nn >= 1) ) {
#ifdef USE_CUBLASV2
            {
                cublasStatus_t istatus;
                istatus = cublasGetMatrixAsync( mm, nn, elemSize,
                                                (void *) dA(lrA1,lcA1), ldA,
                                                (void *) dB(lrB1,lcB1), ldB,
                                                cublas_get_stream() );
                assert( istatus == CUBLAS_STATUS_SUCCESS );
            }
#else
            cu_status = cublasGetMatrix( mm, nn, elemSize,
                                         (void *) dA(lrA1,lcA1), ldA,
                                         (void *) dB(lrB1,lcB1), ldB );
            CHKERR(cu_status);
#endif
        };
    }
    else {
        /* Transfer from host to device. */
        if ( (mm >= 1) && (nn >= 1) ) {
#ifdef USE_CUBLASV2
            {
                cublasStatus_t istatus;
                istatus = cublasSetMatrixAsync( mm, nn, elemSize,
                                                (void *) dA(lrA1,lcA1), ldA,
                                                (void *) dB(lrB1,lcB1), ldB,
                                                cublas_get_stream() );
                assert( istatus == CUBLAS_STATUS_SUCCESS );
            }
#else
            cu_status = cublasSetMatrix( mm, nn, elemSize,
                                         (void *) dA(lrA1,lcA1), ldA,
                                         (void *) dB(lrB1,lcB1), ldB );
            CHKERR(cu_status);
#endif
        };
    };

    return;
}
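/*
 * The #ifdef USE_CUBLASV2 split above picks between the legacy blocking
 * cublasGetMatrix and the stream-based cublasGetMatrixAsync. A standalone
 * illustration of that device-to-host pair; the helper name and the
 * NULL-stream-means-blocking convention are illustrative only.
 */
#include <cublas_v2.h>

static cublasStatus_t copy_tile_d2h( int mm, int nn,
                                     const float *dA, int ldA,
                                     float *hB, int ldB, cudaStream_t s )
{
    if (s != NULL) {
        /* Asynchronous: requires pinned hB for a true overlap. */
        return cublasGetMatrixAsync( mm, nn, sizeof(float),
                                     dA, ldA, hB, ldB, s );
    }
    /* Blocking: returns once the transfer has completed. */
    return cublasGetMatrix( mm, nn, sizeof(float), dA, ldA, hB, ldB );
}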
void blasx_gpu_dgemm_kernel(int j,
                            int nrowa, int ncola,
                            int nrowb, int ncolb,
                            int nrowc, int ncolc,
                            int current_task, int prior_task,
                            enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
                            double *A, double *B, double *C,
                            int lda, int ldb, int ldc,
                            int x, int y, int z,
                            double **C_dev,
                            cudaStream_t *stream, cublasHandle_t *handle_p,
                            int current_stream,
                            double alpha, double beta,
                            int block_dim, int switcher, int *task_batch_counter,
                            LRU_t **LRUs, int GPUs,
                            int *mem_cpy_counter, reader_tracker *addr_track,
                            int GPU_id)
{
    int nrowa_dev, nrowb_dev, nrowc_dev;
    int ncola_dev, ncolb_dev, ncolc_dev;
    int nrow_offset_a, nrow_offset_b;
    int ncol_offset_a, ncol_offset_b;
    int i = current_task / (y+1);
    int k = current_task % (y+1);
    double *A_dev, *B_dev;
    /* Tile extents, clamped at the matrix margins; the transpose flags decide
       which (row, col) tile of A and B this task touches. */
    if (TransA != CblasNoTrans) {
        margin_adjustment(nrowa, ncola, block_dim, j, i, &nrowa_dev, &ncola_dev);
    } else {
        margin_adjustment(nrowa, ncola, block_dim, i, j, &nrowa_dev, &ncola_dev);
    }
    if (TransB != CblasNoTrans) {
        margin_adjustment(nrowb, ncolb, block_dim, k, j, &nrowb_dev, &ncolb_dev);
    } else {
        margin_adjustment(nrowb, ncolb, block_dim, j, k, &nrowb_dev, &ncolb_dev);
    }
    margin_adjustment(nrowc, ncolc, block_dim, i, k, &nrowc_dev, &ncolc_dev);
    if (TransA != CblasNoTrans) {
        nrow_offset_a = j*block_dim, ncol_offset_a = i*block_dim;
    } else {
        nrow_offset_a = i*block_dim, ncol_offset_a = j*block_dim;
    }
    if (TransB != CblasNoTrans) {
        nrow_offset_b = k*block_dim, ncol_offset_b = j*block_dim;
    } else {
        nrow_offset_b = j*block_dim, ncol_offset_b = k*block_dim;
    }
    double *starting_point_A = &A[nrow_offset_a + ncol_offset_a*lda];
    double *starting_point_B = &B[nrow_offset_b + ncol_offset_b*ldb];
    // Asynchronously stage the A and B tiles on the GPU.
    //----------------LRU&RBT optimization----------------//
    mem_control_kernel_double(starting_point_A, &A_dev, LRUs, GPUs, GPU_id, block_dim,
                              mem_cpy_counter, addr_track, stream,
                              nrowa_dev, ncola_dev, lda);
    mem_control_kernel_double(starting_point_B, &B_dev, LRUs, GPUs, GPU_id, block_dim,
                              mem_cpy_counter, addr_track, stream,
                              nrowb_dev, ncolb_dev, ldb);
    //----------------------------------------------------//
    if (j == 0) {
        margin_adjustment(nrowc, ncolc, block_dim, i, k, &nrowc_dev, &ncolc_dev);
        int nrow_offset_c = i*block_dim;
        int ncol_offset_c = k*block_dim;
        double *starting_point_C = &C[nrow_offset_c + ncol_offset_c*ldc];
        if (beta != 0) {
            assert( cublasSetMatrixAsync(nrowc_dev, ncolc_dev, sizeof(double),
                                         starting_point_C, ldc,
                                         C_dev[switcher*STREAMNUM + current_stream],
                                         block_dim, *stream) == CUBLAS_STATUS_SUCCESS );
        }
        if (*task_batch_counter != 0) { // Copy the previous batch's C tile back.
            int i_pre = prior_task / (y+1);
            int k_pre = prior_task % (y+1);
            int nrowc_dev_pre, ncolc_dev_pre;
            margin_adjustment(nrowc, ncolc, block_dim, i_pre, k_pre,
                              &nrowc_dev_pre, &ncolc_dev_pre);
            int nrow_offset_c_pre = i_pre*block_dim;
            int ncol_offset_c_pre = k_pre*block_dim;
            double *starting_point_C_pre = &C[nrow_offset_c_pre + ncol_offset_c_pre*ldc];
            assert( cublasGetMatrixAsync(nrowc_dev_pre, ncolc_dev_pre, sizeof(double),
                                         C_dev[current_stream + (1-switcher)*STREAMNUM],
                                         block_dim,
                                         starting_point_C_pre, ldc,
                                         *stream) == CUBLAS_STATUS_SUCCESS );
        }
    }
    cudaStreamSynchronize(*stream);
    assert( cublasSetStream(*handle_p, *stream) == CUBLAS_STATUS_SUCCESS );
    double beta_inner = (j == 0) ? beta : 1;
    int ka = (TransA != CblasNoTrans) ? nrowa_dev : ncola_dev;
    cublasOperation_t transa, transb;
    CBLasTransToCuBlasTrans(TransA, &transa);
    CBLasTransToCuBlasTrans(TransB, &transb);
    cublasStatus_t status = cublasDgemm(*handle_p, transa, transb,
                                        nrowc_dev, ncolc_dev, ka,
                                        &alpha, A_dev, block_dim,
                                        B_dev, block_dim, &beta_inner,
                                        C_dev[switcher*STREAMNUM + current_stream],
                                        block_dim);
    assert( status == CUBLAS_STATUS_SUCCESS );
}
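/*
 * CBLasTransToCuBlasTrans() is used above but not defined in this section;
 * from its use it evidently maps a CBLAS transpose flag to the cuBLAS
 * operation enum. A sketch under that assumption:
 */
#include <cblas.h>
#include <cublas_v2.h>

static void cblas_trans_to_cublas_sketch( enum CBLAS_TRANSPOSE trans,
                                          cublasOperation_t *op )
{
    switch (trans) {
        case CblasNoTrans:   *op = CUBLAS_OP_N; break;
        case CblasTrans:     *op = CUBLAS_OP_T; break;
        case CblasConjTrans: *op = CUBLAS_OP_C; break;
        default:             *op = CUBLAS_OP_N; break;  /* defensive default */
    }
}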