void transferMatrix(const Stream& stream, const PinnedHostMatrix& matrixHost, PRECISION* deviceMemoryPosition) {
  const cudaStream_t& cudaStream = stream.getCudaStream();
  const int numberOfRows = matrixHost.getNumberOfRows();
  const int numberOfColumns = matrixHost.getNumberOfColumns();
  const PRECISION* hostMatrixPointer = matrixHost.getMemoryPointer();

  handleCublasStatus(
      cublasSetMatrixAsync(numberOfRows, numberOfColumns, sizeof(PRECISION), hostMatrixPointer, numberOfRows,
          deviceMemoryPosition, numberOfRows, cudaStream),
      "Error when transferring matrix from host to device point: ");
}
void magma_setmatrix_async(
    magma_int_t m, magma_int_t n, size_t elemSize,
    void const* hA_src, magma_int_t lda,
    void*       dB_dst, magma_int_t ldb,
    cudaStream_t stream )
{
    cublasStatus_t status;
    status = cublasSetMatrixAsync(
        m, n, elemSize,
        hA_src, lda,
        dB_dst, ldb, stream );
    check_error( status );
}
void magma_setmatrix_async_internal(
    magma_int_t m, magma_int_t n, magma_int_t elemSize,
    void const* hA_src, magma_int_t lda,
    void*       dB_dst, magma_int_t ldb,
    cudaStream_t stream,
    const char* func, const char* file, int line )
{
    cublasStatus_t status;
    status = cublasSetMatrixAsync(
        m, n, elemSize,
        hA_src, lda,
        dB_dst, ldb, stream );
    check_xerror( status, func, file, line );
}
DeviceMatrix* transferMatrix(const Stream& stream, const PinnedHostMatrix& matrixHost) {
  const cudaStream_t& cudaStream = stream.getCudaStream();
  const int numberOfRows = matrixHost.getNumberOfRows();
  const int numberOfColumns = matrixHost.getNumberOfColumns();

  DeviceMatrix* deviceMatrix = new DeviceMatrix(numberOfRows, numberOfColumns);
  PRECISION* deviceMatrixPointer = deviceMatrix->getMemoryPointer();
  const PRECISION* hostMatrixPointer = matrixHost.getMemoryPointer();

  handleCublasStatus(
      cublasSetMatrixAsync(numberOfRows, numberOfColumns, sizeof(PRECISION), hostMatrixPointer, numberOfRows,
          deviceMatrixPointer, numberOfRows, cudaStream),
      "Error when transferring matrix from host to device: ");

  return deviceMatrix;
}
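/*
 * A minimal, self-contained sketch of the same host-to-device transfer pattern
 * used by the transferMatrix() helpers above, with the project-specific types
 * (Stream, PinnedHostMatrix, DeviceMatrix, handleCublasStatus) replaced by raw
 * CUDA/cuBLAS calls. Dimensions and names here are illustrative only.
 */
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <assert.h>

int main(void) {
    const int rows = 4, cols = 3;
    cudaStream_t stream;
    assert( cudaStreamCreate(&stream) == cudaSuccess );

    /* Pinned (page-locked) host memory: required for the copy to be
     * truly asynchronous with respect to the host. */
    float *hostMatrix;
    assert( cudaMallocHost((void**)&hostMatrix, rows*cols*sizeof(float)) == cudaSuccess );
    for (int i = 0; i < rows*cols; ++i) hostMatrix[i] = (float)i;

    float *deviceMatrix;
    assert( cudaMalloc((void**)&deviceMatrix, rows*cols*sizeof(float)) == cudaSuccess );

    /* Column-major copy of a rows-by-cols matrix; both leading dimensions
     * equal rows because neither side is a sub-block of a larger matrix. */
    assert( cublasSetMatrixAsync(rows, cols, sizeof(float),
                                 hostMatrix, rows,
                                 deviceMatrix, rows,
                                 stream) == CUBLAS_STATUS_SUCCESS );

    /* The transfer is only guaranteed complete after the stream syncs. */
    assert( cudaStreamSynchronize(stream) == cudaSuccess );

    cudaFree(deviceMatrix);
    cudaFreeHost(hostMatrix);
    cudaStreamDestroy(stream);
    return 0;
}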
// --------------------
extern "C" void
magma_zsetmatrix_async_internal(
    magma_int_t m, magma_int_t n,
    magmaDoubleComplex const* hA_src, magma_int_t lda,
    magmaDoubleComplex_ptr    dB_dst, magma_int_t ldb,
    magma_queue_t queue,
    const char* func, const char* file, int line )
{
    cublasStatus_t status;
    status = cublasSetMatrixAsync(
        m, n, sizeof(magmaDoubleComplex),
        hA_src, lda,
        dB_dst, ldb, queue );
    check_xerror( status, func, file, line );
}
/***************************************************************************//**
    @fn magma_setmatrix( m, n, elemSize, hA_src, lda, dB_dst, lddb, queue )

    Copy all or part of matrix hA_src on CPU host to dB_dst on GPU device.
    Elements may be arbitrary size.
    Type-safe versions set elemSize appropriately.

    This version synchronizes the queue after the transfer.
    See magma_setmatrix_async() for an asynchronous version.

    @param[in]
    m           Number of rows of matrix A. m >= 0.

    @param[in]
    n           Number of columns of matrix A. n >= 0.

    @param[in]
    elemSize    Size of each element, e.g., sizeof(double).

    @param[in]
    hA_src      Source array of dimension (lda,n), on CPU host.

    @param[in]
    lda         Leading dimension of matrix A. lda >= m.

    @param[out]
    dB_dst      Destination array of dimension (lddb,n), on GPU device.

    @param[in]
    lddb        Leading dimension of matrix B. lddb >= m.

    @param[in]
    queue       Queue to execute in.

    @ingroup magma_setmatrix
*******************************************************************************/
extern "C" void
magma_setmatrix_q_internal(
    magma_int_t m, magma_int_t n, magma_int_t elemSize,
    void const* hA_src, magma_int_t lda,
    magma_ptr   dB_dst, magma_int_t lddb,
    magma_queue_t queue,
    const char* func, const char* file, int line )
{
    assert( queue != NULL );
    cublasStatus_t status;
    status = cublasSetMatrixAsync(
        int(m), int(n), int(elemSize),
        hA_src, int(lda),
        dB_dst, int(lddb),
        queue->cuda_stream() );
    cudaStreamSynchronize( queue->cuda_stream() );
    check_xerror( status, func, file, line );
}
/***************************************************************************//**
    @fn magma_setmatrix_async( m, n, elemSize, hA_src, lda, dB_dst, lddb, queue )

    Copy all or part of matrix hA_src on CPU host to dB_dst on GPU device.
    Elements may be arbitrary size.
    Type-safe versions set elemSize appropriately.

    This version is asynchronous: it may return before the transfer finishes,
    if hA_src is pinned CPU memory.
    See magma_setmatrix() for a synchronous version.

    @param[in]
    m           Number of rows of matrix A. m >= 0.

    @param[in]
    n           Number of columns of matrix A. n >= 0.

    @param[in]
    elemSize    Size of each element, e.g., sizeof(double).

    @param[in]
    hA_src      Source array of dimension (lda,n), on CPU host.

    @param[in]
    lda         Leading dimension of matrix A. lda >= m.

    @param[out]
    dB_dst      Destination array of dimension (lddb,n), on GPU device.

    @param[in]
    lddb        Leading dimension of matrix B. lddb >= m.

    @param[in]
    queue       Queue to execute in.

    @ingroup magma_setmatrix
*******************************************************************************/
extern "C" void
magma_setmatrix_async_internal(
    magma_int_t m, magma_int_t n, magma_int_t elemSize,
    void const* hA_src, magma_int_t lda,
    magma_ptr   dB_dst, magma_int_t lddb,
    magma_queue_t queue,
    const char* func, const char* file, int line )
{
    // For backwards compatibility, accept NULL queue to mean NULL stream.
    cudaStream_t stream = NULL;
    if ( queue != NULL ) {
        stream = queue->cuda_stream();
    }
    else {
        fprintf( stderr, "Warning: %s got NULL queue\n", __func__ );
    }
    cublasStatus_t status;
    status = cublasSetMatrixAsync(
        int(m), int(n), int(elemSize),
        hA_src, int(lda),
        dB_dst, int(lddb),
        stream );
    check_xerror( status, func, file, line );
}
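/*
 * A sketch of how the wrappers above are typically reached through MAGMA's
 * type-safe macros (e.g. magma_dsetmatrix_async), which fill in elemSize and
 * the func/file/line arguments automatically. Queue creation follows MAGMA's
 * documented API, but exact signatures vary between MAGMA versions, so treat
 * this as illustrative rather than authoritative.
 */
#include "magma_v2.h"

void example_set(double *hA, magmaDouble_ptr dA, magma_int_t m, magma_int_t n)
{
    magma_queue_t queue;
    magma_queue_create( 0 /* device */, &queue );

    /* Expands to magma_setmatrix_async_internal(..., __func__, __FILE__, __LINE__). */
    magma_dsetmatrix_async( m, n, hA, m, dA, m, queue );

    magma_queue_sync( queue );    /* wait for the transfer to finish */
    magma_queue_destroy( queue );
}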
void Cpsgecopy_general_async(int m, int n,
                             void *A, int ia, int ja, int *descA,
                             void *B, int ib, int jb, int *descB,
                             int is_device_to_host)
{
#define dA(i,j) (((float*)A) + IDX2F(i,j,descA[LLD_]))
#define dT(i,j) (((float*)T) + IDX2F(i,j,descT[LLD_]))
#define dB(i,j) (((float*)B) + IDX2F(i,j,descB[LLD_]))

  /*
   * perform copy
   * B( ib:(ib+m-1), jb:(jb+n-1) ) <- A( ia:(ia+m-1), ja:(ja+n-1) )
   */

  const int use_MallocHost = FALSE;
  const int use_igsum2d = FALSE;

  cublasStatus cu_status;
  cudaError_t cuda_status;

  char notrans[] = "NoTrans";

  int descT[DLEN_];

  int ldA, ldB, ldT;
  int is_same_context, is_same_mb, is_same_nb;
  int is_same_p, is_same_q;
  int is_same_offset;
  int is_same_Locp, is_same_Locq;
  int is_aligned;

  int lrA1, lcA1, lrA2, lcA2;
  int lrT1, lcT1, lrT2, lcT2;
  int lrB1, lcB1, lrB2, lcB2;
  int rsrc, csrc;
  int rsrcA1, csrcA1, rsrcA2, csrcA2;
  int rsrcB1, csrcB1, rsrcB2, csrcB2;
  int iia, jja, iib, jjb;
  int icontxt, nprow, npcol, myprow, mypcol;
  int LocpA, LocqA, LocpB, LocqB, LocpT, LocqT;
  int mm, nn, lmm, lnn;
  size_t nbytes;

  float one_[REAL_PART+IMAG_PART+1];
  float *one = &(one_[0]);
  float zero_[REAL_PART+IMAG_PART+1];
  float *zero = &(zero_[0]);
  float alpha_[REAL_PART+IMAG_PART+1];
  float *alpha = &(alpha_[0]);
  float beta_[REAL_PART+IMAG_PART+1];
  float *beta = &(beta_[0]);

  int isize, isizeT;
  float *T = 0;

  int elemSize = sizeof(float);
  int nnb, jstart, jend, jsize;
  int is_ok;
  int nmax;
  const int bufsize = 1024*1024;
  const int use_simple = FALSE;

  one[REAL_PART] = 1.0;
  one[IMAG_PART] = 0.0;
  zero[REAL_PART] = 0.0;
  zero[IMAG_PART] = 0.0;

  if ((m <= 0) || (n <= 0)) {
    return;
  }

  T = 0;
  ldA = descA[LLD_];
  ldB = descB[LLD_];

  icontxt = descA[CTXT_];
  Cblacs_gridinfo( icontxt, &nprow, &npcol, &myprow, &mypcol );
  assert( nprow >= 1 );
  assert( npcol >= 1 );
  assert( (0 <= myprow) && (myprow < nprow) );
  assert( (0 <= mypcol) && (mypcol < npcol) );

  is_ok = (1 <= ia) && (ia + m - 1 <= descA[M_]);
  if (!is_ok) {
    printf("Cpsgecopy (%d,%d) :ia %d m %d descA[M_] %d \n",
           myprow, mypcol, ia, m, descA[M_] );
    printf("Cpsgecopy (%d,%d) :ja %d n %d descA[N_] %d \n",
           myprow, mypcol, ja, n, descA[N_] );
    printf("Cpsgecopy (%d,%d) :ib %d jb %d descB[M_] %d descB[N_] %d\n",
           myprow, mypcol, ib, jb, descB[M_], descB[N_] );
  }
  assert( (1 <= ia) && (ia + m - 1 <= descA[M_]) );
  assert( (1 <= ja) && (ja + n - 1 <= descA[N_]) );
  assert( (1 <= ib) && (ib + m - 1 <= descB[M_]) );
  assert( (1 <= jb) && (jb + n - 1 <= descB[N_]) );

  is_same_context = (descA[CTXT_] == descB[CTXT_]);
  is_same_mb = (descA[MB_] == descB[MB_]);
  is_same_nb = (descA[NB_] == descB[NB_]);

  is_same_p = (Cindxg2p(ia, descA[MB_], myprow, descA[RSRC_], nprow) ==
               Cindxg2p(ib, descB[MB_], myprow, descB[RSRC_], nprow));
  is_same_q = (Cindxg2p(ja, descA[NB_], mypcol, descA[CSRC_], npcol) ==
               Cindxg2p(jb, descB[NB_], mypcol, descB[CSRC_], npcol));

  is_same_offset = (MOD(ia, descA[MB_]) == MOD(ib, descB[MB_])) &&
                   (MOD(ja, descA[NB_]) == MOD(jb, descB[NB_]));

  local_extent( m, n, ia, ja, descA, &LocpA, &LocqA, &lrA1, &lcA1, &lrA2, &lcA2 );
  local_extent( m, n, ib, jb, descB, &LocpB, &LocqB, &lrB1, &lcB1, &lrB2, &lcB2 );

  /*
  if ((LocpA >= 1) || (LocpB >= 1)) {
    is_same_Locp = (LocpA == LocpB);
  }
  if ((LocqA >= 1) || (LocqB >= 1)) {
    is_same_Locq = (LocqA == LocqB);
  }
  */

  is_same_Locq = (LocqA == LocqB);
  is_same_Locp = (LocpA == LocpB);

  is_aligned = is_same_context && is_same_mb && is_same_nb &&
               is_same_p && is_same_q && is_same_offset &&
               is_same_Locp && is_same_Locq;

  assert( is_same_q );
  assert( is_same_p );
  assert( is_same_offset );
  assert( is_same_Locp );
  assert( is_same_Locq );
  assert( is_aligned );

  /* aligned distributions: no communication required,
   * just a local copy between host and device */

  ldA = descA[LLD_];
  ldB = descB[LLD_];

  mm = LocpA;
  nn = LocqA;

  if (is_device_to_host) {
    /* transfer from device to host */
    if ( (mm >= 1) && (nn >= 1) ) {
#ifdef USE_CUBLASV2
      {
        cublasStatus_t istatus;
        istatus = cublasGetMatrixAsync( mm, nn, elemSize,
                                        (void *) dA(lrA1,lcA1), ldA,
                                        (void *) dB(lrB1,lcB1), ldB,
                                        cublas_get_stream() );
        assert( istatus == CUBLAS_STATUS_SUCCESS );
      }
#else
      cu_status = cublasGetMatrix( mm, nn, elemSize,
                                   (void *) dA(lrA1,lcA1), ldA,
                                   (void *) dB(lrB1,lcB1), ldB );
      CHKERR(cu_status);
#endif
    }
  }
  else {
    /* transfer from host to device */
    if ( (mm >= 1) && (nn >= 1) ) {
#ifdef USE_CUBLASV2
      {
        cublasStatus_t istatus;
        istatus = cublasSetMatrixAsync( mm, nn, elemSize,
                                        (void *) dA(lrA1,lcA1), ldA,
                                        (void *) dB(lrB1,lcB1), ldB,
                                        cublas_get_stream() );
        assert( istatus == CUBLAS_STATUS_SUCCESS );
      }
#else
      cu_status = cublasSetMatrix( mm, nn, elemSize,
                                   (void *) dA(lrA1,lcA1), ldA,
                                   (void *) dB(lrB1,lcB1), ldB );
      CHKERR(cu_status);
#endif
    }
  }
  return;
}
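/*
 * The #ifdef above switches between the blocking entry points (cublasSetMatrix
 * / cublasGetMatrix) and the stream-ordered ones (...Async). The blocking form
 * behaves like the async form followed by a synchronize on the stream used.
 * A sketch of that correspondence on the legacy default stream; the helper
 * name is hypothetical and the mapping of sync failure onto a cuBLAS status
 * code is a simplification:
 */
#include <cublas_v2.h>
#include <cuda_runtime.h>

static cublasStatus_t set_matrix_blocking(int m, int n, int elemSize,
                                          const void *hA, int lda,
                                          void *dB, int ldb)
{
    /* Intended to mirror cublasSetMatrix(m, n, elemSize, hA, lda, dB, ldb). */
    cublasStatus_t s = cublasSetMatrixAsync(m, n, elemSize, hA, lda, dB, ldb,
                                            0 /* legacy default stream */);
    if (s != CUBLAS_STATUS_SUCCESS) return s;
    return (cudaStreamSynchronize(0) == cudaSuccess)
               ? CUBLAS_STATUS_SUCCESS : CUBLAS_STATUS_EXECUTION_ERROR;
}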
void blasx_gpu_dgemm_kernel(int j,
                            int nrowa, int ncola,
                            int nrowb, int ncolb,
                            int nrowc, int ncolc,
                            int current_task, int prior_task,
                            enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
                            double* A, double* B, double* C,
                            int lda, int ldb, int ldc,
                            int x, int y, int z,
                            double** C_dev,
                            cudaStream_t *stream, cublasHandle_t *handle_p, int current_stream,
                            double alpha, double beta,
                            int block_dim, int switcher, int* task_batch_counter,
                            LRU_t **LRUs, int GPUs,
                            int *mem_cpy_counter,
                            reader_tracker *addr_track,
                            int GPU_id)
{
    int nrowa_dev, nrowb_dev, nrowc_dev;
    int ncola_dev, ncolb_dev, ncolc_dev;
    int nrow_offset_a, nrow_offset_b;
    int ncol_offset_a, ncol_offset_b;
    int i = current_task/(y+1);
    int k = current_task%(y+1);
    double *A_dev, *B_dev;

    if (TransA != CblasNoTrans) {
        margin_adjustment(nrowa, ncola, block_dim, j, i, &nrowa_dev, &ncola_dev);
    } else {
        margin_adjustment(nrowa, ncola, block_dim, i, j, &nrowa_dev, &ncola_dev);
    }
    if (TransB != CblasNoTrans) {
        margin_adjustment(nrowb, ncolb, block_dim, k, j, &nrowb_dev, &ncolb_dev);
    } else {
        margin_adjustment(nrowb, ncolb, block_dim, j, k, &nrowb_dev, &ncolb_dev);
    }
    margin_adjustment(nrowc, ncolc, block_dim, i, k, &nrowc_dev, &ncolc_dev);
    if (TransA != CblasNoTrans) {
        nrow_offset_a = j*block_dim, ncol_offset_a = i*block_dim;
    } else {
        nrow_offset_a = i*block_dim, ncol_offset_a = j*block_dim;
    }
    if (TransB != CblasNoTrans) {
        nrow_offset_b = k*block_dim, ncol_offset_b = j*block_dim;
    } else {
        nrow_offset_b = j*block_dim, ncol_offset_b = k*block_dim;
    }
    double *starting_point_A = &A[nrow_offset_a+ncol_offset_a*lda];
    double *starting_point_B = &B[nrow_offset_b+ncol_offset_b*ldb];

    // Asynchronously stage the A and B tiles on the GPU.
    //----------------LRU&RBT optimization----------------//
    mem_control_kernel_double(starting_point_A, &A_dev, LRUs, GPUs, GPU_id, block_dim,
                              mem_cpy_counter, addr_track, stream, nrowa_dev, ncola_dev, lda);
    mem_control_kernel_double(starting_point_B, &B_dev, LRUs, GPUs, GPU_id, block_dim,
                              mem_cpy_counter, addr_track, stream, nrowb_dev, ncolb_dev, ldb);
    //----------------------------------------------------//

    if (j == 0) {
        margin_adjustment(nrowc, ncolc, block_dim, i, k, &nrowc_dev, &ncolc_dev);
        int nrow_offset_c = i*block_dim;
        int ncol_offset_c = k*block_dim;
        double *starting_point_C = &C[nrow_offset_c+ncol_offset_c*ldc];
        if (beta != 0) {
            assert( cublasSetMatrixAsync(nrowc_dev, ncolc_dev, sizeof(double),
                                         starting_point_C, ldc,
                                         C_dev[switcher*STREAMNUM+current_stream], block_dim,
                                         *stream) == CUBLAS_STATUS_SUCCESS );
        }
        if (*task_batch_counter != 0) { // copy the previous task's C tile back to the host
            int i_pre = prior_task/(y+1);
            int k_pre = prior_task%(y+1);
            int nrowc_dev_pre, ncolc_dev_pre;
            margin_adjustment(nrowc, ncolc, block_dim, i_pre, k_pre, &nrowc_dev_pre, &ncolc_dev_pre);
            int nrow_offset_c_pre = i_pre*block_dim;
            int ncol_offset_c_pre = k_pre*block_dim;
            double *starting_point_C_pre = &C[nrow_offset_c_pre+ncol_offset_c_pre*ldc];
            assert( cublasGetMatrixAsync(nrowc_dev_pre, ncolc_dev_pre, sizeof(double),
                                         C_dev[current_stream+(1-switcher)*STREAMNUM], block_dim,
                                         starting_point_C_pre, ldc,
                                         *stream) == CUBLAS_STATUS_SUCCESS );
        }
    }
    cudaStreamSynchronize(*stream);
    assert( cublasSetStream(*handle_p, *stream) == CUBLAS_STATUS_SUCCESS );

    double beta_inner = (j==0)?(beta):(1);
    int ka = (TransA != CblasNoTrans)?(nrowa_dev):(ncola_dev);
    cublasOperation_t transa, transb;
    CBLasTransToCuBlasTrans(TransA, &transa);
    CBLasTransToCuBlasTrans(TransB, &transb);
    cublasStatus_t status = cublasDgemm(*handle_p,
                                        transa, transb,
                                        nrowc_dev, ncolc_dev, ka,
                                        &alpha,
                                        A_dev, block_dim,
                                        B_dev, block_dim,
                                        &beta_inner,
                                        C_dev[switcher*STREAMNUM+current_stream], block_dim);
    assert( status == CUBLAS_STATUS_SUCCESS );
}
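/*
 * The switcher/STREAMNUM indexing above implements double buffering: the
 * current task stages and multiplies into C_dev[switcher*STREAMNUM +
 * current_stream] while C_dev[(1-switcher)*STREAMNUM + current_stream] still
 * holds the previous task's result awaiting copy-back, so neither transfer
 * has to wait for the other's buffer. The sketch below shows the idea in
 * isolation with two plain device buffers; all names are illustrative, not
 * BLASX's API, and host_tiles is assumed pinned (cudaMallocHost) with
 * ntiles >= 1.
 */
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <assert.h>

/* Round-trip `ntiles` column-major n-by-n tiles through two device buffers. */
static void double_buffered_roundtrip(double *host_tiles, int n, int ntiles,
                                      cudaStream_t stream)
{
    double *buf[2];
    assert( cudaMalloc((void**)&buf[0], (size_t)n*n*sizeof(double)) == cudaSuccess );
    assert( cudaMalloc((void**)&buf[1], (size_t)n*n*sizeof(double)) == cudaSuccess );

    int switcher = 0;
    for (int t = 0; t < ntiles; ++t) {
        /* Upload the current tile into one buffer... */
        assert( cublasSetMatrixAsync(n, n, sizeof(double),
                                     host_tiles + (size_t)t*n*n, n,
                                     buf[switcher], n, stream) == CUBLAS_STATUS_SUCCESS );
        /* ...while the previous tile's result drains from the other. */
        if (t > 0) {
            assert( cublasGetMatrixAsync(n, n, sizeof(double),
                                         buf[1-switcher], n,
                                         host_tiles + (size_t)(t-1)*n*n, n,
                                         stream) == CUBLAS_STATUS_SUCCESS );
        }
        /* (compute on buf[switcher] would be enqueued here) */
        switcher = 1 - switcher;
    }
    /* Drain the last tile, then wait for the stream. */
    assert( cublasGetMatrixAsync(n, n, sizeof(double),
                                 buf[1-switcher], n,
                                 host_tiles + (size_t)(ntiles-1)*n*n, n,
                                 stream) == CUBLAS_STATUS_SUCCESS );
    assert( cudaStreamSynchronize(stream) == cudaSuccess );
    cudaFree(buf[0]);
    cudaFree(buf[1]);
}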
void mem_control_kernel_float(float *starting_point_A, float **A_dev,
                              LRU_t **LRUs, const int GPUs, const int GPU_id,
                              int block_dim, int *mem_cpy_counter,
                              reader_tracker *addr_track,
                              cudaStream_t *stream,
                              int nrowa_dev, int ncola_dev, int lda)
{
    rbt_node* block_A = rbt_find(starting_point_A, &(LRUs[GPU_id]->hash_map));
    if (block_A == NULL) {
        // Block is not cached on this GPU: search neighboring GPUs for a
        // usable copy, walking outward from GPU_id in both directions.
        int search_l_GPU = GPU_id-1;
        int search_r_GPU = GPU_id+1;
        rbt_node *block_A_l = NULL;
        rbt_node *block_A_r = NULL;
        while (block_A_l == NULL && block_A_r == NULL) {
            if (search_l_GPU >= 0) {
                block_A_l = rbt_find(starting_point_A, &(LRUs[search_l_GPU]->hash_map));
                if (block_A_l != NULL) {
                    if (block_A_l->associated_LRU_elem->is_trans_done == 0) {
                        int peer_access_check = 0;
                        cudaDeviceCanAccessPeer(&peer_access_check, GPU_id, search_l_GPU);
                        if (peer_access_check == 1) block_A_l = NULL;
                    }
                }
                search_l_GPU--;
            }
            if (search_r_GPU < GPUs) {
                block_A_r = rbt_find(starting_point_A, &(LRUs[search_r_GPU]->hash_map));
                if (block_A_r != NULL) {
                    if (block_A_r->associated_LRU_elem->is_trans_done == 0) {
                        int peer_access_check = 0;
                        cudaDeviceCanAccessPeer(&peer_access_check, GPU_id, search_r_GPU);
                        if (peer_access_check == 1) block_A_r = NULL;
                    }
                }
                search_r_GPU++;
            }
            if (search_l_GPU < 0 && search_r_GPU >= GPUs) {
                break;
            }
        }
        // rectification: undo the final increments so the indices point at
        // the last GPUs actually examined
        search_l_GPU++;
        search_r_GPU--;
        assert(search_l_GPU >= 0 && search_l_GPU < GPUs);
        assert(search_r_GPU >= 0 && search_r_GPU < GPUs);
        if ( !(block_A_l == NULL && block_A_r == NULL) ) {
            // inter-GPU communication: fetch the block from the closer peer
            int target_GPU_id = 0;
            if (block_A_l != NULL && block_A_r != NULL) {
                if (ABS(search_l_GPU - GPU_id) > ABS(search_r_GPU - GPU_id)) {
                    target_GPU_id = search_r_GPU;
                    block_A = block_A_r;
                } else if (ABS(search_l_GPU - GPU_id) < ABS(search_r_GPU - GPU_id)) {
                    target_GPU_id = search_l_GPU;
                    block_A = block_A_l;
                } else {
                    // equidistant: pick a side at random
                    int rand_select = rand()%10;
                    if (rand_select < 5) {
                        target_GPU_id = search_l_GPU;
                        block_A = block_A_l;
                    } else {
                        target_GPU_id = search_r_GPU;
                        block_A = block_A_r;
                    }
                }
                if (block_A->associated_LRU_elem->is_trans_done != 1) goto new_block;
            } else {
                if (block_A_l != NULL && block_A_r == NULL) {
                    target_GPU_id = search_l_GPU;
                    block_A = block_A_l;
                } else if (block_A_r != NULL && block_A_l == NULL) {
                    target_GPU_id = search_r_GPU;
                    block_A = block_A_r;
                }
                if (block_A->associated_LRU_elem->is_trans_done != 1) goto new_block;
            }
            if (rbt_find(starting_point_A, &(LRUs[target_GPU_id]->hash_map)) == NULL) goto new_block;
            atomic_reader_plus(block_A);
            *A_dev = (float*) LRU_in(starting_point_A, LRUs[GPU_id],
                                     sizeof(float)*block_dim*block_dim, GPU_id);
            assert( rbt_find(starting_point_A, &(LRUs[target_GPU_id]->hash_map)) != NULL );
            assert( rbt_find(starting_point_A, &(LRUs[target_GPU_id]->hash_map))->associated_LRU_elem->is_trans_done == 1 );
            assert( cudaMemcpyPeerAsync(*A_dev, GPU_id,
                                        block_A->associated_LRU_elem->GPU_p, target_GPU_id,
                                        sizeof(float)*block_dim*block_dim, *stream) == cudaSuccess );
            // cannot dequeue the GPU memory on the target GPU
            addr_track[*mem_cpy_counter].addr = starting_point_A;
            addr_track[*mem_cpy_counter].GPU_id = target_GPU_id;
            addr_track[*mem_cpy_counter].is_trans_done = 1;
            (*mem_cpy_counter) += 1;
            // cannot dequeue the newly allocated memory on the current GPU
            addr_track[*mem_cpy_counter].addr = starting_point_A;
            addr_track[*mem_cpy_counter].GPU_id = GPU_id;
            addr_track[*mem_cpy_counter].is_trans_done = 0;
            (*mem_cpy_counter) += 1;
        } else {
new_block:  // no usable copy on any GPU: bring the block in from the host
            (*A_dev) = (float*) LRU_in(starting_point_A, LRUs[GPU_id],
                                       sizeof(float)*block_dim*block_dim, GPU_id);
            assert( cublasSetMatrixAsync(nrowa_dev, ncola_dev, sizeof(float),
                                         starting_point_A, lda,
                                         *A_dev, block_dim,
                                         *stream) == CUBLAS_STATUS_SUCCESS );
            addr_track[*mem_cpy_counter].addr = starting_point_A;
            addr_track[*mem_cpy_counter].GPU_id = GPU_id;
            addr_track[*mem_cpy_counter].is_trans_done = 0;
            (*mem_cpy_counter) += 1;
        }
    } else {
        // Block already cached on this GPU: just promote it in the LRU.
        atomic_reader_plus(block_A);
        assert( rbt_find(starting_point_A, &(LRUs[GPU_id]->hash_map)) != NULL );
        *A_dev = (float*) LRU_reorder(starting_point_A, LRUs[GPU_id]);
        addr_track[*mem_cpy_counter].addr = starting_point_A;
        addr_track[*mem_cpy_counter].GPU_id = GPU_id;
        (*mem_cpy_counter) += 1;
    }
}
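/*
 * The cudaDeviceCanAccessPeer checks above only test reachability; for
 * cudaMemcpyPeerAsync to take the direct peer-to-peer path, access typically
 * also has to be enabled once per device pair (otherwise the copy is staged
 * through host memory, which is still correct). A minimal sketch, independent
 * of BLASX's LRU machinery; the helper name is hypothetical, and a real
 * implementation would restore the caller's current device afterwards.
 */
#include <cuda_runtime.h>
#include <assert.h>

static void copy_block_between_gpus(float *dst, int dst_gpu,
                                    const float *src, int src_gpu,
                                    size_t bytes, cudaStream_t stream)
{
    int can_access = 0;
    assert( cudaDeviceCanAccessPeer(&can_access, dst_gpu, src_gpu) == cudaSuccess );
    if (can_access) {
        cudaSetDevice(dst_gpu);
        /* Repeat calls return cudaErrorPeerAccessAlreadyEnabled; both are fine. */
        cudaError_t e = cudaDeviceEnablePeerAccess(src_gpu, 0);
        assert( e == cudaSuccess || e == cudaErrorPeerAccessAlreadyEnabled );
    }
    /* Works either way; falls back to staging through the host without P2P. */
    assert( cudaMemcpyPeerAsync(dst, dst_gpu, src, src_gpu, bytes, stream) == cudaSuccess );
}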
void copyBandMatrixToDevice(double *h_matrix, gpu_symm_band_matrix *gpu_matrix, cublasHandle_t handle)
{
    int num_tiles;
    int bs = gpu_matrix->block_size;
    int order = gpu_matrix->order;
    int hb = gpu_matrix->half_bandwith;
    int i;
    int cur_row = 0;
    int tile_end;
    int cur_bs;
    cublasStatus_t status;
    double *temp_tile;
    cudaStream_t streams[STREAM_COUNT];
    double **stream_tiles;
    int max_tile_elements;

    num_tiles = (order + bs - 1) / bs;
    gpu_matrix->gpu_matrix_tiles = (double **) malloc(num_tiles * sizeof(double *));
    gpu_matrix->tile_len = (int *) malloc(num_tiles * sizeof(int));

    stream_tiles = (double **) malloc(STREAM_COUNT * sizeof(double *));
    for (i = 0; i < STREAM_COUNT; i++) {
        cudaStreamCreate(&streams[i]);
        stream_tiles[i] = (double *) malloc((bs + hb) * bs * sizeof(double));
    }

    max_tile_elements = (bs + hb) * bs;
    checkCudaErrors(cudaMalloc(&(gpu_matrix->tiles_storage),
                               num_tiles * max_tile_elements * sizeof(double)));

    for (i = 0; i < num_tiles; i++) {
        tile_end = cur_row + bs - 1 + hb;
        if (tile_end >= order) {
            tile_end = order - 1;
        }
        cur_bs = bs;
        if (cur_row + bs > order) {
            cur_bs = order - cur_row;
        }
        gpu_matrix->tile_len[i] = tile_end - cur_row + 1;
        /*checkCudaErrors( cudaMalloc(&(gpu_matrix->gpu_matrix_tiles[i]),
                             cur_bs * gpu_matrix->tile_len[i] * sizeof(double)));*/
        gpu_matrix->gpu_matrix_tiles[i] = &(gpu_matrix->tiles_storage[max_tile_elements * i]);

        /* Wait for the previous transfer that used this stream's staging tile. */
        cudaStreamSynchronize(streams[i % STREAM_COUNT]);
        temp_tile = stream_tiles[i % STREAM_COUNT];
        init_temp_tile(cur_bs, cur_row, temp_tile, h_matrix,
                       gpu_matrix->tile_len[i], order, hb, bs);

        status = cublasSetMatrixAsync(cur_bs, gpu_matrix->tile_len[i], sizeof(double),
                                      temp_tile, bs,
                                      gpu_matrix->gpu_matrix_tiles[i], cur_bs,
                                      streams[i % STREAM_COUNT]);
        if (status != CUBLAS_STATUS_SUCCESS) {
            printf("COPY MATRIX TO DEVICE FAILED %d", status);
        }
        cur_row += bs;
    }

    printf("\nMatrix is copied to GPU\n");

    for (i = 0; i < STREAM_COUNT; i++) {
        cudaStreamSynchronize(streams[i]);
        cudaStreamDestroy(streams[i]);
        free(stream_tiles[i]);
    }
    free(stream_tiles);
}
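/*
 * Note on the staging buffers above: stream_tiles[] come from plain malloc(),
 * i.e. pageable memory, and CUDA degrades asynchronous copies from pageable
 * buffers to effectively synchronous ones, so the round-robin streams gain
 * little overlap here. A minimal sketch of the pinned variant, assuming the
 * same STREAM_COUNT round-robin scheme; the helper names are hypothetical.
 */
#include <cuda_runtime.h>
#include <stdlib.h>

/* Allocate one pinned staging tile per stream instead of using malloc(). */
static double **alloc_pinned_stream_tiles(int stream_count, size_t tile_elems)
{
    double **tiles = (double **) malloc(stream_count * sizeof(double *));
    for (int i = 0; i < stream_count; i++) {
        if (cudaMallocHost((void**)&tiles[i], tile_elems * sizeof(double)) != cudaSuccess) {
            exit(EXIT_FAILURE); /* error handling elided in this sketch */
        }
    }
    return tiles;
}

/* Matching cleanup: pinned buffers need cudaFreeHost(), not free(). */
static void free_pinned_stream_tiles(double **tiles, int stream_count)
{
    for (int i = 0; i < stream_count; i++) {
        cudaFreeHost(tiles[i]);
    }
    free(tiles);
}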