Example #1
void transferMatrix(const Stream& stream, const PinnedHostMatrix& matrixHost, PRECISION* deviceMemoryPosition) {
    const cudaStream_t& cudaStream = stream.getCudaStream();
    const int numberOfRows = matrixHost.getNumberOfRows();
    const int numberOfColumns = matrixHost.getNumberOfColumns();
    const PRECISION* hostMatrixPointer = matrixHost.getMemoryPointer();

    handleCublasStatus(
        cublasSetMatrixAsync(numberOfRows, numberOfColumns, sizeof(PRECISION), hostMatrixPointer, numberOfRows,
                             deviceMemoryPosition, numberOfRows, cudaStream),
        "Error when transferring matrix from host to device point: ");
}
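For context, here is a minimal self-contained sketch of what Example #1 does without its Stream and PinnedHostMatrix wrappers (the names rows, cols, hostA, and devA are illustrative, not part of the project above). cublasSetMatrixAsync copies a column-major rows-by-cols block, and the copy only overlaps with host work when the source buffer is pinned:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cublas_v2.h>

int main() {
    const int rows = 512, cols = 256;
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    /* Pinned (page-locked) host memory is what makes the copy asynchronous. */
    float *hostA = NULL;
    float *devA = NULL;
    cudaMallocHost((void**)&hostA, sizeof(float) * rows * cols);
    cudaMalloc((void**)&devA, sizeof(float) * rows * cols);
    for (int i = 0; i < rows * cols; ++i) hostA[i] = (float)i;

    /* Column-major copy; leading dimension is rows on both sides (full matrix). */
    cublasStatus_t status = cublasSetMatrixAsync(rows, cols, sizeof(float),
                                                 hostA, rows, devA, rows, stream);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "cublasSetMatrixAsync failed: %d\n", (int)status);
        return EXIT_FAILURE;
    }

    /* The host must not touch hostA again until the stream has drained. */
    cudaStreamSynchronize(stream);

    cudaFree(devA);
    cudaFreeHost(hostA);
    cudaStreamDestroy(stream);
    return 0;
}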
Example #2
void magma_setmatrix_async(
    magma_int_t m, magma_int_t n, size_t elemSize,
    void const* hA_src, magma_int_t lda,
    void*       dB_dst, magma_int_t ldb,
    cudaStream_t stream )
{
    cublasStatus_t status;
    status = cublasSetMatrixAsync(
        m, n, elemSize,
        hA_src, lda,
        dB_dst, ldb, stream );
    check_error( status );
}
Example #3
void magma_setmatrix_async_internal(
    magma_int_t m, magma_int_t n, magma_int_t elemSize,
    void const* hA_src, magma_int_t lda,
    void*       dB_dst, magma_int_t ldb,
    cudaStream_t stream,
    const char* func, const char* file, int line )
{
    cublasStatus_t status;
    status = cublasSetMatrixAsync(
        m, n, elemSize,
        hA_src, lda,
        dB_dst, ldb, stream );
    check_xerror( status, func, file, line );
}
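Examples #2 and #3 differ only in error reporting: the _internal variant threads the caller's location through to check_xerror. In MAGMA the public name is a macro that captures __func__, __FILE__, and __LINE__ at the call site; below is a hedged sketch of that pattern with illustrative names (report_xerror and my_setmatrix_async are not MAGMA APIs):

#include <cstdio>
#include <cublas_v2.h>

/* Illustrative stand-in for MAGMA's check_xerror(). */
static void report_xerror(cublasStatus_t status,
                          const char* func, const char* file, int line) {
    if (status != CUBLAS_STATUS_SUCCESS)
        fprintf(stderr, "cuBLAS error %d in %s (%s:%d)\n",
                (int)status, func, file, line);
}

static void my_setmatrix_async_internal(
    int m, int n, int elemSize,
    const void* hA_src, int lda,
    void*       dB_dst, int ldb,
    cudaStream_t stream,
    const char* func, const char* file, int line) {
    cublasStatus_t status = cublasSetMatrixAsync(m, n, elemSize,
                                                 hA_src, lda, dB_dst, ldb, stream);
    report_xerror(status, func, file, line);
}

/* The public name expands to the _internal call, capturing the caller's
   location so errors point at the transfer site, not at the wrapper. */
#define my_setmatrix_async(m, n, es, hA, lda, dB, ldb, s) \
    my_setmatrix_async_internal(m, n, es, hA, lda, dB, ldb, s, \
                                __func__, __FILE__, __LINE__)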
Example #4
DeviceMatrix* transferMatrix(const Stream& stream, const PinnedHostMatrix& matrixHost) {
    const cudaStream_t& cudaStream = stream.getCudaStream();
    const int numberOfRows = matrixHost.getNumberOfRows();
    const int numberOfColumns = matrixHost.getNumberOfColumns();

    DeviceMatrix* deviceMatrix = new DeviceMatrix(numberOfRows, numberOfColumns);
    PRECISION* deviceMatrixPointer = deviceMatrix->getMemoryPointer();
    const PRECISION* hostMatrixPointer = matrixHost.getMemoryPointer();

    handleCublasStatus(
        cublasSetMatrixAsync(numberOfRows, numberOfColumns, sizeof(PRECISION), hostMatrixPointer, numberOfRows,
                             deviceMatrixPointer, numberOfRows, cudaStream), "Error when transferring matrix from host to device: ");

    return deviceMatrix;
}
Example #5
// --------------------
extern "C" void
magma_zsetmatrix_async_internal(
    magma_int_t m, magma_int_t n,
    magmaDoubleComplex const* hA_src, magma_int_t lda,
    magmaDoubleComplex_ptr    dB_dst, magma_int_t ldb,
    magma_queue_t queue,
    const char* func, const char* file, int line )
{
    cublasStatus_t status;
    status = cublasSetMatrixAsync(
        m, n, sizeof(magmaDoubleComplex),
        hA_src, lda,
        dB_dst, ldb, queue );
    check_xerror( status, func, file, line );
}
Example #6
/***************************************************************************//**
    @fn magma_setmatrix( m, n, elemSize, hA_src, lda, dB_dst, lddb, queue )

    Copy all or part of matrix hA_src on CPU host to dB_dst on GPU device.
    Elements may be of arbitrary size.
    Type-safe versions set elemSize appropriately.

    This version synchronizes the queue after the transfer.
    See magma_setmatrix_async() for an asynchronous version.

    @param[in]
    m           Number of rows of matrix A. m >= 0.

    @param[in]
    n           Number of columns of matrix A. n >= 0.

    @param[in]
    elemSize    Size of each element, e.g., sizeof(double).

    @param[in]
    hA_src      Source array of dimension (lda,n), on CPU host.

    @param[in]
    lda         Leading dimension of matrix A. lda >= m.

    @param[out]
    dB_dst      Destination array of dimension (lddb,n), on GPU device.

    @param[in]
    lddb        Leading dimension of matrix B. lddb >= m.

    @param[in]
    queue       Queue to execute in.

    @ingroup magma_setmatrix
*******************************************************************************/
extern "C" void
magma_setmatrix_q_internal(
    magma_int_t m, magma_int_t n, magma_int_t elemSize,
    void const* hA_src, magma_int_t lda,
    magma_ptr   dB_dst, magma_int_t lddb,
    magma_queue_t queue,
    const char* func, const char* file, int line )
{
    assert( queue != NULL );
    cublasStatus_t status;
    status = cublasSetMatrixAsync(
        int(m), int(n), int(elemSize),
        hA_src, int(lda),
        dB_dst, int(lddb), queue->cuda_stream() );
    cudaStreamSynchronize( queue->cuda_stream() );
    check_xerror( status, func, file, line );
}
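As the body above shows, the synchronous variant is just the asynchronous copy followed by a stream synchronize. A hedged standalone sketch of the same composition (setmatrix_blocking is an illustrative name, not a MAGMA or cuBLAS function):

#include <cuda_runtime.h>
#include <cublas_v2.h>

/* Blocking host-to-device matrix copy built from the async call plus a
   stream synchronize, mirroring magma_setmatrix_q_internal above.
   Returns the cuBLAS status and lets the caller decide how to report it. */
static cublasStatus_t setmatrix_blocking(int m, int n, int elemSize,
                                         const void* hA, int lda,
                                         void* dB, int ldb,
                                         cudaStream_t stream) {
    cublasStatus_t status = cublasSetMatrixAsync(m, n, elemSize,
                                                 hA, lda, dB, ldb, stream);
    /* Wait for the copy to finish so the caller may immediately reuse
       or free hA after this returns. */
    cudaStreamSynchronize(stream);
    return status;
}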
Example #7
/***************************************************************************//**
    @fn magma_setmatrix_async( m, n, elemSize, hA_src, lda, dB_dst, lddb, queue )

    Copy all or part of matrix hA_src on CPU host to dB_dst on GPU device.
    Elements may be of arbitrary size.
    Type-safe versions set elemSize appropriately.

    This version is asynchronous: it may return before the transfer finishes,
    if hA_src is pinned CPU memory.
    See magma_setmatrix() for a synchronous version.

    @param[in]
    m           Number of rows of matrix A. m >= 0.

    @param[in]
    n           Number of columns of matrix A. n >= 0.

    @param[in]
    elemSize    Size of each element, e.g., sizeof(double).

    @param[in]
    hA_src      Source array of dimension (lda,n), on CPU host.

    @param[in]
    lda         Leading dimension of matrix A. lda >= m.

    @param[out]
    dB_dst      Destination array of dimension (lddb,n), on GPU device.

    @param[in]
    lddb        Leading dimension of matrix B. lddb >= m.

    @param[in]
    queue       Queue to execute in.

    @ingroup magma_setmatrix
*******************************************************************************/
extern "C" void
magma_setmatrix_async_internal(
    magma_int_t m, magma_int_t n, magma_int_t elemSize,
    void const* hA_src, magma_int_t lda,
    magma_ptr   dB_dst, magma_int_t lddb,
    magma_queue_t queue,
    const char* func, const char* file, int line )
{
    // for backwards compatibility, accepts NULL queue to mean NULL stream.
    cudaStream_t stream = NULL;
    if ( queue != NULL ) {
        stream = queue->cuda_stream();
    }
    else {
        fprintf( stderr, "Warning: %s got NULL queue\n", __func__ );
    }
    cublasStatus_t status;
    status = cublasSetMatrixAsync(
        int(m), int(n), int(elemSize),
        hA_src, int(lda),
        dB_dst, int(lddb), stream );
    check_xerror( status, func, file, line );
}
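The comment block for Example #7 notes that the call may return before the transfer finishes when hA_src is pinned, so completion has to be observed explicitly. Below is a hedged sketch that records a CUDA event after the copy so the host can poll instead of blocking (setmatrix_with_event is an illustrative name):

#include <cuda_runtime.h>
#include <cublas_v2.h>

/* Launch the copy, then record an event that fires once the copy has
   drained from the stream. hA must be pinned (cudaMallocHost) for the
   transfer to be truly asynchronous. */
static cublasStatus_t setmatrix_with_event(int m, int n, int elemSize,
                                           const void* hA, int lda,
                                           void* dB, int ldb,
                                           cudaStream_t stream,
                                           cudaEvent_t done) {
    cublasStatus_t status = cublasSetMatrixAsync(m, n, elemSize,
                                                 hA, lda, dB, ldb, stream);
    cudaEventRecord(done, stream);
    return status;
}

/* Caller side: overlap host work with the transfer.
       while (cudaEventQuery(done) == cudaErrorNotReady) {
           ... do host work ...
       }
*/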
Example #8
void Cpsgecopy_general_async(int m, int n, 
        void *A, int ia, int ja, int *descA,
        void *B, int ib, int jb, int *descB, int is_device_to_host)
{
#define dA(i,j)  (((float*)A) + IDX2F(i,j,descA[LLD_]))
#define dT(i,j) (((float *)T) + IDX2F(i,j,descT[LLD_]))

#define dB(i,j)  (((float *)B) + IDX2F(i,j,descB[LLD_]))
/*
  perform    copy

  B( ib:(ib+m-1), jb:(jb+n-1)) <-  A( ia:(ia+m-1),ja:(ja+n-1))

 */

const int use_MallocHost = FALSE;
const int use_igsum2d = FALSE;

cublasStatus cu_status;

cudaError_t cuda_status;

char notrans[] = "NoTrans";

int descT[DLEN_];

int ldA,ldB,ldT;

int is_same_context, is_same_mb, is_same_nb;
int is_same_p, is_same_q;
int is_same_offset;
int is_same_Locp, is_same_Locq;
int is_aligned;

int lrA1,lcA1, lrA2,lcA2;
int lrT1,lcT1, lrT2,lcT2;
int lrB1,lcB1, lrB2,lcB2;
int rsrc,csrc;
int rsrcA1,csrcA1,  rsrcA2, csrcA2;
int rsrcB1,csrcB1,  rsrcB2, csrcB2;
int iia,jja, iib,jjb;
int icontxt, nprow,npcol, myprow,mypcol;
int LocpA,LocqA,  LocpB,LocqB, LocpT,LocqT;
int mm,nn, lmm,lnn;
size_t nbytes;

float one_[REAL_PART+IMAG_PART+1];
float *one = &(one_[0]);

float zero_[REAL_PART+IMAG_PART+1];
float *zero = &(zero_[0]);


float alpha_[REAL_PART+IMAG_PART+1];
float *alpha = &(alpha_[0]);

float beta_[REAL_PART+IMAG_PART+1];
float *beta = &(beta_[0]);

int isize, isizeT;
float *T = 0;

int elemSize = sizeof(float);
int nnb, jstart,jend,jsize;
int is_ok;

int nmax;
const int bufsize =  1024*1024;
const int use_simple = FALSE;

one[REAL_PART] = 1.0;
one[IMAG_PART] = 0.0;
zero[REAL_PART] = 0.0;
zero[IMAG_PART] = 0.0;

 if ((m <= 0) || (n <= 0)) {
   return;
 };

 T = 0; 

 ldA = descA[LLD_];
 ldB = descB[LLD_];

  icontxt = descA[CTXT_];
  Cblacs_gridinfo( icontxt, &nprow,&npcol, &myprow, &mypcol);
  assert( nprow >= 1);
  assert( npcol >= 1);
  assert( (0 <= myprow) && (myprow < nprow));
  assert( (0 <= mypcol) && (mypcol < npcol));

  is_ok = (1 <= ia) && (ia + m-1 <= descA[M_]);
  if (!is_ok) {
    printf("Cpsgecopy (%d,%d) :ia %d m %d descA[M_] %d  \n",
            myprow,mypcol,     ia,   m,   descA[M_] );
    printf("Cpsgecopy (%d,%d) :ja %d n %d descA[N_] %d \n",
            myprow,mypcol,     ja,   n,   descA[N_] );
    printf("Cpsgecopy (%d,%d) :ib %d jb %d descB[M_] %d descB[N_] %d\n",
            myprow,mypcol,     ib,   jb,   descB[M_],   descB[N_] );
  };
  assert( (1 <= ia) && (ia + m-1 <= descA[M_]));
  assert( (1 <= ja) && (ja + n-1 <= descA[N_]));
  assert( (1 <= ib) && (ib + m-1 <= descB[M_]));
  assert( (1 <= jb) && (jb + n-1 <= descB[N_]));


  is_same_context = (descA[CTXT_] == descB[CTXT_]);
  is_same_mb = (descA[MB_] == descB[MB_]);
  is_same_nb = (descA[NB_] == descB[NB_]);

  is_same_p = (Cindxg2p(ia,descA[MB_], myprow, descA[RSRC_],nprow) ==
               Cindxg2p(ib,descB[MB_], myprow, descB[RSRC_],nprow) );

  is_same_q = (Cindxg2p(ja,descA[NB_], mypcol, descA[CSRC_],npcol) ==
               Cindxg2p(jb,descB[NB_], mypcol, descB[CSRC_],npcol) );

  is_same_offset = (MOD(ia,descA[MB_]) == MOD(ib,descB[MB_])) &&
                   (MOD(ja,descA[NB_]) == MOD(jb,descB[NB_]));


  local_extent( m,n, ia,ja,descA, &LocpA,&LocqA, &lrA1,&lcA1, &lrA2,&lcA2 );


  local_extent( m,n, ib,jb,descB, &LocpB,&LocqB,&lrB1,&lcB1, &lrB2,&lcB2 );



  /*
  if ((LocpA >= 1) || (LocpB >= 1)) {
     is_same_Locp = (LocpA == LocpB);
  };
  if ((LocqA >= 1) || (LocqB >= 1)) {
     is_same_Locq = (LocqA == LocqB);
  };
  */

  is_same_Locq = (LocqA == LocqB);

  is_same_Locp = (LocpA == LocpB);

  is_aligned = is_same_context &&
               is_same_mb && is_same_nb &&
               is_same_p && is_same_q &&
               is_same_offset &&
               is_same_Locp && is_same_Locq;

  assert( is_same_q );
  assert( is_same_p );
  assert( is_same_offset );
  assert( is_same_Locp );
  assert( is_same_Locq );
  assert( is_aligned );


        
       /*
        no communication required;
        direct copy between host and device
        */

       ldA = descA[LLD_];
       ldB = descB[LLD_];

       mm = LocpA;
       nn = LocqA;

       if (is_device_to_host) {
         /*
          * transfer from device to host
          */
         if ((mm >= 1) && (nn >= 1)) {
#ifdef USE_CUBLASV2
           {
             cublasStatus_t istatus;
             istatus = cublasGetMatrixAsync(mm, nn, elemSize,
                 (void *) dA(lrA1,lcA1), ldA, (void *) dB(lrB1,lcB1), ldB,
                 cublas_get_stream() );
             assert( istatus == CUBLAS_STATUS_SUCCESS );
           }
#else
           cu_status = cublasGetMatrix(mm, nn, elemSize,
               (void *) dA(lrA1,lcA1), ldA, (void *) dB(lrB1,lcB1), ldB );
           CHKERR(cu_status);
#endif
         }
       }
       else {
         /*
          * transfer from host to device
          */
         if ((mm >= 1) && (nn >= 1)) {
#ifdef USE_CUBLASV2
           {
             cublasStatus_t istatus;
             istatus = cublasSetMatrixAsync(mm, nn, elemSize,
                 (void *) dA(lrA1,lcA1), ldA, (void *) dB(lrB1,lcB1), ldB,
                 cublas_get_stream() );
             assert( istatus == CUBLAS_STATUS_SUCCESS );
           }
#else
           cu_status = cublasSetMatrix(mm, nn, elemSize,
               (void *) dA(lrA1,lcA1), ldA, (void *) dB(lrB1,lcB1), ldB );
           CHKERR(cu_status);
#endif
         }
       }

  return;
}
Example #9
void blasx_gpu_dgemm_kernel(int j,
                            int nrowa, int ncola,
                            int nrowb, int ncolb,
                            int nrowc, int ncolc,
                            int current_task, int prior_task,
                            enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
                            double* A, double* B, double* C,
                            int lda, int ldb, int ldc,
                            int x, int y, int z,
                            double** C_dev,
                            cudaStream_t *stream, cublasHandle_t *handle_p,
                            int current_stream,
                            double alpha, double beta, int block_dim,
                            int switcher, int* task_batch_counter,
                            LRU_t **LRUs, int GPUs,
                            int *mem_cpy_counter,
                            reader_tracker *addr_track,
                            int GPU_id)
{
    int nrowa_dev, nrowb_dev, nrowc_dev;
    int ncola_dev, ncolb_dev, ncolc_dev;
    int nrow_offset_a, nrow_offset_b;
    int ncol_offset_a, ncol_offset_b;
    int i = current_task/(y+1);
    int k = current_task%(y+1);
    double *A_dev, *B_dev;
    if (TransA != CblasNoTrans) {
        margin_adjustment(nrowa,ncola,block_dim,j,i,&nrowa_dev,&ncola_dev);
    }else{
        margin_adjustment(nrowa,ncola,block_dim,i,j,&nrowa_dev,&ncola_dev);
    }
    if (TransB != CblasNoTrans) {
        margin_adjustment(nrowb,ncolb,block_dim,k,j,&nrowb_dev,&ncolb_dev);
    }else{
        margin_adjustment(nrowb,ncolb,block_dim,j,k,&nrowb_dev,&ncolb_dev);
    }
    margin_adjustment(nrowc,ncolc,block_dim,i,k,&nrowc_dev,&ncolc_dev);
    if (TransA != CblasNoTrans) {
        nrow_offset_a = j*block_dim, ncol_offset_a = i*block_dim;
    }else{
        nrow_offset_a = i*block_dim, ncol_offset_a = j*block_dim;
    }
    if (TransB != CblasNoTrans) {
        nrow_offset_b = k*block_dim, ncol_offset_b = j*block_dim;
    }else{
        nrow_offset_b = j*block_dim, ncol_offset_b = k*block_dim;
    }
    double *starting_point_A = &A[nrow_offset_a+ncol_offset_a*lda];
    double *starting_point_B = &B[nrow_offset_b+ncol_offset_b*ldb];
    //Asynchronously set matrix on GPU
    //----------------LRU&RBT optimization----------------//
    mem_control_kernel_double(starting_point_A, &A_dev, LRUs, GPUs, GPU_id, block_dim, mem_cpy_counter, addr_track, stream, nrowa_dev, ncola_dev, lda);
    mem_control_kernel_double(starting_point_B, &B_dev, LRUs, GPUs, GPU_id, block_dim, mem_cpy_counter, addr_track, stream, nrowb_dev, ncolb_dev, ldb);
    //----------------------------------------------------//
    
    if (j == 0) {
        margin_adjustment(nrowc,ncolc,block_dim,i,k,&nrowc_dev,&ncolc_dev);
        int nrow_offset_c = i*block_dim;
        int ncol_offset_c = k*block_dim;
        double *starting_point_C = &C[nrow_offset_c+ncol_offset_c*ldc];
        if (beta != 0) {
            assert( cublasSetMatrixAsync(nrowc_dev, ncolc_dev, sizeof(double), starting_point_C, ldc, C_dev[switcher*STREAMNUM+current_stream], block_dim, *stream) == CUBLAS_STATUS_SUCCESS );
        }
        if (*task_batch_counter != 0) {//Set matrix back
            int i_pre = prior_task/(y+1);
            int k_pre = prior_task%(y+1);
            int nrowc_dev_pre, ncolc_dev_pre;
            margin_adjustment(nrowc,ncolc,block_dim,i_pre,k_pre,&nrowc_dev_pre,&ncolc_dev_pre);
            int nrow_offset_c_pre = i_pre*block_dim;
            int ncol_offset_c_pre = k_pre*block_dim;
            double *starting_point_C_pre = &C[nrow_offset_c_pre+ncol_offset_c_pre*ldc];
            assert( cublasGetMatrixAsync(nrowc_dev_pre, ncolc_dev_pre, sizeof(double), C_dev[current_stream+(1-switcher)*STREAMNUM], block_dim, starting_point_C_pre, ldc,*stream) == CUBLAS_STATUS_SUCCESS);
        }
    }
    cudaStreamSynchronize(*stream);
    assert( cublasSetStream(*handle_p, *stream) == CUBLAS_STATUS_SUCCESS );

    double beta_inner = (j==0)?(beta):(1);
    int ka = (TransA != CblasNoTrans)?(nrowa_dev):(ncola_dev);
    cublasOperation_t transa, transb;
    CBLasTransToCuBlasTrans(TransA, &transa);
    CBLasTransToCuBlasTrans(TransB, &transb);
    cublasStatus_t status = cublasDgemm(*handle_p,
                                        transa, transb,
                                        nrowc_dev, ncolc_dev, ka,
                                        &alpha,
                                        A_dev, block_dim,
                                        B_dev, block_dim,
                                        &beta_inner,
                                        C_dev[switcher*STREAMNUM+current_stream], block_dim);
    assert( status == CUBLAS_STATUS_SUCCESS );
}
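Example #9 stages tiles with cublasSetMatrixAsync and runs cublasDgemm on the same stream. It also calls cudaStreamSynchronize before the GEMM because tiles may have arrived via peer copies issued elsewhere; in the single-stream case sketched below, stream order alone suffices. A hedged minimal sketch of that stage-then-compute ordering (tile_gemm, bd, and the buffer names are illustrative; host buffers are assumed pinned):

#include <cuda_runtime.h>
#include <cublas_v2.h>

/* One stream orders the uploads before the GEMM that consumes them, so no
   intermediate synchronize is needed when everything shares the stream. */
static void tile_gemm(cublasHandle_t handle, cudaStream_t stream,
                      int bd,                     /* square tile dimension */
                      const double* hA, int lda,  /* pinned host tiles */
                      const double* hB, int ldb,
                      double* dA, double* dB, double* dC) {
    const double alpha = 1.0, beta = 0.0;
    cublasSetStream(handle, stream);
    cublasSetMatrixAsync(bd, bd, sizeof(double), hA, lda, dA, bd, stream);
    cublasSetMatrixAsync(bd, bd, sizeof(double), hB, ldb, dB, bd, stream);
    /* Stream order guarantees dA and dB are populated before the GEMM runs. */
    cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, bd, bd, bd,
                &alpha, dA, bd, dB, bd, &beta, dC, bd);
}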
Example #10
void mem_control_kernel_float(float *starting_point_A, float **A_dev,
                        LRU_t **LRUs, const int GPUs, const int GPU_id, int block_dim,
                        int *mem_cpy_counter, reader_tracker *addr_track,
                        cudaStream_t *stream,
                        int nrowa_dev, int ncola_dev, int lda) {
    rbt_node* block_A = rbt_find(starting_point_A, &(LRUs[GPU_id]->hash_map));
    if( block_A == NULL ) { //new element
        //fprintf(stderr, "==========new element========\n");
        //traverse_LRU_se(LRU);
        int search_l_GPU = GPU_id-1;
        int search_r_GPU = GPU_id+1;
        rbt_node *block_A_l = NULL;
        rbt_node *block_A_r = NULL;
        while (block_A_l == NULL && block_A_r == NULL) {
            if (search_l_GPU >= 0) {
                block_A_l = rbt_find(starting_point_A, &(LRUs[search_l_GPU]->hash_map));
                if (block_A_l != NULL) {
                    if (block_A_l->associated_LRU_elem->is_trans_done == 0) {
                        int peer_access_check = 0;
                        cudaDeviceCanAccessPeer(&peer_access_check, GPU_id, search_l_GPU);
                        if(peer_access_check == 1) block_A_l = NULL;
                    }
                }
                search_l_GPU--;
            }
            if (search_r_GPU < GPUs) {
                block_A_r = rbt_find(starting_point_A, &(LRUs[search_r_GPU]->hash_map));
                if (block_A_r != NULL) {
                    if (block_A_r->associated_LRU_elem->is_trans_done == 0) {
                        int peer_access_check = 0;
                        cudaDeviceCanAccessPeer(&peer_access_check, GPU_id, search_r_GPU);
                        if(peer_access_check == 1) block_A_r = NULL;
                    }
                }
                search_r_GPU++;
            }
            if (search_l_GPU < 0 && search_r_GPU >= GPUs) {
                break;
            }
        }
        //rectification
        search_l_GPU++; search_r_GPU--;
        assert(search_l_GPU >= 0 && search_l_GPU < GPUs);
        assert(search_r_GPU >= 0 && search_r_GPU < GPUs);
        
        if ( !(block_A_l == NULL && block_A_r == NULL) ) {
            //inter GPU communication
            int target_GPU_id = 0;
            if (block_A_l != NULL && block_A_r != NULL) {
                if (ABS(search_l_GPU - GPU_id) > ABS(search_r_GPU - GPU_id)) {
                    target_GPU_id = search_r_GPU;
                    block_A       = block_A_r;
                } else if(ABS(search_l_GPU - GPU_id) < ABS(search_r_GPU - GPU_id)) {
                    target_GPU_id = search_l_GPU;
                    block_A       = block_A_l;
                } else {
                    int rand_select = rand()%10;
                    if (rand_select < 5) {
                        target_GPU_id = search_l_GPU;
                        block_A       = block_A_l;
                    } else {
                        target_GPU_id = search_r_GPU;
                        block_A       = block_A_r;
                    }
                }
                if (block_A->associated_LRU_elem->is_trans_done != 1)
                    goto new_block;
                //fprintf(stderr, "==>3  block on GPUs:(%d, %d), but chose %d(done:%d) as curt GPU is %d (block_A_l:%p, block_A_r:%p)\n", search_l_GPU, search_r_GPU, target_GPU_id, block_A->associated_LRU_elem->is_trans_done, GPU_id, block_A_l, block_A_r);
            } else {
                if (block_A_l != NULL && block_A_r == NULL) {
                    target_GPU_id = search_l_GPU;
                    block_A       = block_A_l;
                } else if(block_A_r != NULL && block_A_l == NULL) {
                    target_GPU_id = search_r_GPU;
                    block_A       = block_A_r;
                }
                if (block_A->associated_LRU_elem->is_trans_done != 1)
                    goto new_block;
                //printf("==>2  block on GPUs:%d, and curt GPU is %d (done:%d)\n", target_GPU_id, GPU_id, block_A->associated_LRU_elem->is_trans_done);
            }
            if (rbt_find(starting_point_A, &(LRUs[target_GPU_id]->hash_map)) == NULL)
                goto new_block;
            atomic_reader_plus(block_A);
            *A_dev = (float*) LRU_in(starting_point_A, LRUs[GPU_id], sizeof(float)*block_dim*block_dim, GPU_id);
            assert( rbt_find(starting_point_A, &(LRUs[target_GPU_id]->hash_map)) != NULL);
            assert( rbt_find(starting_point_A, &(LRUs[target_GPU_id]->hash_map))->associated_LRU_elem->is_trans_done == 1);
            assert( cudaMemcpyPeerAsync(*A_dev, GPU_id, block_A->associated_LRU_elem->GPU_p, target_GPU_id, sizeof(float)*block_dim*block_dim, *stream) == cudaSuccess );
            //cannot dequeue the GPU mem at the target GPU
            addr_track[*mem_cpy_counter].addr   = starting_point_A;
            addr_track[*mem_cpy_counter].GPU_id = target_GPU_id;
            addr_track[*mem_cpy_counter].is_trans_done = 1;
            (*mem_cpy_counter) += 1;
            //cannot dequeue the current new GPU mem
            addr_track[*mem_cpy_counter].addr   = starting_point_A;
            addr_track[*mem_cpy_counter].GPU_id = GPU_id;
            addr_track[*mem_cpy_counter].is_trans_done = 0;
            (*mem_cpy_counter) += 1;
        } else {
        new_block:
            //bring new blocks
            //printf("==>1  bring new block to GPU:%d\n", GPU_id);
            (*A_dev) = (float*) LRU_in(starting_point_A, LRUs[GPU_id], sizeof(float)*block_dim*block_dim, GPU_id);
            assert( cublasSetMatrixAsync(nrowa_dev, ncola_dev, sizeof(float), starting_point_A, lda, *A_dev, block_dim, *stream) == CUBLAS_STATUS_SUCCESS );
            addr_track[*mem_cpy_counter].addr          = starting_point_A;
            addr_track[*mem_cpy_counter].GPU_id        = GPU_id;
            addr_track[*mem_cpy_counter].is_trans_done = 0;
            (*mem_cpy_counter) += 1;
        }
    } else {
        atomic_reader_plus(block_A);
        assert( rbt_find(starting_point_A, &(LRUs[GPU_id]->hash_map)) != NULL);
        *A_dev = (float*) LRU_reorder(starting_point_A, LRUs[GPU_id]);
        addr_track[*mem_cpy_counter].addr   = starting_point_A;
        addr_track[*mem_cpy_counter].GPU_id = GPU_id;
        (*mem_cpy_counter) += 1;
    }
}
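The cross-GPU path in Example #10 uses cudaMemcpyPeerAsync, which only moves data directly between devices when peer access is possible; otherwise the runtime stages through host memory. A hedged one-time setup sketch (enable_peer is an illustrative name; error handling elided):

#include <cuda_runtime.h>

/* Enable direct GPU-to-GPU access from device 'dst' to device 'src' once at
   startup, so later cudaMemcpyPeerAsync calls can bypass host memory. */
static void enable_peer(int dst, int src) {
    int can = 0;
    cudaDeviceCanAccessPeer(&can, dst, src);
    if (can) {
        cudaSetDevice(dst);
        /* A repeated call returns cudaErrorPeerAccessAlreadyEnabled,
           which is harmless to ignore here. */
        cudaDeviceEnablePeerAccess(src, 0);
    }
}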
Example #11
void copyBandMatrixToDevice(double *h_matrix, gpu_symm_band_matrix * gpu_matrix,
	cublasHandle_t handle)
{
	int num_tiles;
	int bs = gpu_matrix->block_size;
	int order = gpu_matrix->order;
	int hb = gpu_matrix->half_bandwith;
	int i;
	int cur_row = 0;
	int tile_end;
	int cur_bs;
	cublasStatus_t status;
	double* temp_tile;
	cudaStream_t streams[STREAM_COUNT];
	double** stream_tiles;
	int max_tile_elements;

	num_tiles = (order + bs - 1) / bs;
	gpu_matrix->gpu_matrix_tiles = (double **) malloc(
		num_tiles * sizeof(double *));
	gpu_matrix->tile_len = (int *) malloc(num_tiles * sizeof(int));

	stream_tiles = (double**)malloc(STREAM_COUNT*sizeof(double*));
	for(i = 0; i < STREAM_COUNT; i++)
	{
		cudaStreamCreate(&streams[i]);
		stream_tiles[i] = (double*) malloc( (bs + hb) * bs *sizeof(double) );
	}
	
	max_tile_elements = (bs + hb) * bs;
	checkCudaErrors(cudaMalloc(&(gpu_matrix->tiles_storage), num_tiles * max_tile_elements*sizeof(double)));

	for (i = 0; i < num_tiles; i++)
	{
		tile_end = cur_row + bs - 1 + hb;
		if (tile_end >= order)
		{
			tile_end = order - 1;
		}
		cur_bs = bs;
		if (cur_row + bs > order)
		{
			cur_bs = order - cur_row;
		}

		gpu_matrix->tile_len[i] = tile_end - cur_row + 1;
		/*checkCudaErrors(
			cudaMalloc(&(gpu_matrix->gpu_matrix_tiles[i]), cur_bs * gpu_matrix->tile_len[i] * sizeof(double)));*/
		gpu_matrix->gpu_matrix_tiles[i] = &(gpu_matrix->tiles_storage[max_tile_elements*i]);

		cudaStreamSynchronize(streams[i%STREAM_COUNT]);
		temp_tile = stream_tiles[i%STREAM_COUNT];
		init_temp_tile(cur_bs, cur_row, temp_tile, h_matrix, gpu_matrix->tile_len[i], order, hb, bs);

		status = cublasSetMatrixAsync(cur_bs, gpu_matrix->tile_len[i], sizeof(double), temp_tile, bs, gpu_matrix->gpu_matrix_tiles[i], cur_bs, streams[i%STREAM_COUNT]);
		if(status != CUBLAS_STATUS_SUCCESS)
		{
			printf("COPY MATRIX TO DEVICE FAILED %d\n", status);
		}

		cur_row += bs;
	}
	printf("\nMatrix is copied to GPU\n");
	for(i = 0; i < STREAM_COUNT; i++)
	{
		cudaStreamSynchronize(streams[i]);
		cudaStreamDestroy(streams[i]);
		free(stream_tiles[i]);
	}
	free(stream_tiles);
}
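The tile loop in Example #11 leans on a detail worth isolating: cublasSetMatrixAsync repacks while it copies, reading each host tile with leading dimension bs but storing it densely on the device with leading dimension cur_bs. A hedged standalone sketch of a submatrix copy with differing leading dimensions (all sizes illustrative):

#include <cuda_runtime.h>
#include <cublas_v2.h>

int main() {
    /* Host: 8x8 column-major matrix; upload only its top-left 4x3 corner. */
    const int LDH = 8, M = 4, N = 3;
    double *h = NULL;
    double *d = NULL;
    cudaMallocHost((void**)&h, sizeof(double) * LDH * LDH);
    for (int i = 0; i < LDH * LDH; ++i) h[i] = (double)i;

    /* Device buffer stores the submatrix densely (leading dimension M). */
    cudaMalloc((void**)&d, sizeof(double) * M * N);

    cudaStream_t s;
    cudaStreamCreate(&s);
    /* lda = LDH skips the unused host rows; ldb = M packs the tile tight. */
    cublasSetMatrixAsync(M, N, sizeof(double), h, LDH, d, M, s);
    cudaStreamSynchronize(s);

    cudaStreamDestroy(s);
    cudaFree(d);
    cudaFreeHost(h);
    return 0;
}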