Beispiel #1
0
void collect_final_result_zgemm(int *tasks_rs, int *tasks_rs_size, int switcher, cudaStream_t *stream, cuDoubleComplex** C_dev, int block_dim, int stream_num, int x, int y, int z, int nrowc, int ncolc, int ldc, cuDoubleComplex *C) {
    switcher = 1 - switcher;
    int temp = 0;
    for (temp = tasks_rs_size[switcher]; temp < tasks_rs_size[1-switcher] ; temp++) {
        int prior_task = tasks_rs[temp+stream_num*(1-switcher)];
        int i_pre = prior_task/(y+1);
        int k_pre = prior_task%(y+1);
        int current_stream = temp;
        int nrowc_dev_pre, ncolc_dev_pre;
        margin_adjustment(nrowc, ncolc, block_dim, i_pre, k_pre, &nrowc_dev_pre, &ncolc_dev_pre);
        int nrow_offset_c_pre = i_pre*block_dim;
        int ncol_offset_c_pre = k_pre*block_dim;
        cuDoubleComplex *starting_point_C_pre = &C[nrow_offset_c_pre+ncol_offset_c_pre*ldc];
        assert( cublasGetMatrixAsync(nrowc_dev_pre, ncolc_dev_pre, sizeof(cuDoubleComplex), C_dev[current_stream+(1-switcher)*stream_num], block_dim, starting_point_C_pre, ldc,stream[current_stream]) == CUBLAS_STATUS_SUCCESS );
        assert(cudaStreamSynchronize(stream[current_stream]) == cudaSuccess);
    }
    for (temp = 0; temp < tasks_rs_size[switcher]; temp++) {
        int prior_task = tasks_rs[temp+stream_num*(switcher)];
        int i_pre = prior_task/(y+1);
        int k_pre = prior_task%(y+1);
        int current_stream = temp;
        int nrowc_dev_pre, ncolc_dev_pre;
        margin_adjustment(nrowc, ncolc, block_dim, i_pre, k_pre, &nrowc_dev_pre, &ncolc_dev_pre);
        int nrow_offset_c_pre = i_pre*block_dim;
        int ncol_offset_c_pre = k_pre*block_dim;
        cuDoubleComplex *starting_point_C_pre = &C[nrow_offset_c_pre+ncol_offset_c_pre*ldc];
        assert( cublasGetMatrixAsync(nrowc_dev_pre, ncolc_dev_pre, sizeof(cuDoubleComplex), C_dev[current_stream+switcher*stream_num], block_dim, starting_point_C_pre, ldc,stream[current_stream]) == CUBLAS_STATUS_SUCCESS );
        assert(cudaStreamSynchronize(stream[current_stream]) == cudaSuccess);
    }
}
Beispiel #2
0
void collect_final_result_dsyrk_syr2k(int *tasks_rs, int *tasks_rs_size, int switcher, cudaStream_t *stream, double** C_dev, int block_dim, int stream_num, int x,int y, int z, int nrowc, int ncolc, int ldc, double *C, enum CBLAS_UPLO Uplo) {
    switcher = 1 - switcher;
    int temp = 0;
    for (temp = tasks_rs_size[switcher]; temp < tasks_rs_size[1-switcher] ; temp++) {
        int prior_task = tasks_rs[temp+stream_num*(1-switcher)];
        int i_pre, k_pre;
        blasx_get_index(prior_task, 0, x, &i_pre, &k_pre, Uplo, x);
        int current_stream = temp;
        int nrowc_dev_pre, ncolc_dev_pre;
        margin_adjustment(nrowc, ncolc, block_dim, i_pre, k_pre, &nrowc_dev_pre, &ncolc_dev_pre);
        int nrow_offset_c_pre = i_pre*block_dim;
        int ncol_offset_c_pre = k_pre*block_dim;
        double *starting_point_C_pre = &C[nrow_offset_c_pre+ncol_offset_c_pre*ldc];
        cublasGetMatrixAsync(nrowc_dev_pre, ncolc_dev_pre, sizeof(double), C_dev[current_stream+(1-switcher)*stream_num], block_dim, starting_point_C_pre, ldc,stream[current_stream]);
        cudaStreamSynchronize(stream[current_stream]);
    }
    for (temp = 0; temp < tasks_rs_size[switcher]; temp++) {
        //assume 1-switcher
        int prior_task = tasks_rs[temp+stream_num*(switcher)];
        int i_pre, k_pre;
        blasx_get_index(prior_task, 0, x, &i_pre, &k_pre, Uplo, x);
        int current_stream = temp;
        int nrowc_dev_pre, ncolc_dev_pre;
        margin_adjustment(nrowc, ncolc, block_dim, i_pre, k_pre, &nrowc_dev_pre, &ncolc_dev_pre);
        int nrow_offset_c_pre = i_pre*block_dim;
        int ncol_offset_c_pre = k_pre*block_dim;
        double *starting_point_C_pre = &C[nrow_offset_c_pre+ncol_offset_c_pre*ldc];
        cublasGetMatrixAsync(nrowc_dev_pre, ncolc_dev_pre, sizeof(double), C_dev[current_stream+switcher*stream_num], block_dim, starting_point_C_pre, ldc,stream[current_stream]);
        cudaStreamSynchronize(stream[current_stream]);
    }
}
Beispiel #3
0
void collect_final_result_dtrsm_mode_1(int *tasks_rs, int *tasks_rs_size, int switcher, int switcher_rs, cudaStream_t *stream, double** buffer_dev, int block_dim, int stream_num, int x, int y, int z, int nrowb, int ncolb, int ldb, double *B, int* switch_tracker) {
    int temp = 0;
    for (temp = tasks_rs_size[switcher_rs]; temp < tasks_rs_size[1-switcher_rs] ; temp++) {
        //            printf("retrieving B[%d, %d] @stream=%d switcher:%d\n", z, tasks_rs[temp+STREAMNUM*(1-switcher_rs)], temp, switcher);
        int row = z;
        int col = tasks_rs[temp+stream_num*(1-switcher_rs)];
        int current_stream = temp;
        int nrow_offset = row*block_dim;
        int ncol_offset = col*block_dim;
        int nrow_dev, ncol_dev;
        margin_adjustment(nrowb, ncolb, block_dim, row, col, &nrow_dev, &ncol_dev);
        double *starting_point = &B[nrow_offset+ncol_offset*ldb];
        cublasGetMatrixAsync(nrow_dev, ncol_dev, sizeof(double), buffer_dev[current_stream+switch_tracker[temp]*stream_num], block_dim, starting_point, ldb, stream[current_stream]);
        cudaStreamSynchronize(stream[current_stream]);
    }
    for (temp = 0; temp < tasks_rs_size[switcher_rs]; temp++) {
        //assume 1-switcher
        //printf("retrieving B[%d, %d] @stream=%d\n", z, tasks_rs[temp+STREAMNUM*switcher_rs], temp);
        int row = z;
        int col = tasks_rs[temp+stream_num*switcher_rs];
        int current_stream = temp;
        int nrow_offset = row*block_dim;
        int ncol_offset = col*block_dim;
        int nrow_dev, ncol_dev;
        margin_adjustment(nrowb, ncolb, block_dim, row, col, &nrow_dev, &ncol_dev);
        double *starting_point = &B[nrow_offset+ncol_offset*ldb];
        cublasGetMatrixAsync(nrow_dev, ncol_dev, sizeof(double), buffer_dev[current_stream+switch_tracker[temp]*stream_num], block_dim, starting_point, ldb, stream[current_stream]);
        cudaStreamSynchronize(stream[current_stream]);
    }
}
Beispiel #4
0
void magma_getmatrix_async(
    magma_int_t m, magma_int_t n, size_t elemSize,
    void const* dA_src, magma_int_t lda,
    void*       hB_dst, magma_int_t ldb,
    cudaStream_t stream )
{
    cublasStatus_t status;
    status = cublasGetMatrixAsync(
        m, n, elemSize,
        dA_src, lda,
        hB_dst, ldb, stream );
    check_error( status );
}
Beispiel #5
0
void magma_getmatrix_async_internal(
    magma_int_t m, magma_int_t n, magma_int_t elemSize,
    void const* dA_src, magma_int_t lda,
    void*       hB_dst, magma_int_t ldb,
    cudaStream_t stream,
    const char* func, const char* file, int line )
{
    cublasStatus_t status;
    status = cublasGetMatrixAsync(
        m, n, elemSize,
        dA_src, lda,
        hB_dst, ldb, stream );
    check_xerror( status, func, file, line );
}
Beispiel #6
0
void magma_sgetmatrix_async_internal(
    magma_int_t m, magma_int_t n,
    float const* dA_src, magma_int_t lda,
    float*       hB_dst, magma_int_t ldb,
    cudaStream_t stream,
    const char* func, const char* file, int line )
{
    cublasStatus_t status;
    status = cublasGetMatrixAsync(
        m, n, sizeof(float),
        dA_src, lda,
        hB_dst, ldb, stream );
    check_xerror( status, func, file, line );
}
Beispiel #7
0
// --------------------
extern "C" void
magma_zgetmatrix_async_internal(
    magma_int_t m, magma_int_t n,
    magmaDoubleComplex_const_ptr dA_src, magma_int_t lda,
    magmaDoubleComplex*          hB_dst, magma_int_t ldb,
    magma_queue_t queue,
    const char* func, const char* file, int line )
{
    cublasStatus_t status;
    status = cublasGetMatrixAsync(
        m, n, sizeof(magmaDoubleComplex),
        dA_src, lda,
        hB_dst, ldb, queue );
    check_xerror( status, func, file, line );
}
Beispiel #8
0
/***************************************************************************//**
    @fn magma_getmatrix( m, n, elemSize, dA_src, ldda, hB_dst, ldb, queue )

    Copy all or part of matrix dA_src on GPU device to hB_dst on CPU host.
    Elements may be arbitrary size.
    Type-safe versions set elemSize appropriately.

    This version synchronizes the queue after the transfer.
    See magma_getmatrix_async() for an asynchronous version.

    @param[in]
    m           Number of rows of matrix A. m >= 0.

    @param[in]
    n           Number of columns of matrix A. n >= 0.

    @param[in]
    elemSize    Size of each element, e.g., sizeof(double).

    @param[in]
    dA_src      Source array of dimension (ldda,n), on GPU device.

    @param[in]
    ldda        Leading dimension of matrix A. ldda >= m.

    @param[out]
    hB_dst      Destination array of dimension (ldb,n), on CPU host.

    @param[in]
    ldb         Leading dimension of matrix B. ldb >= m.

    @param[in]
    queue       Queue to execute in.

    @ingroup magma_getmatrix
*******************************************************************************/
extern "C" void
magma_getmatrix_q_internal(
    magma_int_t m, magma_int_t n, magma_int_t elemSize,
    magma_const_ptr dA_src, magma_int_t ldda,
    void*           hB_dst, magma_int_t ldb,
    magma_queue_t queue,
    const char* func, const char* file, int line )
{
    assert( queue != NULL );
    cublasStatus_t status;
    status = cublasGetMatrixAsync(
        int(m), int(n), int(elemSize),
        dA_src, int(ldda),
        hB_dst, int(ldb), queue->cuda_stream() );
    cudaStreamSynchronize( queue->cuda_stream() );
    check_xerror( status, func, file, line );
}
Beispiel #9
0
/***************************************************************************//**
    @fn magma_getmatrix_async( m, n, elemSize, dA_src, ldda, hB_dst, ldb, queue )

    Copy all or part of matrix dA_src on GPU device to hB_dst on CPU host.
    Elements may be arbitrary size.
    Type-safe versions set elemSize appropriately.

    This version is asynchronous: it may return before the transfer finishes,
    if hB_dst is pinned CPU memory.
    See magma_getmatrix() for a synchronous version.

    @param[in]
    m           Number of rows of matrix A. m >= 0.

    @param[in]
    n           Number of columns of matrix A. n >= 0.

    @param[in]
    elemSize    Size of each element, e.g., sizeof(double).

    @param[in]
    dA_src      Source array of dimension (ldda,n), on GPU device.

    @param[in]
    ldda        Leading dimension of matrix A. ldda >= m.

    @param[out]
    hB_dst      Destination array of dimension (ldb,n), on CPU host.

    @param[in]
    ldb         Leading dimension of matrix B. ldb >= m.

    @param[in]
    queue       Queue to execute in.

    @ingroup magma_getmatrix
*******************************************************************************/
extern "C" void
magma_getmatrix_async_internal(
    magma_int_t m, magma_int_t n, magma_int_t elemSize,
    magma_const_ptr dA_src, magma_int_t ldda,
    void*           hB_dst, magma_int_t ldb,
    magma_queue_t queue,
    const char* func, const char* file, int line )
{
    cudaStream_t stream = NULL;
    if ( queue != NULL ) {
        stream = queue->cuda_stream();
    }
    else {
        fprintf( stderr, "Warning: %s got NULL queue\n", __func__ );
    }
    cublasStatus_t status;
    status = cublasGetMatrixAsync(
        int(m), int(n), int(elemSize),
        dA_src, int(ldda),
        hB_dst, int(ldb), stream );
    check_xerror( status, func, file, line );
}
void Cpsgecopy_general_async(int m, int n, 
        void *A, int ia, int ja, int *descA,
        void *B, int ib, int jb, int *descB, int is_device_to_host)
{
#define dA(i,j)  (((float*)A) + IDX2F(i,j,descA[LLD_]))
#define dT(i,j) (((float *)T) + IDX2F(i,j,descT[LLD_]))

#define dB(i,j)  (((float *)B) + IDX2F(i,j,descB[LLD_]))
/*
  perform    copy

  B( ib:(ib+m-1), jb:(jb+n-1)) <-  A( ia:(ia+m-1),ja:(ja+n-1))

 */

const int use_MallocHost = FALSE;
const int use_igsum2d = FALSE;

cublasStatus cu_status;

cudaError_t cuda_status;

char notrans[] = "NoTrans";

int descT[DLEN_];

int ldA,ldB,ldT;

int is_same_context, is_same_mb, is_same_nb;
int is_same_p, is_same_q;
int is_same_offset;
int is_same_Locp, is_same_Locq;
int is_aligned;

int lrA1,lcA1, lrA2,lcA2;
int lrT1,lcT1, lrT2,lcT2;
int lrB1,lcB1, lrB2,lcB2;
int rsrc,csrc;
int rsrcA1,csrcA1,  rsrcA2, csrcA2;
int rsrcB1,csrcB1,  rsrcB2, csrcB2;
int iia,jja, iib,jjb;
int icontxt, nprow,npcol, myprow,mypcol;
int LocpA,LocqA,  LocpB,LocqB, LocpT,LocqT;
int mm,nn, lmm,lnn;
size_t nbytes;

float one_[REAL_PART+IMAG_PART+1];
float *one = &(one_[0]);

float zero_[REAL_PART+IMAG_PART+1];
float *zero = &(zero_[0]);


float alpha_[REAL_PART+IMAG_PART+1];
float *alpha = &(alpha_[0]);

float beta_[REAL_PART+IMAG_PART+1];
float *beta = &(beta_[0]);

int isize, isizeT;
float *T = 0;

int elemSize = sizeof(float);
int nnb, jstart,jend,jsize;
int is_ok;

int nmax;
const int bufsize =  1024*1024;
const int use_simple = FALSE;;

one[REAL_PART] = 1.0;
one[IMAG_PART] = 0.0;
zero[REAL_PART] = 0.0;
zero[IMAG_PART] = 0.0;

 if ((m <= 0) || (n <= 0)) {
   return;
 };

 T = 0; 

 ldA = descA[LLD_];
 ldB = descB[LLD_];

  icontxt = descA[CTXT_];
  Cblacs_gridinfo( icontxt, &nprow,&npcol, &myprow, &mypcol);
  assert( nprow >= 1);
  assert( npcol >= 1);
  assert( (0 <= myprow) && (myprow < nprow));
  assert( (0 <= mypcol) && (mypcol < npcol));

  is_ok = (1 <= ia) && (ia + m-1 <= descA[M_]);
  if (!is_ok) {
    printf("Cpsgecopy (%d,%d) :ia %d m %d descA[M_] %d  \n",
            myprow,mypcol,     ia,   m,   descA[M_] );
    printf("Cpsgecopy (%d,%d) :ja %d n %d descA[N_] %d \n",
            myprow,mypcol,     ja,   n,   descA[N_] );
    printf("Cpsgecopy (%d,%d) :ib %d jb %d descB[M_] %d descB[N_] %d\n",
            myprow,mypcol,     ib,   jb,   descB[M_],   descB[N_] );
  };
  assert( (1 <= ia) && (ia + m-1 <= descA[M_]));
  assert( (1 <= ja) && (ja + n-1 <= descA[N_]));
  assert( (1 <= ib) && (ib + m-1 <= descB[M_]));
  assert( (1 <= jb) && (jb + n-1 <= descB[N_]));


  is_same_context = (descA[CTXT_] == descB[CTXT_]);
  is_same_mb = (descA[MB_] == descB[MB_]);
  is_same_nb = (descA[NB_] == descB[NB_]);

  is_same_p = (Cindxg2p(ia,descA[MB_], myprow, descA[RSRC_],nprow) ==
               Cindxg2p(ib,descB[MB_], myprow, descB[RSRC_],nprow) );

  is_same_q = (Cindxg2p(ja,descA[NB_], mypcol, descA[CSRC_],npcol) ==
               Cindxg2p(jb,descB[NB_], mypcol, descB[CSRC_],npcol) );

  is_same_offset = (MOD(ia,descA[MB_]) == MOD(ib,descB[MB_])) &&
                   (MOD(ja,descA[NB_]) == MOD(jb,descB[NB_]));


  local_extent( m,n, ia,ja,descA, &LocpA,&LocqA, &lrA1,&lcA1, &lrA2,&lcA2 );


  local_extent( m,n, ib,jb,descB, &LocpB,&LocqB,&lrB1,&lcB1, &lrB2,&lcB2 );



  /*
  if ((LocpA >= 1) || (LocpB >= 1)) {
     is_same_Locp = (LocpA == LocpB);
  };
  if ((LocqA >= 1) || (LocqB >= 1)) {
     is_same_Locq = (LocqA == LocqB);
  };
  */

  is_same_Locq = (LocqA == LocqB);

  is_same_Locp = (LocpA == LocpB);

  is_aligned = is_same_context &&
               is_same_mb && is_same_nb &&
               is_same_p && is_same_q &&
               is_same_offset &&
               is_same_Locp && is_same_Locq;

  assert( is_same_q );

  assert( is_same_p );

  assert( is_same_offset );

  assert( is_same_Locp );

  assert( is_same_Locq );

  assert( is_aligned );


        
       /*
        no communication required
        copy from device to host
        */

       ldA = descA[LLD_];
       ldB = descB[LLD_];

       mm = LocpA;
       nn = LocqA;

       if (is_device_to_host) {
         /* 
          * transfer from device to host
          */
         if ( (mm >= 1) && (nn >= 1) ) {
#ifdef USE_CUBLASV2
           {
             cublasStatus_t istatus;
             istatus = cublasGetMatrixAsync(mm, nn, elemSize, 
                 (void *) dA(lrA1,lcA1), ldA, (void *) dB(lrB1,lcB1), ldB,
                 cublas_get_stream() );
             assert( istatus == CUBLAS_STATUS_SUCCESS );
           }
#else
           cu_status = cublasGetMatrix(mm,nn, elemSize,
               (void *) dA(lrA1,lcA1), ldA,  (void *) dB(lrB1,lcB1),ldB );
            CHKERR(cu_status);
#endif
           };
         }
       else {
         /* 
          * transfer from host to device
          */
         if ( (mm >= 1) && (nn >= 1) ) {
#ifdef USE_CUBLASV2
           {
             cublasStatus_t istatus;

             istatus = cublasSetMatrixAsync(mm,nn,elemSize,
               (void *) dA(lrA1,lcA1), ldA,  (void *) dB(lrB1,lcB1),ldB,
               cublas_get_stream()   );

             assert( istatus == CUBLAS_STATUS_SUCCESS );

           }
#else
            cu_status = cublasSetMatrix(mm,nn,elemSize,
               (void *) dA(lrA1,lcA1), ldA,  (void *) dB(lrB1,lcB1),ldB );
            CHKERR(cu_status);
#endif
           };
         };
               



  return;
}
Beispiel #11
0
void blasx_gpu_dgemm_kernel(int j,
                            int nrowa, int ncola,
                            int nrowb, int ncolb,
                            int nrowc, int ncolc,
                            int current_task, int prior_task,
                            enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
                            double* A, double* B, double* C,
                            int lda, int ldb, int ldc,
                            int x, int y, int z,
                            double** C_dev,
                            cudaStream_t *stream, cublasHandle_t *handle_p,
                            int current_stream,
                            double alpha, double beta, int block_dim,
                            int switcher, int* task_batch_counter,
                            LRU_t **LRUs, int GPUs,
                            int *mem_cpy_counter,
                            reader_tracker *addr_track,
                            int GPU_id)
{
    int nrowa_dev, nrowb_dev, nrowc_dev;
    int ncola_dev, ncolb_dev, ncolc_dev;
    int nrow_offset_a, nrow_offset_b;
    int ncol_offset_a, ncol_offset_b;
    int i = current_task/(y+1);
    int k = current_task%(y+1);
    double *A_dev, *B_dev;
    if (TransA != CblasNoTrans) {
        margin_adjustment(nrowa,ncola,block_dim,j,i,&nrowa_dev,&ncola_dev);
    }else{
        margin_adjustment(nrowa,ncola,block_dim,i,j,&nrowa_dev,&ncola_dev);
    }
    if (TransB != CblasNoTrans) {
        margin_adjustment(nrowb,ncolb,block_dim,k,j,&nrowb_dev,&ncolb_dev);
    }else{
        margin_adjustment(nrowb,ncolb,block_dim,j,k,&nrowb_dev,&ncolb_dev);
    }
    margin_adjustment(nrowc,ncolc,block_dim,i,k,&nrowc_dev,&ncolc_dev);
    if (TransA != CblasNoTrans) {
        nrow_offset_a = j*block_dim, ncol_offset_a = i*block_dim;
    }else{
        nrow_offset_a = i*block_dim, ncol_offset_a = j*block_dim;
    }
    if (TransB != CblasNoTrans) {
        nrow_offset_b = k*block_dim, ncol_offset_b = j*block_dim;
    }else{
        nrow_offset_b = j*block_dim, ncol_offset_b = k*block_dim;
    }
    double *starting_point_A = &A[nrow_offset_a+ncol_offset_a*lda];
    double *starting_point_B = &B[nrow_offset_b+ncol_offset_b*ldb];
    //Asynchonizing set matrix on GPU
    //----------------LRU&RBT optimization----------------//
    mem_control_kernel_double(starting_point_A, &A_dev, LRUs, GPUs, GPU_id, block_dim, mem_cpy_counter, addr_track, stream, nrowa_dev, ncola_dev, lda);
    mem_control_kernel_double(starting_point_B, &B_dev, LRUs, GPUs, GPU_id, block_dim, mem_cpy_counter, addr_track, stream, nrowb_dev, ncolb_dev, ldb);
    //----------------------------------------------------//
    
    if (j == 0) {
        margin_adjustment(nrowc,ncolc,block_dim,i,k,&nrowc_dev,&ncolc_dev);
        int nrow_offset_c = i*block_dim;
        int ncol_offset_c = k*block_dim;
        double *starting_point_C = &C[nrow_offset_c+ncol_offset_c*ldc];
        if (beta != 0) {
            assert( cublasSetMatrixAsync(nrowc_dev, ncolc_dev, sizeof(double), starting_point_C, ldc, C_dev[switcher*STREAMNUM+current_stream], block_dim, *stream) == CUBLAS_STATUS_SUCCESS );
        }
        if (*task_batch_counter != 0) {//Set matrix back
            int i_pre = prior_task/(y+1);
            int k_pre = prior_task%(y+1);
            int nrowc_dev_pre, ncolc_dev_pre;
            margin_adjustment(nrowc,ncolc,block_dim,i_pre,k_pre,&nrowc_dev_pre,&ncolc_dev_pre);
            int nrow_offset_c_pre = i_pre*block_dim;
            int ncol_offset_c_pre = k_pre*block_dim;
            double *starting_point_C_pre = &C[nrow_offset_c_pre+ncol_offset_c_pre*ldc];
            assert( cublasGetMatrixAsync(nrowc_dev_pre, ncolc_dev_pre, sizeof(double), C_dev[current_stream+(1-switcher)*STREAMNUM], block_dim, starting_point_C_pre, ldc,*stream) == CUBLAS_STATUS_SUCCESS);
        }
    }
    cudaStreamSynchronize(*stream);
    assert( cublasSetStream(*handle_p, *stream) == CUBLAS_STATUS_SUCCESS );

    double beta_inner = (j==0)?(beta):(1);
    int ka = (TransA != CblasNoTrans)?(nrowa_dev):(ncola_dev);
    cublasOperation_t transa, transb;
    CBLasTransToCuBlasTrans(TransA, &transa);
    CBLasTransToCuBlasTrans(TransB, &transb);
    cublasStatus_t status = cublasDgemm(*handle_p,
                                        transa, transb,
                                        nrowc_dev, ncolc_dev, ka,
                                        &alpha,
                                        A_dev, block_dim,
                                        B_dev, block_dim,
                                        &beta_inner,
                                        C_dev[switcher*STREAMNUM+current_stream], block_dim);
    assert( status == CUBLAS_STATUS_SUCCESS );
}