Example #1
//===================================================================================================================
//===================================================================================================================
//===================================================================================================================
extern "C" void
magma_dlarft_sm32x32_batched(magma_int_t n, magma_int_t k,
                    double **v_array, magma_int_t ldv,
                    double **tau_array, double **T_array, magma_int_t ldt,
                    magma_int_t batchCount, cublasHandle_t myhandle, magma_queue_t queue)
{
    // scalar coefficients for the cuBLAS call below
    // (assumed declarations; not shown in the original excerpt)
    const double one  = MAGMA_D_ONE;
    const double zero = MAGMA_D_ZERO;

    if( k <= 0) return;

     //==================================
     //          GEMV
     //==================================
#define USE_GEMV2
#define use_gemm_larft_sm32
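// use_gemm_larft_sm32: form T = V^H * V with one batched GEMM and then zero
// its lower triangle; otherwise build T column by column with batched GEMV
// kernels (USE_GEMV2 picks the row-wise GEMV variant inside that loop)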

#if defined(use_gemm_larft_sm32)
    //magmablas_dgemm_batched( MagmaConjTrans, MagmaNoTrans, k, k, n, MAGMA_D_ONE, v_array, ldv, v_array, ldv, MAGMA_D_ZERO, T_array, ldt, batchCount, queue);
    cublasDgemmBatched(myhandle, CUBLAS_OP_C, CUBLAS_OP_N, k, k, n,
                             &one, (const double**) v_array, ldv,
                                    (const double**) v_array, ldv,
                             &zero,  T_array, ldt, batchCount);

    magmablas_dlaset_batched(MagmaLower, k, k, MAGMA_D_ZERO, MAGMA_D_ZERO, T_array, ldt, batchCount, queue);
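    // the dlaset above clears the lower triangle (and diagonal) of T: only the
    // upper triangle of V^H * V is needed, and the dtrmv kernel at the end
    // rebuilds the final columns of T with tau(i) on the diagonal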
#else
    #if 1
    for(int i=0; i<k; i++)
    {
        //W(1:i-1) := - tau(i) * V(i:n,1:i-1)' * V(i:n,i)
        //T( i, i ) = tau( i ) 
        //custom implementation.
        #ifdef USE_GEMV2
        magmablas_dlarft_gemvrowwise_batched( n-i, i, 
                            tau_array,
                            v_array, ldv, 
                            T_array, ldt,
                            batchCount, queue);
                            
        #else       
        magmablas_dlarft_gemvcolwise_batched( n-i, i, v_array, ldv, T_array, ldt, tau_array, batchCount, queue);
        #endif
    }
    #else
        // this path seems to be very slow when k = 32, while the one-by-one loop above is faster
        dlarft_gemv_loop_inside_kernel_batched(n, k, tau_array, v_array, ldv, T_array, ldt, batchCount, queue); 
    #endif
#endif
     //==================================
     //          TRMV
     //==================================
     //T(1:i-1,i) := T(1:i-1,1:i-1) * W(1:i-1) i=[1:k]
     magmablas_dlarft_dtrmv_sm32x32_batched(k, k, tau_array, T_array, ldt, T_array, ldt, batchCount, queue);
}
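For reference, the batched GEMM in Example #1 forms, for every problem in the batch, the Gram matrix T = V^H * V with V of size n x k; for real doubles CUBLAS_OP_C behaves exactly like CUBLAS_OP_T. A minimal single-matrix sketch of that operation with plain cublasDgemm is shown below; the function name, handle, device pointers and leading dimensions are placeholders, not part of the MAGMA code.

// Hedged sketch (not MAGMA code): the per-matrix operation behind the batched
// call above, written with plain cublasDgemm for a single problem.
#include <cublas_v2.h>

// d_V is n x k with leading dimension ldv, d_T is k x k with leading
// dimension ldt; both are device pointers.
static cublasStatus_t form_T_gram(cublasHandle_t handle, int n, int k,
                                  const double *d_V, int ldv,
                                  double *d_T, int ldt)
{
    const double one = 1.0, zero = 0.0;
    // T(1:k,1:k) = V(1:n,1:k)^H * V(1:n,1:k); CUBLAS_OP_C == CUBLAS_OP_T
    // for real data.
    return cublasDgemm(handle, CUBLAS_OP_C, CUBLAS_OP_N,
                       k, k, n,
                       &one,  d_V, ldv,
                              d_V, ldv,
                       &zero, d_T, ldt);
}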
Example #2
template <>
void caffe_gpu_gemm_batched<double>(const CBLAS_TRANSPOSE TransA,
    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
    const double alpha, const double** A, const double** B, const double beta,
    double** C,
	int batch_count){
	
	// Note that cuBLAS follows Fortran (column-major) order.
	int lda = (TransA == CblasNoTrans) ? K : M;
	int ldb = (TransB == CblasNoTrans) ? N : K;
	cublasOperation_t cuTransA =
      (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
	cublasOperation_t cuTransB =
      (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
	
	CUBLAS_CHECK(cublasDgemmBatched(Caffe::get_current_cublas_handle(), cuTransB, cuTransA,
      N, M, K, &alpha, B, ldb, A, lda, &beta, C, N,
	  batch_count));
}
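The Caffe wrapper above hides the row-major vs. column-major mismatch: reading a row-major matrix as column-major yields its transpose, so C = A*B on row-major data is obtained by asking cuBLAS for C^T = B^T * A^T, which is why the operand order and the M/N dimensions are swapped in the call. A standalone sketch of the same trick for the no-transpose case (names are placeholders, not the Caffe API):

// Hedged sketch (not the Caffe API): batched double GEMM on row-major data
// using the operand-swap trick of the wrapper above (no-transpose case only).
#include <cublas_v2.h>

// d_Aarr, d_Barr, d_Carr are device arrays of device pointers to row-major
// M x K, K x N and M x N matrices, respectively.
static cublasStatus_t rowmajor_dgemm_batched(cublasHandle_t handle,
                                             int M, int N, int K,
                                             const double **d_Aarr,
                                             const double **d_Barr,
                                             double **d_Carr, int batch)
{
    const double alpha = 1.0, beta = 0.0;
    // A row-major matrix read as column-major is its transpose, so
    // C = A*B (row-major) is computed as C^T = B^T * A^T (column-major):
    return cublasDgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                              N, M, K,
                              &alpha, d_Barr, N,   // B^T in the column-major view
                                      d_Aarr, K,   // A^T in the column-major view
                              &beta,  d_Carr, N,   // C^T, i.e. row-major C
                              batch);
}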
Example #3
static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
                      size_t M, size_t N, size_t K, double alpha,
                      gpudata **A, size_t *offA, size_t lda,
                      gpudata **B, size_t *offB, size_t ldb,
                      double beta, gpudata **C, size_t *offC, size_t ldc,
                      size_t batchCount) {
  cuda_context *ctx;
  size_t *lt, t;
  gpudata **T;
  size_t i;
  cb_transpose transT;
  cublasStatus_t err;

  if (batchCount == 0) return GA_NO_ERROR;

  ASSERT_BUF(A[0]);
  ctx = A[0]->ctx;
  cuda_enter(ctx);

  if (order == cb_c) {
    /* swap A and B */
    t = N;
    N = M;
    M = t;
    T = A;
    A = B;
    B = T;
    t = lda;
    lda = ldb;
    ldb = t;
    transT = transA;
    transA = transB;
    transB = transT;
    lt = offA;
    offA = offB;
    offB = lt;
  }

  // use individual cublasDgemm calls rather than cublasDgemmBatched for large products
  const size_t threshold = 650;
  const int multiple_dispatch = M * N * K > threshold * threshold * threshold;
  if (multiple_dispatch) {
    for (i = 0; i < batchCount; i++) {
      ASSERT_BUF(A[i]);
      ASSERT_BUF(B[i]);
      ASSERT_BUF(C[i]);
      cuda_wait(A[i], CUDA_WAIT_READ);
      cuda_wait(B[i], CUDA_WAIT_READ);
      cuda_wait(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);

      err = cublasDgemm(((blas_handle *)ctx->blas_handle)->h,
                        convT(transA), convT(transB),
                        M, N, K, &alpha,
                        (double*)A[i]->ptr + offA[i], lda,
                        (double*)B[i]->ptr + offB[i], ldb,
                        &beta,
                        (double*)C[i]->ptr + offC[i], ldc);
      if (err != CUBLAS_STATUS_SUCCESS) {
        cuda_exit(ctx);
        if (err == CUBLAS_STATUS_ARCH_MISMATCH)
          return GA_DEVSUP_ERROR;
        return GA_BLAS_ERROR;
      }

      cuda_record(A[i], CUDA_WAIT_READ);
      cuda_record(B[i], CUDA_WAIT_READ);
      cuda_record(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
    }
  } else {
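    /* batched path: build the three host arrays of device pointers in one
       alloca block, then mirror them to the device with a single cuMemAlloc
       and cuMemcpyHtoD, because cublasDgemmBatched expects the pointer
       arrays themselves to live in device memory */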
    double **T_l = alloca(sizeof(double *) * batchCount * 3);
    const double **A_l = (const double **)T_l;
    const double **B_l = (const double **)T_l + batchCount;
    double **C_l = T_l + (batchCount * 2);
    CUdeviceptr Ta, Aa, Ba, Ca;

    for (i = 0; i < batchCount; i++) {
      ASSERT_BUF(A[i]);
      ASSERT_BUF(B[i]);
      ASSERT_BUF(C[i]);
      cuda_wait(A[i], CUDA_WAIT_READ);
      cuda_wait(B[i], CUDA_WAIT_READ);
      cuda_wait(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
      A_l[i] = ((double *)A[i]->ptr) + offA[i];
      B_l[i] = ((double *)B[i]->ptr) + offB[i];
      C_l[i] = ((double *)C[i]->ptr) + offC[i];
    }

    cuMemAlloc(&Ta, sizeof(double *) * batchCount * 3);
    Aa = Ta;
    Ba = Ta + (batchCount * sizeof(double *));
    Ca = Ta + (batchCount * sizeof(double *) * 2);

    cuMemcpyHtoD(Ta, T_l, sizeof(double *) * batchCount * 3);

    err = cublasDgemmBatched(((blas_handle *)ctx->blas_handle)->h,
                             convT(transA), convT(transB),
                             M, N, K, &alpha, (const double **)Aa, lda,
                             (const double **)Ba, ldb, &beta,
                             (double **)Ca, ldc, batchCount);
    cuMemFree(Ta);
    if (err != CUBLAS_STATUS_SUCCESS) {
      cuda_exit(ctx);
      if (err == CUBLAS_STATUS_ARCH_MISMATCH)
        return GA_DEVSUP_ERROR;
      return GA_BLAS_ERROR;
    }

    for (i = 0; i < batchCount; i++) {
      cuda_record(A[i], CUDA_WAIT_READ);
      cuda_record(B[i], CUDA_WAIT_READ);
      cuda_record(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
    }
  }

  cuda_exit(ctx);
  return GA_NO_ERROR;
}
Example #4
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

int main(int argc, char* argv[])
{
    int i,j,k,index;

    // Linear dimension of matrices
    int dim = 100;

    // Number of A,B,C matrix sets
    int batch_count = 1000;

    // Allocate host storage for batch_count A,B,C square matrices
    double **A, **B, **C;
    A = (double**)malloc(batch_count*sizeof(double*));
    B = (double**)malloc(batch_count*sizeof(double*));
    C = (double**)malloc(batch_count*sizeof(double*));
    for(i=0; i<batch_count; i++) {
        A[i] = (double*)malloc(dim*dim*sizeof(double));
        B[i] = (double*)malloc(dim*dim*sizeof(double));
        C[i] = (double*)malloc(dim*dim*sizeof(double));
    }

    // Create host pointer array to device matrix storage
    double **d_A, **d_B, **d_C, **h_d_A, **h_d_B, **h_d_C;
    h_d_A = (double**)malloc(batch_count*sizeof(double*));
    h_d_B = (double**)malloc(batch_count*sizeof(double*));
    h_d_C = (double**)malloc(batch_count*sizeof(double*));

    for(i=0; i<batch_count; i++) {
        cudaMalloc((void**)&h_d_A[i], dim*dim*sizeof(double));
        cudaMalloc((void**)&h_d_B[i], dim*dim*sizeof(double));
        cudaMalloc((void**)&h_d_C[i], dim*dim*sizeof(double));
    }
    // Copy the host array of device pointers to the device
    cudaMalloc((void**)&d_A, batch_count*sizeof(double*));
    cudaMalloc((void**)&d_B, batch_count*sizeof(double*));
    cudaMalloc((void**)&d_C, batch_count*sizeof(double*));
    cudaMemcpy(d_A, h_d_A, batch_count*sizeof(double*), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_d_B, batch_count*sizeof(double*), cudaMemcpyHostToDevice);
    cudaMemcpy(d_C, h_d_C, batch_count*sizeof(double*), cudaMemcpyHostToDevice);
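    // cublasDgemmBatched requires the arrays of matrix pointers (d_A, d_B, d_C)
    // to reside in device memory; the host-side copies (h_d_A, h_d_B, h_d_C)
    // are kept so the individual device matrices can still be addressed from
    // the host by cublasSetMatrix/cublasGetMatrix below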

    // Fill the A diagonal with k*sin(index), the B diagonal with sin(index),
    // and the C diagonal with k*cos(index)^2
    // Matrices are arranged column major
    for(k=0; k<batch_count; k++) {
        for(j=0; j<dim; j++) {
            for(i=0; i<dim; i++) {
                index = j*dim + i;
                if(i==j) {
                    (A[k])[index] = k*sin(index);
                    (B[k])[index] = sin(index);
                    (C[k])[index] = k*cos(index)*cos(index);
                }
                else {
                    (A[k])[index] = 0.0;
                    (B[k])[index] = 0.0;
                    (C[k])[index] = 0.0;
                }
            } // i   
        } // j
    } // k

    // Create cublas instance
    cublasHandle_t handle;
    cublasCreate(&handle);

    // Set input matrices on device
    for(i=0; i<batch_count; i++) {
        cublasSetMatrix(dim, dim, sizeof(double), A[i], dim, h_d_A[i], dim);
        cublasSetMatrix(dim, dim, sizeof(double), B[i], dim, h_d_B[i], dim);
        cublasSetMatrix(dim, dim, sizeof(double), C[i], dim, h_d_C[i], dim);
    }

    // Set matrix coefficients
    double alpha = 1.0;
    double beta  = 1.0;

    // DGEMM: C = alpha*A*B + beta*C
    cublasDgemmBatched(handle,
                       CUBLAS_OP_N, CUBLAS_OP_N,
                       dim, dim, dim,
                       &alpha,
                       (const double**)d_A, dim,
                       (const double**)d_B, dim,
                       &beta,
                       d_C, dim,
                       batch_count);
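    // One call performs all batch_count GEMMs on the device; it runs on the
    // handle's stream (the default stream here), so the blocking
    // cublasGetMatrix copies below see the finished results.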

    // Retrieve result matrix from device
    for(i=0; i<batch_count; i++)
        cublasGetMatrix(dim, dim, sizeof(double), h_d_C[i], dim, C[i], dim);

    // Simple sanity test, sum up all elements
    double sum = 0;
    for(k=0; k<batch_count; k++) {
        for(j=0; j<dim; j++) {
            for(i=0; i<dim; i++) {
                index = j*dim + i;
                sum += (C[k])[index];
            }
        }
    }
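    // Each diagonal entry of C becomes k*sin(index)^2 + k*cos(index)^2 = k,
    // so matrix k contributes dim*k and the total is
    // dim * (0 + 1 + ... + (batch_count-1)) = dim*batch_count*(batch_count-1)/2.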
    printf("Element sum is: %f, should be: %d\n", sum, dim*(batch_count-1)*(batch_count)/2);   

    // Clean up resources

    for(i=0; i<batch_count; i++) {
        free(A[i]);
        free(B[i]);
        free(C[i]);
        cudaFree(h_d_A[i]);
        cudaFree(h_d_B[i]);
        cudaFree(h_d_C[i]);
    }

    free(A);
    free(B);
    free(C);
    free(h_d_A);
    free(h_d_B);
    free(h_d_C);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    cublasDestroy(handle);

    return 0;
}
Example #5
extern "C" magma_int_t
magma_dlarft_batched(magma_int_t n, magma_int_t k, magma_int_t stair_T, 
                double **v_array, magma_int_t ldv,
                double **tau_array, double **T_array, magma_int_t ldt, 
                double **work_array, magma_int_t lwork, magma_int_t batchCount, cublasHandle_t myhandle, 
                magma_queue_t queue)
{
    if( k <= 0) return 0;
    if( stair_T > 0 && k <= stair_T) return 0;

    magma_int_t maxnb = max_shared_bsiz;
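    // max_shared_bsiz is a MAGMA-internal constant (presumably the largest
    // block size the shared-memory sm32x32 kernels can handle)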

    if( lwork < k*ldt) 
    {
        magma_xerbla( __func__, -(10) );
        return -10;
    }

    if( stair_T > 0 && stair_T > maxnb)
    { 
        magma_xerbla( __func__, -(3) );
        return -3;
    }
    magma_int_t DEBUG=0;
    magma_int_t nb = stair_T == 0 ? min(k,maxnb) : stair_T;

    // scalar coefficients for the GEMM calls below
    // (assumed declarations; not shown in the original excerpt)
    double one  = MAGMA_D_ONE;
    double zero = MAGMA_D_ZERO;

    magma_int_t i, j, prev_n, mycol, rows;

    double **dW1_displ  = NULL;
    double **dW2_displ  = NULL;
    double **dW3_displ  = NULL;
    double **dTstep_array  = NULL;

    magma_malloc((void**)&dW1_displ,  batchCount * sizeof(*dW1_displ));
    magma_malloc((void**)&dW2_displ,  batchCount * sizeof(*dW2_displ));
    magma_malloc((void**)&dW3_displ,  batchCount * sizeof(*dW3_displ));
    magma_malloc((void**)&dTstep_array,  batchCount * sizeof(*dTstep_array));
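    // dW1/dW2/dW3_displ and dTstep_array are device arrays of per-matrix
    // pointers; magma_ddisplace_pointers below fills each entry with an
    // offset view (row, column) into the corresponding matrix of the batch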

    //double *Tstep =  k > nb ? work : T;
    if(k > nb)
    {
        magma_ddisplace_pointers(dTstep_array, work_array, lwork, 0, 0, batchCount, queue);
    }
    else
    {
        magma_ddisplace_pointers(dTstep_array, T_array, ldt, 0, 0, batchCount, queue);
    }

    //magma_int_t ldtstep = k > nb ? k : ldt;
    magma_int_t ldtstep = ldt; // to be removed
    // stair_T == 0 means compute all of T
    // stair_T > 0 means the triangular diagonal blocks of T have already been
    //             computed; stair_T is the block size (nb) of those triangles
   

    // GEMM: compute the whole upper triangular portion of T (phase 1)
    // TODO: add cuBLAS to check perf

#ifdef RFT_MAG_GEM
    magmablas_dgemm_batched( MagmaConjTrans, MagmaNoTrans, 
            k, k, n, 
            one,  v_array, ldv, 
                  v_array, ldv, 
            zero, dTstep_array, ldtstep, 
            batchCount, queue);
#else
    cublasDgemmBatched(myhandle, CUBLAS_OP_C, CUBLAS_OP_N, k, k, n,
                             &one, (const double**) v_array, ldv,
                                    (const double**) v_array, ldv,
                             &zero, dTstep_array, ldtstep, batchCount);
#endif

    magmablas_dlaset_batched(MagmaLower, k, k, MAGMA_D_ZERO, MAGMA_D_ZERO, dTstep_array, ldtstep, batchCount, queue);
    // no need for it as T is expected to be lower zero
    //if(k > nb) magmablas_dlaset_batched(MagmaLower, k, k, MAGMA_D_ZERO, MAGMA_D_ZERO, dTstep_array, ldtstep, batchCount);
    

    // TRMV
    // T(1:i-1,i) := T(1:i-1,1:i-1) * W(1:i-1) for i = [1:k]
    // TRMV is split over blocks of columns of size nb.
    // The update has to be done from top to bottom, so:
    // 1- a GEMM using the previously computed columns of T updates the
    //    rectangular upper portion above the triangle of the current columns
    // 2- the current columns are then updated by a serial loop of GEMVs over
    //    themselves; since shared memory is limited to nb, these nb columns
    //    are split vertically into chunks of nb rows

    dim3 grid(1, 1, batchCount);

    for(j=0; j<k; j+=nb)
    {
        prev_n =  j;
        mycol  =  min(nb, k-j);
        // note that myrow = prev_n + mycol;
        if(prev_n>0 && mycol>0){

            if(DEBUG==3) printf("doing gemm on the rectangular portion of size %d %d of T(%d,%d)\n",prev_n,mycol,0,j);

            magma_ddisplace_pointers(dW1_displ, dTstep_array, ldtstep, 0, j, batchCount, queue);
            magma_ddisplace_pointers(dW2_displ, T_array,     ldt, 0, j, batchCount, queue);
#ifdef RFT_MAG_GEM
            magmablas_dgemm_batched( MagmaNoTrans, MagmaNoTrans, 
                    prev_n, mycol, prev_n, 
                    one,  T_array, ldt, 
                          dW1_displ, ldtstep, 
                    zero, dW2_displ, ldt, 
                    batchCount, queue );
#else
            cublasDgemmBatched(myhandle, CUBLAS_OP_N, CUBLAS_OP_N, 
                    prev_n, mycol, prev_n,
                    &one, (const double**) T_array, ldt,
                          (const double**) dW1_displ, ldtstep,
                    &zero, dW2_displ, ldt, batchCount);
#endif

            // update my rectangular portion (prev_n,mycol) using sequence of gemv 
            magma_ddisplace_pointers(dW1_displ, dTstep_array, ldtstep, j, j, batchCount, queue);
            magma_ddisplace_pointers(dW3_displ, tau_array,  1, j, 0, batchCount, queue);

            for(i=0; i<prev_n; i+=nb)
            {
                rows = min(nb,prev_n-i);
                if(DEBUG==3) printf("        doing recdtrmv on the rectangular portion of size %d %d of T(%d,%d)\n",rows,mycol,i,j);

                if(rows>0 && mycol>0)
                {
                    magma_ddisplace_pointers(dW2_displ, T_array,     ldt, i, j, batchCount, queue);
                    magmablas_dlarft_recdtrmv_sm32x32_batched(rows, mycol, dW3_displ, dW2_displ, ldt, dW1_displ, ldtstep, batchCount, queue);
                }
            }
        }

        // the upper rectangular portion is updated; now, if needed, update the triangular portion
        if(stair_T == 0){
            if(DEBUG==3) printf("doing dtrmv on the triangular portion of size %d %d of T(%d,%d)\n",mycol,mycol,j,j);

            if(mycol>0)
            {
                magma_ddisplace_pointers(dW1_displ, dTstep_array, ldtstep, j, j, batchCount, queue);
                magma_ddisplace_pointers(dW3_displ, tau_array,  1, j, 0, batchCount, queue);
                magma_ddisplace_pointers(dW2_displ, T_array,     ldt, j, j, batchCount, queue);
                magmablas_dlarft_dtrmv_sm32x32_batched(mycol, mycol, dW3_displ, dW1_displ, ldtstep, dW2_displ, ldt, batchCount, queue);

            }
        }
    }// end of j

    magma_free(dW1_displ);
    magma_free(dW2_displ);
    magma_free(dW3_displ);
    magma_free(dTstep_array);

    return 0;
}