//===================================================================================================================
//===================================================================================================================
//===================================================================================================================
// file-scope constants assumed by the cuBLAS calls below
static double one  = MAGMA_D_ONE;
static double zero = MAGMA_D_ZERO;

extern "C" void
magma_dlarft_sm32x32_batched(magma_int_t n, magma_int_t k,
                             double **v_array, magma_int_t ldv,
                             double **tau_array,
                             double **T_array, magma_int_t ldt,
                             magma_int_t batchCount,
                             cublasHandle_t myhandle, magma_queue_t queue)
{
    if ( k <= 0) return;

    //==================================
    //          GEMV
    //==================================
    #define USE_GEMV2
    #define use_gemm_larft_sm32

    #if defined(use_gemm_larft_sm32)
    //magmablas_dgemm_batched( MagmaConjTrans, MagmaNoTrans,
    //                         k, k, n,
    //                         MAGMA_D_ONE,  v_array, ldv,
    //                                       v_array, ldv,
    //                         MAGMA_D_ZERO, T_array, ldt,
    //                         batchCount, queue);
    cublasDgemmBatched(myhandle, CUBLAS_OP_C, CUBLAS_OP_N,
                       k, k, n,
                       &one,  (const double**) v_array, ldv,
                              (const double**) v_array, ldv,
                       &zero, T_array, ldt,
                       batchCount);

    magmablas_dlaset_batched(MagmaLower, k, k,
                             MAGMA_D_ZERO, MAGMA_D_ZERO,
                             T_array, ldt, batchCount, queue);
    #else
    #if 1
    for (int i=0; i<k; i++)
    {
        // W(1:i-1) := - tau(i) * V(i:n,1:i-1)' * V(i:n,i)
        // T( i, i ) = tau( i )
        // custom implementation
        #ifdef USE_GEMV2
        magmablas_dlarft_gemvrowwise_batched( n-i, i,
                                              tau_array,
                                              v_array, ldv,
                                              T_array, ldt,
                                              batchCount, queue);
        #else
        magmablas_dlarft_gemvcolwise_batched( n-i, i,
                                              v_array, ldv,
                                              T_array, ldt,
                                              tau_array,
                                              batchCount, queue);
        #endif
    }
    #else
    // seems to be very slow when k = 32, while the one-by-one loop above is faster
    dlarft_gemv_loop_inside_kernel_batched(n, k,
                                           tau_array,
                                           v_array, ldv,
                                           T_array, ldt,
                                           batchCount, queue);
    #endif
    #endif

    //==================================
    //          TRMV
    //==================================
    // T(1:i-1,i) := T(1:i-1,1:i-1) * W(1:i-1)   i = [1:k]
    magmablas_dlarft_dtrmv_sm32x32_batched(k, k,
                                           tau_array,
                                           T_array, ldt,
                                           T_array, ldt,
                                           batchCount, queue);
}
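//-------------------------------------------------------------------------------------------------------------------
// A minimal, self-contained sketch of the GEMM phase above in plain cuBLAS, without MAGMA: it
// forms the Gram matrix T := V^T * V for a batch of n-by-k panels. The name gram_batched and the
// d_V/d_T parameters are illustrative, not MAGMA or cuBLAS API. For real doubles, CUBLAS_OP_C
// (conjugate transpose) is equivalent to CUBLAS_OP_T.
#include <cublas_v2.h>

static cublasStatus_t
gram_batched(cublasHandle_t handle, int n, int k,
             const double **d_V, int ldv,   // device array of device pointers
             double **d_T, int ldt,
             int batchCount)
{
    // host-side scalars: valid under the default CUBLAS_POINTER_MODE_HOST
    const double one = 1.0, zero = 0.0;
    return cublasDgemmBatched(handle, CUBLAS_OP_T, CUBLAS_OP_N,
                              k, k, n,
                              &one,  d_V, ldv,
                                     d_V, ldv,
                              &zero, d_T, ldt,
                              batchCount);
}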
template <>
void caffe_gpu_gemm_batched<double>(const CBLAS_TRANSPOSE TransA,
                                    const CBLAS_TRANSPOSE TransB,
                                    const int M, const int N, const int K,
                                    const double alpha, const double** A, const double** B,
                                    const double beta, double** C, int batch_count) {
  // Note that cublas follows fortran (column-major) order,
  // so we compute C^T = B^T * A^T by swapping the operands.
  int lda = (TransA == CblasNoTrans) ? K : M;
  int ldb = (TransB == CblasNoTrans) ? N : K;
  cublasOperation_t cuTransA =
      (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  cublasOperation_t cuTransB =
      (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  CUBLAS_CHECK(cublasDgemmBatched(Caffe::get_current_cublas_handle(),
                                  cuTransB, cuTransA,
                                  N, M, K,
                                  &alpha, B, ldb, A, lda,
                                  &beta, C, N, batch_count));
}
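//-------------------------------------------------------------------------------------------------------------------
// Why the operand swap works: cuBLAS is column-major, so a row-major M-by-N matrix is, bit for
// bit, the column-major N-by-M transpose. Computing C^T = B^T * A^T in column-major therefore
// leaves row-major C = A * B in memory. A hypothetical single-matrix illustration (not Caffe API):
#include <cublas_v2.h>

static void gemm_rowmajor(cublasHandle_t h, int M, int N, int K,
                          const double *A,   // row-major M x K
                          const double *B,   // row-major K x N
                          double *C)         // row-major M x N
{
    const double alpha = 1.0, beta = 0.0;
    // Column-major view: C^T (N x M) = B^T (N x K) * A^T (K x M)
    cublasDgemm(h, CUBLAS_OP_N, CUBLAS_OP_N,
                N, M, K,
                &alpha, B, N,
                        A, K,
                &beta,  C, N);
}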
static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
                      size_t M, size_t N, size_t K,
                      double alpha, gpudata **A, size_t *offA, size_t lda,
                      gpudata **B, size_t *offB, size_t ldb,
                      double beta, gpudata **C, size_t *offC, size_t ldc,
                      size_t batchCount) {
  cuda_context *ctx;
  size_t *lt, t;
  gpudata **T;
  size_t i;
  cb_transpose transT;
  cublasStatus_t err;

  if (batchCount == 0) return GA_NO_ERROR;

  ASSERT_BUF(A[0]);
  ctx = A[0]->ctx;
  cuda_enter(ctx);

  if (order == cb_c) {
    /* swap A and B */
    t = N;
    N = M;
    M = t;
    T = A;
    A = B;
    B = T;
    t = lda;
    lda = ldb;
    ldb = t;
    transT = transA;
    transA = transB;
    transB = transT;
    lt = offA;
    offA = offB;
    offB = lt;
  }

  /* Use parallel cublasDgemm calls rather than cublasDgemmBatched for large
   * products; the batched API is aimed at many small matrices. */
  const size_t threshold = 650;
  const int multiple_dispatch = M * N * K > threshold * threshold * threshold;
  if (multiple_dispatch) {
    for (i = 0; i < batchCount; i++) {
      ASSERT_BUF(A[i]);
      ASSERT_BUF(B[i]);
      ASSERT_BUF(C[i]);
      cuda_wait(A[i], CUDA_WAIT_READ);
      cuda_wait(B[i], CUDA_WAIT_READ);
      cuda_wait(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);

      err = cublasDgemm(((blas_handle *)ctx->blas_handle)->h,
                        convT(transA), convT(transB),
                        M, N, K, &alpha,
                        (double*)A[i]->ptr + offA[i], lda,
                        (double*)B[i]->ptr + offB[i], ldb,
                        &beta,
                        (double*)C[i]->ptr + offC[i], ldc);
      if (err != CUBLAS_STATUS_SUCCESS) {
        cuda_exit(ctx);
        if (err == CUBLAS_STATUS_ARCH_MISMATCH)
          return GA_DEVSUP_ERROR;
        return GA_BLAS_ERROR;
      }

      cuda_record(A[i], CUDA_WAIT_READ);
      cuda_record(B[i], CUDA_WAIT_READ);
      cuda_record(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
    }
  } else {
    /* Stage the three per-matrix pointer arrays on the host, then copy them
     * to a single device allocation for cublasDgemmBatched. */
    double **T_l = alloca(sizeof(double *) * batchCount * 3);
    const double **A_l = (const double **)T_l;
    const double **B_l = (const double **)T_l + batchCount;
    double **C_l = T_l + (batchCount * 2);
    CUdeviceptr Ta, Aa, Ba, Ca;

    for (i = 0; i < batchCount; i++) {
      ASSERT_BUF(A[i]);
      ASSERT_BUF(B[i]);
      ASSERT_BUF(C[i]);
      cuda_wait(A[i], CUDA_WAIT_READ);
      cuda_wait(B[i], CUDA_WAIT_READ);
      cuda_wait(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
      A_l[i] = ((double *)A[i]->ptr) + offA[i];
      B_l[i] = ((double *)B[i]->ptr) + offB[i];
      C_l[i] = ((double *)C[i]->ptr) + offC[i];
    }

    cuMemAlloc(&Ta, sizeof(double *) * batchCount * 3);
    Aa = Ta;
    Ba = Ta + (batchCount * sizeof(double *));
    Ca = Ta + (batchCount * sizeof(double *) * 2);
    cuMemcpyHtoD(Ta, T_l, sizeof(double *) * batchCount * 3);

    err = cublasDgemmBatched(((blas_handle *)ctx->blas_handle)->h,
                             convT(transA), convT(transB),
                             M, N, K, &alpha,
                             (const double **)Aa, lda,
                             (const double **)Ba, ldb,
                             &beta,
                             (double **)Ca, ldc,
                             batchCount);
    cuMemFree(Ta);
    if (err != CUBLAS_STATUS_SUCCESS) {
      cuda_exit(ctx);
      if (err == CUBLAS_STATUS_ARCH_MISMATCH)
        return GA_DEVSUP_ERROR;
      return GA_BLAS_ERROR;
    }

    for (i = 0; i < batchCount; i++) {
      cuda_record(A[i], CUDA_WAIT_READ);
      cuda_record(B[i], CUDA_WAIT_READ);
      cuda_record(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
    }
  }
  cuda_exit(ctx);
  return GA_NO_ERROR;
}
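//-------------------------------------------------------------------------------------------------------------------
// When every operand sits in one contiguous allocation with a fixed distance between consecutive
// matrices, the pointer-array staging above (host alloca + cuMemAlloc + cuMemcpyHtoD) can be
// skipped entirely using the strided variant of the batched API (available since cuBLAS 8.0).
// A sketch under that even-stride assumption; the wrapper name and parameters are illustrative:
#include <cublas_v2.h>

static cublasStatus_t
dgemm_strided(cublasHandle_t h, int M, int N, int K,
              const double *A, int lda, long long strideA,
              const double *B, int ldb, long long strideB,
              double *C, int ldc, long long strideC,
              int batchCount)
{
    const double alpha = 1.0, beta = 0.0;
    // one launch, no per-matrix pointer arrays to build or copy
    return cublasDgemmStridedBatched(h, CUBLAS_OP_N, CUBLAS_OP_N,
                                     M, N, K, &alpha,
                                     A, lda, strideA,
                                     B, ldb, strideB,
                                     &beta,
                                     C, ldc, strideC,
                                     batchCount);
}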
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

int main(int argc, char* argv[])
{
    int i, j, k, index;

    // Linear dimension of matrices
    int dim = 100;

    // Number of A,B,C matrix sets
    int batch_count = 1000;

    // Allocate host storage for batch_count A,B,C square matrices
    double **A, **B, **C;
    A = (double**)malloc(batch_count*sizeof(double*));
    B = (double**)malloc(batch_count*sizeof(double*));
    C = (double**)malloc(batch_count*sizeof(double*));
    for (i=0; i<batch_count; i++) {
        A[i] = (double*)malloc(dim*dim*sizeof(double));
        B[i] = (double*)malloc(dim*dim*sizeof(double));
        C[i] = (double*)malloc(dim*dim*sizeof(double));
    }

    // Create host pointer array to device matrix storage
    double **d_A, **d_B, **d_C, **h_d_A, **h_d_B, **h_d_C;
    h_d_A = (double**)malloc(batch_count*sizeof(double*));
    h_d_B = (double**)malloc(batch_count*sizeof(double*));
    h_d_C = (double**)malloc(batch_count*sizeof(double*));
    for (i=0; i<batch_count; i++) {
        cudaMalloc((void**)&h_d_A[i], dim*dim*sizeof(double));
        cudaMalloc((void**)&h_d_B[i], dim*dim*sizeof(double));
        cudaMalloc((void**)&h_d_C[i], dim*dim*sizeof(double));
    }

    // Copy the host array of device pointers to the device
    cudaMalloc((void**)&d_A, batch_count*sizeof(double*));
    cudaMalloc((void**)&d_B, batch_count*sizeof(double*));
    cudaMalloc((void**)&d_C, batch_count*sizeof(double*));
    cudaMemcpy(d_A, h_d_A, batch_count*sizeof(double*), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_d_B, batch_count*sizeof(double*), cudaMemcpyHostToDevice);
    cudaMemcpy(d_C, h_d_C, batch_count*sizeof(double*), cudaMemcpyHostToDevice);

    // Fill the A diagonals with k*sin(i), the B diagonals with sin(i),
    // and the C diagonals with k*cos(i)^2. Matrices are arranged column major.
    for (k=0; k<batch_count; k++) {
        for (j=0; j<dim; j++) {
            for (i=0; i<dim; i++) {
                index = j*dim + i;
                if (i==j) {
                    (A[k])[index] = k*sin(index);
                    (B[k])[index] = sin(index);
                    (C[k])[index] = k*cos(index)*cos(index);
                }
                else {
                    (A[k])[index] = 0.0;
                    (B[k])[index] = 0.0;
                    (C[k])[index] = 0.0;
                }
            } // i
        } // j
    } // k

    // Create cublas instance
    cublasHandle_t handle;
    cublasCreate(&handle);

    // Set input matrices on device
    for (i=0; i<batch_count; i++) {
        cublasSetMatrix(dim, dim, sizeof(double), A[i], dim, h_d_A[i], dim);
        cublasSetMatrix(dim, dim, sizeof(double), B[i], dim, h_d_B[i], dim);
        cublasSetMatrix(dim, dim, sizeof(double), C[i], dim, h_d_C[i], dim);
    }

    // Set matrix coefficients
    double alpha = 1.0;
    double beta  = 1.0;

    // DGEMM: C = alpha*A*B + beta*C
    cublasDgemmBatched(handle,
                       CUBLAS_OP_N, CUBLAS_OP_N,
                       dim, dim, dim,
                       &alpha,
                       (const double**)d_A, dim,
                       (const double**)d_B, dim,
                       &beta,
                       d_C, dim,
                       batch_count);

    // Retrieve result matrix from device
    for (i=0; i<batch_count; i++)
        cublasGetMatrix(dim, dim, sizeof(double), h_d_C[i], dim, C[i], dim);

    // Simple sanity test: each diagonal entry of matrix k becomes
    // k*sin^2(i) + k*cos^2(i) = k, so the total over the batch is
    // dim * (0 + 1 + ... + batch_count-1).
    double sum = 0;
    for (k=0; k<batch_count; k++) {
        for (j=0; j<dim; j++) {
            for (i=0; i<dim; i++) {
                index = j*dim + i;
                sum += (C[k])[index];
            }
        }
    }
    printf("Element sum is: %f, should be: %d\n", sum,
           dim*(batch_count-1)*(batch_count)/2);

    // Clean up resources
    for (i=0; i<batch_count; i++) {
        free(A[i]);
        free(B[i]);
        free(C[i]);
        cudaFree(h_d_A[i]);
        cudaFree(h_d_B[i]);
        cudaFree(h_d_C[i]);
    }
    free(A);
    free(B);
    free(C);
    free(h_d_A);
    free(h_d_B);
    free(h_d_C);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    cublasDestroy(handle);

    return 0;
}
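//-------------------------------------------------------------------------------------------------------------------
// The example above ignores all CUDA/cuBLAS return statuses for brevity; in practice each call
// should be checked. A minimal checking macro (the name CUBLAS_TRY is illustrative, not a cuBLAS
// API), usable as e.g. CUBLAS_TRY(cublasDgemmBatched(handle, ...)):
#include <stdio.h>
#include <stdlib.h>
#include <cublas_v2.h>

#define CUBLAS_TRY(call)                                          \
    do {                                                          \
        cublasStatus_t s_ = (call);                               \
        if (s_ != CUBLAS_STATUS_SUCCESS) {                        \
            fprintf(stderr, "cuBLAS error %d at %s:%d\n",         \
                    (int)s_, __FILE__, __LINE__);                 \
            exit(EXIT_FAILURE);                                   \
        }                                                         \
    } while (0)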
extern "C" magma_int_t magma_dlarft_batched(magma_int_t n, magma_int_t k, magma_int_t stair_T, double **v_array, magma_int_t ldv, double **tau_array, double **T_array, magma_int_t ldt, double **work_array, magma_int_t lwork, magma_int_t batchCount, cublasHandle_t myhandle, magma_queue_t queue) { if( k <= 0) return 0; if( stair_T > 0 && k <= stair_T) return 0; magma_int_t maxnb = max_shared_bsiz; if( lwork < k*ldt) { magma_xerbla( __func__, -(10) ); return -10; } if( stair_T > 0 && stair_T > maxnb) { magma_xerbla( __func__, -(3) ); return -3; } magma_int_t DEBUG=0; magma_int_t nb = stair_T == 0 ? min(k,maxnb) : stair_T; magma_int_t i, j, prev_n, mycol, rows; double **dW1_displ = NULL; double **dW2_displ = NULL; double **dW3_displ = NULL; double **dTstep_array = NULL; magma_malloc((void**)&dW1_displ, batchCount * sizeof(*dW1_displ)); magma_malloc((void**)&dW2_displ, batchCount * sizeof(*dW2_displ)); magma_malloc((void**)&dW3_displ, batchCount * sizeof(*dW3_displ)); magma_malloc((void**)&dTstep_array, batchCount * sizeof(*dTstep_array)); //double *Tstep = k > nb ? work : T; if(k > nb) { magma_ddisplace_pointers(dTstep_array, work_array, lwork, 0, 0, batchCount, queue); } else { magma_ddisplace_pointers(dTstep_array, T_array, ldt, 0, 0, batchCount, queue); } //magma_int_t ldtstep = k > nb ? k : ldt; magma_int_t ldtstep = ldt; //a enlever // stair_T = 0 meaning all T // stair_T > 0 meaning the triangular portion of T has been computed. // the value of stair_T is the nb of these triangulars //GEMV compute the whole triangular upper portion of T (phase 1) // TODO addcublas to check perf #ifdef RFT_MAG_GEM magmablas_dgemm_batched( MagmaConjTrans, MagmaNoTrans, k, k, n, one, v_array, ldv, v_array, ldv, zero, dTstep_array, ldtstep, batchCount, queue); #else cublasDgemmBatched(myhandle, CUBLAS_OP_C, CUBLAS_OP_N, k, k, n, &one, (const double**) v_array, ldv, (const double**) v_array, ldv, &zero, dTstep_array, ldtstep, batchCount); #endif magmablas_dlaset_batched(MagmaLower, k, k, MAGMA_D_ZERO, MAGMA_D_ZERO, dTstep_array, ldtstep, batchCount, queue); // no need for it as T is expected to be lower zero //if(k > nb) magmablas_dlaset_batched(MagmaLower, k, k, MAGMA_D_ZERO, MAGMA_D_ZERO, dTstep_array, ldtstep, batchCount); //TRMV //T(1:i-1,i) := T(1:i-1,1:i-1) * W(1:i-1) i=[1:k] // TRMV is split over block of column of size nb // the update should be done from top to bottom so: // 1- a gemm using the previous computed columns // of T to update rectangular upper protion above // the triangle of my columns // 2- the columns need to be updated by a serial // loop over of gemv over itself. 
    //      since we limit the shared memory to nb, these nb columns are split
    //      vertically into chunks of nb rows.
    dim3 grid(1, 1, batchCount);

    for (j=0; j<k; j+=nb) {
        prev_n = j;
        mycol  = min(nb, k-j);
        // note that myrow = prev_n + mycol;
        if (prev_n > 0 && mycol > 0) {
            if (DEBUG == 3)
                printf("doing gemm on the rectangular portion of size %d %d of T(%d,%d)\n",
                       prev_n, mycol, 0, j);

            magma_ddisplace_pointers(dW1_displ, dTstep_array, ldtstep, 0, j, batchCount, queue);
            magma_ddisplace_pointers(dW2_displ, T_array,      ldt,     0, j, batchCount, queue);

            #ifdef RFT_MAG_GEM
            magmablas_dgemm_batched( MagmaNoTrans, MagmaNoTrans,
                                     prev_n, mycol, prev_n,
                                     one,  T_array, ldt,
                                           dW1_displ, ldtstep,
                                     zero, dW2_displ, ldt,
                                     batchCount, queue );
            #else
            cublasDgemmBatched(myhandle, CUBLAS_OP_N, CUBLAS_OP_N,
                               prev_n, mycol, prev_n,
                               &one,  (const double**) T_array, ldt,
                                      (const double**) dW1_displ, ldtstep,
                               &zero, dW2_displ, ldt,
                               batchCount);
            #endif

            // update my rectangular portion (prev_n, mycol) using a sequence of gemv
            magma_ddisplace_pointers(dW1_displ, dTstep_array, ldtstep, j, j, batchCount, queue);
            magma_ddisplace_pointers(dW3_displ, tau_array,    1,       j, 0, batchCount, queue);

            for (i=0; i<prev_n; i+=nb) {
                rows = min(nb, prev_n-i);
                if (DEBUG == 3)
                    printf("    doing recdtrmv on the rectangular portion of size %d %d of T(%d,%d)\n",
                           rows, mycol, i, j);

                if (rows > 0 && mycol > 0) {
                    magma_ddisplace_pointers(dW2_displ, T_array, ldt, i, j, batchCount, queue);
                    magmablas_dlarft_recdtrmv_sm32x32_batched(rows, mycol,
                                                              dW3_displ,
                                                              dW2_displ, ldt,
                                                              dW1_displ, ldtstep,
                                                              batchCount, queue);
                }
            }
        }

        // the upper rectangular portion is updated; now, if needed, update the triangular portion
        if (stair_T == 0) {
            if (DEBUG == 3)
                printf("doing dtrmv on the triangular portion of size %d %d of T(%d,%d)\n",
                       mycol, mycol, j, j);

            if (mycol > 0) {
                magma_ddisplace_pointers(dW1_displ, dTstep_array, ldtstep, j, j, batchCount, queue);
                magma_ddisplace_pointers(dW3_displ, tau_array,    1,       j, 0, batchCount, queue);
                magma_ddisplace_pointers(dW2_displ, T_array,      ldt,     j, j, batchCount, queue);
                magmablas_dlarft_dtrmv_sm32x32_batched(mycol, mycol,
                                                       dW3_displ,
                                                       dW1_displ, ldtstep,
                                                       dW2_displ, ldt,
                                                       batchCount, queue);
            }
        }
    } // end of j

    magma_free(dW1_displ);
    magma_free(dW2_displ);
    magma_free(dW3_displ);
    magma_free(dTstep_array);

    return 0;
}
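//-------------------------------------------------------------------------------------------------------------------
// A host-side, single-matrix reference of what the two batched phases above compute, useful for
// validating one matrix of the batch. It assumes G = V^T * V exactly as produced by the batched
// GEMM (V stored with explicit unit diagonal) and applies LARFT's forward recurrence
//   T(1:i-1,i) = -tau(i) * T(1:i-1,1:i-1) * G(1:i-1,i),   T(i,i) = tau(i).
// Illustrative helper, not part of MAGMA's API; G and T are column-major with leading dimension ld.
static void larft_from_gram_ref(int k, const double *tau,
                                const double *G, double *T, int ld)
{
    for (int i = 0; i < k; ++i) {
        // T(0:i-1,i) = -tau(i) * T(0:i-1,0:i-1) * G(0:i-1,i)
        for (int r = 0; r < i; ++r) {
            double s = 0.0;
            for (int c = r; c < i; ++c)        // T(0:i-1,0:i-1) is upper triangular
                s += T[r + c*ld] * G[c + i*ld];
            T[r + i*ld] = -tau[i] * s;
        }
        T[i + i*ld] = tau[i];
        for (int r = i + 1; r < k; ++r)
            T[r + i*ld] = 0.0;                 // keep the strictly lower part zero
    }
}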