/* * This method was inspired by the popular BLAS gemm function. It perform a * general matrix by matrix multiplication in which A and B can have different * sizes, and stores the result on the C matrix. If A and B are big enough that * the total operations exceeds the GPU threshold, then the computation will be * performed in the GPU. Otherwise the CPU is used instead by using a modified * version of the algorithm implemented in the GNU GSL library: gsl_blas_dgmm. * The math resolves to: * * C[mxp] = A[mxn] * B[nxp] */ void blas_gemm(float* A, float* B, float* C, size_t m, size_t n, size_t p) { if (math_gpu_threshold_reached(m * n * p)) { const float alpha = 1.0f; const float beta = 0.0f; cublasStatus_t status = cublasXtSgemm(cublasXth, CUBLAS_OP_N, CUBLAS_OP_N, p, m, n, &alpha, B, p, A, n, &beta, C, p); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "Error: %d\n", status); fflush (stderr); exit (EXIT_FAILURE); } } else { math_vector_values(C, 0.0, m * p); for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < n; j++) { const float pivot = A[i * n + j]; for (size_t k = 0; k < p; k++) { C[i * p + k] += pivot * B[j * p + k]; } } } } }
/* * This method is similar to blas_gemm, but it operates on the transpose * of the B matrix. The strategy for determining if GPU of CPU is the same * as blas_gemm. The math resolves to: * * C[mxp] = A[mxn] * T(B[pxn]) */ void blas_gemmt(float* A, float* B, float* C, size_t m, size_t n, size_t p) { if (math_gpu_threshold_reached(m * n * p)) { const float alpha = 1.0f; const float beta = 0.0f; cublasStatus_t status = cublasXtSgemm(cublasXth, CUBLAS_OP_T, CUBLAS_OP_N, p, m, n, &alpha, B, n, A, n, &beta, C, p); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "Error: %d\n", status); fflush (stderr); exit (EXIT_FAILURE); } } else { for (size_t i = 0; i < m; i++) { for (size_t k = 0; k < p; k++) { double sum = 0.0; for (size_t j = 0; j < n; j++) { sum += A[i * n + j] * B[k * n + j]; } C[i * p + k] = sum; } } } }
void cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, const float *A, const int lda, const float *B, const int ldb, const float beta, float *C, const int ldc ) { cublasOperation_t transa, transb; cublasStatus_t status; /*---error handler---*/ int nrowa, ncola, nrowb, ncolb; if (TransA == CblasNoTrans) { nrowa = M; ncola = K; } else { nrowa = K; ncola = M; } if (TransB == CblasNoTrans) { nrowb = K; ncolb = N; } else { nrowb = N; ncolb = K; } int nrowc = M; int ncolc = N; int info = 0; if (CBLasTransToCuBlasTrans(TransA,&transa) < 0) info = 1; else if (CBLasTransToCuBlasTrans(TransB,&transb) < 0) info = 2; else if (M < 0) info = 3; else if (N < 0) info = 4; else if (K < 0) info = 5; else if (lda < MAX(1, nrowa)) info = 8; else if (ldb < MAX(1, nrowb)) info = 10; else if (ldc < MAX(1, M)) info = 13; if (info != 0) { xerbla_(ERROR_NAME, &info); return; } /*-------------------*/ /*----dispatcher-----*/ int type = 0; //1:cpu 2:cublasxt 3:blasx if (M <= 0 || N <= 0 || K <= 0) type = 1; if (type == 0 && (M > 1000 || N > 1000 || K > 1000)) type = 3; else type = 1; //Blasx_Debug_Output("type after dispatcher:%d\n",type); /*-------------------*/ switch (type) { case 1: CPU_BLAS: Blasx_Debug_Output("calling cblas_sgemm:"); if (cpublas_handle == NULL) blasx_init(CPU); if (cblas_sgemm_p == NULL) blasx_init_cblas_func(&cblas_sgemm_p, "cblas_sgemm"); (*cblas_sgemm_p)(Order,TransA,TransB,M,N,K,alpha,A,lda,B,ldb,beta,C,ldc); break; case 2: if (cublasXt_handle == NULL) blasx_init(CUBLASXT); Blasx_Debug_Output("calling cublasSgemmXt:"); status = cublasXtSgemm(cublasXt_handle, transa, transb, M, N, K, (float*)&alpha, (float*)A, lda, (float*)B, ldb, (float*)&beta, (float*)C, ldc); if( status != CUBLAS_STATUS_SUCCESS ) goto CPU_BLAS; break; case 3: Blasx_Debug_Output("calling BLASX:\n"); cudaHostRegister(A,sizeof(float)*nrowa*ncola,cudaHostRegisterPortable); cudaHostRegister(B,sizeof(float)*nrowb*ncolb,cudaHostRegisterPortable); cudaHostRegister(C,sizeof(float)*nrowc*ncolc,cudaHostRegisterPortable); #ifdef BENCHMARK double Gflops = FLOPS_DGEMM(M, N, K)/(1000000000); double gpu_start, gpu_end; gpu_start = get_cur_time(); #endif if (is_blasx_enable == 0) blasx_init(BLASX); assert( is_blasx_enable == 1 ); assert( SYS_GPUS > 0 ); assert( event_SGEMM[0] != NULL ); assert( C_dev_SGEMM[0] != NULL ); assert( handles_SGEMM[0] != NULL ); assert( streams_SGEMM[0] != NULL ); LRU_t* LRUs[10]; int GPU_id = 0; for (GPU_id = 0; GPU_id < SYS_GPUS; GPU_id++) LRUs[GPU_id] = LRU_init( GPU_id ); blasx_sgemm(SYS_GPUS, handles_SGEMM, LRUs, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); for (GPU_id = 0; GPU_id < SYS_GPUS; GPU_id++) LRU_free( LRUs[GPU_id], GPU_id ); #ifdef BENCHMARK gpu_end = get_cur_time(); printf("BLASX (M:%5d,N:%5d,K:%5d) Speed:%9.1f type:%2d\n", M, N, K, (double)Gflops/(gpu_end - gpu_start), type); #endif cudaHostUnregister(A); cudaHostUnregister(B); cudaHostUnregister(C); break; default: break; } //Blasx_Debug_Output("eventually use type:%d to compute\n",type); }