Example #1
File: math.c  Project: yalcantara/neuralnet
/*
 * This function was inspired by the popular BLAS gemm routine. It performs a
 * general matrix-by-matrix multiplication in which A and B can have different
 * sizes, and stores the result in the C matrix. If A and B are big enough that
 * the total number of operations exceeds the GPU threshold, the computation is
 * performed on the GPU. Otherwise the CPU is used instead, via a modified
 * version of the algorithm implemented in the GNU GSL library: gsl_blas_dgemm.
 * The math resolves to:
 *
 * C[mxp] = A[mxn] * B[nxp]
 */
void blas_gemm(float* A, float* B, float* C, size_t m, size_t n, size_t p) {
	
	if (math_gpu_threshold_reached(m * n * p)) {
		const float alpha = 1.0f;
		const float beta = 0.0f;
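		// cuBLAS assumes column-major storage, so the row-major product
		// C = A * B is computed as the column-major product C^T = B^T * A^T,
		// which is why B and A are passed in swapped order with their
		// dimensions reversed.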
		
		cublasStatus_t status = cublasXtSgemm(cublasXth, CUBLAS_OP_N, CUBLAS_OP_N, p, m, n, &alpha,
				B, p, A, n, &beta, C, p);
		
		if (status != CUBLAS_STATUS_SUCCESS) {
			
			fprintf(stderr, "cublasXtSgemm failed with status: %d\n", status);
			fflush(stderr);
			exit(EXIT_FAILURE);
		}
	} else {
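		// CPU fallback: zero C, then accumulate with an i-j-k loop whose
		// inner k-loop runs unit-stride over the rows of B and C.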
		math_vector_values(C, 0.0, m * p);
		
		for (size_t i = 0; i < m; i++) {
			for (size_t j = 0; j < n; j++) {
				const float pivot = A[i * n + j];
				for (size_t k = 0; k < p; k++) {
					C[i * p + k] += pivot * B[j * p + k];
				}
			}
		}
	}
}
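
A minimal usage sketch for blas_gemm, with hypothetical test values. The 2x3 by 3x2 sizes are small enough that any sensible GPU threshold routes the call through the CPU path; it assumes the project's header declaring blas_gemm is included and any required cuBLAS-Xt initialization has already run.

#include <stdio.h>

int main(void) {
	float A[6] = {1, 2, 3,
	              4, 5, 6};    /* 2x3, row-major */
	float B[6] = {7,  8,
	              9,  10,
	              11, 12};     /* 3x2, row-major */
	float C[4];                /* 2x2 result */

	blas_gemm(A, B, C, 2, 3, 2);

	/* Expected output: 58 64 / 139 154 */
	printf("%.0f %.0f\n%.0f %.0f\n", C[0], C[1], C[2], C[3]);
	return 0;
}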
Example #2
File: math.c  Project: yalcantara/neuralnet
/*
 * This function is similar to blas_gemm, but it operates on the transpose
 * of the B matrix. The strategy for choosing between the GPU and the CPU is
 * the same as in blas_gemm. The math resolves to:
 *
 * C[mxp] = A[mxn] * T(B[pxn])
 */
void blas_gemmt(float* A, float* B, float* C, size_t m, size_t n, size_t p) {
	
	if (math_gpu_threshold_reached(m * n * p)) {
		const float alpha = 1.0f;
		const float beta = 0.0f;
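		// Row-major C = A * T(B) maps to the column-major product
		// C^T = B * A^T: the column-major view of row-major B[pxn] is
		// already B^T, so CUBLAS_OP_T turns it back into B as the left
		// operand.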
		
		cublasStatus_t status = cublasXtSgemm(cublasXth, CUBLAS_OP_T, CUBLAS_OP_N, p, m, n, &alpha,
				B, n, A, n, &beta, C, p);
		
		if (status != CUBLAS_STATUS_SUCCESS) {
			
			fprintf(stderr, "cublasXtSgemm failed with status: %d\n", status);
			fflush(stderr);
			exit(EXIT_FAILURE);
		}
	} else {
		for (size_t i = 0; i < m; i++) {
			for (size_t k = 0; k < p; k++) {
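				// Accumulate in double to limit rounding error over the
				// n-term dot product before narrowing back to float.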
				double sum = 0.0;
				for (size_t j = 0; j < n; j++) {
					sum += A[i * n + j] * B[k * n + j];
				}
				C[i * p + k] = (float) sum;
			}
		}
	}
}
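
A small sanity-check sketch, again with hypothetical values: since blas_gemmt computes C[mxp] = A[mxn] * T(B[pxn]), storing the transpose of the B matrix from the previous example as a 2x3 array must reproduce the same result.

#include <stdio.h>

int main(void) {
	float A[6]  = {1, 2, 3,
	               4, 5, 6};   /* 2x3, row-major */
	float Bt[6] = {7,  9, 11,
	               8, 10, 12}; /* 2x3, row-major: transpose of B above */
	float C[4];                /* 2x2 result */

	blas_gemmt(A, Bt, C, 2, 3, 2);

	/* Expected output: 58 64 / 139 154, matching the blas_gemm example */
	printf("%.0f %.0f\n%.0f %.0f\n", C[0], C[1], C[2], C[3]);
	return 0;
}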
Example #3
File: sgemm.c  Project: 529038378/BLASX
void cblas_sgemm(const enum CBLAS_ORDER Order, 
                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, 
                 const int M, const int N, const int K, 
                 const float alpha, const float *A, const int lda, 
                 const float *B, const int ldb,
                 const float beta, float *C, const int ldc )
{
    cublasOperation_t transa, transb;
    cublasStatus_t status;
    /*---error handler---*/
    int nrowa, ncola, nrowb, ncolb;
    if (TransA == CblasNoTrans) {
        nrowa = M;
        ncola = K;
    } else {
        nrowa = K;
        ncola = M;
    }
    if (TransB == CblasNoTrans) {
        nrowb = K;
        ncolb = N;
    } else {
        nrowb = N;
        ncolb = K;
    }
    int nrowc = M;
    int ncolc = N;
    int info = 0;
    if (CBLasTransToCuBlasTrans(TransA,&transa) <  0) info = 1;
    else if (CBLasTransToCuBlasTrans(TransB,&transb) < 0) info = 2;
    else if (M < 0) info = 3;
    else if (N < 0) info = 4;
    else if (K < 0) info = 5;
    else if (lda < MAX(1, nrowa)) info = 8;
    else if (ldb < MAX(1, nrowb)) info = 10;
    else if (ldc < MAX(1, M)) info = 13;
    if (info != 0) {
        xerbla_(ERROR_NAME, &info);
        return;
    }
    /*-------------------*/
    
    /*----dispatcher-----*/
    int type = 0; //1:cpu 2:cublasxt 3:blasx
    if (M <= 0 || N <= 0 || K <= 0) type = 1;
    if (type == 0 && (M > 1000 || N > 1000 || K > 1000)) type = 3;
    else                                                 type = 1;
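    // NOTE: as written, this dispatcher only ever selects type 1 or type 3;
    // case 2 (cublasXt) is unreachable unless the heuristic above changes.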
    //Blasx_Debug_Output("type after dispatcher:%d\n",type);
    /*-------------------*/
    

    switch (type) {
        case 1:
        CPU_BLAS:
            Blasx_Debug_Output("calling cblas_sgemm:");
            if (cpublas_handle == NULL) blasx_init(CPU);
            if (cblas_sgemm_p == NULL) blasx_init_cblas_func(&cblas_sgemm_p, "cblas_sgemm");
            (*cblas_sgemm_p)(Order,TransA,TransB,M,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
            break;
        case 2:
            if (cublasXt_handle == NULL) blasx_init(CUBLASXT);
            Blasx_Debug_Output("calling cublasSgemmXt:");
            status = cublasXtSgemm(cublasXt_handle,
                                   transa, transb,
                                   M, N, K,
                                   (float*)&alpha, (float*)A, lda,
                                   (float*)B, ldb,
                                   (float*)&beta, (float*)C, ldc);
            if( status != CUBLAS_STATUS_SUCCESS ) goto CPU_BLAS;
            break;
        case 3:
            Blasx_Debug_Output("calling BLASX:\n");
            cudaHostRegister((void*)A, sizeof(float)*nrowa*ncola, cudaHostRegisterPortable);
            cudaHostRegister((void*)B, sizeof(float)*nrowb*ncolb, cudaHostRegisterPortable);
            cudaHostRegister(C, sizeof(float)*nrowc*ncolc, cudaHostRegisterPortable);
#ifdef BENCHMARK
            double Gflops = FLOPS_DGEMM(M, N, K) / 1e9;
            double gpu_start, gpu_end;
            gpu_start = get_cur_time();
#endif
            if (is_blasx_enable == 0) blasx_init(BLASX);
            assert( is_blasx_enable == 1 );
            assert( SYS_GPUS > 0 );
            assert( event_SGEMM[0] != NULL );
            assert( C_dev_SGEMM[0] != NULL );
            assert( handles_SGEMM[0] != NULL );
            assert( streams_SGEMM[0] != NULL );
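            /* fixed-size table: assumes at most 10 GPUs; indexed up to SYS_GPUS-1 */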
            LRU_t* LRUs[10];
            int GPU_id = 0;
            for (GPU_id = 0; GPU_id < SYS_GPUS; GPU_id++)    LRUs[GPU_id] = LRU_init( GPU_id );
            blasx_sgemm(SYS_GPUS, handles_SGEMM, LRUs,
                        TransA, TransB,
                        M, N, K, alpha,
                        A, lda,
                        B, ldb,
                        beta,
                        C, ldc);
            for (GPU_id = 0; GPU_id < SYS_GPUS; GPU_id++)    LRU_free( LRUs[GPU_id], GPU_id );
#ifdef BENCHMARK
            gpu_end = get_cur_time();
            printf("BLASX (M:%5d,N:%5d,K:%5d) Speed:%9.1f type:%2d\n", M, N, K, Gflops/(gpu_end - gpu_start), type);
#endif
            cudaHostUnregister((void*)A);
            cudaHostUnregister((void*)B);
            cudaHostUnregister(C);
            break;
        default:
            break;
    }

    //Blasx_Debug_Output("eventually use type:%d to compute\n",type);
}
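
A minimal caller sketch for the BLASX entry point above, using standard column-major CBLAS conventions and hypothetical 2x2 values. With M, N, and K all at or below 1000, the dispatcher selects the CPU path (type 1); larger sizes would take the tiled multi-GPU BLASX path (type 3). Assumes the BLASX headers are included and the library is linked.

#include <stdio.h>

int main(void) {
    /* column-major 2x2 matrices: A = [[1 2],[3 4]], B = [[5 6],[7 8]] */
    float A[4] = {1, 3, 2, 4};
    float B[4] = {5, 7, 6, 8};
    float C[4] = {0};

    /* C = 1.0 * A * B + 0.0 * C */
    cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
                2, 2, 2, 1.0f, A, 2, B, 2, 0.0f, C, 2);

    /* Expected output: 19 22 / 43 50 */
    printf("%.0f %.0f\n%.0f %.0f\n", C[0], C[2], C[1], C[3]);
    return 0;
}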