示例#1
0
void GPUMatrix::multWithMatrix(const GPUMatrix& other, GPUMatrix* target,
    bool transpose, bool transposeOther,
    unsigned int m, unsigned int k, unsigned int n,
    unsigned int rowOffset, unsigned int colOffset,
    unsigned int rowOffsetOther, unsigned int colOffsetOther,
    unsigned int rowOffsetTarget, unsigned int colOffsetTarget) const
{
    // We might allocate these on the device later, to get the last bit of performance.
    const Elem alpha = 1.0;
    const Elem beta  = 0.0;
    cublasStatus_t rv = CUBLAS_GEMM(
        _cublasHandle,
        transpose      ? CUBLAS_OP_T : CUBLAS_OP_N,
        transposeOther ? CUBLAS_OP_T : CUBLAS_OP_N,
        m,
        n,
        k,
        &alpha,
        _data + colOffset * _rows + rowOffset,
        this->_rows,    // lda
        other._data + colOffsetOther * other._rows + rowOffsetOther,
        other._rows,    // ldb
        &beta,
        target->_data + colOffsetTarget * target->_rows + rowOffsetTarget,
        target->_rows   // ldc
    );
    if (rv != CUBLAS_STATUS_SUCCESS) 
        // TODO: define exception class with cublas return codes?
        throw std::runtime_error("CUBLAS error in Dgemm");
}
示例#2
0
/*
 * Calls CUBLAS_GEMM on the three matrix handles passed in descr:
 * descr[0] and descr[1] supply the input operands (A and B) and descr[2]
 * the output operand (C). Both operands are used untransposed ('n', 'n'),
 * with scalar factors 1.0 and 0.0.
 *
 * The sizes forwarded to CUBLAS_GEMM are (NX of C, NY of C, NY of A).
 * NOTE(review): presumably a StarPU codelet body; `arg` is unused.
 */
static void cublas_mult(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg)
{
	/* Data pointers of the three operands. */
	TYPE *a = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
	TYPE *b = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
	TYPE *c = (TYPE *)STARPU_MATRIX_GET_PTR(descr[2]);

	/* Leading dimensions of the three operands. */
	unsigned lda = STARPU_MATRIX_GET_LD(descr[0]);
	unsigned ldb = STARPU_MATRIX_GET_LD(descr[1]);
	unsigned ldc = STARPU_MATRIX_GET_LD(descr[2]);

	/* Problem sizes: the output extents plus the NY extent of A. */
	unsigned m = STARPU_MATRIX_GET_NX(descr[2]);
	unsigned n = STARPU_MATRIX_GET_NY(descr[2]);
	unsigned k = STARPU_MATRIX_GET_NY(descr[0]);

	/* Scalar factors applied to the product and to the existing C. */
	const TYPE alpha = (TYPE)1.0;
	const TYPE beta = (TYPE)0.0;

	CUBLAS_GEMM('n', 'n', m, n, k,
		    alpha, a, lda, b, ldb,
		    beta, c, ldc);
}