Ejemplo n.º 1
0
/** Complex single-precision matrix-matrix product,
    \f$ C = \alpha op(A) op(B) + \beta C \f$, forwarded to clblasCgemm
    using column-major storage.

    Nothing is done (dC is left untouched and no clBLAS call is issued)
    when any of m, n, k is not strictly positive. Otherwise the product is
    enqueued on @p queue, the queue is flushed, and the clBLAS status is
    handed to check_error().

    NOTE(review): reference BLAS xGEMM still applies \f$ C = \beta C \f$
    when k == 0; this wrapper returns before doing so. Confirm that callers
    do not rely on the scaling in that case.

    @param[in]
    transA  Operation op(A) applied to matrix A.

    @param[in]
    transB  Operation op(B) applied to matrix B.

    @param[in]
    m       Number of rows of C and of op(A). m >= 0.

    @param[in]
    n       Number of columns of C and of op(B). n >= 0.

    @param[in]
    k       Number of columns of op(A) and of rows of op(B). k >= 0.

    @param[in]
    alpha   Scalar \f$ \alpha \f$

    @param[in]
    dA      COMPLEX array on GPU device.
            If transA == MagmaNoTrans, the m-by-k matrix A of dimension (ldda,k), ldda >= max(1,m); \n
            otherwise,                 the k-by-m matrix A of dimension (ldda,m), ldda >= max(1,k).

    @param[in]
    dA_offset
            Offset of A inside the dA buffer, passed unchanged to clblasCgemm
            (presumably counted in elements, as in clBLAS -- confirm).

    @param[in]
    ldda    Leading dimension of dA.

    @param[in]
    dB      COMPLEX array on GPU device.
            If transB == MagmaNoTrans, the k-by-n matrix B of dimension (lddb,n), lddb >= max(1,k); \n
            otherwise,                 the n-by-k matrix B of dimension (lddb,k), lddb >= max(1,n).

    @param[in]
    dB_offset
            Offset of B inside the dB buffer, passed unchanged to clblasCgemm.

    @param[in]
    lddb    Leading dimension of dB.

    @param[in]
    beta    Scalar \f$ \beta \f$

    @param[in,out]
    dC      COMPLEX array on GPU device.
            The m-by-n matrix C of dimension (lddc,n), lddc >= max(1,m).

    @param[in]
    dC_offset
            Offset of C inside the dC buffer, passed unchanged to clblasCgemm.

    @param[in]
    lddc    Leading dimension of dC.

    @param[in]
    queue   Command queue the operation is enqueued on; it is flushed
            before this routine returns.

    @ingroup magma_cblas3
*/
extern "C" void
magma_cgemm(
    magma_trans_t transA, magma_trans_t transB,
    magma_int_t m, magma_int_t n, magma_int_t k,
    magmaFloatComplex alpha,
    magmaFloatComplex_const_ptr dA, size_t dA_offset, magma_int_t ldda,
    magmaFloatComplex_const_ptr dB, size_t dB_offset, magma_int_t lddb,
    magmaFloatComplex beta,
    magmaFloatComplex_ptr       dC, size_t dC_offset, magma_int_t lddc,
    magma_queue_t queue )
{
    // Only issue the clBLAS call when every dimension is strictly positive.
    if ( m > 0 && n > 0 && k > 0 ) {
        cl_int err = clblasCgemm(
            clblasColumnMajor,
            clblas_trans_const( transA ), clblas_trans_const( transB ),
            m, n, k,
            alpha,
            dA, dA_offset, ldda,
            dB, dB_offset, lddb,
            beta,
            dC, dC_offset, lddc,
            1, &queue,   // a single command queue
            0, NULL,     // empty event wait list
            g_event );   // output event; g_event is defined outside this block

        // Submit the enqueued work before reporting the clBLAS status.
        clFlush( queue );
        check_error( err );
    }
}
Ejemplo n.º 2
0
/**
 * Issue the complex single-precision GEMM described by buffer_
 * apiCallCount times through clblasCgemm and, optionally, block until the
 * resulting events have completed.
 *
 * @param flush         true  -> flush every used queue and wait for the
 *                               returned events (only the library call /
 *                               kernel time is being timed);
 *                      false -> return right after enqueueing (memory time
 *                               is timed as well, by the caller).
 * @param apiCallCount  Number of times clblasCgemm is invoked.
 *
 * The status returned by clblasCgemm is not inspected here.
 *
 * NOTE(review): every call writes its output events into events_, so with
 * apiCallCount > 1 the handles from earlier calls are overwritten without
 * being released in this function -- confirm whether they are released
 * elsewhere.
 */
void 
xGemm<cl_float2>::
xGemm_Function(bool flush, cl_uint apiCallCount )
{
  // Start from a clean slate so stale handles are never mistaken for events
  // produced by this invocation.
  for (unsigned int i = 0; i < numQueues; i++) {
    events_[i] = NULL;
  }

  for (unsigned int i = 0; i < apiCallCount; i++) {
    clblasCgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
                buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
                buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
                buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
                buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
                buffer_.ldc_, numQueuesToUse, queues_, 0, NULL, events_);
  }

  // flush==true if only the kernel time (library call) is timed
  // flush==false if memory time is also timed
  if (!flush) {
    return;
  }

  // Submit everything that was enqueued on the queues in use.
  for (unsigned int i = 0; i < numQueuesToUse; i++) {
    clFlush(queues_[i]);
  }

  // Wait on each valid event individually. Valid events are not guaranteed
  // to occupy the leading slots of events_, so waiting on the first
  // "number of valid events" entries could hand a NULL/invalid handle to
  // clWaitForEvents and skip a valid one; it would also call
  // clWaitForEvents with a count of zero when no event is valid, which
  // OpenCL rejects with CL_INVALID_VALUE.
  for (unsigned int i = 0; i < numQueuesToUse; i++) {
    if (events_[i] == NULL) {
      continue;
    }
    // Querying the reference count is used purely as a validity probe.
    cl_uint clReferenceCount;
    cl_int err = clGetEventInfo(events_[i], CL_EVENT_REFERENCE_COUNT,
                                sizeof(clReferenceCount), &clReferenceCount,
                                NULL);
    if (err == CL_SUCCESS) {
      clWaitForEvents(1, &events_[i]);
    }
  }
}