/** Perform matrix-matrix product, \f$ C = \alpha op(A) op(B) + \beta C \f$. @param[in] transA Operation op(A) to perform on matrix A. @param[in] transB Operation op(B) to perform on matrix B. @param[in] m Number of rows of C and op(A). m >= 0. @param[in] n Number of columns of C and op(B). n >= 0. @param[in] k Number of columns of op(A) and rows of op(B). k >= 0. @param[in] alpha Scalar \f$ \alpha \f$ @param[in] dA COMPLEX array on GPU device. If transA == MagmaNoTrans, the m-by-k matrix A of dimension (ldda,k), ldda >= max(1,m); \n otherwise, the k-by-m matrix A of dimension (ldda,m), ldda >= max(1,k). @param[in] ldda Leading dimension of dA. @param[in] dB COMPLEX array on GPU device. If transB == MagmaNoTrans, the k-by-n matrix B of dimension (lddb,n), lddb >= max(1,k); \n otherwise, the n-by-k matrix B of dimension (lddb,k), lddb >= max(1,n). @param[in] lddb Leading dimension of dB. @param[in] beta Scalar \f$ \beta \f$ @param[in,out] dC COMPLEX array on GPU device. The m-by-n matrix C of dimension (lddc,n), lddc >= max(1,m). @param[in] lddc Leading dimension of dC. @ingroup magma_cblas3 */ extern "C" void magma_cgemm( magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dA, size_t dA_offset, magma_int_t ldda, magmaFloatComplex_const_ptr dB, size_t dB_offset, magma_int_t lddb, magmaFloatComplex beta, magmaFloatComplex_ptr dC, size_t dC_offset, magma_int_t lddc, magma_queue_t queue ) { if ( m <= 0 || n <= 0 || k <= 0 ) return; cl_int err = clblasCgemm( clblasColumnMajor, clblas_trans_const( transA ), clblas_trans_const( transB ), m, n, k, alpha, dA, dA_offset, ldda, dB, dB_offset, lddb, beta, dC, dC_offset, lddc, 1, &queue, 0, NULL, g_event ); clFlush(queue); check_error( err ); }
/**
    Issue the clblasCgemm call described by buffer_ apiCallCount times and,
    when requested, flush the queues and block until the returned events
    have completed.

    @param flush         true  : only the library call (kernel time) is timed,
                                 so the queues are flushed and the events are
                                 waited on here; \n
                         false : memory time is also timed, so nothing is
                                 flushed or waited on here.
    @param apiCallCount  Number of times clblasCgemm is invoked.
*/
void
xGemm<cl_float2>::
xGemm_Function(bool flush, cl_uint apiCallCount )
{
    // Clear every event slot so that slots clBLAS does not fill can be
    // recognised as NULL afterwards.
    for (unsigned int i = 0; i < numQueues; i++) {
        events_[i] = NULL;
    }

    // NOTE(review): the clblasCgemm status is ignored and every call is
    // handed the same events_ array, so only the handles left by the last
    // call are waited on below - confirm that this is intended and who
    // releases the earlier events.
    for (unsigned int i = 0; i < apiCallCount; i++) {
        clblasCgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
                    buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
                    buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
                    buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
                    buffer_.beta_,
                    buffer_.buf_c_, buffer_.offC_, buffer_.ldc_,
                    numQueuesToUse, queues_, 0, NULL, events_);
    }

    //flush==true if only the kernel time (library call) is timed
    //flush==false if memory time is also timed
    if (flush==true) {
        // Submit the work on every queue before blocking on any event.
        for (unsigned int i = 0; i < numQueuesToUse; i++) {
            clFlush(queues_[i]);
        }

        // Wait on each usable event individually. Valid events are not
        // guaranteed to be packed at the front of events_, so waiting on
        // "the first N entries" could pass a NULL/invalid handle and skip a
        // valid one; it would also issue a zero-length wait when no event
        // is valid.
        for (unsigned int i = 0; i < numQueuesToUse; i++) {
            if (events_[i] == NULL) {
                //printf("events[%u/%u] is NULL\n", i, numQueuesToUse );
                continue;
            }

            // Querying the reference count is used purely as a validity
            // probe for the handle.
            cl_uint clReferenceCount;
            cl_int err = clGetEventInfo(events_[i], CL_EVENT_REFERENCE_COUNT,
                                        sizeof(clReferenceCount),
                                        &clReferenceCount, NULL);
            if (err != CL_SUCCESS) {
                //printf("events[%u/%u] invalid; err = %i\n", i, numQueuesToUse, err );
                continue;
            }

            //printf("events[%u/%u] has %u references\n", i, numQueuesToUse, clReferenceCount );
            clWaitForEvents(1, &events_[i]);
        }
    }
}