/*
    Generates Householder elementary reflector H = I - tau v v^T to reduce

        H * [ dx0 ] = [ beta ]
            [ dx  ]   [ 0    ]

    with beta = ±norm( [dx0, dx] ) = ±dxnorm[0].
    Stores v over dx; first element of v is 1 and is not stored.
    Stores beta over dx0.
    Stores tau.

    The difference with LAPACK's clarfg is that the norm of dx, and hence beta,
    are computed outside the routine and passed to it in dxnorm (array on the GPU).
*/
extern "C" magma_err_t
magma_clarfgtx_gpu(
    int n,
    magmaFloatComplex_ptr dx0,  size_t dx0_offset,
    magmaFloatComplex_ptr dx,   size_t dx_offset,
    magmaFloatComplex_ptr dtau, size_t dtau_offset,
    magmaFloat_ptr dxnorm,      size_t dxnorm_offset,
    magmaFloatComplex_ptr dA,   size_t dA_offset, int i,
    magmaFloatComplex_ptr V,    size_t V_offset, int ldv,
    magmaFloatComplex_ptr T,    size_t T_offset, int ldt,
    magmaFloatComplex_ptr work, size_t work_offset,
    magma_queue_t queue)
{
    /* Generate the elementary reflector H(i) */
    magma_clarfgx_gpu(n, dx0, dx0_offset, dx, dx_offset, dtau, dtau_offset,
                      dxnorm, dxnorm_offset, dA, dA_offset, i, queue);

    if (i == 0) {
        magmaFloatComplex tt = MAGMA_C_ONE;
        magmablas_clacpy(MagmaFull, 1, 1, dtau, dtau_offset, 1, T, T_offset+i+i*ldt, 1, queue);
        magma_csetmatrix(1, 1, &tt, 0, 1, dx0, dx0_offset, 1, queue);
    }
    else {
        /* Compute the i-th column of T */
        cl_int ciErrNum;                // Error code var
        cl_kernel ckKernel = NULL;

        ckKernel = rt->KernelPool["magma_cgemv_kernel3"];    // in clarfbx.cl
        if (!ckKernel) {
            printf("Error: cannot locate kernel at line %d, file %s\n", __LINE__, __FILE__);
            return MAGMA_ERR_UNKNOWN;
        }

        int nn = 0;
        ciErrNum  = clSetKernelArg( ckKernel, nn++, sizeof(int),                   (void*)&n           );
        ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(magmaFloatComplex_ptr), (void*)&V           );
        ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(int),                   (void*)&V_offset    );
        ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(int),                   (void*)&ldv         );
        ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(magmaFloatComplex_ptr), (void*)&dx0         );
        ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(int),                   (void*)&dx0_offset  );
        ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(magmaFloatComplex_ptr), (void*)&work        );
        ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(int),                   (void*)&work_offset );
        ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(magmaFloatComplex_ptr), (void*)&dtau        );
        ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(int),                   (void*)&dtau_offset );
        if (ciErrNum != CL_SUCCESS) {
            printf("Error: clSetKernelArg at %d in file %s!\n", __LINE__, __FILE__);
            return MAGMA_ERR_UNKNOWN;
        }

        size_t GlobalWorkSize[1] = {0}, LocalWorkSize[1] = {0};
        LocalWorkSize[0]  = BLOCK_SIZE;
        GlobalWorkSize[0] = i*LocalWorkSize[0];

        // launch kernel magma_cgemv_kernel3
        ciErrNum = clEnqueueNDRangeKernel( queue, ckKernel, 1, NULL,
                                           GlobalWorkSize, LocalWorkSize, 0, NULL, NULL);
        if (ciErrNum != CL_SUCCESS) {
            printf("Error: clEnqueueNDRangeKernel at %d in file %s \"%s\"\n",
                   __LINE__, __FILE__, rt->GetErrorCode(ciErrNum));
            return MAGMA_ERR_UNKNOWN;
        }
        //magma_cgemv_kernel3<<< i, BLOCK_SIZE, 0, magma_stream >>>(n, V, ldv, dx0, work, dtau);
        clFlush(queue);

        ckKernel = rt->KernelPool["magma_ctrmv_kernel2"];    // in clarfx.cl
        if (!ckKernel) {
            printf("Error: cannot locate kernel at line %d, file %s\n", __LINE__, __FILE__);
            return MAGMA_ERR_UNKNOWN;
        }

        nn = 0;
        ciErrNum  = clSetKernelArg( ckKernel, nn++, sizeof(magmaFloatComplex_ptr), (void*)&T           );
        ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(int),                   (void*)&T_offset    );
        ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(int),                   (void*)&ldt         );
        ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(magmaFloatComplex_ptr), (void*)&work        );
        ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(int),                   (void*)&work_offset );

        magmaFloatComplex_ptr T1 = T;
        size_t T1_offset = T_offset + i*ldt;
        ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(magmaFloatComplex_ptr), (void*)&T1          );
        ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(int),                   (void*)&T1_offset   );
        ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(magmaFloatComplex_ptr), (void*)&dtau        );
        ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(int),                   (void*)&dtau_offset );
        if (ciErrNum != CL_SUCCESS) {
            printf("Error: clSetKernelArg at %d in file %s!\n", __LINE__, __FILE__);
            return MAGMA_ERR_UNKNOWN;
        }

        LocalWorkSize[0]  = i;
        GlobalWorkSize[0] = i*LocalWorkSize[0];

        // launch kernel magma_ctrmv_kernel2
        ciErrNum = clEnqueueNDRangeKernel( queue, ckKernel, 1, NULL,
                                           GlobalWorkSize, LocalWorkSize, 0, NULL, NULL);
        if (ciErrNum != CL_SUCCESS) {
            printf("Error: clEnqueueNDRangeKernel at %d in file %s \"%s\"\n",
                   __LINE__, __FILE__, rt->GetErrorCode(ciErrNum));
            printf("local: %d, global: %d\n", (int)LocalWorkSize[0], (int)GlobalWorkSize[0]);
            return MAGMA_ERR_UNKNOWN;
        }
        //magma_ctrmv_kernel2<<< i, i, 0, magma_stream >>>( T, ldt, work, T+i*ldt, dtau);
        clFlush(queue);
    }

    return MAGMA_SUCCESS;
}
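// The reflector generated above follows LAPACK's clarfg: given alpha = dx0[0]
// and xnorm = ||dx||_2 (precomputed and passed in through dxnorm), it produces
// beta and tau such that H * [alpha; dx] = [beta; 0] with H = I - tau*v*v^H and
// v = [1; dx/(alpha - beta)].  A minimal host-side sketch of the scalar part,
// for illustration only: clarfg_scalar_sketch is a hypothetical helper, not a
// MAGMA routine, and it assumes the usual MAGMA_C_REAL / MAGMA_C_IMAG /
// MAGMA_C_MAKE accessors; the actual computation runs inside magma_clarfgx_gpu.
#include <math.h>   // sqrtf, copysignf (needed only by this sketch)

static void clarfg_scalar_sketch( magmaFloatComplex alpha, float xnorm,
                                  float *beta, magmaFloatComplex *tau )
{
    float ar = MAGMA_C_REAL( alpha );
    float ai = MAGMA_C_IMAG( alpha );
    // beta = -sign(Re(alpha)) * || [alpha, dx] ||_2
    *beta = -copysignf( sqrtf( ar*ar + ai*ai + xnorm*xnorm ), ar );
    // tau = (beta - alpha) / beta, in complex arithmetic
    *tau  = MAGMA_C_MAKE( (*beta - ar) / *beta, -ai / *beta );
}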
/**
    Purpose
    -------
    CGERFS improves the computed solution to a system of linear equations.

    The iterative refinement process is stopped if
        ITER > ITERMAX
    or for all the RHS we have:
        RNRM < SQRT(n)*XNRM*ANRM*EPS*BWDMAX
    where
    o ITER is the number of the current iteration in the iterative refinement process
    o RNRM is the infinity-norm of the residual
    o XNRM is the infinity-norm of the solution
    o ANRM is the infinity-operator-norm of the matrix A
    o EPS is the machine epsilon returned by SLAMCH('Epsilon')

    The values ITERMAX and BWDMAX are fixed to 30 and 1.0D+00, respectively.

    Arguments
    ---------
    @param[in]
    trans   magma_trans_t
            Specifies the form of the system of equations:
      -     = MagmaNoTrans:    A    * X = B  (No transpose)
      -     = MagmaTrans:      A**T * X = B  (Transpose)
      -     = MagmaConjTrans:  A**H * X = B  (Conjugate transpose)

    @param[in]
    n       INTEGER
            The number of linear equations, i.e., the order of the matrix A.  N >= 0.

    @param[in]
    nrhs    INTEGER
            The number of right hand sides, i.e., the number of columns
            of the matrix B.  NRHS >= 0.

    @param[in]
    dA      COMPLEX array on the GPU, dimension (ldda,N)
            the N-by-N coefficient matrix A.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  ldda >= max(1,N).

    @param[in]
    dB      COMPLEX array on the GPU, dimension (lddb,NRHS)
            The N-by-NRHS right hand side matrix B.

    @param[in]
    lddb    INTEGER
            The leading dimension of the array dB.  lddb >= max(1,N).

    @param[in,out]
    dX      COMPLEX array on the GPU, dimension (lddx,NRHS)
            On entry, the solution matrix X, as computed by CGETRS_NOPIV.
            On exit, the improved solution matrix X.

    @param[in]
    lddx    INTEGER
            The leading dimension of the array dX.  lddx >= max(1,N).

    @param
    dworkd  (workspace) COMPLEX array on the GPU, dimension (N*NRHS)
            This array is used to hold the residual vectors.

    @param
    dAF     COMPLEX array on the GPU, dimension (ldda,n)
            The factors L and U from the factorization A = L*U
            as computed by CGETRF_NOPIV.

    @param[out]
    iter    INTEGER
      -     < 0: iterative refinement has failed, real factorization has been performed
        +        -1 : the routine fell back to full precision for
                      implementation- or machine-specific reasons
        +        -2 : narrowing the precision induced an overflow,
                      the routine fell back to full precision
        +        -3 : failure of SGETRF
        +        -31: stop the iterative refinement after the 30th iteration
      -     > 0: iterative refinement has been successfully used.
                 Returns the number of iterations

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if info = -i, the i-th argument had an illegal value
      -     > 0:  if info = i, U(i,i) computed in REAL is exactly zero.
                  The factorization has been completed, but the factor U is
                  exactly singular, so the solution could not be computed.
    @ingroup magma_cgesv_driver
    ********************************************************************/
extern "C" magma_int_t
magma_cgerfs_nopiv_gpu(
    magma_trans_t trans, magma_int_t n, magma_int_t nrhs,
    magmaFloatComplex_ptr dA, magma_int_t ldda,
    magmaFloatComplex_ptr dB, magma_int_t lddb,
    magmaFloatComplex_ptr dX, magma_int_t lddx,
    magmaFloatComplex_ptr dworkd, magmaFloatComplex_ptr dAF,
    magma_int_t *iter,
    magma_int_t *info)
{
    #define dB(i,j)  (dB + (i) + (j)*lddb)
    #define dX(i,j)  (dX + (i) + (j)*lddx)
    #define dR(i,j)  (dR + (i) + (j)*lddr)

    magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE;
    magmaFloatComplex c_one     = MAGMA_C_ONE;
    magma_int_t       ione      = 1;
    magmaFloatComplex_ptr dR;
    magmaFloatComplex Xnrmv, Rnrmv;
    float             Anrm, Xnrm, Rnrm, cte, eps;
    magma_int_t       i, j, iiter, lddsa, lddr;

    /* Check arguments */
    *iter = 0;
    *info = 0;
    if ( n < 0 )
        *info = -1;
    else if ( nrhs < 0 )
        *info = -2;
    else if ( ldda < max(1,n))
        *info = -4;
    else if ( lddb < max(1,n))
        *info = -8;
    else if ( lddx < max(1,n))
        *info = -10;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    if ( n == 0 || nrhs == 0 )
        return *info;

    lddsa = n;
    lddr  = n;

    dR = dworkd;

    eps  = lapackf77_slamch("Epsilon");
    Anrm = magmablas_clange(MagmaInfNorm, n, n, dA, ldda, (float*)dworkd );
    cte  = Anrm * eps * pow( (float)n, (float)0.5 ) * BWDMAX;

    // residual dR = dB - dA*dX in real
    magmablas_clacpy( MagmaUpperLower, n, nrhs, dB, lddb, dR, lddr );
    if ( nrhs == 1 ) {
        magma_cgemv( trans, n, n,
                     c_neg_one, dA, ldda,
                                dX, 1,
                     c_one,     dR, 1 );
    }
    else {
        magma_cgemm( trans, MagmaNoTrans, n, nrhs, n,
                     c_neg_one, dA, ldda,
                                dX, lddx,
                     c_one,     dR, lddr );
    }

    // TODO: use MAGMA_C_ABS( dX(i,j) ) instead of clange?
    for( j=0; j < nrhs; j++ ) {
        i = magma_icamax( n, dX(0,j), 1) - 1;
        magma_cgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 );
        Xnrm = lapackf77_clange( "F", &ione, &ione, &Xnrmv, &ione, NULL );

        i = magma_icamax ( n, dR(0,j), 1 ) - 1;
        magma_cgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 );
        Rnrm = lapackf77_clange( "F", &ione, &ione, &Rnrmv, &ione, NULL );

        //printf("Rnrm : %e, Xnrm*cte : %e\n", Rnrm, Xnrm*cte);
        if ( Rnrm > Xnrm*cte ) {
            goto REFINEMENT;
        }
    }

    *iter = 0;
    return *info;

REFINEMENT:
    for( iiter=1; iiter < ITERMAX; ) {
        *info = 0;
        // solve dAF*dX = dR
        // it's okay that dR is used for both dB input and dX output.
        magma_cgetrs_nopiv_gpu( trans, n, nrhs, dAF, lddsa, dR, lddr, info );
        if (*info != 0) {
            *iter = -3;
            goto FALLBACK;
        }

        // Add correction and setup residual
        // dX += dR  --and--
        // dR = dB
        // This saves going through dR a second time (if done with one more kernel).
        // -- not really: first time is read, second time is write.
        for( j=0; j < nrhs; j++ ) {
            magmablas_caxpycp2( n, dR(0,j), dX(0,j), dB(0,j) );
        }

        // residual dR = dB - dA*dX in real
        if ( nrhs == 1 ) {
            magma_cgemv( trans, n, n,
                         c_neg_one, dA, ldda,
                                    dX, 1,
                         c_one,     dR, 1 );
        }
        else {
            magma_cgemm( trans, MagmaNoTrans, n, nrhs, n,
                         c_neg_one, dA, ldda,
                                    dX, lddx,
                         c_one,     dR, lddr );
        }

        /*  Check whether the nrhs normwise backward errors satisfy the
         *  stopping criterion. If yes, set ITER=IITER > 0 and return. */
        for( j=0; j < nrhs; j++ ) {
            i = magma_icamax( n, dX(0,j), 1) - 1;
            magma_cgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 );
            Xnrm = lapackf77_clange( "F", &ione, &ione, &Xnrmv, &ione, NULL );

            i = magma_icamax ( n, dR(0,j), 1 ) - 1;
            magma_cgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 );
            Rnrm = lapackf77_clange( "F", &ione, &ione, &Rnrmv, &ione, NULL );

            if ( Rnrm > Xnrm*cte ) {
                goto L20;
            }
        }

        /*  If we are here, the nrhs normwise backward errors satisfy
         *  the stopping criterion, we are good to exit. */
        *iter = iiter;
        return *info;

      L20:
        iiter++;
    }

    /* If we are at this place of the code, this is because we have performed
     * ITER = ITERMAX iterations and never satisfied the stopping criterion.
     * Set up the ITER flag accordingly. */
    *iter = -ITERMAX - 1;

FALLBACK:
    /* Iterative refinement failed to converge to a satisfactory solution. */
    return *info;
}
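// Sketch of the convergence test used by the refinement loop above, for
// illustration only (cgerfs_converged_sketch is a hypothetical helper, not a
// MAGMA API).  Refinement stops once every right-hand side satisfies
//     Rnrm <= Xnrm * Anrm * eps * sqrt(n) * BWDMAX,
// which is exactly the threshold cte = Anrm*eps*sqrt(n)*BWDMAX computed above.
#include <math.h>   // sqrtf (needed only by this sketch)

static int cgerfs_converged_sketch( int n, int nrhs,
                                    const float *Rnrm, const float *Xnrm,
                                    float Anrm, float eps, float bwdmax )
{
    float cte = Anrm * eps * sqrtf( (float) n ) * bwdmax;
    for (int j = 0; j < nrhs; ++j) {
        if ( Rnrm[j] > Xnrm[j] * cte )
            return 0;   // this RHS still fails the normwise backward-error test
    }
    return 1;           // all right-hand sides pass; refinement can stop
}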
extern "C" magma_int_t
magma_cqr(
    magma_int_t m, magma_int_t n,
    magma_c_matrix A, magma_int_t lda,
    magma_c_matrix *Q, magma_c_matrix *R,
    magma_queue_t queue )
{
    magma_int_t info = 0;

    // local constants
    const magmaFloatComplex c_zero = MAGMA_C_ZERO;

    // local variables
    magma_int_t inc = 1;
    magma_int_t k = min(m,n);
    magma_int_t ldt;
    magma_int_t nb;
    magmaFloatComplex *tau = NULL;
    magmaFloatComplex *dT = NULL;
    magmaFloatComplex *dA = NULL;
    magma_c_matrix dR1 = {Magma_CSR};

    // allocate CPU resources
    CHECK( magma_cmalloc_pinned( &tau, k ) );

    // query number of blocks required for QR factorization
    nb = magma_get_cgeqrf_nb( m, n );
    ldt = (2 * k + magma_roundup(n, 32)) * nb;
    CHECK( magma_cmalloc( &dT, ldt ) );

    // get copy of matrix array
    if ( A.memory_location == Magma_DEV ) {
        dA = A.dval;
    } else {
        CHECK( magma_cmalloc( &dA, lda * n ) );
        magma_csetvector( lda * n, A.val, inc, dA, inc, queue );
    }

    // QR factorization
    magma_cgeqrf_gpu( m, n, dA, lda, tau, dT, &info );

    // construct R matrix
    if ( R != NULL ) {
        if ( A.memory_location == Magma_DEV ) {
            CHECK( magma_cvinit( R, Magma_DEV, lda, n, c_zero, queue ) );
            magmablas_clacpy( MagmaUpper, k, n, dA, lda, R->dval, lda, queue );
        } else {
            CHECK( magma_cvinit( &dR1, Magma_DEV, lda, n, c_zero, queue ) );
            magmablas_clacpy( MagmaUpper, k, n, dA, lda, dR1.dval, lda, queue );
            CHECK( magma_cvinit( R, Magma_CPU, lda, n, c_zero, queue ) );
            magma_cgetvector( lda * n, dR1.dval, inc, R->val, inc, queue );
        }
    }

    // construct Q matrix
    if ( Q != NULL ) {
        magma_cungqr_gpu( m, n, k, dA, lda, tau, dT, nb, &info );
        if ( A.memory_location == Magma_DEV ) {
            CHECK( magma_cvinit( Q, Magma_DEV, lda, n, c_zero, queue ) );
            magma_ccopyvector( lda * n, dA, inc, Q->dval, inc, queue );
        } else {
            CHECK( magma_cvinit( Q, Magma_CPU, lda, n, c_zero, queue ) );
            magma_cgetvector( lda * n, dA, inc, Q->val, inc, queue );
        }
    }

cleanup:
    if( info != 0 ){
        magma_cmfree( Q, queue );
        magma_cmfree( R, queue );
        magma_cmfree( &dR1, queue );
    }

    // free resources
    magma_free_pinned( tau );
    magma_free( dT );
    if ( A.memory_location == Magma_CPU ) {
        magma_free( dA );
    }

    return info;
}
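/*
    Usage sketch for magma_cqr above (illustration only; assumes the caller has
    already built an m-by-n dense magma_c_matrix A with leading dimension lda,
    in either CPU or GPU memory, and holds a valid magma_queue_t):

        magma_c_matrix Q = {Magma_CSR}, R = {Magma_CSR};
        magma_int_t info;

        // A = Q*R; pass NULL for Q or R if that factor is not needed
        info = magma_cqr( m, n, A, lda, &Q, &R, queue );

        // ... use Q and R (Q.val / R.val on the CPU, or .dval on the GPU,
        //     matching A.memory_location) ...

        magma_cmfree( &Q, queue );
        magma_cmfree( &R, queue );

    Note that Q and R are allocated by the routine in the same memory space as A.
*/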
/**
    Purpose
    -------
    CGETRI computes the inverse of a matrix using the LU factorization
    computed by CGETRF. This method inverts U and then computes inv(A) by
    solving the system inv(A)*L = inv(U) for inv(A).

    Note that it is generally both faster and more accurate to use CGESV,
    or CGETRF and CGETRS, to solve the system AX = B, rather than inverting
    the matrix and multiplying to form X = inv(A)*B. Only in special
    instances should an explicit inverse be computed with this routine.

    Arguments
    ---------
    @param[in]
    n       INTEGER
            The order of the matrix A.  N >= 0.

    @param[in,out]
    dA      COMPLEX array on the GPU, dimension (LDDA,N)
            On entry, the factors L and U from the factorization
            A = P*L*U as computed by CGETRF_GPU.
            On exit, if INFO = 0, the inverse of the original matrix A.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array A.  LDDA >= max(1,N).

    @param[in]
    ipiv    INTEGER array, dimension (N)
            The pivot indices from CGETRF; for 1 <= i <= N, row i of the
            matrix was interchanged with row IPIV(i).

    @param[out]
    dwork   (workspace) COMPLEX array on the GPU, dimension (MAX(1,LWORK))

    @param[in]
    lwork   INTEGER
            The dimension of the array DWORK.  LWORK >= N*NB, where NB is
            the optimal blocksize returned by magma_get_cgetri_nb(n).
    \n
            Unlike LAPACK, this version does not currently support a
            workspace query, because the workspace is on the GPU.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
      -     > 0:  if INFO = i, U(i,i) is exactly zero; the matrix is
                  singular and its inverse could not be computed.

    @ingroup magma_cgesv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_cgetri_gpu(
    magma_int_t n,
    magmaFloatComplex *dA, magma_int_t ldda,
    magma_int_t *ipiv,
    magmaFloatComplex *dwork, magma_int_t lwork,
    magma_int_t *info )
{
    #define dA(i, j)  (dA + (i) + (j)*ldda)
    #define dL(i, j)  (dL + (i) + (j)*lddl)

    /* Local variables */
    magmaFloatComplex c_zero    = MAGMA_C_ZERO;
    magmaFloatComplex c_one     = MAGMA_C_ONE;
    magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE;
    magmaFloatComplex *dL = dwork;
    magma_int_t lddl = n;
    magma_int_t nb   = magma_get_cgetri_nb(n);
    magma_int_t j, jmax, jb, jp;

    *info = 0;
    if (n < 0)
        *info = -1;
    else if (ldda < max(1,n))
        *info = -3;
    else if ( lwork < n*nb )
        *info = -6;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if ( n == 0 )
        return *info;

    /* Invert the triangular factor U */
    magma_ctrtri_gpu( MagmaUpper, MagmaNonUnit, n, dA, ldda, info );
    if ( *info != 0 )
        return *info;

    jmax = ((n-1) / nb)*nb;
    for( j = jmax; j >= 0; j -= nb ) {
        jb = min( nb, n-j );

        // copy current block column of A to work space dL
        // (only needs lower trapezoid, but we also copy upper triangle),
        // then zero the strictly lower trapezoid block column of A.
        magmablas_clacpy( MagmaFull, n-j, jb,
                          dA(j,j), ldda,
                          dL(j,0), lddl );
        magmablas_claset( MagmaLower, n-j-1, jb, c_zero, c_zero, dA(j+1,j), ldda );

        // compute current block column of Ainv
        // Ainv(:, j:j+jb-1)
        //   = ( U(:, j:j+jb-1) - Ainv(:, j+jb:n) L(j+jb:n, j:j+jb-1) )
        //   * L(j:j+jb-1, j:j+jb-1)^{-1}
        // where L(:, j:j+jb-1) is stored in dL.
        if ( j+jb < n ) {
            magma_cgemm( MagmaNoTrans, MagmaNoTrans, n, jb, n-j-jb,
                         c_neg_one, dA(0,j+jb), ldda,
                                    dL(j+jb,0), lddl,
                         c_one,     dA(0,j),    ldda );
        }
        // TODO use magmablas work interface
        magma_ctrsm( MagmaRight, MagmaLower, MagmaNoTrans, MagmaUnit,
                     n, jb, c_one,
                     dL(j,0), lddl,
                     dA(0,j), ldda );
    }

    // Apply column interchanges
    for( j = n-2; j >= 0; --j ) {
        jp = ipiv[j] - 1;
        if ( jp != j ) {
            magmablas_cswap( n, dA(0,j), 1, dA(0,jp), 1 );
        }
    }

    return *info;
}
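/*
    The block-column sweep above can be read off from the identity

        inv(A) * L = inv(U)            (since A = L*U and U was inverted in place)

    Partitioning by block columns and using the fact that L(:, j:j+jb-1) is zero
    above row j, the current block column of the inverse satisfies

        Ainv(:, j:j+jb-1) = ( Uinv(:, j:j+jb-1)
                              - Ainv(:, j+jb:n-1) * L(j+jb:n-1, j:j+jb-1) )
                            * inv( L(j:j+jb-1, j:j+jb-1) )

    which is exactly the cgemm followed by the unit-lower-triangular ctrsm in the
    loop; sweeping from the last block column to the first guarantees that the
    already-computed columns of Ainv are available for the product on the right.
*/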
/**
    Purpose
    -------
    CLAHR2 reduces the first NB columns of a complex general n-BY-(n-k+1)
    matrix A so that elements below the k-th subdiagonal are zero. The
    reduction is performed by a unitary similarity transformation
    Q' * A * Q. The routine returns the matrices V and T which determine
    Q as a block reflector I - V*T*V', and also the matrix Y = A * V.
    (Note this is different than LAPACK, which computes Y = A * V * T.)

    This is an auxiliary routine called by CGEHRD.

    Arguments
    ---------
    @param[in]
    n       INTEGER
            The order of the matrix A.

    @param[in]
    k       INTEGER
            The offset for the reduction. Elements below the k-th
            subdiagonal in the first NB columns are reduced to zero.
            K < N.

    @param[in]
    nb      INTEGER
            The number of columns to be reduced.

    @param[in,out]
    A       COMPLEX array, dimension (LDA,N-K+1)
            On entry, the n-by-(n-k+1) general matrix A.
            On exit, the elements on and above the k-th subdiagonal in
            the first NB columns are overwritten with the corresponding
            elements of the reduced matrix; the elements below the k-th
            subdiagonal, with the array TAU, represent the matrix Q as a
            product of elementary reflectors. The other columns of A are
            unchanged. See Further Details.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[out]
    tau     COMPLEX array, dimension (NB)
            The scalar factors of the elementary reflectors. See Further
            Details.

    @param[out]
    T       COMPLEX array, dimension (LDT,NB)
            The upper triangular matrix T.

    @param[in]
    ldt     INTEGER
            The leading dimension of the array T.  LDT >= NB.

    @param[out]
    Y       COMPLEX array, dimension (LDY,NB)
            The n-by-nb matrix Y.

    @param[in]
    ldy     INTEGER
            The leading dimension of the array Y.  LDY >= N.

    @param[in,out]
    data    Structure with pointers to dA, dT, dV, dW, dY
            which are distributed across multiple GPUs.

    Further Details
    ---------------
    The matrix Q is represented as a product of nb elementary reflectors

        Q = H(1) H(2) . . . H(nb).

    Each H(i) has the form

        H(i) = I - tau * v * v'

    where tau is a complex scalar, and v is a complex vector with
    v(1:i+k-1) = 0, v(i+k) = 1; v(i+k+1:n) is stored on exit in
    A(i+k+1:n,i), and tau in TAU(i).

    The elements of the vectors v together form the (n-k+1)-by-nb matrix
    V which is needed, with T and Y, to apply the transformation to the
    unreduced part of the matrix, using an update of the form:

        A := (I - V*T*V') * (A - Y*T*V').

    The contents of A on exit are illustrated by the following example
    with n = 7, k = 3 and nb = 2:

    @verbatim
       ( a   a   a   a   a )
       ( a   a   a   a   a )
       ( a   a   a   a   a )
       ( h   h   a   a   a )
       ( v1  h   a   a   a )
       ( v1  v2  a   a   a )
       ( v1  v2  a   a   a )
    @endverbatim

    where "a" denotes an element of the original matrix A, h denotes a
    modified element of the upper Hessenberg matrix H, and vi denotes an
    element of the vector defining H(i).

    This implementation follows the hybrid algorithm and notations described in

    S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg
    form through hybrid GPU-based computing," University of Tennessee Computer
    Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219),
    May 24, 2009.
@ingroup magma_cgeev_aux ********************************************************************/ extern "C" magma_int_t magma_clahr2_m( magma_int_t n, magma_int_t k, magma_int_t nb, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex *tau, magmaFloatComplex *T, magma_int_t ldt, magmaFloatComplex *Y, magma_int_t ldy, struct cgehrd_data *data ) { #define A( i, j ) ( A + (i) + (j)*lda) #define Y( i, j ) ( Y + (i) + (j)*ldy) #define T( i, j ) ( T + (i) + (j)*ldt) #define dA( d, i, j ) (data->A [d] + (i) + (j)*ldda) #define dTi( d ) (data->Ti[d]) #define dV( d, i, j ) (data->V [d] + (i) + (j)*ldv ) #define dVd( d, i, j ) (data->Vd[d] + (i) + (j)*ldvd) #define dY( d, i, j ) (data->Y [d] + (i) + (j)*ldda) magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex tmp; magma_int_t ngpu = data->ngpu; magma_int_t ldda = data->ldda; magma_int_t ldv = data->ldv; magma_int_t ldvd = data->ldvd; magma_int_t ione = 1; magma_int_t d, dki1, dn, nblocks, gblock, lblock, lgid; magma_int_t n_k_i_1, n_k; magmaFloatComplex scale; magma_int_t i; magmaFloatComplex ei = MAGMA_C_ZERO; magma_int_t info_data = 0; magma_int_t *info = &info_data; if (n < 0) { *info = -1; } else if (k < 0 || k >= n) { *info = -2; } else if (nb < 1 || nb > n) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if (ldt < nb) { *info = -8; } else if (ldy < max(1,n)) { *info = -10; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } // adjust from 1-based indexing k -= 1; // Function Body if (n <= 1) return *info; magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); // zero out current top block of V on all GPUs for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magmablasSetKernelStream( data->streams[d] ); magmablas_claset( MagmaFull, nb, nb, c_zero, c_zero, dV(d,k,0), ldv ); } // set all Y=0 lapackf77_claset( "Full", &n, &nb, &c_zero, &c_zero, Y, &ldy ); for (i = 0; i < nb; ++i) { n_k_i_1 = n - k - i - 1; n_k = n - k; if (i > 0) { // Finish applying I - V * T * V' on right tmp = MAGMA_C_NEGATE( tau[i-1] ); blasf77_caxpy( &n_k, &tmp, Y(k,i-1), &ione, A(k,i), &ione ); // Apply I - V * T' * V' to this column (call it b) from the // left, using the last column of T as workspace, w. 
// // Let V = ( V1 ) and b = ( b1 ) (first i-1 rows) // ( V2 ) ( b2 ) // where V1 is unit lower triangular // w := b1 = A(k+1:k+i, i) blasf77_ccopy( &i, A(k+1,i), &ione, T(0,nb-1), &ione ); // w := V1' * b1 = VA(k+1:k+i, 0:i-1)' * w blasf77_ctrmv( "Lower", "Conj", "Unit", &i, A(k+1,0), &lda, T(0,nb-1), &ione ); // w := w + V2'*b2 = w + VA(k+i+1:n-1, 0:i-1)' * A(k+i+1:n-1, i) blasf77_cgemv( "Conj", &n_k_i_1, &i, &c_one, A(k+i+1,0), &lda, A(k+i+1,i), &ione, &c_one, T(0,nb-1), &ione ); // w := T'*w = T(0:i-1, 0:i-1)' * w blasf77_ctrmv( "Upper", "Conj", "Non-unit", &i, T(0,0), &ldt, T(0,nb-1), &ione ); // b2 := b2 - V2*w = A(k+i+1:n-1, i) - VA(k+i+1:n-1, 0:i-1) * w blasf77_cgemv( "No trans", &n_k_i_1, &i, &c_neg_one, A(k+i+1,0), &lda, T(0,nb-1), &ione, &c_one, A(k+i+1,i), &ione ); // w := V1*w = VA(k+1:k+i, 0:i-1) * w blasf77_ctrmv( "Lower", "No trans", "Unit", &i, A(k+1,0), &lda, T(0,nb-1), &ione ); // b1 := b1 - w = A(k+1:k+i-1, i) - w blasf77_caxpy( &i, &c_neg_one, T(0,nb-1), &ione, A(k+1,i), &ione ); // Restore diagonal element, saved below during previous iteration *A(k+i,i-1) = ei; } // Generate the elementary reflector H(i) to annihilate A(k+i+1:n-1,i) lapackf77_clarfg( &n_k_i_1, A(k+i+1,i), A(k+i+2,i), &ione, &tau[i] ); // Save diagonal element and set to one, to simplify multiplying by V ei = *A(k+i+1,i); *A(k+i+1,i) = c_one; // compute yi = A vi = sum_g A{d} vi{d} nblocks = (n-1) / nb / ngpu + 1; for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magmablasSetKernelStream( data->streams[d] ); // dV(k+i+1:n-1, i) = VA(k+i:n, i) magma_csetvector_async( n_k_i_1, A(k+i+1,i), 1, dV(d, k+i+1, i), 1, data->streams[d] ); // copy column of dV -> dVd, using block cyclic distribution. // This assumes V and Vd have been padded so that // a 2D matrix copy doesn't access them out-of-bounds gblock = k / nb; lblock = gblock / ngpu; lgid = gblock % ngpu; if ( d < lgid ) { lblock += 1; } // treat V as (nb*ngpu) x nblock matrix, and Vd as nb x nblock matrix magmablas_clacpy( MagmaFull, nb, nblocks-lblock, dV (d, d*nb + lblock*nb*ngpu, i), nb*ngpu, dVd(d, 0 + lblock*nb, i), nb ); // convert global indices (k) to local indices (dk) magma_indices_1D_bcyclic( nb, ngpu, d, k+i+1, n, &dki1, &dn ); // dY(k:n, i) = dA(k:n, k+i+1:n) * dV(k+i+1:n, i) // skip if matrix is empty // each GPU copies to different temporary vector in Y, // which are summed in separate loop below if ( dn-dki1 > 0 ) { magma_cgemv( MagmaNoTrans, n-k, dn-dki1, c_one, dA (d, k, dki1), ldda, dVd(d, dki1, i), 1, c_zero, dY (d, k, i), 1 ); // copy vector to host, storing in column nb+d of Y // as temporary space (Y has >= nb+ngpu columns) magma_cgetvector_async( n-k, dY(d, k, i), 1, Y(k, nb+d), 1, data->streams[d] ); } } // while GPU is doing above Ag*v... // Compute T(0:i,i) = [ -tau T V' vi ] // [ tau ] // T(0:i-1, i) = -tau VA(k+i+1:n-1, 0:i-1)' VA(k+i+1:n-1, i) scale = MAGMA_C_NEGATE( tau[i] ); blasf77_cgemv( "Conj", &n_k_i_1, &i, &scale, A(k+i+1,0), &lda, A(k+i+1,i), &ione, &c_zero, T(0,i), &ione ); // T(0:i-1, i) = T(0:i-1, 0:i-1) * T(0:i-1, i) blasf77_ctrmv( "Upper", "No trans", "Non-unit", &i, T(0,0), &ldt, T(0,i), &ione ); *T(i,i) = tau[i]; // apply reflectors to next column, A(i+1), on right only. // one axpy will be required to finish this, in the next iteration above if ( i > 0 && i+1 < nb ) { // Update next column, A(k:n,i+1), applying Q on right. // One axpy will be required to finish this, in the next iteration // above, after yi is computed. 
// This updates one more row than LAPACK does (row k), // making block above panel an even multiple of nb. // Use last column of T as workspace, w. magma_int_t i1 = i+1; // If complex, conjugate row of V, and undo afterwards #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv( &i1, A(k+i1,0), &lda ); #endif // w = T(0:i, 0:i+1) * VA(k+i+1, 0:i+1)' // T is now rectangular, so we use gemv instead of trmv as in lapack. blasf77_cgemv( "No trans", &i, &i1, &c_one, T(0,0), &ldt, A(k+i1,0), &lda, &c_zero, T(0,nb-1), &ione ); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv( &i1, A(k+i1,0), &lda ); #endif // A(k:n, i+1) -= Y(k:n, 0:i) * w blasf77_cgemv( "No trans", &n_k, &i, &c_neg_one, Y(k,0), &ldy, T(0,nb-1), &ione, &c_one, A(k,i1), &ione ); } // yi = sum_g yi{d} for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_queue_sync( data->streams[d] ); magma_indices_1D_bcyclic( nb, ngpu, d, k+i+1, n, &dki1, &dn ); if ( dn-dki1 > 0 ) { // yi = yi + yi{d} blasf77_caxpy( &n_k, &c_one, Y(k,nb+d), &ione, Y(k,i), &ione ); } } } // Restore diagonal element *A(k+nb,nb-1) = ei; // compute Y = Am V = sum_g Am{d} V{d} --- top part, Y(0:k-1,:) for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magmablasSetKernelStream( data->streams[d] ); // convert global indices (k) to local indices (dk) magma_indices_1D_bcyclic( nb, ngpu, d, k+1, n, &dki1, &dn ); // dY(0:k, :) = dA(0:k, k+i+1:n-1) * dV(k+i+1:n-1, :) // skip if matrix is empty // each GPU copies to different temporary block in Y, // which are summed in separate loop below if ( dn-dki1 > 0 ) { magma_cgemm( MagmaNoTrans, MagmaNoTrans, k, nb, dn-dki1, c_one, dA (d, 0, dki1), ldda, dVd(d, dki1, 0), ldvd, c_zero, dY (d, 0, 0), ldda ); // copy result to host, storing in columns [nb + nb*d : nb + nb*(d+1)] of Y // as temporary space (Y has nb + nb*ngpu columns) magma_cgetmatrix_async( k, nb, dY(d, 0, 0), ldda, Y(0,nb+nb*d), ldy, data->streams[d] ); } } // Y = sum_g Y{d} for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_queue_sync( 0 ); magma_indices_1D_bcyclic( nb, ngpu, d, k+1, n, &dki1, &dn ); if ( dn-dki1 > 0 ) { // Y = Y + Am V for( i = 0; i < nb; ++i ) { blasf77_caxpy( &k, &c_one, Y(0,nb+nb*d+i), &ione, Y(0,i), &ione ); } } } // copy Y and T matrices to GPUs for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_csetmatrix_async( n, nb, Y, ldy, dY(d, 0, 0), ldda, data->streams[d] ); magma_csetmatrix_async( nb, nb, T, nb, dTi(d), nb, data->streams[d] ); } magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); return *info; } /* magma_clahr2 */
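/*
    The (V, T, Y) factors returned above are consumed by the caller (cgehrd)
    through a block-reflector update of the trailing matrix, roughly

        A := (I - V*T*V') * (A - Y*T*V')

    where Y = A*V carries the update from the right and (V, T) is the compact
    WY representation of Q used for the update from the left.  A schematic
    sketch of applying I - V*T*V' to a trailing block B from the left
    (illustration only; the real code performs the equivalent update on the GPU):

        W = V' * B          // cgemm
        W = T  * W          // ctrmm with the upper triangular T
        B = B - V * W       // cgemm
*/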
void magmablas_chemm_mgpu_spec( magma_side_t side, magma_uplo_t uplo, magma_int_t m, magma_int_t n, magmaFloatComplex alpha, magmaFloatComplex_ptr dA[], magma_int_t ldda, magma_int_t offset, magmaFloatComplex_ptr dB[], magma_int_t lddb, magmaFloatComplex beta, magmaFloatComplex_ptr dC[], magma_int_t lddc, magmaFloatComplex_ptr dwork[], magma_int_t dworksiz, magmaFloatComplex *C, magma_int_t ldc, magmaFloatComplex *work[], magma_int_t worksiz, // TODO unused magma_int_t ngpu, magma_int_t nb, magma_queue_t queues[][20], magma_int_t nqueue, magma_event_t redevents[][MagmaMaxGPUs*MagmaMaxGPUs+10], magma_int_t nbevents, magma_int_t gnode[MagmaMaxGPUs][MagmaMaxGPUs+2], magma_int_t nbcmplx ) { #define dA(dev, i, j) (dA[dev] + (i) + (j)*ldda) #define dB(dev, i, j) (dB[dev] + (i) + (j)*lddb) #define dC(dev, i, j) (dC[dev] + (i) + (j)*lddc) #define dwork(dev, i, j) (dwork[dev] + (i) + (j)*lddwork) #define C(i, j) (C + (i) + (j)*ldc) if ( side != MagmaLeft || uplo != MagmaLower ) { fprintf( stderr, "%s: only Left Lower implemented\n", __func__ ); } assert( ldda >= m ); assert( lddb >= m ); assert( lddc >= m ); assert( nqueue >= ngpu ); assert( nbevents >= ngpu*ngpu ); magmaFloatComplex_ptr dwork1[MagmaMaxGPUs]; magmaFloatComplex_ptr dwork2[MagmaMaxGPUs]; magma_int_t lddwork = lddc; for( magma_int_t dev = 0; dev < ngpu; ++dev ) { dwork1[dev] = dwork[dev]; dwork2[dev] = dwork[dev]+n*lddwork; } assert( dworksiz >= (2*n*lddwork) ); magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_t cstream; magmablasGetKernelStream(&cstream); magma_int_t dev, devperm, myblk, mycolsize, myblkoffst; magma_int_t gdev, gcolsize, gmaster, gngpu; magma_int_t masterdev, lcdev, lccolsize, myngpu; magma_int_t stdev = (offset/nb)%ngpu; magma_int_t blockoffset = offset % nb; magma_int_t fstblksiz = 0; if(blockoffset>0){ fstblksiz = min(m, (nb - blockoffset)); } //magma_int_t nbblk = magma_ceildiv(m, nb); magma_int_t nbblk = magma_ceildiv((m+blockoffset), nb); magma_int_t maxgsize = n*nb*magma_ceildiv(nbblk, ngpu); magma_int_t remm = m- fstblksiz; magma_int_t nbblkoffst = offset/nb; magma_int_t nblstblks = -1; magma_int_t devlstblk = -1; magma_int_t lstblksiz = remm%nb; if(lstblksiz>0){ nblstblks = nbblk%ngpu; devlstblk = (nblstblks-1+ngpu)%ngpu; } magma_int_t nbcmplxactive = 0; magma_int_t cmplxisactive[MagmaMaxGPUs]; magma_int_t gpuisactive[MagmaMaxGPUs]; memset(gpuisactive, 0, MagmaMaxGPUs*sizeof(magma_int_t)); memset(cmplxisactive, 0, MagmaMaxGPUs*sizeof(magma_int_t)); //******************************* // each GPU make a GEMM with the // transpose of its blocks to compute // a final portion of X=A*VT //******************************* /* dB = V*T already ==> dB**H = T**H * V**H * compute T**H * V**H * X is equal to compute locally (VT)**H_i*X_i * then each GPU broadcast its X_i to assemble the full X which is used * to compute W = X - 0.5 * V * T**H * V**H * X = X - 0.5 * V *dwork3 */ if(ngpu ==1){ magma_setdevice( 0 ); magmablasSetKernelStream( queues[ 0 ][ 0 ] ); // compute X[me] = A*VT = A[me]^tr *VT; magma_cgemm( MagmaConjTrans, MagmaNoTrans, m, n, m, alpha, dA(0, offset, offset), ldda, dB[0], lddb, beta, dC[0], lddc ); return; } //ngpu>1 for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) { masterdev = -1; gnode[cmplxid][MagmaMaxGPUs+1] = -1; myngpu = gnode[cmplxid][MagmaMaxGPUs]; for( magma_int_t idev = 0; idev < myngpu; ++idev ) { dev = gnode[cmplxid][idev]; devperm = (dev-stdev+ngpu)%ngpu; myblk = (nbblk/ngpu) + (nbblk%ngpu > devperm ? 
1:0 ); mycolsize = myblk*nb; myblkoffst = nb*((nbblkoffst/ngpu)+(nbblkoffst%ngpu > dev?1:0)); if(dev==stdev){ mycolsize -= blockoffset; myblkoffst += blockoffset; // local index in parent matrix } if((devperm==devlstblk)&&(lstblksiz>0)){ mycolsize -= (nb-(remm%nb)); } mycolsize = min(mycolsize, m); if(mycolsize>0){ if(masterdev==-1) masterdev = dev; //printf("dev %d devperm %d on cmplx %d master %d nbblk %d myblk %d m %d n %d mycolsize %d stdev %d fstblksize %d lastdev %d lastsize %d dA(%d, %d, %d) ==> dwork(%d, %d)\n", dev, devperm, cmplxid, masterdev, nbblk, myblk, m, n, mycolsize, stdev, fstblksiz, devlstblk, remm%nb, dev, offset, myblkoffst, dev, maxgsize*dev); gpuisactive[dev] = mycolsize; magma_setdevice( dev ); magmablasSetKernelStream( queues[ dev ][ dev ] ); magma_cgemm( MagmaConjTrans, MagmaNoTrans, mycolsize, n, m, alpha, dA(dev, offset, myblkoffst), ldda, dB(dev, 0, 0), lddb, beta, &dwork[dev][maxgsize*dev], mycolsize ); magma_event_record(redevents[dev][dev*ngpu+dev], queues[dev][dev]); } if(dev == masterdev){ nbcmplxactive = nbcmplxactive +1; cmplxisactive[cmplxid] = 1; gnode[cmplxid][MagmaMaxGPUs+1] = masterdev; } } } /* for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_queue_sync( queues[ dev ][ dev ] ); } */ //******************************* // each Master GPU has the final // result either by receiving // from CPU of by making the add // by himself, so now it is time // to broadcast over the GPUs of // its board. //******************************* //printf("=======================================================================\n"); //printf(" sending \n"); //printf("=======================================================================\n"); for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) { myngpu = gnode[cmplxid][MagmaMaxGPUs]; masterdev = gnode[cmplxid][MagmaMaxGPUs+1]; for( magma_int_t idev = 0; idev < myngpu; ++idev ) { dev = gnode[cmplxid][idev]; mycolsize = gpuisactive[dev]; if(mycolsize>0){ // I am an active GPU send my portion local // to all active gpu of my cmplex and global to the // active master of the other complex and they should // send it out to their actives slaves. 
magma_setdevice( dev ); //============================================== // sending to the master of the active complex //============================================== //printf ("\n\n**************GPU %d\n ", dev); //printf (" GPU %d sending to cmplx masters\n", dev); for( magma_int_t k = 0; k < nbcmplx; ++k ) { if(k!=cmplxid){ gmaster = gnode[k][MagmaMaxGPUs+1]; if(gmaster!=-1){ //complex is active //printf (" device %d from cmplx %d is sending to master %d on cmplx %d block of size %d event %d\n", dev, cmplxid, gmaster, k, mycolsize, redevents[dev][gmaster*ngpu+dev]); magma_queue_wait_event(queues[ dev ][ gmaster ], redevents[dev][dev*ngpu+dev]); magma_ccopymatrix_async( mycolsize, n, &dwork[dev ][maxgsize*dev], mycolsize, &dwork[gmaster][maxgsize*dev], mycolsize, queues[dev][gmaster] ); magma_event_record(redevents[dev][gmaster*ngpu+dev], queues[dev][gmaster]); } } } //============================================== // //============================================== // sending to the active GPUs of my complex //============================================== //printf (" GPU %d sending internal\n", dev); for( magma_int_t l = 0; l < myngpu; ++l ) { lcdev = gnode[cmplxid][l]; lccolsize = gpuisactive[lcdev]; if((lcdev!=dev)&&(lccolsize>0)){ //printf (" device %d from cmplx %d is sending internal to dev %d block of size %d event %d\n", dev, cmplxid, lcdev, mycolsize, redevents[dev][lcdev*ngpu+dev]); magma_queue_wait_event(queues[ dev ][ lcdev ], redevents[dev][dev*ngpu+dev]); magma_ccopymatrix_async( mycolsize, n, &dwork[dev ][maxgsize*dev], mycolsize, &dwork[lcdev][maxgsize*dev], mycolsize, queues[dev][lcdev] ); magma_event_record(redevents[dev][lcdev*ngpu+dev], queues[dev][lcdev]); } } //============================================== }// end if mycolsize>0 }// for idev }// for cmplxid //printf("=======================================================================\n"); //printf(" master wait and resend internally \n"); //printf("=======================================================================\n"); for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) { myngpu = gnode[cmplxid][MagmaMaxGPUs]; masterdev = gnode[cmplxid][MagmaMaxGPUs+1]; //============================================== // if I am active master so wait receiving contribution // of the GPUs of other complex and send it locally //============================================== if(masterdev != -1){ mycolsize = gpuisactive[masterdev]; magma_setdevice( masterdev ); //printf(" GPU %d distributing internal\n", masterdev); for( magma_int_t k = 0; k < nbcmplx; ++k ) { if(k!=cmplxid){ gngpu = gnode[k][MagmaMaxGPUs]; for( magma_int_t g = 0; g < gngpu; ++g ) { gdev = gnode[k][g]; gcolsize = gpuisactive[gdev]; // check if I received from this GPU, // if yes send it to my group if(gcolsize>0){ magma_queue_wait_event(queues[ masterdev ][ gdev ], redevents[gdev][masterdev*ngpu+gdev]); for( magma_int_t l = 0; l < myngpu; ++l ) { lcdev = gnode[cmplxid][l]; lccolsize = gpuisactive[lcdev]; if((lcdev!=masterdev)&&(lccolsize>0)){ //printf(" Master %d on cmplx %d waiting on event %d is distributing internal results of %d to lcdev %d block of size %d event %d\n", masterdev, cmplxid, redevents[gdev][masterdev*ngpu+gdev], gdev, lcdev, gcolsize, redevents[masterdev][lcdev*ngpu+gdev]); magma_ccopymatrix_async( gcolsize, n, &dwork[masterdev][maxgsize*gdev], gcolsize, &dwork[lcdev ][maxgsize*gdev], gcolsize, queues[masterdev][gdev] ); magma_event_record(redevents[masterdev][lcdev*ngpu+gdev], queues[masterdev][gdev]); } } } } } } }// if active 
master //============================================== }// for cmplxid /* for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_queue_sync( queues[ dev ][ 0 ] ); for( magma_int_t s = 0; s < ngpu; ++s ) { magma_queue_sync( queues[ dev ][ s ] ); } } */ //printf("=======================================================================\n"); //printf(" distributing \n"); //printf("=======================================================================\n"); magma_int_t lcblki, gbblki, gblk, ib; for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) { myngpu = gnode[cmplxid][MagmaMaxGPUs]; masterdev = gnode[cmplxid][MagmaMaxGPUs+1]; for( magma_int_t idev = 0; idev < myngpu; ++idev ) { dev = gnode[cmplxid][idev]; mycolsize = gpuisactive[dev]; if(mycolsize>0){ // I am an active GPU //printf("\n\n==============GPU %d collecting\n", dev); magma_setdevice( dev ); // collect my results first as tyhere is no need to wait to // receive nothing, just wait that my gemm are done. // in theory this should be inside the loop but cuda was not // able to run it first for all gpu and on gpu>0 it was waiting // however it was on different stream so it should run. but maybe // this is because there are too many function call and this make // cuda not handleit so nice. anyway it coul dbe removed when cuda // is able to lunch it first without wait. gdev = dev; gcolsize = gpuisactive[gdev]; if(gcolsize>0){ devperm = (gdev-stdev+ngpu)%ngpu; gblk = (nbblk/ngpu) + (nbblk%ngpu > devperm ? 1:0 ); magmablasSetKernelStream( queues[ dev ][ gdev ] ); magma_queue_wait_event(queues[ dev ][ gdev ], redevents[gdev][dev*ngpu+gdev]); //printf (" GPU %d stream %d doing clacpy\n", dev, queues[ dev ][ gdev ]); for( magma_int_t blki = 0; blki < gblk; ++blki){ gbblki = (blki*ngpu + devperm)*nb - blockoffset; lcblki = blki*nb; ib = nb;//min(nb, m-gbblki); if(gdev==stdev){ lcblki = blki*nb-blockoffset; if(blki==0){ gbblki = 0; lcblki = 0; ib = nb-blockoffset; } } ib = min(ib, m-gbblki); //printf(" blockoffset %d nbblk %d stdev %d receiving from gdev %d gblk %d gcolsize %d copying blki %d of size ibxn %dx%d from work[%d] to C[%d]\n", blockoffset, nbblk, stdev, gdev, gblk, gcolsize, blki, ib, n, lcblki, gbblki); magmablas_clacpy( MagmaFull, ib, n, &dwork[dev][maxgsize*gdev+lcblki], gcolsize, &dC[dev][gbblki], lddc); }// end blki } for( magma_int_t k = 0; k < nbcmplx; ++k ) { gngpu = gnode[k][MagmaMaxGPUs]; for( magma_int_t g = 0; g < gngpu; ++g ) { gdev = gnode[k][g]; gcolsize = gpuisactive[gdev]; // if gcolsize>0, ==> gpu gdev was active and so // I received from him/computed a portion of dwork, // so go over its gblk and distribute it on dC. if(gdev!=dev){ if(gcolsize>0){ devperm = (gdev-stdev+ngpu)%ngpu; gblk = (nbblk/ngpu) + (nbblk%ngpu > devperm ? 
1:0 ); magmablasSetKernelStream( queues[ dev ][ gdev ] ); if(k==cmplxid){ //we are on the same group so wait on event issued by gdev for me citing his id magma_queue_wait_event(queues[ dev ][ gdev ], redevents[gdev][dev*ngpu+gdev]); //printf (" GPU %d queue %d waiting on event %d to collecte from %d the size of gcolsize %d\n", dev, queues[ dev ][ gdev ], redevents[gdev][dev*ngpu+gdev], gdev, gcolsize); }else{ //we are on different group so: //if I am the master wait on the event issued by gdev for me citing his id //else wait event issued by my master for me on the behalf of gdev //printf (" GPU %d queue %d waiting on event %d to collecte from %d the size of gcolsize %d\n", dev, queues[ dev ][ gdev ], redevents[masterdev][dev*ngpu+gdev], gdev, gcolsize); if(dev==masterdev) magma_queue_wait_event(queues[ dev ][ gdev ], redevents[gdev][dev*ngpu+gdev]); else magma_queue_wait_event(queues[ dev ][ gdev ], redevents[masterdev][dev*ngpu+gdev]); } //printf (" GPU %d stream %d doing clacpy\n", dev, queues[ dev ][ gdev ]); for( magma_int_t blki = 0; blki < gblk; ++blki){ gbblki = (blki*ngpu + devperm)*nb - blockoffset; lcblki = blki*nb; ib = nb;//min(nb, m-gbblki); if(gdev==stdev){ lcblki = blki*nb-blockoffset; if(blki==0){ gbblki = 0; lcblki = 0; ib = nb-blockoffset; } } ib = min(ib, m-gbblki); //printf(" blockoffset %d nbblk %d stdev %d receiving from gdev %d gblk %d gcolsize %d copying blki %d of size ibxn %dx%d from work[%d] to C[%d]\n", blockoffset, nbblk, stdev, gdev, gblk, gcolsize, blki, ib, n, lcblki, gbblki); magmablas_clacpy( MagmaFull, ib, n, &dwork[dev][maxgsize*gdev+lcblki], gcolsize, &dC[dev][gbblki], lddc); }// end blki }// en gcolsize>0 meaning gdev is active } // end if gdev != dev }// end loop over the g gpus of the cmplx k }//end loop over the complex k }// end mycolsize>0 meaning that I am active }// end loop over idev of cmplxid }// end loop of the cmplx for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_device_sync(); } // put back the input gpu and its input stream magma_setdevice( cdev ); magmablasSetKernelStream( cstream ); }
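// Sketch of the 1-D block-cyclic layout assumed throughout the routine above
// (owner_of_block_sketch and blocks_on_device_sketch are hypothetical helpers,
// for illustration only).  The m rows starting at row `offset` are split into
// blocks of nb rows; counting blocks from the one containing `offset`, block
// blki lives on device (stdev + blki) % ngpu, where stdev = (offset/nb) % ngpu
// is the device owning the first block.
static int owner_of_block_sketch( int blki, int offset, int nb, int ngpu )
{
    int stdev = (offset / nb) % ngpu;
    return (stdev + blki) % ngpu;
}

// Number of blocks held by device `dev`, matching the
//     myblk = (nbblk/ngpu) + (nbblk%ngpu > devperm ? 1 : 0)
// computation used in the routine above.
static int blocks_on_device_sketch( int dev, int m, int offset, int nb, int ngpu )
{
    int stdev       = (offset / nb) % ngpu;
    int blockoffset = offset % nb;
    int nbblk       = (m + blockoffset + nb - 1) / nb;    // magma_ceildiv(m + blockoffset, nb)
    int devperm     = (dev - stdev + ngpu) % ngpu;
    return nbblk/ngpu + (nbblk%ngpu > devperm ? 1 : 0);
}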