extern "C" magma_int_t magma_cgmres( magma_c_sparse_matrix A, magma_c_vector b, magma_c_vector *x, magma_c_solver_par *solver_par, magma_queue_t queue ) { magma_int_t stat = 0; // set queue for old dense routines magma_queue_t orig_queue; magmablasGetKernelStream( &orig_queue ); magma_int_t stat_cpu = 0, stat_dev = 0; // prepare solver feedback solver_par->solver = Magma_GMRES; solver_par->numiter = 0; solver_par->info = MAGMA_SUCCESS; // local variables magmaFloatComplex c_zero = MAGMA_C_ZERO, c_one = MAGMA_C_ONE, c_mone = MAGMA_C_NEG_ONE; magma_int_t dofs = A.num_rows; magma_int_t i, j, k, m = 0; magma_int_t restart = min( dofs-1, solver_par->restart ); magma_int_t ldh = restart+1; float nom, rNorm, RNorm, nom0, betanom, r0 = 0.; // CPU workspace //magma_setdevice(0); magmaFloatComplex *H, *HH, *y, *h1; stat_cpu += magma_cmalloc_pinned( &H, (ldh+1)*ldh ); stat_cpu += magma_cmalloc_pinned( &y, ldh ); stat_cpu += magma_cmalloc_pinned( &HH, ldh*ldh ); stat_cpu += magma_cmalloc_pinned( &h1, ldh ); if( stat_cpu != 0){ magma_free_pinned( H ); magma_free_pinned( y ); magma_free_pinned( HH ); magma_free_pinned( h1 ); magmablasSetKernelStream( orig_queue ); return MAGMA_ERR_HOST_ALLOC; } // GPU workspace magma_c_vector r, q, q_t; magma_c_vinit( &r, Magma_DEV, dofs, c_zero, queue ); magma_c_vinit( &q, Magma_DEV, dofs*(ldh+1), c_zero, queue ); q_t.memory_location = Magma_DEV; q_t.dval = NULL; q_t.num_rows = q_t.nnz = dofs; q_t.num_cols = 1; magmaFloatComplex *dy = NULL, *dH = NULL; stat_dev += magma_cmalloc( &dy, ldh ); stat_dev += magma_cmalloc( &dH, (ldh+1)*ldh ); if( stat_dev != 0){ magma_free_pinned( H ); magma_free_pinned( y ); magma_free_pinned( HH ); magma_free_pinned( h1 ); magma_free( dH ); magma_free( dy ); magma_free( dH ); magma_free( dy ); magmablasSetKernelStream( orig_queue ); return MAGMA_ERR_DEVICE_ALLOC; } // GPU stream magma_queue_t stream[2]; magma_event_t event[1]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); magma_event_create( &event[0] ); //magmablasSetKernelStream(stream[0]); magma_cscal( dofs, c_zero, x->dval, 1 ); // x = 0 magma_ccopy( dofs, b.dval, 1, r.dval, 1 ); // r = b nom0 = betanom = magma_scnrm2( dofs, r.dval, 1 ); // nom0= || r|| nom = nom0 * nom0; solver_par->init_res = nom0; H(1,0) = MAGMA_C_MAKE( nom0, 0. ); magma_csetvector(1, &H(1,0), 1, &dH(1,0), 1); if ( (r0 = nom0 * solver_par->epsilon ) < ATOLERANCE ){ r0 = solver_par->epsilon; } if ( nom < r0 ) { magmablasSetKernelStream( orig_queue ); return MAGMA_SUCCESS; } //Chronometry real_Double_t tempo1, tempo2; tempo1 = magma_sync_wtime( queue ); if ( solver_par->verbose > 0 ) { solver_par->res_vec[0] = nom0; solver_par->timing[0] = 0.0; } // start iteration for( solver_par->numiter= 1; solver_par->numiter<solver_par->maxiter; solver_par->numiter++ ) { for(k=1; k<=restart; k++) { magma_ccopy(dofs, r.dval, 1, q(k-1), 1); // q[0] = 1.0/||r|| magma_cscal(dofs, 1./H(k,k-1), q(k-1), 1); // (to be fused) q_t.dval = q(k-1); //magmablasSetKernelStream(stream[0]); magma_c_spmv( c_one, A, q_t, c_zero, r, queue ); // r = A q[k] // if (solver_par->ortho == Magma_MGS ) { // modified Gram-Schmidt for (i=1; i<=k; i++) { H(i,k) =magma_cdotc(dofs, q(i-1), 1, r.dval, 1); // H(i,k) = q[i] . r magma_caxpy(dofs,-H(i,k), q(i-1), 1, r.dval, 1); // r = r - H(i,k) q[i] } H(k+1,k) = MAGMA_C_MAKE( magma_scnrm2(dofs, r.dval, 1), 0. ); // H(k+1,k) = ||r|| /*} else if (solver_par->ortho == Magma_FUSED_CGS ) { // fusing cgemv with scnrm2 in classical Gram-Schmidt magmablasSetKernelStream(stream[0]); magma_ccopy(dofs, r.dval, 1, q(k), 1); // dH(1:k+1,k) = q[0:k] . r magmablas_cgemv(MagmaTrans, dofs, k+1, c_one, q(0), dofs, r.dval, 1, c_zero, &dH(1,k), 1); // r = r - q[0:k-1] dH(1:k,k) magmablas_cgemv(MagmaNoTrans, dofs, k, c_mone, q(0), dofs, &dH(1,k), 1, c_one, r.dval, 1); // 1) dH(k+1,k) = sqrt( dH(k+1,k) - dH(1:k,k) ) magma_ccopyscale( dofs, k, r.dval, q(k), &dH(1,k) ); // 2) q[k] = q[k] / dH(k+1,k) magma_event_record( event[0], stream[0] ); magma_queue_wait_event( stream[1], event[0] ); magma_cgetvector_async(k+1, &dH(1,k), 1, &H(1,k), 1, stream[1]); // asynch copy dH(1:(k+1),k) to H(1:(k+1),k) } else { // classical Gram-Schmidt (default) // > explicitly calling magmabls magmablasSetKernelStream(stream[0]); magmablas_cgemv(MagmaTrans, dofs, k, c_one, q(0), dofs, r.dval, 1, c_zero, &dH(1,k), 1, queue ); // dH(1:k,k) = q[0:k-1] . r #ifndef SCNRM2SCALE // start copying dH(1:k,k) to H(1:k,k) magma_event_record( event[0], stream[0] ); magma_queue_wait_event( stream[1], event[0] ); magma_cgetvector_async(k, &dH(1,k), 1, &H(1,k), 1, stream[1]); #endif // r = r - q[0:k-1] dH(1:k,k) magmablas_cgemv(MagmaNoTrans, dofs, k, c_mone, q(0), dofs, &dH(1,k), 1, c_one, r.dval, 1); #ifdef SCNRM2SCALE magma_ccopy(dofs, r.dval, 1, q(k), 1); // q[k] = r / H(k,k-1) magma_scnrm2scale(dofs, q(k), dofs, &dH(k+1,k) ); // dH(k+1,k) = sqrt(r . r) and r = r / dH(k+1,k) magma_event_record( event[0], stream[0] ); // start sending dH(1:k,k) to H(1:k,k) magma_queue_wait_event( stream[1], event[0] ); // can we keep H(k+1,k) on GPU and combine? magma_cgetvector_async(k+1, &dH(1,k), 1, &H(1,k), 1, stream[1]); #else H(k+1,k) = MAGMA_C_MAKE( magma_scnrm2(dofs, r.dval, 1), 0. ); // H(k+1,k) = sqrt(r . r) if ( k<solver_par->restart ) { magmablasSetKernelStream(stream[0]); magma_ccopy(dofs, r.dval, 1, q(k), 1); // q[k] = 1.0/H[k][k-1] r magma_cscal(dofs, 1./H(k+1,k), q(k), 1); // (to be fused) } #endif }*/ /* Minimization of || b-Ax || in H_k */ for (i=1; i<=k; i++) { HH(k,i) = magma_cblas_cdotc( i+1, &H(1,k), 1, &H(1,i), 1 ); } h1[k] = H(1,k)*H(1,0); if (k != 1) { for (i=1; i<k; i++) { HH(k,i) = HH(k,i)/HH(i,i);// for (m=i+1; m<=k; m++) { HH(k,m) -= HH(k,i) * HH(m,i) * HH(i,i); } h1[k] -= h1[i] * HH(k,i); } } y[k] = h1[k]/HH(k,k); if (k != 1) for (i=k-1; i>=1; i--) { y[i] = h1[i]/HH(i,i); for (j=i+1; j<=k; j++) y[i] -= y[j] * HH(j,i); } m = k; rNorm = fabs(MAGMA_C_REAL(H(k+1,k))); }/* Minimization done */ // compute solution approximation magma_csetmatrix(m, 1, y+1, m, dy, m ); magma_cgemv(MagmaNoTrans, dofs, m, c_one, q(0), dofs, dy, 1, c_one, x->dval, 1); // compute residual magma_c_spmv( c_mone, A, *x, c_zero, r, queue ); // r = - A * x magma_caxpy(dofs, c_one, b.dval, 1, r.dval, 1); // r = r + b H(1,0) = MAGMA_C_MAKE( magma_scnrm2(dofs, r.dval, 1), 0. ); // RNorm = H[1][0] = || r || RNorm = MAGMA_C_REAL( H(1,0) ); betanom = fabs(RNorm); if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( betanom < r0 ) { break; } } tempo2 = magma_sync_wtime( queue ); solver_par->runtime = (real_Double_t) tempo2-tempo1; float residual; magma_cresidual( A, b, *x, &residual, queue ); solver_par->iter_res = betanom; solver_par->final_res = residual; if ( solver_par->numiter < solver_par->maxiter) { solver_par->info = MAGMA_SUCCESS; } else if ( solver_par->init_res > solver_par->final_res ) { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = MAGMA_SLOW_CONVERGENCE; } else { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = MAGMA_DIVERGENCE; } // free pinned memory magma_free_pinned( H ); magma_free_pinned( y ); magma_free_pinned( HH ); magma_free_pinned( h1 ); // free GPU memory magma_free(dy); if (dH != NULL ) magma_free(dH); magma_c_vfree(&r, queue ); magma_c_vfree(&q, queue ); // free GPU streams and events magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_event_destroy( event[0] ); //magmablasSetKernelStream(NULL); magmablasSetKernelStream( orig_queue ); return MAGMA_SUCCESS; } /* magma_cgmres */
/** Purpose ------- CGERFS improve the computed solution to a system of linear equations. The iterative refinement process is stopped if ITER > ITERMAX or for all the RHS we have: RNRM < SQRT(n)*XNRM*ANRM*EPS*BWDMAX where o ITER is the number of the current iteration in the iterative refinement process o RNRM is the infinity-norm of the residual o XNRM is the infinity-norm of the solution o ANRM is the infinity-operator-norm of the matrix A o EPS is the machine epsilon returned by SLAMCH('Epsilon') The value ITERMAX and BWDMAX are fixed to 30 and 1.0D+00 respectively. Arguments --------- @param[in] trans magma_trans_t Specifies the form of the system of equations: - = MagmaNoTrans: A * X = B (No transpose) - = MagmaTrans: A**T * X = B (Transpose) - = MagmaConjTrans: A**H * X = B (Conjugate transpose) @param[in] n INTEGER The number of linear equations, i.e., the order of the matrix A. N >= 0. @param[in] nrhs INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. @param[in] dA COMPLEX array on the GPU, dimension (ldda,N) the N-by-N coefficient matrix A. @param[in] ldda INTEGER The leading dimension of the array dA. ldda >= max(1,N). @param[in] dB COMPLEX array on the GPU, dimension (lddb,NRHS) The N-by-NRHS right hand side matrix B. @param[in] lddb INTEGER The leading dimension of the array dB. lddb >= max(1,N). @param[in, out] dX COMPLEX array on the GPU, dimension (lddx,NRHS) On entry, the solution matrix X, as computed by CGETRS_NOPIV. On exit, the improved solution matrix X. @param[in] lddx INTEGER The leading dimension of the array dX. lddx >= max(1,N). @param dworkd (workspace) COMPLEX array on the GPU, dimension (N*NRHS) This array is used to hold the residual vectors. @param dAF COMPLEX array on the GPU, dimension (ldda,n) The factors L and U from the factorization A = L*U as computed by CGETRF_NOPIV. @param[out] iter INTEGER - < 0: iterative refinement has failed, real factorization has been performed + -1 : the routine fell back to full precision for implementation- or machine-specific reasons + -2 : narrowing the precision induced an overflow, the routine fell back to full precision + -3 : failure of SGETRF + -31: stop the iterative refinement after the 30th iteration - > 0: iterative refinement has been successfully used. Returns the number of iterations @param[out] info INTEGER - = 0: successful exit - < 0: if info = -i, the i-th argument had an illegal value - > 0: if info = i, U(i,i) computed in REAL is exactly zero. The factorization has been completed, but the factor U is exactly singular, so the solution could not be computed. @ingroup magma_cgesv_driver ********************************************************************/ extern "C" magma_int_t magma_cgerfs_nopiv_gpu( magma_trans_t trans, magma_int_t n, magma_int_t nrhs, magmaFloatComplex_ptr dA, magma_int_t ldda, magmaFloatComplex_ptr dB, magma_int_t lddb, magmaFloatComplex_ptr dX, magma_int_t lddx, magmaFloatComplex_ptr dworkd, magmaFloatComplex_ptr dAF, magma_int_t *iter, magma_int_t *info) { #define dB(i,j) (dB + (i) + (j)*lddb) #define dX(i,j) (dX + (i) + (j)*lddx) #define dR(i,j) (dR + (i) + (j)*lddr) magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex c_one = MAGMA_C_ONE; magma_int_t ione = 1; magmaFloatComplex_ptr dR; magmaFloatComplex Xnrmv, Rnrmv; float Anrm, Xnrm, Rnrm, cte, eps; magma_int_t i, j, iiter, lddsa, lddr; /* Check arguments */ *iter = 0; *info = 0; if ( n < 0 ) *info = -1; else if ( nrhs < 0 ) *info = -2; else if ( ldda < max(1,n)) *info = -4; else if ( lddb < max(1,n)) *info = -8; else if ( lddx < max(1,n)) *info = -10; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if ( n == 0 || nrhs == 0 ) return *info; lddsa = n; lddr = n; dR = dworkd; eps = lapackf77_slamch("Epsilon"); Anrm = magmablas_clange(MagmaInfNorm, n, n, dA, ldda, (float*)dworkd ); cte = Anrm * eps * pow( (float)n, (float)0.5 ) * BWDMAX; // residual dR = dB - dA*dX in real magmablas_clacpy( MagmaUpperLower, n, nrhs, dB, lddb, dR, lddr ); if ( nrhs == 1 ) { magma_cgemv( trans, n, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_cgemm( trans, MagmaNoTrans, n, nrhs, n, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } // TODO: use MAGMA_C_ABS( dX(i,j) ) instead of clange? for( j=0; j < nrhs; j++ ) { i = magma_icamax( n, dX(0,j), 1) - 1; magma_cgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_clange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_icamax ( n, dR(0,j), 1 ) - 1; magma_cgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_clange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); // printf("Rnrm : %e, Xnrm*cte : %e\n", Rnrm, Xnrm*cte); if ( Rnrm > Xnrm*cte ) { goto REFINEMENT; } } *iter = 0; return *info; REFINEMENT: for( iiter=1; iiter < ITERMAX; ) { *info = 0; // solve dAF*dX = dR // it's okay that dR is used for both dB input and dX output. magma_cgetrs_nopiv_gpu( trans, n, nrhs, dAF, lddsa, dR, lddr, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // Add correction and setup residual // dX += dR --and-- // dR = dB // This saves going through dR a second time (if done with one more kernel). // -- not really: first time is read, second time is write. for( j=0; j < nrhs; j++ ) { magmablas_caxpycp2( n, dR(0,j), dX(0,j), dB(0,j) ); } // residual dR = dB - dA*dX in real if ( nrhs == 1 ) { magma_cgemv( trans, n, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_cgemm( trans, MagmaNoTrans, n, nrhs, n, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } /* Check whether the nrhs normwise backward errors satisfy the * stopping criterion. If yes, set ITER=IITER > 0 and return. */ for( j=0; j < nrhs; j++ ) { i = magma_icamax( n, dX(0,j), 1) - 1; magma_cgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_clange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_icamax ( n, dR(0,j), 1 ) - 1; magma_cgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_clange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto L20; } } /* If we are here, the nrhs normwise backward errors satisfy * the stopping criterion, we are good to exit. */ *iter = iiter; return *info; L20: iiter++; } /* If we are at this place of the code, this is because we have * performed ITER=ITERMAX iterations and never satisified the * stopping criterion. Set up the ITER flag accordingly. */ *iter = -ITERMAX - 1; FALLBACK: /* Iterative refinement failed to converge to a * satisfactory solution. */ return *info; }
/***************************************************************************//** Purpose ------- CGEQRS solves the least squares problem min || A*X - C || using the QR factorization A = Q*R computed by CGEQRF_GPU. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. M >= N >= 0. @param[in] nrhs INTEGER The number of columns of the matrix C. NRHS >= 0. @param[in] dA COMPLEX array on the GPU, dimension (LDDA,N) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,n, as returned by CGEQRF_GPU in the first n columns of its array argument A. @param[in] ldda INTEGER The leading dimension of the array A, LDDA >= M. @param[in] tau COMPLEX array, dimension (N) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by MAGMA_CGEQRF_GPU. @param[in,out] dB COMPLEX array on the GPU, dimension (LDDB,NRHS) On entry, the M-by-NRHS matrix C. On exit, the N-by-NRHS solution matrix X. @param[in,out] dT COMPLEX array that is the output (the 6th argument) of magma_cgeqrf_gpu of size 2*MIN(M, N)*NB + ceil(N/32)*32 )* MAX(NB, NRHS). The array starts with a block of size MIN(M,N)*NB that stores the triangular T matrices used in the QR factorization, followed by MIN(M,N)*NB block storing the diagonal block inverses for the R matrix, followed by work space of size (ceil(N/32)*32)* MAX(NB, NRHS). @param[in] lddb INTEGER The leading dimension of the array dB. LDDB >= M. @param[out] hwork (workspace) COMPLEX array, dimension (LWORK) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK, LWORK >= (M - N + NB)*(NRHS + NB) + NRHS*NB, where NB is the blocksize given by magma_get_cgeqrf_nb( M, N ). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the HWORK array, returns this value as the first entry of the WORK array. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_geqrs *******************************************************************************/ extern "C" magma_int_t magma_cgeqrs_gpu( magma_int_t m, magma_int_t n, magma_int_t nrhs, magmaFloatComplex_const_ptr dA, magma_int_t ldda, magmaFloatComplex const *tau, magmaFloatComplex_ptr dT, magmaFloatComplex_ptr dB, magma_int_t lddb, magmaFloatComplex *hwork, magma_int_t lwork, magma_int_t *info) { #define dA(i_,j_) (dA + (i_) + (j_)*ldda) #define dT(i_) (dT + (lddwork + (i_))*nb) /* Constants */ const magmaFloatComplex c_zero = MAGMA_C_ZERO; const magmaFloatComplex c_one = MAGMA_C_ONE; const magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; const magma_int_t ione = 1; /* Local variables */ magmaFloatComplex_ptr dwork; magma_int_t i, min_mn, lddwork, rows, ib; magma_int_t nb = magma_get_cgeqrf_nb( m, n ); magma_int_t lwkopt = (m - n + nb)*(nrhs + nb) + nrhs*nb; bool lquery = (lwork == -1); hwork[0] = magma_cmake_lwork( lwkopt ); *info = 0; if (m < 0) *info = -1; else if (n < 0 || m < n) *info = -2; else if (nrhs < 0) *info = -3; else if (ldda < max(1,m)) *info = -5; else if (lddb < max(1,m)) *info = -9; else if (lwork < lwkopt && ! lquery) *info = -11; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; min_mn = min(m,n); if (min_mn == 0) { hwork[0] = c_one; return *info; } magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); /* B := Q^H * B */ magma_cunmqr_gpu( MagmaLeft, Magma_ConjTrans, m, nrhs, n, dA(0,0), ldda, tau, dB, lddb, hwork, lwork, dT, nb, info ); if ( *info != 0 ) { magma_queue_destroy( queue ); return *info; } /* Solve R*X = B(1:n,:) */ lddwork= min_mn; if (nb < min_mn) dwork = dT+2*lddwork*nb; else dwork = dT; // To do: Why did we have this line originally; seems to be a bug (Stan)? // dwork = dT; i = (min_mn - 1)/nb * nb; ib = n-i; rows = m-i; // TODO: this assumes that, on exit from magma_cunmqr_gpu, hwork contains // the last block of A and B (i.e., C in cunmqr). This should be fixed. // Seems this data should already be on the GPU, so could switch to // magma_ctrsm and drop the csetmatrix. if ( nrhs == 1 ) { blasf77_ctrsv( MagmaUpperStr, MagmaNoTransStr, MagmaNonUnitStr, &ib, hwork, &rows, hwork+rows*ib, &ione); } else { blasf77_ctrsm( MagmaLeftStr, MagmaUpperStr, MagmaNoTransStr, MagmaNonUnitStr, &ib, &nrhs, &c_one, hwork, &rows, hwork+rows*ib, &rows); } // update the solution vector magma_csetmatrix( ib, nrhs, hwork+rows*ib, rows, dwork+i, lddwork, queue ); // update c if (nrhs == 1) { magma_cgemv( MagmaNoTrans, i, ib, c_neg_one, dA(0, i), ldda, dwork + i, 1, c_one, dB, 1, queue ); } else { magma_cgemm( MagmaNoTrans, MagmaNoTrans, i, nrhs, ib, c_neg_one, dA(0, i), ldda, dwork + i, lddwork, c_one, dB, lddb, queue ); } magma_int_t start = i-nb; if (nb < min_mn) { for (i = start; i >= 0; i -= nb) { ib = min(min_mn - i, nb); rows = m - i; if (i + ib < n) { if (nrhs == 1) { magma_cgemv( MagmaNoTrans, ib, ib, c_one, dT(i), ib, dB+i, 1, c_zero, dwork+i, 1, queue ); magma_cgemv( MagmaNoTrans, i, ib, c_neg_one, dA(0, i), ldda, dwork + i, 1, c_one, dB, 1, queue ); } else { magma_cgemm( MagmaNoTrans, MagmaNoTrans, ib, nrhs, ib, c_one, dT(i), ib, dB+i, lddb, c_zero, dwork+i, lddwork, queue ); magma_cgemm( MagmaNoTrans, MagmaNoTrans, i, nrhs, ib, c_neg_one, dA(0, i), ldda, dwork + i, lddwork, c_one, dB, lddb, queue ); } } } } magma_ccopymatrix( n, nrhs, dwork, lddwork, dB, lddb, queue ); magma_queue_destroy( queue ); return *info; }
/** @deprecated Purpose ------- CLAQPS computes a step of QR factorization with column pivoting of a complex M-by-N matrix A by using Blas-3. It tries to factorize NB columns from A starting from the row OFFSET+1, and updates all of the matrix with Blas-3 xGEMM. In some cases, due to catastrophic cancellations, it cannot factorize NB columns. Hence, the actual number of factorized columns is returned in KB. Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0 @param[in] offset INTEGER The number of rows of A that have been factorized in previous steps. @param[in] nb INTEGER The number of columns to factorize. @param[out] kb INTEGER The number of columns actually factorized. @param[in,out] dA COMPLEX array, dimension (LDDA,N), on the GPU. On entry, the M-by-N matrix A. On exit, block A(OFFSET+1:M,1:KB) is the triangular factor obtained and block A(1:OFFSET,1:N) has been accordingly pivoted, but no factorized. The rest of the matrix, block A(OFFSET+1:M,KB+1:N) has been updated. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,M). @param[in,out] jpvt INTEGER array, dimension (N) JPVT(I) = K <==> Column K of the full matrix A has been permuted into position I in AP. @param[out] tau COMPLEX array, dimension (KB) The scalar factors of the elementary reflectors. @param[in,out] vn1 REAL array, dimension (N) The vector with the partial column norms. @param[in,out] vn2 REAL array, dimension (N) The vector with the exact column norms. @param[in,out] dauxv COMPLEX array, dimension (NB), on the GPU Auxiliary vector. @param[in,out] dF COMPLEX array, dimension (LDDF,NB), on the GPU Matrix F' = L*Y'*A. @param[in] lddf INTEGER The leading dimension of the array F. LDDF >= max(1,N). @ingroup magma_cgeqp3_aux ********************************************************************/ extern "C" magma_int_t magma_claqps_gpu( magma_int_t m, magma_int_t n, magma_int_t offset, magma_int_t nb, magma_int_t *kb, magmaFloatComplex_ptr dA, magma_int_t ldda, magma_int_t *jpvt, magmaFloatComplex *tau, float *vn1, float *vn2, magmaFloatComplex_ptr dauxv, magmaFloatComplex_ptr dF, magma_int_t lddf) { #define dA(i, j) (dA + (i) + (j)*(ldda)) #define dF(i, j) (dF + (i) + (j)*(lddf)) magmaFloatComplex c_zero = MAGMA_C_MAKE( 0.,0.); magmaFloatComplex c_one = MAGMA_C_MAKE( 1.,0.); magmaFloatComplex c_neg_one = MAGMA_C_MAKE(-1.,0.); magma_int_t ione = 1; magma_int_t i__1, i__2; magmaFloatComplex z__1; magma_int_t k, rk; magmaFloatComplex_ptr dAks; magmaFloatComplex tauk = MAGMA_C_ZERO; magma_int_t pvt; float tol3z; magma_int_t itemp; float lsticc; magmaFloat_ptr dlsticcs; magma_smalloc( &dlsticcs, 1+256*(n+255)/256 ); tol3z = magma_ssqrt( lapackf77_slamch("Epsilon")); lsticc = 0; k = 0; magma_cmalloc( &dAks, nb ); magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); while( k < nb && lsticc == 0 ) { rk = offset + k; /* Determine ith pivot column and swap if necessary */ // subtract 1 from Fortran/CUBLAS isamax; pvt, k are 0-based. pvt = k + magma_isamax( n-k, &vn1[k], ione, queue ) - 1; if (pvt != k) { /* F gets swapped so F must be sent at the end to GPU */ i__1 = k; magmablas_cswap( m, dA(0, pvt), ione, dA(0, k), ione, queue ); magmablas_cswap( i__1, dF(pvt, 0), lddf, dF(k, 0), lddf, queue ); itemp = jpvt[pvt]; jpvt[pvt] = jpvt[k]; jpvt[k] = itemp; magma_sswap( 2, &vn1[pvt], n+offset, &vn1[k], n+offset, queue ); } /* Apply previous Householder reflectors to column K: A(RK:M,K) := A(RK:M,K) - A(RK:M,1:K-1)*F(K,1:K-1)'. Optimization: multiply with beta=0; wait for vector and subtract */ if (k > 0) { //#define RIGHT_UPDATE #ifdef RIGHT_UPDATE i__1 = m - offset - nb; i__2 = k; magma_cgemv( MagmaNoTrans, i__1, i__2, c_neg_one, A(offset+nb, 0), lda, F(k, 0), ldf, c_one, A(offset+nb, k), ione, queue ); #else i__1 = m - rk; i__2 = k; magma_cgemv( MagmaNoTrans, i__1, i__2, c_neg_one, dA(rk, 0), ldda, dF(k, 0), lddf, c_one, dA(rk, k), ione, queue ); #endif } /* Generate elementary reflector H(k). */ magma_clarfg_gpu( m-rk, dA(rk, k), dA(rk + 1, k), &tau[k], &vn1[k], &dAks[k], queue ); /* needed to avoid the race condition */ if (k == 0) magma_csetvector( 1, &c_one, 1, dA(rk, k), 1, queue ); else magma_ccopymatrix( 1, 1, dA(offset, 0), 1, dA(rk, k), 1, queue ); /* Compute Kth column of F: Compute F(K+1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) on the GPU */ if (k < n-1 || k > 0) magma_cgetvector( 1, &tau[k], 1, &tauk, 1, queue ); if (k < n-1) { i__1 = m - rk; i__2 = n - k - 1; /* Multiply on GPU */ magma_cgemv( MagmaConjTrans, m-rk, n-k-1, tauk, dA( rk, k+1 ), ldda, dA( rk, k ), 1, c_zero, dF( k+1, k ), 1, queue ); } /* Incremental updating of F: F(1:N,K) := F(1:N,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K). F(1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K) := tau(K)(A(RK:M,K+1:N)' - F(1:N,1:K-1)*A(RK:M,1:K-1)') A(RK:M,K) so, F is (updated A)*V */ if (k > 0) { z__1 = MAGMA_C_NEGATE( tauk ); #ifdef RIGHT_UPDATE i__1 = m - offset - nb; i__2 = k; magma_cgemv( MagmaConjTrans, i__1, i__2, z__1, dA(offset+nb, 0), lda, dA(offset+nb, k), ione, c_zero, dauxv, ione, queue ); i__1 = k; magma_cgemv( MagmaNoTrans, n-k-1, i__1, c_one, F(k+1,0), ldf, dauxv, ione, c_one, F(k+1,k), ione, queue ); #else i__1 = m - rk; i__2 = k; magma_cgemv( MagmaConjTrans, i__1, i__2, z__1, dA(rk, 0), ldda, dA(rk, k), ione, c_zero, dauxv, ione, queue ); /* I think we only need stricly lower-triangular part :) */ magma_cgemv( MagmaNoTrans, n-k-1, i__2, c_one, dF(k+1,0), lddf, dauxv, ione, c_one, dF(k+1,k), ione, queue ); #endif } /* Optimization: On the last iteration start sending F back to the GPU */ /* Update the current row of A: A(RK,K+1:N) := A(RK,K+1:N) - A(RK,1:K)*F(K+1:N,1:K)'. */ if (k < n-1) { i__1 = n - k - 1; i__2 = k + 1; #ifdef RIGHT_UPDATE /* right-looking update of rows, */ magma_cgemm( MagmaNoTrans, MagmaConjTrans, nb-k, i__1, ione, c_neg_one, dA(rk, k ), ldda, dF(k+1, k ), lddf, c_one, dA(rk, k+1), ldda, queue ); #else /* left-looking update of rows, * * since F=A'v with original A, so no right-looking */ magma_cgemm( MagmaNoTrans, MagmaConjTrans, ione, i__1, i__2, c_neg_one, dA(rk, 0 ), ldda, dF(k+1,0 ), lddf, c_one, dA(rk, k+1), ldda, queue ); #endif } /* Update partial column norms. */ if (rk < min(m, n+offset)-1 ) { magmablas_scnrm2_row_check_adjust( n-k-1, tol3z, &vn1[k+1], &vn2[k+1], dA(rk,k+1), ldda, dlsticcs, queue ); //magma_device_sync(); magma_sgetvector( 1, &dlsticcs[0], 1, &lsticc, 1, queue ); } ++k; } magma_ccopymatrix( 1, k, dAks, 1, dA(offset, 0), ldda+1, queue ); // leave k as the last column done --k; *kb = k + 1; rk = offset + *kb - 1; /* Apply the block reflector to the rest of the matrix: A(OFFSET+KB+1:M,KB+1:N) := A(OFFSET+KB+1:M,KB+1:N) - A(OFFSET+KB+1:M,1:KB)*F(KB+1:N,1:KB)' */ if (*kb < min(n, m - offset)) { i__1 = m - rk - 1; i__2 = n - *kb; magma_cgemm( MagmaNoTrans, MagmaConjTrans, i__1, i__2, *kb, c_neg_one, dA(rk+1, 0 ), ldda, dF(*kb, 0 ), lddf, c_one, dA(rk+1, *kb), ldda, queue ); } /* Recomputation of difficult columns. */ if ( lsticc > 0 ) { // printf( " -- recompute dnorms --\n" ); magmablas_scnrm2_check( m-rk-1, n-*kb, dA(rk+1,*kb), ldda, &vn1[*kb], dlsticcs, queue ); magma_scopymatrix( n-*kb, 1, &vn1[*kb], *kb, &vn2[*kb], *kb, queue ); } magma_free( dAks ); magma_free( dlsticcs ); magma_queue_destroy( queue ); return MAGMA_SUCCESS; } /* magma_claqps */
/** Purpose ------- CLABRD reduces the first NB rows and columns of a complex general m by n matrix A to upper or lower bidiagonal form by an orthogonal transformation Q' * A * P, and returns the matrices X and Y which are needed to apply the transformation to the unreduced part of A. If m >= n, A is reduced to upper bidiagonal form; if m < n, to lower bidiagonal form. This is an auxiliary routine called by CGEBRD. Arguments --------- @param[in] m INTEGER The number of rows in the matrix A. @param[in] n INTEGER The number of columns in the matrix A. @param[in] nb INTEGER The number of leading rows and columns of A to be reduced. @param[in,out] A COMPLEX array, dimension (LDA,N) On entry, the m by n general matrix to be reduced. On exit, the first NB rows and columns of the matrix are overwritten; the rest of the array is unchanged. If m >= n, elements on and below the diagonal in the first NB columns, with the array TAUQ, represent the orthogonal matrix Q as a product of elementary reflectors; and elements above the diagonal in the first NB rows, with the array TAUP, represent the orthogonal matrix P as a product of elementary reflectors. \n If m < n, elements below the diagonal in the first NB columns, with the array TAUQ, represent the orthogonal matrix Q as a product of elementary reflectors, and elements on and above the diagonal in the first NB rows, with the array TAUP, represent the orthogonal matrix P as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[in,out] dA COMPLEX array, dimension (LDDA,N) Copy of A on GPU. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). @param[out] d COMPLEX array, dimension (NB) The diagonal elements of the first NB rows and columns of the reduced matrix. D(i) = A(i,i). @param[out] e COMPLEX array, dimension (NB) The off-diagonal elements of the first NB rows and columns of the reduced matrix. @param[out] tauq COMPLEX array dimension (NB) The scalar factors of the elementary reflectors which represent the orthogonal matrix Q. See Further Details. @param[out] taup COMPLEX array, dimension (NB) The scalar factors of the elementary reflectors which represent the orthogonal matrix P. See Further Details. @param[out] X COMPLEX array, dimension (LDX,NB) The m-by-nb matrix X required to update the unreduced part of A. @param[in] ldx INTEGER The leading dimension of the array X. LDX >= M. @param[out] dX COMPLEX array, dimension (LDDX,NB) Copy of X on GPU. @param[in] lddx INTEGER The leading dimension of the array dX. LDDX >= M. @param[out] Y COMPLEX array, dimension (LDY,NB) The n-by-nb matrix Y required to update the unreduced part of A. @param[in] ldy INTEGER The leading dimension of the array Y. LDY >= N. @param[out] dY COMPLEX array, dimension (LDDY,NB) Copy of Y on GPU. @param[in] lddy INTEGER The leading dimension of the array dY. LDDY >= N. Further Details --------------- The matrices Q and P are represented as products of elementary reflectors: Q = H(1) H(2) . . . H(nb) and P = G(1) G(2) . . . G(nb) Each H(i) and G(i) has the form: H(i) = I - tauq * v * v' and G(i) = I - taup * u * u' where tauq and taup are complex scalars, and v and u are complex vectors. If m >= n, v(1:i-1) = 0, v(i) = 1, and v(i:m) is stored on exit in A(i:m,i); u(1:i) = 0, u(i+1) = 1, and u(i+1:n) is stored on exit in A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i). If m < n, v(1:i) = 0, v(i+1) = 1, and v(i+1:m) is stored on exit in A(i+2:m,i); u(1:i-1) = 0, u(i) = 1, and u(i:n) is stored on exit in A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i). The elements of the vectors v and u together form the m-by-nb matrix V and the nb-by-n matrix U' which are needed, with X and Y, to apply the transformation to the unreduced part of the matrix, using a block update of the form: A := A - V*Y' - X*U'. The contents of A on exit are illustrated by the following examples with nb = 2: @verbatim m = 6 and n = 5 (m > n): m = 5 and n = 6 (m < n): ( 1 1 u1 u1 u1 ) ( 1 u1 u1 u1 u1 u1 ) ( v1 1 1 u2 u2 ) ( 1 1 u2 u2 u2 u2 ) ( v1 v2 a a a ) ( v1 1 a a a a ) ( v1 v2 a a a ) ( v1 v2 a a a a ) ( v1 v2 a a a ) ( v1 v2 a a a a ) ( v1 v2 a a a ) @endverbatim where a denotes an element of the original matrix which is unchanged, vi denotes an element of the vector defining H(i), and ui an element of the vector defining G(i). @ingroup magma_cgesvd_aux ********************************************************************/ extern "C" magma_int_t magma_clabrd_gpu( magma_int_t m, magma_int_t n, magma_int_t nb, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex_ptr dA, magma_int_t ldda, float *d, float *e, magmaFloatComplex *tauq, magmaFloatComplex *taup, magmaFloatComplex *X, magma_int_t ldx, magmaFloatComplex_ptr dX, magma_int_t lddx, magmaFloatComplex *Y, magma_int_t ldy, magmaFloatComplex_ptr dY, magma_int_t lddy) { #define A(i_,j_) (A + (i_) + (j_)*lda) #define X(i_,j_) (X + (i_) + (j_)*ldx) #define Y(i_,j_) (Y + (i_) + (j_)*ldy) #define dA(i_,j_) (dA + (i_) + (j_)*ldda) #define dY(i_,j_) (dY + (i_) + (j_)*lddy) #define dX(i_,j_) (dX + (i_) + (j_)*lddx) magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_zero = MAGMA_C_ZERO; magma_int_t ione = 1; magma_int_t i__2, i__3; magma_int_t i; magmaFloatComplex alpha; A -= 1 + lda; X -= 1 + ldx; dX -= 1 + lddx; Y -= 1 + ldy; dY -= 1 + lddy; --d; --e; --tauq; --taup; /* Quick return if possible */ magma_int_t info = 0; if (m <= 0 || n <= 0) { return info; } magmaFloatComplex *f; magma_queue_t stream; magma_queue_create( &stream ); magma_cmalloc_cpu( &f, max(n,m) ); if ( f == NULL ) { info = MAGMA_ERR_HOST_ALLOC; return info; } if (m >= n) { /* Reduce to upper bidiagonal form */ for (i = 1; i <= nb; ++i) { /* Update A(i:m,i) */ i__2 = m - i + 1; i__3 = i - 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv( &i__3, Y(i,1), &ldy ); #endif blasf77_cgemv( "No transpose", &i__2, &i__3, &c_neg_one, A(i,1), &lda, Y(i,1), &ldy, &c_one, A(i,i), &ione ); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv( &i__3, Y(i,1), &ldy ); #endif blasf77_cgemv( "No transpose", &i__2, &i__3, &c_neg_one, X(i,1), &ldx, A(1,i), &ione, &c_one, A(i,i), &ione ); /* Generate reflection Q(i) to annihilate A(i+1:m,i) */ alpha = *A(i,i); i__2 = m - i + 1; i__3 = i + 1; lapackf77_clarfg( &i__2, &alpha, A(min(i__3,m),i), &ione, &tauq[i] ); d[i] = MAGMA_C_REAL( alpha ); if (i < n) { *A(i,i) = c_one; /* Compute Y(i+1:n,i) */ i__2 = m - i + 1; i__3 = n - i; // 1. Send the block reflector A(i+1:m,i) to the GPU ------ magma_csetvector( i__2, A(i,i), 1, dA(i-1,i-1), 1 ); // 2. Multiply --------------------------------------------- magma_cgemv( MagmaConjTrans, i__2, i__3, c_one, dA(i-1,i), ldda, dA(i-1,i-1), ione, c_zero, dY(i+1,i), ione ); // 3. Put the result back ---------------------------------- magma_cgetmatrix_async( i__3, 1, dY(i+1,i), lddy, Y(i+1,i), ldy, stream ); i__2 = m - i + 1; i__3 = i - 1; blasf77_cgemv( MagmaConjTransStr, &i__2, &i__3, &c_one, A(i,1), &lda, A(i,i), &ione, &c_zero, Y(1,i), &ione ); i__2 = n - i; i__3 = i - 1; blasf77_cgemv( "N", &i__2, &i__3, &c_neg_one, Y(i+1,1), &ldy, Y(1,i), &ione, &c_zero, f, &ione ); i__2 = m - i + 1; i__3 = i - 1; blasf77_cgemv( MagmaConjTransStr, &i__2, &i__3, &c_one, X(i,1), &ldx, A(i,i), &ione, &c_zero, Y(1,i), &ione ); // 4. Sync to make sure the result is back ---------------- magma_queue_sync( stream ); if (i__3 != 0) { i__2 = n - i; blasf77_caxpy( &i__2, &c_one, f, &ione, Y(i+1,i), &ione ); } i__2 = i - 1; i__3 = n - i; blasf77_cgemv( MagmaConjTransStr, &i__2, &i__3, &c_neg_one, A(1,i+1), &lda, Y(1,i), &ione, &c_one, Y(i+1,i), &ione ); i__2 = n - i; blasf77_cscal( &i__2, &tauq[i], Y(i+1,i), &ione ); /* Update A(i,i+1:n) */ i__2 = n - i; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv( &i__2, A(i,i+1), &lda ); lapackf77_clacgv( &i, A(i,1), &lda ); #endif blasf77_cgemv( "No transpose", &i__2, &i, &c_neg_one, Y(i+1,1), &ldy, A(i,1), &lda, &c_one, A(i,i+1), &lda ); i__2 = i - 1; i__3 = n - i; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv( &i, A(i,1), &lda ); lapackf77_clacgv( &i__2, X(i,1), &ldx ); #endif blasf77_cgemv( MagmaConjTransStr, &i__2, &i__3, &c_neg_one, A(1,i+1), &lda, X(i,1), &ldx, &c_one, A(i,i+1), &lda ); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv( &i__2, X(i,1), &ldx ); #endif /* Generate reflection P(i) to annihilate A(i,i+2:n) */ i__2 = n - i; i__3 = i + 2; alpha = *A(i,i+1); lapackf77_clarfg( &i__2, &alpha, A(i,min(i__3,n)), &lda, &taup[i] ); e[i] = MAGMA_C_REAL( alpha ); *A(i,i+1) = c_one; /* Compute X(i+1:m,i) */ i__2 = m - i; i__3 = n - i; // 1. Send the block reflector A(i+1:m,i) to the GPU ------ magma_csetvector( i__3, A(i,i+1), lda, dA(i-1,i), ldda ); // 2. Multiply --------------------------------------------- //magma_ccopy( i__3, dA(i-1,i), ldda, dY(1,1), 1 ); magma_cgemv( MagmaNoTrans, i__2, i__3, c_one, dA(i,i), ldda, dA(i-1,i), ldda, //dY(1,1), 1, c_zero, dX(i+1,i), ione ); // 3. Put the result back ---------------------------------- magma_cgetmatrix_async( i__2, 1, dX(i+1,i), lddx, X(i+1,i), ldx, stream ); i__2 = n - i; blasf77_cgemv( MagmaConjTransStr, &i__2, &i, &c_one, Y(i+1,1), &ldy, A(i,i+1), &lda, &c_zero, X(1,i), &ione ); i__2 = m - i; blasf77_cgemv( "N", &i__2, &i, &c_neg_one, A(i+1,1), &lda, X(1,i), &ione, &c_zero, f, &ione ); i__2 = i - 1; i__3 = n - i; blasf77_cgemv( "N", &i__2, &i__3, &c_one, A(1,i+1), &lda, A(i,i+1), &lda, &c_zero, X(1,i), &ione ); // 4. Sync to make sure the result is back ---------------- magma_queue_sync( stream ); if (i != 0) { i__2 = m - i; blasf77_caxpy( &i__2, &c_one, f, &ione, X(i+1,i), &ione ); } i__2 = m - i; i__3 = i - 1; blasf77_cgemv( "No transpose", &i__2, &i__3, &c_neg_one, X(i+1,1), &ldx, X(1,i), &ione, &c_one, X(i+1,i), &ione ); i__2 = m - i; blasf77_cscal( &i__2, &taup[i], X(i+1,i), &ione ); #if defined(PRECISION_z) || defined(PRECISION_c) i__2 = n - i; lapackf77_clacgv( &i__2, A(i,i+1), &lda ); // 4. Send the block reflector A(i+1:m,i) to the GPU after CLACGV() magma_csetvector( i__2, A(i,i+1), lda, dA(i-1,i), ldda ); #endif } } } else { /* Reduce to lower bidiagonal form */ for (i = 1; i <= nb; ++i) { /* Update A(i,i:n) */ i__2 = n - i + 1; i__3 = i - 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv( &i__2, A(i,i), &lda ); lapackf77_clacgv( &i__3, A(i,1), &lda ); #endif blasf77_cgemv( "No transpose", &i__2, &i__3, &c_neg_one, Y(i,1), &ldy, A(i,1), &lda, &c_one, A(i,i), &lda ); i__2 = i - 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv( &i__3, A(i,1), &lda ); lapackf77_clacgv( &i__3, X(i,1), &ldx ); #endif i__3 = n - i + 1; blasf77_cgemv( MagmaConjTransStr, &i__2, &i__3, &c_neg_one, A(1,i), &lda, X(i,1), &ldx, &c_one, A(i,i), &lda ); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv( &i__2, X(i,1), &ldx ); #endif /* Generate reflection P(i) to annihilate A(i,i+1:n) */ i__2 = n - i + 1; i__3 = i + 1; alpha = *A(i,i); lapackf77_clarfg( &i__2, &alpha, A(i,min(i__3,n)), &lda, &taup[i] ); d[i] = MAGMA_C_REAL( alpha ); if (i < m) { *A(i,i) = c_one; /* Compute X(i+1:m,i) */ i__2 = m - i; i__3 = n - i + 1; // 1. Send the block reflector A(i,i+1:n) to the GPU ------ magma_csetvector( i__3, A(i,i), lda, dA(i-1,i-1), ldda ); // 2. Multiply --------------------------------------------- //magma_ccopy( i__3, dA(i-1,i-1), ldda, dY(1,1), 1 ); magma_cgemv( MagmaNoTrans, i__2, i__3, c_one, dA(i,i-1), ldda, dA(i-1,i-1), ldda, //dY(1,1), 1, c_zero, dX(i+1,i), ione ); // 3. Put the result back ---------------------------------- magma_cgetmatrix_async( i__2, 1, dX(i+1,i), lddx, X(i+1,i), ldx, stream ); i__2 = n - i + 1; i__3 = i - 1; blasf77_cgemv( MagmaConjTransStr, &i__2, &i__3, &c_one, Y(i,1), &ldy, A(i,i), &lda, &c_zero, X(1,i), &ione ); i__2 = m - i; i__3 = i - 1; blasf77_cgemv( "No transpose", &i__2, &i__3, &c_neg_one, A(i+1,1), &lda, X(1,i), &ione, &c_zero, f, &ione ); i__2 = i - 1; i__3 = n - i + 1; blasf77_cgemv( "No transpose", &i__2, &i__3, &c_one, A(1,i), &lda, A(i,i), &lda, &c_zero, X(1,i), &ione ); // 4. Sync to make sure the result is back ---------------- magma_queue_sync( stream ); if (i__2 != 0) { i__3 = m - i; blasf77_caxpy( &i__3, &c_one, f, &ione, X(i+1,i), &ione ); } i__2 = m - i; i__3 = i - 1; blasf77_cgemv( "No transpose", &i__2, &i__3, &c_neg_one, X(i+1,1), &ldx, X(1,i), &ione, &c_one, X(i+1,i), &ione ); i__2 = m - i; blasf77_cscal( &i__2, &taup[i], X(i+1,i), &ione ); i__2 = n - i + 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv( &i__2, A(i,i), &lda ); magma_csetvector( i__2, A(i,i), lda, dA(i-1,i-1), ldda ); #endif /* Update A(i+1:m,i) */ i__2 = m - i; i__3 = i - 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv( &i__3, Y(i,1), &ldy ); #endif blasf77_cgemv( "No transpose", &i__2, &i__3, &c_neg_one, A(i+1,1), &lda, Y(i,1), &ldy, &c_one, A(i+1,i), &ione ); i__2 = m - i; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv( &i__3, Y(i,1), &ldy ); #endif blasf77_cgemv( "No transpose", &i__2, &i, &c_neg_one, X(i+1,1), &ldx, A(1,i), &ione, &c_one, A(i+1,i), &ione ); /* Generate reflection Q(i) to annihilate A(i+2:m,i) */ i__2 = m - i; i__3 = i + 2; alpha = *A(i+1,i); lapackf77_clarfg( &i__2, &alpha, A(min(i__3,m),i), &ione, &tauq[i] ); e[i] = MAGMA_C_REAL( alpha ); *A(i+1,i) = c_one; /* Compute Y(i+1:n,i) */ i__2 = m - i; i__3 = n - i; // 1. Send the block reflector A(i+1:m,i) to the GPU ------ magma_csetvector( i__2, A(i+1,i), 1, dA(i,i-1), 1 ); // 2. Multiply --------------------------------------------- magma_cgemv( MagmaConjTrans, i__2, i__3, c_one, dA(i,i), ldda, dA(i,i-1), ione, c_zero, dY(i+1,i), ione ); // 3. Put the result back ---------------------------------- magma_cgetmatrix_async( i__3, 1, dY(i+1,i), lddy, Y(i+1,i), ldy, stream ); i__2 = m - i; i__3 = i - 1; blasf77_cgemv( MagmaConjTransStr, &i__2, &i__3, &c_one, A(i+1,1), &lda, A(i+1,i), &ione, &c_zero, Y(1,i), &ione ); i__2 = n - i; i__3 = i - 1; blasf77_cgemv( "No transpose", &i__2, &i__3, &c_neg_one, Y(i+1,1), &ldy, Y(1,i), &ione, &c_zero, f, &ione ); i__2 = m - i; blasf77_cgemv( MagmaConjTransStr, &i__2, &i, &c_one, X(i+1,1), &ldx, A(i+1,i), &ione, &c_zero, Y(1,i), &ione ); // 4. Sync to make sure the result is back ---------------- magma_queue_sync( stream ); if (i__3 != 0) { i__2 = n - i; blasf77_caxpy( &i__2, &c_one, f, &ione, Y(i+1,i), &ione ); } i__2 = n - i; blasf77_cgemv( MagmaConjTransStr, &i, &i__2, &c_neg_one, A(1,i+1), &lda, Y(1,i), &ione, &c_one, Y(i+1,i), &ione ); i__2 = n - i; blasf77_cscal( &i__2, &tauq[i], Y(i+1,i), &ione ); } #if defined(PRECISION_z) || defined(PRECISION_c) else { i__2 = n - i + 1; lapackf77_clacgv( &i__2, A(i,i), &lda ); magma_csetvector( i__2, A(i,i), lda, dA(i-1,i-1), ldda ); } #endif } } magma_queue_destroy( stream ); magma_free_cpu( f ); return info; } /* magma_clabrd_gpu */
extern "C" magma_int_t magma_clahr2( magma_int_t n, magma_int_t k, magma_int_t nb, magmaFloatComplex *dA, magmaFloatComplex *dV, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex *tau, magmaFloatComplex *T, magma_int_t ldt, magmaFloatComplex *Y, magma_int_t ldy ) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= CLAHR2 reduces the first NB columns of a complex general n-BY-(n-k+1) matrix A so that elements below the k-th subdiagonal are zero. The reduction is performed by an orthogonal similarity transformation Q' * A * Q. The routine returns the matrices V and T which determine Q as a block reflector I - V*T*V', and also the matrix Y = A * V. (Note this is different than LAPACK, which computes Y = A * V * T.) This is an auxiliary routine called by CGEHRD. Arguments ========= N (input) INTEGER The order of the matrix A. K (input) INTEGER The offset for the reduction. Elements below the k-th subdiagonal in the first NB columns are reduced to zero. K < N. NB (input) INTEGER The number of columns to be reduced. dA (input/output) COMPLEX array on the GPU, dimension (LDA,N-K+1) On entry, the n-by-(n-k+1) general matrix A. On exit, the elements in rows K:N of the first NB columns are overwritten with the matrix Y. DV (output) COMPLEX array on the GPU, dimension (N, NB) On exit this contains the Householder vectors of the transformation. A (input/output) COMPLEX array, dimension (LDA,N-K+1) On entry, the n-by-(n-k+1) general matrix A. On exit, the elements on and above the k-th subdiagonal in the first NB columns are overwritten with the corresponding elements of the reduced matrix; the elements below the k-th subdiagonal, with the array TAU, represent the matrix Q as a product of elementary reflectors. The other columns of A are unchanged. See Further Details. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). TAU (output) COMPLEX array, dimension (NB) The scalar factors of the elementary reflectors. See Further Details. T (output) COMPLEX array, dimension (LDT,NB) The upper triangular matrix T. LDT (input) INTEGER The leading dimension of the array T. LDT >= NB. Y (output) COMPLEX array, dimension (LDY,NB) The n-by-nb matrix Y. LDY (input) INTEGER The leading dimension of the array Y. LDY >= N. Further Details =============== The matrix Q is represented as a product of nb elementary reflectors Q = H(1) H(2) . . . H(nb). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i+k-1) = 0, v(i+k) = 1; v(i+k+1:n) is stored on exit in A(i+k+1:n,i), and tau in TAU(i). The elements of the vectors v together form the (n-k+1)-by-nb matrix V which is needed, with T and Y, to apply the transformation to the unreduced part of the matrix, using an update of the form: A := (I - V*T*V') * (A - Y*T*V'). The contents of A on exit are illustrated by the following example with n = 7, k = 3 and nb = 2: ( a a a a a ) ( a a a a a ) ( a a a a a ) ( h h a a a ) ( v1 h a a a ) ( v1 v2 a a a ) ( v1 v2 a a a ) where "a" denotes an element of the original matrix A, h denotes a modified element of the upper Hessenberg matrix H, and vi denotes an element of the vector defining H(i). This implementation follows the hybrid algorithm and notations described in S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg form through hybrid GPU-based computing," University of Tennessee Computer Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219), May 24, 2009. ===================================================================== */ #define A( i, j ) ( A + (i) + (j)*lda) #define Y( i, j ) ( Y + (i) + (j)*ldy) #define T( i, j ) ( T + (i) + (j)*ldt) #define dA( i, j ) (dA + (i) + (j)*ldda) #define dV( i, j ) (dV + (i) + (j)*ldda) magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ldda = lda; magma_int_t ione = 1; magma_int_t n_k_i_1, n_k; magmaFloatComplex scale; magma_int_t i; magmaFloatComplex ei = MAGMA_C_ZERO; // adjust from 1-based indexing k -= 1; // Function Body if (n <= 1) return 0; for (i = 0; i < nb; ++i) { n_k_i_1 = n - k - i - 1; n_k = n - k; if (i > 0) { // Update A(k:n-1,i); Update i-th column of A - Y * T * V' // This updates one more row than LAPACK does (row k), // making the block above the panel an even multiple of nb. // Use last column of T as workspace, w. // w(0:i-1, nb-1) = VA(k+i, 0:i-1)' blasf77_ccopy( &i, A(k+i,0), &lda, T(0,nb-1), &ione ); #if defined(PRECISION_z) || defined(PRECISION_c) // If complex, conjugate row of V. lapackf77_clacgv(&i, T(0,nb-1), &ione); #endif // w = T(0:i-1, 0:i-1) * w blasf77_ctrmv( "Upper", "No trans", "No trans", &i, T(0,0), &ldt, T(0,nb-1), &ione ); // A(k:n-1, i) -= Y(k:n-1, 0:i-1) * w blasf77_cgemv( "No trans", &n_k, &i, &c_neg_one, Y(k,0), &ldy, T(0,nb-1), &ione, &c_one, A(k,i), &ione ); // Apply I - V * T' * V' to this column (call it b) from the // left, using the last column of T as workspace, w. // // Let V = ( V1 ) and b = ( b1 ) (first i-1 rows) // ( V2 ) ( b2 ) // where V1 is unit lower triangular // w := b1 = A(k+1:k+i, i) blasf77_ccopy( &i, A(k+1,i), &ione, T(0,nb-1), &ione ); // w := V1' * b1 = VA(k+1:k+i, 0:i-1)' * w blasf77_ctrmv( "Lower", "Conj", "Unit", &i, A(k+1,0), &lda, T(0,nb-1), &ione ); // w := w + V2'*b2 = w + VA(k+i+1:n-1, 0:i-1)' * A(k+i+1:n-1, i) blasf77_cgemv( "Conj", &n_k_i_1, &i, &c_one, A(k+i+1,0), &lda, A(k+i+1,i), &ione, &c_one, T(0,nb-1), &ione ); // w := T'*w = T(0:i-1, 0:i-1)' * w blasf77_ctrmv( "Upper", "Conj", "Non-unit", &i, T(0,0), &ldt, T(0,nb-1), &ione ); // b2 := b2 - V2*w = A(k+i+1:n-1, i) - VA(k+i+1:n-1, 0:i-1) * w blasf77_cgemv( "No trans", &n_k_i_1, &i, &c_neg_one, A(k+i+1,0), &lda, T(0,nb-1), &ione, &c_one, A(k+i+1,i), &ione ); // w := V1*w = VA(k+1:k+i, 0:i-1) * w blasf77_ctrmv( "Lower", "No trans", "Unit", &i, A(k+1,0), &lda, T(0,nb-1), &ione ); // b1 := b1 - w = A(k+1:k+i-1, i) - w blasf77_caxpy( &i, &c_neg_one, T(0,nb-1), &ione, A(k+1,i), &ione ); // Restore diagonal element, saved below during previous iteration *A(k+i,i-1) = ei; } // Generate the elementary reflector H(i) to annihilate A(k+i+1:n-1,i) lapackf77_clarfg( &n_k_i_1, A(k+i+1,i), A(k+i+2,i), &ione, &tau[i] ); // Save diagonal element and set to one, to simplify multiplying by V ei = *A(k+i+1,i); *A(k+i+1,i) = c_one; // dV(i+1:n-k-1, i) = VA(k+i+1:n-1, i) magma_csetvector( n_k_i_1, A(k+i+1,i), 1, dV(i+1,i), 1 ); // Compute Y(k+1:n,i) = A vi // dA(k:n-1, i) = dA(k:n-1, i+1:n-k-1) * dV(i+1:n-k-1, i) magma_cgemv( MagmaNoTrans, n_k, n_k_i_1, c_one, dA(k,i+1), ldda, dV(i+1,i), ione, c_zero, dA(k,i), ione ); // Compute T(0:i,i) = [ -tau T V' vi ] // [ tau ] // T(0:i-1, i) = -tau VA(k+i+1:n-1, 0:i-1)' VA(k+i+1:n-1, i) scale = MAGMA_C_NEGATE( tau[i]); blasf77_cgemv( "Conj", &n_k_i_1, &i, &scale, A(k+i+1,0), &lda, A(k+i+1,i), &ione, &c_zero, T(0,i), &ione ); // T(0:i-1, i) = T(0:i-1, 0:i-1) * T(0:i-1, i) blasf77_ctrmv( "Upper", "No trans", "Non-unit", &i, T(0,0), &ldt, T(0,i), &ione ); *T(i,i) = tau[i]; // Y(k:n-1, i) = dA(k:n-1, i) magma_cgetvector( n-k, dA(k,i), 1, Y(k,i), 1 ); } // Restore diagonal element *A(k+nb,nb-1) = ei; return 0; } // magma_clahr2
int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, t1, t2; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_trans_t trans[] = { MagmaNoTrans, MagmaConjTrans, MagmaTrans }; magma_uplo_t uplo [] = { MagmaLower, MagmaUpper }; magma_diag_t diag [] = { MagmaUnit, MagmaNonUnit }; magma_side_t side [] = { MagmaLeft, MagmaRight }; magmaFloatComplex *A, *B, *C, *C2, *LU; magmaFloatComplex_ptr dA, dB, dC1, dC2; magmaFloatComplex alpha = MAGMA_C_MAKE( 0.5, 0.1 ); magmaFloatComplex beta = MAGMA_C_MAKE( 0.7, 0.2 ); float dalpha = 0.6; float dbeta = 0.8; float work[1], error, total_error; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t m, n, k, size, maxn, ld, info; magma_int_t *piv; magma_int_t err; magma_opts opts; parse_opts( argc, argv, &opts ); printf( "Compares magma wrapper function to cublas function; all diffs should be exactly 0.\n\n" ); total_error = 0.; for( int itest = 0; itest < opts.ntest; ++itest ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; printf("=========================================================================\n"); printf( "m=%d, n=%d, k=%d\n", (int) m, (int) n, (int) k ); // allocate matrices // over-allocate so they can be any combination of {m,n,k} x {m,n,k}. maxn = max( max( m, n ), k ); ld = max( 1, maxn ); size = ld*maxn; err = magma_malloc_cpu( (void**) &piv, maxn*sizeof(magma_int_t) ); assert( err == 0 ); err = magma_cmalloc_pinned( &A, size ); assert( err == 0 ); err = magma_cmalloc_pinned( &B, size ); assert( err == 0 ); err = magma_cmalloc_pinned( &C, size ); assert( err == 0 ); err = magma_cmalloc_pinned( &C2, size ); assert( err == 0 ); err = magma_cmalloc_pinned( &LU, size ); assert( err == 0 ); err = magma_cmalloc( &dA, size ); assert( err == 0 ); err = magma_cmalloc( &dB, size ); assert( err == 0 ); err = magma_cmalloc( &dC1, size ); assert( err == 0 ); err = magma_cmalloc( &dC2, size ); assert( err == 0 ); // initialize matrices size = maxn*maxn; lapackf77_clarnv( &ione, ISEED, &size, A ); lapackf77_clarnv( &ione, ISEED, &size, B ); lapackf77_clarnv( &ione, ISEED, &size, C ); printf( "========== Level 1 BLAS ==========\n" ); // ----- test CSWAP // swap columns 2 and 3 of dA, then copy to C2 and compare with A if ( n >= 3 ) { magma_csetmatrix( m, n, A, ld, dA, ld ); magma_csetmatrix( m, n, A, ld, dB, ld ); magma_cswap( m, dA(0,1), 1, dA(0,2), 1 ); magma_cswap( m, dB(0,1), 1, dB(0,2), 1 ); // check results, storing diff between magma and cuda calls in C2 cublasCaxpy( opts.handle, ld*n, &c_neg_one, dA, 1, dB, 1 ); magma_cgetmatrix( m, n, dB, ld, C2, ld ); error = lapackf77_clange( "F", &m, &k, C2, &ld, work ); total_error += error; printf( "cswap diff %.2g\n", error ); } else { printf( "cswap skipped for n < 3\n" ); } // ----- test ICAMAX // get argmax of column of A magma_csetmatrix( m, k, A, ld, dA, ld ); error = 0; for( int j = 0; j < k; ++j ) { magma_int_t i1 = magma_icamax( m, dA(0,j), 1 ); int i2; // NOT magma_int_t, for cublas cublasIcamax( opts.handle, m, dA(0,j), 1, &i2 ); // todo need sync here? assert( i1 == i2 ); error += abs( i1 - i2 ); } total_error += error; gflops = (float)m * k / 1e9; printf( "icamax diff %.2g\n", error ); printf( "\n" ); printf( "========== Level 2 BLAS ==========\n" ); // ----- test CGEMV // c = alpha*A*b + beta*c, with A m*n; b,c m or n-vectors // try no-trans/trans for( int ia = 0; ia < 3; ++ia ) { magma_csetmatrix( m, n, A, ld, dA, ld ); magma_csetvector( maxn, B, 1, dB, 1 ); magma_csetvector( maxn, C, 1, dC1, 1 ); magma_csetvector( maxn, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_cgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasCgemv( opts.handle, cublas_trans_const(trans[ia]), m, n, &alpha, dA, ld, dB, 1, &beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 size = (trans[ia] == MagmaNoTrans ? m : n); cublasCaxpy( opts.handle, size, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetvector( size, dC2, 1, C2, 1 ); error = lapackf77_clange( "F", &size, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_CGEMV( m, n ) / 1e9; printf( "cgemv( %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_trans_const(trans[ia]), error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test CHEMV // c = alpha*A*b + beta*c, with A m*m symmetric; b,c m-vectors // try upper/lower for( int iu = 0; iu < 2; ++iu ) { magma_csetmatrix( m, m, A, ld, dA, ld ); magma_csetvector( m, B, 1, dB, 1 ); magma_csetvector( m, C, 1, dC1, 1 ); magma_csetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_chemv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasChemv( opts.handle, cublas_uplo_const(uplo[iu]), m, &alpha, dA, ld, dB, 1, &beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( opts.handle, m, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_clange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_CHEMV( m ) / 1e9; printf( "chemv( %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test CTRSV // solve A*c = c, with A m*m triangular; c m-vector // try upper/lower, no-trans/trans, unit/non-unit diag // Factor A into LU to get well-conditioned triangles, else solve yields garbage. // Still can give garbage if solves aren't consistent with LU factors, // e.g., using unit diag for U, so copy lower triangle to upper triangle. // Also used for trsm later. lapackf77_clacpy( "Full", &maxn, &maxn, A, &ld, LU, &ld ); lapackf77_cgetrf( &maxn, &maxn, LU, &ld, piv, &info ); for( int j = 0; j < maxn; ++j ) { for( int i = 0; i < j; ++i ) { *LU(i,j) = *LU(j,i); } } for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { magma_csetmatrix( m, m, LU, ld, dA, ld ); magma_csetvector( m, C, 1, dC1, 1 ); magma_csetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_ctrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasCtrsv( opts.handle, cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), cublas_diag_const(diag[id]), m, dA, ld, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( opts.handle, m, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_clange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_CTRSM( MagmaLeft, m, 1 ) / 1e9; printf( "ctrsv( %c, %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), lapacke_diag_const(diag[id]), error, gflops/t1, gflops/t2 ); }}} printf( "\n" ); printf( "========== Level 3 BLAS ==========\n" ); // ----- test CGEMM // C = alpha*A*B + beta*C, with A m*k or k*m; B k*n or n*k; C m*n // try combinations of no-trans/trans for( int ia = 0; ia < 3; ++ia ) { for( int ib = 0; ib < 3; ++ib ) { bool nta = (trans[ia] == MagmaNoTrans); bool ntb = (trans[ib] == MagmaNoTrans); magma_csetmatrix( (nta ? m : k), (nta ? m : k), A, ld, dA, ld ); magma_csetmatrix( (ntb ? k : n), (ntb ? n : k), B, ld, dB, ld ); magma_csetmatrix( m, n, C, ld, dC1, ld ); magma_csetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_cgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasCgemm( opts.handle, cublas_trans_const(trans[ia]), cublas_trans_const(trans[ib]), m, n, k, &alpha, dA, ld, dB, ld, &beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( opts.handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_clange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_CGEMM( m, n, k ) / 1e9; printf( "cgemm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_trans_const(trans[ia]), lapacke_trans_const(trans[ib]), error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test CHEMM // C = alpha*A*B + beta*C (left) with A m*m symmetric; B,C m*n; or // C = alpha*B*A + beta*C (right) with A n*n symmetric; B,C m*n // try left/right, upper/lower for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { magma_csetmatrix( m, m, A, ld, dA, ld ); magma_csetmatrix( m, n, B, ld, dB, ld ); magma_csetmatrix( m, n, C, ld, dC1, ld ); magma_csetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_chemm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasChemm( opts.handle, cublas_side_const(side[is]), cublas_uplo_const(uplo[iu]), m, n, &alpha, dA, ld, dB, ld, &beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( opts.handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_clange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_CHEMM( side[is], m, n ) / 1e9; printf( "chemm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_side_const(side[is]), lapacke_uplo_const(uplo[iu]), error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test CHERK // C = alpha*A*A^H + beta*C (no-trans) with A m*k and C m*m symmetric; or // C = alpha*A^H*A + beta*C (trans) with A k*m and C m*m symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { magma_csetmatrix( n, k, A, ld, dA, ld ); magma_csetmatrix( n, n, C, ld, dC1, ld ); magma_csetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_cherk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasCherk( opts.handle, cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), n, k, &dalpha, dA, ld, &dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( opts.handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_clange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_CHERK( k, n ) / 1e9; printf( "cherk( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test CHER2K // C = alpha*A*B^H + ^alpha*B*A^H + beta*C (no-trans) with A,B n*k; C n*n symmetric; or // C = alpha*A^H*B + ^alpha*B^H*A + beta*C (trans) with A,B k*n; C n*n symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { bool nt = (trans[it] == MagmaNoTrans); magma_csetmatrix( (nt ? n : k), (nt ? n : k), A, ld, dA, ld ); magma_csetmatrix( n, n, C, ld, dC1, ld ); magma_csetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_cher2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasCher2k( opts.handle, cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), n, k, &alpha, dA, ld, dB, ld, &dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( opts.handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_clange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_CHER2K( k, n ) / 1e9; printf( "cher2k( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test CTRMM // C = alpha*A*C (left) with A m*m triangular; C m*n; or // C = alpha*C*A (right) with A n*n triangular; C m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == MagmaLeft); magma_csetmatrix( (left ? m : n), (left ? m : n), A, ld, dA, ld ); magma_csetmatrix( m, n, C, ld, dC1, ld ); magma_csetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_ctrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; // note cublas does trmm out-of-place (i.e., adds output matrix C), // but allows C=B to do in-place. t2 = magma_sync_wtime( 0 ); cublasCtrmm( opts.handle, cublas_side_const(side[is]), cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), cublas_diag_const(diag[id]), m, n, &alpha, dA, ld, dC2, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( opts.handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_clange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_CTRMM( side[is], m, n ) / 1e9; printf( "ctrmm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // ----- test CTRSM // solve A*X = alpha*B (left) with A m*m triangular; B m*n; or // solve X*A = alpha*B (right) with A n*n triangular; B m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == MagmaLeft); magma_csetmatrix( (left ? m : n), (left ? m : n), LU, ld, dA, ld ); magma_csetmatrix( m, n, C, ld, dC1, ld ); magma_csetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_ctrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasCtrsm( opts.handle, cublas_side_const(side[is]), cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), cublas_diag_const(diag[id]), m, n, &alpha, dA, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( opts.handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_clange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_CTRSM( side[is], m, n ) / 1e9; printf( "ctrsm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // cleanup magma_free_cpu( piv ); magma_free_pinned( A ); magma_free_pinned( B ); magma_free_pinned( C ); magma_free_pinned( C2 ); magma_free_pinned( LU ); magma_free( dA ); magma_free( dB ); magma_free( dC1 ); magma_free( dC2 ); fflush( stdout ); } if ( total_error != 0. ) { printf( "total error %.2g -- ought to be 0 -- some test failed (see above).\n", total_error ); } else { printf( "all tests passed\n" ); } TESTING_FINALIZE(); int status = (total_error != 0.); return status; }
/***************************************************************************//** Purpose ------- CLAQPS computes a step of QR factorization with column pivoting of a complex M-by-N matrix A by using Blas-3. It tries to factorize NB columns from A starting from the row OFFSET+1, and updates all of the matrix with Blas-3 xGEMM. In some cases, due to catastrophic cancellations, it cannot factorize NB columns. Hence, the actual number of factorized columns is returned in KB. Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0 @param[in] offset INTEGER The number of rows of A that have been factorized in previous steps. @param[in] nb INTEGER The number of columns to factorize. @param[out] kb INTEGER The number of columns actually factorized. @param[in,out] A COMPLEX array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, block A(OFFSET+1:M,1:KB) is the triangular factor obtained and block A(1:OFFSET,1:N) has been accordingly pivoted, but no factorized. The rest of the matrix, block A(OFFSET+1:M,KB+1:N) has been updated. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[in,out] dA COMPLEX array, dimension (LDA,N) Copy of A on the GPU. Portions of A are updated on the CPU; portions of dA are updated on the GPU. See code for details. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). @param[in,out] jpvt INTEGER array, dimension (N) JPVT(I) = K <==> Column K of the full matrix A has been permuted into position I in AP. @param[out] tau COMPLEX array, dimension (KB) The scalar factors of the elementary reflectors. @param[in,out] vn1 REAL array, dimension (N) The vector with the partial column norms. @param[in,out] vn2 REAL array, dimension (N) The vector with the exact column norms. @param[in,out] auxv COMPLEX array, dimension (NB) Auxiliar vector. @param[in,out] F COMPLEX array, dimension (LDF,NB) Matrix F' = L*Y'*A. @param[in] ldf INTEGER The leading dimension of the array F. LDF >= max(1,N). @param[in,out] dF COMPLEX array, dimension (LDDF,NB) Copy of F on the GPU. See code for details. @param[in] lddf INTEGER The leading dimension of the array dF. LDDF >= max(1,N). @ingroup magma_laqps *******************************************************************************/ extern "C" magma_int_t magma_claqps( magma_int_t m, magma_int_t n, magma_int_t offset, magma_int_t nb, magma_int_t *kb, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex_ptr dA, magma_int_t ldda, magma_int_t *jpvt, magmaFloatComplex *tau, float *vn1, float *vn2, magmaFloatComplex *auxv, magmaFloatComplex *F, magma_int_t ldf, magmaFloatComplex_ptr dF, magma_int_t lddf) { #define A(i, j) (A + (i) + (j)*(lda )) #define dA(i, j) (dA + (i) + (j)*(ldda)) #define F(i, j) (F + (i) + (j)*(ldf )) #define dF(i, j) (dF + (i) + (j)*(lddf)) magmaFloatComplex c_zero = MAGMA_C_MAKE( 0.,0.); magmaFloatComplex c_one = MAGMA_C_MAKE( 1.,0.); magmaFloatComplex c_neg_one = MAGMA_C_MAKE(-1.,0.); magma_int_t ione = 1; magma_int_t i__1, i__2; float d__1; magmaFloatComplex z__1; magma_int_t j, k, rk; magmaFloatComplex Akk; magma_int_t pvt; float temp, temp2, tol3z; magma_int_t itemp; magma_int_t lsticc; magma_int_t lastrk; lastrk = min( m, n + offset ); tol3z = magma_ssqrt( lapackf77_slamch("Epsilon")); magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); lsticc = 0; k = 0; while( k < nb && lsticc == 0 ) { rk = offset + k; /* Determine ith pivot column and swap if necessary */ // subtract 1 from Fortran isamax; pvt, k are 0-based. i__1 = n-k; pvt = k + blasf77_isamax( &i__1, &vn1[k], &ione ) - 1; if (pvt != k) { if (pvt >= nb) { /* 1. Start copy from GPU */ magma_cgetmatrix_async( m - offset - nb, 1, dA(offset + nb, pvt), ldda, A (offset + nb, pvt), lda, queue ); } /* F gets swapped so F must be sent at the end to GPU */ i__1 = k; blasf77_cswap( &i__1, F(pvt,0), &ldf, F(k,0), &ldf ); itemp = jpvt[pvt]; jpvt[pvt] = jpvt[k]; jpvt[k] = itemp; vn1[pvt] = vn1[k]; vn2[pvt] = vn2[k]; if (pvt < nb) { /* no need of transfer if pivot is within the panel */ blasf77_cswap( &m, A(0, pvt), &ione, A(0, k), &ione ); } else { /* 1. Finish copy from GPU */ magma_queue_sync( queue ); /* 2. Swap as usual on CPU */ blasf77_cswap(&m, A(0, pvt), &ione, A(0, k), &ione); /* 3. Restore the GPU */ magma_csetmatrix_async( m - offset - nb, 1, A (offset + nb, pvt), lda, dA(offset + nb, pvt), ldda, queue ); } } /* Apply previous Householder reflectors to column K: A(RK:M,K) := A(RK:M,K) - A(RK:M,1:K-1)*F(K,1:K-1)'. Optimization: multiply with beta=0; wait for vector and subtract */ if (k > 0) { #ifdef COMPLEX for (j = 0; j < k; ++j) { *F(k,j) = MAGMA_C_CONJ( *F(k,j) ); } #endif i__1 = m - rk; i__2 = k; blasf77_cgemv( MagmaNoTransStr, &i__1, &i__2, &c_neg_one, A(rk, 0), &lda, F(k, 0), &ldf, &c_one, A(rk, k), &ione ); #ifdef COMPLEX for (j = 0; j < k; ++j) { *F(k,j) = MAGMA_C_CONJ( *F(k,j) ); } #endif } /* Generate elementary reflector H(k). */ if (rk < m-1) { i__1 = m - rk; lapackf77_clarfg( &i__1, A(rk, k), A(rk + 1, k), &ione, &tau[k] ); } else { lapackf77_clarfg( &ione, A(rk, k), A(rk, k), &ione, &tau[k] ); } Akk = *A(rk, k); *A(rk, k) = c_one; /* Compute Kth column of F: Compute F(K+1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) on the GPU */ if (k < n-1) { i__1 = m - rk; i__2 = n - k - 1; /* Send the vector to the GPU */ magma_csetmatrix( i__1, 1, A(rk, k), lda, dA(rk,k), ldda, queue ); /* Multiply on GPU */ // was CALL CGEMV( 'Conjugate transpose', M-RK+1, N-K, // TAU( K ), A( RK, K+1 ), LDA, // A( RK, K ), 1, // CZERO, F( K+1, K ), 1 ) magma_int_t i__3 = nb-k-1; magma_int_t i__4 = i__2 - i__3; magma_int_t i__5 = nb-k; magma_cgemv( MagmaConjTrans, i__1 - i__5, i__2 - i__3, tau[k], dA(rk +i__5, k+1+i__3), ldda, dA(rk +i__5, k ), ione, c_zero, dF(k+1+i__3, k ), ione, queue ); magma_cgetmatrix_async( i__2-i__3, 1, dF(k + 1 +i__3, k), i__2, F (k + 1 +i__3, k), i__2, queue ); blasf77_cgemv( MagmaConjTransStr, &i__1, &i__3, &tau[k], A(rk, k+1), &lda, A(rk, k ), &ione, &c_zero, F(k+1, k ), &ione ); magma_queue_sync( queue ); blasf77_cgemv( MagmaConjTransStr, &i__5, &i__4, &tau[k], A(rk, k+1+i__3), &lda, A(rk, k ), &ione, &c_one, F(k+1+i__3, k ), &ione ); } /* Padding F(1:K,K) with zeros. */ for (j = 0; j < k; ++j) { *F(j, k) = c_zero; } /* Incremental updating of F: F(1:N,K) := F(1:N,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K). */ if (k > 0) { i__1 = m - rk; i__2 = k; z__1 = MAGMA_C_NEGATE( tau[k] ); blasf77_cgemv( MagmaConjTransStr, &i__1, &i__2, &z__1, A(rk, 0), &lda, A(rk, k), &ione, &c_zero, auxv, &ione ); i__1 = k; blasf77_cgemv( MagmaNoTransStr, &n, &i__1, &c_one, F(0,0), &ldf, auxv, &ione, &c_one, F(0,k), &ione ); } /* Optimization: On the last iteration start sending F back to the GPU */ /* Update the current row of A: A(RK,K+1:N) := A(RK,K+1:N) - A(RK,1:K)*F(K+1:N,1:K)'. */ if (k < n-1) { i__1 = n - k - 1; i__2 = k + 1; blasf77_cgemm( MagmaNoTransStr, MagmaConjTransStr, &ione, &i__1, &i__2, &c_neg_one, A(rk, 0 ), &lda, F(k+1,0 ), &ldf, &c_one, A(rk, k+1), &lda ); } /* Update partial column norms. */ if (rk < lastrk) { for (j = k + 1; j < n; ++j) { if (vn1[j] != 0.) { /* NOTE: The following 4 lines follow from the analysis in Lapack Working Note 176. */ temp = MAGMA_C_ABS( *A(rk,j) ) / vn1[j]; temp = max( 0., ((1. + temp) * (1. - temp)) ); d__1 = vn1[j] / vn2[j]; temp2 = temp * (d__1 * d__1); if (temp2 <= tol3z) { vn2[j] = (float) lsticc; lsticc = j; } else { vn1[j] *= magma_ssqrt(temp); } } } } *A(rk, k) = Akk; ++k; } // leave k as the last column done --k; *kb = k + 1; rk = offset + *kb - 1; /* Apply the block reflector to the rest of the matrix: A(OFFSET+KB+1:M,KB+1:N) := A(OFFSET+KB+1:M,KB+1:N) - A(OFFSET+KB+1:M,1:KB)*F(KB+1:N,1:KB)' */ if (*kb < min(n, m - offset)) { i__1 = m - rk - 1; i__2 = n - *kb; /* Send F to the GPU */ magma_csetmatrix( i__2, *kb, F (*kb, 0), ldf, dF(*kb, 0), i__2, queue ); magma_cgemm( MagmaNoTrans, MagmaConjTrans, i__1, i__2, *kb, c_neg_one, dA(rk+1, 0 ), ldda, dF(*kb, 0 ), i__2, c_one, dA(rk+1, *kb), ldda, queue ); } /* Recomputation of difficult columns. */ while( lsticc > 0 ) { itemp = (magma_int_t)(vn2[lsticc] >= 0. ? floor(vn2[lsticc] + .5) : -floor(.5 - vn2[lsticc])); i__1 = m - rk - 1; if (lsticc <= nb) { vn1[lsticc] = magma_cblas_scnrm2( i__1, A(rk+1,lsticc), ione ); } else { /* Where is the data, CPU or GPU ? */ float r1, r2; r1 = magma_cblas_scnrm2( nb-k, A(rk+1,lsticc), ione ); r2 = magma_scnrm2( m-offset-nb, dA(offset + nb + 1, lsticc), ione, queue ); //vn1[lsticc] = magma_scnrm2( i__1, dA(rk + 1, lsticc), ione, queue ); vn1[lsticc] = magma_ssqrt(r1*r1 + r2*r2); } /* NOTE: The computation of VN1( LSTICC ) relies on the fact that SNRM2 does not fail on vectors with norm below the value of SQRT(SLAMCH('S')) */ vn2[lsticc] = vn1[lsticc]; lsticc = itemp; } magma_queue_destroy( queue ); return MAGMA_SUCCESS; } /* magma_claqps */
extern "C" int calc_numerical_range(magmaFloatComplex *M, magma_int_t M_lead_dim, float _from, float _step, magma_int_t _steps, magmaFloatComplex *pts) { magma_int_t idx = 0, rslt = 0; magmaFloatComplex p, scalar; std::complex<float> vtmp; float j; magmaFloatComplex *dA = nullptr; magmaFloatComplex *dAth = NULL, *dAthT = NULL, *dX = NULL, *dY = NULL; float *dE = NULL; //float *hE = NULL; //magma_int_t *ipiv = NULL; magma_int_t lda = M_lead_dim; //magma_int_t ldx = lda; magma_int_t info = 0; magma_int_t nb = 0; //magma_vec_t jobvl; //magma_vec_t jobvr; magmaFloatComplex *work = nullptr; magma_int_t lwork = 0; float *rwork = nullptr; magma_int_t lrwork = 0; magma_int_t *iwork = nullptr; magma_int_t liwork = 0; nb = magma_get_cgehrd_nb( M_lead_dim ); lwork = 2 * max(M_lead_dim + M_lead_dim*nb, 2 * M_lead_dim + M_lead_dim*M_lead_dim); // MagmaVec lrwork = 1 + 5 * M_lead_dim + 2 * M_lead_dim*M_lead_dim; // MagmaVec liwork = (3 + 5 * M_lead_dim); // MagmaVec magma_imalloc_cpu(&iwork, liwork); magma_smalloc_cpu(&rwork, lrwork); magma_cmalloc_pinned(&work, lwork); magma_cmalloc_pinned(&dA, lda*M_lead_dim); magma_cmalloc_pinned(&dAth, lda*M_lead_dim); magma_cmalloc_pinned(&dAthT, lda*M_lead_dim); magma_smalloc_pinned(&dE, M_lead_dim); //magma_smalloc_cpu(&hE, M_lead_dim); magma_cmalloc_pinned(&dX, M_lead_dim); magma_cmalloc_pinned(&dY, M_lead_dim); magma_csetmatrix(M_lead_dim, M_lead_dim, M, lda, dA, M_lead_dim, queue); // th=[0:resolution:2*pi] j = _from; for (idx = 0; idx < _steps; idx++) { //scalar = exp( 1im * -j); vtmp.real( 0.0f ); vtmp.imag( -j ); //vtmp = _FCbuild(0.0f, -j); //printf("vtmp = %f + i%f\n", vtmp._Val[0], vtmp._Val[1]); vtmp = exp(vtmp); scalar.x = vtmp.real(); scalar.y = vtmp.imag(); //printf("scalar = %f + i%f\n", scalar.x, scalar.y); magma_ccopy(lda * M_lead_dim, dA, 1, dAth, 1, queue); // Ath = exp(1im * -j) * As magma_cscal(lda * M_lead_dim, scalar, dAth, 1, queue); //magma_cprint_gpu(N, N, dA, lda); //magma_cprint_gpu(N, N, dAth, lda); // AthT = (Ath + Ath') magmablas_ctranspose_conj(M_lead_dim, M_lead_dim, dAth, M_lead_dim, dAthT, M_lead_dim, queue); magmablas_cgeadd(M_lead_dim, M_lead_dim, MAGMA_C_MAKE(1.0f, 0.0f), dAth, M_lead_dim, dAthT, M_lead_dim, queue); // AthT = AthT / 2 magma_cscal(lda*M_lead_dim, MAGMA_C_MAKE(0.5f, 0.0f), dAthT, 1, queue); magma_sync_wtime(queue); //magma_cprint_gpu(M_lead_dim, M_lead_dim, dAthT, lda); // e, r = eig(AthT) rslt = magma_cheevd(MagmaVec, MagmaLower, M_lead_dim, dAthT, lda, dE, work, lwork, rwork, lrwork, iwork, liwork, &info); magma_sync_wtime(queue); //printf("magma_cheevd info=%d\n", info); //magma_cprint_gpu(M_lead_dim, M_lead_dim, dAthT, lda); //magma_sprint_gpu(M_lead_dim, 1, dE, M_lead_dim); //magma_sgetvector(M_lead_dim, dE, 1, hE, 1, queue); //printf("%f %f\n", hE[0], hE[1]); // p = r[:,s]' * A * r[:,s] // r = r[:,s] magma_ccopy( M_lead_dim, dAthT + (M_lead_dim*(M_lead_dim-1)), 1, // dAthT + (N), where (N) is a column offset dX, 1, queue); magma_sync_wtime(queue); //magma_cprint_gpu(M_lead_dim, 1, dX, M_lead_dim); // pp = A * r[:,s] magma_cgemv(MagmaNoTrans, M_lead_dim, M_lead_dim, MAGMA_C_MAKE(1.0f, 0.0f), dA, lda, dX, 1, MAGMA_C_MAKE(0.0f, 0.0f), dY, 1, queue); magma_sync_wtime(queue); //magma_cprint_gpu(M_lead_dim, 1, dY, M_lead_dim); // p = r' * pp p = magma_cdotc(M_lead_dim, dX, 1, dY, 1, queue); magma_sync_wtime(queue); pts[idx] = p; //printf("p = %f %fi\n", p.x, p.y); j += _step; } // end of for (idx = 0; idx < _steps; idx++) magma_free_pinned(dY); magma_free_pinned(dX); //magma_free_cpu(hE); magma_free_pinned(dE); magma_free_pinned(dAthT); magma_free_pinned(dAth); magma_free_pinned(dA); magma_free_pinned(work); magma_free_cpu(rwork); magma_free_cpu(iwork); //magma_free_cpu(w); //magma_free_cpu(A); return rslt; }
/** Purpose ------- CLAHR2 reduces the first NB columns of a complex general n-BY-(n-k+1) matrix A so that elements below the k-th subdiagonal are zero. The reduction is performed by an orthogonal similarity transformation Q' * A * Q. The routine returns the matrices V and T which determine Q as a block reflector I - V*T*V', and also the matrix Y = A * V. (Note this is different than LAPACK, which computes Y = A * V * T.) This is an auxiliary routine called by CGEHRD. Arguments --------- @param[in] n INTEGER The order of the matrix A. @param[in] k INTEGER The offset for the reduction. Elements below the k-th subdiagonal in the first NB columns are reduced to zero. K < N. @param[in] nb INTEGER The number of columns to be reduced. @param[in,out] A COMPLEX array, dimension (LDA,N-K+1) On entry, the n-by-(n-k+1) general matrix A. On exit, the elements on and above the k-th subdiagonal in the first NB columns are overwritten with the corresponding elements of the reduced matrix; the elements below the k-th subdiagonal, with the array TAU, represent the matrix Q as a product of elementary reflectors. The other columns of A are unchanged. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] tau COMPLEX array, dimension (NB) The scalar factors of the elementary reflectors. See Further Details. @param[out] T COMPLEX array, dimension (LDT,NB) The upper triangular matrix T. @param[in] ldt INTEGER The leading dimension of the array T. LDT >= NB. @param[out] Y COMPLEX array, dimension (LDY,NB) The n-by-nb matrix Y. @param[in] ldy INTEGER The leading dimension of the array Y. LDY >= N. @param[in,out] data Structure with pointers to dA, dT, dV, dW, dY which are distributed across multiple GPUs. Further Details --------------- The matrix Q is represented as a product of nb elementary reflectors Q = H(1) H(2) . . . H(nb). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i+k-1) = 0, v(i+k) = 1; v(i+k+1:n) is stored on exit in A(i+k+1:n,i), and tau in TAU(i). The elements of the vectors v together form the (n-k+1)-by-nb matrix V which is needed, with T and Y, to apply the transformation to the unreduced part of the matrix, using an update of the form: A := (I - V*T*V') * (A - Y*T*V'). The contents of A on exit are illustrated by the following example with n = 7, k = 3 and nb = 2: @verbatim ( a a a a a ) ( a a a a a ) ( a a a a a ) ( h h a a a ) ( v1 h a a a ) ( v1 v2 a a a ) ( v1 v2 a a a ) @endverbatim where "a" denotes an element of the original matrix A, h denotes a modified element of the upper Hessenberg matrix H, and vi denotes an element of the vector defining H(i). This implementation follows the hybrid algorithm and notations described in S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg form through hybrid GPU-based computing," University of Tennessee Computer Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219), May 24, 2009. @ingroup magma_cgeev_aux ********************************************************************/ extern "C" magma_int_t magma_clahr2_m( magma_int_t n, magma_int_t k, magma_int_t nb, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex *tau, magmaFloatComplex *T, magma_int_t ldt, magmaFloatComplex *Y, magma_int_t ldy, struct cgehrd_data *data ) { #define A( i, j ) ( A + (i) + (j)*lda) #define Y( i, j ) ( Y + (i) + (j)*ldy) #define T( i, j ) ( T + (i) + (j)*ldt) #define dA( d, i, j ) (data->A [d] + (i) + (j)*ldda) #define dTi( d ) (data->Ti[d]) #define dV( d, i, j ) (data->V [d] + (i) + (j)*ldv ) #define dVd( d, i, j ) (data->Vd[d] + (i) + (j)*ldvd) #define dY( d, i, j ) (data->Y [d] + (i) + (j)*ldda) magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex tmp; magma_int_t ngpu = data->ngpu; magma_int_t ldda = data->ldda; magma_int_t ldv = data->ldv; magma_int_t ldvd = data->ldvd; magma_int_t ione = 1; magma_int_t d, dki1, dn, nblocks, gblock, lblock, lgid; magma_int_t n_k_i_1, n_k; magmaFloatComplex scale; magma_int_t i; magmaFloatComplex ei = MAGMA_C_ZERO; magma_int_t info_data = 0; magma_int_t *info = &info_data; if (n < 0) { *info = -1; } else if (k < 0 || k >= n) { *info = -2; } else if (nb < 1 || nb > n) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if (ldt < nb) { *info = -8; } else if (ldy < max(1,n)) { *info = -10; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } // adjust from 1-based indexing k -= 1; // Function Body if (n <= 1) return *info; magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); // zero out current top block of V on all GPUs for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magmablasSetKernelStream( data->streams[d] ); magmablas_claset( MagmaFull, nb, nb, c_zero, c_zero, dV(d,k,0), ldv ); } // set all Y=0 lapackf77_claset( "Full", &n, &nb, &c_zero, &c_zero, Y, &ldy ); for (i = 0; i < nb; ++i) { n_k_i_1 = n - k - i - 1; n_k = n - k; if (i > 0) { // Finish applying I - V * T * V' on right tmp = MAGMA_C_NEGATE( tau[i-1] ); blasf77_caxpy( &n_k, &tmp, Y(k,i-1), &ione, A(k,i), &ione ); // Apply I - V * T' * V' to this column (call it b) from the // left, using the last column of T as workspace, w. // // Let V = ( V1 ) and b = ( b1 ) (first i-1 rows) // ( V2 ) ( b2 ) // where V1 is unit lower triangular // w := b1 = A(k+1:k+i, i) blasf77_ccopy( &i, A(k+1,i), &ione, T(0,nb-1), &ione ); // w := V1' * b1 = VA(k+1:k+i, 0:i-1)' * w blasf77_ctrmv( "Lower", "Conj", "Unit", &i, A(k+1,0), &lda, T(0,nb-1), &ione ); // w := w + V2'*b2 = w + VA(k+i+1:n-1, 0:i-1)' * A(k+i+1:n-1, i) blasf77_cgemv( "Conj", &n_k_i_1, &i, &c_one, A(k+i+1,0), &lda, A(k+i+1,i), &ione, &c_one, T(0,nb-1), &ione ); // w := T'*w = T(0:i-1, 0:i-1)' * w blasf77_ctrmv( "Upper", "Conj", "Non-unit", &i, T(0,0), &ldt, T(0,nb-1), &ione ); // b2 := b2 - V2*w = A(k+i+1:n-1, i) - VA(k+i+1:n-1, 0:i-1) * w blasf77_cgemv( "No trans", &n_k_i_1, &i, &c_neg_one, A(k+i+1,0), &lda, T(0,nb-1), &ione, &c_one, A(k+i+1,i), &ione ); // w := V1*w = VA(k+1:k+i, 0:i-1) * w blasf77_ctrmv( "Lower", "No trans", "Unit", &i, A(k+1,0), &lda, T(0,nb-1), &ione ); // b1 := b1 - w = A(k+1:k+i-1, i) - w blasf77_caxpy( &i, &c_neg_one, T(0,nb-1), &ione, A(k+1,i), &ione ); // Restore diagonal element, saved below during previous iteration *A(k+i,i-1) = ei; } // Generate the elementary reflector H(i) to annihilate A(k+i+1:n-1,i) lapackf77_clarfg( &n_k_i_1, A(k+i+1,i), A(k+i+2,i), &ione, &tau[i] ); // Save diagonal element and set to one, to simplify multiplying by V ei = *A(k+i+1,i); *A(k+i+1,i) = c_one; // compute yi = A vi = sum_g A{d} vi{d} nblocks = (n-1) / nb / ngpu + 1; for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magmablasSetKernelStream( data->streams[d] ); // dV(k+i+1:n-1, i) = VA(k+i:n, i) magma_csetvector_async( n_k_i_1, A(k+i+1,i), 1, dV(d, k+i+1, i), 1, data->streams[d] ); // copy column of dV -> dVd, using block cyclic distribution. // This assumes V and Vd have been padded so that // a 2D matrix copy doesn't access them out-of-bounds gblock = k / nb; lblock = gblock / ngpu; lgid = gblock % ngpu; if ( d < lgid ) { lblock += 1; } // treat V as (nb*ngpu) x nblock matrix, and Vd as nb x nblock matrix magmablas_clacpy( MagmaFull, nb, nblocks-lblock, dV (d, d*nb + lblock*nb*ngpu, i), nb*ngpu, dVd(d, 0 + lblock*nb, i), nb ); // convert global indices (k) to local indices (dk) magma_indices_1D_bcyclic( nb, ngpu, d, k+i+1, n, &dki1, &dn ); // dY(k:n, i) = dA(k:n, k+i+1:n) * dV(k+i+1:n, i) // skip if matrix is empty // each GPU copies to different temporary vector in Y, // which are summed in separate loop below if ( dn-dki1 > 0 ) { magma_cgemv( MagmaNoTrans, n-k, dn-dki1, c_one, dA (d, k, dki1), ldda, dVd(d, dki1, i), 1, c_zero, dY (d, k, i), 1 ); // copy vector to host, storing in column nb+d of Y // as temporary space (Y has >= nb+ngpu columns) magma_cgetvector_async( n-k, dY(d, k, i), 1, Y(k, nb+d), 1, data->streams[d] ); } } // while GPU is doing above Ag*v... // Compute T(0:i,i) = [ -tau T V' vi ] // [ tau ] // T(0:i-1, i) = -tau VA(k+i+1:n-1, 0:i-1)' VA(k+i+1:n-1, i) scale = MAGMA_C_NEGATE( tau[i] ); blasf77_cgemv( "Conj", &n_k_i_1, &i, &scale, A(k+i+1,0), &lda, A(k+i+1,i), &ione, &c_zero, T(0,i), &ione ); // T(0:i-1, i) = T(0:i-1, 0:i-1) * T(0:i-1, i) blasf77_ctrmv( "Upper", "No trans", "Non-unit", &i, T(0,0), &ldt, T(0,i), &ione ); *T(i,i) = tau[i]; // apply reflectors to next column, A(i+1), on right only. // one axpy will be required to finish this, in the next iteration above if ( i > 0 && i+1 < nb ) { // Update next column, A(k:n,i+1), applying Q on right. // One axpy will be required to finish this, in the next iteration // above, after yi is computed. // This updates one more row than LAPACK does (row k), // making block above panel an even multiple of nb. // Use last column of T as workspace, w. magma_int_t i1 = i+1; // If complex, conjugate row of V, and undo afterwards #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv( &i1, A(k+i1,0), &lda ); #endif // w = T(0:i, 0:i+1) * VA(k+i+1, 0:i+1)' // T is now rectangular, so we use gemv instead of trmv as in lapack. blasf77_cgemv( "No trans", &i, &i1, &c_one, T(0,0), &ldt, A(k+i1,0), &lda, &c_zero, T(0,nb-1), &ione ); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv( &i1, A(k+i1,0), &lda ); #endif // A(k:n, i+1) -= Y(k:n, 0:i) * w blasf77_cgemv( "No trans", &n_k, &i, &c_neg_one, Y(k,0), &ldy, T(0,nb-1), &ione, &c_one, A(k,i1), &ione ); } // yi = sum_g yi{d} for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_queue_sync( data->streams[d] ); magma_indices_1D_bcyclic( nb, ngpu, d, k+i+1, n, &dki1, &dn ); if ( dn-dki1 > 0 ) { // yi = yi + yi{d} blasf77_caxpy( &n_k, &c_one, Y(k,nb+d), &ione, Y(k,i), &ione ); } } } // Restore diagonal element *A(k+nb,nb-1) = ei; // compute Y = Am V = sum_g Am{d} V{d} --- top part, Y(0:k-1,:) for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magmablasSetKernelStream( data->streams[d] ); // convert global indices (k) to local indices (dk) magma_indices_1D_bcyclic( nb, ngpu, d, k+1, n, &dki1, &dn ); // dY(0:k, :) = dA(0:k, k+i+1:n-1) * dV(k+i+1:n-1, :) // skip if matrix is empty // each GPU copies to different temporary block in Y, // which are summed in separate loop below if ( dn-dki1 > 0 ) { magma_cgemm( MagmaNoTrans, MagmaNoTrans, k, nb, dn-dki1, c_one, dA (d, 0, dki1), ldda, dVd(d, dki1, 0), ldvd, c_zero, dY (d, 0, 0), ldda ); // copy result to host, storing in columns [nb + nb*d : nb + nb*(d+1)] of Y // as temporary space (Y has nb + nb*ngpu columns) magma_cgetmatrix_async( k, nb, dY(d, 0, 0), ldda, Y(0,nb+nb*d), ldy, data->streams[d] ); } } // Y = sum_g Y{d} for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_queue_sync( 0 ); magma_indices_1D_bcyclic( nb, ngpu, d, k+1, n, &dki1, &dn ); if ( dn-dki1 > 0 ) { // Y = Y + Am V for( i = 0; i < nb; ++i ) { blasf77_caxpy( &k, &c_one, Y(0,nb+nb*d+i), &ione, Y(0,i), &ione ); } } } // copy Y and T matrices to GPUs for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_csetmatrix_async( n, nb, Y, ldy, dY(d, 0, 0), ldda, data->streams[d] ); magma_csetmatrix_async( nb, nb, T, nb, dTi(d), nb, data->streams[d] ); } magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); return *info; } /* magma_clahr2 */
extern "C" magma_int_t magma_claqps_gpu(magma_int_t m, magma_int_t n, magma_int_t offset, magma_int_t nb, magma_int_t *kb, magmaFloatComplex *A, magma_int_t lda, magma_int_t *jpvt, magmaFloatComplex *tau, float *vn1, float *vn2, magmaFloatComplex *auxv, magmaFloatComplex *F, magma_int_t ldf) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= CLAQPS computes a step of QR factorization with column pivoting of a complex M-by-N matrix A by using Blas-3. It tries to factorize NB columns from A starting from the row OFFSET+1, and updates all of the matrix with Blas-3 xGEMM. In some cases, due to catastrophic cancellations, it cannot factorize NB columns. Hence, the actual number of factorized columns is returned in KB. Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0 OFFSET (input) INTEGER The number of rows of A that have been factorized in previous steps. NB (input) INTEGER The number of columns to factorize. KB (output) INTEGER The number of columns actually factorized. A (input/output) COMPLEX*16 array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, block A(OFFSET+1:M,1:KB) is the triangular factor obtained and block A(1:OFFSET,1:N) has been accordingly pivoted, but no factorized. The rest of the matrix, block A(OFFSET+1:M,KB+1:N) has been updated. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). JPVT (input/output) INTEGER array, dimension (N) JPVT(I) = K <==> Column K of the full matrix A has been permuted into position I in AP. TAU (output) COMPLEX*16 array, dimension (KB) The scalar factors of the elementary reflectors. VN1 (input/output) DOUBLE PRECISION array, dimension (N) The vector with the partial column norms. VN2 (input/output) DOUBLE PRECISION array, dimension (N) The vector with the exact column norms. AUXV (input/output) COMPLEX*16 array, dimension (NB) Auxiliar vector. F (input/output) COMPLEX*16 array, dimension (LDF,NB) Matrix F' = L*Y'*A. LDF (input) INTEGER The leading dimension of the array F. LDF >= max(1,N). ===================================================================== */ #define A(i, j) (A + (i) + (j)*(lda )) #define F(i, j) (F + (i) + (j)*(ldf )) magmaFloatComplex c_zero = MAGMA_C_MAKE( 0.,0.); magmaFloatComplex c_one = MAGMA_C_MAKE( 1.,0.); magmaFloatComplex c_neg_one = MAGMA_C_MAKE(-1.,0.); magma_int_t ione = 1; magma_int_t i__1, i__2; //float d__1; magmaFloatComplex z__1; //magma_int_t j; magma_int_t k, rk; //magmaFloatComplex Akk; magmaFloatComplex *Aks; magmaFloatComplex tauk; magma_int_t pvt; //float temp, temp2; float tol3z; magma_int_t itemp; float lsticc, *lsticcs; magma_int_t lastrk; magma_smalloc( &lsticcs, 1+256*(n+255)/256 ); lastrk = min( m, n + offset ); tol3z = magma_ssqrt( lapackf77_slamch("Epsilon")); lsticc = 0; k = 0; magma_cmalloc( &Aks, nb ); while( k < nb && lsticc == 0 ) { rk = offset + k; /* Determine ith pivot column and swap if necessary */ // Fortran: pvt, k, isamax are all 1-based; subtract 1 from k. // C: pvt, k, isamax are all 0-based; don't subtract 1. pvt = k - 1 + magma_isamax( n-k, &vn1[k], ione ); if (pvt != k) { /*if (pvt >= nb) { // 1. Start copy from GPU magma_cgetmatrix_async( m - offset - nb, 1, dA(offset + nb, pvt), ldda, A (offset + nb, pvt), lda, stream ); }*/ /* F gets swapped so F must be sent at the end to GPU */ i__1 = k; /*if (pvt < nb){ // no need of transfer if pivot is within the panel blasf77_cswap( &m, A(0, pvt), &ione, A(0, k), &ione ); } else { // 1. Finish copy from GPU magma_queue_sync( stream ); // 2. Swap as usual on CPU blasf77_cswap(&m, A(0, pvt), &ione, A(0, k), &ione); // 3. Restore the GPU magma_csetmatrix_async( m - offset - nb, 1, A (offset + nb, pvt), lda, dA(offset + nb, pvt), ldda, stream); }*/ magmablas_cswap( m, A(0, pvt), ione, A(0, k), ione ); //blasf77_cswap( &i__1, F(pvt,0), &ldf, F(k,0), &ldf ); magmablas_cswap( i__1, F(pvt, 0), ldf, F(k, 0), ldf); itemp = jpvt[pvt]; jpvt[pvt] = jpvt[k]; jpvt[k] = itemp; //vn1[pvt] = vn1[k]; //vn2[pvt] = vn2[k]; #if defined(PRECISION_d) || defined(PRECISION_z) //magma_dswap( 1, &vn1[pvt], 1, &vn1[k], 1 ); //magma_dswap( 1, &vn2[pvt], 1, &vn2[k], 1 ); magma_dswap( 2, &vn1[pvt], n+offset, &vn1[k], n+offset ); #else //magma_sswap( 1, &vn1[pvt], 1, &vn1[k], 1 ); //magma_sswap( 1, &vn2[pvt], 1, &vn2[k], 1 ); magma_sswap(2, &vn1[pvt], n+offset, &vn1[k], n+offset); #endif } /* Apply previous Householder reflectors to column K: A(RK:M,K) := A(RK:M,K) - A(RK:M,1:K-1)*F(K,1:K-1)'. Optimization: multiply with beta=0; wait for vector and subtract */ if (k > 0) { /*#if (defined(PRECISION_c) || defined(PRECISION_z)) for (j = 0; j < k; ++j){ *F(k,j) = MAGMA_C_CNJG( *F(k,j) ); } #endif*/ //#define RIGHT_UPDATE #ifdef RIGHT_UPDATE i__1 = m - offset - nb; i__2 = k; magma_cgemv( MagmaNoTrans, i__1, i__2, c_neg_one, A(offset+nb, 0), lda, F(k, 0), ldf, c_one, A(offset+nb, k), ione ); #else i__1 = m - rk; i__2 = k; /*blasf77_cgemv( MagmaNoTransStr, &i__1, &i__2, &c_neg_one, A(rk, 0), &lda, F(k, 0), &ldf, &c_one, A(rk, k), &ione );*/ magma_cgemv( MagmaNoTrans, i__1, i__2, c_neg_one, A(rk, 0), lda, F(k, 0), ldf, c_one, A(rk, k), ione ); #endif /*#if (defined(PRECISION_c) || defined(PRECISION_z)) for (j = 0; j < k; ++j) { *F(k,j) = MAGMA_C_CNJG( *F(k,j) ); } #endif*/ } /* Generate elementary reflector H(k). */ magma_clarfg_gpu(m-rk, A(rk, k), A(rk + 1, k), &tau[k], &vn1[k], &Aks[k]); //Akk = *A(rk, k); //*A(rk, k) = c_one; //magma_cgetvector( 1, &Aks[k], 1, &Akk, 1 ); /* needed to avoid the race condition */ if (k == 0) magma_csetvector( 1, &c_one, 1, A(rk, k), 1 ); else magma_ccopymatrix( 1, 1, A(offset, 0), 1, A(rk, k), 1 ); /* Compute Kth column of F: Compute F(K+1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) on the GPU */ if (k < n-1 || k > 0) magma_cgetvector( 1, &tau[k], 1, &tauk, 1 ); if (k < n-1) { i__1 = m - rk; i__2 = n - k - 1; /* Send the vector to the GPU */ //magma_csetmatrix( i__1, 1, A(rk, k), lda, dA(rk,k), ldda ); /* Multiply on GPU */ // was CALL CGEMV( 'Conjugate transpose', M-RK+1, N-K, // TAU( K ), A( RK, K+1 ), LDA, // A( RK, K ), 1, // CZERO, F( K+1, K ), 1 ) //magma_cgetvector( 1, &tau[k], 1, &tauk, 1 ); magma_cgemv( MagmaConjTrans, m-rk, n-k-1, tauk, A( rk, k+1 ), lda, A( rk, k ), 1, c_zero, F( k+1, k ), 1 ); //magma_cscal( m-rk, tau[k], F( k+1, k), 1 ); //magma_int_t i__3 = nb-k-1; //magma_int_t i__4 = i__2 - i__3; //magma_int_t i__5 = nb-k; //magma_cgemv( MagmaConjTrans, i__1 - i__5, i__2 - i__3, // tau[k], dA(rk +i__5, k+1+i__3), ldda, // dA(rk +i__5, k ), ione, // c_zero, dF(k+1+i__3, k ), ione ); //magma_cgetmatrix_async( i__2-i__3, 1, // dF(k + 1 +i__3, k), i__2, // F (k + 1 +i__3, k), i__2, stream ); //blasf77_cgemv( MagmaConjTransStr, &i__1, &i__3, // &tau[k], A(rk, k+1), &lda, // A(rk, k ), &ione, // &c_zero, F(k+1, k ), &ione ); //magma_queue_sync( stream ); //blasf77_cgemv( MagmaConjTransStr, &i__5, &i__4, // &tau[k], A(rk, k+1+i__3), &lda, // A(rk, k ), &ione, // &c_one, F(k+1+i__3, k ), &ione ); } /* Padding F(1:K,K) with zeros. for (j = 0; j <= k; ++j) { magma_csetvector( 1, &c_zero, 1, F(j, k), 1 ); }*/ /* Incremental updating of F: F(1:N,K) := F(1:N,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K). F(1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K) := tau(K)(A(RK:M,K+1:N)' - F(1:N,1:K-1)*A(RK:M,1:K-1)') A(RK:M,K) so, F is (updated A)*V */ //if (k > 0 && k<n-1) { if (k > 0) { //magma_cgetvector( 1, &tau[k], 1, &tauk, 1 ); z__1 = MAGMA_C_NEGATE( tauk ); #ifdef RIGHT_UPDATE i__1 = m - offset - nb; i__2 = k; magma_cgemv( MagmaConjTrans, i__1, i__2, z__1, A(offset+nb, 0), lda, A(offset+nb, k), ione, c_zero, auxv, ione ); i__1 = k; magma_cgemv( MagmaNoTrans, n-k-1, i__1, c_one, F(k+1,0), ldf, auxv, ione, c_one, F(k+1,k), ione ); #else i__1 = m - rk; i__2 = k; //blasf77_cgemv( MagmaConjTransStr, &i__1, &i__2, // &z__1, A(rk, 0), &lda, // A(rk, k), &ione, // &c_zero, auxv, &ione ); magma_cgemv( MagmaConjTrans, i__1, i__2, z__1, A(rk, 0), lda, A(rk, k), ione, c_zero, auxv, ione ); //i__1 = k; //blasf77_cgemv( MagmaNoTransStr, &n, &i__1, // &c_one, F(0,0), &ldf, // auxv, &ione, // &c_one, F(0,k), &ione ); /*magma_cgemv( MagmaNoTrans, n, i__1, c_one, F(0,0), ldf, auxv, ione, c_one, F(0,k), ione );*/ /* I think we only need stricly lower-triangular part :) */ magma_cgemv( MagmaNoTrans, n-k-1, i__2, c_one, F(k+1,0), ldf, auxv, ione, c_one, F(k+1,k), ione ); #endif } /* Optimization: On the last iteration start sending F back to the GPU */ /* Update the current row of A: A(RK,K+1:N) := A(RK,K+1:N) - A(RK,1:K)*F(K+1:N,1:K)'. */ if (k < n-1) { i__1 = n - k - 1; i__2 = k + 1; //blasf77_cgemm( MagmaNoTransStr, MagmaConjTransStr, &ione, &i__1, &i__2, // &c_neg_one, A(rk, 0 ), &lda, // F(k+1,0 ), &ldf, // &c_one, A(rk, k+1), &lda ); #ifdef RIGHT_UPDATE /* right-looking update of rows, */ magma_cgemm( MagmaNoTrans, MagmaConjTrans, nb-k, i__1, ione, c_neg_one, A(rk, k ), lda, F(k+1, k ), ldf, c_one, A(rk, k+1), lda ); #else /* left-looking update of rows, * * since F=A'v with original A, so no right-looking */ magma_cgemm( MagmaNoTrans, MagmaConjTrans, ione, i__1, i__2, c_neg_one, A(rk, 0 ), lda, F(k+1,0 ), ldf, c_one, A(rk, k+1), lda ); #endif } /* Update partial column norms. */ if (rk < min(m, n+offset)-1 ){ magmablas_scnrm2_row_check_adjust(n-k-1, tol3z, &vn1[k+1], &vn2[k+1], A(rk,k+1), lda, lsticcs); magma_device_sync(); #if defined(PRECISION_d) || defined(PRECISION_z) magma_dgetvector( 1, &lsticcs[0], 1, &lsticc, 1 ); #else magma_sgetvector( 1, &lsticcs[0], 1, &lsticc, 1 ); #endif } /*if (rk < lastrk) { for (j = k + 1; j < n; ++j) { if (vn1[j] != 0.) { // NOTE: The following 4 lines follow from the analysis in // Lapack Working Note 176. temp = MAGMA_C_ABS( *A(rk,j) ) / vn1[j]; temp = max( 0., ((1. + temp) * (1. - temp)) ); d__1 = vn1[j] / vn2[j]; temp2 = temp * (d__1 * d__1); if (temp2 <= tol3z) { vn2[j] = (float) lsticc; lsticc = j; } else { vn1[j] *= magma_ssqrt(temp); } } } }*/ //*A(rk, k) = Akk; //magma_csetvector( 1, &Akk, 1, A(rk, k), 1 ); //magma_cswap( 1, &Aks[k], 1, A(rk, k), 1 ); ++k; } magma_ccopymatrix( 1, k, Aks, 1, A(offset, 0), lda+1 ); // leave k as the last column done --k; *kb = k + 1; rk = offset + *kb - 1; /* Apply the block reflector to the rest of the matrix: A(OFFSET+KB+1:M,KB+1:N) := A(OFFSET+KB+1:M,KB+1:N) - A(OFFSET+KB+1:M,1:KB)*F(KB+1:N,1:KB)' */ if (*kb < min(n, m - offset)) { i__1 = m - rk - 1; i__2 = n - *kb; /* Send F to the GPU magma_csetmatrix( i__2, *kb, F (*kb, 0), ldf, dF(*kb, 0), i__2 );*/ magma_cgemm( MagmaNoTrans, MagmaConjTrans, i__1, i__2, *kb, c_neg_one, A(rk+1, 0 ), lda, F(*kb, 0 ), ldf, c_one, A(rk+1, *kb), lda ); } /* Recomputation of difficult columns. */ if( lsticc > 0 ) { printf( " -- recompute dnorms --\n" ); magmablas_scnrm2_check(m-rk-1, n-*kb, A(rk+1,*kb), lda, &vn1[*kb], lsticcs); #if defined(PRECISION_d) || defined(PRECISION_z) magma_dcopymatrix( n-*kb, 1, &vn1[*kb], *kb, &vn2[*kb], *kb); #else magma_scopymatrix( n-*kb, 1, &vn1[*kb], *kb, &vn2[*kb], *kb); #endif /*while( lsticc > 0 ) { itemp = (magma_int_t)(vn2[lsticc] >= 0. ? floor(vn2[lsticc] + .5) : -floor(.5 - vn2[lsticc])); i__1 = m - rk - 1; if (lsticc <= nb) vn1[lsticc] = cblas_scnrm2(i__1, A(rk + 1, lsticc), ione); else { // Where is the data, CPU or GPU ? float r1, r2; r1 = cblas_scnrm2(nb-k, A(rk + 1, lsticc), ione); r2 = magma_scnrm2(m-offset-nb, dA(offset + nb + 1, lsticc), ione); vn1[lsticc] = magma_ssqrt(r1*r1+r2*r2); } // NOTE: The computation of VN1( LSTICC ) relies on the fact that // SNRM2 does not fail on vectors with norm below the value of SQRT(SLAMCH('S')) vn2[lsticc] = vn1[lsticc]; lsticc = itemp;*/ } magma_free(Aks); magma_free(lsticcs); return MAGMA_SUCCESS; } /* magma_claqps */
int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, dev_perf, dev_time, cpu_perf, cpu_time; float magma_error, dev_error, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t M, N, Xm, Ym, lda, sizeA, sizeX, sizeY; magma_int_t incx = 1; magma_int_t incy = 1; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex alpha = MAGMA_C_MAKE( 1.5, -2.3 ); magmaFloatComplex beta = MAGMA_C_MAKE( -0.6, 0.8 ); magmaFloatComplex *A, *X, *Y, *Ydev, *Ymagma; magmaFloatComplex_ptr dA, dX, dY; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); printf("trans = %s\n", lapack_trans_const(opts.transA) ); #ifdef HAVE_CUBLAS printf(" M N MAGMA Gflop/s (ms) %s Gflop/s (ms) CPU Gflop/s (ms) MAGMA error %s error\n", g_platform_str, g_platform_str ); #else printf(" M N %s Gflop/s (ms) CPU Gflop/s (ms) %s error\n", g_platform_str, g_platform_str ); #endif printf("===================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; lda = ((M+31)/32)*32; gflops = FLOPS_CGEMV( M, N ) / 1e9; if ( opts.transA == MagmaNoTrans ) { Xm = N; Ym = M; } else { Xm = M; Ym = N; } sizeA = lda*N; sizeX = incx*Xm; sizeY = incy*Ym; TESTING_MALLOC_CPU( A, magmaFloatComplex, sizeA ); TESTING_MALLOC_CPU( X, magmaFloatComplex, sizeX ); TESTING_MALLOC_CPU( Y, magmaFloatComplex, sizeY ); TESTING_MALLOC_CPU( Ydev, magmaFloatComplex, sizeY ); TESTING_MALLOC_CPU( Ymagma, magmaFloatComplex, sizeY ); TESTING_MALLOC_DEV( dA, magmaFloatComplex, sizeA ); TESTING_MALLOC_DEV( dX, magmaFloatComplex, sizeX ); TESTING_MALLOC_DEV( dY, magmaFloatComplex, sizeY ); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &sizeA, A ); lapackf77_clarnv( &ione, ISEED, &sizeX, X ); lapackf77_clarnv( &ione, ISEED, &sizeY, Y ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_csetmatrix( M, N, A, lda, dA, lda ); magma_csetvector( Xm, X, incx, dX, incx ); magma_csetvector( Ym, Y, incy, dY, incy ); #ifdef HAVE_CUBLAS dev_time = magma_sync_wtime( 0 ); cublasCgemv( opts.handle, cublas_trans_const(opts.transA), M, N, &alpha, dA, lda, dX, incx, &beta, dY, incy ); dev_time = magma_sync_wtime( 0 ) - dev_time; #else dev_time = magma_sync_wtime( opts.queue ); magma_cgemv( opts.transA, M, N, &alpha, dA, lda, dX, incx, &beta, dY, incy ); dev_time = magma_sync_wtime( opts.queue ) - dev_time; #endif dev_perf = gflops / dev_time; magma_cgetvector( Ym, dY, incy, Ydev, incy ); /* ===================================================================== Performs operation using MAGMABLAS (currently only with CUDA) =================================================================== */ #ifdef HAVE_CUBLAS magma_csetvector( Ym, Y, incy, dY, incy ); magma_time = magma_sync_wtime( 0 ); magmablas_cgemv( opts.transA, M, N, alpha, dA, lda, dX, incx, beta, dY, incy ); magma_time = magma_sync_wtime( 0 ) - magma_time; magma_perf = gflops / magma_time; magma_cgetvector( Ym, dY, incy, Ymagma, incy ); #endif /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ cpu_time = magma_wtime(); blasf77_cgemv( lapack_trans_const(opts.transA), &M, &N, &alpha, A, &lda, X, &incx, &beta, Y, &incy ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result =================================================================== */ float Anorm = lapackf77_clange( "F", &M, &N, A, &lda, work ); float Xnorm = lapackf77_clange( "F", &Xm, &ione, X, &Xm, work ); blasf77_caxpy( &Ym, &c_neg_one, Y, &incy, Ydev, &incy ); dev_error = lapackf77_clange( "F", &Ym, &ione, Ydev, &Ym, work ) / (Anorm * Xnorm); #ifdef HAVE_CUBLAS blasf77_caxpy( &Ym, &c_neg_one, Y, &incy, Ymagma, &incy ); magma_error = lapackf77_clange( "F", &Ym, &ione, Ymagma, &Ym, work ) / (Anorm * Xnorm); printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) M, (int) N, magma_perf, 1000.*magma_time, dev_perf, 1000.*dev_time, cpu_perf, 1000.*cpu_time, magma_error, dev_error, (magma_error < tol && dev_error < tol ? "ok" : "failed")); status += ! (magma_error < tol && dev_error < tol); #else printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) M, (int) N, dev_perf, 1000.*dev_time, cpu_perf, 1000.*cpu_time, dev_error, (dev_error < tol ? "ok" : "failed")); status += ! (dev_error < tol); #endif TESTING_FREE_CPU( A ); TESTING_FREE_CPU( X ); TESTING_FREE_CPU( Y ); TESTING_FREE_CPU( Ydev ); TESTING_FREE_CPU( Ymagma ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dX ); TESTING_FREE_DEV( dY ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
extern "C" magma_err_t magma_cgeqrs_gpu(magma_int_t m, magma_int_t n, magma_int_t nrhs, magmaFloatComplex_ptr dA, size_t dA_offset, magma_int_t ldda, magmaFloatComplex *tau, magmaFloatComplex_ptr dT, size_t dT_offset, magmaFloatComplex_ptr dB, size_t dB_offset, magma_int_t lddb, magmaFloatComplex *hwork, magma_int_t lwork, magma_int_t *info, magma_queue_t queue) { /* -- clMagma (version 0.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date January 2014 Purpose ======= Solves the least squares problem min || A*X - C || using the QR factorization A = Q*R computed by CGEQRF_GPU. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. M >= N >= 0. NRHS (input) INTEGER The number of columns of the matrix C. NRHS >= 0. A (input) COMPLEX array on the GPU, dimension (LDDA,N) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,n, as returned by CGEQRF_GPU in the first n columns of its array argument A. LDDA (input) INTEGER The leading dimension of the array A, LDDA >= M. TAU (input) COMPLEX array, dimension (N) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by MAGMA_CGEQRF_GPU. DB (input/output) COMPLEX array on the GPU, dimension (LDDB,NRHS) On entry, the M-by-NRHS matrix C. On exit, the N-by-NRHS solution matrix X. DT (input) COMPLEX array that is the output (the 6th argument) of magma_cgeqrf_gpu of size 2*MIN(M, N)*NB + ((N+31)/32*32 )* MAX(NB, NRHS). The array starts with a block of size MIN(M,N)*NB that stores the triangular T matrices used in the QR factorization, followed by MIN(M,N)*NB block storing the diagonal block inverses for the R matrix, followed by work space of size ((N+31)/32*32 )* MAX(NB, NRHS). LDDB (input) INTEGER The leading dimension of the array DB. LDDB >= M. HWORK (workspace/output) COMPLEX array, dimension (LWORK) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK, LWORK >= max(1,NRHS). For optimum performance LWORK >= (M-N+NB)*(NRHS + 2*NB), where NB is the blocksize given by magma_get_cgeqrf_nb( M ). If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the HWORK array, returns this value as the first entry of the WORK array. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ #define a_ref(a_1,a_2) dA, (dA_offset + (a_1) + (a_2)*(ldda)) #define d_ref(a_1) dT, (dT_offset + (lddwork+(a_1))*nb) magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex_ptr dwork; magma_int_t i, k, lddwork, rows, ib; magma_int_t ione = 1; magma_int_t nb = magma_get_cgeqrf_nb(m); magma_int_t lwkopt = (m-n+nb)*(nrhs+2*nb); long int lquery = (lwork == -1); hwork[0] = MAGMA_C_MAKE( (float)lwkopt, 0. ); *info = 0; if (m < 0) *info = -1; else if (n < 0 || m < n) *info = -2; else if (nrhs < 0) *info = -3; else if (ldda < max(1,m)) *info = -5; else if (lddb < max(1,m)) *info = -8; else if (lwork < lwkopt && ! lquery) *info = -10; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { hwork[0] = c_one; return *info; } /* B := Q' * B */ magma_cunmqr_gpu( MagmaLeft, MagmaConjTrans, m, nrhs, n, a_ref(0,0), ldda, tau, dB, dB_offset, lddb, hwork, lwork, dT, dT_offset, nb, info, queue ); if ( *info != 0 ) { return *info; } /* Solve R*X = B(1:n,:) */ lddwork= k; int ldtwork; size_t dwork_offset = 0; if (nb < k) { dwork = dT; dwork_offset = dT_offset+2*lddwork*nb; } else { ldtwork = ( 2*k + ((n+31)/32)*32 )*nb; magma_cmalloc( &dwork, ldtwork ); } // To do: Why did we have this line originally; seems to be a bug (Stan)? //dwork = dT; i = (k-1)/nb * nb; ib = n-i; rows = m-i; if ( nrhs == 1 ) { blasf77_ctrsv( MagmaUpperStr, MagmaNoTransStr, MagmaNonUnitStr, &ib, hwork, &rows, hwork+rows*ib, &ione); } else { blasf77_ctrsm( MagmaLeftStr, MagmaUpperStr, MagmaNoTransStr, MagmaNonUnitStr, &ib, &nrhs, &c_one, hwork, &rows, hwork+rows*ib, &rows); } // update the solution vector magma_csetmatrix( ib, nrhs, hwork+rows*ib, 0, rows, dwork, dwork_offset+i, lddwork, queue ); // update c if (nrhs == 1) magma_cgemv( MagmaNoTrans, i, ib, c_neg_one, a_ref(0, i), ldda, dwork, dwork_offset+i, 1, c_one, dB, dB_offset, 1, queue ); else magma_cgemm( MagmaNoTrans, MagmaNoTrans, i, nrhs, ib, c_neg_one, a_ref(0, i), ldda, dwork, dwork_offset + i, lddwork, c_one, dB, dB_offset, lddb, queue ); int start = i-nb; if (nb < k) { for (i = start; i >=0; i -= nb) { ib = min(k-i, nb); rows = m -i; if (i + ib < n) { if (nrhs == 1) { magma_cgemv( MagmaNoTrans, ib, ib, c_one, d_ref(i), ib, dB, dB_offset+i, 1, c_zero, dwork, dwork_offset+i, 1, queue ); magma_cgemv( MagmaNoTrans, i, ib, c_neg_one, a_ref(0, i), ldda, dwork, dwork_offset+i, 1, c_one, dB, dB_offset, 1, queue ); } else { magma_cgemm( MagmaNoTrans, MagmaNoTrans, ib, nrhs, ib, c_one, d_ref(i), ib, dB, dB_offset+i, lddb, c_zero, dwork, dwork_offset+i, lddwork, queue ); magma_cgemm( MagmaNoTrans, MagmaNoTrans, i, nrhs, ib, c_neg_one, a_ref(0, i), ldda, dwork, dwork_offset+i, lddwork, c_one, dB, dB_offset, lddb, queue ); } } } } magma_ccopymatrix( (n), nrhs, dwork, dwork_offset, lddwork, dB, dB_offset, lddb, queue ); if (nb >= k) magma_free(dwork); magma_queue_sync( queue ); return *info; }
/** Purpose ======= CLAHEF computes a partial factorization of a complex Hermitian matrix A using the Bunch-Kaufman diagonal pivoting method. The partial factorization has the form: A = ( I U12 ) ( A11 0 ) ( I 0 ) if UPLO = 'U', or: ( 0 U22 ) ( 0 D ) ( U12' U22' ) A = ( L11 0 ) ( D 0 ) ( L11' L21' ) if UPLO = 'L' ( L21 I ) ( 0 A22 ) ( 0 I ) where the order of D is at most NB. The actual order is returned in the argument KB, and is either NB or NB-1, or N if N <= NB. Note that U' denotes the conjugate transpose of U. CLAHEF is an auxiliary routine called by CHETRF. It uses blocked code (calling Level 3 BLAS) to update the submatrix A11 (if UPLO = 'U') or A22 (if UPLO = 'L'). Arguments --------- @param[in] UPLO CHARACTER Specifies whether the upper or lower triangular part of the Hermitian matrix A is stored: - = 'U': Upper triangular - = 'L': Lower triangular @param[in] N INTEGER The order of the matrix A. N >= 0. @param[in] NB INTEGER The maximum number of columns of the matrix A that should be factored. NB should be at least 2 to allow for 2-by-2 pivot blocks. @param[out] KB INTEGER The number of columns of A that were actually factored. KB is either NB-1 or NB, or N if N <= NB. @param[in,out] A COMPLEX array, dimension (LDA,N) On entry, the Hermitian matrix A. If UPLO = 'U', the leading n-by-n upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading n-by-n lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, A contains details of the partial factorization. @param[in] LDA INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] ipiv INTEGER array, dimension (N) Details of the interchanges and the block structure of D. If UPLO = 'U', only the last KB elements of ipiv are set; if UPLO = 'L', only the first KB elements are set. \n If ipiv(k) > 0, then rows and columns k and ipiv(k) were interchanged and D(k,k) is a 1-by-1 diagonal block. If UPLO = 'U' and ipiv(k) = ipiv(k-1) < 0, then rows and columns k-1 and -ipiv(k) were interchanged and D(k-1:k,k-1:k) is a 2-by-2 diagonal block. If UPLO = 'L' and ipiv(k) = ipiv(k+1) < 0, then rows and columns k+1 and -ipiv(k) were interchanged and D(k:k+1,k:k+1) is a 2-by-2 diagonal block. @param[out] W (workspace) COMPLEX array, dimension (LDW,NB) @param[in] LDW INTEGER The leading dimension of the array W. LDW >= max(1,N). @param[out] INFO INTEGER - = 0: successful exit - > 0: if INFO = k, D(k,k) is exactly zero. The factorization has been completed, but the block diagonal matrix D is exactly singular. @ingroup magma_chetrf_comp ********************************************************************/ extern "C" magma_int_t magma_clahef_gpu( magma_uplo_t uplo, magma_int_t n, magma_int_t nb, magma_int_t *kb, magmaFloatComplex *hA, magma_int_t lda, magmaFloatComplex_ptr dA, size_t dA_offset, magma_int_t ldda, magma_int_t *ipiv, magmaFloatComplex_ptr dW, size_t dW_offset, magma_int_t lddw, magma_queue_t queue, magma_int_t *info) { /* .. Parameters .. */ float d_one = 1.0; float d_zero = 0.0; float d_eight = 8.0; float d_seven = 7.0; #if defined(PRECISION_c) float f_zero = 0.0; #endif magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_mone = -MAGMA_C_ONE; magma_int_t upper = (uplo == MagmaUpper); magma_int_t ione = 1; /* .. Local Scalars .. */ magma_int_t imax = 0, jmax = 0, kk, kkW, kp, kstep, iinfo; float abs_akk, alpha, colmax, R1, rowmax; magmaFloatComplex Zimax, Z; #define dA(i, j) dA, dA_offset + (j)*ldda + (i) #define dW(i, j) dW, dW_offset + (j)*lddw + (i) #define A(i, j) (hA + (j)*lda + (i)) /* .. Executable Statements .. */ *info = 0; /* Initialize alpha for use in choosing pivot block size. */ alpha = ( d_one+sqrt( d_seven ) ) / d_eight; magma_event_t event = NULL; if( upper ) { /* Factorize the trailing columns of A using the upper triangle of A and working backwards, and compute the matrix W = U12*D for use in updating A11 (note that conjg(W) is actually stored) K is the main loop index, decreasing from N in steps of 1 or 2 KW is the column of W which corresponds to column K of A */ int k, kw = 0; for (k = n-1; k+1 > max(n-nb+1, nb); k -= kstep) { kw = nb - (n-k); /* Copy column K of A to column KW of W and update it */ magma_ccopy( k+1, dA( 0, k ), 1, dW( 0, kw ), 1, queue ); // set imaginary part of diagonal to be zero #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dW, 2*(k+ kw*lddw+dW_offset)+1, 1, queue, &event); magma_queue_sync( queue ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dW, 2*(k+ kw*lddw+dW_offset)+1, 1, queue, &event); magma_queue_sync( queue ); #endif if (k+1 < n) { magma_cgemv( MagmaNoTrans, k+1, n-(k+1), c_mone, dA( 0, k+1 ), ldda, dW( k, kw+1 ), lddw, c_one, dW( 0, kw ), ione, queue ); // set imaginary part of diagonal to be zero #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dW, 2*(k+ kw*lddw+dW_offset)+1, 1, queue, &event ); magma_queue_sync( queue ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dW, 2*(k+ kw*lddw+dW_offset)+1, 1, queue, &event ); magma_queue_sync( queue ); #endif } kstep = 1; /* Determine rows and columns to be interchanged and whether a 1-by-1 or 2-by-2 pivot block will be used */ magma_cgetvector_async( 1, dW( k, kw ), 1, &Z, 1, queue, &event ); magma_queue_sync( queue ); abs_akk = fabs( MAGMA_C_REAL( Z ) ); /* imax is the row-index of the largest off-diagonal element in column K, and colmax is its absolute value */ if( k > 0 ) { // magma is one-base imax = magma_icamax( k, dW( 0, kw ), 1, queue ) - 1; magma_cgetvector( 1, dW( imax, kw ), 1, &Z, 1, queue ); colmax = MAGMA_C_ABS1( Z ); } else { colmax = d_zero; } if( max( abs_akk, colmax ) == 0.0 ) { /* Column K is zero: set INFO and continue */ if ( *info == 0 ) *info = k; kp = k; #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dA, 2*(k+ k*ldda+dA_offset)+1, 1, queue, &event ); magma_queue_sync( queue ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dA, 2*(k+ k*ldda+dA_offset)+1, 1, queue, &event ); magma_queue_sync( queue ); #endif } else { if( abs_akk >= alpha*colmax ) { /* no interchange, use 1-by-1 pivot block */ kp = k; } else { /* Copy column imax to column KW-1 of W and update it */ magma_ccopy( imax+1, dA( 0, imax ), 1, dW( 0, kw-1 ), 1, queue ); #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dW, 2*(imax+ (kw-1)*lddw+dW_offset)+1, 1, queue, &event ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dW, 2*(imax+ (kw-1)*lddw+dW_offset)+1, 1, queue, &event ); #endif #if defined(PRECISION_z) || defined(PRECISION_c) magmablas_clacpy_cnjg( k-imax, dA(imax,imax+1), ldda, dW(imax+1,kw-1), 1, queue ); #else magma_ccopy( k-imax, dA(imax,imax+1), ldda, dW(imax+1,kw-1), 1, queue ); #endif if( k+1 < n ) { magma_cgemv( MagmaNoTrans, k+1, n-(k+1), c_mone, dA( 0, k+1 ), ldda, dW( imax, kw+1 ), lddw, c_one, dW( 0, kw-1 ), ione, queue ); #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dW, 2*(imax+ (kw-1)*lddw+dW_offset)+1, 1, queue, &event ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dW, 2*(imax+ (kw-1)*lddw+dW_offset)+1, 1, queue, &event ); #endif } magma_cgetvector_async( 1, dW( imax, kw-1 ), 1, &Zimax, 1, queue, &event ); magma_queue_sync( queue ); /* jmax is the column-index of the largest off-diagonal element in row imax, and rowmax is its absolute value */ jmax = imax + magma_icamax( k-imax, dW( imax+1, kw-1 ), 1, queue ); magma_cgetvector( 1, dW( jmax, kw-1 ), 1, &Z, 1, queue ); rowmax = MAGMA_C_ABS1( Z ); if ( imax > 0 ) { // magma is one-base jmax = magma_icamax( imax, dW( 0, kw-1 ), 1, queue ) - 1; magma_cgetvector( 1, dW( jmax, kw-1 ), 1, &Z, 1, queue ); rowmax = max( rowmax, MAGMA_C_ABS1( Z ) ); } if( abs_akk >= alpha*colmax*( colmax / rowmax ) ) { /* no interchange, use 1-by-1 pivot block */ kp = k; } else if ( fabs( MAGMA_C_REAL( Zimax ) ) >= alpha*rowmax ) { /* interchange rows and columns K and imax, use 1-by-1 pivot block */ kp = imax; /* copy column KW-1 of W to column KW */ magma_ccopy( k+1, dW( 0, kw-1 ), 1, dW( 0, kw ), 1, queue ); } else { /* interchange rows and columns K-1 and imax, use 2-by-2 pivot block */ kp = imax; kstep = 2; } } kk = k - kstep + 1; kkW = nb - (n - kk); /* Updated column kp is already stored in column kkW of W */ if( kp != kk ) { /* Interchange rows kk and kp in last kk columns of A and W */ // note: row-swap A(:,kk) magmablas_cswap( n-kk, dA( kk, kk ), ldda, dA( kp, kk ), ldda, queue ); magmablas_cswap( n-kk, dW( kk, kkW), lddw, dW( kp, kkW), lddw, queue ); /* Copy non-updated column kk to column kp */ #if defined(PRECISION_z) || defined(PRECISION_c) magmablas_clacpy_cnjg( kk-kp-1, dA( kp+1, kk ), 1, dA( kp, kp+1 ), ldda, queue ); #else magma_ccopy( kk-kp-1, dA( kp+1, kk ), 1, dA( kp, kp+1 ), ldda, queue ); #endif // now A(kp,kk) should be A(kk,kk), and copy to A(kp,kp) magma_ccopy( kp+1, dA( 0, kk ), 1, dA( 0, kp ), 1, queue ); #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dA, 2*(kp+ kp*ldda+dA_offset)+1, 1, queue, &event ); magma_queue_sync( queue ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dA, 2*(kp+ kp*ldda+dA_offset)+1, 1, queue, &event ); #endif } if( kstep == 1 ) { /* 1-by-1 pivot block D(k): column KW of W now holds W(k) = U(k)*D(k) where U(k) is the k-th column of U Store U(k) in column k of A */ magma_ccopy( k+1, dW( 0, kw ), 1, dA( 0, k ), 1, queue ); if ( k > 0 ) { magma_cgetvector_async( 1, dA( k, k ), 1, &Z, 1, queue, &event ); magma_queue_sync( queue ); R1 = d_one / MAGMA_C_REAL( Z ); magma_csscal( k, R1, dA( 0, k ), 1, queue ); /* Conjugate W(k) */ #if defined(PRECISION_z) || defined(PRECISION_c) magmablas_clacpy_cnjg( k, dW( 0, kw ), 1, dW( 0, kw ), 1, queue ); #endif } } else { /* 2-by-2 pivot block D(k): columns KW and KW-1 of W now hold ( W(k-1) W(k) ) = ( U(k-1) U(k) )*D(k) where U(k) and U(k-1) are the k-th and (k-1)-th columns of U */ if( k > 1 ) { /* Store U(k) and U(k-1) in columns k and k-1 of A */ magmablas_clascl_2x2( MagmaUpper, k-1, dW(0, kw-1), lddw, dA(0,k-1), ldda, &iinfo, queue ); } /* Copy D(k) to A */ magma_ccopymatrix( 2, 2, dW( k-1, kw-1 ), lddw, dA( k-1, k-1 ), ldda, queue ); /* Conjugate W(k) and W(k-1) */ #if defined(PRECISION_z) || defined(PRECISION_c) magmablas_clacpy_cnjg( k, dW( 0, kw ), 1, dW( 0, kw ), 1, queue ); magmablas_clacpy_cnjg( k-1, dW( 0, kw-1 ), 1, dW( 0, kw-1 ), 1, queue ); #endif } } /* Store details of the interchanges in ipiv */ if( kstep == 1 ) { ipiv[ k ] = 1+kp; } else { ipiv[ k ] = -(1+kp); ipiv[ k-1 ] = -(1+kp); } } /* Update the upper triangle of A11 (= A(1:k,1:k)) as A11 := A11 - U12*D*U12' = A11 - U12*W' computing blocks of NB columns at a time (note that conjg(W) is actually stored) */ kw = nb - (n-k); for (int j = ( k / nb )*nb; j >= 0; j -= nb ) { int jb = min( nb, k-j+1 ); #ifdef SYMMETRIC_UPDATE /* Update the upper triangle of the diagonal block */ for (int jj = j; jj < j + jb; jj++) { #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dA, 2*(jj+ jj*ldda+dA_offset)+1, 1, queue, &event ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dA, 2*(jj+ jj*ldda+dA_offset)+1, 1, queue, &event ); #endif magma_cgemv( MagmaNoTrans, jj-j+1, n-(k+1), c_mone, dA( j, k+1 ), ldda, dW( jj, kw+1 ), lddw, c_one, dA( j, jj ), 1, queue ); #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dA, 2*(jj+ jj*ldda+dA_offset)+1, 1, queue, &event ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dA, 2*(jj+ jj*ldda+dA_offset)+1, 1, queue, &event ); #endif } /* Update the rectangular superdiagonal block */ magma_cgemm( MagmaNoTrans, MagmaTrans, j, jb, n-(k+1), c_mone, dA( 0, k+1 ), ldda, dW( j, kw+1 ), lddw, c_one, dA( 0, j ), ldda, queue ); #else #if defined(PRECISION_z) magmablas_dlaset(MagmaUpperLower, 1, jb, 0, 0, dA, 2*(j+ j*ldda+dA_offset)+1, 2*(1+ldda), queue ); #elif defined(PRECISION_c) magmablas_slaset(MagmaUpperLower, 1, jb, 0, 0, dA, 2*(j+ j*ldda+dA_offset)+1, 2*(1+ldda), queue ); #endif magma_cgemm( MagmaNoTrans, MagmaTrans, j+jb, jb, n-(k+1), c_mone, dA( 0, k+1 ), ldda, dW( j, kw+1 ), lddw, c_one, dA( 0, j ), ldda, queue ); #if defined(PRECISION_z) magmablas_dlaset(MagmaUpperLower, 1, jb, 0, 0, dA, 2*(j+ j*ldda+dA_offset)+1, 2*(1+ldda), queue ); #elif defined(PRECISION_c) magmablas_slaset(MagmaUpperLower, 1, jb, 0, 0, dA, 2*(j+ j*ldda+dA_offset)+1, 2*(1+ldda), queue ); #endif #endif } /* Put U12 in standard form by partially undoing the interchanges in columns k+1:n */ for (int j = k+1; j < n;) { int jj = j; int jp = ipiv[ j ]; if( jp < 0 ) { jp = -jp; j = j + 1; } j = j + 1; jp = jp - 1; if( jp != jj && j < n ) magmablas_cswap( n-j, dA( jp, j ), ldda, dA( jj, j ), ldda, queue ); } // copying the panel back to CPU magma_cgetmatrix_async( n, n-(k+1), dA(0,k+1), ldda, A(0,k+1), lda, queue, &event ); magma_queue_sync( queue ); /* Set KB to the number of columns factorized */ *kb = n - (k+1); } else { /* Factorize the leading columns of A using the lower triangle of A and working forwards, and compute the matrix W = L21*D for use in updating A22 (note that conjg(W) is actually stored) K is the main loop index, increasing from 1 in steps of 1 or 2 */ int k; for (k = 0; k < min(nb-1,n); k += kstep) { /* Copy column K of A to column K of W and update it */ /* -------------------------------------------------------------- */ magma_ccopy( n-k, dA( k, k ), 1, dW( k, k ), 1, queue ); // set imaginary part of diagonal to be zero #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dW, 2*(k*lddw+k+dW_offset)+1, 1, queue, &event); magma_queue_sync( queue ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dW, 2*(k*lddw+k+dW_offset)+1, 1, queue, &event); magma_queue_sync( queue ); #endif /* -------------------------------------------------------------- */ magma_cgemv( MagmaNoTrans, n-k, k, c_mone, dA( k, 0 ), ldda, dW( k, 0 ), lddw, c_one, dW( k, k ), ione, queue ); // re-set imaginary part of diagonal to be zero #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dW, 2*(k*lddw+k+dW_offset)+1, 1, queue, &event ); magma_queue_sync( queue ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dW, 2*(k*lddw+k+dW_offset)+1, 1, queue, &event ); magma_queue_sync( queue ); #endif kstep = 1; /* Determine rows and columns to be interchanged and whether a 1-by-1 or 2-by-2 pivot block will be used */ magma_cgetvector_async( 1, dW( k, k ), 1, &Z, 1, queue, &event ); magma_queue_sync( queue ); abs_akk = fabs( MAGMA_C_REAL( Z ) ); /* imax is the row-index of the largest off-diagonal element in column K, and colmax is its absolute value */ if( k < n-1 ) { // magmablas is one-base imax = k + magma_icamax( n-k-1, dW(k+1,k), 1, queue ); magma_cgetvector( 1, dW( imax,k ), 1, &Z, 1, queue ); colmax = MAGMA_C_ABS1( Z ); } else { colmax = d_zero; } if ( max( abs_akk, colmax ) == 0.0 ) { /* Column K is zero: set INFO and continue */ if( *info == 0 ) *info = k; kp = k; // make sure the imaginary part of diagonal is zero #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dA, 2*(k*ldda+k+dA_offset)+1, 1, queue, &event ); magma_queue_sync( queue ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dA, 2*(k*ldda+k+dA_offset)+1, 1, queue, &event ); magma_queue_sync( queue ); #endif } else { if ( abs_akk >= alpha*colmax ) { /* no interchange, use 1-by-1 pivot block */ kp = k; } else { /* Copy column imax to column K+1 of W and update it */ #if defined(PRECISION_z) || defined(PRECISION_c) magmablas_clacpy_cnjg( imax-k, dA(imax,k), ldda, dW(k,k+1), 1, queue ); #else magma_ccopy( imax-k, dA( imax, k ), ldda, dW( k, k+1 ), 1, queue ); #endif magma_ccopy( n-imax, dA( imax, imax ), 1, dW( imax, k+1 ), 1, queue ); #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dW, 2*((k+1)*lddw+imax+dW_offset)+1, 1, queue, &event); magma_queue_sync( queue ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dW, 2*((k+1)*lddw+imax+dW_offset)+1, 1, queue, &event); magma_queue_sync( queue ); #endif magma_cgemv( MagmaNoTrans, n-k, k, c_mone, dA( k, 0 ), ldda, dW( imax, 0 ), lddw, c_one, dW( k, k+1 ), ione, queue ); #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dW, 2*((k+1)*lddw+imax+dW_offset)+1, 1, queue, &event); magma_queue_sync( queue ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dW, 2*((k+1)*lddw+imax+dW_offset)+1, 1, queue, &event); magma_queue_sync( queue ); #endif magma_cgetvector_async( 1, dW(imax,k+1), 1, &Zimax, 1, queue, &event); magma_queue_sync( queue ); /* jmax is the column-index of the largest off-diagonal element in row imax, and rowmax is its absolute value */ // magmablas is one-base jmax = k-1 + magma_icamax( imax-k, dW(k, k+1), 1, queue ); magma_cgetvector( 1, dW(jmax,k+1), 1, &Z, 1, queue ); rowmax = MAGMA_C_ABS1( Z ); if( imax < n-1 ) { // magmablas is one-base jmax = imax + magma_icamax( (n-1)-imax, dW(imax+1,k+1), 1, queue); magma_cgetvector( 1, dW(jmax,k+1), 1, &Z, 1, queue ); rowmax = max( rowmax, MAGMA_C_ABS1( Z ) ); } if( abs_akk >= alpha*colmax*( colmax / rowmax ) ) { /* no interchange, use 1-by-1 pivot block */ kp = k; } else if( fabs( MAGMA_C_REAL( Zimax ) ) >= alpha*rowmax ) { /* interchange rows and columns K and imax, use 1-by-1 pivot block */ kp = imax; /* copy column K+1 of W to column K */ magma_ccopy( n-k, dW( k, k+1 ), 1, dW( k, k ), 1, queue ); } else { /* interchange rows and columns K+1 and imax, use 2-by-2 pivot block */ kp = imax; kstep = 2; } } kk = k + kstep - 1; /* Updated column kp is already stored in column kk of W */ if( kp != kk ) { /* Copy non-updated column kk to column kp */ /* ------------------------------------------------------------------ */ #if defined(PRECISION_z) || defined(PRECISION_c) magmablas_clacpy_cnjg( kp-kk, dA( kk, kk ), 1, dA( kp, kk ), ldda, queue ); #else magma_ccopy( kp-kk, dA( kk, kk ), 1, dA( kp, kk ), ldda, queue ); #endif if ( kp < n ) { magma_ccopy( n-kp, dA( kp, kk), 1, dA( kp, kp ), 1, queue ); } /* ------------------------------------------------------------------ */ /* Interchange rows kk and kp in first kk columns of A and W */ magmablas_cswap( kk+1, dA( kk, 0 ), ldda, dA( kp, 0 ), ldda, queue ); magmablas_cswap( kk+1, dW( kk, 0 ), lddw, dW( kp, 0 ), lddw, queue ); } if ( kstep == 1 ) { /* 1-by-1 pivot block D(k): column k of W now holds W(k) = L(k)*D(k) where L(k) is the k-th column of L Store L(k) in column k of A */ magma_ccopy( n-k, dW( k, k ), 1, dA( k, k ), 1, queue ); if ( k < n-1 ) { magma_cgetvector_async( 1, dA(k,k), 1, &Z, 1, queue, &event ); magma_queue_sync( queue ); R1 = d_one / MAGMA_C_REAL( Z ); magma_csscal((n-1)-k, R1, dA( k+1,k ), 1, queue); /* Conjugate W(k) */ #if defined(PRECISION_z) || defined(PRECISION_c) magmablas_clacpy_cnjg( (n-1)-k, dW( k+1,k ), 1, dW( k+1,k ), 1, queue ); #endif } } else { /* 2-by-2 pivot block D(k): columns k and k+1 of W now hold ( W(k) W(k+1) ) = ( L(k) L(k+1) )*D(k) where L(k) and L(k+1) are the k-th and (k+1)-th columns of L */ magmablas_clascl_2x2( MagmaLower, n-(k+2), dW(k,k), lddw, dA(k+2,k), ldda, &iinfo, queue ); /* Copy D(k) to A */ magma_ccopymatrix( 2, 2, dW( k, k ), lddw, dA( k, k ), ldda, queue ); /* Conjugate W(k) and W(k+1) */ #if defined(PRECISION_z) || defined(PRECISION_c) magmablas_clacpy_cnjg( (n-1)-k, dW( k+1,k ), 1, dW( k+1,k ), 1, queue ); magmablas_clacpy_cnjg( (n-1)-k-1, dW( k+2,k+1), 1, dW( k+2,k+1 ), 1, queue ); #endif } } /* Store details of the interchanges in ipiv */ if ( kstep == 1 ) { ipiv[k] = kp+1; } else { ipiv[k] = -kp-1; ipiv[k+1] = -kp-1; } } /* Update the lower triangle of A22 (= A(k:n,k:n)) as A22 := A22 - L21*D*L21' = A22 - L21*W' computing blocks of NB columns at a time (note that conjg(W) is actually stored) */ for( int j = k; j < n; j += nb ) { int jb = min( nb, n-j ); /* Update the lower triangle of the diagonal block */ #ifdef SYMMETRIC_UPDATE for (int jj = j; jj < j + jb; jj++) { int jnb = j + jb - jj; /* -------------------------------------------------------- */ magma_cgemv( MagmaNoTrans, jnb, k, c_mone, dA( jj, 0 ), ldda, dW( jj, 0 ), lddw, c_one, dA( jj, jj ), ione, queue ); /* -------------------------------------------------------- */ } /* Update the rectangular subdiagonal block */ if( j+jb < n ) { int nk = n - (j+jb); /* -------------------------------------------- */ magma_cgemm( MagmaNoTrans, MagmaTrans, nk, jb, k, c_mone, dA( j+jb, 0 ), ldda, dW( j, 0 ), lddw, c_one, dA( j+jb, j ), ldda, queue ); /* ------------------------------------------- */ } #else #if defined(PRECISION_z) magmablas_dlaset(MagmaUpperLower, 1, jb, 0, 0, dA, 2*(j*ldda+j+dA_offset)+1, 2*(1+ldda), queue ); #elif defined(PRECISION_c) magmablas_slaset(MagmaUpperLower, 1, jb, 0, 0, dA, 2*(j*ldda+j+dA_offset)+1, 2*(1+ldda), queue ); #endif magma_cgemm( MagmaNoTrans, MagmaTrans, n-j, jb, k, c_mone, dA( j, 0 ), ldda, dW( j, 0 ), lddw, c_one, dA( j, j ), ldda, queue ); #if defined(PRECISION_z) magmablas_dlaset(MagmaUpperLower, 1, jb, 0, 0, dA, 2*(j*ldda+j+dA_offset)+1, 2*(1+ldda), queue ); #elif defined(PRECISION_c) magmablas_slaset(MagmaUpperLower, 1, jb, 0, 0, dA, 2*(j*ldda+j+dA_offset)+1, 2*(1+ldda), queue ); #endif #endif } /* Put L21 in standard form by partially undoing the interchanges in columns 1:k-1 */ for (int j = k; j > 0;) { int jj = j; int jp = ipiv[j-1]; if( jp < 0 ) { jp = -jp; j--; } j--; if ( jp != jj && j >= 1 ) { magmablas_cswap( j, dA( jp-1,0 ), ldda, dA( jj-1,0 ), ldda, queue ); } } // copying the panel back to CPU magma_cgetmatrix_async( n, k, dA(0,0), ldda, A(0,0), lda, queue, &event ); magma_queue_sync( queue ); /* Set KB to the number of columns factorized */ *kb = k; } return *info; /* End of CLAHEF */ }
/* //////////////////////////////////////////////////////////////////////////// -- testing zdot */ int main( int argc, char** argv ) { magma_int_t info = 0; // set queue for old dense routines magma_queue_t queue=NULL; magma_queue_create( /*devices[ opts->device ],*/ &queue ); magmablasGetKernelStream( &queue ); TESTING_INIT(); magma_c_matrix a={Magma_CSR}, b={Magma_CSR}, x={Magma_CSR}, y={Magma_CSR}, skp={Magma_CSR}; printf("#================================================================================================================================================\n"); printf("\n"); printf(" | runtime | GFLOPS\n"); printf("#n num_vecs | CUDOT CUGEMV MAGMAGEMV MDOT MDGM | CUDOT CUGEMV MAGMAGEMV MDOT MDGM \n"); printf("#------------------------------------------------------------------------------------------------------------------------------------------------\n"); printf("\n"); for( magma_int_t num_vecs=5; num_vecs<6; num_vecs+=1 ) { for( magma_int_t n=10000; n<100000001; n=n+10000 ) { int iters = 10; float computations = (2.* n * iters * num_vecs); magmaFloatComplex one = MAGMA_C_MAKE(1.0, 0.0); magmaFloatComplex zero = MAGMA_C_MAKE(0.0, 0.0); magmaFloatComplex alpha; #define ENABLE_TIMER #ifdef ENABLE_TIMER real_Double_t mdot1, mdot2, mdgm1, mdgm2, magmagemv1, magmagemv2, cugemv1, cugemv2, cudot1, cudot2; real_Double_t mdot_time, mdgm_time, magmagemv_time, cugemv_time, cudot_time; #endif CHECK( magma_cvinit( &a, Magma_DEV, n, num_vecs, one, queue )); CHECK( magma_cvinit( &b, Magma_DEV, num_vecs, 1, one, queue )); int min_ten = min(num_vecs, 15); CHECK( magma_cvinit( &x, Magma_DEV, min_ten, n, one, queue )); CHECK( magma_cvinit( &y, Magma_DEV, min_ten, n, one, queue )); CHECK( magma_cvinit( &skp, Magma_DEV, num_vecs, 1, zero, queue )); // warm up CHECK( magma_cgemvmdot( n, num_vecs, a.dval, b.dval, x.dval, y.dval, skp.dval, queue )); // CUDOT #ifdef ENABLE_TIMER cudot1 = magma_sync_wtime( queue ); #endif for( int h=0; h<iters; h++) { for( int l=0; l<num_vecs; l++) alpha = magma_cdotc(n, a.dval, 1, b.dval, 1); } #ifdef ENABLE_TIMER cudot2 = magma_sync_wtime( queue ); cudot_time=cudot2-cudot1; #endif // CUGeMV #ifdef ENABLE_TIMER cugemv1 = magma_sync_wtime( queue ); #endif for( int h=0; h<iters; h++) { magma_cgemv(MagmaTrans, n, num_vecs, one, a.dval, n, b.dval, 1, zero, skp.dval, 1); //h++; } #ifdef ENABLE_TIMER cugemv2 = magma_sync_wtime( queue ); cugemv_time=cugemv2-cugemv1; #endif // MAGMAGeMV #ifdef ENABLE_TIMER magmagemv1 = magma_sync_wtime( queue ); #endif for( int h=0; h<iters; h++) { magmablas_cgemv(MagmaTrans, n, num_vecs, one, a.dval, n, b.dval, 1, zero, skp.dval, 1); //h++; } #ifdef ENABLE_TIMER magmagemv2 = magma_sync_wtime( queue ); magmagemv_time=magmagemv2-magmagemv1; #endif // MDOT #ifdef ENABLE_TIMER mdot1 = magma_sync_wtime( queue ); #endif for( int h=0; h<iters; h++) { //magma_cmdotc( n, num_vecs, a.dval, b.dval, x.dval, y.dval, skp.dval, queue ); CHECK( magma_cmdotc( n, 2, a.dval, b.dval, x.dval, y.dval, skp.dval, queue )); CHECK( magma_cmdotc( n, 2, a.dval, b.dval, x.dval, y.dval, skp.dval, queue )); CHECK( magma_cmdotc( n, 1, a.dval, b.dval, x.dval, y.dval, skp.dval, queue )); //h++; } #ifdef ENABLE_TIMER mdot2 = magma_sync_wtime( queue ); mdot_time=mdot2-mdot1; #endif // MDGM #ifdef ENABLE_TIMER mdgm1 = magma_sync_wtime( queue ); #endif for( int h=0; h<iters; h++) { CHECK( magma_cgemvmdot( n, num_vecs, a.dval, b.dval, x.dval, y.dval, skp.dval, queue )); //h++; } #ifdef ENABLE_TIMER mdgm2 = magma_sync_wtime( queue ); mdgm_time=mdgm2-mdgm1; #endif //magma_cprint_gpu(num_vecs,1,skp.dval,num_vecs); //Chronometry #ifdef ENABLE_TIMER printf("%d %d %e %e %e %e %e %e %e %e %e %e\n", n, num_vecs, cudot_time/iters, (cugemv_time)/iters, (magmagemv_time)/iters, (mdot_time)/iters, (mdgm_time)/iters, (float)(computations)/(cudot_time*(1.e+09)), (float)(computations)/(cugemv_time*(1.e+09)), (float)(computations)/(magmagemv_time*(1.e+09)), (float)(computations)/(mdot_time*(1.e+09)), (float)(computations)/(mdgm_time*(1.e+09)) ); #endif magma_cmfree(&a, queue ); magma_cmfree(&b, queue ); magma_cmfree(&x, queue ); magma_cmfree(&y, queue ); magma_cmfree(&skp, queue ); } printf("#================================================================================================================================================\n"); printf("\n"); printf("\n"); } cleanup: magma_cmfree(&a, queue ); magma_cmfree(&b, queue ); magma_cmfree(&x, queue ); magma_cmfree(&y, queue ); magma_cmfree(&skp, queue ); magma_queue_destroy( queue ); TESTING_FINALIZE(); return info; }
/** Purpose ------- CLAHR2 reduces the first NB columns of a complex general n-BY-(n-k+1) matrix A so that elements below the k-th subdiagonal are zero. The reduction is performed by an orthogonal similarity transformation Q' * A * Q. The routine returns the matrices V and T which determine Q as a block reflector I - V*T*V', and also the matrix Y = A * V. (Note this is different than LAPACK, which computes Y = A * V * T.) This is an auxiliary routine called by CGEHRD. Arguments --------- @param[in] n INTEGER The order of the matrix A. @param[in] k INTEGER The offset for the reduction. Elements below the k-th subdiagonal in the first NB columns are reduced to zero. K < N. @param[in] nb INTEGER The number of columns to be reduced. @param[in,out] dA COMPLEX array on the GPU, dimension (LDDA,N-K+1) On entry, the n-by-(n-k+1) general matrix A. On exit, the elements in rows K:N of the first NB columns are overwritten with the matrix Y. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,N). @param[out] dV COMPLEX array on the GPU, dimension (LDDV, NB) On exit this n-by-nb array contains the Householder vectors of the transformation. @param[in] lddv INTEGER The leading dimension of the array dV. LDDV >= max(1,N). @param[in,out] A COMPLEX array, dimension (LDA,N-K+1) On entry, the n-by-(n-k+1) general matrix A. On exit, the elements on and above the k-th subdiagonal in the first NB columns are overwritten with the corresponding elements of the reduced matrix; the elements below the k-th subdiagonal, with the array TAU, represent the matrix Q as a product of elementary reflectors. The other columns of A are unchanged. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] tau COMPLEX array, dimension (NB) The scalar factors of the elementary reflectors. See Further Details. @param[out] T COMPLEX array, dimension (LDT,NB) The upper triangular matrix T. @param[in] ldt INTEGER The leading dimension of the array T. LDT >= NB. @param[out] Y COMPLEX array, dimension (LDY,NB) The n-by-nb matrix Y. @param[in] ldy INTEGER The leading dimension of the array Y. LDY >= N. Further Details --------------- The matrix Q is represented as a product of nb elementary reflectors Q = H(1) H(2) . . . H(nb). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i+k-1) = 0, v(i+k) = 1; v(i+k+1:n) is stored on exit in A(i+k+1:n,i), and tau in TAU(i). The elements of the vectors v together form the (n-k+1)-by-nb matrix V which is needed, with T and Y, to apply the transformation to the unreduced part of the matrix, using an update of the form: A := (I - V*T*V') * (A - Y*T*V'). The contents of A on exit are illustrated by the following example with n = 7, k = 3 and nb = 2: @verbatim ( a a a a a ) ( a a a a a ) ( a a a a a ) ( h h a a a ) ( v1 h a a a ) ( v1 v2 a a a ) ( v1 v2 a a a ) @endverbatim where "a" denotes an element of the original matrix A, h denotes a modified element of the upper Hessenberg matrix H, and vi denotes an element of the vector defining H(i). This implementation follows the hybrid algorithm and notations described in S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg form through hybrid GPU-based computing," University of Tennessee Computer Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219), May 24, 2009. @ingroup magma_cgeev_aux ********************************************************************/ extern "C" magma_int_t magma_clahr2( magma_int_t n, magma_int_t k, magma_int_t nb, magmaFloatComplex_ptr dA, magma_int_t ldda, magmaFloatComplex_ptr dV, magma_int_t lddv, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex *tau, magmaFloatComplex *T, magma_int_t ldt, magmaFloatComplex *Y, magma_int_t ldy ) { #define A(i_,j_) ( A + (i_) + (j_)*lda) #define Y(i_,j_) ( Y + (i_) + (j_)*ldy) #define T(i_,j_) ( T + (i_) + (j_)*ldt) #define dA(i_,j_) (dA + (i_) + (j_)*ldda) #define dV(i_,j_) (dV + (i_) + (j_)*lddv) magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t n_k_i_1, n_k; magmaFloatComplex scale; magma_int_t i; magmaFloatComplex ei = MAGMA_C_ZERO; magma_int_t info = 0; if (n < 0) { info = -1; } else if (k < 0 || k > n) { info = -2; } else if (nb < 1 || nb > n) { info = -3; } else if (ldda < max(1,n)) { info = -5; } else if (lddv < max(1,n)) { info = -7; } else if (lda < max(1,n)) { info = -9; } else if (ldt < max(1,nb)) { info = -12; } else if (ldy < max(1,n)) { info = -13; } if (info != 0) { magma_xerbla( __func__, -(info) ); return info; } // adjust from 1-based indexing k -= 1; if (n <= 1) return info; for (i = 0; i < nb; ++i) { n_k_i_1 = n - k - i - 1; n_k = n - k; if (i > 0) { // Update A(k:n-1,i); Update i-th column of A - Y * T * V' // This updates one more row than LAPACK does (row k), // making the block above the panel an even multiple of nb. // Use last column of T as workspace, w. // w(0:i-1, nb-1) = VA(k+i, 0:i-1)' blasf77_ccopy( &i, A(k+i,0), &lda, T(0,nb-1), &ione ); #if defined(PRECISION_z) || defined(PRECISION_c) // If complex, conjugate row of V. lapackf77_clacgv(&i, T(0,nb-1), &ione); #endif // w = T(0:i-1, 0:i-1) * w blasf77_ctrmv( "Upper", "No trans", "No trans", &i, T(0,0), &ldt, T(0,nb-1), &ione ); // A(k:n-1, i) -= Y(k:n-1, 0:i-1) * w blasf77_cgemv( "No trans", &n_k, &i, &c_neg_one, Y(k,0), &ldy, T(0,nb-1), &ione, &c_one, A(k,i), &ione ); // Apply I - V * T' * V' to this column (call it b) from the // left, using the last column of T as workspace, w. // // Let V = ( V1 ) and b = ( b1 ) (first i-1 rows) // ( V2 ) ( b2 ) // where V1 is unit lower triangular // w := b1 = A(k+1:k+i, i) blasf77_ccopy( &i, A(k+1,i), &ione, T(0,nb-1), &ione ); // w := V1' * b1 = VA(k+1:k+i, 0:i-1)' * w blasf77_ctrmv( "Lower", "Conj", "Unit", &i, A(k+1,0), &lda, T(0,nb-1), &ione ); // w := w + V2'*b2 = w + VA(k+i+1:n-1, 0:i-1)' * A(k+i+1:n-1, i) blasf77_cgemv( "Conj", &n_k_i_1, &i, &c_one, A(k+i+1,0), &lda, A(k+i+1,i), &ione, &c_one, T(0,nb-1), &ione ); // w := T'*w = T(0:i-1, 0:i-1)' * w blasf77_ctrmv( "Upper", "Conj", "Non-unit", &i, T(0,0), &ldt, T(0,nb-1), &ione ); // b2 := b2 - V2*w = A(k+i+1:n-1, i) - VA(k+i+1:n-1, 0:i-1) * w blasf77_cgemv( "No trans", &n_k_i_1, &i, &c_neg_one, A(k+i+1,0), &lda, T(0,nb-1), &ione, &c_one, A(k+i+1,i), &ione ); // w := V1*w = VA(k+1:k+i, 0:i-1) * w blasf77_ctrmv( "Lower", "No trans", "Unit", &i, A(k+1,0), &lda, T(0,nb-1), &ione ); // b1 := b1 - w = A(k+1:k+i-1, i) - w blasf77_caxpy( &i, &c_neg_one, T(0,nb-1), &ione, A(k+1,i), &ione ); // Restore diagonal element, saved below during previous iteration *A(k+i,i-1) = ei; } // Generate the elementary reflector H(i) to annihilate A(k+i+1:n-1,i) lapackf77_clarfg( &n_k_i_1, A(k+i+1,i), A(k+i+2,i), &ione, &tau[i] ); // Save diagonal element and set to one, to simplify multiplying by V ei = *A(k+i+1,i); *A(k+i+1,i) = c_one; // dV(i+1:n-k-1, i) = VA(k+i+1:n-1, i) magma_csetvector( n_k_i_1, A(k+i+1,i), 1, dV(i+1,i), 1 ); // Compute Y(k+1:n,i) = A vi // dA(k:n-1, i) = dA(k:n-1, i+1:n-k-1) * dV(i+1:n-k-1, i) magma_cgemv( MagmaNoTrans, n_k, n_k_i_1, c_one, dA(k,i+1), ldda, dV(i+1,i), ione, c_zero, dA(k,i), ione ); // Compute T(0:i,i) = [ -tau T V' vi ] // [ tau ] // T(0:i-1, i) = -tau VA(k+i+1:n-1, 0:i-1)' VA(k+i+1:n-1, i) scale = MAGMA_C_NEGATE( tau[i]); blasf77_cgemv( "Conj", &n_k_i_1, &i, &scale, A(k+i+1,0), &lda, A(k+i+1,i), &ione, &c_zero, T(0,i), &ione ); // T(0:i-1, i) = T(0:i-1, 0:i-1) * T(0:i-1, i) blasf77_ctrmv( "Upper", "No trans", "Non-unit", &i, T(0,0), &ldt, T(0,i), &ione ); *T(i,i) = tau[i]; // Y(k:n-1, i) = dA(k:n-1, i) magma_cgetvector( n-k, dA(k,i), 1, Y(k,i), 1 ); } // Restore diagonal element *A(k+nb,nb-1) = ei; return info; } /* magma_clahr2 */
/** @deprecated Purpose ------- CLAQPS computes a step of QR factorization with column pivoting of a complex M-by-N matrix A by using Blas-3. It tries to factorize NB columns from A starting from the row OFFSET+1, and updates all of the matrix with Blas-3 xGEMM. In some cases, due to catastrophic cancellations, it cannot factorize NB columns. Hence, the actual number of factorized columns is returned in KB. Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0 @param[in] offset INTEGER The number of rows of A that have been factorized in previous steps. @param[in] nb INTEGER The number of columns to factorize. @param[out] kb INTEGER The number of columns actually factorized. @param[in,out] A COMPLEX array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, block A(OFFSET+1:M,1:KB) is the triangular factor obtained and block A(1:OFFSET,1:N) has been accordingly pivoted, but no factorized. The rest of the matrix, block A(OFFSET+1:M,KB+1:N) has been updated. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[in,out] jpvt INTEGER array, dimension (N) JPVT(I) = K <==> Column K of the full matrix A has been permuted into position I in AP. @param[out] tau COMPLEX array, dimension (KB) The scalar factors of the elementary reflectors. @param[in,out] vn1 REAL array, dimension (N) The vector with the partial column norms. @param[in,out] vn2 REAL array, dimension (N) The vector with the exact column norms. @param[in,out] auxv COMPLEX array, dimension (NB) Auxiliar vector. @param[in,out] F COMPLEX array, dimension (LDF,NB) Matrix F' = L*Y'*A. @param[in] ldf INTEGER The leading dimension of the array F. LDF >= max(1,N). @ingroup magma_cgeqp3_aux ********************************************************************/ extern "C" magma_int_t magma_claqps_gpu(magma_int_t m, magma_int_t n, magma_int_t offset, magma_int_t nb, magma_int_t *kb, magmaFloatComplex *A, magma_int_t lda, magma_int_t *jpvt, magmaFloatComplex *tau, float *vn1, float *vn2, magmaFloatComplex *auxv, magmaFloatComplex *F, magma_int_t ldf) { #define A(i, j) (A + (i) + (j)*(lda )) #define F(i, j) (F + (i) + (j)*(ldf )) magmaFloatComplex c_zero = MAGMA_C_MAKE( 0.,0.); magmaFloatComplex c_one = MAGMA_C_MAKE( 1.,0.); magmaFloatComplex c_neg_one = MAGMA_C_MAKE(-1.,0.); magma_int_t ione = 1; magma_int_t i__1, i__2; //float d__1; magmaFloatComplex z__1; //magma_int_t j; magma_int_t k, rk; //magmaFloatComplex Akk; magmaFloatComplex *Aks; magmaFloatComplex tauk = MAGMA_C_ZERO; magma_int_t pvt; //float temp, temp2; float tol3z; magma_int_t itemp; float lsticc, *lsticcs; magma_int_t lastrk; magma_smalloc( &lsticcs, 1+256*(n+255)/256 ); lastrk = min( m, n + offset ); tol3z = magma_ssqrt( lapackf77_slamch("Epsilon")); lsticc = 0; k = 0; magma_cmalloc( &Aks, nb ); while( k < nb && lsticc == 0 ) { rk = offset + k; /* Determine ith pivot column and swap if necessary */ // subtract 1 from Fortran/CUBLAS isamax; pvt, k are 0-based. pvt = k + magma_isamax( n-k, &vn1[k], ione ) - 1; if (pvt != k) { /*if (pvt >= nb) { // 1. Start copy from GPU magma_cgetmatrix_async( m - offset - nb, 1, dA(offset + nb, pvt), ldda, A (offset + nb, pvt), lda, stream ); }*/ /* F gets swapped so F must be sent at the end to GPU */ i__1 = k; /*if (pvt < nb) { // no need of transfer if pivot is within the panel blasf77_cswap( &m, A(0, pvt), &ione, A(0, k), &ione ); } else { // 1. Finish copy from GPU magma_queue_sync( stream ); // 2. Swap as usual on CPU blasf77_cswap(&m, A(0, pvt), &ione, A(0, k), &ione); // 3. Restore the GPU magma_csetmatrix_async( m - offset - nb, 1, A (offset + nb, pvt), lda, dA(offset + nb, pvt), ldda, stream); }*/ magmablas_cswap( m, A(0, pvt), ione, A(0, k), ione ); //blasf77_cswap( &i__1, F(pvt,0), &ldf, F(k,0), &ldf ); magmablas_cswap( i__1, F(pvt, 0), ldf, F(k, 0), ldf); itemp = jpvt[pvt]; jpvt[pvt] = jpvt[k]; jpvt[k] = itemp; //vn1[pvt] = vn1[k]; //vn2[pvt] = vn2[k]; #if defined(PRECISION_d) || defined(PRECISION_z) //magma_dswap( 1, &vn1[pvt], 1, &vn1[k], 1 ); //magma_dswap( 1, &vn2[pvt], 1, &vn2[k], 1 ); magma_dswap( 2, &vn1[pvt], n+offset, &vn1[k], n+offset ); #else //magma_sswap( 1, &vn1[pvt], 1, &vn1[k], 1 ); //magma_sswap( 1, &vn2[pvt], 1, &vn2[k], 1 ); magma_sswap(2, &vn1[pvt], n+offset, &vn1[k], n+offset); #endif } /* Apply previous Householder reflectors to column K: A(RK:M,K) := A(RK:M,K) - A(RK:M,1:K-1)*F(K,1:K-1)'. Optimization: multiply with beta=0; wait for vector and subtract */ if (k > 0) { /*#if (defined(PRECISION_c) || defined(PRECISION_z)) for (j = 0; j < k; ++j) { *F(k,j) = MAGMA_C_CNJG( *F(k,j) ); } #endif*/ //#define RIGHT_UPDATE #ifdef RIGHT_UPDATE i__1 = m - offset - nb; i__2 = k; magma_cgemv( MagmaNoTrans, i__1, i__2, c_neg_one, A(offset+nb, 0), lda, F(k, 0), ldf, c_one, A(offset+nb, k), ione ); #else i__1 = m - rk; i__2 = k; /*blasf77_cgemv( MagmaNoTransStr, &i__1, &i__2, &c_neg_one, A(rk, 0), &lda, F(k, 0), &ldf, &c_one, A(rk, k), &ione );*/ magma_cgemv( MagmaNoTrans, i__1, i__2, c_neg_one, A(rk, 0), lda, F(k, 0), ldf, c_one, A(rk, k), ione ); #endif /*#if (defined(PRECISION_c) || defined(PRECISION_z)) for (j = 0; j < k; ++j) { *F(k,j) = MAGMA_C_CNJG( *F(k,j) ); } #endif*/ } /* Generate elementary reflector H(k). */ magma_clarfg_gpu(m-rk, A(rk, k), A(rk + 1, k), &tau[k], &vn1[k], &Aks[k]); //Akk = *A(rk, k); //*A(rk, k) = c_one; //magma_cgetvector( 1, &Aks[k], 1, &Akk, 1 ); /* needed to avoid the race condition */ if (k == 0) magma_csetvector( 1, &c_one, 1, A(rk, k), 1 ); else magma_ccopymatrix( 1, 1, A(offset, 0), 1, A(rk, k), 1 ); /* Compute Kth column of F: Compute F(K+1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) on the GPU */ if (k < n-1 || k > 0) magma_cgetvector( 1, &tau[k], 1, &tauk, 1 ); if (k < n-1) { i__1 = m - rk; i__2 = n - k - 1; /* Send the vector to the GPU */ //magma_csetmatrix( i__1, 1, A(rk, k), lda, dA(rk,k), ldda ); /* Multiply on GPU */ // was CALL CGEMV( 'Conjugate transpose', M-RK+1, N-K, // TAU( K ), A( RK, K+1 ), LDA, // A( RK, K ), 1, // CZERO, F( K+1, K ), 1 ) //magma_cgetvector( 1, &tau[k], 1, &tauk, 1 ); magma_cgemv( MagmaConjTrans, m-rk, n-k-1, tauk, A( rk, k+1 ), lda, A( rk, k ), 1, c_zero, F( k+1, k ), 1 ); //magma_cscal( m-rk, tau[k], F( k+1, k), 1 ); //magma_int_t i__3 = nb-k-1; //magma_int_t i__4 = i__2 - i__3; //magma_int_t i__5 = nb-k; //magma_cgemv( MagmaConjTrans, i__1 - i__5, i__2 - i__3, // tau[k], dA(rk +i__5, k+1+i__3), ldda, // dA(rk +i__5, k ), ione, // c_zero, dF(k+1+i__3, k ), ione ); //magma_cgetmatrix_async( i__2-i__3, 1, // dF(k + 1 +i__3, k), i__2, // F (k + 1 +i__3, k), i__2, stream ); //blasf77_cgemv( MagmaConjTransStr, &i__1, &i__3, // &tau[k], A(rk, k+1), &lda, // A(rk, k ), &ione, // &c_zero, F(k+1, k ), &ione ); //magma_queue_sync( stream ); //blasf77_cgemv( MagmaConjTransStr, &i__5, &i__4, // &tau[k], A(rk, k+1+i__3), &lda, // A(rk, k ), &ione, // &c_one, F(k+1+i__3, k ), &ione ); } /* Padding F(1:K,K) with zeros. for (j = 0; j <= k; ++j) { magma_csetvector( 1, &c_zero, 1, F(j, k), 1 ); }*/ /* Incremental updating of F: F(1:N,K) := F(1:N,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K). F(1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K) := tau(K)(A(RK:M,K+1:N)' - F(1:N,1:K-1)*A(RK:M,1:K-1)') A(RK:M,K) so, F is (updated A)*V */ //if (k > 0 && k < n-1) { if (k > 0) { //magma_cgetvector( 1, &tau[k], 1, &tauk, 1 ); z__1 = MAGMA_C_NEGATE( tauk ); #ifdef RIGHT_UPDATE i__1 = m - offset - nb; i__2 = k; magma_cgemv( MagmaConjTrans, i__1, i__2, z__1, A(offset+nb, 0), lda, A(offset+nb, k), ione, c_zero, auxv, ione ); i__1 = k; magma_cgemv( MagmaNoTrans, n-k-1, i__1, c_one, F(k+1,0), ldf, auxv, ione, c_one, F(k+1,k), ione ); #else i__1 = m - rk; i__2 = k; //blasf77_cgemv( MagmaConjTransStr, &i__1, &i__2, // &z__1, A(rk, 0), &lda, // A(rk, k), &ione, // &c_zero, auxv, &ione ); magma_cgemv( MagmaConjTrans, i__1, i__2, z__1, A(rk, 0), lda, A(rk, k), ione, c_zero, auxv, ione ); //i__1 = k; //blasf77_cgemv( MagmaNoTransStr, &n, &i__1, // &c_one, F(0,0), &ldf, // auxv, &ione, // &c_one, F(0,k), &ione ); /*magma_cgemv( MagmaNoTrans, n, i__1, c_one, F(0,0), ldf, auxv, ione, c_one, F(0,k), ione );*/ /* I think we only need stricly lower-triangular part :) */ magma_cgemv( MagmaNoTrans, n-k-1, i__2, c_one, F(k+1,0), ldf, auxv, ione, c_one, F(k+1,k), ione ); #endif } /* Optimization: On the last iteration start sending F back to the GPU */ /* Update the current row of A: A(RK,K+1:N) := A(RK,K+1:N) - A(RK,1:K)*F(K+1:N,1:K)'. */ if (k < n-1) { i__1 = n - k - 1; i__2 = k + 1; //blasf77_cgemm( MagmaNoTransStr, MagmaConjTransStr, &ione, &i__1, &i__2, // &c_neg_one, A(rk, 0 ), &lda, // F(k+1,0 ), &ldf, // &c_one, A(rk, k+1), &lda ); #ifdef RIGHT_UPDATE /* right-looking update of rows, */ magma_cgemm( MagmaNoTrans, MagmaConjTrans, nb-k, i__1, ione, c_neg_one, A(rk, k ), lda, F(k+1, k ), ldf, c_one, A(rk, k+1), lda ); #else /* left-looking update of rows, * * since F=A'v with original A, so no right-looking */ magma_cgemm( MagmaNoTrans, MagmaConjTrans, ione, i__1, i__2, c_neg_one, A(rk, 0 ), lda, F(k+1,0 ), ldf, c_one, A(rk, k+1), lda ); #endif } /* Update partial column norms. */ if (rk < min(m, n+offset)-1 ) { magmablas_scnrm2_row_check_adjust(n-k-1, tol3z, &vn1[k+1], &vn2[k+1], A(rk,k+1), lda, lsticcs); magma_device_sync(); #if defined(PRECISION_d) || defined(PRECISION_z) magma_sgetvector( 1, &lsticcs[0], 1, &lsticc, 1 ); #else magma_sgetvector( 1, &lsticcs[0], 1, &lsticc, 1 ); #endif } /*if (rk < lastrk) { for (j = k + 1; j < n; ++j) { if (vn1[j] != 0.) { // NOTE: The following 4 lines follow from the analysis in // Lapack Working Note 176. temp = MAGMA_C_ABS( *A(rk,j) ) / vn1[j]; temp = max( 0., ((1. + temp) * (1. - temp)) ); d__1 = vn1[j] / vn2[j]; temp2 = temp * (d__1 * d__1); if (temp2 <= tol3z) { vn2[j] = (float) lsticc; lsticc = j; } else { vn1[j] *= magma_ssqrt(temp); } } } }*/ //*A(rk, k) = Akk; //magma_csetvector( 1, &Akk, 1, A(rk, k), 1 ); //magma_cswap( 1, &Aks[k], 1, A(rk, k), 1 ); ++k; } magma_ccopymatrix( 1, k, Aks, 1, A(offset, 0), lda+1 ); // leave k as the last column done --k; *kb = k + 1; rk = offset + *kb - 1; /* Apply the block reflector to the rest of the matrix: A(OFFSET+KB+1:M,KB+1:N) := A(OFFSET+KB+1:M,KB+1:N) - A(OFFSET+KB+1:M,1:KB)*F(KB+1:N,1:KB)' */ if (*kb < min(n, m - offset)) { i__1 = m - rk - 1; i__2 = n - *kb; /* Send F to the GPU magma_csetmatrix( i__2, *kb, F (*kb, 0), ldf, dF(*kb, 0), i__2 );*/ magma_cgemm( MagmaNoTrans, MagmaConjTrans, i__1, i__2, *kb, c_neg_one, A(rk+1, 0 ), lda, F(*kb, 0 ), ldf, c_one, A(rk+1, *kb), lda ); } /* Recomputation of difficult columns. */ if ( lsticc > 0 ) { // printf( " -- recompute dnorms --\n" ); magmablas_scnrm2_check(m-rk-1, n-*kb, A(rk+1,*kb), lda, &vn1[*kb], lsticcs); magma_scopymatrix( n-*kb, 1, &vn1[*kb], *kb, &vn2[*kb], *kb); /*while( lsticc > 0 ) { itemp = (magma_int_t)(vn2[lsticc] >= 0. ? floor(vn2[lsticc] + .5) : -floor(.5 - vn2[lsticc])); i__1 = m - rk - 1; if (lsticc <= nb) vn1[lsticc] = magma_cblas_scnrm2( i__1, A(rk+1,lsticc), ione ); else { // Where is the data, CPU or GPU ? float r1, r2; r1 = magma_cblas_scnrm2( nb-k, A(rk+1,lsticc), ione ); r2 = magma_scnrm2(m-offset-nb, dA(offset + nb + 1, lsticc), ione); vn1[lsticc] = magma_ssqrt(r1*r1+r2*r2); } // NOTE: The computation of VN1( LSTICC ) relies on the fact that // SNRM2 does not fail on vectors with norm below the value of SQRT(SLAMCH('S')) vn2[lsticc] = vn1[lsticc]; lsticc = itemp;*/ } magma_free(Aks); magma_free(lsticcs); return MAGMA_SUCCESS; } /* magma_claqps */
magma_err_t magma_clabrd_gpu( magma_int_t m, magma_int_t n, magma_int_t nb, magmaFloatComplex *a, magma_int_t lda, magmaFloatComplex_ptr da, size_t da_offset, magma_int_t ldda, float *d, float *e, magmaFloatComplex *tauq, magmaFloatComplex *taup, magmaFloatComplex *x, magma_int_t ldx, magmaFloatComplex_ptr dx, size_t dx_offset, magma_int_t lddx, magmaFloatComplex *y, magma_int_t ldy, magmaFloatComplex_ptr dy, size_t dy_offset, magma_int_t lddy, magma_queue_t queue ) { /* -- MAGMA (version 1.1.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date January 2014 Purpose ======= CLABRD reduces the first NB rows and columns of a complex general m by n matrix A to upper or lower bidiagonal form by an orthogonal transformation Q' * A * P, and returns the matrices X and Y which are needed to apply the transformation to the unreduced part of A. If m >= n, A is reduced to upper bidiagonal form; if m < n, to lower bidiagonal form. This is an auxiliary routine called by SGEBRD Arguments ========= M (input) INTEGER The number of rows in the matrix A. N (input) INTEGER The number of columns in the matrix A. NB (input) INTEGER The number of leading rows and columns of A to be reduced. A (input/output) COMPLEX array, dimension (LDA,N) On entry, the m by n general matrix to be reduced. On exit, the first NB rows and columns of the matrix are overwritten; the rest of the array is unchanged. If m >= n, elements on and below the diagonal in the first NB columns, with the array TAUQ, represent the orthogonal matrix Q as a product of elementary reflectors; and elements above the diagonal in the first NB rows, with the array TAUP, represent the orthogonal matrix P as a product of elementary reflectors. If m < n, elements below the diagonal in the first NB columns, with the array TAUQ, represent the orthogonal matrix Q as a product of elementary reflectors, and elements on and above the diagonal in the first NB rows, with the array TAUP, represent the orthogonal matrix P as a product of elementary reflectors. See Further Details. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). D (output) COMPLEX array, dimension (NB) The diagonal elements of the first NB rows and columns of the reduced matrix. D(i) = A(i,i). E (output) COMPLEX array, dimension (NB) The off-diagonal elements of the first NB rows and columns of the reduced matrix. TAUQ (output) COMPLEX array dimension (NB) The scalar factors of the elementary reflectors which represent the orthogonal matrix Q. See Further Details. TAUP (output) COMPLEX array, dimension (NB) The scalar factors of the elementary reflectors which represent the orthogonal matrix P. See Further Details. X (output) COMPLEX array, dimension (LDX,NB) The m-by-nb matrix X required to update the unreduced part of A. LDX (input) INTEGER The leading dimension of the array X. LDX >= M. Y (output) COMPLEX array, dimension (LDY,NB) The n-by-nb matrix Y required to update the unreduced part of A. LDY (input) INTEGER The leading dimension of the array Y. LDY >= N. Further Details =============== The matrices Q and P are represented as products of elementary reflectors: Q = H(1) H(2) . . . H(nb) and P = G(1) G(2) . . . G(nb) Each H(i) and G(i) has the form: H(i) = I - tauq * v * v' and G(i) = I - taup * u * u' where tauq and taup are complex scalars, and v and u are complex vectors. If m >= n, v(1:i-1) = 0, v(i) = 1, and v(i:m) is stored on exit in A(i:m,i); u(1:i) = 0, u(i+1) = 1, and u(i+1:n) is stored on exit in A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i). If m < n, v(1:i) = 0, v(i+1) = 1, and v(i+1:m) is stored on exit in A(i+2:m,i); u(1:i-1) = 0, u(i) = 1, and u(i:n) is stored on exit in A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i). The elements of the vectors v and u together form the m-by-nb matrix V and the nb-by-n matrix U' which are needed, with X and Y, to apply the transformation to the unreduced part of the matrix, using a block update of the form: A := A - V*Y' - X*U'. The contents of A on exit are illustrated by the following examples with nb = 2: m = 6 and n = 5 (m > n): m = 5 and n = 6 (m < n): ( 1 1 u1 u1 u1 ) ( 1 u1 u1 u1 u1 u1 ) ( v1 1 1 u2 u2 ) ( 1 1 u2 u2 u2 u2 ) ( v1 v2 a a a ) ( v1 1 a a a a ) ( v1 v2 a a a ) ( v1 v2 a a a a ) ( v1 v2 a a a ) ( v1 v2 a a a a ) ( v1 v2 a a a ) where a denotes an element of the original matrix which is unchanged, vi denotes an element of the vector defining H(i), and ui an element of the vector defining G(i). ===================================================================== */ magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_zero = MAGMA_C_ZERO; magma_int_t c__1 = 1; magma_int_t a_dim1, a_offset, x_dim1, x_offset, y_dim1, y_offset, i__2, i__3; magma_int_t i__; magmaFloatComplex alpha; a_dim1 = lda; a_offset = 1 + a_dim1; a -= a_offset; --d; --e; --tauq; --taup; x_dim1 = ldx; x_offset = 1 + x_dim1; x -= x_offset; dx_offset -= 1 + lddx; y_dim1 = ldy; y_offset = 1 + y_dim1; y -= y_offset; dy_offset -= 1 + lddy; if (m <= 0 || n <= 0) { return 0; } magmaFloatComplex *f; magma_cmalloc_cpu( &f, max(n,m) ); magma_event_t event = NULL; if (m >= n) { /* Reduce to upper bidiagonal form */ for (i__ = 1; i__ <= nb; ++i__) { /* Update A(i:m,i) */ i__2 = m - i__ + 1; i__3 = i__ - 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv( &i__3, &y[i__+y_dim1], &ldy ); #endif blasf77_cgemv("No transpose", &i__2, &i__3, &c_neg_one, &a[i__ + a_dim1], &lda, &y[i__+y_dim1], &ldy, &c_one, &a[i__ + i__ * a_dim1], &c__1); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv( &i__3, &y[i__+y_dim1], &ldy ); #endif blasf77_cgemv("No transpose", &i__2, &i__3, &c_neg_one, &x[i__ + x_dim1], &ldx, &a[i__*a_dim1+1], &c__1, &c_one, &a[i__+i__*a_dim1], &c__1); /* Generate reflection Q(i) to annihilate A(i+1:m,i) */ alpha = a[i__ + i__ * a_dim1]; i__2 = m - i__ + 1; i__3 = i__ + 1; lapackf77_clarfg(&i__2, &alpha, &a[min(i__3,m) + i__ * a_dim1], &c__1, &tauq[i__]); d[i__] = MAGMA_C_REAL( alpha ); if (i__ < n) { a[i__ + i__ * a_dim1] = c_one; /* Compute Y(i+1:n,i) */ i__2 = m - i__ + 1; i__3 = n - i__; // 1. Send the block reflector A(i+1:m,i) to the GPU ------ magma_csetvector( i__2, a + i__ + i__ * a_dim1, 0, 1, da, da_offset + (i__-1)+(i__-1)* (ldda), 1, queue ); // 2. Multiply --------------------------------------------- magma_cgemv(MagmaConjTrans, i__2, i__3, c_one, da, da_offset + (i__-1) + ((i__-1) + 1) * (ldda), ldda, da, da_offset + (i__-1) + (i__-1) * (ldda), c__1, c_zero, dy, dy_offset + i__ + 1 + i__ * y_dim1, c__1, queue ); // 3. Put the result back ---------------------------------- magma_cgetmatrix_async( i__3, 1, dy, dy_offset + i__+1+i__*y_dim1, y_dim1, y+i__+1+i__*y_dim1, 0, y_dim1, queue, &event ); i__2 = m - i__ + 1; i__3 = i__ - 1; blasf77_cgemv(MagmaConjTransStr, &i__2, &i__3, &c_one, &a[i__ + a_dim1], &lda, &a[i__ + i__ * a_dim1], &c__1, &c_zero, &y[i__ * y_dim1 + 1], &c__1); i__2 = n - i__; i__3 = i__ - 1; blasf77_cgemv("N", &i__2, &i__3, &c_neg_one, &y[i__ + 1 +y_dim1], &ldy, &y[i__ * y_dim1 + 1], &c__1, &c_zero, f, &c__1); i__2 = m - i__ + 1; i__3 = i__ - 1; blasf77_cgemv(MagmaConjTransStr, &i__2, &i__3, &c_one, &x[i__ + x_dim1], &ldx, &a[i__ + i__ * a_dim1], &c__1, &c_zero, &y[i__ * y_dim1 + 1], &c__1); // 4. Synch to make sure the result is back ---------------- magma_event_sync( event ); if (i__3!=0){ i__2 = n - i__; blasf77_caxpy(&i__2, &c_one, f,&c__1, &y[i__+1+i__*y_dim1],&c__1); } i__2 = i__ - 1; i__3 = n - i__; blasf77_cgemv(MagmaConjTransStr, &i__2, &i__3, &c_neg_one, &a[(i__ + 1) * a_dim1 + 1], &lda, &y[i__ * y_dim1 + 1], &c__1, &c_one, &y[i__ + 1 + i__ * y_dim1], &c__1); i__2 = n - i__; blasf77_cscal(&i__2, &tauq[i__], &y[i__ + 1 + i__ * y_dim1], &c__1); /* Update A(i,i+1:n) */ i__2 = n - i__; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv( &i__2, &a[i__+(i__+1)*a_dim1], &lda ); lapackf77_clacgv( &i__, &a[i__+a_dim1], &lda ); #endif blasf77_cgemv("No transpose", &i__2, &i__, &c_neg_one, &y[i__ + 1 + y_dim1], &ldy, &a[i__ + a_dim1], &lda, &c_one, &a[i__ + (i__ + 1) * a_dim1], &lda); i__2 = i__ - 1; i__3 = n - i__; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv( &i__, &a[i__+a_dim1], &lda ); lapackf77_clacgv( &i__2, &x[i__+x_dim1], &ldx ); #endif blasf77_cgemv(MagmaConjTransStr, &i__2, &i__3, &c_neg_one, &a[(i__ + 1) * a_dim1 + 1], &lda, &x[i__ + x_dim1], &ldx, &c_one, &a[ i__ + (i__ + 1) * a_dim1], &lda); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv( &i__2, &x[i__+x_dim1], &ldx ); #endif /* Generate reflection P(i) to annihilate A(i,i+2:n) */ i__2 = n - i__; /* Computing MIN */ i__3 = i__ + 2; alpha = a[i__ + (i__ + 1) * a_dim1]; lapackf77_clarfg(&i__2, &alpha, &a[i__ + min( i__3,n) * a_dim1], &lda, &taup[i__]); e[i__] = MAGMA_C_REAL( alpha ); a[i__ + (i__ + 1) * a_dim1] = c_one; /* Compute X(i+1:m,i) */ i__2 = m - i__; i__3 = n - i__; // 1. Send the block reflector A(i+1:m,i) to the GPU ------ magma_csetvector( i__3, a + i__ + (i__ +1)* a_dim1, 0, lda, da, da_offset + (i__-1)+((i__-1)+1)*(ldda), ldda, queue ); // 2. Multiply --------------------------------------------- //magma_ccopy(i__3, da+(i__-1)+((i__-1)+1)*(ldda), ldda, // dy + 1 + lddy, 1); magma_cgemv(MagmaNoTrans, i__2, i__3, c_one, da, da_offset + (i__-1)+1+ ((i__-1)+1) * (ldda), ldda, da, da_offset + (i__-1) + ((i__-1)+1) * (ldda), ldda, //dy + 1 + lddy, 1, c_zero, dx, dx_offset + i__ + 1 + i__ * x_dim1, c__1, queue ); // 3. Put the result back ---------------------------------- magma_cgetmatrix_async( i__2, 1, dx, dx_offset + i__+1+i__*x_dim1, x_dim1, x+i__+1+i__*x_dim1, 0, x_dim1, queue, &event ); i__2 = n - i__; blasf77_cgemv(MagmaConjTransStr, &i__2, &i__, &c_one, &y[i__ + 1 + y_dim1], &ldy, &a[i__ + (i__ + 1) * a_dim1], &lda, &c_zero, &x[ i__ * x_dim1 + 1], &c__1); i__2 = m - i__; blasf77_cgemv("N", &i__2, &i__, &c_neg_one, &a[i__ + 1 + a_dim1], &lda, &x[i__ * x_dim1 + 1], &c__1, &c_zero, f, &c__1); i__2 = i__ - 1; i__3 = n - i__; blasf77_cgemv("N", &i__2, &i__3, &c_one, &a[(i__ + 1) * a_dim1 + 1], &lda, &a[i__ + (i__ + 1) * a_dim1], &lda, &c_zero, &x[i__ * x_dim1 + 1], &c__1); // 4. Synch to make sure the result is back ---------------- magma_event_sync( event ); if (i__!=0){ i__2 = m - i__; blasf77_caxpy(&i__2, &c_one, f,&c__1, &x[i__+1+i__*x_dim1],&c__1); } i__2 = m - i__; i__3 = i__ - 1; blasf77_cgemv("No transpose", &i__2, &i__3, &c_neg_one, &x[i__ + 1 + x_dim1], &ldx, &x[i__ * x_dim1 + 1], &c__1, &c_one, &x[ i__ + 1 + i__ * x_dim1], &c__1); i__2 = m - i__; blasf77_cscal(&i__2, &taup[i__], &x[i__ + 1 + i__ * x_dim1], &c__1); #if defined(PRECISION_z) || defined(PRECISION_c) i__2 = n - i__; lapackf77_clacgv( &i__2, &a[i__+(i__+1)*a_dim1], &lda ); // 4. Send the block reflector A(i+1:m,i) to the GPU after CLACGV() magma_csetvector( i__2, a + i__ + (i__ +1)* a_dim1, 0, lda, da, da_offset + (i__-1)+((i__-1)+1)*(ldda), ldda, queue ); #endif } } } else { /* Reduce to lower bidiagonal form */ for (i__ = 1; i__ <= nb; ++i__) { /* Update A(i,i:n) */ i__2 = n - i__ + 1; i__3 = i__ - 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv(&i__2, &a[i__ + i__ * a_dim1], &lda); lapackf77_clacgv(&i__3, &a[i__ + a_dim1], &lda); #endif blasf77_cgemv("No transpose", &i__2, &i__3, &c_neg_one, &y[i__ + y_dim1], &ldy, &a[i__ + a_dim1], &lda, &c_one, &a[i__ + i__ * a_dim1], &lda); i__2 = i__ - 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv(&i__3, &a[i__ + a_dim1], &lda); lapackf77_clacgv(&i__3, &x[i__ + x_dim1], &ldx); #endif i__3 = n - i__ + 1; blasf77_cgemv(MagmaConjTransStr, &i__2, &i__3, &c_neg_one, &a[i__ * a_dim1 + 1], &lda, &x[i__ + x_dim1], &ldx, &c_one, &a[i__ + i__ * a_dim1], &lda); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv(&i__2, &x[i__ + x_dim1], &ldx); #endif /* Generate reflection P(i) to annihilate A(i,i+1:n) */ i__2 = n - i__ + 1; /* Computing MIN */ i__3 = i__ + 1; alpha = a[i__ + i__ * a_dim1]; lapackf77_clarfg(&i__2, &alpha, &a[i__ + min(i__3,n) * a_dim1], &lda, &taup[i__]); d[i__] = MAGMA_C_REAL( alpha ); if (i__ < m) { a[i__ + i__ * a_dim1] = c_one; /* Compute X(i+1:m,i) */ i__2 = m - i__; i__3 = n - i__ + 1; // 1. Send the block reflector A(i,i+1:n) to the GPU ------ magma_csetvector( i__3, a + i__ + i__ * a_dim1, 0, lda, da, da_offset + (i__-1)+(i__-1)* (ldda), ldda, queue ); // 2. Multiply --------------------------------------------- //magma_ccopy(i__3, da+(i__-1)+(i__-1)*(ldda), ldda, // dy + 1 + lddy, 1); magma_cgemv(MagmaNoTrans, i__2, i__3, c_one, da, da_offset + (i__-1)+1 + (i__-1) * ldda, ldda, da, da_offset + (i__-1) + (i__-1) * ldda, ldda, // dy + 1 + lddy, 1, c_zero, dx, dx_offset + i__ + 1 + i__ * x_dim1, c__1, queue ); // 3. Put the result back ---------------------------------- magma_cgetmatrix_async( i__2, 1, dx, dx_offset + i__+1+i__*x_dim1, x_dim1, x+i__+1+i__*x_dim1, 0, x_dim1, queue, &event ); i__2 = n - i__ + 1; i__3 = i__ - 1; blasf77_cgemv(MagmaConjTransStr, &i__2, &i__3, &c_one, &y[i__ + y_dim1], &ldy, &a[i__ + i__ * a_dim1], &lda, &c_zero, &x[i__ * x_dim1 + 1], &c__1); i__2 = m - i__; i__3 = i__ - 1; blasf77_cgemv("No transpose", &i__2, &i__3, &c_neg_one, &a[i__ + 1 + a_dim1], &lda, &x[i__ * x_dim1 + 1], &c__1, &c_zero, f, &c__1); i__2 = i__ - 1; i__3 = n - i__ + 1; blasf77_cgemv("No transpose", &i__2, &i__3, &c_one, &a[i__ * a_dim1 + 1], &lda, &a[i__ + i__ * a_dim1], &lda, &c_zero, &x[i__ * x_dim1 + 1], &c__1); // 4. Synch to make sure the result is back ---------------- magma_event_sync( event ); if (i__2!=0){ i__3 = m - i__; blasf77_caxpy(&i__3, &c_one, f,&c__1, &x[i__+1+i__*x_dim1],&c__1); } i__2 = m - i__; i__3 = i__ - 1; blasf77_cgemv("No transpose", &i__2, &i__3, &c_neg_one, &x[i__ + 1 + x_dim1], &ldx, &x[i__ * x_dim1 + 1], &c__1, &c_one, &x[i__ + 1 + i__ * x_dim1], &c__1); i__2 = m - i__; blasf77_cscal(&i__2, &taup[i__], &x[i__ + 1 + i__ * x_dim1], &c__1); i__2 = n - i__ + 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv(&i__2, &a[i__ + i__ * a_dim1], &lda); magma_csetvector( i__2, a + i__ + (i__ )* a_dim1, 0, lda, da, da_offset + (i__-1)+ (i__-1)*(ldda), ldda, queue ); #endif /* Update A(i+1:m,i) */ i__2 = m - i__; i__3 = i__ - 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv(&i__3, &y[i__ + y_dim1], &ldy); #endif blasf77_cgemv("No transpose", &i__2, &i__3, &c_neg_one, &a[i__ + 1 + a_dim1], &lda, &y[i__ + y_dim1], &ldy, &c_one, &a[i__ + 1 + i__ * a_dim1], &c__1); i__2 = m - i__; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_clacgv(&i__3, &y[i__ + y_dim1], &ldy); #endif blasf77_cgemv("No transpose", &i__2, &i__, &c_neg_one, &x[i__ + 1 + x_dim1], &ldx, &a[i__ * a_dim1 + 1], &c__1, &c_one, &a[i__ + 1 + i__ * a_dim1], &c__1); /* Generate reflection Q(i) to annihilate A(i+2:m,i) */ i__2 = m - i__; i__3 = i__ + 2; alpha = a[i__ + 1 + i__ * a_dim1]; lapackf77_clarfg(&i__2, &alpha, &a[min(i__3,m) + i__ * a_dim1], &c__1, &tauq[i__]); e[i__] = MAGMA_C_REAL( alpha ); a[i__ + 1 + i__ * a_dim1] = c_one; /* Compute Y(i+1:n,i) */ i__2 = m - i__; i__3 = n - i__; // 1. Send the block reflector A(i+1:m,i) to the GPU ------ magma_csetvector( i__2, a + i__ +1+ i__ * a_dim1, 0, 1, da, da_offset + (i__-1)+1+ (i__-1)*(ldda), 1, queue ); // 2. Multiply --------------------------------------------- magma_cgemv(MagmaConjTrans, i__2, i__3, c_one, da, da_offset + (i__-1)+1+ ((i__-1)+1) * ldda, ldda, da, da_offset + (i__-1)+1+ (i__-1) * ldda, c__1, c_zero, dy, dy_offset + i__ + 1 + i__ * y_dim1, c__1, queue ); // 3. Put the result back ---------------------------------- magma_cgetmatrix_async( i__3, 1, dy, dy_offset + i__+1+i__*y_dim1, y_dim1, y+i__+1+i__*y_dim1, 0, y_dim1, queue, &event ); i__2 = m - i__; i__3 = i__ - 1; blasf77_cgemv(MagmaConjTransStr, &i__2, &i__3, &c_one, &a[i__ + 1 + a_dim1], &lda, &a[i__ + 1 + i__ * a_dim1], &c__1, &c_zero, &y[ i__ * y_dim1 + 1], &c__1); i__2 = n - i__; i__3 = i__ - 1; blasf77_cgemv("No transpose", &i__2, &i__3, &c_neg_one, &y[i__ + 1 + y_dim1], &ldy, &y[i__ * y_dim1 + 1], &c__1, &c_zero, f, &c__1); i__2 = m - i__; blasf77_cgemv(MagmaConjTransStr, &i__2, &i__, &c_one, &x[i__ + 1 + x_dim1], &ldx, &a[i__ + 1 + i__ * a_dim1], &c__1, &c_zero, &y[i__ * y_dim1 + 1], &c__1); // 4. Synch to make sure the result is back ---------------- magma_event_sync( event ); if (i__3!=0){ i__2 = n - i__; blasf77_caxpy(&i__2, &c_one, f,&c__1, &y[i__+1+i__*y_dim1],&c__1); } i__2 = n - i__; blasf77_cgemv(MagmaConjTransStr, &i__, &i__2, &c_neg_one, &a[(i__ + 1) * a_dim1 + 1], &lda, &y[i__ * y_dim1 + 1], &c__1, &c_one, &y[i__ + 1 + i__ * y_dim1], &c__1); i__2 = n - i__; blasf77_cscal(&i__2, &tauq[i__], &y[i__ + 1 + i__ * y_dim1], &c__1); #if defined(PRECISION_z) || defined(PRECISION_c) } else { i__2 = n - i__ + 1; lapackf77_clacgv(&i__2, &a[i__ + i__ * a_dim1], &lda); magma_csetvector( i__2, a + i__ + (i__ )* a_dim1, 0, lda, da, da_offset + (i__-1)+ (i__-1)*(ldda), ldda, queue ); #endif } } } magma_queue_sync( queue ); magma_free_cpu(f); return MAGMA_SUCCESS; } /* magma_clabrd */