extern "C" void magma_slarfxsym( magma_int_t N, float *A, magma_int_t LDA, float *V, float *TAU) { magma_int_t IONE=1; float dtmp; float Z_ZERO = MAGMA_S_ZERO; //float Z_ONE = MAGMA_S_ONE; float Z_MONE = MAGMA_S_NEG_ONE; float Z_HALF = MAGMA_S_HALF; //float WORK[N]; float *WORK; magma_smalloc_cpu( &WORK, N ); /* apply left and right on A(st:ed,st:ed)*/ //magma_slarfxsym(len,A(st,st),LDX,V(st),TAU(st)); /* X = AVtau */ blasf77_ssymv("L",&N, TAU, A, &LDA, V, &IONE, &Z_ZERO, WORK, &IONE); /* je calcul dtmp= X'*V */ dtmp = magma_cblas_sdot(N, WORK, IONE, V, IONE); /* je calcul 1/2 X'*V*t = 1/2*dtmp*tau */ dtmp = -dtmp * Z_HALF * (*TAU); /* je calcul W=X-1/2VX'Vt = X - dtmp*V */ /* for (j = 0; j < N; j++) WORK[j] = WORK[j] + (dtmp*V[j]); */ blasf77_saxpy(&N, &dtmp, V, &IONE, WORK, &IONE); /* performs the symmetric rank 2 operation A := alpha*x*y' + alpha*y*x' + A */ blasf77_ssyr2("L",&N,&Z_MONE,WORK,&IONE,V,&IONE,A,&LDA); magma_free_cpu(WORK); }
inline static void magma_slarfxsym_v2( magma_int_t n, float *A, magma_int_t lda, float *V, float *TAU, float *work) { /* WORK (workspace) float real array, dimension N */ magma_int_t ione = 1; float dtmp; float c_zero = MAGMA_S_ZERO; float c_neg_one= MAGMA_S_NEG_ONE; float c_half = MAGMA_S_HALF; /* X = AVtau */ blasf77_ssymv("L",&n, TAU, A, &lda, V, &ione, &c_zero, work, &ione); /* compute dtmp= X'*V */ dtmp = magma_cblas_sdot(n, work, ione, V, ione); /* compute 1/2 X'*V*t = 1/2*dtmp*tau */ dtmp = -dtmp * c_half * (*TAU); /* compute W=X-1/2VX'Vt = X - dtmp*V */ blasf77_saxpy(&n, &dtmp, V, &ione, work, &ione); /* performs the symmetric rank 2 operation A := alpha*x*y' + alpha*y*x' + A */ blasf77_ssyr2("L", &n, &c_neg_one, work, &ione, V, &ione, A, &lda); }
magma_int_t magma_spgmres( magma_s_sparse_matrix A, magma_s_vector b, magma_s_vector *x, magma_s_solver_par *solver_par, magma_s_preconditioner *precond_par ){ // prepare solver feedback solver_par->solver = Magma_PGMRES; solver_par->numiter = 0; solver_par->info = 0; // local variables float c_zero = MAGMA_S_ZERO, c_one = MAGMA_S_ONE, c_mone = MAGMA_S_NEG_ONE; magma_int_t dofs = A.num_rows; magma_int_t i, j, k, m = 0; magma_int_t restart = min( dofs-1, solver_par->restart ); magma_int_t ldh = restart+1; float nom, rNorm, RNorm, nom0, betanom, r0 = 0.; // CPU workspace //magma_setdevice(0); float *H, *HH, *y, *h1; magma_smalloc_pinned( &H, (ldh+1)*ldh ); magma_smalloc_pinned( &y, ldh ); magma_smalloc_pinned( &HH, ldh*ldh ); magma_smalloc_pinned( &h1, ldh ); // GPU workspace magma_s_vector r, q, q_t, z, z_t, t; magma_s_vinit( &t, Magma_DEV, dofs, c_zero ); magma_s_vinit( &r, Magma_DEV, dofs, c_zero ); magma_s_vinit( &q, Magma_DEV, dofs*(ldh+1), c_zero ); magma_s_vinit( &z, Magma_DEV, dofs*(ldh+1), c_zero ); magma_s_vinit( &z_t, Magma_DEV, dofs, c_zero ); q_t.memory_location = Magma_DEV; q_t.val = NULL; q_t.num_rows = q_t.nnz = dofs; float *dy, *dH = NULL; if (MAGMA_SUCCESS != magma_smalloc( &dy, ldh )) return MAGMA_ERR_DEVICE_ALLOC; if (MAGMA_SUCCESS != magma_smalloc( &dH, (ldh+1)*ldh )) return MAGMA_ERR_DEVICE_ALLOC; // GPU stream magma_queue_t stream[2]; magma_event_t event[1]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); magma_event_create( &event[0] ); magmablasSetKernelStream(stream[0]); magma_sscal( dofs, c_zero, x->val, 1 ); // x = 0 magma_scopy( dofs, b.val, 1, r.val, 1 ); // r = b nom0 = betanom = magma_snrm2( dofs, r.val, 1 ); // nom0= || r|| nom = nom0 * nom0; solver_par->init_res = nom0; H(1,0) = MAGMA_S_MAKE( nom0, 0. ); magma_ssetvector(1, &H(1,0), 1, &dH(1,0), 1); if ( (r0 = nom0 * RTOLERANCE ) < ATOLERANCE ) r0 = solver_par->epsilon; if ( nom < r0 ) return MAGMA_SUCCESS; //Chronometry real_Double_t tempo1, tempo2; magma_device_sync(); tempo1=magma_wtime(); if( solver_par->verbose > 0 ){ solver_par->res_vec[0] = nom0; solver_par->timing[0] = 0.0; } // start iteration for( solver_par->numiter= 1; solver_par->numiter<solver_par->maxiter; solver_par->numiter++ ){ for(k=1; k<=restart; k++) { magma_scopy(dofs, r.val, 1, q(k-1), 1); // q[0] = 1.0/||r|| magma_sscal(dofs, 1./H(k,k-1), q(k-1), 1); // (to be fused) q_t.val = q(k-1); magmablasSetKernelStream(stream[0]); // preconditioner // z[k] = M^(-1) q(k) magma_s_applyprecond_left( A, q_t, &t, precond_par ); magma_s_applyprecond_right( A, t, &z_t, precond_par ); magma_scopy(dofs, z_t.val, 1, z(k-1), 1); // r = A q[k] magma_s_spmv( c_one, A, z_t, c_zero, r ); // if (solver_par->ortho == Magma_MGS ) { // modified Gram-Schmidt for (i=1; i<=k; i++) { H(i,k) =magma_sdot(dofs, q(i-1), 1, r.val, 1); // H(i,k) = q[i] . r magma_saxpy(dofs,-H(i,k), q(i-1), 1, r.val, 1); // r = r - H(i,k) q[i] } H(k+1,k) = MAGMA_S_MAKE( magma_snrm2(dofs, r.val, 1), 0. ); // H(k+1,k) = ||r|| /*}else if (solver_par->ortho == Magma_FUSED_CGS ) { // fusing sgemv with snrm2 in classical Gram-Schmidt magmablasSetKernelStream(stream[0]); magma_scopy(dofs, r.val, 1, q(k), 1); // dH(1:k+1,k) = q[0:k] . r magmablas_sgemv(MagmaTrans, dofs, k+1, c_one, q(0), dofs, r.val, 1, c_zero, &dH(1,k), 1); // r = r - q[0:k-1] dH(1:k,k) magmablas_sgemv(MagmaNoTrans, dofs, k, c_mone, q(0), dofs, &dH(1,k), 1, c_one, r.val, 1); // 1) dH(k+1,k) = sqrt( dH(k+1,k) - dH(1:k,k) ) magma_scopyscale( dofs, k, r.val, q(k), &dH(1,k) ); // 2) q[k] = q[k] / dH(k+1,k) magma_event_record( event[0], stream[0] ); magma_queue_wait_event( stream[1], event[0] ); magma_sgetvector_async(k+1, &dH(1,k), 1, &H(1,k), 1, stream[1]); // asynch copy dH(1:(k+1),k) to H(1:(k+1),k) } else { // classical Gram-Schmidt (default) // > explicitly calling magmabls magmablasSetKernelStream(stream[0]); magmablas_sgemv(MagmaTrans, dofs, k, c_one, q(0), dofs, r.val, 1, c_zero, &dH(1,k), 1); // dH(1:k,k) = q[0:k-1] . r #ifndef SNRM2SCALE // start copying dH(1:k,k) to H(1:k,k) magma_event_record( event[0], stream[0] ); magma_queue_wait_event( stream[1], event[0] ); magma_sgetvector_async(k, &dH(1,k), 1, &H(1,k), 1, stream[1]); #endif // r = r - q[0:k-1] dH(1:k,k) magmablas_sgemv(MagmaNoTrans, dofs, k, c_mone, q(0), dofs, &dH(1,k), 1, c_one, r.val, 1); #ifdef SNRM2SCALE magma_scopy(dofs, r.val, 1, q(k), 1); // q[k] = r / H(k,k-1) magma_snrm2scale(dofs, q(k), dofs, &dH(k+1,k) ); // dH(k+1,k) = sqrt(r . r) and r = r / dH(k+1,k) magma_event_record( event[0], stream[0] ); // start sending dH(1:k,k) to H(1:k,k) magma_queue_wait_event( stream[1], event[0] ); // can we keep H(k+1,k) on GPU and combine? magma_sgetvector_async(k+1, &dH(1,k), 1, &H(1,k), 1, stream[1]); #else H(k+1,k) = MAGMA_S_MAKE( magma_snrm2(dofs, r.val, 1), 0. ); // H(k+1,k) = sqrt(r . r) if( k<solver_par->restart ){ magmablasSetKernelStream(stream[0]); magma_scopy(dofs, r.val, 1, q(k), 1); // q[k] = 1.0/H[k][k-1] r magma_sscal(dofs, 1./H(k+1,k), q(k), 1); // (to be fused) } #endif }*/ /* Minimization of || b-Ax || in H_k */ for (i=1; i<=k; i++) { HH(k,i) = magma_cblas_sdot( i+1, &H(1,k), 1, &H(1,i), 1 ); } h1[k] = H(1,k)*H(1,0); if (k != 1){ for (i=1; i<k; i++) { HH(k,i) = HH(k,i)/HH(i,i);// for (m=i+1; m<=k; m++){ HH(k,m) -= HH(k,i) * HH(m,i) * HH(i,i); } h1[k] -= h1[i] * HH(k,i); } } y[k] = h1[k]/HH(k,k); if (k != 1) for (i=k-1; i>=1; i--) { y[i] = h1[i]/HH(i,i); for (j=i+1; j<=k; j++) y[i] -= y[j] * HH(j,i); } m = k; rNorm = fabs(MAGMA_S_REAL(H(k+1,k))); }/* Minimization done */ // compute solution approximation magma_ssetmatrix(m, 1, y+1, m, dy, m ); magma_sgemv(MagmaNoTrans, dofs, m, c_one, z(0), dofs, dy, 1, c_one, x->val, 1); // compute residual magma_s_spmv( c_mone, A, *x, c_zero, r ); // r = - A * x magma_saxpy(dofs, c_one, b.val, 1, r.val, 1); // r = r + b H(1,0) = MAGMA_S_MAKE( magma_snrm2(dofs, r.val, 1), 0. ); // RNorm = H[1][0] = || r || RNorm = MAGMA_S_REAL( H(1,0) ); betanom = fabs(RNorm); if( solver_par->verbose > 0 ){ magma_device_sync(); tempo2=magma_wtime(); if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( betanom < r0 ) { break; } } magma_device_sync(); tempo2=magma_wtime(); solver_par->runtime = (real_Double_t) tempo2-tempo1; float residual; magma_sresidual( A, b, *x, &residual ); solver_par->iter_res = betanom; solver_par->final_res = residual; if( solver_par->numiter < solver_par->maxiter){ solver_par->info = 0; }else if( solver_par->init_res > solver_par->final_res ){ if( solver_par->verbose > 0 ){ if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = -2; } else{ if( solver_par->verbose > 0 ){ if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = -1; } // free pinned memory magma_free_pinned( H ); magma_free_pinned( y ); magma_free_pinned( HH ); magma_free_pinned( h1 ); // free GPU memory magma_free(dy); if (dH != NULL ) magma_free(dH); magma_s_vfree(&t); magma_s_vfree(&r); magma_s_vfree(&q); magma_s_vfree(&z); magma_s_vfree(&z_t); // free GPU streams and events magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_event_destroy( event[0] ); magmablasSetKernelStream(NULL); return MAGMA_SUCCESS; } /* magma_spgmres */
/** Purpose ------- SLATRD reduces NB rows and columns of a real symmetric matrix A to symmetric tridiagonal form by an orthogonal similarity transformation Q' * A * Q, and returns the matrices V and W which are needed to apply the transformation to the unreduced part of A. If UPLO = MagmaUpper, SLATRD reduces the last NB rows and columns of a matrix, of which the upper triangle is supplied; if UPLO = MagmaLower, SLATRD reduces the first NB rows and columns of a matrix, of which the lower triangle is supplied. This is an auxiliary routine called by SSYTRD. Arguments --------- @param[in] uplo magma_uplo_t Specifies whether the upper or lower triangular part of the symmetric matrix A is stored: - = MagmaUpper: Upper triangular - = MagmaLower: Lower triangular @param[in] n INTEGER The order of the matrix A. @param[in] nb INTEGER The number of rows and columns to be reduced. @param[in,out] A REAL array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading n-by-n upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading n-by-n lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit: - if UPLO = MagmaUpper, the last NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements above the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; - if UPLO = MagmaLower, the first NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements below the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= (1,N). @param[out] e REAL array, dimension (N-1) If UPLO = MagmaUpper, E(n-nb:n-1) contains the superdiagonal elements of the last NB columns of the reduced matrix; if UPLO = MagmaLower, E(1:nb) contains the subdiagonal elements of the first NB columns of the reduced matrix. @param[out] tau REAL array, dimension (N-1) The scalar factors of the elementary reflectors, stored in TAU(n-nb:n-1) if UPLO = MagmaUpper, and in TAU(1:nb) if UPLO = MagmaLower. See Further Details. @param[out] W REAL array, dimension (LDW,NB) The n-by-nb matrix W required to update the unreduced part of A. @param[in] ldw INTEGER The leading dimension of the array W. LDW >= max(1,N). @param dA @param ldda @param dW @param lddw Further Details --------------- If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary reflectors Q = H(n) H(n-1) . . . H(n-nb+1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i), and tau in TAU(i-1). If UPLO = MagmaLower, the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(nb). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i), and tau in TAU(i). The elements of the vectors v together form the n-by-nb matrix V which is needed, with W, to apply the transformation to the unreduced part of the matrix, using a symmetric rank-2k update of the form: A := A - V*W' - W*V'. The contents of A on exit are illustrated by the following examples with n = 5 and nb = 2: if UPLO = MagmaUpper: if UPLO = MagmaLower: ( a a a v4 v5 ) ( d ) ( a a v4 v5 ) ( 1 d ) ( a 1 v5 ) ( v1 1 a ) ( d 1 ) ( v1 v2 a a ) ( d ) ( v1 v2 a a a ) where d denotes a diagonal element of the reduced matrix, a denotes an element of the original matrix that is unchanged, and vi denotes an element of the vector defining H(i). @ingroup magma_ssyev_aux ********************************************************************/ extern "C" magma_int_t magma_slatrd( magma_uplo_t uplo, magma_int_t n, magma_int_t nb, float *A, magma_int_t lda, float *e, float *tau, float *W, magma_int_t ldw, magmaFloat_ptr dA, magma_int_t ldda, magmaFloat_ptr dW, magma_int_t lddw) { #define A(i_, j_) (A + (i_) + (j_)*lda) #define W(i_, j_) (W + (i_) + (j_)*ldw) #define dA(i_, j_) (dA + (i_) + (j_)*ldda) #define dW(i_, j_) (dW + (i_) + (j_)*lddw) const float c_neg_one = MAGMA_S_NEG_ONE; const float c_one = MAGMA_S_ONE; const float c_zero = MAGMA_S_ZERO; const magma_int_t ione = 1; float alpha, value; magma_int_t i, i_n, i_1, iw; /* Check arguments */ magma_int_t info = 0; if ( uplo != MagmaLower && uplo != MagmaUpper ) { info = -1; } else if ( n < 0 ) { info = -2; } else if ( nb < 1 ) { info = -3; } else if ( lda < max(1,n) ) { info = -5; } else if ( ldw < max(1,n) ) { info = -9; } else if ( ldda < max(1,n) ) { info = -11; } else if ( lddw < max(1,n) ) { info = -13; } if (info != 0) { magma_xerbla( __func__, -(info) ); return info; } /* Quick return if possible */ if (n == 0) { return info; } magma_queue_t stream; magma_queue_create( &stream ); float *f; magma_smalloc_cpu( &f, n ); if ( f == NULL ) { info = MAGMA_ERR_HOST_ALLOC; return info; } if (uplo == MagmaUpper) { /* Reduce last NB columns of upper triangle */ for (i = n-1; i >= n - nb; --i) { i_1 = i + 1; i_n = n - i - 1; iw = i - n + nb; if (i < n-1) { /* Update A(1:i,i) */ #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv( &i_n, W(i, iw+1), &ldw ); #endif blasf77_sgemv( "No transpose", &i_1, &i_n, &c_neg_one, A(0, i+1), &lda, W(i, iw+1), &ldw, &c_one, A(0, i), &ione ); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv( &i_n, W(i, iw+1), &ldw ); lapackf77_slacgv( &i_n, A(i, i+1), &lda ); #endif blasf77_sgemv( "No transpose", &i_1, &i_n, &c_neg_one, W(0, iw+1), &ldw, A(i, i+1), &lda, &c_one, A(0, i), &ione ); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv( &i_n, A(i, i+1), &lda ); #endif } if (i > 0) { /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */ alpha = *A(i-1, i); lapackf77_slarfg( &i, &alpha, A(0, i), &ione, &tau[i - 1] ); e[i-1] = MAGMA_S_REAL( alpha ); *A(i-1,i) = MAGMA_S_ONE; /* Compute W(1:i-1,i) */ // 1. Send the block reflector A(0:n-i-1,i) to the GPU magma_ssetvector( i, A(0, i), 1, dA(0, i), 1 ); magma_ssymv( MagmaUpper, i, c_one, dA(0, 0), ldda, dA(0, i), ione, c_zero, dW(0, iw), ione ); // 2. Start putting the result back (asynchronously) magma_sgetmatrix_async( i, 1, dW(0, iw), lddw, W(0, iw), ldw, stream ); if (i < n-1) { blasf77_sgemv( MagmaConjTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw, A(0, i), &ione, &c_zero, W(i+1, iw), &ione ); } // 3. Here is where we need it // TODO find the right place magma_queue_sync( stream ); if (i < n-1) { blasf77_sgemv( "No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda, W(i+1, iw), &ione, &c_one, W(0, iw), &ione ); blasf77_sgemv( MagmaConjTransStr, &i, &i_n, &c_one, A(0, i+1), &lda, A(0, i), &ione, &c_zero, W(i+1, iw), &ione ); blasf77_sgemv( "No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw, W(i+1, iw), &ione, &c_one, W(0, iw), &ione ); } blasf77_sscal( &i, &tau[i - 1], W(0, iw), &ione ); value = magma_cblas_sdot( i, W(0,iw), ione, A(0,i), ione ); alpha = tau[i - 1] * -0.5f * value; blasf77_saxpy( &i, &alpha, A(0, i), &ione, W(0, iw), &ione ); } } } else { /* Reduce first NB columns of lower triangle */ for (i = 0; i < nb; ++i) { /* Update A(i:n,i) */ i_n = n - i; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv( &i, W(i, 0), &ldw ); #endif blasf77_sgemv( "No transpose", &i_n, &i, &c_neg_one, A(i, 0), &lda, W(i, 0), &ldw, &c_one, A(i, i), &ione ); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv( &i, W(i, 0), &ldw ); lapackf77_slacgv( &i, A(i, 0), &lda ); #endif blasf77_sgemv( "No transpose", &i_n, &i, &c_neg_one, W(i, 0), &ldw, A(i, 0), &lda, &c_one, A(i, i), &ione ); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv( &i, A(i, 0), &lda ); #endif if (i < n-1) { /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */ i_n = n - i - 1; alpha = *A(i+1, i); lapackf77_slarfg( &i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i] ); e[i] = MAGMA_S_REAL( alpha ); *A(i+1,i) = MAGMA_S_ONE; /* Compute W(i+1:n,i) */ // 1. Send the block reflector A(i+1:n,i) to the GPU magma_ssetvector( i_n, A(i+1, i), 1, dA(i+1, i), 1 ); magma_ssymv( MagmaLower, i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero, dW(i+1, i), ione ); // 2. Start putting the result back (asynchronously) magma_sgetmatrix_async( i_n, 1, dW(i+1, i), lddw, W(i+1, i), ldw, stream ); blasf77_sgemv( MagmaConjTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw, A(i+1, i), &ione, &c_zero, W(0, i), &ione ); blasf77_sgemv( "No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda, W(0, i), &ione, &c_zero, f, &ione ); blasf77_sgemv( MagmaConjTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda, A(i+1, i), &ione, &c_zero, W(0, i), &ione ); // 3. Here is where we need it magma_queue_sync( stream ); if (i != 0) blasf77_saxpy( &i_n, &c_one, f, &ione, W(i+1, i), &ione ); blasf77_sgemv( "No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw, W(0, i), &ione, &c_one, W(i+1, i), &ione ); blasf77_sscal( &i_n, &tau[i], W(i+1,i), &ione ); value = magma_cblas_sdot( i_n, W(i+1,i), ione, A(i+1,i), ione ); alpha = tau[i] * -0.5f * value; blasf77_saxpy( &i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione ); } } } magma_free_cpu( f ); magma_queue_destroy( stream ); return info; } /* magma_slatrd */
extern "C" magma_int_t magma_slatrd( magma_uplo_t uplo, magma_int_t n, magma_int_t nb, float *a, magma_int_t lda, float *e, float *tau, float *w, magma_int_t ldw, magmaFloat_ptr da, size_t da_offset, magma_int_t ldda, magmaFloat_ptr dw, size_t dw_offset, magma_int_t lddw, magma_queue_t queue) { /* -- clMAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date November 2014 Purpose ======= SLATRD reduces NB rows and columns of a real symmetric matrix A to symmetric tridiagonal form by an orthogonal similarity transformation Q' * A * Q, and returns the matrices V and W which are needed to apply the transformation to the unreduced part of A. If UPLO = 'U', SLATRD reduces the last NB rows and columns of a matrix, of which the upper triangle is supplied; if UPLO = 'L', SLATRD reduces the first NB rows and columns of a matrix, of which the lower triangle is supplied. This is an auxiliary routine called by SSYTRD. Arguments ========= UPLO (input) CHARACTER*1 Specifies whether the upper or lower triangular part of the symmetric matrix A is stored: = 'U': Upper triangular = 'L': Lower triangular N (input) INTEGER The order of the matrix A. NB (input) INTEGER The number of rows and columns to be reduced. A (input/output) REAL array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = 'U', the leading n-by-n upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading n-by-n lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit: if UPLO = 'U', the last NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements above the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; if UPLO = 'L', the first NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements below the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. LDA (input) INTEGER The leading dimension of the array A. LDA >= (1,N). E (output) REAL array, dimension (N-1) If UPLO = 'U', E(n-nb:n-1) contains the superdiagonal elements of the last NB columns of the reduced matrix; if UPLO = 'L', E(1:nb) contains the subdiagonal elements of the first NB columns of the reduced matrix. TAU (output) REAL array, dimension (N-1) The scalar factors of the elementary reflectors, stored in TAU(n-nb:n-1) if UPLO = 'U', and in TAU(1:nb) if UPLO = 'L'. See Further Details. W (output) REAL array, dimension (LDW,NB) The n-by-nb matrix W required to update the unreduced part of A. LDW (input) INTEGER The leading dimension of the array W. LDW >= max(1,N). Further Details =============== If UPLO = 'U', the matrix Q is represented as a product of elementary reflectors Q = H(n) H(n-1) . . . H(n-nb+1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i), and tau in TAU(i-1). If UPLO = 'L', the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(nb). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i), and tau in TAU(i). The elements of the vectors v together form the n-by-nb matrix V which is needed, with W, to apply the transformation to the unreduced part of the matrix, using a symmetric rank-2k update of the form: A := A - V*W' - W*V'. The contents of A on exit are illustrated by the following examples with n = 5 and nb = 2: if UPLO = 'U': if UPLO = 'L': ( a a a v4 v5 ) ( d ) ( a a v4 v5 ) ( 1 d ) ( a 1 v5 ) ( v1 1 a ) ( d 1 ) ( v1 v2 a a ) ( d ) ( v1 v2 a a a ) where d denotes a diagonal element of the reduced matrix, a denotes an element of the original matrix that is unchanged, and vi denotes an element of the vector defining H(i). ===================================================================== */ magma_int_t i; float c_neg_one = MAGMA_S_NEG_ONE; float c_one = MAGMA_S_ONE; float c_zero = MAGMA_S_ZERO; float value = MAGMA_S_ZERO; magma_int_t ione = 1; magma_int_t i_n, i_1, iw; float alpha; float *f; if (n <= 0) { return 0; } magma_event_t event = NULL; magma_smalloc_cpu( &f, n ); assert( f != NULL ); // TODO return error, or allocate outside slatrd if (uplo == MagmaUpper) { /* Reduce last NB columns of upper triangle */ for (i = n-1; i >= n - nb; --i) { i_1 = i + 1; i_n = n - i - 1; iw = i - n + nb; if (i < n-1) { /* Update A(1:i,i) */ #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv(&i_n, W(i, iw+1), &ldw); #endif blasf77_sgemv("No transpose", &i_1, &i_n, &c_neg_one, A(0, i+1), &lda, W(i, iw+1), &ldw, &c_one, A(0, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv(&i_n, W(i, iw+1), &ldw); lapackf77_slacgv(&i_n, A(i, i+1), &lda); #endif blasf77_sgemv("No transpose", &i_1, &i_n, &c_neg_one, W(0, iw+1), &ldw, A(i, i+1), &lda, &c_one, A(0, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv(&i_n, A(i, i+1), &lda); #endif } if (i > 0) { /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */ alpha = *A(i-1, i); lapackf77_slarfg(&i, &alpha, A(0, i), &ione, &tau[i - 1]); e[i-1] = MAGMA_S_REAL( alpha ); *A(i-1,i) = MAGMA_S_ONE; /* Compute W(1:i-1,i) */ // 1. Send the block reflector A(0:n-i-1,i) to the GPU magma_ssetvector( i, A(0, i), 1, dA(0, i), 1, queue ); magma_ssymv(MagmaUpper, i, c_one, dA(0, 0), ldda, dA(0, i), ione, c_zero, dW(0, iw), ione, queue); // 2. Start putting the result back (asynchronously) magma_sgetmatrix_async( i, 1, dW(0, iw), lddw, W(0, iw), ldw, queue, &event ); if (i < n-1) { blasf77_sgemv(MagmaConjTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw, A(0, i), &ione, &c_zero, W(i+1, iw), &ione); } // 3. Here is where we need it // TODO find the right place magma_event_sync(event); if (i < n-1) { blasf77_sgemv("No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda, W(i+1, iw), &ione, &c_one, W(0, iw), &ione); blasf77_sgemv(MagmaConjTransStr, &i, &i_n, &c_one, A(0, i+1), &lda, A(0, i), &ione, &c_zero, W(i+1, iw), &ione); blasf77_sgemv("No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw, W(i+1, iw), &ione, &c_one, W(0, iw), &ione); } blasf77_sscal(&i, &tau[i - 1], W(0, iw), &ione); value = magma_cblas_sdot( i, W(0,iw), ione, A(0,i), ione ); alpha = tau[i - 1] * -0.5f * value; blasf77_saxpy(&i, &alpha, A(0, i), &ione, W(0, iw), &ione); } } } else { /* Reduce first NB columns of lower triangle */ for (i = 0; i < nb; ++i) { /* Update A(i:n,i) */ i_n = n - i; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv(&i, W(i, 0), &ldw); #endif blasf77_sgemv("No transpose", &i_n, &i, &c_neg_one, A(i, 0), &lda, W(i, 0), &ldw, &c_one, A(i, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv(&i, W(i, 0), &ldw); lapackf77_slacgv(&i, A(i, 0), &lda); #endif blasf77_sgemv("No transpose", &i_n, &i, &c_neg_one, W(i, 0), &ldw, A(i, 0), &lda, &c_one, A(i, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv(&i, A(i, 0), &lda); #endif if (i < n-1) { /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */ i_n = n - i - 1; alpha = *A(i+1, i); lapackf77_slarfg(&i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i]); e[i] = MAGMA_S_REAL( alpha ); *A(i+1,i) = MAGMA_S_ONE; /* Compute W(i+1:n,i) */ // 1. Send the block reflector A(i+1:n,i) to the GPU magma_ssetvector( i_n, A(i+1, i), 1, dA(i+1, i), 1, queue ); magma_ssymv(MagmaLower, i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero, dW(i+1, i), ione, queue); // 2. Start putting the result back (asynchronously) magma_sgetmatrix_async( i_n, 1, dW(i+1, i), lddw, W(i+1, i), ldw, queue, &event ); blasf77_sgemv(MagmaConjTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw, A(i+1, i), &ione, &c_zero, W(0, i), &ione); blasf77_sgemv("No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda, W(0, i), &ione, &c_zero, f, &ione); blasf77_sgemv(MagmaConjTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda, A(i+1, i), &ione, &c_zero, W(0, i), &ione); // 3. Here is where we need it magma_event_sync(event); if (i != 0) blasf77_saxpy(&i_n, &c_one, f, &ione, W(i+1, i), &ione); blasf77_sgemv("No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw, W(0, i), &ione, &c_one, W(i+1, i), &ione); blasf77_sscal(&i_n, &tau[i], W(i+1,i), &ione); value = magma_cblas_sdot( i_n, W(i+1,i), ione, A(i+1,i), ione ); alpha = tau[i] * -0.5f * value; blasf77_saxpy(&i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione); } } } magma_free_cpu( f ); return 0; } /* magma_slatrd */