/** Purpose ------- SLAEX3 finds the roots of the secular equation, as defined by the values in D, W, and RHO, between 1 and K. It makes the appropriate calls to SLAED4 and then updates the eigenvectors by multiplying the matrix of eigenvectors of the pair of eigensystems being combined by the matrix of eigenvectors of the K-by-K system which is solved here. It is used in the last step when only a part of the eigenvectors is required. It compute only the required part of the eigenvectors and the rest is not used. This code makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments --------- @param[in] k INTEGER The number of terms in the rational function to be solved by SLAED4. K >= 0. @param[in] n INTEGER The number of rows and columns in the Q matrix. N >= K (deflation may result in N > K). @param[in] n1 INTEGER The location of the last eigenvalue in the leading submatrix. min(1,N) <= N1 <= N/2. @param[out] d REAL array, dimension (N) D(I) contains the updated eigenvalues for 1 <= I <= K. @param[out] Q REAL array, dimension (LDQ,N) Initially the first K columns are used as workspace. On output the columns ??? to ??? contain the updated eigenvectors. @param[in] ldq INTEGER The leading dimension of the array Q. LDQ >= max(1,N). @param[in] rho REAL The value of the parameter in the rank one update equation. RHO >= 0 required. @param[in,out] dlamda REAL array, dimension (K) The first K elements of this array contain the old roots of the deflated updating problem. These are the poles of the secular equation. May be changed on output by having lowest order bit set to zero on Cray X-MP, Cray Y-MP, Cray-2, or Cray C-90, as described above. @param[in] Q2 REAL array, dimension (LDQ2, N) The first K columns of this matrix contain the non-deflated eigenvectors for the split problem. TODO what is LDQ2? @param[in] indx INTEGER array, dimension (N) The permutation used to arrange the columns of the deflated Q matrix into three groups (see SLAED2). The rows of the eigenvectors found by SLAED4 must be likewise permuted before the matrix multiply can take place. @param[in] ctot INTEGER array, dimension (4) A count of the total number of the various types of columns in Q, as described in INDX. The fourth column type is any column which has been deflated. @param[in,out] w REAL array, dimension (K) The first K elements of this array contain the components of the deflation-adjusted updating vector. Destroyed on output. @param s (workspace) REAL array, dimension (N1 + 1)*K Will contain the eigenvectors of the repaired matrix which will be multiplied by the previously accumulated eigenvectors to update the system. @param[out] indxq INTEGER array, dimension (N) On exit, the permutation which will reintegrate the subproblems back into sorted order, i.e. D( INDXQ( I = 1, N ) ) will be in ascending order. @param dwork (workspace) REAL array, dimension (3*N*N/2+3*N) @param[in] range magma_range_t - = MagmaRangeAll: all eigenvalues will be found. - = MagmaRangeV: all eigenvalues in the half-open interval (VL,VU] will be found. - = MagmaRangeI: the IL-th through IU-th eigenvalues will be found. TODO verify range, vl, vu, il, iu -- copied from slaex1. 
@param[in] vl REAL @param[in] vu REAL if RANGE=MagmaRangeV, the lower and upper bounds of the interval to be searched for eigenvalues. VL < VU. Not referenced if RANGE = MagmaRangeAll or MagmaRangeI. @param[in] il INTEGER @param[in] iu INTEGER if RANGE=MagmaRangeI, the indices (in ascending order) of the smallest and largest eigenvalues to be returned. 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. Not referenced if RANGE = MagmaRangeAll or MagmaRangeV. @param[out] info INTEGER - = 0: successful exit. - < 0: if INFO = -i, the i-th argument had an illegal value. - > 0: if INFO = 1, an eigenvalue did not converge Further Details --------------- Based on contributions by Jeff Rutter, Computer Science Division, University of California at Berkeley, USA Modified by Francoise Tisseur, University of Tennessee. @ingroup magma_ssyev_aux ********************************************************************/ extern "C" magma_int_t magma_slaex3(magma_int_t k, magma_int_t n, magma_int_t n1, float* d, float* Q, magma_int_t ldq, float rho, float* dlamda, float* Q2, magma_int_t* indx, magma_int_t* ctot, float* w, float* s, magma_int_t* indxq, float* dwork, magma_range_t range, float vl, float vu, magma_int_t il, magma_int_t iu, magma_int_t* info ) { #define Q(i_,j_) (Q + (i_) + (j_)*ldq) float d_one = 1.; float d_zero = 0.; magma_int_t ione = 1; magma_int_t ineg_one = -1; magma_int_t iil, iiu, rk; float* dq2= dwork; float* ds = dq2 + n*(n/2+1); float* dq = ds + n*(n/2+1); magma_int_t lddq = n/2 + 1; magma_int_t i, iq2, j, n12, n2, n23, tmp, lq2; float temp; magma_int_t alleig, valeig, indeig; alleig = (range == MagmaRangeAll); valeig = (range == MagmaRangeV); indeig = (range == MagmaRangeI); *info = 0; if (k < 0) *info=-1; else if (n < k) *info=-2; else if (ldq < max(1,n)) *info=-6; else if (! (alleig || valeig || indeig)) *info = -15; else { if (valeig) { if (n > 0 && vu <= vl) *info = -17; } else if (indeig) { if (il < 1 || il > max(1,n)) *info = -18; else if (iu < min(n,il) || iu > n) *info = -19; } } if (*info != 0) { magma_xerbla(__func__, -(*info)); return *info; } // Quick return if possible if (k == 0) return *info; /* Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can be computed with high relative accuracy (barring over/underflow). This is a problem on machines without a guard digit in add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2). The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I), which on any of these machines zeros out the bottommost bit of DLAMDA(I) if it is 1; this makes the subsequent subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation occurs. On binary machines with a guard digit (almost all machines) it does not change DLAMDA(I) at all. On hexadecimal and decimal machines with a guard digit, it slightly changes the bottommost bits of DLAMDA(I). It does not account for hexadecimal or decimal machines without guard digits (we know of none). 
We use a subroutine call to compute 2*DLAMBDA(I) to prevent optimizing compilers from eliminating this code.*/ n2 = n - n1; n12 = ctot[0] + ctot[1]; n23 = ctot[1] + ctot[2]; iq2 = n1 * n12; lq2 = iq2 + n2 * n23; magma_ssetvector_async( lq2, Q2, 1, dq2, 1, NULL ); #ifdef _OPENMP ///////////////////////////////////////////////////////////////////////////////// //openmp implementation ///////////////////////////////////////////////////////////////////////////////// magma_timer_t time=0; timer_start( time ); #pragma omp parallel private(i, j, tmp, temp) { magma_int_t id = omp_get_thread_num(); magma_int_t tot = omp_get_num_threads(); magma_int_t ib = ( id * k) / tot; //start index of local loop magma_int_t ie = ((id+1) * k) / tot; //end index of local loop magma_int_t ik = ie - ib; //number of local indices for (i = ib; i < ie; ++i) dlamda[i]=lapackf77_slamc3(&dlamda[i], &dlamda[i]) - dlamda[i]; for (j = ib; j < ie; ++j) { magma_int_t tmpp=j+1; magma_int_t iinfo = 0; lapackf77_slaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo); // If the zero finder fails, the computation is terminated. if (iinfo != 0) { #pragma omp critical (info) *info=iinfo; break; } } #pragma omp barrier if (*info == 0) { #pragma omp single { //Prepare the INDXQ sorting permutation. magma_int_t nk = n - k; lapackf77_slamrg( &k, &nk, d, &ione, &ineg_one, indxq); //compute the lower and upper bound of the non-deflated eigenvectors if (valeig) magma_svrange(k, d, &iil, &iiu, vl, vu); else if (indeig) magma_sirange(k, indxq, &iil, &iiu, il, iu); else { iil = 1; iiu = k; } rk = iiu - iil + 1; } if (k == 2) { #pragma omp single { for (j = 0; j < k; ++j) { w[0] = *Q(0,j); w[1] = *Q(1,j); i = indx[0] - 1; *Q(0,j) = w[i]; i = indx[1] - 1; *Q(1,j) = w[i]; } } } else if (k != 1) { // Compute updated W. blasf77_scopy( &ik, &w[ib], &ione, &s[ib], &ione); // Initialize W(I) = Q(I,I) tmp = ldq + 1; blasf77_scopy( &ik, Q(ib,ib), &tmp, &w[ib], &ione); for (j = 0; j < k; ++j) { magma_int_t i_tmp = min(j, ie); for (i = ib; i < i_tmp; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); i_tmp = max(j+1, ib); for (i = i_tmp; i < ie; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); } for (i = ib; i < ie; ++i) w[i] = copysign( sqrt( -w[i] ), s[i]); #pragma omp barrier //reduce the number of used threads to have enough S workspace tot = min(n1, omp_get_num_threads()); if (id < tot) { ib = ( id * rk) / tot + iil - 1; ie = ((id+1) * rk) / tot + iil - 1; ik = ie - ib; } else { ib = -1; ie = -1; ik = -1; } // Compute eigenvectors of the modified rank-1 modification. for (j = ib; j < ie; ++j) { for (i = 0; i < k; ++i) s[id*k + i] = w[i] / *Q(i,j); temp = magma_cblas_snrm2( k, s+id*k, 1 ); for (i = 0; i < k; ++i) { magma_int_t iii = indx[i] - 1; *Q(i,j) = s[id*k + iii] / temp; } } } } } if (*info != 0) return *info; timer_stop( time ); timer_printf( "eigenvalues/vector D+zzT = %6.2f\n", time ); #else ///////////////////////////////////////////////////////////////////////////////// // Non openmp implementation ///////////////////////////////////////////////////////////////////////////////// magma_timer_t time=0; timer_start( time ); for (i = 0; i < k; ++i) dlamda[i]=lapackf77_slamc3(&dlamda[i], &dlamda[i]) - dlamda[i]; for (j = 0; j < k; ++j) { magma_int_t tmpp=j+1; magma_int_t iinfo = 0; lapackf77_slaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo); // If the zero finder fails, the computation is terminated. if (iinfo != 0) *info=iinfo; } if (*info != 0) return *info; //Prepare the INDXQ sorting permutation. 
magma_int_t nk = n - k; lapackf77_slamrg( &k, &nk, d, &ione, &ineg_one, indxq); //compute the lower and upper bound of the non-deflated eigenvectors if (valeig) magma_svrange(k, d, &iil, &iiu, vl, vu); else if (indeig) magma_sirange(k, indxq, &iil, &iiu, il, iu); else { iil = 1; iiu = k; } rk = iiu - iil + 1; if (k == 2) { for (j = 0; j < k; ++j) { w[0] = *Q(0,j); w[1] = *Q(1,j); i = indx[0] - 1; *Q(0,j) = w[i]; i = indx[1] - 1; *Q(1,j) = w[i]; } } else if (k != 1) { // Compute updated W. blasf77_scopy( &k, w, &ione, s, &ione); // Initialize W(I) = Q(I,I) tmp = ldq + 1; blasf77_scopy( &k, Q, &tmp, w, &ione); for (j = 0; j < k; ++j) { for (i = 0; i < j; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); for (i = j+1; i < k; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); } for (i = 0; i < k; ++i) w[i] = copysign( sqrt( -w[i] ), s[i]); // Compute eigenvectors of the modified rank-1 modification. for (j = iil-1; j < iiu; ++j) { for (i = 0; i < k; ++i) s[i] = w[i] / *Q(i,j); temp = magma_cblas_snrm2( k, s, 1 ); for (i = 0; i < k; ++i) { magma_int_t iii = indx[i] - 1; *Q(i,j) = s[iii] / temp; } } } timer_stop( time ); timer_printf( "eigenvalues/vector D+zzT = %6.2f\n", time ); #endif //_OPENMP // Compute the updated eigenvectors. timer_start( time ); magma_queue_sync( NULL ); if (rk != 0) { if ( n23 != 0 ) { if (rk < magma_get_slaed3_k()) { lapackf77_slacpy("A", &n23, &rk, Q(ctot[0],iil-1), &ldq, s, &n23); blasf77_sgemm("N", "N", &n2, &rk, &n23, &d_one, &Q2[iq2], &n2, s, &n23, &d_zero, Q(n1,iil-1), &ldq ); } else { magma_ssetmatrix( n23, rk, Q(ctot[0],iil-1), ldq, ds, n23 ); magma_sgemm( MagmaNoTrans, MagmaNoTrans, n2, rk, n23, d_one, &dq2[iq2], n2, ds, n23, d_zero, dq, lddq); magma_sgetmatrix( n2, rk, dq, lddq, Q(n1,iil-1), ldq ); } } else lapackf77_slaset("A", &n2, &rk, &d_zero, &d_zero, Q(n1,iil-1), &ldq); if ( n12 != 0 ) { if (rk < magma_get_slaed3_k()) { lapackf77_slacpy("A", &n12, &rk, Q(0,iil-1), &ldq, s, &n12); blasf77_sgemm("N", "N", &n1, &rk, &n12, &d_one, Q2, &n1, s, &n12, &d_zero, Q(0,iil-1), &ldq); } else { magma_ssetmatrix( n12, rk, Q(0,iil-1), ldq, ds, n12 ); magma_sgemm( MagmaNoTrans, MagmaNoTrans, n1, rk, n12, d_one, dq2, n1, ds, n12, d_zero, dq, lddq); magma_sgetmatrix( n1, rk, dq, lddq, Q(0,iil-1), ldq ); } } else lapackf77_slaset("A", &n1, &rk, &d_zero, &d_zero, Q(0,iil-1), &ldq); } timer_stop( time ); timer_printf( "gemms = %6.2f\n", time ); return *info; } /* magma_slaex3 */
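/* Illustrative sketch (not MAGMA code): SLAED4, called above, finds for each j
   the root of the secular equation of the rank-one update D + rho*w*w^T,
       f(lambda) = 1 + rho * sum_i w_i^2 / (d_i - lambda) = 0,
   with the entries of DLAMDA as poles. The self-contained C example below
   evaluates f and brackets the root lying between the first two poles with
   plain bisection; all names and the toy data are hypothetical. */
#include <stdio.h>

static double secular_f(int k, const double *d, const double *w,
                        double rho, double lambda)
{
    double f = 1.0;
    for (int j = 0; j < k; ++j)
        f += rho * w[j] * w[j] / (d[j] - lambda);
    return f;
}

int main(void)
{
    /* toy deflated problem: sorted poles d, updating vector w, rho > 0 */
    const int    k   = 3;
    const double d[] = { 0.0, 1.0, 2.0 };
    const double w[] = { 0.5, 0.5, 0.5 };
    const double rho = 1.0;

    /* f increases monotonically on (d[0], d[1]) and changes sign there,
       so bisection converges to the unique root in that interval */
    double lo = d[0] + 1e-12, hi = d[1] - 1e-12;
    for (int it = 0; it < 200; ++it) {
        double mid = 0.5 * (lo + hi);
        if (secular_f(k, d, w, rho, mid) < 0.0) lo = mid; else hi = mid;
    }
    printf("eigenvalue of D + rho*w*w' in (d0,d1): %.12f\n", 0.5 * (lo + hi));
    return 0;
}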
extern "C" magma_int_t magma_slaex3(magma_int_t k, magma_int_t n, magma_int_t n1, float* d, float* q, magma_int_t ldq, float rho, float* dlamda, float* q2, magma_int_t* indx, magma_int_t* ctot, float* w, float* s, magma_int_t* indxq, float* dwork, char range, float vl, float vu, magma_int_t il, magma_int_t iu, magma_int_t* info ) { /* Purpose ======= SLAEX3 finds the roots of the secular equation, as defined by the values in D, W, and RHO, between 1 and K. It makes the appropriate calls to SLAED4 and then updates the eigenvectors by multiplying the matrix of eigenvectors of the pair of eigensystems being combined by the matrix of eigenvectors of the K-by-K system which is solved here. It is used in the last step when only a part of the eigenvectors is required. It compute only the required part of the eigenvectors and the rest is not used. This code makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments ========= K (input) INTEGER The number of terms in the rational function to be solved by SLAED4. K >= 0. N (input) INTEGER The number of rows and columns in the Q matrix. N >= K (deflation may result in N>K). N1 (input) INTEGER The location of the last eigenvalue in the leading submatrix. min(1,N) <= N1 <= N/2. D (output) REAL array, dimension (N) D(I) contains the updated eigenvalues for 1 <= I <= K. Q (output) REAL array, dimension (LDQ,N) Initially the first K columns are used as workspace. On output the columns ??? to ??? contain the updated eigenvectors. LDQ (input) INTEGER The leading dimension of the array Q. LDQ >= max(1,N). RHO (input) REAL The value of the parameter in the rank one update equation. RHO >= 0 required. DLAMDA (input/output) REAL array, dimension (K) The first K elements of this array contain the old roots of the deflated updating problem. These are the poles of the secular equation. May be changed on output by having lowest order bit set to zero on Cray X-MP, Cray Y-MP, Cray-2, or Cray C-90, as described above. Q2 (input) REAL array, dimension (LDQ2, N) The first K columns of this matrix contain the non-deflated eigenvectors for the split problem. INDX (input) INTEGER array, dimension (N) The permutation used to arrange the columns of the deflated Q matrix into three groups (see SLAED2). The rows of the eigenvectors found by SLAED4 must be likewise permuted before the matrix multiply can take place. CTOT (input) INTEGER array, dimension (4) A count of the total number of the various types of columns in Q, as described in INDX. The fourth column type is any column which has been deflated. W (input/output) REAL array, dimension (K) The first K elements of this array contain the components of the deflation-adjusted updating vector. Destroyed on output. S (workspace) REAL array, dimension (N1 + 1)*K Will contain the eigenvectors of the repaired matrix which will be multiplied by the previously accumulated eigenvectors to update the system. INDXQ (output) INTEGER array, dimension (N) On exit, the permutation which will reintegrate the subproblems back into sorted order, i.e. D( INDXQ( I = 1, N ) ) will be in ascending order. DWORK (device workspace) REAL array, dimension (3*N*N/2+3*N) INFO (output) INTEGER = 0: successful exit. 
< 0: if INFO = -i, the i-th argument had an illegal value. > 0: if INFO = 1, an eigenvalue did not converge Further Details =============== Based on contributions by Jeff Rutter, Computer Science Division, University of California at Berkeley, USA Modified by Francoise Tisseur, University of Tennessee. ===================================================================== */ float d_one = 1.; float d_zero = 0.; magma_int_t ione = 1; magma_int_t ineg_one = -1; char range_[] = {range, 0}; magma_int_t iil, iiu, rk; float* dq2= dwork; float* ds = dq2 + n*(n/2+1); float* dq = ds + n*(n/2+1); magma_int_t lddq = n/2 + 1; magma_int_t i,iq2,j,n12,n2,n23,tmp,lq2; float temp; magma_int_t alleig, valeig, indeig; alleig = lapackf77_lsame(range_, "A"); valeig = lapackf77_lsame(range_, "V"); indeig = lapackf77_lsame(range_, "I"); *info = 0; if(k < 0) *info=-1; else if(n < k) *info=-2; else if(ldq < max(1,n)) *info=-6; else if (! (alleig || valeig || indeig)) *info = -15; else { if (valeig) { if (n > 0 && vu <= vl) *info = -17; } else if (indeig) { if (il < 1 || il > max(1,n)) *info = -18; else if (iu < min(n,il) || iu > n) *info = -19; } } if(*info != 0){ magma_xerbla(__func__, -(*info)); return MAGMA_ERR_ILLEGAL_VALUE; } // Quick return if possible if(k == 0) return MAGMA_SUCCESS; /* Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can be computed with high relative accuracy (barring over/underflow). This is a problem on machines without a guard digit in add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2). The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I), which on any of these machines zeros out the bottommost bit of DLAMDA(I) if it is 1; this makes the subsequent subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation occurs. On binary machines with a guard digit (almost all machines) it does not change DLAMDA(I) at all. On hexadecimal and decimal machines with a guard digit, it slightly changes the bottommost bits of DLAMDA(I). It does not account for hexadecimal or decimal machines without guard digits (we know of none). We use a subroutine call to compute 2*DLAMBDA(I) to prevent optimizing compilers from eliminating this code.*/ n2 = n - n1; n12 = ctot[0] + ctot[1]; n23 = ctot[1] + ctot[2]; iq2 = n1 * n12; lq2 = iq2 + n2 * n23; magma_ssetvector_async( lq2, q2, 1, dq2, 1, NULL ); #ifdef _OPENMP ///////////////////////////////////////////////////////////////////////////////// //openmp implementation ///////////////////////////////////////////////////////////////////////////////// #ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER magma_timestr_t start, end; start = get_current_time(); #endif #pragma omp parallel private(i, j, tmp, temp) { magma_int_t id = omp_get_thread_num(); magma_int_t tot = omp_get_num_threads(); magma_int_t ib = ( id * k) / tot; //start index of local loop magma_int_t ie = ((id+1) * k) / tot; //end index of local loop magma_int_t ik = ie - ib; //number of local indices for(i = ib; i < ie; ++i) dlamda[i]=lapackf77_slamc3(&dlamda[i], &dlamda[i]) - dlamda[i]; for(j = ib; j < ie; ++j){ magma_int_t tmpp=j+1; magma_int_t iinfo = 0; lapackf77_slaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo); // If the zero finder fails, the computation is terminated. if(iinfo != 0){ #pragma omp critical (info) *info=iinfo; break; } } #pragma omp barrier if(*info == 0){ #pragma omp single { //Prepare the INDXQ sorting permutation. 
magma_int_t nk = n - k; lapackf77_slamrg( &k, &nk, d, &ione , &ineg_one, indxq); //compute the lower and upper bound of the non-deflated eigenvectors if (valeig) magma_svrange(k, d, &iil, &iiu, vl, vu); else if (indeig) magma_sirange(k, indxq, &iil, &iiu, il, iu); else { iil = 1; iiu = k; } rk = iiu - iil + 1; } if (k == 2){ #pragma omp single { for(j = 0; j < k; ++j){ w[0] = *Q(0,j); w[1] = *Q(1,j); i = indx[0] - 1; *Q(0,j) = w[i]; i = indx[1] - 1; *Q(1,j) = w[i]; } } } else if(k != 1){ // Compute updated W. blasf77_scopy( &ik, &w[ib], &ione, &s[ib], &ione); // Initialize W(I) = Q(I,I) tmp = ldq + 1; blasf77_scopy( &ik, Q(ib,ib), &tmp, &w[ib], &ione); for(j = 0; j < k; ++j){ magma_int_t i_tmp = min(j, ie); for(i = ib; i < i_tmp; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); i_tmp = max(j+1, ib); for(i = i_tmp; i < ie; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); } for(i = ib; i < ie; ++i) w[i] = copysign( sqrt( -w[i] ), s[i]); #pragma omp barrier //reduce the number of used threads to have enough S workspace tot = min(n1, omp_get_num_threads()); if(id < tot){ ib = ( id * rk) / tot + iil - 1; ie = ((id+1) * rk) / tot + iil - 1; ik = ie - ib; } else{ ib = -1; ie = -1; ik = -1; } // Compute eigenvectors of the modified rank-1 modification. for(j = ib; j < ie; ++j){ for(i = 0; i < k; ++i) s[id*k + i] = w[i] / *Q(i,j); temp = cblas_snrm2( k, s+id*k, 1); for(i = 0; i < k; ++i){ magma_int_t iii = indx[i] - 1; *Q(i,j) = s[id*k + iii] / temp; } } } } } if (*info != 0) return MAGMA_SUCCESS; //?????? #ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER end = get_current_time(); printf("eigenvalues/vector D+zzT = %6.2f\n", GetTimerValue(start,end)/1000.); #endif #else ///////////////////////////////////////////////////////////////////////////////// // Non openmp implementation ///////////////////////////////////////////////////////////////////////////////// #ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER magma_timestr_t start, end; start = get_current_time(); #endif for(i = 0; i < k; ++i) dlamda[i]=lapackf77_slamc3(&dlamda[i], &dlamda[i]) - dlamda[i]; for(j = 0; j < k; ++j){ magma_int_t tmpp=j+1; magma_int_t iinfo = 0; lapackf77_slaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo); // If the zero finder fails, the computation is terminated. if(iinfo != 0) *info=iinfo; } if(*info != 0) return MAGMA_SUCCESS; //Prepare the INDXQ sorting permutation. magma_int_t nk = n - k; lapackf77_slamrg( &k, &nk, d, &ione , &ineg_one, indxq); //compute the lower and upper bound of the non-deflated eigenvectors if (valeig) magma_svrange(k, d, &iil, &iiu, vl, vu); else if (indeig) magma_sirange(k, indxq, &iil, &iiu, il, iu); else { iil = 1; iiu = k; } rk = iiu - iil + 1; if (k == 2){ for(j = 0; j < k; ++j){ w[0] = *Q(0,j); w[1] = *Q(1,j); i = indx[0] - 1; *Q(0,j) = w[i]; i = indx[1] - 1; *Q(1,j) = w[i]; } } else if(k != 1){ // Compute updated W. blasf77_scopy( &k, w, &ione, s, &ione); // Initialize W(I) = Q(I,I) tmp = ldq + 1; blasf77_scopy( &k, q, &tmp, w, &ione); for(j = 0; j < k; ++j){ for(i = 0; i < j; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); for(i = j+1; i < k; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); } for(i = 0; i < k; ++i) w[i] = copysign( sqrt( -w[i] ), s[i]); // Compute eigenvectors of the modified rank-1 modification. 
for(j = iil-1; j < iiu; ++j){ for(i = 0; i < k; ++i) s[i] = w[i] / *Q(i,j); temp = cblas_snrm2( k, s, 1); for(i = 0; i < k; ++i){ magma_int_t iii = indx[i] - 1; *Q(i,j) = s[iii] / temp; } } } #ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER end = get_current_time(); printf("eigenvalues/vector D+zzT = %6.2f\n", GetTimerValue(start,end)/1000.); #endif #endif //_OPENMP // Compute the updated eigenvectors. #ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER start = get_current_time(); #endif magma_queue_sync( NULL ); if (rk != 0){ if( n23 != 0 ){ if (rk < magma_get_slaed3_k()){ lapackf77_slacpy("A", &n23, &rk, Q(ctot[0],iil-1), &ldq, s, &n23); blasf77_sgemm("N", "N", &n2, &rk, &n23, &d_one, &q2[iq2], &n2, s, &n23, &d_zero, Q(n1,iil-1), &ldq ); } else { magma_ssetmatrix( n23, rk, Q(ctot[0],iil-1), ldq, ds, n23 ); magma_sgemm('N', 'N', n2, rk, n23, d_one, &dq2[iq2], n2, ds, n23, d_zero, dq, lddq); magma_sgetmatrix( n2, rk, dq, lddq, Q(n1,iil-1), ldq ); } } else lapackf77_slaset("A", &n2, &rk, &d_zero, &d_zero, Q(n1,iil-1), &ldq); if( n12 != 0 ) { if (rk < magma_get_slaed3_k()){ lapackf77_slacpy("A", &n12, &rk, Q(0,iil-1), &ldq, s, &n12); blasf77_sgemm("N", "N", &n1, &rk, &n12, &d_one, q2, &n1, s, &n12, &d_zero, Q(0,iil-1), &ldq); } else { magma_ssetmatrix( n12, rk, Q(0,iil-1), ldq, ds, n12 ); magma_sgemm('N', 'N', n1, rk, n12, d_one, dq2, n1, ds, n12, d_zero, dq, lddq); magma_sgetmatrix( n1, rk, dq, lddq, Q(0,iil-1), ldq ); } } else lapackf77_slaset("A", &n1, &rk, &d_zero, &d_zero, Q(0,iil-1), &ldq); } #ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER end = get_current_time(); printf("gemms = %6.2f\n", GetTimerValue(start,end)/1000.); #endif return MAGMA_SUCCESS; } /*magma_slaed3*/
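/* Illustrative sketch (not MAGMA code): both implementations above replace
   DLAMDA(i) by SLAMC3(DLAMDA(i), DLAMDA(i)) - DLAMDA(i), i.e. they compute
   2*DLAMDA(i) - DLAMDA(i) through an opaque call so an optimizing compiler
   cannot simplify the expression away; on machines without a guard digit in
   add/subtract this flushes the lowest-order bit. The stand-alone example
   below mimics the idiom with a hypothetical lamc3_like helper (a volatile
   store plays the role of the external SLAMC3 call). */
#include <stdio.h>

static float lamc3_like(float a, float b)
{
    volatile float s = a + b;   /* keep the compiler from folding 2*x - x */
    return s;
}

int main(void)
{
    float dlamda[3] = { 0.1f, 1.5f, 2.25f };
    for (int i = 0; i < 3; ++i) {
        /* replace dlamda[i] by (2*dlamda[i]) - dlamda[i] */
        dlamda[i] = lamc3_like(dlamda[i], dlamda[i]) - dlamda[i];
        /* on binary machines with a guard digit the values print unchanged */
        printf("dlamda[%d] = %.9g\n", i, dlamda[i]);
    }
    return 0;
}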
/** Purpose ------- SLAHR2 reduces the first NB columns of a real general n-BY-(n-k+1) matrix A so that elements below the k-th subdiagonal are zero. The reduction is performed by an orthogonal similarity transformation Q' * A * Q. The routine returns the matrices V and T which determine Q as a block reflector I - V*T*V', and also the matrix Y = A * V. (Note this is different than LAPACK, which computes Y = A * V * T.) This is an auxiliary routine called by SGEHRD. Arguments --------- @param[in] n INTEGER The order of the matrix A. @param[in] k INTEGER The offset for the reduction. Elements below the k-th subdiagonal in the first NB columns are reduced to zero. K < N. @param[in] nb INTEGER The number of columns to be reduced. @param[in,out] A REAL array, dimension (LDA,N-K+1) On entry, the n-by-(n-k+1) general matrix A. On exit, the elements on and above the k-th subdiagonal in the first NB columns are overwritten with the corresponding elements of the reduced matrix; the elements below the k-th subdiagonal, with the array TAU, represent the matrix Q as a product of elementary reflectors. The other columns of A are unchanged. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] tau REAL array, dimension (NB) The scalar factors of the elementary reflectors. See Further Details. @param[out] T REAL array, dimension (LDT,NB) The upper triangular matrix T. @param[in] ldt INTEGER The leading dimension of the array T. LDT >= NB. @param[out] Y REAL array, dimension (LDY,NB) The n-by-nb matrix Y. @param[in] ldy INTEGER The leading dimension of the array Y. LDY >= N. @param[in,out] data Structure with pointers to dA, dT, dV, dW, dY which are distributed across multiple GPUs. Further Details --------------- The matrix Q is represented as a product of nb elementary reflectors Q = H(1) H(2) . . . H(nb). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i+k-1) = 0, v(i+k) = 1; v(i+k+1:n) is stored on exit in A(i+k+1:n,i), and tau in TAU(i). The elements of the vectors v together form the (n-k+1)-by-nb matrix V which is needed, with T and Y, to apply the transformation to the unreduced part of the matrix, using an update of the form: A := (I - V*T*V') * (A - Y*T*V'). The contents of A on exit are illustrated by the following example with n = 7, k = 3 and nb = 2: @verbatim ( a a a a a ) ( a a a a a ) ( a a a a a ) ( h h a a a ) ( v1 h a a a ) ( v1 v2 a a a ) ( v1 v2 a a a ) @endverbatim where "a" denotes an element of the original matrix A, h denotes a modified element of the upper Hessenberg matrix H, and vi denotes an element of the vector defining H(i). This implementation follows the hybrid algorithm and notations described in S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg form through hybrid GPU-based computing," University of Tennessee Computer Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219), May 24, 2009. 
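    Note that, because this routine returns Y = A * V rather than LAPACK's
    Y = A * V * T, the factor T appears explicitly in the update form quoted
    above: with Y = A * V the right-hand factor is
    A - Y*T*V' = A - A*V*T*V' = A * (I - V*T*V'),
    so the same block-reflector transformation is still applied to the
    trailing part of the matrix.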
@ingroup magma_sgeev_aux ********************************************************************/ extern "C" magma_int_t magma_slahr2_m( magma_int_t n, magma_int_t k, magma_int_t nb, float *A, magma_int_t lda, float *tau, float *T, magma_int_t ldt, float *Y, magma_int_t ldy, struct sgehrd_data *data ) { #define A( i, j ) ( A + (i) + (j)*lda) #define Y( i, j ) ( Y + (i) + (j)*ldy) #define T( i, j ) ( T + (i) + (j)*ldt) #define dA( d, i, j ) (data->A [d] + (i) + (j)*ldda) #define dTi( d ) (data->Ti[d]) #define dV( d, i, j ) (data->V [d] + (i) + (j)*ldv ) #define dVd( d, i, j ) (data->Vd[d] + (i) + (j)*ldvd) #define dY( d, i, j ) (data->Y [d] + (i) + (j)*ldda) float c_zero = MAGMA_S_ZERO; float c_one = MAGMA_S_ONE; float c_neg_one = MAGMA_S_NEG_ONE; float tmp; magma_int_t ngpu = data->ngpu; magma_int_t ldda = data->ldda; magma_int_t ldv = data->ldv; magma_int_t ldvd = data->ldvd; magma_int_t ione = 1; magma_int_t d, dki1, dn, nblocks, gblock, lblock, lgid; magma_int_t n_k_i_1, n_k; float scale; magma_int_t i; float ei = MAGMA_S_ZERO; magma_int_t info_data = 0; magma_int_t *info = &info_data; if (n < 0) { *info = -1; } else if (k < 0 || k >= n) { *info = -2; } else if (nb < 1 || nb > n) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if (ldt < nb) { *info = -8; } else if (ldy < max(1,n)) { *info = -10; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } // adjust from 1-based indexing k -= 1; // Function Body if (n <= 1) return *info; magma_device_t orig_dev; magma_getdevice( &orig_dev ); // zero out current top block of V on all GPUs for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magmablas_slaset( MagmaFull, nb, nb, c_zero, c_zero, dV(d,k,0), ldv, data->queues[d] ); } // set all Y=0 lapackf77_slaset( "Full", &n, &nb, &c_zero, &c_zero, Y, &ldy ); for (i = 0; i < nb; ++i) { n_k_i_1 = n - k - i - 1; n_k = n - k; if (i > 0) { // Finish applying I - V * T * V' on right tmp = MAGMA_S_NEGATE( tau[i-1] ); blasf77_saxpy( &n_k, &tmp, Y(k,i-1), &ione, A(k,i), &ione ); // Apply I - V * T' * V' to this column (call it b) from the // left, using the last column of T as workspace, w. 
// // Let V = ( V1 ) and b = ( b1 ) (first i-1 rows) // ( V2 ) ( b2 ) // where V1 is unit lower triangular // w := b1 = A(k+1:k+i, i) blasf77_scopy( &i, A(k+1,i), &ione, T(0,nb-1), &ione ); // w := V1' * b1 = VA(k+1:k+i, 0:i-1)' * w blasf77_strmv( "Lower", "Conj", "Unit", &i, A(k+1,0), &lda, T(0,nb-1), &ione ); // w := w + V2'*b2 = w + VA(k+i+1:n-1, 0:i-1)' * A(k+i+1:n-1, i) blasf77_sgemv( "Conj", &n_k_i_1, &i, &c_one, A(k+i+1,0), &lda, A(k+i+1,i), &ione, &c_one, T(0,nb-1), &ione ); // w := T'*w = T(0:i-1, 0:i-1)' * w blasf77_strmv( "Upper", "Conj", "Non-unit", &i, T(0,0), &ldt, T(0,nb-1), &ione ); // b2 := b2 - V2*w = A(k+i+1:n-1, i) - VA(k+i+1:n-1, 0:i-1) * w blasf77_sgemv( "No trans", &n_k_i_1, &i, &c_neg_one, A(k+i+1,0), &lda, T(0,nb-1), &ione, &c_one, A(k+i+1,i), &ione ); // w := V1*w = VA(k+1:k+i, 0:i-1) * w blasf77_strmv( "Lower", "No trans", "Unit", &i, A(k+1,0), &lda, T(0,nb-1), &ione ); // b1 := b1 - w = A(k+1:k+i-1, i) - w blasf77_saxpy( &i, &c_neg_one, T(0,nb-1), &ione, A(k+1,i), &ione ); // Restore diagonal element, saved below during previous iteration *A(k+i,i-1) = ei; } // Generate the elementary reflector H(i) to annihilate A(k+i+1:n-1,i) lapackf77_slarfg( &n_k_i_1, A(k+i+1,i), A(k+i+2,i), &ione, &tau[i] ); // Save diagonal element and set to one, to simplify multiplying by V ei = *A(k+i+1,i); *A(k+i+1,i) = c_one; // compute yi = A vi = sum_g A{d} vi{d} nblocks = (n-1) / nb / ngpu + 1; for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); // dV(k+i+1:n-1, i) = VA(k+i:n, i) magma_ssetvector_async( n_k_i_1, A(k+i+1,i), 1, dV(d, k+i+1, i), 1, data->queues[d] ); // copy column of dV -> dVd, using block cyclic distribution. // This assumes V and Vd have been padded so that // a 2D matrix copy doesn't access them out-of-bounds gblock = k / nb; lblock = gblock / ngpu; lgid = gblock % ngpu; if ( d < lgid ) { lblock += 1; } // treat V as (nb*ngpu) x nblock matrix, and Vd as nb x nblock matrix magmablas_slacpy( MagmaFull, nb, nblocks-lblock, dV (d, d*nb + lblock*nb*ngpu, i), nb*ngpu, dVd(d, 0 + lblock*nb, i), nb, data->queues[d] ); // convert global indices (k) to local indices (dk) magma_indices_1D_bcyclic( nb, ngpu, d, k+i+1, n, &dki1, &dn ); // dY(k:n, i) = dA(k:n, k+i+1:n) * dV(k+i+1:n, i) // skip if matrix is empty // each GPU copies to different temporary vector in Y, // which are summed in separate loop below if ( dn-dki1 > 0 ) { magma_sgemv( MagmaNoTrans, n-k, dn-dki1, c_one, dA (d, k, dki1), ldda, dVd(d, dki1, i), 1, c_zero, dY (d, k, i), 1, data->queues[d] ); // copy vector to host, storing in column nb+d of Y // as temporary space (Y has >= nb+ngpu columns) magma_sgetvector_async( n-k, dY(d, k, i), 1, Y(k, nb+d), 1, data->queues[d] ); } } // while GPU is doing above Ag*v... // Compute T(0:i,i) = [ -tau T V' vi ] // [ tau ] // T(0:i-1, i) = -tau VA(k+i+1:n-1, 0:i-1)' VA(k+i+1:n-1, i) scale = MAGMA_S_NEGATE( tau[i] ); blasf77_sgemv( "Conj", &n_k_i_1, &i, &scale, A(k+i+1,0), &lda, A(k+i+1,i), &ione, &c_zero, T(0,i), &ione ); // T(0:i-1, i) = T(0:i-1, 0:i-1) * T(0:i-1, i) blasf77_strmv( "Upper", "No trans", "Non-unit", &i, T(0,0), &ldt, T(0,i), &ione ); *T(i,i) = tau[i]; // apply reflectors to next column, A(i+1), on right only. // one axpy will be required to finish this, in the next iteration above if ( i > 0 && i+1 < nb ) { // Update next column, A(k:n,i+1), applying Q on right. // One axpy will be required to finish this, in the next iteration // above, after yi is computed. 
// This updates one more row than LAPACK does (row k), // making block above panel an even multiple of nb. // Use last column of T as workspace, w. magma_int_t i1 = i+1; // If real, conjugate row of V, and undo afterwards #ifdef COMPLEX lapackf77_slacgv( &i1, A(k+i1,0), &lda ); #endif // w = T(0:i, 0:i+1) * VA(k+i+1, 0:i+1)' // T is now rectangular, so we use gemv instead of trmv as in lapack. blasf77_sgemv( "No trans", &i, &i1, &c_one, T(0,0), &ldt, A(k+i1,0), &lda, &c_zero, T(0,nb-1), &ione ); #ifdef COMPLEX lapackf77_slacgv( &i1, A(k+i1,0), &lda ); #endif // A(k:n, i+1) -= Y(k:n, 0:i) * w blasf77_sgemv( "No trans", &n_k, &i, &c_neg_one, Y(k,0), &ldy, T(0,nb-1), &ione, &c_one, A(k,i1), &ione ); } // yi = sum_g yi{d} for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_queue_sync( data->queues[d] ); magma_indices_1D_bcyclic( nb, ngpu, d, k+i+1, n, &dki1, &dn ); if ( dn-dki1 > 0 ) { // yi = yi + yi{d} blasf77_saxpy( &n_k, &c_one, Y(k,nb+d), &ione, Y(k,i), &ione ); } } } // Restore diagonal element *A(k+nb,nb-1) = ei; // compute Y = Am V = sum_g Am{d} V{d} --- top part, Y(0:k-1,:) for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); // convert global indices (k) to local indices (dk) magma_indices_1D_bcyclic( nb, ngpu, d, k+1, n, &dki1, &dn ); // dY(0:k, :) = dA(0:k, k+i+1:n-1) * dV(k+i+1:n-1, :) // skip if matrix is empty // each GPU copies to different temporary block in Y, // which are summed in separate loop below if ( dn-dki1 > 0 ) { magma_sgemm( MagmaNoTrans, MagmaNoTrans, k, nb, dn-dki1, c_one, dA (d, 0, dki1), ldda, dVd(d, dki1, 0), ldvd, c_zero, dY (d, 0, 0), ldda, data->queues[d] ); // copy result to host, storing in columns [nb + nb*d : nb + nb*(d+1)] of Y // as temporary space (Y has nb + nb*ngpu columns) magma_sgetmatrix_async( k, nb, dY(d, 0, 0), ldda, Y(0,nb+nb*d), ldy, data->queues[d] ); } } // Y = sum_g Y{d} for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_queue_sync( 0 ); magma_indices_1D_bcyclic( nb, ngpu, d, k+1, n, &dki1, &dn ); if ( dn-dki1 > 0 ) { // Y = Y + Am V for( i = 0; i < nb; ++i ) { blasf77_saxpy( &k, &c_one, Y(0,nb+nb*d+i), &ione, Y(0,i), &ione ); } } } // copy Y and T matrices to GPUs for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_ssetmatrix_async( n, nb, Y, ldy, dY(d, 0, 0), ldda, data->queues[d] ); magma_ssetmatrix_async( nb, nb, T, nb, dTi(d), nb, data->queues[d] ); } magma_setdevice( orig_dev ); return *info; } /* magma_slahr2 */
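/* Illustrative sketch (not MAGMA code): magma_slahr2_m above keeps V (and Y)
   in a 1-D block-cyclic layout, dealing blocks of nb consecutive rows
   round-robin over the ngpu devices (see the gblock/lblock/lgid arithmetic
   and the magma_indices_1D_bcyclic calls). The hypothetical helper below maps
   a global row index to its owning device and local row index under such a
   layout; it sketches the idea only and is not MAGMA's routine. */
#include <stdio.h>

static void bcyclic_1d(int nb, int ngpu, int gindex, int *dev, int *lindex)
{
    int gblock = gindex / nb;            /* global block number            */
    *dev    = gblock % ngpu;             /* device owning that block       */
    *lindex = (gblock / ngpu) * nb       /* start of the block locally ... */
            + (gindex % nb);             /* ... plus offset inside block   */
}

int main(void)
{
    int dev, lindex;
    /* with nb = 4 and ngpu = 2: rows 0-3 -> GPU 0, rows 4-7 -> GPU 1, ... */
    for (int g = 0; g < 16; ++g) {
        bcyclic_1d( 4, 2, g, &dev, &lindex );
        printf("global row %2d -> gpu %d, local row %2d\n", g, dev, lindex);
    }
    return 0;
}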
extern "C" magma_int_t magma_sidr_strms( magma_s_matrix A, magma_s_matrix b, magma_s_matrix *x, magma_s_solver_par *solver_par, magma_queue_t queue ) { magma_int_t info = MAGMA_NOTCONVERGED; // prepare solver feedback solver_par->solver = Magma_IDRMERGE; solver_par->numiter = 0; solver_par->spmv_count = 0; solver_par->init_res = 0.0; solver_par->final_res = 0.0; solver_par->iter_res = 0.0; solver_par->runtime = 0.0; // constants const float c_zero = MAGMA_S_ZERO; const float c_one = MAGMA_S_ONE; const float c_n_one = MAGMA_S_NEG_ONE; // internal user options const magma_int_t smoothing = 1; // 0 = disable, 1 = enable const float angle = 0.7; // [0-1] // local variables magma_int_t iseed[4] = {0, 0, 0, 1}; magma_int_t dof; magma_int_t s; magma_int_t distr; magma_int_t k, i, sk; magma_int_t innerflag; magma_int_t ldd; magma_int_t q; float residual; float nrm; float nrmb; float nrmr; float nrmt; float rho; float om; float gamma; // matrices and vectors magma_s_matrix dxs = {Magma_CSR}; magma_s_matrix dr = {Magma_CSR}, drs = {Magma_CSR}; magma_s_matrix dP = {Magma_CSR}, dP1 = {Magma_CSR}; magma_s_matrix dG = {Magma_CSR}, dGcol = {Magma_CSR}; magma_s_matrix dU = {Magma_CSR}; magma_s_matrix dM = {Magma_CSR}; magma_s_matrix df = {Magma_CSR}; magma_s_matrix dt = {Magma_CSR}, dtt = {Magma_CSR}; magma_s_matrix dc = {Magma_CSR}; magma_s_matrix dv = {Magma_CSR}; magma_s_matrix dskp = {Magma_CSR}; magma_s_matrix dalpha = {Magma_CSR}; magma_s_matrix dbeta = {Magma_CSR}; float *hMdiag = NULL; float *hskp = NULL; float *halpha = NULL; float *hbeta = NULL; float *d1 = NULL, *d2 = NULL; // queue variables const magma_int_t nqueues = 3; // number of queues magma_queue_t queues[nqueues]; // chronometry real_Double_t tempo1, tempo2; // create additional queues queues[0] = queue; for ( q = 1; q < nqueues; q++ ) { magma_queue_create( queue->device(), &(queues[q]) ); } // initial s space // TODO: add option for 's' (shadow space number) // Hack: uses '--restart' option as the shadow space number. // This is not a good idea because the default value of restart option is used to detect // if the user provided a custom restart. This means that if the default restart value // is changed then the code will think it was the user (unless the default value is // also updated in the 'if' statement below. 
s = 1; if ( solver_par->restart != 50 ) { if ( solver_par->restart > A.num_cols ) { s = A.num_cols; } else { s = solver_par->restart; } } solver_par->restart = s; // set max iterations solver_par->maxiter = min( 2 * A.num_cols, solver_par->maxiter ); // check if matrix A is square if ( A.num_rows != A.num_cols ) { //printf("Matrix A is not square.\n"); info = MAGMA_ERR_NOT_SUPPORTED; goto cleanup; } // |b| nrmb = magma_snrm2( b.num_rows, b.dval, 1, queue ); if ( nrmb == 0.0 ) { magma_sscal( x->num_rows, MAGMA_S_ZERO, x->dval, 1, queue ); info = MAGMA_SUCCESS; goto cleanup; } // t = 0 // make t twice as large to contain both, dt and dr ldd = magma_roundup( b.num_rows, 32 ); CHECK( magma_svinit( &dt, Magma_DEV, ldd, 2, c_zero, queue )); dt.num_rows = b.num_rows; dt.num_cols = 1; dt.nnz = dt.num_rows; // redirect the dr.dval to the second part of dt CHECK( magma_svinit( &dr, Magma_DEV, b.num_rows, 1, c_zero, queue )); magma_free( dr.dval ); dr.dval = dt.dval + ldd; // r = b - A x CHECK( magma_sresidualvec( A, b, *x, &dr, &nrmr, queue )); // |r| solver_par->init_res = nrmr; solver_par->final_res = solver_par->init_res; solver_par->iter_res = solver_par->init_res; if ( solver_par->verbose > 0 ) { solver_par->res_vec[0] = (real_Double_t)nrmr; } // check if initial is guess good enough if ( nrmr <= solver_par->atol || nrmr/nrmb <= solver_par->rtol ) { info = MAGMA_SUCCESS; goto cleanup; } // P = randn(n, s) // P = ortho(P) //--------------------------------------- // P = 0.0 CHECK( magma_svinit( &dP, Magma_CPU, A.num_cols, s, c_zero, queue )); // P = randn(n, s) distr = 3; // 1 = unif (0,1), 2 = unif (-1,1), 3 = normal (0,1) dof = dP.num_rows * dP.num_cols; lapackf77_slarnv( &distr, iseed, &dof, dP.val ); // transfer P to device CHECK( magma_smtransfer( dP, &dP1, Magma_CPU, Magma_DEV, queue )); magma_smfree( &dP, queue ); // P = ortho(P1) if ( dP1.num_cols > 1 ) { // P = magma_sqr(P1), QR factorization CHECK( magma_sqr( dP1.num_rows, dP1.num_cols, dP1, dP1.ld, &dP, NULL, queue )); } else { // P = P1 / |P1| nrm = magma_snrm2( dof, dP1.dval, 1, queue ); nrm = 1.0 / nrm; magma_sscal( dof, nrm, dP1.dval, 1, queue ); CHECK( magma_smtransfer( dP1, &dP, Magma_DEV, Magma_DEV, queue )); } magma_smfree( &dP1, queue ); //--------------------------------------- // allocate memory for the scalar products CHECK( magma_smalloc_pinned( &hskp, 5 )); CHECK( magma_svinit( &dskp, Magma_DEV, 4, 1, c_zero, queue )); CHECK( magma_smalloc_pinned( &halpha, s )); CHECK( magma_svinit( &dalpha, Magma_DEV, s, 1, c_zero, queue )); CHECK( magma_smalloc_pinned( &hbeta, s )); CHECK( magma_svinit( &dbeta, Magma_DEV, s, 1, c_zero, queue )); // workspace for merged dot product CHECK( magma_smalloc( &d1, max(2, s) * b.num_rows )); CHECK( magma_smalloc( &d2, max(2, s) * b.num_rows )); // smoothing enabled if ( smoothing > 0 ) { // set smoothing solution vector CHECK( magma_smtransfer( *x, &dxs, Magma_DEV, Magma_DEV, queue )); // tt = 0 // make tt twice as large to contain both, dtt and drs ldd = magma_roundup( b.num_rows, 32 ); CHECK( magma_svinit( &dtt, Magma_DEV, ldd, 2, c_zero, queue )); dtt.num_rows = dr.num_rows; dtt.num_cols = 1; dtt.nnz = dtt.num_rows; // redirect the drs.dval to the second part of dtt CHECK( magma_svinit( &drs, Magma_DEV, dr.num_rows, 1, c_zero, queue )); magma_free( drs.dval ); drs.dval = dtt.dval + ldd; // set smoothing residual vector magma_scopyvector( dr.num_rows, dr.dval, 1, drs.dval, 1, queue ); } // G(n,s) = 0 if ( s > 1 ) { ldd = magma_roundup( A.num_rows, 32 ); CHECK( magma_svinit( &dG, Magma_DEV, 
ldd, s, c_zero, queue )); dG.num_rows = A.num_rows; } else { CHECK( magma_svinit( &dG, Magma_DEV, A.num_rows, s, c_zero, queue )); } // dGcol represents a single column of dG, array pointer is set inside loop CHECK( magma_svinit( &dGcol, Magma_DEV, dG.num_rows, 1, c_zero, queue )); magma_free( dGcol.dval ); // U(n,s) = 0 if ( s > 1 ) { ldd = magma_roundup( A.num_cols, 32 ); CHECK( magma_svinit( &dU, Magma_DEV, ldd, s, c_zero, queue )); dU.num_rows = A.num_cols; } else { CHECK( magma_svinit( &dU, Magma_DEV, A.num_cols, s, c_zero, queue )); } // M(s,s) = I CHECK( magma_svinit( &dM, Magma_DEV, s, s, c_zero, queue )); CHECK( magma_smalloc_pinned( &hMdiag, s )); magmablas_slaset( MagmaFull, dM.num_rows, dM.num_cols, c_zero, c_one, dM.dval, dM.ld, queue ); // f = 0 CHECK( magma_svinit( &df, Magma_DEV, dP.num_cols, 1, c_zero, queue )); // c = 0 CHECK( magma_svinit( &dc, Magma_DEV, dM.num_cols, 1, c_zero, queue )); // v = r CHECK( magma_smtransfer( dr, &dv, Magma_DEV, Magma_DEV, queue )); //--------------START TIME--------------- // chronometry tempo1 = magma_sync_wtime( queue ); if ( solver_par->verbose > 0 ) { solver_par->timing[0] = 0.0; } cudaProfilerStart(); om = MAGMA_S_ONE; gamma = MAGMA_S_ZERO; innerflag = 0; // new RHS for small systems // f = P' r // Q1 magma_sgemvmdot_shfl( dP.num_rows, dP.num_cols, dP.dval, dr.dval, d1, d2, df.dval, queues[1] ); // skp[4] = f(k) // Q1 magma_sgetvector_async( 1, df.dval, 1, &hskp[4], 1, queues[1] ); // c(k:s) = f(k:s) // Q1 magma_scopyvector_async( s, df.dval, 1, dc.dval, 1, queues[1] ); // c(k:s) = M(k:s,k:s) \ f(k:s) // Q1 magma_strsv( MagmaLower, MagmaNoTrans, MagmaNonUnit, s, dM.dval, dM.ld, dc.dval, 1, queues[1] ); // start iteration do { solver_par->numiter++; // shadow space loop for ( k = 0; k < s; ++k ) { sk = s - k; dGcol.dval = dG.dval + k * dG.ld; // v = r - G(:,k:s) c(k:s) // Q1 magmablas_sgemv( MagmaNoTrans, dG.num_rows, sk, c_n_one, dGcol.dval, dG.ld, &dc.dval[k], 1, c_one, dv.dval, 1, queues[1] ); // U(:,k) = om * v + U(:,k:s) c(k:s) // Q1 magmablas_sgemv( MagmaNoTrans, dU.num_rows, sk, c_one, &dU.dval[k*dU.ld], dU.ld, &dc.dval[k], 1, om, dv.dval, 1, queues[1] ); // G(:,k) = A U(:,k) // Q1 CHECK( magma_s_spmv( c_one, A, dv, c_zero, dGcol, queues[1] )); solver_par->spmv_count++; // bi-orthogonalize the new basis vectors for ( i = 0; i < k; ++i ) { // alpha = P(:,i)' G(:,k) // Q1 halpha[i] = magma_sdot( dP.num_rows, &dP.dval[i*dP.ld], 1, dGcol.dval, 1, queues[1] ); // implicit sync Q1 --> alpha = P(:,i)' G(:,k) // alpha = alpha / M(i,i) halpha[i] = halpha[i] / hMdiag[i]; // G(:,k) = G(:,k) - alpha * G(:,i) // Q1 magma_saxpy( dG.num_rows, -halpha[i], &dG.dval[i*dG.ld], 1, dGcol.dval, 1, queues[1] ); } // sync Q1 --> G(:,k) = G(:,k) - alpha * G(:,i), skp[4] = f(k) magma_queue_sync( queues[1] ); // new column of M = P'G, first k-1 entries are zero // M(k:s,k) = P(:,k:s)' G(:,k) // Q2 magma_sgemvmdot_shfl( dP.num_rows, sk, &dP.dval[k*dP.ld], dGcol.dval, d1, d2, &dM.dval[k*dM.ld+k], queues[2] ); // non-first s iteration if ( k > 0 ) { // alpha = dalpha // Q0 magma_ssetvector_async( k, halpha, 1, dalpha.dval, 1, queues[0] ); // U update outside of loop using GEMV // U(:,k) = U(:,k) - U(:,1:k) * alpha(1:k) // Q0 magmablas_sgemv( MagmaNoTrans, dU.num_rows, k, c_n_one, dU.dval, dU.ld, dalpha.dval, 1, c_one, dv.dval, 1, queues[0] ); } // Mdiag(k) = M(k,k) // Q2 magma_sgetvector( 1, &dM.dval[k*dM.ld+k], 1, &hMdiag[k], 1, queues[2] ); // implicit sync Q2 --> Mdiag(k) = M(k,k) // U(:,k) = v // Q0 magma_scopyvector_async( dU.num_rows, dv.dval, 1, 
&dU.dval[k*dU.ld], 1, queues[0] ); // check M(k,k) == 0 if ( MAGMA_S_EQUAL(hMdiag[k], MAGMA_S_ZERO) ) { innerflag = 1; info = MAGMA_DIVERGENCE; break; } // beta = f(k) / M(k,k) hbeta[k] = hskp[4] / hMdiag[k]; // check for nan if ( magma_s_isnan( hbeta[k] ) || magma_s_isinf( hbeta[k] )) { innerflag = 1; info = MAGMA_DIVERGENCE; break; } // r = r - beta * G(:,k) // Q2 magma_saxpy( dr.num_rows, -hbeta[k], dGcol.dval, 1, dr.dval, 1, queues[2] ); // non-last s iteration if ( (k + 1) < s ) { // f(k+1:s) = f(k+1:s) - beta * M(k+1:s,k) // Q1 magma_saxpy( sk-1, -hbeta[k], &dM.dval[k*dM.ld+(k+1)], 1, &df.dval[k+1], 1, queues[1] ); // c(k+1:s) = f(k+1:s) // Q1 magma_scopyvector_async( sk-1, &df.dval[k+1], 1, &dc.dval[k+1], 1, queues[1] ); // c(k+1:s) = M(k+1:s,k+1:s) \ f(k+1:s) // Q1 magma_strsv( MagmaLower, MagmaNoTrans, MagmaNonUnit, sk-1, &dM.dval[(k+1)*dM.ld+(k+1)], dM.ld, &dc.dval[k+1], 1, queues[1] ); // skp[4] = f(k+1) // Q1 magma_sgetvector_async( 1, &df.dval[k+1], 1, &hskp[4], 1, queues[1] ); } // smoothing disabled if ( smoothing <= 0 ) { // |r| // Q2 nrmr = magma_snrm2( dr.num_rows, dr.dval, 1, queues[2] ); // implicit sync Q2 --> |r| // smoothing enabled } else { // smoothing operation //--------------------------------------- // t = rs - r // Q2 magma_sidr_smoothing_1( drs.num_rows, drs.num_cols, drs.dval, dr.dval, dtt.dval, queues[2] ); // x = x + beta * U(:,k) // Q0 magma_saxpy( x->num_rows, hbeta[k], &dU.dval[k*dU.ld], 1, x->dval, 1, queues[0] ); // t't // t'rs // Q2 CHECK( magma_sgemvmdot_shfl( dt.ld, 2, dtt.dval, dtt.dval, d1, d2, &dskp.dval[2], queues[2] )); // skp[2-3] = dskp[2-3] // Q2 magma_sgetvector( 2, &dskp.dval[2], 1, &hskp[2], 1, queues[2] ); // implicit sync Q2 --> skp = dskp // gamma = (t' * rs) / (t' * t) gamma = hskp[3] / hskp[2]; // rs = rs - gamma * t // Q1 magma_saxpy( drs.num_rows, -gamma, dtt.dval, 1, drs.dval, 1, queues[1] ); // xs = xs - gamma * (xs - x) // Q0 magma_sidr_smoothing_2( dxs.num_rows, dxs.num_cols, -gamma, x->dval, dxs.dval, queues[0] ); // |rs| // Q1 nrmr = magma_snrm2( drs.num_rows, drs.dval, 1, queues[1] ); // implicit sync Q0 --> |r| //--------------------------------------- } // v = r // Q1 magma_scopyvector_async( dr.num_rows, dr.dval, 1, dv.dval, 1, queues[1] ); // last s iteration if ( (k + 1) == s ) { // t = A r // Q2 CHECK( magma_s_spmv( c_one, A, dr, c_zero, dt, queues[2] )); solver_par->spmv_count++; // t't // t'r // Q2 CHECK( magma_sgemvmdot_shfl( dt.ld, 2, dt.dval, dt.dval, d1, d2, dskp.dval, queues[2] )); } // store current timing and residual if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter) % solver_par->verbose == 0 ) { solver_par->res_vec[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)nrmr; solver_par->timing[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)tempo2 - tempo1; } } // check convergence or iteration limit if ( nrmr <= solver_par->atol || nrmr/nrmb <= solver_par->rtol ) { s = k + 1; // for the x-update outside the loop innerflag = 2; info = MAGMA_SUCCESS; break; } } // smoothing disabled if ( smoothing <= 0 && innerflag != 1 ) { // dbeta(1:s) = beta(1:s) // Q0 magma_ssetvector_async( s, hbeta, 1, dbeta.dval, 1, queues[0] ); // x = x + U(:,1:s) * beta(1:s) // Q0 magmablas_sgemv( MagmaNoTrans, dU.num_rows, s, c_one, dU.dval, dU.ld, dbeta.dval, 1, c_one, x->dval, 1, queues[0] ); } // check convergence or iteration limit or invalid result of inner loop if ( innerflag > 0 ) { break; } // computation of a new omega 
//--------------------------------------- // skp[0-2] = dskp[0-2] // Q2 magma_sgetvector( 2, dskp.dval, 1, hskp, 1, queues[2] ); // implicit sync Q2 --> skp = dskp // |t| nrmt = magma_ssqrt( MAGMA_S_REAL(hskp[0]) ); // rho = abs((t' * r) / (|t| * |r|)) rho = MAGMA_D_ABS( MAGMA_S_REAL(hskp[1]) / (nrmt * nrmr) ); // om = (t' * r) / (|t| * |t|) om = hskp[1] / hskp[0]; if ( rho < angle ) { om = (om * angle) / rho; } //--------------------------------------- if ( MAGMA_S_EQUAL(om, MAGMA_S_ZERO) ) { info = MAGMA_DIVERGENCE; break; } // sync Q1 --> v = r magma_queue_sync( queues[1] ); // r = r - om * t // Q2 magma_saxpy( dr.num_rows, -om, dt.dval, 1, dr.dval, 1, queues[2] ); // x = x + om * v // Q0 magma_saxpy( x->num_rows, om, dv.dval, 1, x->dval, 1, queues[0] ); // smoothing disabled if ( smoothing <= 0 ) { // |r| // Q2 nrmr = magma_snrm2( dr.num_rows, dr.dval, 1, queues[2] ); // implicit sync Q2 --> |r| // v = r // Q0 magma_scopyvector_async( dr.num_rows, dr.dval, 1, dv.dval, 1, queues[0] ); // new RHS for small systems // f = P' r // Q1 magma_sgemvmdot_shfl( dP.num_rows, dP.num_cols, dP.dval, dr.dval, d1, d2, df.dval, queues[1] ); // skp[4] = f(k) // Q1 magma_sgetvector_async( 1, df.dval, 1, &hskp[4], 1, queues[1] ); // c(k:s) = f(k:s) // Q1 magma_scopyvector_async( s, df.dval, 1, dc.dval, 1, queues[1] ); // c(k:s) = M(k:s,k:s) \ f(k:s) // Q1 magma_strsv( MagmaLower, MagmaNoTrans, MagmaNonUnit, s, dM.dval, dM.ld, dc.dval, 1, queues[1] ); // smoothing enabled } else { // smoothing operation //--------------------------------------- // t = rs - r // Q2 magma_sidr_smoothing_1( drs.num_rows, drs.num_cols, drs.dval, dr.dval, dtt.dval, queues[2] ); // t't // t'rs // Q2 CHECK( magma_sgemvmdot_shfl( dt.ld, 2, dtt.dval, dtt.dval, d1, d2, &dskp.dval[2], queues[2] )); // skp[2-3] = dskp[2-3] // Q2 magma_sgetvector( 2, &dskp.dval[2], 1, &hskp[2], 1, queues[2] ); // implicit sync Q2 --> skp = dskp // gamma = (t' * rs) / (t' * t) gamma = hskp[3] / hskp[2]; // rs = rs - gamma * (rs - r) // Q2 magma_saxpy( drs.num_rows, -gamma, dtt.dval, 1, drs.dval, 1, queues[2] ); // xs = xs - gamma * (xs - x) // Q0 magma_sidr_smoothing_2( dxs.num_rows, dxs.num_cols, -gamma, x->dval, dxs.dval, queues[0] ); // v = r // Q0 magma_scopyvector_async( dr.num_rows, dr.dval, 1, dv.dval, 1, queues[0] ); // new RHS for small systems // f = P' r // Q1 magma_sgemvmdot_shfl( dP.num_rows, dP.num_cols, dP.dval, dr.dval, d1, d2, df.dval, queues[1] ); // skp[4] = f(k) // Q1 magma_sgetvector_async( 1, df.dval, 1, &hskp[4], 1, queues[1] ); // c(k:s) = f(k:s) // Q1 magma_scopyvector_async( s, df.dval, 1, dc.dval, 1, queues[1] ); // |rs| // Q2 nrmr = magma_snrm2( drs.num_rows, drs.dval, 1, queues[2] ); // implicit sync Q2 --> |r| // c(k:s) = M(k:s,k:s) \ f(k:s) // Q1 magma_strsv( MagmaLower, MagmaNoTrans, MagmaNonUnit, s, dM.dval, dM.ld, dc.dval, 1, queues[1] ); //--------------------------------------- } // store current timing and residual if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); magma_queue_sync( queue ); if ( (solver_par->numiter) % solver_par->verbose == 0 ) { solver_par->res_vec[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)nrmr; solver_par->timing[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)tempo2 - tempo1; } } // check convergence or iteration limit if ( nrmr <= solver_par->atol || nrmr/nrmb <= solver_par->rtol ) { info = MAGMA_SUCCESS; break; } // sync Q0 --> v = r magma_queue_sync( queues[0] ); } while ( solver_par->numiter + 1 <= solver_par->maxiter ); // sync all queues 
for ( q = 0; q < nqueues; q++ ) { magma_queue_sync( queues[q] ); } // smoothing enabled if ( smoothing > 0 ) { // x = xs magma_scopyvector_async( x->num_rows, dxs.dval, 1, x->dval, 1, queue ); // r = rs magma_scopyvector_async( dr.num_rows, drs.dval, 1, dr.dval, 1, queue ); } cudaProfilerStop(); // get last iteration timing tempo2 = magma_sync_wtime( queue ); magma_queue_sync( queue ); solver_par->runtime = (real_Double_t)tempo2 - tempo1; //--------------STOP TIME---------------- // get final stats solver_par->iter_res = nrmr; CHECK( magma_sresidualvec( A, b, *x, &dr, &residual, queue )); solver_par->final_res = residual; // set solver conclusion if ( info != MAGMA_SUCCESS && info != MAGMA_DIVERGENCE ) { if ( solver_par->init_res > solver_par->final_res ) { info = MAGMA_SLOW_CONVERGENCE; } } cleanup: // free resources // sync all queues, destory additional queues magma_queue_sync( queues[0] ); for ( q = 1; q < nqueues; q++ ) { magma_queue_sync( queues[q] ); magma_queue_destroy( queues[q] ); } // smoothing enabled if ( smoothing > 0 ) { drs.dval = NULL; // needed because its pointer is redirected to dtt magma_smfree( &dxs, queue ); magma_smfree( &drs, queue ); magma_smfree( &dtt, queue ); } dr.dval = NULL; // needed because its pointer is redirected to dt dGcol.dval = NULL; // needed because its pointer is redirected to dG magma_smfree( &dr, queue ); magma_smfree( &dP, queue ); magma_smfree( &dP1, queue ); magma_smfree( &dG, queue ); magma_smfree( &dGcol, queue ); magma_smfree( &dU, queue ); magma_smfree( &dM, queue ); magma_smfree( &df, queue ); magma_smfree( &dt, queue ); magma_smfree( &dc, queue ); magma_smfree( &dv, queue ); magma_smfree( &dskp, queue ); magma_smfree( &dalpha, queue ); magma_smfree( &dbeta, queue ); magma_free_pinned( hMdiag ); magma_free_pinned( hskp ); magma_free_pinned( halpha ); magma_free_pinned( hbeta ); magma_free( d1 ); magma_free( d2 ); solver_par->info = info; return info; /* magma_sidr_strms */ }
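/* Illustrative sketch (not MAGMA code): the "new omega" computation in the
   solver above follows the usual IDR(s)/BiCGStab "maintaining the
   convergence" strategy. With t = A*r it takes om = (t'r)/(t't) and, when
   the angle between t and r is too small (rho < angle, with angle = 0.7 in
   this solver), enlarges om by angle/rho. Stand-alone version with
   hypothetical names: */
#include <stdio.h>
#include <math.h>

static float new_omega(float tt, float tr, float nrmt, float nrmr, float angle)
{
    float om  = tr / tt;                      /* om  = (t'r) / (t't)     */
    float rho = fabsf( tr / (nrmt * nrmr) );  /* rho = |t'r| / (|t| |r|) */
    if (rho < angle)
        om = om * angle / rho;                /* keep the step from collapsing */
    return om;
}

int main(void)
{
    /* toy scalars: t't = 4, t'r = 0.2, |t| = 2, |r| = 1 */
    printf("om = %g\n", new_omega( 4.0f, 0.2f, 2.0f, 1.0f, 0.7f ));
    return 0;
}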
/** Purpose ------- SLATRD2 reduces NB rows and columns of a real symmetric matrix A to symmetric tridiagonal form by an orthogonal similarity transformation Q' * A * Q, and returns the matrices V and W which are needed to apply the transformation to the unreduced part of A. If UPLO = MagmaUpper, SLATRD reduces the last NB rows and columns of a matrix, of which the upper triangle is supplied; if UPLO = MagmaLower, SLATRD reduces the first NB rows and columns of a matrix, of which the lower triangle is supplied. This is an auxiliary routine called by SSYTRD2_GPU. It uses an accelerated HEMV that needs extra memory. Arguments --------- @param[in] uplo magma_uplo_t Specifies whether the upper or lower triangular part of the symmetric matrix A is stored: - = MagmaUpper: Upper triangular - = MagmaLower: Lower triangular @param[in] n INTEGER The order of the matrix A. @param[in] nb INTEGER The number of rows and columns to be reduced. @param[in,out] A REAL array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading n-by-n upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading n-by-n lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit: - if UPLO = MagmaUpper, the last NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements above the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; - if UPLO = MagmaLower, the first NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements below the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= (1,N). @param[out] e REAL array, dimension (N-1) If UPLO = MagmaUpper, E(n-nb:n-1) contains the superdiagonal elements of the last NB columns of the reduced matrix; if UPLO = MagmaLower, E(1:nb) contains the subdiagonal elements of the first NB columns of the reduced matrix. @param[out] tau REAL array, dimension (N-1) The scalar factors of the elementary reflectors, stored in TAU(n-nb:n-1) if UPLO = MagmaUpper, and in TAU(1:nb) if UPLO = MagmaLower. See Further Details. @param[out] W REAL array, dimension (LDW,NB) The n-by-nb matrix W required to update the unreduced part of A. @param[in] ldw INTEGER The leading dimension of the array W. LDW >= max(1,N). @param dA TODO: dimension (ldda, n) ?? @param ldda TODO: ldda >= n ?? @param dW TODO: dimension (lddw, 2*nb) ?? @param lddw TODO: lddw >= n ?? @param dwork TODO: dimension (ldwork) ?? @param ldwork TODO: ldwork >= ceil(n/64)*ldda ?? Further Details --------------- If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary reflectors Q = H(n) H(n-1) . . . H(n-nb+1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i), and tau in TAU(i-1). If UPLO = MagmaLower, the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(nb). 
Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i), and tau in TAU(i). The elements of the vectors v together form the n-by-nb matrix V which is needed, with W, to apply the transformation to the unreduced part of the matrix, using a symmetric rank-2k update of the form: A := A - V*W' - W*V'. The contents of A on exit are illustrated by the following examples with n = 5 and nb = 2: if UPLO = MagmaUpper: if UPLO = MagmaLower: ( a a a v4 v5 ) ( d ) ( a a v4 v5 ) ( 1 d ) ( a 1 v5 ) ( v1 1 a ) ( d 1 ) ( v1 v2 a a ) ( d ) ( v1 v2 a a a ) where d denotes a diagonal element of the reduced matrix, a denotes an element of the original matrix that is unchanged, and vi denotes an element of the vector defining H(i). @ingroup magma_ssyev_aux ********************************************************************/ extern "C" magma_int_t magma_slatrd2( magma_uplo_t uplo, magma_int_t n, magma_int_t nb, float *A, magma_int_t lda, float *e, float *tau, float *W, magma_int_t ldw, magmaFloat_ptr dA, magma_int_t ldda, magmaFloat_ptr dW, magma_int_t lddw, magmaFloat_ptr dwork, magma_int_t ldwork) { #define A(i_, j_) (A + (i_) + (j_)*lda) #define W(i_, j_) (W + (i_) + (j_)*ldw) #define dA(i_, j_) (dA + (i_) + (j_)*ldda) #define dW(i_, j_) (dW + (i_) + (j_)*lddw) const float c_neg_one = MAGMA_S_NEG_ONE; const float c_one = MAGMA_S_ONE; const float c_zero = MAGMA_S_ZERO; const magma_int_t ione = 1; float alpha, value; magma_int_t i, i_n, i_1, iw; /* Check arguments */ magma_int_t info = 0; if ( uplo != MagmaLower && uplo != MagmaUpper ) { info = -1; } else if ( n < 0 ) { info = -2; } else if ( nb < 1 ) { info = -3; } else if ( lda < max(1,n) ) { info = -5; } else if ( ldw < max(1,n) ) { info = -9; } else if ( ldda < max(1,n) ) { info = -11; } else if ( lddw < max(1,n) ) { info = -13; } else if ( ldwork < ldda*ceildiv(n,64) ) { info = -15; } if (info != 0) { magma_xerbla( __func__, -(info) ); return info; } /* Quick return if possible */ if (n == 0) { return info; } magma_queue_t stream; magma_queue_create( &stream ); float *f; magma_smalloc_cpu( &f, n ); if ( f == NULL ) { info = MAGMA_ERR_HOST_ALLOC; return info; } if (uplo == MagmaUpper) { /* Reduce last NB columns of upper triangle */ for (i = n-1; i >= n - nb; --i) { i_1 = i + 1; i_n = n - i - 1; iw = i - n + nb; if (i < n-1) { /* Update A(1:i,i) */ #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv( &i_n, W(i, iw+1), &ldw ); #endif blasf77_sgemv( "No transpose", &i_1, &i_n, &c_neg_one, A(0, i+1), &lda, W(i, iw+1), &ldw, &c_one, A(0, i), &ione ); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv( &i_n, W(i, iw+1), &ldw ); lapackf77_slacgv( &i_n, A(i, i+1), &lda ); #endif blasf77_sgemv( "No transpose", &i_1, &i_n, &c_neg_one, W(0, iw+1), &ldw, A(i, i+1), &lda, &c_one, A(0, i), &ione ); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv( &i_n, A(i, i+1), &lda ); #endif } if (i > 0) { /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */ alpha = *A(i-1, i); lapackf77_slarfg( &i, &alpha, A(0, i), &ione, &tau[i - 1] ); e[i-1] = MAGMA_S_REAL( alpha ); *A(i-1,i) = MAGMA_S_ONE; /* Compute W(1:i-1,i) */ // 1. Send the block reflector A(0:n-i-1,i) to the GPU magma_ssetvector_async( i, A(0, i), 1, dA(0, i), 1, stream ); magmablas_ssymv_work( MagmaUpper, i, c_one, dA(0, 0), ldda, dA(0, i), ione, c_zero, dW(0, iw), ione, dwork, ldwork, stream ); // 2. 
Start getting the result back (asynchronously) magma_sgetmatrix_async( i, 1, dW(0, iw), lddw, W(0, iw), ldw, stream ); if (i < n-1) { blasf77_sgemv( MagmaConjTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw, A(0, i), &ione, &c_zero, W(i+1, iw), &ione ); } // 3. Here we need ssymv result W(0, iw) magma_queue_sync( stream ); if (i < n-1) { blasf77_sgemv( "No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda, W(i+1, iw), &ione, &c_one, W(0, iw), &ione ); blasf77_sgemv( MagmaConjTransStr, &i, &i_n, &c_one, A(0, i+1), &lda, A(0, i), &ione, &c_zero, W(i+1, iw), &ione ); blasf77_sgemv( "No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw, W(i+1, iw), &ione, &c_one, W(0, iw), &ione ); } blasf77_sscal( &i, &tau[i - 1], W(0, iw), &ione ); value = magma_cblas_sdot( i, W(0,iw), ione, A(0,i), ione ); alpha = tau[i - 1] * -0.5f * value; blasf77_saxpy( &i, &alpha, A(0, i), &ione, W(0, iw), &ione ); } } } else { /* Reduce first NB columns of lower triangle */ for (i = 0; i < nb; ++i) { /* Update A(i:n,i) */ i_n = n - i; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv( &i, W(i, 0), &ldw ); #endif blasf77_sgemv( "No transpose", &i_n, &i, &c_neg_one, A(i, 0), &lda, W(i, 0), &ldw, &c_one, A(i, i), &ione ); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv( &i, W(i, 0), &ldw ); lapackf77_slacgv( &i, A(i, 0), &lda ); #endif blasf77_sgemv( "No transpose", &i_n, &i, &c_neg_one, W(i, 0), &ldw, A(i, 0), &lda, &c_one, A(i, i), &ione ); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv( &i, A(i, 0), &lda ); #endif if (i < n-1) { /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */ i_n = n - i - 1; alpha = *A(i+1, i); lapackf77_slarfg( &i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i] ); e[i] = MAGMA_S_REAL( alpha ); *A(i+1,i) = MAGMA_S_ONE; /* Compute W(i+1:n,i) */ // 1. Send the block reflector A(i+1:n,i) to the GPU magma_ssetvector_async( i_n, A(i+1, i), 1, dA(i+1, i), 1, stream ); magmablas_ssymv_work( MagmaLower, i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero, dW(i+1, i), ione, dwork, ldwork, stream ); // 2. Start getting the result back (asynchronously) magma_sgetmatrix_async( i_n, 1, dW(i+1, i), lddw, W(i+1, i), ldw, stream ); blasf77_sgemv( MagmaConjTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw, A(i+1, i), &ione, &c_zero, W(0, i), &ione ); blasf77_sgemv( "No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda, W(0, i), &ione, &c_zero, f, &ione ); blasf77_sgemv( MagmaConjTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda, A(i+1, i), &ione, &c_zero, W(0, i), &ione ); // 3. Here we need ssymv result W(i+1, i) magma_queue_sync( stream ); if (i != 0) blasf77_saxpy( &i_n, &c_one, f, &ione, W(i+1, i), &ione ); blasf77_sgemv( "No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw, W(0, i), &ione, &c_one, W(i+1, i), &ione ); blasf77_sscal( &i_n, &tau[i], W(i+1,i), &ione ); value = magma_cblas_sdot( i_n, W(i+1,i), ione, A(i+1,i), ione ); alpha = tau[i] * -0.5f * value; blasf77_saxpy( &i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione ); } } } magma_free_cpu( f ); magma_queue_destroy( stream ); return info; } /* magma_slatrd */
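/*
    The Further Details above describe the reflectors only implicitly; the small
    sketch below (a hypothetical helper, real single precision) shows the
    operation that each H(i) = I - tau * v * v' performs when applied to a
    vector, i.e. x := x - tau * v * (v' * x).  The caller (SSYTRD2_GPU) later
    combines the returned V and W in the symmetric rank-2k update
    A := A - V*W' - W*V' to finish updating the unreduced part of A.
*/
static void example_apply_reflector(
    magma_int_t n, float tau,
    const float *v, float *x )
{
    float dot = 0.f;
    for ( magma_int_t i = 0; i < n; ++i ) {
        dot += v[i] * x[i];                 // v' * x
    }
    for ( magma_int_t i = 0; i < n; ++i ) {
        x[i] -= tau * v[i] * dot;           // x := x - tau * v * (v' * x)
    }
}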
/** Purpose ======= CLAHEF computes a partial factorization of a complex Hermitian matrix A using the Bunch-Kaufman diagonal pivoting method. The partial factorization has the form: A = ( I U12 ) ( A11 0 ) ( I 0 ) if UPLO = 'U', or: ( 0 U22 ) ( 0 D ) ( U12' U22' ) A = ( L11 0 ) ( D 0 ) ( L11' L21' ) if UPLO = 'L' ( L21 I ) ( 0 A22 ) ( 0 I ) where the order of D is at most NB. The actual order is returned in the argument KB, and is either NB or NB-1, or N if N <= NB. Note that U' denotes the conjugate transpose of U. CLAHEF is an auxiliary routine called by CHETRF. It uses blocked code (calling Level 3 BLAS) to update the submatrix A11 (if UPLO = 'U') or A22 (if UPLO = 'L'). Arguments --------- @param[in] UPLO CHARACTER Specifies whether the upper or lower triangular part of the Hermitian matrix A is stored: - = 'U': Upper triangular - = 'L': Lower triangular @param[in] N INTEGER The order of the matrix A. N >= 0. @param[in] NB INTEGER The maximum number of columns of the matrix A that should be factored. NB should be at least 2 to allow for 2-by-2 pivot blocks. @param[out] KB INTEGER The number of columns of A that were actually factored. KB is either NB-1 or NB, or N if N <= NB. @param[in,out] A COMPLEX array, dimension (LDA,N) On entry, the Hermitian matrix A. If UPLO = 'U', the leading n-by-n upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading n-by-n lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, A contains details of the partial factorization. @param[in] LDA INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] ipiv INTEGER array, dimension (N) Details of the interchanges and the block structure of D. If UPLO = 'U', only the last KB elements of ipiv are set; if UPLO = 'L', only the first KB elements are set. \n If ipiv(k) > 0, then rows and columns k and ipiv(k) were interchanged and D(k,k) is a 1-by-1 diagonal block. If UPLO = 'U' and ipiv(k) = ipiv(k-1) < 0, then rows and columns k-1 and -ipiv(k) were interchanged and D(k-1:k,k-1:k) is a 2-by-2 diagonal block. If UPLO = 'L' and ipiv(k) = ipiv(k+1) < 0, then rows and columns k+1 and -ipiv(k) were interchanged and D(k:k+1,k:k+1) is a 2-by-2 diagonal block. @param[out] W (workspace) COMPLEX array, dimension (LDW,NB) @param[in] LDW INTEGER The leading dimension of the array W. LDW >= max(1,N). @param[out] INFO INTEGER - = 0: successful exit - > 0: if INFO = k, D(k,k) is exactly zero. The factorization has been completed, but the block diagonal matrix D is exactly singular. @ingroup magma_chetrf_comp ********************************************************************/ extern "C" magma_int_t magma_clahef_gpu( magma_uplo_t uplo, magma_int_t n, magma_int_t nb, magma_int_t *kb, magmaFloatComplex *hA, magma_int_t lda, magmaFloatComplex_ptr dA, size_t dA_offset, magma_int_t ldda, magma_int_t *ipiv, magmaFloatComplex_ptr dW, size_t dW_offset, magma_int_t lddw, magma_queue_t queue, magma_int_t *info) { /* .. Parameters .. */ float d_one = 1.0; float d_zero = 0.0; float d_eight = 8.0; float d_seven = 7.0; #if defined(PRECISION_c) float f_zero = 0.0; #endif magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_mone = -MAGMA_C_ONE; magma_int_t upper = (uplo == MagmaUpper); magma_int_t ione = 1; /* .. Local Scalars .. 
*/ magma_int_t imax = 0, jmax = 0, kk, kkW, kp, kstep, iinfo; float abs_akk, alpha, colmax, R1, rowmax; magmaFloatComplex Zimax, Z; #define dA(i, j) dA, dA_offset + (j)*ldda + (i) #define dW(i, j) dW, dW_offset + (j)*lddw + (i) #define A(i, j) (hA + (j)*lda + (i)) /* .. Executable Statements .. */ *info = 0; /* Initialize alpha for use in choosing pivot block size. */ alpha = ( d_one+sqrt( d_seven ) ) / d_eight; magma_event_t event = NULL; if( upper ) { /* Factorize the trailing columns of A using the upper triangle of A and working backwards, and compute the matrix W = U12*D for use in updating A11 (note that conjg(W) is actually stored) K is the main loop index, decreasing from N in steps of 1 or 2 KW is the column of W which corresponds to column K of A */ int k, kw = 0; for (k = n-1; k+1 > max(n-nb+1, nb); k -= kstep) { kw = nb - (n-k); /* Copy column K of A to column KW of W and update it */ magma_ccopy( k+1, dA( 0, k ), 1, dW( 0, kw ), 1, queue ); // set imaginary part of diagonal to be zero #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dW, 2*(k+ kw*lddw+dW_offset)+1, 1, queue, &event); magma_queue_sync( queue ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dW, 2*(k+ kw*lddw+dW_offset)+1, 1, queue, &event); magma_queue_sync( queue ); #endif if (k+1 < n) { magma_cgemv( MagmaNoTrans, k+1, n-(k+1), c_mone, dA( 0, k+1 ), ldda, dW( k, kw+1 ), lddw, c_one, dW( 0, kw ), ione, queue ); // set imaginary part of diagonal to be zero #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dW, 2*(k+ kw*lddw+dW_offset)+1, 1, queue, &event ); magma_queue_sync( queue ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dW, 2*(k+ kw*lddw+dW_offset)+1, 1, queue, &event ); magma_queue_sync( queue ); #endif } kstep = 1; /* Determine rows and columns to be interchanged and whether a 1-by-1 or 2-by-2 pivot block will be used */ magma_cgetvector_async( 1, dW( k, kw ), 1, &Z, 1, queue, &event ); magma_queue_sync( queue ); abs_akk = fabs( MAGMA_C_REAL( Z ) ); /* imax is the row-index of the largest off-diagonal element in column K, and colmax is its absolute value */ if( k > 0 ) { // magma is one-base imax = magma_icamax( k, dW( 0, kw ), 1, queue ) - 1; magma_cgetvector( 1, dW( imax, kw ), 1, &Z, 1, queue ); colmax = MAGMA_C_ABS1( Z ); } else { colmax = d_zero; } if( max( abs_akk, colmax ) == 0.0 ) { /* Column K is zero: set INFO and continue */ if ( *info == 0 ) *info = k; kp = k; #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dA, 2*(k+ k*ldda+dA_offset)+1, 1, queue, &event ); magma_queue_sync( queue ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dA, 2*(k+ k*ldda+dA_offset)+1, 1, queue, &event ); magma_queue_sync( queue ); #endif } else { if( abs_akk >= alpha*colmax ) { /* no interchange, use 1-by-1 pivot block */ kp = k; } else { /* Copy column imax to column KW-1 of W and update it */ magma_ccopy( imax+1, dA( 0, imax ), 1, dW( 0, kw-1 ), 1, queue ); #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dW, 2*(imax+ (kw-1)*lddw+dW_offset)+1, 1, queue, &event ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dW, 2*(imax+ (kw-1)*lddw+dW_offset)+1, 1, queue, &event ); #endif #if defined(PRECISION_z) || defined(PRECISION_c) magmablas_clacpy_cnjg( k-imax, dA(imax,imax+1), ldda, dW(imax+1,kw-1), 1, queue ); #else magma_ccopy( k-imax, dA(imax,imax+1), ldda, dW(imax+1,kw-1), 1, queue ); #endif if( k+1 < n ) { magma_cgemv( MagmaNoTrans, k+1, n-(k+1), c_mone, dA( 0, k+1 ), ldda, dW( 
imax, kw+1 ), lddw, c_one, dW( 0, kw-1 ), ione, queue ); #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dW, 2*(imax+ (kw-1)*lddw+dW_offset)+1, 1, queue, &event ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dW, 2*(imax+ (kw-1)*lddw+dW_offset)+1, 1, queue, &event ); #endif } magma_cgetvector_async( 1, dW( imax, kw-1 ), 1, &Zimax, 1, queue, &event ); magma_queue_sync( queue ); /* jmax is the column-index of the largest off-diagonal element in row imax, and rowmax is its absolute value */ jmax = imax + magma_icamax( k-imax, dW( imax+1, kw-1 ), 1, queue ); magma_cgetvector( 1, dW( jmax, kw-1 ), 1, &Z, 1, queue ); rowmax = MAGMA_C_ABS1( Z ); if ( imax > 0 ) { // magma is one-base jmax = magma_icamax( imax, dW( 0, kw-1 ), 1, queue ) - 1; magma_cgetvector( 1, dW( jmax, kw-1 ), 1, &Z, 1, queue ); rowmax = max( rowmax, MAGMA_C_ABS1( Z ) ); } if( abs_akk >= alpha*colmax*( colmax / rowmax ) ) { /* no interchange, use 1-by-1 pivot block */ kp = k; } else if ( fabs( MAGMA_C_REAL( Zimax ) ) >= alpha*rowmax ) { /* interchange rows and columns K and imax, use 1-by-1 pivot block */ kp = imax; /* copy column KW-1 of W to column KW */ magma_ccopy( k+1, dW( 0, kw-1 ), 1, dW( 0, kw ), 1, queue ); } else { /* interchange rows and columns K-1 and imax, use 2-by-2 pivot block */ kp = imax; kstep = 2; } } kk = k - kstep + 1; kkW = nb - (n - kk); /* Updated column kp is already stored in column kkW of W */ if( kp != kk ) { /* Interchange rows kk and kp in last kk columns of A and W */ // note: row-swap A(:,kk) magmablas_cswap( n-kk, dA( kk, kk ), ldda, dA( kp, kk ), ldda, queue ); magmablas_cswap( n-kk, dW( kk, kkW), lddw, dW( kp, kkW), lddw, queue ); /* Copy non-updated column kk to column kp */ #if defined(PRECISION_z) || defined(PRECISION_c) magmablas_clacpy_cnjg( kk-kp-1, dA( kp+1, kk ), 1, dA( kp, kp+1 ), ldda, queue ); #else magma_ccopy( kk-kp-1, dA( kp+1, kk ), 1, dA( kp, kp+1 ), ldda, queue ); #endif // now A(kp,kk) should be A(kk,kk), and copy to A(kp,kp) magma_ccopy( kp+1, dA( 0, kk ), 1, dA( 0, kp ), 1, queue ); #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dA, 2*(kp+ kp*ldda+dA_offset)+1, 1, queue, &event ); magma_queue_sync( queue ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dA, 2*(kp+ kp*ldda+dA_offset)+1, 1, queue, &event ); #endif } if( kstep == 1 ) { /* 1-by-1 pivot block D(k): column KW of W now holds W(k) = U(k)*D(k) where U(k) is the k-th column of U Store U(k) in column k of A */ magma_ccopy( k+1, dW( 0, kw ), 1, dA( 0, k ), 1, queue ); if ( k > 0 ) { magma_cgetvector_async( 1, dA( k, k ), 1, &Z, 1, queue, &event ); magma_queue_sync( queue ); R1 = d_one / MAGMA_C_REAL( Z ); magma_csscal( k, R1, dA( 0, k ), 1, queue ); /* Conjugate W(k) */ #if defined(PRECISION_z) || defined(PRECISION_c) magmablas_clacpy_cnjg( k, dW( 0, kw ), 1, dW( 0, kw ), 1, queue ); #endif } } else { /* 2-by-2 pivot block D(k): columns KW and KW-1 of W now hold ( W(k-1) W(k) ) = ( U(k-1) U(k) )*D(k) where U(k) and U(k-1) are the k-th and (k-1)-th columns of U */ if( k > 1 ) { /* Store U(k) and U(k-1) in columns k and k-1 of A */ magmablas_clascl_2x2( MagmaUpper, k-1, dW(0, kw-1), lddw, dA(0,k-1), ldda, &iinfo, queue ); } /* Copy D(k) to A */ magma_ccopymatrix( 2, 2, dW( k-1, kw-1 ), lddw, dA( k-1, k-1 ), ldda, queue ); /* Conjugate W(k) and W(k-1) */ #if defined(PRECISION_z) || defined(PRECISION_c) magmablas_clacpy_cnjg( k, dW( 0, kw ), 1, dW( 0, kw ), 1, queue ); magmablas_clacpy_cnjg( k-1, dW( 0, kw-1 ), 1, dW( 0, kw-1 ), 1, queue ); #endif } 
} /* Store details of the interchanges in ipiv */ if( kstep == 1 ) { ipiv[ k ] = 1+kp; } else { ipiv[ k ] = -(1+kp); ipiv[ k-1 ] = -(1+kp); } } /* Update the upper triangle of A11 (= A(1:k,1:k)) as A11 := A11 - U12*D*U12' = A11 - U12*W' computing blocks of NB columns at a time (note that conjg(W) is actually stored) */ kw = nb - (n-k); for (int j = ( k / nb )*nb; j >= 0; j -= nb ) { int jb = min( nb, k-j+1 ); #ifdef SYMMETRIC_UPDATE /* Update the upper triangle of the diagonal block */ for (int jj = j; jj < j + jb; jj++) { #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dA, 2*(jj+ jj*ldda+dA_offset)+1, 1, queue, &event ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dA, 2*(jj+ jj*ldda+dA_offset)+1, 1, queue, &event ); #endif magma_cgemv( MagmaNoTrans, jj-j+1, n-(k+1), c_mone, dA( j, k+1 ), ldda, dW( jj, kw+1 ), lddw, c_one, dA( j, jj ), 1, queue ); #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dA, 2*(jj+ jj*ldda+dA_offset)+1, 1, queue, &event ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dA, 2*(jj+ jj*ldda+dA_offset)+1, 1, queue, &event ); #endif } /* Update the rectangular superdiagonal block */ magma_cgemm( MagmaNoTrans, MagmaTrans, j, jb, n-(k+1), c_mone, dA( 0, k+1 ), ldda, dW( j, kw+1 ), lddw, c_one, dA( 0, j ), ldda, queue ); #else #if defined(PRECISION_z) magmablas_dlaset(MagmaUpperLower, 1, jb, 0, 0, dA, 2*(j+ j*ldda+dA_offset)+1, 2*(1+ldda), queue ); #elif defined(PRECISION_c) magmablas_slaset(MagmaUpperLower, 1, jb, 0, 0, dA, 2*(j+ j*ldda+dA_offset)+1, 2*(1+ldda), queue ); #endif magma_cgemm( MagmaNoTrans, MagmaTrans, j+jb, jb, n-(k+1), c_mone, dA( 0, k+1 ), ldda, dW( j, kw+1 ), lddw, c_one, dA( 0, j ), ldda, queue ); #if defined(PRECISION_z) magmablas_dlaset(MagmaUpperLower, 1, jb, 0, 0, dA, 2*(j+ j*ldda+dA_offset)+1, 2*(1+ldda), queue ); #elif defined(PRECISION_c) magmablas_slaset(MagmaUpperLower, 1, jb, 0, 0, dA, 2*(j+ j*ldda+dA_offset)+1, 2*(1+ldda), queue ); #endif #endif } /* Put U12 in standard form by partially undoing the interchanges in columns k+1:n */ for (int j = k+1; j < n;) { int jj = j; int jp = ipiv[ j ]; if( jp < 0 ) { jp = -jp; j = j + 1; } j = j + 1; jp = jp - 1; if( jp != jj && j < n ) magmablas_cswap( n-j, dA( jp, j ), ldda, dA( jj, j ), ldda, queue ); } // copying the panel back to CPU magma_cgetmatrix_async( n, n-(k+1), dA(0,k+1), ldda, A(0,k+1), lda, queue, &event ); magma_queue_sync( queue ); /* Set KB to the number of columns factorized */ *kb = n - (k+1); } else { /* Factorize the leading columns of A using the lower triangle of A and working forwards, and compute the matrix W = L21*D for use in updating A22 (note that conjg(W) is actually stored) K is the main loop index, increasing from 1 in steps of 1 or 2 */ int k; for (k = 0; k < min(nb-1,n); k += kstep) { /* Copy column K of A to column K of W and update it */ /* -------------------------------------------------------------- */ magma_ccopy( n-k, dA( k, k ), 1, dW( k, k ), 1, queue ); // set imaginary part of diagonal to be zero #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dW, 2*(k*lddw+k+dW_offset)+1, 1, queue, &event); magma_queue_sync( queue ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dW, 2*(k*lddw+k+dW_offset)+1, 1, queue, &event); magma_queue_sync( queue ); #endif /* -------------------------------------------------------------- */ magma_cgemv( MagmaNoTrans, n-k, k, c_mone, dA( k, 0 ), ldda, dW( k, 0 ), lddw, c_one, dW( k, k ), ione, queue ); // re-set imaginary part of 
diagonal to be zero #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dW, 2*(k*lddw+k+dW_offset)+1, 1, queue, &event ); magma_queue_sync( queue ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dW, 2*(k*lddw+k+dW_offset)+1, 1, queue, &event ); magma_queue_sync( queue ); #endif kstep = 1; /* Determine rows and columns to be interchanged and whether a 1-by-1 or 2-by-2 pivot block will be used */ magma_cgetvector_async( 1, dW( k, k ), 1, &Z, 1, queue, &event ); magma_queue_sync( queue ); abs_akk = fabs( MAGMA_C_REAL( Z ) ); /* imax is the row-index of the largest off-diagonal element in column K, and colmax is its absolute value */ if( k < n-1 ) { // magmablas is one-base imax = k + magma_icamax( n-k-1, dW(k+1,k), 1, queue ); magma_cgetvector( 1, dW( imax,k ), 1, &Z, 1, queue ); colmax = MAGMA_C_ABS1( Z ); } else { colmax = d_zero; } if ( max( abs_akk, colmax ) == 0.0 ) { /* Column K is zero: set INFO and continue */ if( *info == 0 ) *info = k; kp = k; // make sure the imaginary part of diagonal is zero #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dA, 2*(k*ldda+k+dA_offset)+1, 1, queue, &event ); magma_queue_sync( queue ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dA, 2*(k*ldda+k+dA_offset)+1, 1, queue, &event ); magma_queue_sync( queue ); #endif } else { if ( abs_akk >= alpha*colmax ) { /* no interchange, use 1-by-1 pivot block */ kp = k; } else { /* Copy column imax to column K+1 of W and update it */ #if defined(PRECISION_z) || defined(PRECISION_c) magmablas_clacpy_cnjg( imax-k, dA(imax,k), ldda, dW(k,k+1), 1, queue ); #else magma_ccopy( imax-k, dA( imax, k ), ldda, dW( k, k+1 ), 1, queue ); #endif magma_ccopy( n-imax, dA( imax, imax ), 1, dW( imax, k+1 ), 1, queue ); #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dW, 2*((k+1)*lddw+imax+dW_offset)+1, 1, queue, &event); magma_queue_sync( queue ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dW, 2*((k+1)*lddw+imax+dW_offset)+1, 1, queue, &event); magma_queue_sync( queue ); #endif magma_cgemv( MagmaNoTrans, n-k, k, c_mone, dA( k, 0 ), ldda, dW( imax, 0 ), lddw, c_one, dW( k, k+1 ), ione, queue ); #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dW, 2*((k+1)*lddw+imax+dW_offset)+1, 1, queue, &event); magma_queue_sync( queue ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dW, 2*((k+1)*lddw+imax+dW_offset)+1, 1, queue, &event); magma_queue_sync( queue ); #endif magma_cgetvector_async( 1, dW(imax,k+1), 1, &Zimax, 1, queue, &event); magma_queue_sync( queue ); /* jmax is the column-index of the largest off-diagonal element in row imax, and rowmax is its absolute value */ // magmablas is one-base jmax = k-1 + magma_icamax( imax-k, dW(k, k+1), 1, queue ); magma_cgetvector( 1, dW(jmax,k+1), 1, &Z, 1, queue ); rowmax = MAGMA_C_ABS1( Z ); if( imax < n-1 ) { // magmablas is one-base jmax = imax + magma_icamax( (n-1)-imax, dW(imax+1,k+1), 1, queue); magma_cgetvector( 1, dW(jmax,k+1), 1, &Z, 1, queue ); rowmax = max( rowmax, MAGMA_C_ABS1( Z ) ); } if( abs_akk >= alpha*colmax*( colmax / rowmax ) ) { /* no interchange, use 1-by-1 pivot block */ kp = k; } else if( fabs( MAGMA_C_REAL( Zimax ) ) >= alpha*rowmax ) { /* interchange rows and columns K and imax, use 1-by-1 pivot block */ kp = imax; /* copy column K+1 of W to column K */ magma_ccopy( n-k, dW( k, k+1 ), 1, dW( k, k ), 1, queue ); } else { /* interchange rows and columns K+1 and imax, use 2-by-2 pivot block */ kp = imax; kstep = 2; } } kk = k + kstep - 
1; /* Updated column kp is already stored in column kk of W */ if( kp != kk ) { /* Copy non-updated column kk to column kp */ /* ------------------------------------------------------------------ */ #if defined(PRECISION_z) || defined(PRECISION_c) magmablas_clacpy_cnjg( kp-kk, dA( kk, kk ), 1, dA( kp, kk ), ldda, queue ); #else magma_ccopy( kp-kk, dA( kk, kk ), 1, dA( kp, kk ), ldda, queue ); #endif if ( kp < n ) { magma_ccopy( n-kp, dA( kp, kk), 1, dA( kp, kp ), 1, queue ); } /* ------------------------------------------------------------------ */ /* Interchange rows kk and kp in first kk columns of A and W */ magmablas_cswap( kk+1, dA( kk, 0 ), ldda, dA( kp, 0 ), ldda, queue ); magmablas_cswap( kk+1, dW( kk, 0 ), lddw, dW( kp, 0 ), lddw, queue ); } if ( kstep == 1 ) { /* 1-by-1 pivot block D(k): column k of W now holds W(k) = L(k)*D(k) where L(k) is the k-th column of L Store L(k) in column k of A */ magma_ccopy( n-k, dW( k, k ), 1, dA( k, k ), 1, queue ); if ( k < n-1 ) { magma_cgetvector_async( 1, dA(k,k), 1, &Z, 1, queue, &event ); magma_queue_sync( queue ); R1 = d_one / MAGMA_C_REAL( Z ); magma_csscal((n-1)-k, R1, dA( k+1,k ), 1, queue); /* Conjugate W(k) */ #if defined(PRECISION_z) || defined(PRECISION_c) magmablas_clacpy_cnjg( (n-1)-k, dW( k+1,k ), 1, dW( k+1,k ), 1, queue ); #endif } } else { /* 2-by-2 pivot block D(k): columns k and k+1 of W now hold ( W(k) W(k+1) ) = ( L(k) L(k+1) )*D(k) where L(k) and L(k+1) are the k-th and (k+1)-th columns of L */ magmablas_clascl_2x2( MagmaLower, n-(k+2), dW(k,k), lddw, dA(k+2,k), ldda, &iinfo, queue ); /* Copy D(k) to A */ magma_ccopymatrix( 2, 2, dW( k, k ), lddw, dA( k, k ), ldda, queue ); /* Conjugate W(k) and W(k+1) */ #if defined(PRECISION_z) || defined(PRECISION_c) magmablas_clacpy_cnjg( (n-1)-k, dW( k+1,k ), 1, dW( k+1,k ), 1, queue ); magmablas_clacpy_cnjg( (n-1)-k-1, dW( k+2,k+1), 1, dW( k+2,k+1 ), 1, queue ); #endif } } /* Store details of the interchanges in ipiv */ if ( kstep == 1 ) { ipiv[k] = kp+1; } else { ipiv[k] = -kp-1; ipiv[k+1] = -kp-1; } } /* Update the lower triangle of A22 (= A(k:n,k:n)) as A22 := A22 - L21*D*L21' = A22 - L21*W' computing blocks of NB columns at a time (note that conjg(W) is actually stored) */ for( int j = k; j < n; j += nb ) { int jb = min( nb, n-j ); /* Update the lower triangle of the diagonal block */ #ifdef SYMMETRIC_UPDATE for (int jj = j; jj < j + jb; jj++) { int jnb = j + jb - jj; /* -------------------------------------------------------- */ magma_cgemv( MagmaNoTrans, jnb, k, c_mone, dA( jj, 0 ), ldda, dW( jj, 0 ), lddw, c_one, dA( jj, jj ), ione, queue ); /* -------------------------------------------------------- */ } /* Update the rectangular subdiagonal block */ if( j+jb < n ) { int nk = n - (j+jb); /* -------------------------------------------- */ magma_cgemm( MagmaNoTrans, MagmaTrans, nk, jb, k, c_mone, dA( j+jb, 0 ), ldda, dW( j, 0 ), lddw, c_one, dA( j+jb, j ), ldda, queue ); /* ------------------------------------------- */ } #else #if defined(PRECISION_z) magmablas_dlaset(MagmaUpperLower, 1, jb, 0, 0, dA, 2*(j*ldda+j+dA_offset)+1, 2*(1+ldda), queue ); #elif defined(PRECISION_c) magmablas_slaset(MagmaUpperLower, 1, jb, 0, 0, dA, 2*(j*ldda+j+dA_offset)+1, 2*(1+ldda), queue ); #endif magma_cgemm( MagmaNoTrans, MagmaTrans, n-j, jb, k, c_mone, dA( j, 0 ), ldda, dW( j, 0 ), lddw, c_one, dA( j, j ), ldda, queue ); #if defined(PRECISION_z) magmablas_dlaset(MagmaUpperLower, 1, jb, 0, 0, dA, 2*(j*ldda+j+dA_offset)+1, 2*(1+ldda), queue ); #elif defined(PRECISION_c) 
magmablas_slaset(MagmaUpperLower, 1, jb, 0, 0, dA, 2*(j*ldda+j+dA_offset)+1, 2*(1+ldda), queue ); #endif #endif } /* Put L21 in standard form by partially undoing the interchanges in columns 1:k-1 */ for (int j = k; j > 0;) { int jj = j; int jp = ipiv[j-1]; if( jp < 0 ) { jp = -jp; j--; } j--; if ( jp != jj && j >= 1 ) { magmablas_cswap( j, dA( jp-1,0 ), ldda, dA( jj-1,0 ), ldda, queue ); } } // copying the panel back to CPU magma_cgetmatrix_async( n, k, dA(0,0), ldda, A(0,0), lda, queue, &event ); magma_queue_sync( queue ); /* Set KB to the number of columns factorized */ *kb = k; } return *info; /* End of CLAHEF */ }
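/*
    A minimal sketch (hypothetical helper) of how a caller interprets the ipiv
    array documented above, for the lower-triangular case: ipiv[k] > 0 marks a
    1-by-1 pivot block D(k,k), while ipiv[k] == ipiv[k+1] < 0 marks a 2-by-2
    pivot block D(k:k+1,k:k+1), with -ipiv[k] giving the 1-based row/column
    that was interchanged.  kb is the number of columns actually factored, as
    returned in KB.
*/
static magma_int_t example_count_2x2_pivots_lower(
    magma_int_t kb, const magma_int_t *ipiv )
{
    magma_int_t k = 0, n2x2 = 0;
    while ( k < kb ) {
        if ( ipiv[k] > 0 ) {
            k += 1;          // 1-by-1 diagonal block at D(k,k)
        }
        else {
            n2x2 += 1;       // 2-by-2 diagonal block spans columns k and k+1
            k += 2;
        }
    }
    return n2x2;
}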