magma_int_t magma_clatrsd( magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_bool_t normin, magma_int_t n, const magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex lambda, magmaFloatComplex *x, float *scale, float *cnorm, magma_int_t *info) { #define A(i,j) (A + (i) + (j)*lda) /* constants */ const magma_int_t ione = 1; const float d_half = 0.5; const magmaFloatComplex c_zero = MAGMA_C_ZERO; const magmaFloatComplex c_one = MAGMA_C_ONE; /* System generated locals */ magma_int_t len; magmaFloatComplex ztmp; /* Local variables */ magma_int_t i, j; float xj, rec, tjj; magma_int_t jinc; float xbnd; magma_int_t imax; float tmax; magmaFloatComplex tjjs; float xmax, grow; float tscal; magmaFloatComplex uscal; magma_int_t jlast; magmaFloatComplex csumj; float bignum; magma_int_t jfirst; float smlnum; /* Function Body */ *info = 0; magma_int_t upper = (uplo == MagmaUpper); magma_int_t notran = (trans == MagmaNoTrans); magma_int_t nounit = (diag == MagmaNonUnit); /* Test the input parameters. */ if ( ! upper && uplo != MagmaLower ) { *info = -1; } else if (! notran && trans != MagmaTrans && trans != MagmaConjTrans) { *info = -2; } else if ( ! nounit && diag != MagmaUnit ) { *info = -3; } else if ( ! (normin == MagmaTrue) && ! (normin == MagmaFalse) ) { *info = -4; } else if ( n < 0 ) { *info = -5; } else if ( lda < max(1,n) ) { *info = -7; } if ( *info != 0 ) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if ( n == 0 ) { return *info; } /* Determine machine dependent parameters to control overflow. */ smlnum = lapackf77_slamch( "Safe minimum" ); bignum = 1. / smlnum; lapackf77_slabad( &smlnum, &bignum ); smlnum /= lapackf77_slamch( "Precision" ); bignum = 1. / smlnum; *scale = 1.; if ( normin == MagmaFalse ) { /* Compute the 1-norm of each column, not including the diagonal. */ if ( upper ) { /* A is upper triangular. */ cnorm[0] = 0.; for( j = 1; j < n; ++j ) { cnorm[j] = magma_cblas_scasum( j, A(0,j), ione ); } } else { /* A is lower triangular. */ for( j = 0; j < n-1; ++j ) { cnorm[j] = magma_cblas_scasum( n-(j+1), A(j+1,j), ione ); } cnorm[n-1] = 0.; } } /* Scale the column norms by TSCAL if the maximum element in CNORM is */ /* greater than BIGNUM/2. */ imax = blasf77_isamax( &n, &cnorm[0], &ione ) - 1; tmax = cnorm[imax]; if ( tmax <= bignum * 0.5 ) { tscal = 1.; } else { tscal = 0.5 / (smlnum * tmax); blasf77_sscal( &n, &tscal, &cnorm[0], &ione ); } /* ================================================================= */ /* Compute a bound on the computed solution vector to see if the */ /* Level 2 BLAS routine CTRSV can be used. */ xmax = 0.; for( j = 0; j < n; ++j ) { xmax = max( xmax, 0.5*MAGMA_C_ABS1( x[j] )); } xbnd = xmax; if ( notran ) { /* ---------------------------------------- */ /* Compute the growth in A * x = b. */ if ( upper ) { jfirst = n-1; jlast = 0; jinc = -1; } else { jfirst = 0; jlast = n; jinc = 1; } if ( tscal != 1. ) { grow = 0.; goto L60; } /* A is non-unit triangular. */ /* Compute GROW = 1/G(j) and XBND = 1/M(j). */ /* Initially, G(0) = max{x(i), i=1,...,n}. */ grow = 0.5 / max( xbnd, smlnum ); xbnd = grow; for( j = jfirst; (jinc < 0 ? j >= jlast : j < jlast); j += jinc ) { /* Exit the loop if the growth factor is too small. */ if ( grow <= smlnum ) { goto L60; } if ( nounit ) { tjjs = *A(j,j) - lambda; } else { tjjs = c_one - lambda; } tjj = MAGMA_C_ABS1( tjjs ); if ( tjj >= smlnum ) { /* M(j) = G(j-1) / abs(A(j,j)) */ xbnd = min( xbnd, min(1.,tjj)*grow ); } else { /* M(j) could overflow, set XBND to 0. */ xbnd = 0.; } if ( tjj + cnorm[j] >= smlnum ) { /* G(j) = G(j-1)*( 1 + CNORM(j) / abs(A(j,j)) ) */ grow *= (tjj / (tjj + cnorm[j])); } else { /* G(j) could overflow, set GROW to 0. */ grow = 0.; } } grow = xbnd; L60: ; } else { /* ---------------------------------------- */ /* Compute the growth in A**T * x = b or A**H * x = b. */ if ( upper ) { jfirst = 0; jlast = n; jinc = 1; } else { jfirst = n-1; jlast = 0; jinc = -1; } if ( tscal != 1. ) { grow = 0.; goto L90; } /* A is non-unit triangular. */ /* Compute GROW = 1/G(j) and XBND = 1/M(j). */ /* Initially, M(0) = max{x(i), i=1,...,n}. */ grow = 0.5 / max( xbnd, smlnum ); xbnd = grow; for( j = jfirst; (jinc < 0 ? j >= jlast : j < jlast); j += jinc ) { /* Exit the loop if the growth factor is too small. */ if ( grow <= smlnum ) { goto L90; } /* G(j) = max( G(j-1), M(j-1)*( 1 + CNORM(j) ) ) */ xj = 1. + cnorm[j]; grow = min( grow, xbnd / xj ); if ( nounit ) { tjjs = *A(j,j) - lambda; } else { tjjs = c_one - lambda; } tjj = MAGMA_C_ABS1( tjjs ); if ( tjj >= smlnum ) { /* M(j) = M(j-1)*( 1 + CNORM(j) ) / abs(A(j,j)) */ if ( xj > tjj ) { xbnd *= (tjj / xj); } } else { /* M(j) could overflow, set XBND to 0. */ xbnd = 0.; } } grow = min( grow, xbnd ); L90: ; } /* ================================================================= */ /* Due to modified diagonal, we can't use regular BLAS ctrsv. */ /* Use a Level 1 BLAS solve, scaling intermediate results. */ if ( xmax > bignum * 0.5 ) { /* Scale X so that its components are less than or equal to */ /* BIGNUM in absolute value. */ *scale = (bignum * 0.5) / xmax; blasf77_csscal( &n, scale, &x[0], &ione ); xmax = bignum; } else { xmax *= 2.; } if ( notran ) { /* ---------------------------------------- */ /* Solve A * x = b */ for( j = jfirst; (jinc < 0 ? j >= jlast : j < jlast); j += jinc ) { /* Compute x(j) = b(j) / A(j,j), scaling x if necessary. */ xj = MAGMA_C_ABS1( x[j] ); if ( nounit ) { tjjs = (*A(j,j) - lambda ) * tscal; } else { tjjs = (c_one - lambda) * tscal; if ( tscal == 1. ) { goto L110; } } tjj = MAGMA_C_ABS1( tjjs ); if ( tjj > smlnum ) { /* abs(A(j,j)) > SMLNUM: */ if ( tjj < 1. ) { if ( xj > tjj * bignum ) { /* Scale x by 1/b(j). */ rec = 1. / xj; blasf77_csscal( &n, &rec, &x[0], &ione ); *scale *= rec; xmax *= rec; } } x[j] = x[j] / tjjs; xj = MAGMA_C_ABS1( x[j] ); } else if ( tjj > 0. ) { /* 0 < abs(A(j,j)) <= SMLNUM: */ if ( xj > tjj * bignum ) { /* Scale x by (1/abs(x(j)))*abs(A(j,j))*BIGNUM */ /* to avoid overflow when dividing by A(j,j). */ rec = (tjj * bignum) / xj; if ( cnorm[j] > 1. ) { /* Scale by 1/CNORM(j) to avoid overflow when */ /* multiplying x(j) times column j. */ rec /= cnorm[j]; } blasf77_csscal( &n, &rec, &x[0], &ione ); *scale *= rec; xmax *= rec; } x[j] = x[j] / tjjs; xj = MAGMA_C_ABS1( x[j] ); } else { /* A(j,j) = 0: Set x(1:n) = 0, x(j) = 1, and */ /* scale = 0, and compute a solution to A*x = 0. */ for( i = 0; i < n; ++i ) { x[i] = c_zero; } x[j] = c_one; xj = 1.; *scale = 0.; xmax = 0.; } L110: /* Scale x if necessary to avoid overflow when adding a */ /* multiple of column j of A. */ if ( xj > 1. ) { rec = 1. / xj; if ( cnorm[j] > (bignum - xmax) * rec ) { /* Scale x by 1/(2*abs(x(j))). */ rec *= 0.5; blasf77_csscal( &n, &rec, &x[0], &ione ); *scale *= rec; } } else if ( xj * cnorm[j] > bignum - xmax ) { /* Scale x by 1/2. */ blasf77_csscal( &n, &d_half, &x[0], &ione ); *scale *= 0.5; } if ( upper ) { if ( j > 0 ) { /* Compute the update */ /* x(1:j-1) := x(1:j-1) - x(j) * A(1:j-1,j) */ len = j; ztmp = -tscal * x[j]; blasf77_caxpy( &len, &ztmp, A(0,j), &ione, &x[0], &ione ); i = blasf77_icamax( &len, &x[0], &ione ) - 1; xmax = MAGMA_C_ABS1( x[i] ); } } else { if ( j < n-1 ) { /* Compute the update */ /* x(j+1:n) := x(j+1:n) - x(j) * A(j+1:n,j) */ len = n - (j+1); ztmp = -tscal * x[j]; blasf77_caxpy( &len, &ztmp, A(j+1,j), &ione, &x[j + 1], &ione ); i = j + blasf77_icamax( &len, &x[j + 1], &ione ); xmax = MAGMA_C_ABS1( x[i] ); } } } } else if ( trans == MagmaTrans ) { /* ---------------------------------------- */ /* Solve A**T * x = b */ for( j = jfirst; (jinc < 0 ? j >= jlast : j < jlast); j += jinc ) { /* Compute x(j) = b(j) - sum A(k,j)*x(k). */ /* k<>j */ xj = MAGMA_C_ABS1( x[j] ); uscal = MAGMA_C_MAKE( tscal, 0. ); rec = 1. / max( xmax, 1. ); if ( cnorm[j] > (bignum - xj) * rec ) { /* If x(j) could overflow, scale x by 1/(2*XMAX). */ rec *= 0.5; if ( nounit ) { tjjs = (*A(j,j) - lambda) * tscal; } else { tjjs = (c_one - lambda) * tscal; } tjj = MAGMA_C_ABS1( tjjs ); if ( tjj > 1. ) { /* Divide by A(j,j) when scaling x if A(j,j) > 1. */ rec = min( 1., rec * tjj ); uscal = uscal / tjjs; } if ( rec < 1. ) { blasf77_csscal( &n, &rec, &x[0], &ione ); *scale *= rec; xmax *= rec; } } csumj = c_zero; if ( uscal == c_one ) { /* If the scaling needed for A in the dot product is 1, */ /* call CDOTU to perform the dot product. */ if ( upper ) { csumj = magma_cblas_cdotu( j, A(0,j), ione, &x[0], ione ); } else if ( j < n-1 ) { csumj = magma_cblas_cdotu( n-(j+1), A(j+1,j), ione, &x[j+1], ione ); } } else { /* Otherwise, use in-line code for the dot product. */ if ( upper ) { for( i = 0; i < j; ++i ) { csumj += (*A(i,j) * uscal) * x[i]; } } else if ( j < n-1 ) { for( i = j+1; i < n; ++i ) { csumj += (*A(i,j) * uscal) * x[i]; } } } if ( uscal == MAGMA_C_MAKE( tscal, 0. )) { /* Compute x(j) := ( x(j) - CSUMJ ) / A(j,j) if 1/A(j,j) */ /* was not used to scale the dotproduct. */ x[j] -= csumj; xj = MAGMA_C_ABS1( x[j] ); if ( nounit ) { tjjs = (*A(j,j) - lambda) * tscal; } else { tjjs = (c_one - lambda) * tscal; if ( tscal == 1. ) { goto L160; } } /* Compute x(j) = x(j) / A(j,j), scaling if necessary. */ tjj = MAGMA_C_ABS1( tjjs ); if ( tjj > smlnum ) { /* abs(A(j,j)) > SMLNUM: */ if ( tjj < 1. ) { if ( xj > tjj * bignum ) { /* Scale X by 1/abs(x(j)). */ rec = 1. / xj; blasf77_csscal( &n, &rec, &x[0], &ione ); *scale *= rec; xmax *= rec; } } x[j] = x[j] / tjjs; } else if ( tjj > 0. ) { /* 0 < abs(A(j,j)) <= SMLNUM: */ if ( xj > tjj * bignum ) { /* Scale x by (1/abs(x(j)))*abs(A(j,j))*BIGNUM. */ rec = (tjj * bignum) / xj; blasf77_csscal( &n, &rec, &x[0], &ione ); *scale *= rec; xmax *= rec; } x[j] = x[j] / tjjs; } else { /* A(j,j) = 0: Set x(1:n) = 0, x(j) = 1, and */ /* scale = 0 and compute a solution to A**T *x = 0. */ for( i = 0; i < n; ++i ) { x[i] = c_zero; } x[j] = c_one; *scale = 0.; xmax = 0.; } L160: ; } else { /* Compute x(j) := x(j) / A(j,j) - CSUMJ if the dot */ /* product has already been divided by 1/A(j,j). */ x[j] = (x[j] / tjjs) - csumj; } xmax = max( xmax, MAGMA_C_ABS1( x[j] )); } } else { /* ---------------------------------------- */ /* Solve A**H * x = b */ for( j = jfirst; (jinc < 0 ? j >= jlast : j < jlast); j += jinc ) { /* Compute x(j) = b(j) - sum A(k,j)*x(k). */ /* k<>j */ xj = MAGMA_C_ABS1( x[j] ); uscal = MAGMA_C_MAKE( tscal, 0. ); rec = 1. / max(xmax, 1.); if ( cnorm[j] > (bignum - xj) * rec ) { /* If x(j) could overflow, scale x by 1/(2*XMAX). */ rec *= 0.5; if ( nounit ) { tjjs = MAGMA_C_CONJ( *A(j,j) - lambda ) * tscal; } else { tjjs = (c_one - lambda) * tscal; } tjj = MAGMA_C_ABS1( tjjs ); if ( tjj > 1. ) { /* Divide by A(j,j) when scaling x if A(j,j) > 1. */ rec = min( 1., rec * tjj ); uscal = uscal / tjjs; } if ( rec < 1. ) { blasf77_csscal( &n, &rec, &x[0], &ione ); *scale *= rec; xmax *= rec; } } csumj = c_zero; if ( uscal == c_one ) { /* If the scaling needed for A in the dot product is 1, */ /* call CDOTC to perform the dot product. */ if ( upper ) { csumj = magma_cblas_cdotc( j, A(0,j), ione, &x[0], ione ); } else if ( j < n-1 ) { csumj = magma_cblas_cdotc( n-(j+1), A(j+1,j), ione, &x[j+1], ione ); } } else { /* Otherwise, use in-line code for the dot product. */ if ( upper ) { for( i = 0; i < j; ++i ) { csumj += (MAGMA_C_CONJ( *A(i,j) ) * uscal) * x[i]; } } else if ( j < n-1 ) { for( i = j + 1; i < n; ++i ) { csumj += (MAGMA_C_CONJ( *A(i,j) ) * uscal) * x[i]; } } } if ( uscal == tscal ) { /* Compute x(j) := ( x(j) - CSUMJ ) / A(j,j) if 1/A(j,j) */ /* was not used to scale the dotproduct. */ x[j] -= csumj; xj = MAGMA_C_ABS1( x[j] ); if ( nounit ) { tjjs = MAGMA_C_CONJ( *A(j,j) - lambda ) * tscal; } else { tjjs = (c_one - lambda) * tscal; if ( tscal == 1. ) { goto L210; } } /* Compute x(j) = x(j) / A(j,j), scaling if necessary. */ tjj = MAGMA_C_ABS1( tjjs ); if ( tjj > smlnum ) { /* abs(A(j,j)) > SMLNUM: */ if ( tjj < 1. ) { if ( xj > tjj * bignum ) { /* Scale X by 1/abs(x(j)). */ rec = 1. / xj; blasf77_csscal( &n, &rec, &x[0], &ione ); *scale *= rec; xmax *= rec; } } x[j] = x[j] / tjjs; } else if ( tjj > 0. ) { /* 0 < abs(A(j,j)) <= SMLNUM: */ if ( xj > tjj * bignum ) { /* Scale x by (1/abs(x(j)))*abs(A(j,j))*BIGNUM. */ rec = (tjj * bignum) / xj; blasf77_csscal( &n, &rec, &x[0], &ione ); *scale *= rec; xmax *= rec; } x[j] = x[j] / tjjs; } else { /* A(j,j) = 0: Set x(1:n) = 0, x(j) = 1, and */ /* scale = 0 and compute a solution to A**H *x = 0. */ for( i = 0; i < n; ++i ) { x[i] = c_zero; } x[j] = c_one; *scale = 0.; xmax = 0.; } L210: ; } else { /* Compute x(j) := x(j) / A(j,j) - CSUMJ if the dot */ /* product has already been divided by 1/A(j,j). */ x[j] = (x[j] / tjjs) - csumj; } xmax = max( xmax, MAGMA_C_ABS1( x[j] )); } } *scale /= tscal; /* Scale the column norms by 1/TSCAL for return. */ if ( tscal != 1. ) { float d = 1. / tscal; blasf77_sscal( &n, &d, &cnorm[0], &ione ); } return *info; } /* end clatrsd */
magma_int_t magma_ctrevc3_mt( magma_side_t side, magma_vec_t howmany, magma_int_t *select, // logical in Fortran magma_int_t n, magmaFloatComplex *T, magma_int_t ldt, magmaFloatComplex *VL, magma_int_t ldvl, magmaFloatComplex *VR, magma_int_t ldvr, magma_int_t mm, magma_int_t *mout, magmaFloatComplex *work, magma_int_t lwork, float *rwork, magma_int_t *info ) { #define T(i,j) ( T + (i) + (j)*ldt ) #define VL(i,j) (VL + (i) + (j)*ldvl) #define VR(i,j) (VR + (i) + (j)*ldvr) #define work(i,j) (work + (i) + (j)*n) // .. Parameters .. const magmaFloatComplex c_zero = MAGMA_C_ZERO; const magmaFloatComplex c_one = MAGMA_C_ONE; const magma_int_t nbmin = 16, nbmax = 128; const magma_int_t ione = 1; // .. Local Scalars .. magma_int_t allv, bothv, leftv, over, rightv, somev; magma_int_t i, ii, is, j, k, ki, iv, n2, nb, nb2, version; float ovfl, remax, smin, smlnum, ulp, unfl; // Decode and test the input parameters bothv = (side == MagmaBothSides); rightv = (side == MagmaRight) || bothv; leftv = (side == MagmaLeft ) || bothv; allv = (howmany == MagmaAllVec); over = (howmany == MagmaBacktransVec); somev = (howmany == MagmaSomeVec); // Set mout to the number of columns required to store the selected // eigenvectors. if ( somev ) { *mout = 0; for( j=0; j < n; ++j ) { if ( select[j] ) { *mout += 1; } } } else { *mout = n; } *info = 0; if ( ! rightv && ! leftv ) *info = -1; else if ( ! allv && ! over && ! somev ) *info = -2; else if ( n < 0 ) *info = -4; else if ( ldt < max( 1, n ) ) *info = -6; else if ( ldvl < 1 || ( leftv && ldvl < n ) ) *info = -8; else if ( ldvr < 1 || ( rightv && ldvr < n ) ) *info = -10; else if ( mm < *mout ) *info = -11; else if ( lwork < max( 1, 2*n ) ) *info = -14; if ( *info != 0 ) { magma_xerbla( __func__, -(*info) ); return *info; } // Quick return if possible. if ( n == 0 ) { return *info; } // Use blocked version (2) if sufficient workspace. // Requires 1 vector to save diagonal elements, and 2*nb vectors for x and Q*x. // (Compared to dtrevc3, rwork stores 1-norms.) // Zero-out the workspace to avoid potential NaN propagation. nb = 2; if ( lwork >= n + 2*n*nbmin ) { version = 2; nb = (lwork - n) / (2*n); nb = min( nb, nbmax ); nb2 = 1 + 2*nb; lapackf77_claset( "F", &n, &nb2, &c_zero, &c_zero, work, &n ); } else { version = 1; } // Set the constants to control overflow. unfl = lapackf77_slamch( "Safe minimum" ); ovfl = 1. / unfl; lapackf77_slabad( &unfl, &ovfl ); ulp = lapackf77_slamch( "Precision" ); smlnum = unfl*( n / ulp ); // Store the diagonal elements of T in working array work. for( i=0; i < n; ++i ) { *work(i,0) = *T(i,i); } // Compute 1-norm of each column of strictly upper triangular // part of T to control overflow in triangular solver. rwork[0] = 0.; for( j=1; j < n; ++j ) { rwork[j] = cblas_scasum( j, T(0,j), ione ); } // launch threads -- each single-threaded MKL magma_int_t nthread = magma_get_parallel_numthreads(); magma_int_t lapack_nthread = magma_get_lapack_numthreads(); magma_set_lapack_numthreads( 1 ); magma_queue queue; queue.launch( nthread ); //printf( "nthread %d, %d\n", nthread, lapack_nthread ); // NB = N/thread, rounded up to multiple of 16, // but avoid multiples of page size, e.g., 512*8 bytes = 4096. magma_int_t NB = magma_int_t( ceil( ceil( ((float)n) / nthread ) / 16. ) * 16. ); if ( NB % 512 == 0 ) { NB += 32; } magma_timer_t time_total=0, time_trsv=0, time_gemm=0, time_gemv=0, time_trsv_sum=0, time_gemm_sum=0, time_gemv_sum=0; timer_start( time_total ); if ( rightv ) { // ============================================================ // Compute right eigenvectors. // iv is index of column in current block. // Non-blocked version always uses iv=1; // blocked version starts with iv=nb, goes down to 1. // (Note the "0-th" column is used to store the original diagonal.) iv = 1; if ( version == 2 ) { iv = nb; } timer_start( time_trsv ); is = *mout - 1; for( ki=n-1; ki >= 0; --ki ) { if ( somev ) { if ( ! select[ki] ) { continue; } } smin = max( ulp*( MAGMA_C_ABS1( *T(ki,ki) ) ), smlnum ); // -------------------------------------------------------- // Complex right eigenvector *work(ki,iv) = c_one; // Form right-hand side. for( k=0; k < ki; ++k ) { *work(k,iv) = -(*T(k,ki)); } // Solve upper triangular system: // [ T(1:ki-1,1:ki-1) - T(ki,ki) ]*X = scale*work. if ( ki > 0 ) { queue.push_task( new magma_clatrsd_task( MagmaUpper, MagmaNoTrans, MagmaNonUnit, MagmaTrue, ki, T, ldt, *T(ki,ki), work(0,iv), work(ki,iv), rwork )); } // Copy the vector x or Q*x to VR and normalize. if ( ! over ) { // ------------------------------ // no back-transform: copy x to VR and normalize queue.sync(); n2 = ki+1; blasf77_ccopy( &n2, work(0,iv), &ione, VR(0,is), &ione ); ii = blasf77_icamax( &n2, VR(0,is), &ione ) - 1; remax = 1. / MAGMA_C_ABS1( *VR(ii,is) ); blasf77_csscal( &n2, &remax, VR(0,is), &ione ); for( k=ki+1; k < n; ++k ) { *VR(k,is) = c_zero; } } else if ( version == 1 ) { // ------------------------------ // version 1: back-transform each vector with GEMV, Q*x. queue.sync(); time_trsv_sum += timer_stop( time_trsv ); timer_start( time_gemv ); if ( ki > 0 ) { blasf77_cgemv( "n", &n, &ki, &c_one, VR, &ldvr, work(0, iv), &ione, work(ki,iv), VR(0,ki), &ione ); } time_gemv_sum += timer_stop( time_gemv ); ii = blasf77_icamax( &n, VR(0,ki), &ione ) - 1; remax = 1. / MAGMA_C_ABS1( *VR(ii,ki) ); blasf77_csscal( &n, &remax, VR(0,ki), &ione ); timer_start( time_trsv ); } else if ( version == 2 ) { // ------------------------------ // version 2: back-transform block of vectors with GEMM // zero out below vector for( k=ki+1; k < n; ++k ) { *work(k,iv) = c_zero; } // Columns iv:nb of work are valid vectors. // When the number of vectors stored reaches nb, // or if this was last vector, do the GEMM if ( (iv == 1) || (ki == 0) ) { queue.sync(); time_trsv_sum += timer_stop( time_trsv ); timer_start( time_gemm ); nb2 = nb-iv+1; n2 = ki+nb-iv+1; // split gemm into multiple tasks, each doing one block row for( i=0; i < n; i += NB ) { magma_int_t ib = min( NB, n-i ); queue.push_task( new cgemm_task( MagmaNoTrans, MagmaNoTrans, ib, nb2, n2, c_one, VR(i,0), ldvr, work(0,iv ), n, c_zero, work(i,nb+iv), n )); } queue.sync(); time_gemm_sum += timer_stop( time_gemm ); // normalize vectors // TODO if somev, should copy vectors individually to correct location. for( k = iv; k <= nb; ++k ) { ii = blasf77_icamax( &n, work(0,nb+k), &ione ) - 1; remax = 1. / MAGMA_C_ABS1( *work(ii,nb+k) ); blasf77_csscal( &n, &remax, work(0,nb+k), &ione ); } lapackf77_clacpy( "F", &n, &nb2, work(0,nb+iv), &n, VR(0,ki), &ldvr ); iv = nb; timer_start( time_trsv ); } else { iv -= 1; } } // blocked back-transform is -= 1; } } timer_stop( time_trsv ); timer_stop( time_total ); timer_printf( "trevc trsv %.4f, gemm %.4f, gemv %.4f, total %.4f\n", time_trsv_sum, time_gemm_sum, time_gemv_sum, time_total ); if ( leftv ) { // ============================================================ // Compute left eigenvectors. // iv is index of column in current block. // Non-blocked version always uses iv=1; // blocked version starts with iv=1, goes up to nb. // (Note the "0-th" column is used to store the original diagonal.) iv = 1; is = 0; for( ki=0; ki < n; ++ki ) { if ( somev ) { if ( ! select[ki] ) { continue; } } smin = max( ulp*MAGMA_C_ABS1( *T(ki,ki) ), smlnum ); // -------------------------------------------------------- // Complex left eigenvector *work(ki,iv) = c_one; // Form right-hand side. for( k = ki + 1; k < n; ++k ) { *work(k,iv) = -MAGMA_C_CNJG( *T(ki,k) ); } // Solve conjugate-transposed triangular system: // [ T(ki+1:n,ki+1:n) - T(ki,ki) ]**H * X = scale*work. // TODO what happens with T(k,k) - lambda is small? Used to have < smin test. if ( ki < n-1 ) { n2 = n-ki-1; queue.push_task( new magma_clatrsd_task( MagmaUpper, MagmaConjTrans, MagmaNonUnit, MagmaTrue, n2, T(ki+1,ki+1), ldt, *T(ki,ki), work(ki+1,iv), work(ki,iv), rwork )); } // Copy the vector x or Q*x to VL and normalize. if ( ! over ) { // ------------------------------ // no back-transform: copy x to VL and normalize queue.sync(); n2 = n-ki; blasf77_ccopy( &n2, work(ki,iv), &ione, VL(ki,is), &ione ); ii = blasf77_icamax( &n2, VL(ki,is), &ione ) + ki - 1; remax = 1. / MAGMA_C_ABS1( *VL(ii,is) ); blasf77_csscal( &n2, &remax, VL(ki,is), &ione ); for( k=0; k < ki; ++k ) { *VL(k,is) = c_zero; } } else if ( version == 1 ) { // ------------------------------ // version 1: back-transform each vector with GEMV, Q*x. queue.sync(); if ( ki < n-1 ) { n2 = n-ki-1; blasf77_cgemv( "n", &n, &n2, &c_one, VL(0,ki+1), &ldvl, work(ki+1,iv), &ione, work(ki, iv), VL(0,ki), &ione ); } ii = blasf77_icamax( &n, VL(0,ki), &ione ) - 1; remax = 1. / MAGMA_C_ABS1( *VL(ii,ki) ); blasf77_csscal( &n, &remax, VL(0,ki), &ione ); } else if ( version == 2 ) { // ------------------------------ // version 2: back-transform block of vectors with GEMM // zero out above vector // could go from (ki+1)-NV+1 to ki for( k=0; k < ki; ++k ) { *work(k,iv) = c_zero; } // Columns 1:iv of work are valid vectors. // When the number of vectors stored reaches nb, // or if this was last vector, do the GEMM if ( (iv == nb) || (ki == n-1) ) { queue.sync(); n2 = n-(ki+1)+iv; // split gemm into multiple tasks, each doing one block row for( i=0; i < n; i += NB ) { magma_int_t ib = min( NB, n-i ); queue.push_task( new cgemm_task( MagmaNoTrans, MagmaNoTrans, ib, iv, n2, c_one, VL(i,ki-iv+1), ldvl, work(ki-iv+1,1), n, c_zero, work(i,nb+1), n )); } queue.sync(); // normalize vectors for( k=1; k <= iv; ++k ) { ii = blasf77_icamax( &n, work(0,nb+k), &ione ) - 1; remax = 1. / MAGMA_C_ABS1( *work(ii,nb+k) ); blasf77_csscal( &n, &remax, work(0,nb+k), &ione ); } lapackf77_clacpy( "F", &n, &iv, work(0,nb+1), &n, VL(0,ki-iv+1), &ldvl ); iv = 1; } else { iv += 1; } } // blocked back-transform is += 1; } } // close down threads queue.quit(); magma_set_lapack_numthreads( lapack_nthread ); return *info; } // End of CTREVC
/** Purpose ======= CLAHEF computes a partial factorization of a complex Hermitian matrix A using the Bunch-Kaufman diagonal pivoting method. The partial factorization has the form: A = ( I U12 ) ( A11 0 ) ( I 0 ) if UPLO = 'U', or: ( 0 U22 ) ( 0 D ) ( U12' U22' ) A = ( L11 0 ) ( D 0 ) ( L11' L21' ) if UPLO = 'L' ( L21 I ) ( 0 A22 ) ( 0 I ) where the order of D is at most NB. The actual order is returned in the argument KB, and is either NB or NB-1, or N if N <= NB. Note that U' denotes the conjugate transpose of U. CLAHEF is an auxiliary routine called by CHETRF. It uses blocked code (calling Level 3 BLAS) to update the submatrix A11 (if UPLO = 'U') or A22 (if UPLO = 'L'). Arguments --------- @param[in] UPLO CHARACTER Specifies whether the upper or lower triangular part of the Hermitian matrix A is stored: - = 'U': Upper triangular - = 'L': Lower triangular @param[in] N INTEGER The order of the matrix A. N >= 0. @param[in] NB INTEGER The maximum number of columns of the matrix A that should be factored. NB should be at least 2 to allow for 2-by-2 pivot blocks. @param[out] KB INTEGER The number of columns of A that were actually factored. KB is either NB-1 or NB, or N if N <= NB. @param[in,out] A COMPLEX array, dimension (LDA,N) On entry, the Hermitian matrix A. If UPLO = 'U', the leading n-by-n upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading n-by-n lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, A contains details of the partial factorization. @param[in] LDA INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] ipiv INTEGER array, dimension (N) Details of the interchanges and the block structure of D. If UPLO = 'U', only the last KB elements of ipiv are set; if UPLO = 'L', only the first KB elements are set. \n If ipiv(k) > 0, then rows and columns k and ipiv(k) were interchanged and D(k,k) is a 1-by-1 diagonal block. If UPLO = 'U' and ipiv(k) = ipiv(k-1) < 0, then rows and columns k-1 and -ipiv(k) were interchanged and D(k-1:k,k-1:k) is a 2-by-2 diagonal block. If UPLO = 'L' and ipiv(k) = ipiv(k+1) < 0, then rows and columns k+1 and -ipiv(k) were interchanged and D(k:k+1,k:k+1) is a 2-by-2 diagonal block. @param[out] W (workspace) COMPLEX array, dimension (LDW,NB) @param[in] LDW INTEGER The leading dimension of the array W. LDW >= max(1,N). @param[out] INFO INTEGER - = 0: successful exit - > 0: if INFO = k, D(k,k) is exactly zero. The factorization has been completed, but the block diagonal matrix D is exactly singular. @ingroup magma_chetrf_comp ********************************************************************/ extern "C" magma_int_t magma_clahef_gpu( magma_uplo_t uplo, magma_int_t n, magma_int_t nb, magma_int_t *kb, magmaFloatComplex *hA, magma_int_t lda, magmaFloatComplex_ptr dA, size_t dA_offset, magma_int_t ldda, magma_int_t *ipiv, magmaFloatComplex_ptr dW, size_t dW_offset, magma_int_t lddw, magma_queue_t queue, magma_int_t *info) { /* .. Parameters .. */ float d_one = 1.0; float d_zero = 0.0; float d_eight = 8.0; float d_seven = 7.0; #if defined(PRECISION_c) float f_zero = 0.0; #endif magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_mone = -MAGMA_C_ONE; magma_int_t upper = (uplo == MagmaUpper); magma_int_t ione = 1; /* .. Local Scalars .. */ magma_int_t imax = 0, jmax = 0, kk, kkW, kp, kstep, iinfo; float abs_akk, alpha, colmax, R1, rowmax; magmaFloatComplex Zimax, Z; #define dA(i, j) dA, dA_offset + (j)*ldda + (i) #define dW(i, j) dW, dW_offset + (j)*lddw + (i) #define A(i, j) (hA + (j)*lda + (i)) /* .. Executable Statements .. */ *info = 0; /* Initialize alpha for use in choosing pivot block size. */ alpha = ( d_one+sqrt( d_seven ) ) / d_eight; magma_event_t event = NULL; if( upper ) { /* Factorize the trailing columns of A using the upper triangle of A and working backwards, and compute the matrix W = U12*D for use in updating A11 (note that conjg(W) is actually stored) K is the main loop index, decreasing from N in steps of 1 or 2 KW is the column of W which corresponds to column K of A */ int k, kw = 0; for (k = n-1; k+1 > max(n-nb+1, nb); k -= kstep) { kw = nb - (n-k); /* Copy column K of A to column KW of W and update it */ magma_ccopy( k+1, dA( 0, k ), 1, dW( 0, kw ), 1, queue ); // set imaginary part of diagonal to be zero #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dW, 2*(k+ kw*lddw+dW_offset)+1, 1, queue, &event); magma_queue_sync( queue ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dW, 2*(k+ kw*lddw+dW_offset)+1, 1, queue, &event); magma_queue_sync( queue ); #endif if (k+1 < n) { magma_cgemv( MagmaNoTrans, k+1, n-(k+1), c_mone, dA( 0, k+1 ), ldda, dW( k, kw+1 ), lddw, c_one, dW( 0, kw ), ione, queue ); // set imaginary part of diagonal to be zero #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dW, 2*(k+ kw*lddw+dW_offset)+1, 1, queue, &event ); magma_queue_sync( queue ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dW, 2*(k+ kw*lddw+dW_offset)+1, 1, queue, &event ); magma_queue_sync( queue ); #endif } kstep = 1; /* Determine rows and columns to be interchanged and whether a 1-by-1 or 2-by-2 pivot block will be used */ magma_cgetvector_async( 1, dW( k, kw ), 1, &Z, 1, queue, &event ); magma_queue_sync( queue ); abs_akk = fabs( MAGMA_C_REAL( Z ) ); /* imax is the row-index of the largest off-diagonal element in column K, and colmax is its absolute value */ if( k > 0 ) { // magma is one-base imax = magma_icamax( k, dW( 0, kw ), 1, queue ) - 1; magma_cgetvector( 1, dW( imax, kw ), 1, &Z, 1, queue ); colmax = MAGMA_C_ABS1( Z ); } else { colmax = d_zero; } if( max( abs_akk, colmax ) == 0.0 ) { /* Column K is zero: set INFO and continue */ if ( *info == 0 ) *info = k; kp = k; #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dA, 2*(k+ k*ldda+dA_offset)+1, 1, queue, &event ); magma_queue_sync( queue ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dA, 2*(k+ k*ldda+dA_offset)+1, 1, queue, &event ); magma_queue_sync( queue ); #endif } else { if( abs_akk >= alpha*colmax ) { /* no interchange, use 1-by-1 pivot block */ kp = k; } else { /* Copy column imax to column KW-1 of W and update it */ magma_ccopy( imax+1, dA( 0, imax ), 1, dW( 0, kw-1 ), 1, queue ); #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dW, 2*(imax+ (kw-1)*lddw+dW_offset)+1, 1, queue, &event ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dW, 2*(imax+ (kw-1)*lddw+dW_offset)+1, 1, queue, &event ); #endif #if defined(PRECISION_z) || defined(PRECISION_c) magmablas_clacpy_cnjg( k-imax, dA(imax,imax+1), ldda, dW(imax+1,kw-1), 1, queue ); #else magma_ccopy( k-imax, dA(imax,imax+1), ldda, dW(imax+1,kw-1), 1, queue ); #endif if( k+1 < n ) { magma_cgemv( MagmaNoTrans, k+1, n-(k+1), c_mone, dA( 0, k+1 ), ldda, dW( imax, kw+1 ), lddw, c_one, dW( 0, kw-1 ), ione, queue ); #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dW, 2*(imax+ (kw-1)*lddw+dW_offset)+1, 1, queue, &event ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dW, 2*(imax+ (kw-1)*lddw+dW_offset)+1, 1, queue, &event ); #endif } magma_cgetvector_async( 1, dW( imax, kw-1 ), 1, &Zimax, 1, queue, &event ); magma_queue_sync( queue ); /* jmax is the column-index of the largest off-diagonal element in row imax, and rowmax is its absolute value */ jmax = imax + magma_icamax( k-imax, dW( imax+1, kw-1 ), 1, queue ); magma_cgetvector( 1, dW( jmax, kw-1 ), 1, &Z, 1, queue ); rowmax = MAGMA_C_ABS1( Z ); if ( imax > 0 ) { // magma is one-base jmax = magma_icamax( imax, dW( 0, kw-1 ), 1, queue ) - 1; magma_cgetvector( 1, dW( jmax, kw-1 ), 1, &Z, 1, queue ); rowmax = max( rowmax, MAGMA_C_ABS1( Z ) ); } if( abs_akk >= alpha*colmax*( colmax / rowmax ) ) { /* no interchange, use 1-by-1 pivot block */ kp = k; } else if ( fabs( MAGMA_C_REAL( Zimax ) ) >= alpha*rowmax ) { /* interchange rows and columns K and imax, use 1-by-1 pivot block */ kp = imax; /* copy column KW-1 of W to column KW */ magma_ccopy( k+1, dW( 0, kw-1 ), 1, dW( 0, kw ), 1, queue ); } else { /* interchange rows and columns K-1 and imax, use 2-by-2 pivot block */ kp = imax; kstep = 2; } } kk = k - kstep + 1; kkW = nb - (n - kk); /* Updated column kp is already stored in column kkW of W */ if( kp != kk ) { /* Interchange rows kk and kp in last kk columns of A and W */ // note: row-swap A(:,kk) magmablas_cswap( n-kk, dA( kk, kk ), ldda, dA( kp, kk ), ldda, queue ); magmablas_cswap( n-kk, dW( kk, kkW), lddw, dW( kp, kkW), lddw, queue ); /* Copy non-updated column kk to column kp */ #if defined(PRECISION_z) || defined(PRECISION_c) magmablas_clacpy_cnjg( kk-kp-1, dA( kp+1, kk ), 1, dA( kp, kp+1 ), ldda, queue ); #else magma_ccopy( kk-kp-1, dA( kp+1, kk ), 1, dA( kp, kp+1 ), ldda, queue ); #endif // now A(kp,kk) should be A(kk,kk), and copy to A(kp,kp) magma_ccopy( kp+1, dA( 0, kk ), 1, dA( 0, kp ), 1, queue ); #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dA, 2*(kp+ kp*ldda+dA_offset)+1, 1, queue, &event ); magma_queue_sync( queue ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dA, 2*(kp+ kp*ldda+dA_offset)+1, 1, queue, &event ); #endif } if( kstep == 1 ) { /* 1-by-1 pivot block D(k): column KW of W now holds W(k) = U(k)*D(k) where U(k) is the k-th column of U Store U(k) in column k of A */ magma_ccopy( k+1, dW( 0, kw ), 1, dA( 0, k ), 1, queue ); if ( k > 0 ) { magma_cgetvector_async( 1, dA( k, k ), 1, &Z, 1, queue, &event ); magma_queue_sync( queue ); R1 = d_one / MAGMA_C_REAL( Z ); magma_csscal( k, R1, dA( 0, k ), 1, queue ); /* Conjugate W(k) */ #if defined(PRECISION_z) || defined(PRECISION_c) magmablas_clacpy_cnjg( k, dW( 0, kw ), 1, dW( 0, kw ), 1, queue ); #endif } } else { /* 2-by-2 pivot block D(k): columns KW and KW-1 of W now hold ( W(k-1) W(k) ) = ( U(k-1) U(k) )*D(k) where U(k) and U(k-1) are the k-th and (k-1)-th columns of U */ if( k > 1 ) { /* Store U(k) and U(k-1) in columns k and k-1 of A */ magmablas_clascl_2x2( MagmaUpper, k-1, dW(0, kw-1), lddw, dA(0,k-1), ldda, &iinfo, queue ); } /* Copy D(k) to A */ magma_ccopymatrix( 2, 2, dW( k-1, kw-1 ), lddw, dA( k-1, k-1 ), ldda, queue ); /* Conjugate W(k) and W(k-1) */ #if defined(PRECISION_z) || defined(PRECISION_c) magmablas_clacpy_cnjg( k, dW( 0, kw ), 1, dW( 0, kw ), 1, queue ); magmablas_clacpy_cnjg( k-1, dW( 0, kw-1 ), 1, dW( 0, kw-1 ), 1, queue ); #endif } } /* Store details of the interchanges in ipiv */ if( kstep == 1 ) { ipiv[ k ] = 1+kp; } else { ipiv[ k ] = -(1+kp); ipiv[ k-1 ] = -(1+kp); } } /* Update the upper triangle of A11 (= A(1:k,1:k)) as A11 := A11 - U12*D*U12' = A11 - U12*W' computing blocks of NB columns at a time (note that conjg(W) is actually stored) */ kw = nb - (n-k); for (int j = ( k / nb )*nb; j >= 0; j -= nb ) { int jb = min( nb, k-j+1 ); #ifdef SYMMETRIC_UPDATE /* Update the upper triangle of the diagonal block */ for (int jj = j; jj < j + jb; jj++) { #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dA, 2*(jj+ jj*ldda+dA_offset)+1, 1, queue, &event ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dA, 2*(jj+ jj*ldda+dA_offset)+1, 1, queue, &event ); #endif magma_cgemv( MagmaNoTrans, jj-j+1, n-(k+1), c_mone, dA( j, k+1 ), ldda, dW( jj, kw+1 ), lddw, c_one, dA( j, jj ), 1, queue ); #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dA, 2*(jj+ jj*ldda+dA_offset)+1, 1, queue, &event ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dA, 2*(jj+ jj*ldda+dA_offset)+1, 1, queue, &event ); #endif } /* Update the rectangular superdiagonal block */ magma_cgemm( MagmaNoTrans, MagmaTrans, j, jb, n-(k+1), c_mone, dA( 0, k+1 ), ldda, dW( j, kw+1 ), lddw, c_one, dA( 0, j ), ldda, queue ); #else #if defined(PRECISION_z) magmablas_dlaset(MagmaUpperLower, 1, jb, 0, 0, dA, 2*(j+ j*ldda+dA_offset)+1, 2*(1+ldda), queue ); #elif defined(PRECISION_c) magmablas_slaset(MagmaUpperLower, 1, jb, 0, 0, dA, 2*(j+ j*ldda+dA_offset)+1, 2*(1+ldda), queue ); #endif magma_cgemm( MagmaNoTrans, MagmaTrans, j+jb, jb, n-(k+1), c_mone, dA( 0, k+1 ), ldda, dW( j, kw+1 ), lddw, c_one, dA( 0, j ), ldda, queue ); #if defined(PRECISION_z) magmablas_dlaset(MagmaUpperLower, 1, jb, 0, 0, dA, 2*(j+ j*ldda+dA_offset)+1, 2*(1+ldda), queue ); #elif defined(PRECISION_c) magmablas_slaset(MagmaUpperLower, 1, jb, 0, 0, dA, 2*(j+ j*ldda+dA_offset)+1, 2*(1+ldda), queue ); #endif #endif } /* Put U12 in standard form by partially undoing the interchanges in columns k+1:n */ for (int j = k+1; j < n;) { int jj = j; int jp = ipiv[ j ]; if( jp < 0 ) { jp = -jp; j = j + 1; } j = j + 1; jp = jp - 1; if( jp != jj && j < n ) magmablas_cswap( n-j, dA( jp, j ), ldda, dA( jj, j ), ldda, queue ); } // copying the panel back to CPU magma_cgetmatrix_async( n, n-(k+1), dA(0,k+1), ldda, A(0,k+1), lda, queue, &event ); magma_queue_sync( queue ); /* Set KB to the number of columns factorized */ *kb = n - (k+1); } else { /* Factorize the leading columns of A using the lower triangle of A and working forwards, and compute the matrix W = L21*D for use in updating A22 (note that conjg(W) is actually stored) K is the main loop index, increasing from 1 in steps of 1 or 2 */ int k; for (k = 0; k < min(nb-1,n); k += kstep) { /* Copy column K of A to column K of W and update it */ /* -------------------------------------------------------------- */ magma_ccopy( n-k, dA( k, k ), 1, dW( k, k ), 1, queue ); // set imaginary part of diagonal to be zero #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dW, 2*(k*lddw+k+dW_offset)+1, 1, queue, &event); magma_queue_sync( queue ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dW, 2*(k*lddw+k+dW_offset)+1, 1, queue, &event); magma_queue_sync( queue ); #endif /* -------------------------------------------------------------- */ magma_cgemv( MagmaNoTrans, n-k, k, c_mone, dA( k, 0 ), ldda, dW( k, 0 ), lddw, c_one, dW( k, k ), ione, queue ); // re-set imaginary part of diagonal to be zero #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dW, 2*(k*lddw+k+dW_offset)+1, 1, queue, &event ); magma_queue_sync( queue ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dW, 2*(k*lddw+k+dW_offset)+1, 1, queue, &event ); magma_queue_sync( queue ); #endif kstep = 1; /* Determine rows and columns to be interchanged and whether a 1-by-1 or 2-by-2 pivot block will be used */ magma_cgetvector_async( 1, dW( k, k ), 1, &Z, 1, queue, &event ); magma_queue_sync( queue ); abs_akk = fabs( MAGMA_C_REAL( Z ) ); /* imax is the row-index of the largest off-diagonal element in column K, and colmax is its absolute value */ if( k < n-1 ) { // magmablas is one-base imax = k + magma_icamax( n-k-1, dW(k+1,k), 1, queue ); magma_cgetvector( 1, dW( imax,k ), 1, &Z, 1, queue ); colmax = MAGMA_C_ABS1( Z ); } else { colmax = d_zero; } if ( max( abs_akk, colmax ) == 0.0 ) { /* Column K is zero: set INFO and continue */ if( *info == 0 ) *info = k; kp = k; // make sure the imaginary part of diagonal is zero #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dA, 2*(k*ldda+k+dA_offset)+1, 1, queue, &event ); magma_queue_sync( queue ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dA, 2*(k*ldda+k+dA_offset)+1, 1, queue, &event ); magma_queue_sync( queue ); #endif } else { if ( abs_akk >= alpha*colmax ) { /* no interchange, use 1-by-1 pivot block */ kp = k; } else { /* Copy column imax to column K+1 of W and update it */ #if defined(PRECISION_z) || defined(PRECISION_c) magmablas_clacpy_cnjg( imax-k, dA(imax,k), ldda, dW(k,k+1), 1, queue ); #else magma_ccopy( imax-k, dA( imax, k ), ldda, dW( k, k+1 ), 1, queue ); #endif magma_ccopy( n-imax, dA( imax, imax ), 1, dW( imax, k+1 ), 1, queue ); #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dW, 2*((k+1)*lddw+imax+dW_offset)+1, 1, queue, &event); magma_queue_sync( queue ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dW, 2*((k+1)*lddw+imax+dW_offset)+1, 1, queue, &event); magma_queue_sync( queue ); #endif magma_cgemv( MagmaNoTrans, n-k, k, c_mone, dA( k, 0 ), ldda, dW( imax, 0 ), lddw, c_one, dW( k, k+1 ), ione, queue ); #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dW, 2*((k+1)*lddw+imax+dW_offset)+1, 1, queue, &event); magma_queue_sync( queue ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dW, 2*((k+1)*lddw+imax+dW_offset)+1, 1, queue, &event); magma_queue_sync( queue ); #endif magma_cgetvector_async( 1, dW(imax,k+1), 1, &Zimax, 1, queue, &event); magma_queue_sync( queue ); /* jmax is the column-index of the largest off-diagonal element in row imax, and rowmax is its absolute value */ // magmablas is one-base jmax = k-1 + magma_icamax( imax-k, dW(k, k+1), 1, queue ); magma_cgetvector( 1, dW(jmax,k+1), 1, &Z, 1, queue ); rowmax = MAGMA_C_ABS1( Z ); if( imax < n-1 ) { // magmablas is one-base jmax = imax + magma_icamax( (n-1)-imax, dW(imax+1,k+1), 1, queue); magma_cgetvector( 1, dW(jmax,k+1), 1, &Z, 1, queue ); rowmax = max( rowmax, MAGMA_C_ABS1( Z ) ); } if( abs_akk >= alpha*colmax*( colmax / rowmax ) ) { /* no interchange, use 1-by-1 pivot block */ kp = k; } else if( fabs( MAGMA_C_REAL( Zimax ) ) >= alpha*rowmax ) { /* interchange rows and columns K and imax, use 1-by-1 pivot block */ kp = imax; /* copy column K+1 of W to column K */ magma_ccopy( n-k, dW( k, k+1 ), 1, dW( k, k ), 1, queue ); } else { /* interchange rows and columns K+1 and imax, use 2-by-2 pivot block */ kp = imax; kstep = 2; } } kk = k + kstep - 1; /* Updated column kp is already stored in column kk of W */ if( kp != kk ) { /* Copy non-updated column kk to column kp */ /* ------------------------------------------------------------------ */ #if defined(PRECISION_z) || defined(PRECISION_c) magmablas_clacpy_cnjg( kp-kk, dA( kk, kk ), 1, dA( kp, kk ), ldda, queue ); #else magma_ccopy( kp-kk, dA( kk, kk ), 1, dA( kp, kk ), ldda, queue ); #endif if ( kp < n ) { magma_ccopy( n-kp, dA( kp, kk), 1, dA( kp, kp ), 1, queue ); } /* ------------------------------------------------------------------ */ /* Interchange rows kk and kp in first kk columns of A and W */ magmablas_cswap( kk+1, dA( kk, 0 ), ldda, dA( kp, 0 ), ldda, queue ); magmablas_cswap( kk+1, dW( kk, 0 ), lddw, dW( kp, 0 ), lddw, queue ); } if ( kstep == 1 ) { /* 1-by-1 pivot block D(k): column k of W now holds W(k) = L(k)*D(k) where L(k) is the k-th column of L Store L(k) in column k of A */ magma_ccopy( n-k, dW( k, k ), 1, dA( k, k ), 1, queue ); if ( k < n-1 ) { magma_cgetvector_async( 1, dA(k,k), 1, &Z, 1, queue, &event ); magma_queue_sync( queue ); R1 = d_one / MAGMA_C_REAL( Z ); magma_csscal((n-1)-k, R1, dA( k+1,k ), 1, queue); /* Conjugate W(k) */ #if defined(PRECISION_z) || defined(PRECISION_c) magmablas_clacpy_cnjg( (n-1)-k, dW( k+1,k ), 1, dW( k+1,k ), 1, queue ); #endif } } else { /* 2-by-2 pivot block D(k): columns k and k+1 of W now hold ( W(k) W(k+1) ) = ( L(k) L(k+1) )*D(k) where L(k) and L(k+1) are the k-th and (k+1)-th columns of L */ magmablas_clascl_2x2( MagmaLower, n-(k+2), dW(k,k), lddw, dA(k+2,k), ldda, &iinfo, queue ); /* Copy D(k) to A */ magma_ccopymatrix( 2, 2, dW( k, k ), lddw, dA( k, k ), ldda, queue ); /* Conjugate W(k) and W(k+1) */ #if defined(PRECISION_z) || defined(PRECISION_c) magmablas_clacpy_cnjg( (n-1)-k, dW( k+1,k ), 1, dW( k+1,k ), 1, queue ); magmablas_clacpy_cnjg( (n-1)-k-1, dW( k+2,k+1), 1, dW( k+2,k+1 ), 1, queue ); #endif } } /* Store details of the interchanges in ipiv */ if ( kstep == 1 ) { ipiv[k] = kp+1; } else { ipiv[k] = -kp-1; ipiv[k+1] = -kp-1; } } /* Update the lower triangle of A22 (= A(k:n,k:n)) as A22 := A22 - L21*D*L21' = A22 - L21*W' computing blocks of NB columns at a time (note that conjg(W) is actually stored) */ for( int j = k; j < n; j += nb ) { int jb = min( nb, n-j ); /* Update the lower triangle of the diagonal block */ #ifdef SYMMETRIC_UPDATE for (int jj = j; jj < j + jb; jj++) { int jnb = j + jb - jj; /* -------------------------------------------------------- */ magma_cgemv( MagmaNoTrans, jnb, k, c_mone, dA( jj, 0 ), ldda, dW( jj, 0 ), lddw, c_one, dA( jj, jj ), ione, queue ); /* -------------------------------------------------------- */ } /* Update the rectangular subdiagonal block */ if( j+jb < n ) { int nk = n - (j+jb); /* -------------------------------------------- */ magma_cgemm( MagmaNoTrans, MagmaTrans, nk, jb, k, c_mone, dA( j+jb, 0 ), ldda, dW( j, 0 ), lddw, c_one, dA( j+jb, j ), ldda, queue ); /* ------------------------------------------- */ } #else #if defined(PRECISION_z) magmablas_dlaset(MagmaUpperLower, 1, jb, 0, 0, dA, 2*(j*ldda+j+dA_offset)+1, 2*(1+ldda), queue ); #elif defined(PRECISION_c) magmablas_slaset(MagmaUpperLower, 1, jb, 0, 0, dA, 2*(j*ldda+j+dA_offset)+1, 2*(1+ldda), queue ); #endif magma_cgemm( MagmaNoTrans, MagmaTrans, n-j, jb, k, c_mone, dA( j, 0 ), ldda, dW( j, 0 ), lddw, c_one, dA( j, j ), ldda, queue ); #if defined(PRECISION_z) magmablas_dlaset(MagmaUpperLower, 1, jb, 0, 0, dA, 2*(j*ldda+j+dA_offset)+1, 2*(1+ldda), queue ); #elif defined(PRECISION_c) magmablas_slaset(MagmaUpperLower, 1, jb, 0, 0, dA, 2*(j*ldda+j+dA_offset)+1, 2*(1+ldda), queue ); #endif #endif } /* Put L21 in standard form by partially undoing the interchanges in columns 1:k-1 */ for (int j = k; j > 0;) { int jj = j; int jp = ipiv[j-1]; if( jp < 0 ) { jp = -jp; j--; } j--; if ( jp != jj && j >= 1 ) { magmablas_cswap( j, dA( jp-1,0 ), ldda, dA( jj-1,0 ), ldda, queue ); } } // copying the panel back to CPU magma_cgetmatrix_async( n, k, dA(0,0), ldda, A(0,0), lda, queue, &event ); magma_queue_sync( queue ); /* Set KB to the number of columns factorized */ *kb = k; } return *info; /* End of CLAHEF */ }