int CORE_zttlqt(int M, int N, int IB, PLASMA_Complex64_t *A1, int LDA1, PLASMA_Complex64_t *A2, int LDA2, PLASMA_Complex64_t *T, int LDT, PLASMA_Complex64_t *TAU, PLASMA_Complex64_t *WORK) { static PLASMA_Complex64_t zone = 1.0; static PLASMA_Complex64_t zzero = 0.0; #ifdef COMPLEX static int ione = 1; #endif PLASMA_Complex64_t alpha; int i, j, l, ii, sb, mi, ni; /* Check input arguments */ if (M < 0) { coreblas_error(1, "Illegal value of M"); return -1; } if (N < 0) { coreblas_error(2, "Illegal value of N"); return -2; } if (IB < 0) { coreblas_error(3, "Illegal value of IB"); return -3; } if ((LDA2 < max(1,M)) && (M > 0)) { coreblas_error(7, "Illegal value of LDA2"); return -7; } /* Quick return */ if ((M == 0) || (N == 0) || (IB == 0)) return PLASMA_SUCCESS; /* TODO: Need to check why some cases require * this to not have uninitialized values */ CORE_zlaset( PlasmaUpperLower, IB, N, 0., 0., T, LDT); for(ii = 0; ii < M; ii += IB) { sb = min(M-ii, IB); for(i = 0; i < sb; i++) { j = ii + i; mi = sb-i-1; ni = min( j + 1, N); /* * Generate elementary reflector H( II*IB+I ) to annihilate A( II*IB+I, II*IB+I:M ). */ #ifdef COMPLEX LAPACKE_zlacgv_work(ni, &A2[j], LDA2); LAPACKE_zlacgv_work(ione, &A1[LDA1*j+j], LDA1); #endif LAPACKE_zlarfg_work(ni+1, &A1[LDA1*j+j], &A2[j], LDA2, &TAU[j]); if (mi > 0) { /* * Apply H( j-1 ) to A( j:II+IB-1, j-1:M ) from the right. */ cblas_zcopy( mi, &A1[LDA1*j+(j+1)], 1, WORK, 1); cblas_zgemv( CblasColMajor, (CBLAS_TRANSPOSE)PlasmaNoTrans, mi, ni, CBLAS_SADDR(zone), &A2[j+1], LDA2, &A2[j], LDA2, CBLAS_SADDR(zone), WORK, 1); alpha = -(TAU[j]); cblas_zaxpy( mi, CBLAS_SADDR(alpha), WORK, 1, &A1[LDA1*j+j+1], 1); cblas_zgerc( CblasColMajor, mi, ni, CBLAS_SADDR(alpha), WORK, 1, &A2[j], LDA2, &A2[j+1], LDA2); } /* * Calculate T. */ if (i > 0 ) { l = min(i, max(0, N-ii)); alpha = -(TAU[j]); CORE_zpemv( PlasmaNoTrans, PlasmaRowwise, i , min(j, N), l, alpha, &A2[ii], LDA2, &A2[j], LDA2, zzero, &T[LDT*j], 1, WORK); /* T(0:i-1, j) = T(0:i-1, ii:j-1) * T(0:i-1, j) */ cblas_ztrmv( CblasColMajor, (CBLAS_UPLO)PlasmaUpper, (CBLAS_TRANSPOSE)PlasmaNoTrans, (CBLAS_DIAG)PlasmaNonUnit, i, &T[LDT*ii], LDT, &T[LDT*j], 1); } #ifdef COMPLEX LAPACKE_zlacgv_work(ni, &A2[j], LDA2 ); LAPACKE_zlacgv_work(ione, &A1[LDA1*j+j], LDA1 ); #endif T[LDT*j+i] = TAU[j]; } /* Apply Q to the rest of the matrix to the right */ if (M > ii+sb) { mi = M-(ii+sb); ni = min(ii+sb, N); l = min(sb, max(0, ni-ii)); CORE_zparfb( PlasmaRight, PlasmaNoTrans, PlasmaForward, PlasmaRowwise, mi, IB, mi, ni, sb, l, &A1[LDA1*ii+ii+sb], LDA1, &A2[ii+sb], LDA2, &A2[ii], LDA2, &T[LDT*ii], LDT, WORK, M); } } return PLASMA_SUCCESS; }
int CORE_zttqrt(int M, int N, int IB, PLASMA_Complex64_t *A1, int LDA1, PLASMA_Complex64_t *A2, int LDA2, PLASMA_Complex64_t *T, int LDT, PLASMA_Complex64_t *TAU, PLASMA_Complex64_t *WORK) { static PLASMA_Complex64_t zone = 1.0; static PLASMA_Complex64_t zzero = 0.0; static int ione = 1; PLASMA_Complex64_t alpha; int i, j, ii, sb, mi, ni; /* Check input arguments */ if (M < 0) { coreblas_error(1, "Illegal value of M"); return -1; } if (N < 0) { coreblas_error(2, "Illegal value of N"); return -2; } if (IB < 0) { coreblas_error(3, "Illegal value of IB"); return -3; } if ((LDA2 < max(1,M)) && (M > 0)) { coreblas_error(7, "Illegal value of LDA2"); return -7; } /* Quick return */ if ((M == 0) || (N == 0) || (IB == 0)) return PLASMA_SUCCESS; for(ii = 0; ii < N; ii += IB) { sb = min(N-ii, IB); for(i = 0; i < sb; i++) { /* * Generate elementary reflector H( II*IB+I ) to annihilate * A( II*IB+I:mi, II*IB+I ). */ mi = ii + i + 1; LAPACKE_zlarfg_work(mi+1, &A1[LDA1*(ii+i)+ii+i], &A2[LDA2*(ii+i)], ione, &TAU[ii+i]); if (sb-i-1>0) { /* * Apply H( II*IB+I ) to A( II*IB+I:M, II*IB+I+1:II*IB+IB ) from the left. */ ni = sb-i-1; cblas_zcopy( ni, &A1[LDA1*(ii+i+1)+(ii+i)], LDA1, WORK, 1); #ifdef COMPLEX LAPACKE_zlacgv_work(ni, WORK, ione); #endif cblas_zgemv( CblasColMajor, (CBLAS_TRANSPOSE)PlasmaConjTrans, mi, ni, CBLAS_SADDR(zone), &A2[LDA2*(ii+i+1)], LDA2, &A2[LDA2*(ii+i)], 1, CBLAS_SADDR(zone), WORK, 1); #ifdef COMPLEX LAPACKE_zlacgv_work(ni, WORK, ione); #endif alpha = -conj(TAU[ii+i]); cblas_zaxpy( ni, CBLAS_SADDR(alpha), WORK, 1, &A1[LDA1*(ii+i+1)+ii+i], LDA1); #ifdef COMPLEX LAPACKE_zlacgv_work(ni, WORK, ione); #endif cblas_zgerc( CblasColMajor, mi, ni, CBLAS_SADDR(alpha), &A2[LDA2*(ii+i)], 1, WORK, 1, &A2[LDA2*(ii+i+1)], LDA2); } /* * Calculate T. */ if (i > 0 ) { cblas_zcopy(i, &A2[LDA2*(ii+i)+ii], 1, &WORK[ii], 1); cblas_ztrmv( CblasColMajor, (CBLAS_UPLO)PlasmaUpper, (CBLAS_TRANSPOSE)PlasmaConjTrans, (CBLAS_DIAG)PlasmaNonUnit, i, &A2[LDA2*ii+ii], LDA2, &WORK[ii], 1); alpha = -(TAU[ii+i]); for(j = 0; j < i; j++) { WORK[ii+j] = alpha * WORK[ii+j]; } if (ii > 0) { cblas_zgemv( CblasColMajor, (CBLAS_TRANSPOSE)PlasmaConjTrans, ii, i, CBLAS_SADDR(alpha), &A2[LDA2*ii], LDA2, &A2[LDA2*(ii+i)], 1, CBLAS_SADDR(zzero), WORK, 1); cblas_zaxpy(i, CBLAS_SADDR(zone), &WORK[ii], 1, WORK, 1); } cblas_zcopy(i, WORK, 1, &T[LDT*(ii+i)], 1); cblas_ztrmv( CblasColMajor, (CBLAS_UPLO)PlasmaUpper, (CBLAS_TRANSPOSE)PlasmaNoTrans, (CBLAS_DIAG)PlasmaNonUnit, i, &T[LDT*ii], LDT, &T[LDT*(ii+i)], 1); } T[LDT*(ii+i)+i] = TAU[ii+i]; } /* Apply Q' to the rest of the matrix to the left */ if (N > ii+sb) { CORE_zttrfb( PlasmaLeft, PlasmaConjTrans, PlasmaForward, PlasmaColumnwise, sb, N-(ii+sb), ii+sb, N-(ii+sb), sb, &A1[LDA1*(ii+sb)+ii], LDA1, &A2[LDA2*(ii+sb)], LDA2, &A2[LDA2*ii], LDA2, &T[LDT*ii], LDT, WORK, sb); } } return PLASMA_SUCCESS; }
int CORE_ztsqrt(int M, int N, int IB, PLASMA_Complex64_t *A1, int LDA1, PLASMA_Complex64_t *A2, int LDA2, PLASMA_Complex64_t *T, int LDT, PLASMA_Complex64_t *TAU, PLASMA_Complex64_t *WORK) { static PLASMA_Complex64_t zone = 1.0; static PLASMA_Complex64_t zzero = 0.0; PLASMA_Complex64_t alpha; int i, ii, sb; /* Check input arguments */ if (M < 0) { coreblas_error(1, "Illegal value of M"); return -1; } if (N < 0) { coreblas_error(2, "Illegal value of N"); return -2; } if (IB < 0) { coreblas_error(3, "Illegal value of IB"); return -3; } if ((LDA2 < max(1,M)) && (M > 0)) { coreblas_error(8, "Illegal value of LDA2"); return -8; } /* Quick return */ if ((M == 0) || (N == 0) || (IB == 0)) return PLASMA_SUCCESS; for(ii = 0; ii < N; ii += IB) { sb = min(N-ii, IB); for(i = 0; i < sb; i++) { /* * Generate elementary reflector H( II*IB+I ) to annihilate * A( II*IB+I:M, II*IB+I ) */ LAPACKE_zlarfg_work(M+1, &A1[LDA1*(ii+i)+ii+i], &A2[LDA2*(ii+i)], 1, &TAU[ii+i]); if (ii+i+1 < N) { /* * Apply H( II*IB+I ) to A( II*IB+I:M, II*IB+I+1:II*IB+IB ) from the left */ alpha = -conj(TAU[ii+i]); cblas_zcopy( sb-i-1, &A1[LDA1*(ii+i+1)+(ii+i)], LDA1, WORK, 1); #ifdef COMPLEX LAPACKE_zlacgv_work(sb-i-1, WORK, 1); #endif cblas_zgemv( CblasColMajor, (CBLAS_TRANSPOSE)PlasmaConjTrans, M, sb-i-1, CBLAS_SADDR(zone), &A2[LDA2*(ii+i+1)], LDA2, &A2[LDA2*(ii+i)], 1, CBLAS_SADDR(zone), WORK, 1); #ifdef COMPLEX LAPACKE_zlacgv_work(sb-i-1, WORK, 1 ); #endif cblas_zaxpy( sb-i-1, CBLAS_SADDR(alpha), WORK, 1, &A1[LDA1*(ii+i+1)+ii+i], LDA1); #ifdef COMPLEX LAPACKE_zlacgv_work(sb-i-1, WORK, 1 ); #endif cblas_zgerc( CblasColMajor, M, sb-i-1, CBLAS_SADDR(alpha), &A2[LDA2*(ii+i)], 1, WORK, 1, &A2[LDA2*(ii+i+1)], LDA2); } /* * Calculate T */ alpha = -TAU[ii+i]; cblas_zgemv( CblasColMajor, (CBLAS_TRANSPOSE)PlasmaConjTrans, M, i, CBLAS_SADDR(alpha), &A2[LDA2*ii], LDA2, &A2[LDA2*(ii+i)], 1, CBLAS_SADDR(zzero), &T[LDT*(ii+i)], 1); cblas_ztrmv( CblasColMajor, (CBLAS_UPLO)PlasmaUpper, (CBLAS_TRANSPOSE)PlasmaNoTrans, (CBLAS_DIAG)PlasmaNonUnit, i, &T[LDT*ii], LDT, &T[LDT*(ii+i)], 1); T[LDT*(ii+i)+i] = TAU[ii+i]; } if (N > ii+sb) { CORE_ztsmqr( PlasmaLeft, PlasmaConjTrans, sb, N-(ii+sb), M, N-(ii+sb), IB, IB, &A1[LDA1*(ii+sb)+ii], LDA1, &A2[LDA2*(ii+sb)], LDA2, &A2[LDA2*ii], LDA2, &T[LDT*ii], LDT, WORK, sb); } } return PLASMA_SUCCESS; }
/***************************************************************************//** * @ingroup CORE_PLASMA_Complex64_t ******************************************************************************* * * Purpose * ======= * * CORE_zlarfx2c applies a complex elementary reflector H to a diagonal corner * C=[C1, C2, C3], from both the left and the right side. C = H * C * H. * It is used in the case of general matrices, where it create a nnz at the * NEW_NNZ position, then it eliminate it and update the reflector V and TAU. * If PlasmaLower, a left apply is followed by a right apply. * If PlasmaUpper, a right apply is followed by a left apply. * H is represented in the form * * This routine is a special code for a corner C diagonal block C1 NEW_NNZ * C2 C3 * * * H = I - tau * v * v' * * where tau is a complex scalar and v is a complex vector. * * If tau = 0, then H is taken to be the unit matrix * * This version uses inline code if H has order < 11. * * Arguments * ========= * * @param[in] uplo * = PlasmaUpper: Upper triangle of A is stored; * = PlasmaLower: Lower triangle of A is stored. * * @param[in, out] V * On entry, the double complex V in the representation of H. * On exit, the double complex V in the representation of H, * updated by the elimination of the NEW_NNZ created by the * left apply in case of PlasmaLower or the right apply in * case of PlasmaUpper. * * @param[in] TAU * On entry, the value tau in the representation of H. * On exit, the value tau in the representation of H, * updated by the elimination of the NEW_NNZ created by the * left apply in case of PlasmaLower or the right apply in * case of PlasmaUpper. * * @param[in,out] C1 * On entry, the element C1. * On exit, C1 is overwritten by the result H * C * H. * * @param[in,out] C2 * On entry, the element C2. * On exit, C2 is overwritten by the result H * C * H. * * @param[in,out] C3 * On entry, the element C3. * On exit, C3 is overwritten by the result H * C * H. * * ===================================================================== * * @return * \retval PLASMA_SUCCESS successful exit * \retval <0 if -i, the i-th argument had an illegal value * ******************************************************************************/ int CORE_zlarfx2ce(PLASMA_enum uplo, PLASMA_Complex64_t *V, PLASMA_Complex64_t *TAU, PLASMA_Complex64_t *C1, PLASMA_Complex64_t *C2, PLASMA_Complex64_t *C3) { PLASMA_Complex64_t T2, SUM, TEMP, VIN, TAUIN; /* Quick return */ if (*TAU == (PLASMA_Complex64_t)0.0) return PLASMA_SUCCESS; /* * Special code for a diagonal block C1 * C2 C3 */ if(uplo==PlasmaLower){ /* * Do the corner for the lower case BIDIAG ==> Left then will * create a new nnz. eliminate it and modify V TAU and then * Right L and R for the 2x2 corner * C(N-1, N-1) C(N-1,N) C1 TEMP * C(N , N-1) C(N ,N) C2 C3 */ VIN = *V; TAUIN = conj(*TAU); /* Left 1 ==> C1 */ /* C2 */ VIN = conj(VIN); T2 = TAUIN * conj(VIN); SUM = *C1 + VIN*(*C2); *C1 = *C1 - SUM*TAUIN; *C2 = *C2 - SUM*T2; /* new nnz at TEMP and update C3 */ SUM = VIN * (*C3); TEMP = - SUM * TAUIN; *C3 = *C3 - SUM * T2; /* generate Householder to annihilate the nonzero created at TEMP */ *V = TEMP; LAPACKE_zlarfg_work( 2, C1, V, 1, TAU); VIN = conj(*V); TAUIN = conj(*TAU); /* Right 1 ==> C2 C3 */ /* VIN = VIN */ T2 = TAUIN * conj(VIN); SUM = *C2 + VIN*(*C3); *C2 = *C2 - SUM*TAUIN; *C3 = *C3 - SUM*T2; }else if(uplo==PlasmaUpper){ /* * Do the corner for the upper case BIDIAG ==> Right then will * create a new nnz. eliminate it and modify V TAU and then * Left * C(N-1, N-1) C(N-1,N) C1 C2 * C(N , N-1) C(N ,N) TEMP C3 * For Left : use conj(TAU) and V. * For Right: use conj(TAU) and conj(V) as input. */ VIN = conj(*V); TAUIN = conj(*TAU); /* Right 1 ==> C1 C2 */ /* VIN = VIN */ T2 = TAUIN*conj(VIN); SUM = *C1 + VIN*(*C2); *C1 = *C1 - SUM*TAUIN; *C2 = *C2 - SUM*T2; /* new nnz at TEMP and update C3 */ SUM = VIN * (*C3); TEMP = - SUM * TAUIN; *C3 = *C3 - SUM * T2; /* generate Householder to annihilate the nonzero created at TEMP */ *V = TEMP; LAPACKE_zlarfg_work( 2, C1, V, 1, TAU); VIN = *V; TAUIN = conj(*TAU); /* apply from the Left using the NEW V TAU to the remaining 2 elements [C2 C3] */ /* Left 2 ==> C2 */ /* C3 */ VIN = conj(VIN); T2 = TAUIN*conj(VIN); SUM = *C2 + VIN*(*C3); *C2 = *C2 - SUM*TAUIN; *C3 = *C3 - SUM*T2; } return PLASMA_SUCCESS; }
/*************************************************************************** * TYPE 2-BAND Lower-columnwise-Householder ***************************************************************************/ void CORE_zhbtype2cb(int N, int NB, PLASMA_Complex64_t *A, int LDA, PLASMA_Complex64_t *V, PLASMA_Complex64_t *TAU, int st, int ed, int sweep, int Vblksiz, int WANTZ, PLASMA_Complex64_t *WORK) { PLASMA_Complex64_t ctmp; int J1, J2, len, lem, LDX; int blkid, vpos, taupos, tpos; if( WANTZ == 0 ) { vpos = ((sweep+1)%2)*N + st; taupos = ((sweep+1)%2)*N + st; } else { findVTpos(N, NB, Vblksiz, sweep, st, &vpos, &taupos, &tpos, &blkid); } LDX = LDA-1; J1 = ed+1; J2 = min(ed+NB,N-1); len = ed-st+1; lem = J2-J1+1; if( lem > 0 ) { /* Apply remaining right commming from the top block */ LAPACKE_zlarfx_work(LAPACK_COL_MAJOR, lapack_const(PlasmaRight), lem, len, V(vpos), *(TAU(taupos)), A(J1, st), LDX, WORK); } if( lem > 1 ) { if( WANTZ == 0 ) { vpos = ((sweep+1)%2)*N + J1; taupos = ((sweep+1)%2)*N + J1; } else { findVTpos(N,NB,Vblksiz,sweep,J1, &vpos, &taupos, &tpos, &blkid); } /* Remove the first column of the created bulge */ *V(vpos) = 1.; memcpy(V(vpos+1), A(J1+1, st), (lem-1)*sizeof(PLASMA_Complex64_t)); memset(A(J1+1, st), 0, (lem-1)*sizeof(PLASMA_Complex64_t)); /* Eliminate the col at st */ LAPACKE_zlarfg_work( lem, A(J1, st), V(vpos+1), 1, TAU(taupos) ); /* * Apply left on A(J1:J2,st+1:ed) * We decrease len because we start at col st+1 instead of st. * col st is the col that has been revomved; */ len = len-1; ctmp = conj(*TAU(taupos)); LAPACKE_zlarfx_work(LAPACK_COL_MAJOR, lapack_const(PlasmaLeft), lem, len, V(vpos), ctmp, A(J1, st+1), LDX, WORK); } return; }