예제 #1
0
int CORE_zttlqt(int M, int N, int IB,
                PLASMA_Complex64_t *A1, int LDA1,
                PLASMA_Complex64_t *A2, int LDA2,
                PLASMA_Complex64_t *T, int LDT,
                PLASMA_Complex64_t *TAU, PLASMA_Complex64_t *WORK)
{
    static PLASMA_Complex64_t zone  = 1.0;
    static PLASMA_Complex64_t zzero = 0.0;
#ifdef COMPLEX
    static int                ione  = 1;
#endif

    PLASMA_Complex64_t alpha;
    int i, j, l, ii, sb, mi, ni;

    /* Check input arguments */
    if (M < 0) {
        coreblas_error(1, "Illegal value of M");
        return -1;
    }
    if (N < 0) {
        coreblas_error(2, "Illegal value of N");
        return -2;
    }
    if (IB < 0) {
        coreblas_error(3, "Illegal value of IB");
        return -3;
    }
    if ((LDA2 < max(1,M)) && (M > 0)) {
        coreblas_error(7, "Illegal value of LDA2");
        return -7;
    }

    /* Quick return */
    if ((M == 0) || (N == 0) || (IB == 0))
        return PLASMA_SUCCESS;

    /* TODO: Need to check why some cases require
     *  this to not have uninitialized values */
    CORE_zlaset( PlasmaUpperLower, IB, N,
                 0., 0., T, LDT);

    for(ii = 0; ii < M; ii += IB) {
        sb = min(M-ii, IB);
        for(i = 0; i < sb; i++) {
            j  = ii + i;
            mi = sb-i-1;
            ni = min( j + 1, N);
            /*
             * Generate elementary reflector H( II*IB+I ) to annihilate A( II*IB+I, II*IB+I:M ).
             */
#ifdef COMPLEX
            LAPACKE_zlacgv_work(ni, &A2[j], LDA2);
            LAPACKE_zlacgv_work(ione, &A1[LDA1*j+j], LDA1);
#endif
            LAPACKE_zlarfg_work(ni+1, &A1[LDA1*j+j], &A2[j], LDA2, &TAU[j]);

            if (mi > 0) {
                /*
                 * Apply H( j-1 ) to A( j:II+IB-1, j-1:M  ) from the right.
                 */
                cblas_zcopy(
                    mi,
                    &A1[LDA1*j+(j+1)], 1,
                    WORK, 1);

                cblas_zgemv(
                    CblasColMajor, (CBLAS_TRANSPOSE)PlasmaNoTrans,
                    mi, ni,
                    CBLAS_SADDR(zone), &A2[j+1], LDA2,
                    &A2[j], LDA2,
                    CBLAS_SADDR(zone), WORK, 1);

                alpha = -(TAU[j]);
                cblas_zaxpy(
                    mi, CBLAS_SADDR(alpha),
                    WORK, 1,
                    &A1[LDA1*j+j+1], 1);

                cblas_zgerc(
                    CblasColMajor, mi, ni,
                    CBLAS_SADDR(alpha), WORK, 1,
                    &A2[j], LDA2,
                    &A2[j+1], LDA2);
            }

            /*
             * Calculate T.
             */

            if (i > 0 ) {

                l = min(i, max(0, N-ii));
                alpha = -(TAU[j]);

                CORE_zpemv(
                        PlasmaNoTrans, PlasmaRowwise,
                        i , min(j, N), l,
                        alpha, &A2[ii], LDA2,
                        &A2[j], LDA2,
                        zzero, &T[LDT*j], 1,
                        WORK);

                /* T(0:i-1, j) = T(0:i-1, ii:j-1) * T(0:i-1, j) */
                cblas_ztrmv(
                        CblasColMajor, (CBLAS_UPLO)PlasmaUpper,
                        (CBLAS_TRANSPOSE)PlasmaNoTrans,
                        (CBLAS_DIAG)PlasmaNonUnit,
                        i, &T[LDT*ii], LDT,
                        &T[LDT*j], 1);

            }

#ifdef COMPLEX
            LAPACKE_zlacgv_work(ni, &A2[j], LDA2 );
            LAPACKE_zlacgv_work(ione, &A1[LDA1*j+j], LDA1 );
#endif

            T[LDT*j+i] = TAU[j];
        }

        /* Apply Q to the rest of the matrix to the right */
        if (M > ii+sb) {
            mi = M-(ii+sb);
            ni = min(ii+sb, N);
            l  = min(sb, max(0, ni-ii));
            CORE_zparfb(
                PlasmaRight, PlasmaNoTrans,
                PlasmaForward, PlasmaRowwise,
                mi, IB, mi, ni, sb, l,
                &A1[LDA1*ii+ii+sb], LDA1,
                &A2[ii+sb], LDA2,
                &A2[ii], LDA2,
                &T[LDT*ii], LDT,
                WORK, M);

        }
    }
    return PLASMA_SUCCESS;
}
예제 #2
0
int CORE_zttqrt(int M, int N, int IB,
                PLASMA_Complex64_t *A1, int LDA1,
                PLASMA_Complex64_t *A2, int LDA2,
                PLASMA_Complex64_t *T, int LDT,
                PLASMA_Complex64_t *TAU, PLASMA_Complex64_t *WORK)
{
    static PLASMA_Complex64_t zone  = 1.0;
    static PLASMA_Complex64_t zzero = 0.0;
    static int                ione  = 1;

    PLASMA_Complex64_t alpha;
    int i, j, ii, sb, mi, ni;

    /* Check input arguments */
    if (M < 0) {
        coreblas_error(1, "Illegal value of M");
        return -1;
    }
    if (N < 0) {
        coreblas_error(2, "Illegal value of N");
        return -2;
    }
    if (IB < 0) {
        coreblas_error(3, "Illegal value of IB");
        return -3;
    }
    if ((LDA2 < max(1,M)) && (M > 0)) {
        coreblas_error(7, "Illegal value of LDA2");
        return -7;
    }

    /* Quick return */
    if ((M == 0) || (N == 0) || (IB == 0))
        return PLASMA_SUCCESS;

    for(ii = 0; ii < N; ii += IB) {
        sb = min(N-ii, IB);
        for(i = 0; i < sb; i++) {
            /*
             * Generate elementary reflector H( II*IB+I ) to annihilate
             * A( II*IB+I:mi, II*IB+I ).
             */
            mi = ii + i + 1;
            LAPACKE_zlarfg_work(mi+1, &A1[LDA1*(ii+i)+ii+i], &A2[LDA2*(ii+i)], ione, &TAU[ii+i]);

            if (sb-i-1>0) {
                /*
                 * Apply H( II*IB+I ) to A( II*IB+I:M, II*IB+I+1:II*IB+IB ) from the left.
                 */
                ni = sb-i-1;
                cblas_zcopy(
                    ni,
                    &A1[LDA1*(ii+i+1)+(ii+i)], LDA1,
                    WORK, 1);

#ifdef COMPLEX
                LAPACKE_zlacgv_work(ni, WORK, ione);
#endif
                cblas_zgemv(
                    CblasColMajor, (CBLAS_TRANSPOSE)PlasmaConjTrans,
                    mi, ni,
                    CBLAS_SADDR(zone), &A2[LDA2*(ii+i+1)], LDA2,
                    &A2[LDA2*(ii+i)], 1,
                    CBLAS_SADDR(zone), WORK, 1);
#ifdef COMPLEX
                LAPACKE_zlacgv_work(ni, WORK, ione);
#endif
                alpha = -conj(TAU[ii+i]);
                cblas_zaxpy(
                    ni, CBLAS_SADDR(alpha),
                    WORK, 1,
                    &A1[LDA1*(ii+i+1)+ii+i], LDA1);
#ifdef COMPLEX
                LAPACKE_zlacgv_work(ni, WORK, ione);
#endif
                cblas_zgerc(
                    CblasColMajor, mi, ni,
                    CBLAS_SADDR(alpha), &A2[LDA2*(ii+i)], 1,
                    WORK, 1,
                    &A2[LDA2*(ii+i+1)], LDA2);
            }

            /*
             * Calculate T.
             */

            if (i > 0 ) {

                cblas_zcopy(i, &A2[LDA2*(ii+i)+ii], 1, &WORK[ii], 1);
    
                cblas_ztrmv(
                    CblasColMajor, (CBLAS_UPLO)PlasmaUpper,
                    (CBLAS_TRANSPOSE)PlasmaConjTrans, (CBLAS_DIAG)PlasmaNonUnit,
                    i, &A2[LDA2*ii+ii], LDA2,
                    &WORK[ii], 1);
    
                alpha = -(TAU[ii+i]);
                for(j = 0; j < i; j++) {
                    WORK[ii+j] = alpha * WORK[ii+j];
                }
    
                if (ii > 0) {
                    cblas_zgemv(
                        CblasColMajor, (CBLAS_TRANSPOSE)PlasmaConjTrans, ii, i,
                        CBLAS_SADDR(alpha), &A2[LDA2*ii], LDA2,
                        &A2[LDA2*(ii+i)], 1,
                        CBLAS_SADDR(zzero), WORK, 1);
    
                    cblas_zaxpy(i, CBLAS_SADDR(zone), &WORK[ii], 1, WORK, 1);
                }
    
                cblas_zcopy(i, WORK, 1, &T[LDT*(ii+i)], 1);
    
                cblas_ztrmv(
                    CblasColMajor, (CBLAS_UPLO)PlasmaUpper,
                    (CBLAS_TRANSPOSE)PlasmaNoTrans, (CBLAS_DIAG)PlasmaNonUnit,
                    i, &T[LDT*ii], LDT,
                    &T[LDT*(ii+i)], 1);

            }

            T[LDT*(ii+i)+i] = TAU[ii+i];
        }

        /* Apply Q' to the rest of the matrix to the left  */
        if (N > ii+sb) {
            CORE_zttrfb(
                PlasmaLeft, PlasmaConjTrans,
                PlasmaForward, PlasmaColumnwise,
                sb, N-(ii+sb), ii+sb, N-(ii+sb), sb,
                &A1[LDA1*(ii+sb)+ii], LDA1,
                &A2[LDA2*(ii+sb)], LDA2,
                &A2[LDA2*ii], LDA2,
                &T[LDT*ii], LDT,
                WORK, sb);
        }
    }
    return PLASMA_SUCCESS;
}
예제 #3
0
int CORE_ztsqrt(int M, int N, int IB,
                PLASMA_Complex64_t *A1, int LDA1,
                PLASMA_Complex64_t *A2, int LDA2,
                PLASMA_Complex64_t *T, int LDT,
                PLASMA_Complex64_t *TAU, PLASMA_Complex64_t *WORK)
{
    static PLASMA_Complex64_t zone  = 1.0;
    static PLASMA_Complex64_t zzero = 0.0;

    PLASMA_Complex64_t alpha;
    int i, ii, sb;

    /* Check input arguments */
    if (M < 0) {
        coreblas_error(1, "Illegal value of M");
        return -1;
    }
    if (N < 0) {
        coreblas_error(2, "Illegal value of N");
        return -2;
    }
    if (IB < 0) {
        coreblas_error(3, "Illegal value of IB");
        return -3;
    }
    if ((LDA2 < max(1,M)) && (M > 0)) {
        coreblas_error(8, "Illegal value of LDA2");
        return -8;
    }

    /* Quick return */
    if ((M == 0) || (N == 0) || (IB == 0))
        return PLASMA_SUCCESS;

    for(ii = 0; ii < N; ii += IB) {
        sb = min(N-ii, IB);
        for(i = 0; i < sb; i++) {
            /*
             * Generate elementary reflector H( II*IB+I ) to annihilate
             * A( II*IB+I:M, II*IB+I )
             */
            LAPACKE_zlarfg_work(M+1, &A1[LDA1*(ii+i)+ii+i], &A2[LDA2*(ii+i)], 1, &TAU[ii+i]);

            if (ii+i+1 < N) {
                /*
                 * Apply H( II*IB+I ) to A( II*IB+I:M, II*IB+I+1:II*IB+IB ) from the left
                 */
                alpha = -conj(TAU[ii+i]);
                cblas_zcopy(
                    sb-i-1,
                    &A1[LDA1*(ii+i+1)+(ii+i)], LDA1,
                    WORK, 1);
#ifdef COMPLEX
                LAPACKE_zlacgv_work(sb-i-1, WORK, 1);
#endif
                cblas_zgemv(
                    CblasColMajor, (CBLAS_TRANSPOSE)PlasmaConjTrans,
                    M, sb-i-1,
                    CBLAS_SADDR(zone), &A2[LDA2*(ii+i+1)], LDA2,
                    &A2[LDA2*(ii+i)], 1,
                    CBLAS_SADDR(zone), WORK, 1);
#ifdef COMPLEX
                LAPACKE_zlacgv_work(sb-i-1, WORK, 1 );
#endif
                cblas_zaxpy(
                    sb-i-1, CBLAS_SADDR(alpha),
                    WORK, 1,
                    &A1[LDA1*(ii+i+1)+ii+i], LDA1);
#ifdef COMPLEX
                LAPACKE_zlacgv_work(sb-i-1, WORK, 1 );
#endif
                cblas_zgerc(
                    CblasColMajor, M, sb-i-1, CBLAS_SADDR(alpha),
                    &A2[LDA2*(ii+i)], 1,
                    WORK, 1,
                    &A2[LDA2*(ii+i+1)], LDA2);
            }
            /*
             * Calculate T
             */
            alpha = -TAU[ii+i];
            cblas_zgemv(
                CblasColMajor, (CBLAS_TRANSPOSE)PlasmaConjTrans, M, i,
                CBLAS_SADDR(alpha), &A2[LDA2*ii], LDA2,
                &A2[LDA2*(ii+i)], 1,
                CBLAS_SADDR(zzero), &T[LDT*(ii+i)], 1);

            cblas_ztrmv(
                CblasColMajor, (CBLAS_UPLO)PlasmaUpper,
                (CBLAS_TRANSPOSE)PlasmaNoTrans, (CBLAS_DIAG)PlasmaNonUnit, i,
                &T[LDT*ii], LDT,
                &T[LDT*(ii+i)], 1);

            T[LDT*(ii+i)+i] = TAU[ii+i];
        }
        if (N > ii+sb) {
            CORE_ztsmqr(
                PlasmaLeft, PlasmaConjTrans,
                sb, N-(ii+sb), M, N-(ii+sb), IB, IB,
                &A1[LDA1*(ii+sb)+ii], LDA1,
                &A2[LDA2*(ii+sb)], LDA2,
                &A2[LDA2*ii], LDA2,
                &T[LDT*ii], LDT,
                WORK, sb);
        }
    }
    return PLASMA_SUCCESS;
}
예제 #4
0
/***************************************************************************//**
 * @ingroup CORE_PLASMA_Complex64_t
 *******************************************************************************
 *
 *  Purpose
 *  =======
 *
 *  CORE_zlarfx2c applies a complex elementary reflector H to a diagonal corner 
 *  C=[C1, C2, C3], from both the left and the right side. C = H * C * H.
 *  It is used in the case of general matrices, where it create a nnz at the 
 *  NEW_NNZ position, then it eliminate it and update the reflector V and TAU.
 *  If PlasmaLower, a left apply is followed by a right apply.
 *  If PlasmaUpper, a right apply is followed by a left apply.
 *  H is represented in the form
 *   
 *  This routine is a special code for a corner C diagonal block  C1  NEW_NNZ
 *                                                                C2  C3
 *   
 *
 *        H = I - tau * v * v'
 *
 *  where tau is a complex scalar and v is a complex vector.
 *
 *  If tau = 0, then H is taken to be the unit matrix
 *
 *  This version uses inline code if H has order < 11.
 *
 *  Arguments
 *  =========
 *
 * @param[in] uplo
 *          = PlasmaUpper: Upper triangle of A is stored;
 *          = PlasmaLower: Lower triangle of A is stored.
 *
 * @param[in, out] V
 *          On entry, the double complex V in the representation of H.
 *          On exit, the double complex V in the representation of H, 
 *          updated by the elimination of the NEW_NNZ created by the 
 *          left apply in case of PlasmaLower or the right apply in 
 *          case of PlasmaUpper.
 *
 * @param[in] TAU
 *          On entry, the value tau in the representation of H.
 *          On exit, the value tau in the representation of H, 
 *          updated by the elimination of the NEW_NNZ created by the 
 *          left apply in case of PlasmaLower or the right apply in 
 *          case of PlasmaUpper.
 *
 * @param[in,out] C1
 *          On entry, the element C1.
 *          On exit, C1 is overwritten by the result H * C * H.
 *
 * @param[in,out] C2
 *          On entry, the element C2.
 *          On exit, C2 is overwritten by the result H * C * H.
  *
 * @param[in,out] C3
 *          On entry, the element C3.
 *          On exit, C3 is overwritten by the result H * C * H.
 *
 *  =====================================================================
 *
 * @return
 *          \retval PLASMA_SUCCESS successful exit
 *          \retval <0 if -i, the i-th argument had an illegal value
 *
 ******************************************************************************/
int 
CORE_zlarfx2ce(PLASMA_enum uplo,
               PLASMA_Complex64_t *V,
               PLASMA_Complex64_t *TAU,
               PLASMA_Complex64_t *C1,
               PLASMA_Complex64_t *C2,
               PLASMA_Complex64_t *C3)
{
    PLASMA_Complex64_t T2, SUM, TEMP, VIN, TAUIN;

    /* Quick return */
    if (*TAU == (PLASMA_Complex64_t)0.0)
        return PLASMA_SUCCESS;

    /*
     *        Special code for a diagonal block  C1
     *                                           C2  C3
     */
    if(uplo==PlasmaLower){    
        /*
         *  Do the corner for the lower case BIDIAG ==> Left then will
         *  create a new nnz. eliminate it and modify V TAU and then
         *  Right L and R for the 2x2 corner
         *             C(N-1, N-1)  C(N-1,N)        C1  TEMP 
         *             C(N  , N-1)  C(N  ,N)        C2  C3 
         */
        VIN   = *V;
        TAUIN = conj(*TAU);
        /*  Left 1 ==> C1 */
        /*             C2 */
        VIN  = conj(VIN);
        T2   = TAUIN * conj(VIN);
        SUM  = *C1   + VIN*(*C2);
        *C1  = *C1   - SUM*TAUIN;
        *C2  = *C2   - SUM*T2;
        /*   new nnz at TEMP and update C3 */
        SUM  =        VIN * (*C3);
        TEMP =      - SUM * TAUIN;
        *C3  = *C3  - SUM * T2;
        /*  generate Householder to annihilate the nonzero created at TEMP */
        *V    = TEMP;
        LAPACKE_zlarfg_work( 2, C1, V, 1, TAU);
        VIN   = conj(*V);
        TAUIN = conj(*TAU);
        /*  Right 1 ==> C2 C3 */
        /* VIN     = VIN */
        T2  = TAUIN * conj(VIN);
        SUM = *C2   + VIN*(*C3);
        *C2 = *C2   - SUM*TAUIN;
        *C3 = *C3   - SUM*T2;
    }else if(uplo==PlasmaUpper){ 
        /*  
         * Do the corner for the upper case BIDIAG ==> Right then will
        *  create a new nnz. eliminate it and modify V TAU and then
        *  Left
        *             C(N-1, N-1)  C(N-1,N)        C1    C2 
        *             C(N  , N-1)  C(N  ,N)        TEMP  C3 
        *  For Left : use conj(TAU) and V. 
        *  For Right: use conj(TAU) and conj(V) as input. 
        */
        VIN   = conj(*V);
        TAUIN = conj(*TAU);
        /*  Right 1 ==> C1 C2 */
        /* VIN     = VIN */
        T2   = TAUIN*conj(VIN);
        SUM  = *C1  + VIN*(*C2);
        *C1  = *C1  - SUM*TAUIN;
        *C2  = *C2  - SUM*T2;
        /*   new nnz at TEMP and update C3 */
        SUM  =        VIN * (*C3);
        TEMP =      - SUM * TAUIN;
        *C3  = *C3  - SUM * T2;
        /*  generate Householder to annihilate the nonzero created at TEMP */
        *V    = TEMP;
        LAPACKE_zlarfg_work( 2, C1, V, 1, TAU);
        VIN   = *V;
        TAUIN = conj(*TAU);
        /*  apply from the Left using the NEW V TAU to the remaining 2 elements [C2 C3] */
        /*  Left 2 ==> C2 */
        /*             C3 */
        VIN = conj(VIN);
        T2  = TAUIN*conj(VIN);
        SUM = *C2 + VIN*(*C3);
        *C2 = *C2 - SUM*TAUIN;
        *C3 = *C3 - SUM*T2;
    }
    return PLASMA_SUCCESS;
}
예제 #5
0
/***************************************************************************
 *          TYPE 2-BAND Lower-columnwise-Householder
 ***************************************************************************/
void
CORE_zhbtype2cb(int N, int NB,
                PLASMA_Complex64_t *A, int LDA,
                PLASMA_Complex64_t *V, PLASMA_Complex64_t *TAU,
                int st, int ed, int sweep, int Vblksiz, int WANTZ,
                PLASMA_Complex64_t *WORK)
{
    PLASMA_Complex64_t ctmp;
    int J1, J2, len, lem, LDX;
    int blkid, vpos, taupos, tpos;

    if( WANTZ == 0 ) {
        vpos   = ((sweep+1)%2)*N + st;
        taupos = ((sweep+1)%2)*N + st;
    } else {
        findVTpos(N, NB, Vblksiz, sweep, st,
                  &vpos, &taupos, &tpos, &blkid);
    }

    LDX = LDA-1;
    J1  = ed+1;
    J2  = min(ed+NB,N-1);
    len = ed-st+1;
    lem = J2-J1+1;

    if( lem > 0 ) {
        /* Apply remaining right commming from the top block */
        LAPACKE_zlarfx_work(LAPACK_COL_MAJOR, lapack_const(PlasmaRight),
                            lem, len, V(vpos), *(TAU(taupos)), A(J1, st), LDX, WORK);
    }

    if( lem > 1 ) {
        if( WANTZ == 0 ) {
            vpos   = ((sweep+1)%2)*N + J1;
            taupos = ((sweep+1)%2)*N + J1;
        } else {
            findVTpos(N,NB,Vblksiz,sweep,J1, &vpos, &taupos, &tpos, &blkid);
        }

        /* Remove the first column of the created bulge */
        *V(vpos) = 1.;

        memcpy(V(vpos+1), A(J1+1, st), (lem-1)*sizeof(PLASMA_Complex64_t));
        memset(A(J1+1, st), 0, (lem-1)*sizeof(PLASMA_Complex64_t));

        /* Eliminate the col at st */
        LAPACKE_zlarfg_work( lem, A(J1, st), V(vpos+1), 1, TAU(taupos) );

        /*
         * Apply left on A(J1:J2,st+1:ed)
         * We decrease len because we start at col st+1 instead of st.
         * col st is the col that has been revomved;
         */
        len = len-1;

        ctmp = conj(*TAU(taupos));
        LAPACKE_zlarfx_work(LAPACK_COL_MAJOR, lapack_const(PlasmaLeft),
                            lem, len, V(vpos), ctmp, A(J1, st+1), LDX, WORK);
    }
    return;
}